patches(paged): trim series to Phase135 routed-FFN line, sync to fork 1edddc8fe

The campaign patches 0048-0063 were added without matching fork commits.
After a keep/drop review, the series is trimmed and re-mirrored 1:1 onto the
fork branch mudler/llama.cpp:localai-paged (HEAD 1edddc8fe, on 51168c5ee).

Kept, renumbered from the fork (now carry Assisted-by + Signed-off-by):
- 0048 test(paged): cover MoE swiglu down chain      (was 0051, fd920cf8a)
- 0049 test(paged): cover MoE weighted combine chain (was 0052, a85c1e098)
- 0050 test(paged): cover ragged MoE dispatch        (was 0053, 2fed6aacf)
- 0051 fix(speculative): disable backend sampling for MTP drafts (was 0054, f1d976f06)
- 0052 feat(paged): whole-pattern MoE matcher + routed-FFN fused NVFP4-quant
       down MMQ (new, 1edddc8fe)

Dropped (no fork commits, removed from the series):
- 0048-0050 W4A16 grouped-tile pack/tune/pad: dead line, W4A16 ~1.5x slower
  than grouped-MMQ.
- 0055-0063 speculative/moe/mul-mat/cublas route traces + the rejected small-M
  tile-policy knob (0059).
- All other 110-140 campaign markers not needed by Phase135 (GPU-sort,
  W4A16-direct-A, boundary trace/timing, Phase133 sorted-F32, Phase134
  fused-SWIGLU, Phase138 finalize) carry no code in this tree.

Tree-hash proof (the mirror invariant): a fresh detached worktree at
LLAMA_VERSION 0ed235ea2c17a19fc8238668653946721ed136fd with every on-disk
patches/paged/0*.patch applied in numeric order (git apply) stages to tree
097c862c6834b7d8b90419b305b8402155ef8373, byte-identical to fork HEAD
1edddc8fe's tree. Series is 43 patches (0001-0047 unchanged + 0048-0052).

Gated on GB10 sm_121a: default md5 MoE 8cb0ce23 / dense 5951a5b4 unchanged;
opt-in md5-clean; MUL_MAT 1146/1146, MUL_MAT_ID 806/806, GATED_DELTA_NET 46/46,
MOE_SWIGLU_DOWN 7/7, MUL_MAT_ID_RAGGED_MOE 6/6; six mmq_moe_quantized_raw
markers with zero sorted launches on the opt-in sentinel.

Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-07-02 10:19:10 +00:00
parent 1aba41082b
commit b529cc5420
17 changed files with 950 additions and 2163 deletions

View File

@@ -1,121 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Tue, 30 Jun 2026 21:19:07 +0000
Subject: [PATCH] feat(paged): pack W4A16 grouped tile metadata
Collapse the grouped W4A16 tile metadata from three host vectors and three device copies into one aligned descriptor array and one H2D copy. This keeps the path default-off while reducing launch-side metadata overhead for the experimental W4A16 MoE prefill path.
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/w4a16-gemm.cu | 46 ++++++++++++++++++--------------
1 file changed, 26 insertions(+), 20 deletions(-)
diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
index f348f31ad..899e1a23f 100644
--- a/ggml/src/ggml-cuda/w4a16-gemm.cu
+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
@@ -18,6 +18,14 @@ typedef tile<16, 8, nv_bfloat162> tile_A; // A operand: M=16, K=16
typedef tile< 8, 8, nv_bfloat162> tile_B; // B operand: N=8, K=16
typedef tile<16, 8, float> tile_C; // accumulator: M=16, N=8
+struct alignas(16) w4a16_tile_desc {
+ int expert;
+ int row0;
+ int rows;
+ int pad;
+};
+static_assert(sizeof(w4a16_tile_desc) == 16, "w4a16 tile descriptors must stay 16 bytes");
+
#ifndef LLAMA_W4A16_PREFILL_M
#define LLAMA_W4A16_PREFILL_M 0
#endif // LLAMA_W4A16_PREFILL_M
@@ -92,9 +100,7 @@ static __global__ void w4a16_grouped_kernel(
const nv_bfloat16 * __restrict__ Abf, // [pad_rows, K] bf16
const block_nvfp4 * __restrict__ W0, // src0 base (expert 0)
float * __restrict__ C, // [total_rows, N] f32
- const int * __restrict__ g_tile_expert,
- const int * __restrict__ g_tile_row0,
- const int * __restrict__ g_tile_rows,
+ const w4a16_tile_desc * __restrict__ g_tiles,
int N, int K, int64_t expert_stride_blocks) {
#if defined(AMPERE_MMA_AVAILABLE) && defined(CP_ASYNC_AVAILABLE)
constexpr int BK = 64; // one nvfp4 block
@@ -129,9 +135,10 @@ static __global__ void w4a16_grouped_kernel(
const int tid = warp*32 + lane; // linear id for the cp.async strided copies
const int wrow = warp / WARPS_N, wcol = warp % WARPS_N;
- const int e = g_tile_expert[blockIdx.y];
- const int row0 = g_tile_row0[blockIdx.y];
- const int rcount = g_tile_rows[blockIdx.y];
+ const w4a16_tile_desc tile_desc = g_tiles[blockIdx.y];
+ const int e = tile_desc.expert;
+ const int row0 = tile_desc.row0;
+ const int rcount = tile_desc.rows;
const int blockCol = blockIdx.x*BN;
const int Kb = K/64;
const block_nvfp4 * We = W0 + (int64_t) e*expert_stride_blocks; // expert e weight base
@@ -238,7 +245,7 @@ static __global__ void w4a16_grouped_kernel(
}
#else
GGML_UNUSED(Abf); GGML_UNUSED(W0); GGML_UNUSED(C);
- GGML_UNUSED(g_tile_expert); GGML_UNUSED(g_tile_row0); GGML_UNUSED(g_tile_rows);
+ GGML_UNUSED(g_tiles);
GGML_UNUSED(N); GGML_UNUSED(K); GGML_UNUSED(expert_stride_blocks);
NO_DEVICE_CODE;
#endif // AMPERE_MMA_AVAILABLE && CP_ASYNC_AVAILABLE
@@ -296,18 +303,21 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
return;
}
- std::vector<int32_t> h_tile_expert, h_tile_row0, h_tile_rows;
+ std::vector<w4a16_tile_desc> h_tiles;
int64_t row = 0;
for (int64_t e = 0; e < n_experts; e++) {
const int t = tokens_per_expert[e];
for (int off = 0; off < t; off += BM) {
- h_tile_expert.push_back((int32_t) e);
- h_tile_row0.push_back((int32_t) (row + off));
- h_tile_rows.push_back((int32_t) std::min(BM, t - off));
+ h_tiles.push_back({
+ (int) e,
+ (int) (row + off),
+ (int) std::min(BM, t - off),
+ 0,
+ });
}
row += t;
}
- const int n_tiles = (int) h_tile_expert.size();
+ const int n_tiles = (int) h_tiles.size();
if (getenv("LLAMA_W4A16_DEBUG")) {
int max_tpe = 0, multi = 0;
@@ -319,13 +329,9 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
(long long) total_rows, (long long) n_experts, (long long) K, (long long) N, n_tiles, max_tpe, multi);
}
- // device: tile map
- ggml_cuda_pool_alloc<int32_t> d_tile_expert(ctx.pool(), n_tiles);
- ggml_cuda_pool_alloc<int32_t> d_tile_row0 (ctx.pool(), n_tiles);
- ggml_cuda_pool_alloc<int32_t> d_tile_rows (ctx.pool(), n_tiles);
- CUDA_CHECK(cudaMemcpyAsync(d_tile_expert.ptr, h_tile_expert.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
- CUDA_CHECK(cudaMemcpyAsync(d_tile_row0.ptr, h_tile_row0.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
- CUDA_CHECK(cudaMemcpyAsync(d_tile_rows.ptr, h_tile_rows.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
+ // device: packed tile map; one pageable H2D copy instead of three tiny copies
+ ggml_cuda_pool_alloc<w4a16_tile_desc> d_tiles(ctx.pool(), n_tiles);
+ CUDA_CHECK(cudaMemcpyAsync(d_tiles.ptr, h_tiles.data(), n_tiles*sizeof(w4a16_tile_desc), cudaMemcpyHostToDevice, stream));
// activations: f32 -> bf16 (cheap cast, NO act-quant), zero-padded so every tile's BM-row read
// stays in-bounds. A tile's row0 is generally NOT BM-aligned (experts start mid-buffer), and a
@@ -353,7 +359,7 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
dim3 block(32, WARPS_M*WARPS_N); // 2D: threadIdx.x = warp lane, threadIdx.y = warp
kern<<<grid, block, smem_bytes, stream>>>(
Abf.get(), (const block_nvfp4 *) src0->data, dst_sorted,
- d_tile_expert.ptr, d_tile_row0.ptr, d_tile_rows.ptr,
+ d_tiles.ptr,
(int) N, (int) K, expert_stride_blocks);
CUDA_CHECK(cudaGetLastError());
}
--
2.43.0

View File

@@ -1,8 +1,10 @@
From cd56cf037379b084d6bb0ed47db8b785c828be86 Mon Sep 17 00:00:00 2001
From fd920cf8a7fe9cc7753cd0640411ce771edfeaca Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Tue, 30 Jun 2026 23:18:38 +0000
Subject: [PATCH] test(paged): cover MoE swiglu down chain
Subject: [PATCH 48/52] test(paged): cover MoE swiglu down chain
Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
tests/test-backend-ops.cpp | 92 ++++++++++++++++++++++++++++++++++++++
1 file changed, 92 insertions(+)

View File

@@ -1,93 +0,0 @@
From 7dfa0e17548c5f04f83d2cc2a057b0a9941b599a Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Tue, 30 Jun 2026 21:44:54 +0000
Subject: [PATCH] feat(paged): tune W4A16 grouped tile shape
Select a BM=32 grouped W4A16 MoE prefill shape by default after the GB10 shape sweep, while keeping the prior 64x128 shape and additional diagnostics selectable through LLAMA_W4A16_SHAPE.
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/w4a16-gemm.cu | 42 +++++++++++++++++++++++++++++---
1 file changed, 38 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
index 899e1a23f..ca8864292 100644
--- a/ggml/src/ggml-cuda/w4a16-gemm.cu
+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
@@ -4,6 +4,7 @@
#include <algorithm>
#include <cstdint>
#include <cstdlib>
+#include <cstring>
#include <vector>
// ===========================================================================
@@ -281,7 +282,23 @@ bool ggml_cuda_w4a16_moe_grouped_should_engage(
return true;
}
-void ggml_cuda_mul_mat_id_w4a16_grouped(
+static bool ggml_cuda_w4a16_use_base_shape() {
+ static const bool use_base = [] {
+ const char * e = getenv("LLAMA_W4A16_SHAPE");
+ if (e == nullptr || e[0] == '\0' || strcmp(e, "default") == 0 || strcmp(e, "bm32") == 0 || strcmp(e, "32x128") == 0) {
+ return false;
+ }
+ if (strcmp(e, "base") == 0 || strcmp(e, "64x128") == 0) {
+ return true;
+ }
+ fprintf(stderr, "[w4a16] unknown LLAMA_W4A16_SHAPE=%s, using default bm32\n", e);
+ return false;
+ }();
+ return use_base;
+}
+
+template<int BM, int BN, int WARPS_M, int WARPS_N, int STAGES>
+static void ggml_cuda_mul_mat_id_w4a16_grouped_impl(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0,
const float * src1_sorted,
@@ -290,9 +307,7 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
int64_t n_experts, int64_t K, int64_t N,
cudaStream_t stream) {
GGML_ASSERT(src0->type == GGML_TYPE_NVFP4);
- GGML_ASSERT(N % 128 == 0 && K % 64 == 0);
-
- constexpr int BM = 64, BN = 128, WARPS_M = 2, WARPS_N = 4, STAGES = 2;
+ GGML_ASSERT(N % BN == 0 && K % 64 == 0);
// host: build the per-M-tile expert map (ragged, no tile crosses an expert boundary)
int64_t total_rows = 0;
@@ -327,6 +342,8 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
}
fprintf(stderr, "[w4a16] engaged: total_rows=%lld n_experts=%lld K=%lld N=%lld n_tiles=%d max_tpe=%d multi_tile_experts=%d\n",
(long long) total_rows, (long long) n_experts, (long long) K, (long long) N, n_tiles, max_tpe, multi);
+ fprintf(stderr, "[w4a16] shape: BM=%d BN=%d WARPS_M=%d WARPS_N=%d STAGES=%d\n",
+ BM, BN, WARPS_M, WARPS_N, STAGES);
}
// device: packed tile map; one pageable H2D copy instead of three tiny copies
@@ -363,3 +380,20 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
(int) N, (int) K, expert_stride_blocks);
CUDA_CHECK(cudaGetLastError());
}
+
+void ggml_cuda_mul_mat_id_w4a16_grouped(
+ ggml_backend_cuda_context & ctx,
+ const ggml_tensor * src0,
+ const float * src1_sorted,
+ float * dst_sorted,
+ const int * tokens_per_expert,
+ int64_t n_experts, int64_t K, int64_t N,
+ cudaStream_t stream) {
+ if (ggml_cuda_w4a16_use_base_shape()) {
+ ggml_cuda_mul_mat_id_w4a16_grouped_impl<64, 128, 2, 4, 2>(
+ ctx, src0, src1_sorted, dst_sorted, tokens_per_expert, n_experts, K, N, stream);
+ } else {
+ ggml_cuda_mul_mat_id_w4a16_grouped_impl<32, 128, 1, 4, 2>(
+ ctx, src0, src1_sorted, dst_sorted, tokens_per_expert, n_experts, K, N, stream);
+ }
+}
--
2.43.0

View File

@@ -1,8 +1,10 @@
From 3ef7eb9e412eb34f8656675862f6753c65d28ec9 Mon Sep 17 00:00:00 2001
From a85c1e098e22eb587fd80220986f35a8d6e11300 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Tue, 30 Jun 2026 23:50:33 +0000
Subject: [PATCH] test(paged): cover MoE weighted combine chain
Subject: [PATCH 49/52] test(paged): cover MoE weighted combine chain
Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
tests/test-backend-ops.cpp | 90 ++++++++++++++++++++++++++++++++++++++
1 file changed, 90 insertions(+)

View File

@@ -1,56 +0,0 @@
From d9b9be0bee3d7239132bfca05d5b057ff4ee4cc3 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Tue, 30 Jun 2026 22:13:09 +0000
Subject: [PATCH] feat(paged): pad W4A16 A shared tile stride
Pad the grouped W4A16 A operand shared-memory row stride to reduce bank pressure on GB10 while preserving the selected bm32/default and base launch shapes.
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/w4a16-gemm.cu | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
index ca8864292..b17e022b0 100644
--- a/ggml/src/ggml-cuda/w4a16-gemm.cu
+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
@@ -111,7 +111,8 @@ static __global__ void w4a16_grouped_kernel(
constexpr int MF = WM/16, NF = WN/8;
constexpr int AN = BK/2; // bf16 pairs per A smem row (nv_bfloat162)
- constexpr int SZ_A = BM*AN; // nv_bfloat162 per stage
+ constexpr int ASTR = AN + 4; // padded A smem row stride, in nv_bfloat162
+ constexpr int SZ_A = BM*ASTR; // nv_bfloat162 per stage
constexpr int SZ_WQ = BN*8; // u32 per stage (32 qs bytes/row)
constexpr int SZ_WD = BN; // u32 per stage (4 scale bytes/row)
@@ -155,7 +156,7 @@ static __global__ void w4a16_grouped_kernel(
const int c = idx % (BK/8); // 16B chunk in the row
const int r = idx / (BK/8); // row in tile
const nv_bfloat16 * src = Abf + (int64_t)(row0 + r)*K + (int64_t)kt*BK + c*8;
- w4a16_cp_async16(((char *) sA[st]) + (r*AN + c*4)*sizeof(uint32_t), src);
+ w4a16_cp_async16(((char *) sA[st]) + (r*ASTR + c*4)*sizeof(uint32_t), src);
}
// W qs: BN rows x 32 bytes = BN x 8 u32 (each block's qs at byte offset 4)
#pragma unroll 1
@@ -198,7 +199,7 @@ static __global__ void w4a16_grouped_kernel(
#pragma unroll
for (int mi = 0; mi < MF; mi++) {
const int rb = wrow*WM + mi*16;
- load_ldmatrix(A_frag[mi], sAcur + rb*AN + kk*8, AN);
+ load_ldmatrix(A_frag[mi], sAcur + rb*ASTR + kk*8, ASTR);
}
// B fragments: in-register FP4->bf16 dequant (correct-by-construction via tile get_i/get_j)
tile_B B_frag[NF];
@@ -368,7 +369,7 @@ static void ggml_cuda_mul_mat_id_w4a16_grouped_impl(
const int64_t expert_stride_blocks = (int64_t) (src0->nb[2] / sizeof(block_nvfp4));
auto kern = w4a16_grouped_kernel<BM, BN, WARPS_M, WARPS_N, STAGES>;
- constexpr int STAGE_U32 = BM*(64/2) + BN*8 + BN;
+ constexpr int STAGE_U32 = BM*((64/2) + 4) + BN*8 + BN;
const int smem_bytes = STAGES * STAGE_U32 * (int) sizeof(uint32_t);
CUDA_CHECK(cudaFuncSetAttribute(kern, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
--
2.43.0

View File

@@ -1,8 +1,10 @@
From e21732fc47206d5878e3b977bbd21858a3ba4ab0 Mon Sep 17 00:00:00 2001
From 2fed6aacff14537864bbf754c7552740131d4eaf Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 00:39:52 +0000
Subject: [PATCH] test(paged): cover ragged MoE dispatch
Subject: [PATCH 50/52] test(paged): cover ragged MoE dispatch
Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
tests/test-backend-ops.cpp | 118 +++++++++++++++++++++++++++++++++++++
1 file changed, 118 insertions(+)

View File

@@ -1,8 +1,11 @@
From 3eba64afff6ecaa25da11f0e394717224f221c9a Mon Sep 17 00:00:00 2001
From f1d976f06fb92655106709256dd093ffefb85e2b Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 00:50:36 +0000
Subject: [PATCH] fix(speculative): disable backend sampling for MTP drafts
Subject: [PATCH 51/52] fix(speculative): disable backend sampling for MTP
drafts
Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
common/speculative.cpp | 6 ++++++
1 file changed, 6 insertions(+)

View File

@@ -0,0 +1,933 @@
From 1edddc8fe93bb2fec5f831bbde5df2b7480a7b05 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Thu, 2 Jul 2026 12:15:38 +0200
Subject: [PATCH 52/52] feat(paged): whole-pattern MoE matcher + routed-FFN
fused NVFP4-quant down MMQ
Add the routed-FFN fused-quant line for the paged NVFP4 MoE decode step,
all default-off and md5-clean, gating a fused SwiGLU-to-NVFP4-quant plus a
raw pre-quantized down-projection MMQ that skips the intermediate F32
materialize + re-quantize of the standard gate_up -> SwiGLU -> down chain.
Pieces (all guarded, no effect unless explicitly enabled):
- Whole-pattern MoE matcher + executor hook in ggml_cuda_try_fuse
(LLAMA_MOE_WHOLE_PATTERN_EXEC and the *_TRACE/_EARLY_TRACE diagnostics).
Detects the gate_up(MUL_MAT_ID) -> view/view -> SwiGLU(GLU) -> down(MUL_MAT_ID)
sub-graph early in the fusion pass and, when engaged, runs the whole chain
through a single executor instead of node-by-node.
- Routed-FFN PoC scaffold ggml/src/ggml-cuda/moe-ffn.{cu,cuh} + a narrow hook
(LLAMA_MOE_ROUTED_FFN_POC). ggml_cuda_compute_forward is de-static-ed so the
executor translation unit can drive the standard op path for the fallback
legs. The executor tries the fused-quant path first, else falls back to the
stock compute_forward for glu + down (bit-identical to default).
- Fused SwiGLU-to-NVFP4-quant + raw down MMQ (LLAMA_MOE_ROUTED_FFN_FUSED_QUANT):
moe_swiglu_nvfp4_quant_kernel writes block_fp4_mmq activations directly, then
ggml_cuda_mul_mat_q_moe_quantized (with the local ggml_cuda_mmq_ids_meta
refactor of the expert-sorted ids/bounds prep) runs the down GEMM on the
pre-quantized rows. Native FP4 (Blackwell) only; NVFP4 down weights only.
Gated on GB10 (sm_121a), before/after this commit:
- Canonical default-path greedy md5 unchanged: MoE q36-35b-a3b-nvfp4
8cb0ce23777bf55f92f63d0292c756b0, dense q36-27b-nvfp4
5951a5b4d624ce891e22ab5fca9bc439.
- md5-clean opt-in: LLAMA_MOE_ROUTED_FFN_POC=1 LLAMA_MOE_ROUTED_FFN_FUSED_QUANT=1
keeps the MoE md5 byte-identical (8cb0ce23...).
- test-backend-ops: MUL_MAT 1146/1146, MUL_MAT_ID 806/806, GATED_DELTA_NET 46/46,
and the MoE sentinels MOE_SWIGLU_DOWN 7/7 + MUL_MAT_ID_RAGGED_MOE 6/6 pass both
default and opt-in. Opt-in emits exactly six route=mmq_moe_quantized_raw markers
with zero mmq_moe_sorted_raw launches (fused path provably engaged).
- Serving effect is flat-to-slightly-positive and not a shipped default:
decode agg 326.9 -> 332.7 t/s, mmq_nvfp4 6009 -> 5915 ms, aggregate flat
(~/bench/phase135_routed_ffn_fused_quant_serving/20260702_082102).
The rejected/neutral neighbours of this line (Phase133 sorted-F32 down,
Phase134 fused-SWIGLU-only, Phase138 finalize/weighted-combine fusion, the
W4A16 grouped-tile pack/tune/pad line, GPU-sort, boundary/layout/quant traces)
are deliberately excluded and carry no markers in this tree.
Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
ggml/src/ggml-cuda/ggml-cuda.cu | 304 +++++++++++++++++++++++++++++++-
ggml/src/ggml-cuda/mmq.cu | 148 ++++++++++++++++
ggml/src/ggml-cuda/mmq.cuh | 25 +++
ggml/src/ggml-cuda/moe-ffn.cu | 296 +++++++++++++++++++++++++++++++
ggml/src/ggml-cuda/moe-ffn.cuh | 24 +++
5 files changed, 796 insertions(+), 1 deletion(-)
create mode 100644 ggml/src/ggml-cuda/moe-ffn.cu
create mode 100644 ggml/src/ggml-cuda/moe-ffn.cuh
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 374949f25..ef1bdc3b4 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -32,6 +32,7 @@
#include "ggml-cuda/im2col.cuh"
#include "ggml-cuda/mmf.cuh"
#include "ggml-cuda/mmq.cuh"
+#include "ggml-cuda/moe-ffn.cuh"
#include "ggml-cuda/mmvf.cuh"
#include "ggml-cuda/mmvq.cuh"
#include "ggml-cuda/norm.cuh"
@@ -2854,7 +2855,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
nb1, nb2, nb3, stream);
}
-static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
+bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
switch (dst->op) {
case GGML_OP_ARGMAX:
ggml_cuda_argmax(ctx, dst);
@@ -4032,6 +4033,201 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
return false;
}
+static inline const char * ggml_cuda_moe_wp_trace_tensor_name(const ggml_tensor * t) {
+ return t != nullptr && t->name[0] != '\0' ? t->name : "-";
+}
+
+static inline int ggml_cuda_moe_whole_pattern_trace_limit() {
+ static const int value = []() {
+ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_TRACE");
+ if (s == nullptr || strcmp(s, "0") == 0) {
+ return 0;
+ }
+ const int parsed = atoi(s);
+ return parsed > 0 ? parsed : 128;
+ }();
+
+ return value;
+}
+
+static inline bool ggml_cuda_moe_whole_pattern_trace_take(std::atomic<int> & counter) {
+ const int trace_limit = ggml_cuda_moe_whole_pattern_trace_limit();
+ if (trace_limit <= 0) {
+ return false;
+ }
+
+ const int trace_idx = counter.fetch_add(1, std::memory_order_relaxed);
+ return trace_idx < trace_limit;
+}
+
+static inline int ggml_cuda_moe_whole_pattern_early_trace_limit() {
+ static const int value = []() {
+ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EARLY_TRACE");
+ if (s == nullptr || strcmp(s, "0") == 0) {
+ return 0;
+ }
+ const int parsed = atoi(s);
+ return parsed > 0 ? parsed : 128;
+ }();
+
+ return value;
+}
+
+static inline bool ggml_cuda_moe_whole_pattern_exec_enabled() {
+ static const bool value = []() {
+ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EXEC");
+ return s != nullptr && atoi(s) != 0;
+ }();
+
+ return value;
+}
+
+static inline int ggml_cuda_moe_whole_pattern_exec_trace_limit() {
+ static const int value = []() {
+ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EXEC_TRACE");
+ if (s == nullptr || strcmp(s, "0") == 0) {
+ return 0;
+ }
+ const int parsed = atoi(s);
+ return parsed > 0 ? parsed : 128;
+ }();
+
+ return value;
+}
+
+static inline bool ggml_cuda_moe_whole_pattern_exec_trace_take(std::atomic<int> & counter) {
+ const int trace_limit = ggml_cuda_moe_whole_pattern_exec_trace_limit();
+ if (trace_limit <= 0) {
+ return false;
+ }
+
+ const int trace_idx = counter.fetch_add(1, std::memory_order_relaxed);
+ return trace_idx < trace_limit;
+}
+
+struct ggml_cuda_moe_whole_pattern {
+ const ggml_tensor * gate_up = nullptr;
+ const ggml_tensor * gate = nullptr;
+ const ggml_tensor * up = nullptr;
+ const ggml_tensor * glu = nullptr;
+ const ggml_tensor * down = nullptr;
+ const ggml_tensor * ids = nullptr;
+
+ bool view_pair = false;
+ bool ids_match = false;
+ bool swiglu = false;
+ bool supported_type = false;
+ bool supported = false;
+};
+
+static ggml_cuda_moe_whole_pattern ggml_cuda_moe_whole_pattern_detect(const ggml_tensor * glu, const ggml_tensor * down) {
+ ggml_cuda_moe_whole_pattern pattern{};
+ pattern.glu = glu;
+ pattern.down = down;
+
+ if (glu == nullptr || down == nullptr || glu->op != GGML_OP_GLU || down->op != GGML_OP_MUL_MAT_ID) {
+ return pattern;
+ }
+
+ pattern.gate = glu->src[0];
+ pattern.up = glu->src[1];
+ pattern.ids = down->src[2];
+
+ pattern.view_pair = pattern.gate != nullptr && pattern.up != nullptr &&
+ pattern.gate->op == GGML_OP_VIEW && pattern.up->op == GGML_OP_VIEW &&
+ pattern.gate->view_src != nullptr && pattern.gate->view_src == pattern.up->view_src;
+ if (!pattern.view_pair) {
+ return pattern;
+ }
+
+ pattern.gate_up = pattern.gate->view_src;
+ if (pattern.gate_up == nullptr || pattern.gate_up->op != GGML_OP_MUL_MAT_ID) {
+ return pattern;
+ }
+
+ pattern.ids_match = pattern.gate_up->src[2] == pattern.ids;
+ pattern.swiglu = ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU;
+ pattern.supported_type = down->src[0] != nullptr &&
+ (down->src[0]->type == GGML_TYPE_NVFP4 || down->src[0]->type == GGML_TYPE_MXFP4);
+ pattern.supported = pattern.ids_match && pattern.swiglu && pattern.supported_type;
+
+ return pattern;
+}
+
+static ggml_cuda_moe_whole_pattern ggml_cuda_moe_whole_pattern_detect_early(const ggml_cgraph * cgraph, int i) {
+ ggml_cuda_moe_whole_pattern pattern{};
+
+ if (cgraph == nullptr || i + 4 >= cgraph->n_nodes) {
+ return pattern;
+ }
+
+ const ggml_tensor * gate_up = cgraph->nodes[i + 0];
+ const ggml_tensor * view0 = cgraph->nodes[i + 1];
+ const ggml_tensor * view1 = cgraph->nodes[i + 2];
+ const ggml_tensor * glu = cgraph->nodes[i + 3];
+ const ggml_tensor * down = cgraph->nodes[i + 4];
+
+ pattern.gate_up = gate_up;
+ pattern.glu = glu;
+ pattern.down = down;
+
+ if (gate_up == nullptr || view0 == nullptr || view1 == nullptr || glu == nullptr || down == nullptr ||
+ gate_up->op != GGML_OP_MUL_MAT_ID || view0->op != GGML_OP_VIEW || view1->op != GGML_OP_VIEW ||
+ glu->op != GGML_OP_GLU || down->op != GGML_OP_MUL_MAT_ID) {
+ return pattern;
+ }
+
+ pattern.view_pair = view0->view_src == gate_up && view1->view_src == gate_up;
+ if (!pattern.view_pair) {
+ return pattern;
+ }
+
+ if (glu->src[0] == view0 && glu->src[1] == view1) {
+ pattern.gate = view0;
+ pattern.up = view1;
+ } else if (glu->src[0] == view1 && glu->src[1] == view0) {
+ pattern.gate = view1;
+ pattern.up = view0;
+ } else {
+ return pattern;
+ }
+
+ if (down->src[1] != glu) {
+ return pattern;
+ }
+
+ pattern.ids = down->src[2];
+ pattern.ids_match = gate_up->src[2] == pattern.ids;
+ pattern.swiglu = ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU;
+ pattern.supported_type = down->src[0] != nullptr &&
+ (down->src[0]->type == GGML_TYPE_NVFP4 || down->src[0]->type == GGML_TYPE_MXFP4);
+ pattern.supported = pattern.ids_match && pattern.swiglu && pattern.supported_type;
+
+ return pattern;
+}
+
+static bool ggml_cuda_moe_whole_pattern_exec_proof(
+ ggml_backend_cuda_context * cuda_ctx,
+ const ggml_cuda_moe_whole_pattern & pattern) {
+ GGML_ASSERT(cuda_ctx != nullptr);
+ GGML_ASSERT(pattern.supported);
+ GGML_ASSERT(pattern.gate_up != nullptr);
+ GGML_ASSERT(pattern.glu != nullptr);
+ GGML_ASSERT(pattern.down != nullptr);
+
+ if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast<ggml_tensor *>(pattern.gate_up))) {
+ return false;
+ }
+ if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast<ggml_tensor *>(pattern.glu))) {
+ return false;
+ }
+ if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast<ggml_tensor *>(pattern.down))) {
+ return false;
+ }
+
+ return true;
+}
+
// try and fuse nodes and return the number of nodes to skip
static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, int i) {
@@ -4042,6 +4238,112 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
ggml_tensor * node = cgraph->nodes[i];
+ static std::atomic<int> moe_whole_pattern_early_trace_count{0};
+ const bool routed_ffn_poc = ggml_cuda_moe_routed_ffn_poc_enabled();
+ const bool whole_pattern_exec = ggml_cuda_moe_whole_pattern_exec_enabled();
+ const int whole_pattern_early_trace_limit = ggml_cuda_moe_whole_pattern_early_trace_limit();
+ if (node->op == GGML_OP_MUL_MAT_ID &&
+ (routed_ffn_poc || whole_pattern_exec || whole_pattern_early_trace_limit > 0)) {
+ const ggml_cuda_moe_whole_pattern pattern = ggml_cuda_moe_whole_pattern_detect_early(cgraph, i);
+ if (pattern.view_pair) {
+ const int trace_idx = moe_whole_pattern_early_trace_count.fetch_add(1, std::memory_order_relaxed);
+ if (trace_idx < whole_pattern_early_trace_limit) {
+ const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr;
+ const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr;
+ fprintf(stderr,
+ "[LLAMA_MOE_WHOLE_PATTERN_EARLY] supported=%d skip_ready=%d gate_up=%s gate=%s up=%s glu=%s down=%s ids=%s type=%s"
+ " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64
+ " n_embd=%" PRId64 " n_ff=%" PRId64
+ " ids_match=%d swiglu=%d\n",
+ pattern.supported ? 1 : 0,
+ pattern.supported ? 4 : 0,
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.up),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.glu),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.down),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.ids),
+ down_w != nullptr ? ggml_type_name(down_w->type) : "-",
+ down_x != nullptr ? down_x->ne[2] : 0,
+ pattern.ids != nullptr ? pattern.ids->ne[0] : 0,
+ down_w != nullptr ? down_w->ne[2] : 0,
+ down_w != nullptr ? down_w->ne[1] : 0,
+ down_w != nullptr ? down_w->ne[0] : 0,
+ pattern.ids_match ? 1 : 0,
+ pattern.swiglu ? 1 : 0);
+ }
+ }
+
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+ const bool poc_supported = routed_ffn_poc && ggml_cuda_moe_routed_ffn_poc_should_engage(
+ pattern.gate_up, pattern.gate, pattern.up, pattern.glu, pattern.down, pattern.ids, cc);
+
+ if ((poc_supported || (whole_pattern_exec && pattern.supported))) {
+ const bool ok = poc_supported ?
+ ggml_cuda_moe_routed_ffn_poc(
+ *cuda_ctx,
+ const_cast<ggml_tensor *>(pattern.gate_up),
+ const_cast<ggml_tensor *>(pattern.gate),
+ const_cast<ggml_tensor *>(pattern.up),
+ const_cast<ggml_tensor *>(pattern.glu),
+ const_cast<ggml_tensor *>(pattern.down)) :
+ ggml_cuda_moe_whole_pattern_exec_proof(cuda_ctx, pattern);
+ GGML_ASSERT(ok);
+
+ static std::atomic<int> moe_whole_pattern_exec_trace_count{0};
+ if (ggml_cuda_moe_whole_pattern_exec_trace_take(moe_whole_pattern_exec_trace_count)) {
+ const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr;
+ const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr;
+ fprintf(stderr,
+ "[LLAMA_MOE_WHOLE_PATTERN_EXEC] skip=4 gate_up=%s glu=%s down=%s ids=%s"
+ " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64 "\n",
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.glu),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.down),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.ids),
+ down_x != nullptr ? down_x->ne[2] : 0,
+ pattern.ids != nullptr ? pattern.ids->ne[0] : 0,
+ down_w != nullptr ? down_w->ne[2] : 0);
+ }
+
+ return 4;
+ }
+ }
+
+ if (node->op == GGML_OP_GLU && i + 1 < cgraph->n_nodes && cgraph->nodes[i + 1]->op == GGML_OP_MUL_MAT_ID) {
+ static std::atomic<int> moe_whole_pattern_trace_count{0};
+ const bool whole_trace = ggml_cuda_moe_whole_pattern_trace_take(moe_whole_pattern_trace_count);
+
+ if (whole_trace) {
+ const ggml_tensor * down = cgraph->nodes[i + 1];
+ const ggml_cuda_moe_whole_pattern pattern = ggml_cuda_moe_whole_pattern_detect(node, down);
+
+ const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr;
+ const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr;
+ fprintf(stderr,
+ "[LLAMA_MOE_WHOLE_PATTERN] supported=%d gate_up=%s gate=%s up=%s glu=%s down=%s ids=%s type=%s"
+ " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64
+ " n_embd=%" PRId64 " n_ff=%" PRId64
+ " view_pair=%d ids_match=%d swiglu=%d\n",
+ pattern.supported ? 1 : 0,
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.up),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.glu),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.down),
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.ids),
+ down_w != nullptr ? ggml_type_name(down_w->type) : "-",
+ down_x != nullptr ? down_x->ne[2] : 0,
+ pattern.ids != nullptr ? pattern.ids->ne[0] : 0,
+ down_w != nullptr ? down_w->ne[2] : 0,
+ down_w != nullptr ? down_w->ne[1] : 0,
+ down_w != nullptr ? down_w->ne[0] : 0,
+ pattern.view_pair ? 1 : 0,
+ pattern.ids_match ? 1 : 0,
+ pattern.swiglu ? 1 : 0);
+ }
+ }
+
//topk-moe
if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index dc5c2d198..d8f39d395 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -1,4 +1,7 @@
#include <cstdlib>
+#include <atomic>
+#include <cinttypes>
+#include <cstdio>
#include "common.cuh"
#include "mmq.cuh"
#include "quantize.cuh"
@@ -75,6 +78,151 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
}
}
+static inline int ggml_cuda_quant_trace_limit() {
+ static const int value = []() {
+ const char * s = getenv("LLAMA_QUANT_TRACE");
+ return s ? atoi(s) : 0;
+ }();
+
+ return value;
+}
+
+static inline const char * ggml_cuda_quant_trace_tensor_name(const ggml_tensor * t) {
+ return t != nullptr && t->name[0] != '\0' ? t->name : "-";
+}
+
+static inline void ggml_cuda_quant_trace(
+ const char * route, const ggml_tensor * src0, const ggml_tensor * src1,
+ const ggml_tensor * ids, const ggml_tensor * dst, const int native_fp4,
+ const int dedup, const int gathered, const int64_t ne10, const int64_t ne10_padded,
+ const int64_t rows, const int64_t ne12, const int64_t n_expert_used) {
+ const int trace_limit = ggml_cuda_quant_trace_limit();
+ if (trace_limit <= 0) {
+ return;
+ }
+
+ static std::atomic<int> trace_count{0};
+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
+ if (trace_idx >= trace_limit) {
+ return;
+ }
+
+ fprintf(stderr,
+ "[LLAMA_QUANT_TRACE] route=%s src0=%s src0_type=%s src1=%s src1_type=%s dst=%s dst_type=%s ids=%s "
+ "native_fp4=%d dedup=%d gathered=%d K=%" PRId64 " Kpad=%" PRId64 " rows=%" PRId64
+ " ne12=%" PRId64 " experts=%" PRId64 "\n",
+ route, ggml_cuda_quant_trace_tensor_name(src0), src0 != nullptr ? ggml_type_name(src0->type) : "-",
+ ggml_cuda_quant_trace_tensor_name(src1), src1 != nullptr ? ggml_type_name(src1->type) : "-",
+ ggml_cuda_quant_trace_tensor_name(dst), dst != nullptr ? ggml_type_name(dst->type) : "-",
+ ggml_cuda_quant_trace_tensor_name(ids), native_fp4, dedup, gathered,
+ ne10, ne10_padded, rows, ne12, n_expert_used);
+}
+
+ggml_cuda_mmq_ids_meta::ggml_cuda_mmq_ids_meta(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts)
+ : ne_get_rows(ne_get_rows) {
+ alloc(pool, ne_get_rows, n_experts);
+}
+
+void ggml_cuda_mmq_ids_meta::alloc(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts) {
+ ids_src1.alloc(pool, ne_get_rows);
+ ids_dst.alloc(pool, ne_get_rows);
+ expert_bounds.alloc(pool, n_experts + 1);
+ this->ne_get_rows = ne_get_rows;
+}
+
+void ggml_cuda_mmq_ids_meta::build(
+ const ggml_tensor * ids, int64_t n_experts, int64_t n_tokens,
+ int64_t n_expert_used, int64_t nchannels_y, int64_t sis1,
+ cudaStream_t stream) {
+ GGML_ASSERT(ids->nb[0] == ggml_element_size(ids));
+ const int si1 = ids->nb[1] / ggml_element_size(ids);
+
+ ggml_cuda_launch_mm_ids_helper(
+ (const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
+ n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
+ CUDA_CHECK(cudaGetLastError());
+}
+
+static void ggml_cuda_mul_mat_q_moe_quantized_impl(
+ ggml_backend_cuda_context & ctx,
+ const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst,
+ const int32_t * ids_dst, const int32_t * expert_bounds,
+ int64_t n_tokens, int64_t n_expert_used, int64_t n_experts,
+ int64_t ncols_src1_padded) {
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+ GGML_ASSERT( src1_q != nullptr);
+ GGML_ASSERT( ids_dst != nullptr);
+ GGML_ASSERT( expert_bounds != nullptr);
+ GGML_ASSERT( n_tokens > 0);
+ GGML_ASSERT( n_expert_used > 0);
+ GGML_ASSERT( n_experts > 0);
+ GGML_ASSERT( ncols_src1_padded > 0);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+ const int64_t ne3 = dst->ne[3];
+ const int64_t nb00 = src0->nb[0];
+ const int64_t nb01 = src0->nb[1];
+ const int64_t nb02 = src0->nb[2];
+ const int64_t nb03 = src0->nb[3];
+ const int64_t nb0 = dst->nb[0];
+ const int64_t nb1 = dst->nb[1];
+ const int64_t nb2 = dst->nb[2];
+ const int64_t nb3 = dst->nb[3];
+
+ const size_t ts_src0 = ggml_type_size(src0->type);
+ const size_t ts_dst = ggml_type_size(dst->type);
+
+ GGML_ASSERT(nb00 == (int64_t) ts_src0);
+ GGML_ASSERT(nb0 == (int64_t) ts_dst);
+ GGML_ASSERT(ne00 <= ncols_src1_padded);
+ GGML_ASSERT(ne01 == ne0);
+ GGML_ASSERT(ne02 == n_experts);
+ GGML_ASSERT(ne1 == n_expert_used);
+ GGML_ASSERT(ne2 == n_tokens);
+
+ cudaStream_t stream = ctx.stream();
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+ const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+ || GGML_CUDA_CC_IS_CDNA(cc);
+
+ const int64_t ne_get_rows = n_tokens * n_expert_used;
+ const int64_t s01 = nb01 / ts_src0;
+ const int64_t s1 = nb1 / ts_dst;
+ const int64_t s02 = nb02 / ts_src0;
+ const int64_t s2 = nb2 / ts_dst;
+ const int64_t s03 = nb03 / ts_src0;
+ const int64_t s3 = nb3 / ts_dst;
+ const int64_t s12 = ne_get_rows * ncols_src1_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int));
+ const int64_t s13 = n_tokens * s12;
+
+ const mmq_args args = {
+ (const char *) src0->data, src0->type, (const int *) src1_q, ids_dst, expert_bounds, (float *) dst->data,
+ ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
+ n_experts, n_experts, s02, s12, s2,
+ ne03, ne3, s03, s13, s3,
+ use_stream_k, n_tokens};
+
+ ggml_cuda_quant_trace("mmq_moe_quantized_raw", src0, nullptr, nullptr, dst, 1,
+ 0, 0, ne00, ncols_src1_padded, ne_get_rows, n_tokens, n_expert_used);
+ ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
+}
+
+void ggml_cuda_mul_mat_q_moe_quantized(
+ ggml_backend_cuda_context & ctx,
+ const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst,
+ const int32_t * ids_dst, const int32_t * expert_bounds,
+ int64_t n_tokens, int64_t n_expert_used, int64_t n_experts,
+ int64_t ncols_src1_padded) {
+ ggml_cuda_mul_mat_q_moe_quantized_impl(ctx, src0, src1_q, dst, ids_dst, expert_bounds,
+ n_tokens, n_expert_used, n_experts, ncols_src1_padded);
+}
+
void ggml_cuda_mul_mat_q(
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index b53e38a8b..690dc694a 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -4334,6 +4334,31 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
void ggml_cuda_mul_mat_q(
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
+struct ggml_cuda_mmq_ids_meta {
+ ggml_cuda_pool_alloc<int32_t> ids_src1;
+ ggml_cuda_pool_alloc<int32_t> ids_dst;
+ ggml_cuda_pool_alloc<int32_t> expert_bounds;
+
+ int64_t ne_get_rows = 0;
+
+ ggml_cuda_mmq_ids_meta() = default;
+ ggml_cuda_mmq_ids_meta(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts);
+
+ void alloc(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts);
+
+ void build(
+ const ggml_tensor * ids, int64_t n_experts, int64_t n_tokens,
+ int64_t n_expert_used, int64_t nchannels_y, int64_t sis1,
+ cudaStream_t stream);
+};
+
+void ggml_cuda_mul_mat_q_moe_quantized(
+ ggml_backend_cuda_context & ctx,
+ const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst,
+ const int32_t * ids_dst, const int32_t * expert_bounds,
+ int64_t n_tokens, int64_t n_expert_used, int64_t n_experts,
+ int64_t ncols_src1_padded);
+
void ggml_cuda_op_mul_mat_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
diff --git a/ggml/src/ggml-cuda/moe-ffn.cu b/ggml/src/ggml-cuda/moe-ffn.cu
new file mode 100644
index 000000000..507390d65
--- /dev/null
+++ b/ggml/src/ggml-cuda/moe-ffn.cu
@@ -0,0 +1,296 @@
+#include "moe-ffn.cuh"
+#include "getrows.cuh"
+#include "mmq.cuh"
+#include "unary.cuh"
+
+#include <cstdlib>
+
+bool ggml_cuda_moe_routed_ffn_poc_enabled() {
+ static const bool enabled = [] {
+ const char * value = getenv("LLAMA_MOE_ROUTED_FFN_POC");
+ return value != nullptr && atoi(value) != 0;
+ }();
+ return enabled;
+}
+
+bool ggml_cuda_moe_routed_ffn_poc_should_engage(
+ const ggml_tensor * gate_up,
+ const ggml_tensor * gate,
+ const ggml_tensor * up,
+ const ggml_tensor * glu,
+ const ggml_tensor * down,
+ const ggml_tensor * ids,
+ int cc) {
+ if (!blackwell_mma_available(cc)) {
+ return false;
+ }
+ if (gate_up == nullptr || gate == nullptr || up == nullptr || glu == nullptr || down == nullptr || ids == nullptr) {
+ return false;
+ }
+ if (gate_up->op != GGML_OP_MUL_MAT_ID || down->op != GGML_OP_MUL_MAT_ID) {
+ return false;
+ }
+ if (gate->op != GGML_OP_VIEW || up->op != GGML_OP_VIEW || gate->view_src != gate_up || up->view_src != gate_up) {
+ return false;
+ }
+ if (glu->op != GGML_OP_GLU || ggml_get_glu_op(glu) != GGML_GLU_OP_SWIGLU || down->src[1] != glu) {
+ return false;
+ }
+ if (gate_up->src[2] != ids || down->src[2] != ids) {
+ return false;
+ }
+
+ const ggml_tensor * down_w = down->src[0];
+ if (down_w == nullptr || (down_w->type != GGML_TYPE_NVFP4 && down_w->type != GGML_TYPE_MXFP4)) {
+ return false;
+ }
+
+ return true;
+}
+
+static bool ggml_cuda_moe_routed_ffn_fused_quant_enabled() {
+ static const bool enabled = [] {
+ const char * value = getenv("LLAMA_MOE_ROUTED_FFN_FUSED_QUANT");
+ return value != nullptr && atoi(value) != 0;
+ }();
+ return enabled;
+}
+
+static bool ggml_cuda_moe_routed_ffn_down_supported(const ggml_tensor * glu, const ggml_tensor * down) {
+ const ggml_tensor * down_w = down != nullptr ? down->src[0] : nullptr;
+ const ggml_tensor * ids = down != nullptr ? down->src[2] : nullptr;
+ if (glu == nullptr || down == nullptr || down_w == nullptr || ids == nullptr) {
+ return false;
+ }
+ if (down_w->type != GGML_TYPE_NVFP4 && down_w->type != GGML_TYPE_MXFP4) {
+ return false;
+ }
+ if (glu->type != GGML_TYPE_F32 || down->type != GGML_TYPE_F32 || ids->type != GGML_TYPE_I32) {
+ return false;
+ }
+ if (glu->ne[3] != 1 || down->ne[3] != 1 || ids->ne[2] != 1 || ids->ne[3] != 1) {
+ return false;
+ }
+ if (down_w->ne[0] != glu->ne[0] || down_w->ne[1] != down->ne[0] || down_w->ne[2] <= 0) {
+ return false;
+ }
+ if (ids->ne[0] != glu->ne[1] || ids->ne[1] != glu->ne[2]) {
+ return false;
+ }
+ if (down->ne[1] != glu->ne[1] || down->ne[2] != glu->ne[2]) {
+ return false;
+ }
+ if (ids->nb[0] != ggml_element_size(ids)) {
+ return false;
+ }
+ if (glu->nb[0] != sizeof(float) || down->nb[0] != sizeof(float)) {
+ return false;
+ }
+ if (glu->nb[1] != (size_t) (glu->ne[0] * (int64_t) sizeof(float)) ||
+ glu->nb[2] != (size_t) (glu->ne[1] * (int64_t) glu->nb[1])) {
+ return false;
+ }
+ if (down->nb[1] != (size_t) (down->ne[0] * (int64_t) sizeof(float)) ||
+ down->nb[2] != (size_t) (down->ne[1] * (int64_t) down->nb[1])) {
+ return false;
+ }
+
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+ return ggml_cuda_should_use_mmq(down_w->type, cc, glu->ne[2], down_w->ne[2]);
+}
+
+static __global__ void moe_swiglu_nvfp4_quant_kernel(
+ const float * __restrict__ gate,
+ const float * __restrict__ up,
+ const int32_t * __restrict__ ids_src1,
+ void * __restrict__ vy,
+ int64_t n_ff,
+ int64_t n_ff_padded,
+ int64_t n_rows,
+ int64_t n_used,
+ int64_t gate_s1,
+ int64_t gate_s2,
+ int64_t up_s1,
+ int64_t up_s2) {
+#if defined(BLACKWELL_MMA_AVAILABLE)
+ const int64_t i0_base = ((int64_t) blockDim.x * blockIdx.y + threadIdx.x) * QK_NVFP4_SUB;
+ if (i0_base >= n_ff_padded) {
+ return;
+ }
+
+ const int64_t row = blockIdx.x;
+ const int64_t src_row = ids_src1[row];
+ const int64_t token = src_row / n_used;
+ const int64_t used = src_row - token * n_used;
+ const int64_t k_block = i0_base / QK_K;
+ const int64_t blocks_per_col = (n_ff_padded + QK_K - 1) / QK_K;
+ if (k_block >= blocks_per_col) {
+ return;
+ }
+
+ block_fp4_mmq * y = (block_fp4_mmq *) vy;
+ block_fp4_mmq * yb = y + k_block * n_rows + row;
+ const int sub = (i0_base % QK_K) / QK_NVFP4_SUB;
+
+ float vals_raw[QK_NVFP4_SUB];
+ float amax_raw = 0.0f;
+#pragma unroll
+ for (int k = 0; k < QK_NVFP4_SUB; k++) {
+ const int64_t i0 = i0_base + k;
+ if (i0 < n_ff) {
+ const float g = gate[token * gate_s2 + used * gate_s1 + i0];
+ const float u = up[token * up_s2 + used * up_s1 + i0];
+ const float v = ggml_cuda_op_silu_single(g) * u;
+ vals_raw[k] = v;
+ amax_raw = fmaxf(amax_raw, fabsf(v));
+ } else {
+ vals_raw[k] = 0.0f;
+ }
+ }
+
+ static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2 };
+ const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
+
+ float best_err = FLT_MAX;
+ uint8_t fp8_code = 0;
+ float subblock_scale = 0.0f;
+
+#pragma unroll
+ for (int i = 0; i < 5; i++) {
+ const int test_code = first_fp8_code + test_offsets[i];
+ if (test_code < 0 || test_code > 0x7e) {
+ continue;
+ }
+ const uint8_t code = (uint8_t) test_code;
+ const float test_scale = ggml_cuda_ue4m3_to_fp32(code);
+ const float test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f;
+ float cur_err = 0.0f;
+#pragma unroll
+ for (int k = 0; k < QK_NVFP4_SUB; ++k) {
+ const float v = vals_raw[k];
+ const uint8_t q = ggml_cuda_float_to_fp4_e2m1(v, test_inv_scale);
+ const float err_diff = fabsf(v) - fabsf(kvalues_mxfp4[q & 0x7]) * test_scale;
+ cur_err = fmaf(err_diff, err_diff, cur_err);
+ }
+
+ if (cur_err < best_err) {
+ best_err = cur_err;
+ fp8_code = test_code;
+ subblock_scale = test_scale;
+ }
+ }
+
+ const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f;
+ uint32_t q0 = 0;
+ uint32_t q1 = 0;
+#pragma unroll
+ for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) {
+ q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 0], inv_scale) << (8 * k);
+ q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 8], inv_scale) << (8 * k + 4);
+ q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 4], inv_scale) << (8 * k);
+ q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 12], inv_scale) << (8 * k + 4);
+ }
+
+ uint32_t * yqs = reinterpret_cast<uint32_t *>(yb->qs);
+ yqs[2 * sub + 0] = q0;
+ yqs[2 * sub + 1] = q1;
+ reinterpret_cast<uint8_t *>(yb->d4)[sub] = fp8_code;
+#else
+ NO_DEVICE_CODE;
+#endif
+}
+
+static bool ggml_cuda_moe_routed_ffn_fused_quant(
+ ggml_backend_cuda_context & ctx,
+ ggml_tensor * gate,
+ ggml_tensor * up,
+ ggml_tensor * glu,
+ ggml_tensor * down) {
+ if (!ggml_cuda_moe_routed_ffn_fused_quant_enabled()) {
+ return false;
+ }
+ if (!ggml_cuda_moe_routed_ffn_down_supported(glu, down)) {
+ return false;
+ }
+ const ggml_tensor * down_w = down->src[0];
+ const ggml_tensor * ids = down->src[2];
+ if (down_w->type != GGML_TYPE_NVFP4) {
+ return false;
+ }
+ if (gate == nullptr || up == nullptr || gate->type != GGML_TYPE_F32 || up->type != GGML_TYPE_F32) {
+ return false;
+ }
+ if (gate->ne[0] != glu->ne[0] || gate->ne[1] != glu->ne[1] || gate->ne[2] != glu->ne[2] || gate->ne[3] != glu->ne[3]) {
+ return false;
+ }
+ if (up->ne[0] != glu->ne[0] || up->ne[1] != glu->ne[1] || up->ne[2] != glu->ne[2] || up->ne[3] != glu->ne[3]) {
+ return false;
+ }
+ if (gate->nb[0] != sizeof(float) || up->nb[0] != sizeof(float)) {
+ return false;
+ }
+
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+ if (!blackwell_mma_available(cc)) {
+ return false;
+ }
+
+ const int64_t n_ff = glu->ne[0];
+ const int64_t n_ff_padded = GGML_PAD(n_ff, MATRIX_ROW_PADDING);
+ if (n_ff % QK_NVFP4 != 0) {
+ return false;
+ }
+
+ const int64_t n_expert_used = ids->ne[0];
+ const int64_t n_tokens = glu->ne[2];
+ const int64_t n_experts = down_w->ne[2];
+ const int64_t ne_get_rows = n_tokens * n_expert_used;
+
+ ggml_cuda_mmq_ids_meta ids_meta(ctx.pool(), ne_get_rows, n_experts);
+ const int64_t sis1 = glu->nb[2] / glu->nb[1];
+ ids_meta.build(ids, n_experts, n_tokens, n_expert_used, glu->ne[1], sis1, ctx.stream());
+
+ const size_t nbytes_src1_q = ne_get_rows * n_ff_padded * sizeof(block_fp4_mmq) / QK_K +
+ get_mmq_x_max_host(cc) * sizeof(block_q8_1_mmq);
+ ggml_cuda_pool_alloc<char> src1_q(ctx.pool(), nbytes_src1_q);
+
+ constexpr int nvfp4_block_size = 128;
+ const int64_t block_num_y = (n_ff_padded + QK_NVFP4_SUB * nvfp4_block_size - 1) / (QK_NVFP4_SUB * nvfp4_block_size);
+ const dim3 block_size(nvfp4_block_size, 1, 1);
+ const dim3 num_blocks(ne_get_rows, block_num_y, 1);
+ moe_swiglu_nvfp4_quant_kernel<<<num_blocks, block_size, 0, ctx.stream()>>>(
+ (const float *) gate->data, (const float *) up->data, ids_meta.ids_src1.get(), src1_q.get(),
+ n_ff, n_ff_padded, ne_get_rows, n_expert_used,
+ gate->nb[1] / sizeof(float), gate->nb[2] / sizeof(float),
+ up->nb[1] / sizeof(float), up->nb[2] / sizeof(float));
+ CUDA_CHECK(cudaGetLastError());
+
+ ggml_cuda_mul_mat_q_moe_quantized(
+ ctx, down_w, src1_q.get(), down,
+ ids_meta.ids_dst.get(), ids_meta.expert_bounds.get(),
+ n_tokens, n_expert_used, n_experts, n_ff_padded);
+ return true;
+}
+
+bool ggml_cuda_moe_routed_ffn_poc(
+ ggml_backend_cuda_context & ctx,
+ ggml_tensor * gate_up,
+ ggml_tensor * gate,
+ ggml_tensor * up,
+ ggml_tensor * glu,
+ ggml_tensor * down) {
+ if (!ggml_cuda_compute_forward(ctx, gate_up)) {
+ return false;
+ }
+ if (ggml_cuda_moe_routed_ffn_fused_quant(ctx, gate, up, glu, down)) {
+ return true;
+ }
+ if (!ggml_cuda_compute_forward(ctx, glu)) {
+ return false;
+ }
+ if (!ggml_cuda_compute_forward(ctx, down)) {
+ return false;
+ }
+
+ return true;
+}
diff --git a/ggml/src/ggml-cuda/moe-ffn.cuh b/ggml/src/ggml-cuda/moe-ffn.cuh
new file mode 100644
index 000000000..ce385d31a
--- /dev/null
+++ b/ggml/src/ggml-cuda/moe-ffn.cuh
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "common.cuh"
+
+bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+bool ggml_cuda_moe_routed_ffn_poc_enabled();
+
+bool ggml_cuda_moe_routed_ffn_poc_should_engage(
+ const ggml_tensor * gate_up,
+ const ggml_tensor * gate,
+ const ggml_tensor * up,
+ const ggml_tensor * glu,
+ const ggml_tensor * down,
+ const ggml_tensor * ids,
+ int cc);
+
+bool ggml_cuda_moe_routed_ffn_poc(
+ ggml_backend_cuda_context & ctx,
+ ggml_tensor * gate_up,
+ ggml_tensor * gate,
+ ggml_tensor * up,
+ ggml_tensor * glu,
+ ggml_tensor * down);
--
2.43.0

View File

@@ -1,57 +0,0 @@
From fb9402661291e0488a3e2bf2f3948ebcd18e18c9 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 02:41:22 +0000
Subject: [PATCH] feat(server): trace speculative batch shapes
Add an env-gated LLAMA_SPEC_SHAPE_TRACE log around the server batch rows emitted by normal decode and speculative verification slots. This keeps the instrumentation default-off while exposing the row/output shape entropy that prevents CUDA graph reuse under MTP serving.
Assisted-by: Codex:gpt-5
---
tools/server/server-context.cpp | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index a77e2676d..fd8348af6 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -457,12 +457,22 @@ struct server_slot {
// add sampled token of this slot to the batch, optionally add the speculative draft tokens if any
void handle_last_sampled_token(server_batch & batch) {
+ static const bool spec_shape_trace = getenv("LLAMA_SPEC_SHAPE_TRACE") != nullptr;
+ const int32_t batch_before = batch.size();
+
bool add_ok = true;
if (spec_draft.empty()) {
// no speculative decoding
- i_batch = batch.size();
+ i_batch = batch_before;
+
+ const int32_t pos0 = prompt.tokens.pos_next();
+
+ add_ok &= batch.add(id, sampled, pos0, true);
- add_ok &= batch.add(id, sampled, prompt.tokens.pos_next(), true);
+ if (spec_shape_trace) {
+ SLT_INF(*this, "spec shape: kind=decode batch_before=%d rows=1 outputs=1 draft=0 pos0=%d slot_tokens=%zu\n",
+ batch_before, pos0, prompt.tokens.size());
+ }
SLT_DBG(*this, "slot decode token, id=%d, n_ctx = %d, n_tokens = %d, truncated = %d\n",
sampled, n_ctx, prompt.n_tokens(), truncated);
@@ -479,6 +489,12 @@ struct server_slot {
auto pos0 = prompt.tokens.pos_next();
+ if (spec_shape_trace) {
+ SLT_INF(*this, "spec shape: kind=verify batch_before=%d rows=%zu outputs=%zu draft=%zu spec_i_first=%d spec_i_last=%d pos0=%d slot_tokens=%zu\n",
+ batch_before, spec_draft.size() + 1, spec_draft.size() + 1, spec_draft.size(),
+ spec_i_batch.front(), spec_i_batch.back(), pos0, prompt.tokens.size());
+ }
+
add_ok &= batch.add(id, sampled, pos0++, true);
for (auto token : spec_draft) {
add_ok &= batch.add(this->id, token, pos0++, true);
--
2.43.0

View File

@@ -1,212 +0,0 @@
From 20a99518a39acbb4474fa9c97121fc7b9f07c1ef Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 04:27:19 +0000
Subject: [PATCH] feat(cuda): trace moe mmq batch shapes
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/mmq-shape-trace.h | 66 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/mmq.cuh | 31 ++++++++++++-
tests/CMakeLists.txt | 2 +
tests/test-cuda-mmq-shape-trace.cpp | 42 ++++++++++++++++++
4 files changed, 140 insertions(+), 1 deletion(-)
create mode 100644 ggml/src/ggml-cuda/mmq-shape-trace.h
create mode 100644 tests/test-cuda-mmq-shape-trace.cpp
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
new file mode 100644
index 000000000..9d41b7c80
--- /dev/null
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+
+struct ggml_cuda_mmq_shape {
+ int type;
+ bool is_moe;
+ int64_t ncols_dst;
+ int64_t nchannels_x;
+ int64_t ncols_max;
+ int64_t n_active_est;
+ int64_t density;
+ int mmq_x_max;
+ int mmq_x_lim;
+ int mmq_x_best;
+ int mmq_y;
+ bool use_stream_k;
+};
+
+static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
+ const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
+ const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
+ const int mmq_y, const bool use_stream_k) {
+ int64_t n_active_est = 0;
+ int64_t density = 0;
+ if (is_moe && ncols_dst > 0 && nchannels_x > 0) {
+ n_active_est = ncols_dst < nchannels_x ? ncols_dst : nchannels_x;
+ density = (ncols_dst + n_active_est - 1) / n_active_est;
+ }
+
+ return {
+ type,
+ is_moe,
+ ncols_dst,
+ nchannels_x,
+ ncols_max,
+ n_active_est,
+ density,
+ mmq_x_max,
+ mmq_x_lim,
+ mmq_x_best,
+ mmq_y,
+ use_stream_k,
+ };
+}
+
+static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, const ggml_cuda_mmq_shape & shape) {
+ return std::snprintf(buf, size,
+ "type=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld "
+ "n_active_est=%lld density=%lld mmq_x_max=%d mmq_x_lim=%d "
+ "mmq_x_best=%d mmq_y=%d stream_k=%d",
+ shape.type,
+ shape.is_moe ? 1 : 0,
+ (long long) shape.ncols_dst,
+ (long long) shape.nchannels_x,
+ (long long) shape.ncols_max,
+ (long long) shape.n_active_est,
+ (long long) shape.density,
+ shape.mmq_x_max,
+ shape.mmq_x_lim,
+ shape.mmq_x_best,
+ shape.mmq_y,
+ shape.use_stream_k ? 1 : 0);
+}
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index b53e38a8b..6bc943738 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -3,10 +3,14 @@
#include "common.cuh"
#include "vecdotq.cuh"
#include "mma.cuh"
+#include "mmq-shape-trace.h"
+#include <atomic>
#include <climits>
#include <cstdint>
+#include <cstdio>
#include <cstdlib>
+#include <cstring>
using namespace ggml_cuda_mma;
@@ -4163,6 +4167,18 @@ static inline int ggml_cuda_fp4_dense_mmq_x_cap() {
return c;
}
+static inline int ggml_cuda_moe_mmq_shape_trace_limit() {
+ static const int limit = []() -> int {
+ const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE");
+ if (s == nullptr || strcmp(s, "0") == 0) {
+ return 0;
+ }
+ const int parsed = atoi(s);
+ return parsed > 0 ? parsed : 256;
+ }();
+ return limit;
+}
+
template <ggml_type type>
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
const int id = ggml_cuda_get_device();
@@ -4249,6 +4265,20 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
}
}
+ if (args.expert_bounds != nullptr) {
+ static std::atomic<int> trace_count{0};
+ const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit();
+ const int trace_index = trace_limit > 0 ? trace_count.fetch_add(1, std::memory_order_relaxed) : trace_limit;
+ if (trace_index >= 0 && trace_index < trace_limit) {
+ char buf[256];
+ const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make(
+ (int) type, true, args.ncols_dst, args.nchannels_x, args.ncols_max,
+ mmq_x_max, mmq_x_lim, mmq_x_best, mmq_y, args.use_stream_k);
+ ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
+ fprintf(stderr, "[LLAMA_MOE_MMQ_SHAPE] %s\n", buf);
+ }
+ }
+
switch (mmq_x_best) {
case 8:
launch_mul_mat_q<type, 8>(ctx, args, stream);
@@ -4341,4 +4371,3 @@ void ggml_cuda_op_mul_mat_q(
const int64_t src1_padded_row_size, cudaStream_t stream);
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts);
-
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 24592a279..0a5194c87 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -234,6 +234,8 @@ llama_build_and_test(test-thread-safety.cpp ARGS -m "${MODEL_DEST}" -ngl 99 -p "
set_tests_properties(test-thread-safety PROPERTIES FIXTURES_REQUIRED test-download-model)
llama_build_and_test(test-arg-parser.cpp)
+llama_build_and_test(test-cuda-mmq-shape-trace.cpp)
+target_include_directories(test-cuda-mmq-shape-trace PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
# TODO: repair known memory leaks
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
new file mode 100644
index 000000000..8620169c0
--- /dev/null
+++ b/tests/test-cuda-mmq-shape-trace.cpp
@@ -0,0 +1,42 @@
+#include "ggml-cuda/mmq-shape-trace.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+static void require(bool ok, const char * what) {
+ if (!ok) {
+ std::fprintf(stderr, "require failed: %s\n", what);
+ std::exit(1);
+ }
+}
+
+int main() {
+ const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make(
+ /* type */ 39,
+ /* is_moe */ true,
+ /* ncols_dst */ 1024,
+ /* nchannels_x */ 256,
+ /* ncols_max */ 128,
+ /* mmq_x_max */ 128,
+ /* mmq_x_lim */ 64,
+ /* mmq_x_best */ 64,
+ /* mmq_y */ 128,
+ /* use_stream_k */ true);
+
+ require(shape.n_active_est == 256, "active expert estimate is capped by expert count");
+ require(shape.density == 4, "density is ceil(assignments / active experts)");
+
+ char buf[256];
+ const int n = ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
+
+ require(n > 0, "format returns byte count");
+ require(std::strstr(buf, "moe=1") != nullptr, "trace includes moe flag");
+ require(std::strstr(buf, "ncols_dst=1024") != nullptr, "trace includes routed assignment count");
+ require(std::strstr(buf, "n_active_est=256") != nullptr, "trace includes active estimate");
+ require(std::strstr(buf, "density=4") != nullptr, "trace includes density");
+ require(std::strstr(buf, "mmq_x_best=64") != nullptr, "trace includes selected tile");
+ require(std::strstr(buf, "stream_k=1") != nullptr, "trace includes stream-k flag");
+
+ return 0;
+}

View File

@@ -1,223 +0,0 @@
From c78e537b56e3446f8aa645c6700aacf263639bd8 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 04:49:55 +0000
Subject: [PATCH] feat(cuda): trace moe mmq launch shapes
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/mmq-shape-trace.h | 65 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/mmq.cuh | 47 +++++++++++++-------
tests/test-cuda-mmq-shape-trace.cpp | 35 +++++++++++++++
3 files changed, 132 insertions(+), 15 deletions(-)
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
index 9d41b7c80..98bc21f7f 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
@@ -19,6 +19,24 @@ struct ggml_cuda_mmq_shape {
bool use_stream_k;
};
+struct ggml_cuda_mmq_launch_shape {
+ int type;
+ bool is_moe;
+ int64_t ncols_dst;
+ int64_t ncols_max;
+ int mmq_x;
+ int mmq_y;
+ int ntx;
+ int nty;
+ int ntzw;
+ int ntiles_dst;
+ int nsm;
+ int tiles_nwaves;
+ int tiles_efficiency_percent;
+ int stream_k_blocks;
+ bool fixup_needed;
+};
+
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
@@ -46,6 +64,30 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
};
}
+static inline ggml_cuda_mmq_launch_shape ggml_cuda_mmq_launch_shape_make(
+ const int type, const bool is_moe, const int64_t ncols_dst, const int64_t ncols_max,
+ const int mmq_x, const int mmq_y, const int ntx, const int nty, const int ntzw,
+ const int ntiles_dst, const int nsm, const int tiles_nwaves, const int tiles_efficiency_percent,
+ const int stream_k_blocks, const bool fixup_needed) {
+ return {
+ type,
+ is_moe,
+ ncols_dst,
+ ncols_max,
+ mmq_x,
+ mmq_y,
+ ntx,
+ nty,
+ ntzw,
+ ntiles_dst,
+ nsm,
+ tiles_nwaves,
+ tiles_efficiency_percent,
+ stream_k_blocks,
+ fixup_needed,
+ };
+}
+
static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, const ggml_cuda_mmq_shape & shape) {
return std::snprintf(buf, size,
"type=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld "
@@ -64,3 +106,26 @@ static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, cons
shape.mmq_y,
shape.use_stream_k ? 1 : 0);
}
+
+static inline int ggml_cuda_mmq_launch_shape_format(
+ char * buf, const size_t size, const ggml_cuda_mmq_launch_shape & shape) {
+ return std::snprintf(buf, size,
+ "type=%d moe=%d ncols_dst=%lld ncols_max=%lld mmq_x=%d mmq_y=%d "
+ "ntx=%d nty=%d ntzw=%d ntiles_dst=%d nsm=%d tiles_nwaves=%d "
+ "tiles_efficiency=%d stream_k_blocks=%d fixup=%d",
+ shape.type,
+ shape.is_moe ? 1 : 0,
+ (long long) shape.ncols_dst,
+ (long long) shape.ncols_max,
+ shape.mmq_x,
+ shape.mmq_y,
+ shape.ntx,
+ shape.nty,
+ shape.ntzw,
+ shape.ntiles_dst,
+ shape.nsm,
+ shape.tiles_nwaves,
+ shape.tiles_efficiency_percent,
+ shape.stream_k_blocks,
+ shape.fixup_needed ? 1 : 0);
+}
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 6bc943738..34002edf7 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -3989,6 +3989,24 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
}
+static inline int ggml_cuda_moe_mmq_shape_trace_limit() {
+ static const int limit = []() -> int {
+ const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE");
+ if (s == nullptr || strcmp(s, "0") == 0) {
+ return 0;
+ }
+ const int parsed = atoi(s);
+ return parsed > 0 ? parsed : 256;
+ }();
+ return limit;
+}
+
+static inline bool ggml_cuda_moe_mmq_trace_take(std::atomic<int> & counter) {
+ const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit();
+ const int trace_index = trace_limit > 0 ? counter.fetch_add(1, std::memory_order_relaxed) : trace_limit;
+ return trace_index >= 0 && trace_index < trace_limit;
+}
+
template <ggml_type type, int mmq_x>
static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
const int id = ggml_cuda_get_device();
@@ -4054,6 +4072,19 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;
+ if (args.expert_bounds != nullptr) {
+ static std::atomic<int> trace_count{0};
+ if (ggml_cuda_moe_mmq_trace_take(trace_count)) {
+ char buf[256];
+ const ggml_cuda_mmq_launch_shape shape = ggml_cuda_mmq_launch_shape_make(
+ (int) type, true, args.ncols_dst, args.ncols_max, mmq_x, mmq_y,
+ ntx, nty, ntzw, ntiles_dst, nsm, tiles_nwaves, tiles_efficiency_percent,
+ block_nums_stream_k.x, fixup_needed);
+ ggml_cuda_mmq_launch_shape_format(buf, sizeof(buf), shape);
+ fprintf(stderr, "[LLAMA_MOE_MMQ_LAUNCH] %s\n", buf);
+ }
+ }
+
ggml_cuda_pool & pool = ctx.pool(id);
ggml_cuda_pool_alloc<float> tmp_fixup(pool);
if (fixup_needed) {
@@ -4167,18 +4198,6 @@ static inline int ggml_cuda_fp4_dense_mmq_x_cap() {
return c;
}
-static inline int ggml_cuda_moe_mmq_shape_trace_limit() {
- static const int limit = []() -> int {
- const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE");
- if (s == nullptr || strcmp(s, "0") == 0) {
- return 0;
- }
- const int parsed = atoi(s);
- return parsed > 0 ? parsed : 256;
- }();
- return limit;
-}
-
template <ggml_type type>
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
const int id = ggml_cuda_get_device();
@@ -4267,9 +4286,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
if (args.expert_bounds != nullptr) {
static std::atomic<int> trace_count{0};
- const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit();
- const int trace_index = trace_limit > 0 ? trace_count.fetch_add(1, std::memory_order_relaxed) : trace_limit;
- if (trace_index >= 0 && trace_index < trace_limit) {
+ if (ggml_cuda_moe_mmq_trace_take(trace_count)) {
char buf[256];
const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make(
(int) type, true, args.ncols_dst, args.nchannels_x, args.ncols_max,
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
index 8620169c0..86ee15e02 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
+++ b/tests/test-cuda-mmq-shape-trace.cpp
@@ -38,5 +38,40 @@ int main() {
require(std::strstr(buf, "mmq_x_best=64") != nullptr, "trace includes selected tile");
require(std::strstr(buf, "stream_k=1") != nullptr, "trace includes stream-k flag");
+ const ggml_cuda_mmq_launch_shape launch = ggml_cuda_mmq_launch_shape_make(
+ /* type */ 39,
+ /* is_moe */ true,
+ /* ncols_dst */ 1024,
+ /* ncols_max */ 128,
+ /* mmq_x */ 64,
+ /* mmq_y */ 128,
+ /* ntx */ 2,
+ /* nty */ 4,
+ /* ntzw */ 3,
+ /* ntiles_dst */ 24,
+ /* nsm */ 16,
+ /* tiles_nwaves */ 2,
+ /* tiles_efficiency_percent */ 75,
+ /* stream_k_blocks */ 16,
+ /* fixup_needed */ true);
+
+ const int launch_n = ggml_cuda_mmq_launch_shape_format(buf, sizeof(buf), launch);
+
+ require(launch_n > 0, "launch format returns byte count");
+ require(std::strstr(buf, "moe=1") != nullptr, "launch trace includes moe flag");
+ require(std::strstr(buf, "ncols_dst=1024") != nullptr, "launch trace includes routed assignment count");
+ require(std::strstr(buf, "ncols_max=128") != nullptr, "launch trace includes max column count");
+ require(std::strstr(buf, "mmq_x=64") != nullptr, "launch trace includes compiled x tile");
+ require(std::strstr(buf, "mmq_y=128") != nullptr, "launch trace includes compiled y tile");
+ require(std::strstr(buf, "ntx=2") != nullptr, "launch trace includes x tile count");
+ require(std::strstr(buf, "nty=4") != nullptr, "launch trace includes y tile count");
+ require(std::strstr(buf, "ntzw=3") != nullptr, "launch trace includes batch tile count");
+ require(std::strstr(buf, "ntiles_dst=24") != nullptr, "launch trace includes total tile count");
+ require(std::strstr(buf, "nsm=16") != nullptr, "launch trace includes SM count");
+ require(std::strstr(buf, "tiles_nwaves=2") != nullptr, "launch trace includes wave count");
+ require(std::strstr(buf, "tiles_efficiency=75") != nullptr, "launch trace includes stream-k efficiency");
+ require(std::strstr(buf, "stream_k_blocks=16") != nullptr, "launch trace includes actual stream-k block count");
+ require(std::strstr(buf, "fixup=1") != nullptr, "launch trace includes fixup flag");
+
return 0;
}

View File

@@ -1,182 +0,0 @@
From 2a9964d290a543d14db972d8d2927ee9d2974f7e Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 05:05:17 +0000
Subject: [PATCH] feat(cuda): trace moe small-m mmq candidates
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/mmq-shape-trace.h | 58 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/mmq.cuh | 27 +++++++++++++
tests/test-cuda-mmq-shape-trace.cpp | 35 +++++++++++++++++
3 files changed, 120 insertions(+)
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
index 98bc21f7f..47453d91f 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
@@ -37,6 +37,18 @@ struct ggml_cuda_mmq_launch_shape {
bool fixup_needed;
};
+struct ggml_cuda_mmq_small_m_shape {
+ bool is_moe;
+ int64_t ncols_dst;
+ int64_t nchannels_x;
+ int64_t ncols_max;
+ int64_t n_active_est;
+ int64_t density;
+ int mmq_x_best;
+ bool use_stream_k;
+ bool is_candidate;
+};
+
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
@@ -64,6 +76,36 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
};
}
+static inline ggml_cuda_mmq_small_m_shape ggml_cuda_mmq_small_m_shape_make(
+ const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
+ const int64_t ncols_max, const int mmq_x_best, const bool use_stream_k) {
+ int64_t n_active_est = 0;
+ int64_t density = 0;
+ if (is_moe && ncols_dst > 0 && nchannels_x > 0) {
+ n_active_est = ncols_dst < nchannels_x ? ncols_dst : nchannels_x;
+ density = (ncols_dst + n_active_est - 1) / n_active_est;
+ }
+
+ const bool is_candidate =
+ is_moe &&
+ use_stream_k &&
+ ncols_max > 0 && ncols_max <= 128 &&
+ density > 0 && density <= 4 &&
+ mmq_x_best > 0 && mmq_x_best <= 64;
+
+ return {
+ is_moe,
+ ncols_dst,
+ nchannels_x,
+ ncols_max,
+ n_active_est,
+ density,
+ mmq_x_best,
+ use_stream_k,
+ is_candidate,
+ };
+}
+
static inline ggml_cuda_mmq_launch_shape ggml_cuda_mmq_launch_shape_make(
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t ncols_max,
const int mmq_x, const int mmq_y, const int ntx, const int nty, const int ntzw,
@@ -129,3 +171,19 @@ static inline int ggml_cuda_mmq_launch_shape_format(
shape.stream_k_blocks,
shape.fixup_needed ? 1 : 0);
}
+
+static inline int ggml_cuda_mmq_small_m_shape_format(
+ char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
+ return std::snprintf(buf, size,
+ "candidate=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld "
+ "n_active_est=%lld density=%lld mmq_x_best=%d stream_k=%d",
+ shape.is_candidate ? 1 : 0,
+ shape.is_moe ? 1 : 0,
+ (long long) shape.ncols_dst,
+ (long long) shape.nchannels_x,
+ (long long) shape.ncols_max,
+ (long long) shape.n_active_est,
+ (long long) shape.density,
+ shape.mmq_x_best,
+ shape.use_stream_k ? 1 : 0);
+}
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 34002edf7..25ead9e7b 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -4007,6 +4007,24 @@ static inline bool ggml_cuda_moe_mmq_trace_take(std::atomic<int> & counter) {
return trace_index >= 0 && trace_index < trace_limit;
}
+static inline int ggml_cuda_moe_mmq_small_m_trace_limit() {
+ static const int limit = []() -> int {
+ const char * s = getenv("LLAMA_MOE_MMQ_SMALL_M_TRACE");
+ if (s == nullptr || strcmp(s, "0") == 0) {
+ return 0;
+ }
+ const int parsed = atoi(s);
+ return parsed > 0 ? parsed : 256;
+ }();
+ return limit;
+}
+
+static inline bool ggml_cuda_moe_mmq_small_m_trace_take(std::atomic<int> & counter) {
+ const int trace_limit = ggml_cuda_moe_mmq_small_m_trace_limit();
+ const int trace_index = trace_limit > 0 ? counter.fetch_add(1, std::memory_order_relaxed) : trace_limit;
+ return trace_index >= 0 && trace_index < trace_limit;
+}
+
template <ggml_type type, int mmq_x>
static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
const int id = ggml_cuda_get_device();
@@ -4294,6 +4312,15 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
fprintf(stderr, "[LLAMA_MOE_MMQ_SHAPE] %s\n", buf);
}
+
+ static std::atomic<int> small_m_trace_count{0};
+ const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make(
+ true, args.ncols_dst, args.nchannels_x, args.ncols_max, mmq_x_best, args.use_stream_k);
+ if (small_m.is_candidate && ggml_cuda_moe_mmq_small_m_trace_take(small_m_trace_count)) {
+ char buf[256];
+ ggml_cuda_mmq_small_m_shape_format(buf, sizeof(buf), small_m);
+ fprintf(stderr, "[LLAMA_MOE_MMQ_SMALL_M] %s\n", buf);
+ }
}
switch (mmq_x_best) {
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
index 86ee15e02..9f36ce1a1 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
+++ b/tests/test-cuda-mmq-shape-trace.cpp
@@ -73,5 +73,40 @@ int main() {
require(std::strstr(buf, "stream_k_blocks=16") != nullptr, "launch trace includes actual stream-k block count");
require(std::strstr(buf, "fixup=1") != nullptr, "launch trace includes fixup flag");
+ const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make(
+ /* is_moe */ true,
+ /* ncols_dst */ 1024,
+ /* nchannels_x */ 256,
+ /* ncols_max */ 128,
+ /* mmq_x_best */ 64,
+ /* use_stream_k */ true);
+
+ require(small_m.is_candidate, "decode-like low-density MoE shape is a small-M candidate");
+ require(small_m.n_active_est == 256, "small-M active estimate is capped by expert count");
+ require(small_m.density == 4, "small-M density is ceil(assignments / active experts)");
+
+ require(!ggml_cuda_mmq_small_m_shape_make(
+ /* is_moe */ false, 1024, 256, 128, 64, true).is_candidate,
+ "dense shape is excluded");
+ require(!ggml_cuda_mmq_small_m_shape_make(
+ /* is_moe */ true, 4096, 256, 512, 128, true).is_candidate,
+ "prefill-like shape is excluded");
+ require(!ggml_cuda_mmq_small_m_shape_make(
+ /* is_moe */ true, 4096, 256, 128, 64, true).is_candidate,
+ "high-density shape is excluded");
+ require(!ggml_cuda_mmq_small_m_shape_make(
+ /* is_moe */ true, 1024, 256, 128, 128, true).is_candidate,
+ "large selected tile is excluded");
+ require(!ggml_cuda_mmq_small_m_shape_make(
+ /* is_moe */ true, 1024, 256, 128, 64, false).is_candidate,
+ "non-stream-k shape is excluded");
+
+ const int small_m_n = ggml_cuda_mmq_small_m_shape_format(buf, sizeof(buf), small_m);
+
+ require(small_m_n > 0, "small-M format returns byte count");
+ require(std::strstr(buf, "candidate=1") != nullptr, "small-M trace includes candidate flag");
+ require(std::strstr(buf, "density=4") != nullptr, "small-M trace includes density");
+ require(std::strstr(buf, "mmq_x_best=64") != nullptr, "small-M trace includes selected tile");
+
return 0;
}

View File

@@ -1,80 +0,0 @@
From fbed2abaa9f5af8e500f95c8dda86b305450ceff Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 05:17:39 +0000
Subject: [PATCH] feat(cuda): gate moe small-m mmq tile policy
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/mmq-shape-trace.h | 9 +++++++++
ggml/src/ggml-cuda/mmq.cuh | 13 +++++++++++++
tests/test-cuda-mmq-shape-trace.cpp | 10 ++++++++++
3 files changed, 32 insertions(+)
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
index 47453d91f..dfb4e898a 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
@@ -187,3 +187,12 @@ static inline int ggml_cuda_mmq_small_m_shape_format(
shape.mmq_x_best,
shape.use_stream_k ? 1 : 0);
}
+
+static inline int ggml_cuda_mmq_small_m_tile_limit(
+ const ggml_cuda_mmq_small_m_shape & shape, const int current_limit, const int requested_tile) {
+ if (!shape.is_candidate || requested_tile < 8 || requested_tile >= current_limit) {
+ return current_limit;
+ }
+
+ return requested_tile;
+}
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 25ead9e7b..16b3fcca4 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -4201,6 +4201,15 @@ static inline int ggml_cuda_moe_density_max() {
return d;
}
+static inline int ggml_cuda_moe_small_m_tile() {
+ static const int t = []() -> int {
+ const char * s = getenv("LLAMA_MOE_SMALL_M_TILE");
+ const int v = s ? atoi(s) : 0;
+ return v >= 8 ? v : 0;
+ }();
+ return t;
+}
+
// [paged patch 0017 / track B] DENSE NVFP4 decode mmq_x re-read occupancy DIAGNOSTIC (env, default off).
// GGML_CUDA_FP4_DENSE_MMQ_X=<n> caps the dense (non-MoE) NVFP4 col-tile to <n>, splitting the M=128
// decode ubatch into ceil(128/n) col-tiles. Each col-tile re-reads the full weight set (fatal cost
@@ -4282,6 +4291,10 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
}
}
}
+
+ const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make(
+ true, args.ncols_dst, args.nchannels_x, args.ncols_max, mmq_x_lim, args.use_stream_k);
+ mmq_x_lim = ggml_cuda_mmq_small_m_tile_limit(small_m, mmq_x_lim, ggml_cuda_moe_small_m_tile());
}
int mmq_x_best = 0;
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
index 9f36ce1a1..f7863f03a 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
+++ b/tests/test-cuda-mmq-shape-trace.cpp
@@ -108,5 +108,15 @@ int main() {
require(std::strstr(buf, "density=4") != nullptr, "small-M trace includes density");
require(std::strstr(buf, "mmq_x_best=64") != nullptr, "small-M trace includes selected tile");
+ require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 0) == 64,
+ "small-M tile override is default-off");
+ require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 16) == 16,
+ "small-M tile override caps candidate tile limit");
+ require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 128) == 64,
+ "small-M tile override ignores non-smaller tiles");
+ require(ggml_cuda_mmq_small_m_tile_limit(
+ ggml_cuda_mmq_small_m_shape_make(/* is_moe */ true, 4096, 256, 512, 128, true), 128, 16) == 128,
+ "small-M tile override excludes prefill-like shapes");
+
return 0;
}

View File

@@ -1,292 +0,0 @@
From 6c332094ca2fbb1e3211427c5f919adcaa89c588 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 05:32:27 +0000
Subject: [PATCH] feat(cuda): trace moe mmid routes
Add a default-off LLAMA_MOE_MMID_ROUTE_TRACE diagnostic for MUL_MAT_ID dispatch routes.
The trace reports whether a call uses MMVQ, MMVF, grouped MMQ, MMF, or host-sync fallback while preserving the existing dispatch predicates.
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/ggml-cuda.cu | 34 +++++++++--
ggml/src/ggml-cuda/mmq-shape-trace.h | 88 ++++++++++++++++++++++++++++
tests/test-cuda-mmq-shape-trace.cpp | 82 ++++++++++++++++++++++++++
3 files changed, 200 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 374949f25..a1754df39 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2685,6 +2685,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
}
}
+static inline int ggml_cuda_moe_mmid_route_trace_limit() {
+ static const int value = []() {
+ const char * s = getenv("LLAMA_MOE_MMID_ROUTE_TRACE");
+ return s ? atoi(s) : 0;
+ }();
+
+ return value;
+}
+
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
@@ -2697,13 +2706,30 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
GGML_TENSOR_BINARY_OP_LOCALS
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+ const bool src0_quantized = ggml_is_quantized(src0->type);
+ const int mmvq_mmid_max = src0_quantized ? get_mmvq_mmid_max_batch(src0->type, cc) : MMVQ_MAX_BATCH_SIZE;
+ const bool use_mmq = ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02);
+ const bool use_mmf = ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true);
+
+ const int mmid_trace_limit = ggml_cuda_moe_mmid_route_trace_limit();
+ if (mmid_trace_limit > 0) {
+ static std::atomic<int> trace_count{0};
+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
+ if (trace_idx < mmid_trace_limit) {
+ const ggml_cuda_mmid_route_shape route_shape = ggml_cuda_mmid_route_shape_make(
+ src0->type, ne2, ne12, /*n_experts=*/ne02, mmvq_mmid_max, use_mmq, use_mmf,
+ GGML_CUDA_CC_IS_AMD(cc), src0_quantized);
+ char buf[256];
+ ggml_cuda_mmid_route_shape_format(buf, sizeof(buf), route_shape);
+ fprintf(stderr, "[LLAMA_MOE_MMID_ROUTE] %s\n", buf);
+ }
+ }
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
- if (ggml_is_quantized(src0->type)) {
- const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
+ if (src0_quantized) {
if (ne2 <= mmvq_mmid_max) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
return;
@@ -2716,12 +2742,12 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
}
- if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
+ if (use_mmq) {
ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
return;
}
- if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
+ if (use_mmf) {
ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
return;
}
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
index dfb4e898a..da234a302 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
@@ -49,6 +49,40 @@ struct ggml_cuda_mmq_small_m_shape {
bool is_candidate;
};
+enum ggml_cuda_mmid_route {
+ GGML_CUDA_MMID_ROUTE_MMVQ,
+ GGML_CUDA_MMID_ROUTE_MMVF,
+ GGML_CUDA_MMID_ROUTE_MMQ,
+ GGML_CUDA_MMID_ROUTE_MMF,
+ GGML_CUDA_MMID_ROUTE_FALLBACK,
+};
+
+struct ggml_cuda_mmid_route_shape {
+ ggml_cuda_mmid_route route;
+ int type;
+ int64_t ne2;
+ int64_t ne12;
+ int64_t n_experts;
+ int mmvq_max;
+ bool use_mmq;
+ bool use_mmf;
+ bool is_amd;
+ bool is_quantized;
+ bool host_sync;
+};
+
+static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
+ switch (route) {
+ case GGML_CUDA_MMID_ROUTE_MMVQ: return "mmvq";
+ case GGML_CUDA_MMID_ROUTE_MMVF: return "mmvf";
+ case GGML_CUDA_MMID_ROUTE_MMQ: return "mmq";
+ case GGML_CUDA_MMID_ROUTE_MMF: return "mmf";
+ case GGML_CUDA_MMID_ROUTE_FALLBACK: return "fallback";
+ }
+
+ return "unknown";
+}
+
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
@@ -76,6 +110,42 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
};
}
+static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make(
+ const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts,
+ const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd,
+ const bool is_quantized) {
+ ggml_cuda_mmid_route route = GGML_CUDA_MMID_ROUTE_FALLBACK;
+ if (ne2 <= mmvq_max) {
+ if (is_quantized) {
+ route = GGML_CUDA_MMID_ROUTE_MMVQ;
+ } else if (is_amd) {
+ route = GGML_CUDA_MMID_ROUTE_MMVF;
+ }
+ }
+
+ if (route == GGML_CUDA_MMID_ROUTE_FALLBACK) {
+ if (use_mmq) {
+ route = GGML_CUDA_MMID_ROUTE_MMQ;
+ } else if (use_mmf) {
+ route = GGML_CUDA_MMID_ROUTE_MMF;
+ }
+ }
+
+ return {
+ route,
+ type,
+ ne2,
+ ne12,
+ n_experts,
+ mmvq_max,
+ use_mmq,
+ use_mmf,
+ is_amd,
+ is_quantized,
+ route == GGML_CUDA_MMID_ROUTE_FALLBACK,
+ };
+}
+
static inline ggml_cuda_mmq_small_m_shape ggml_cuda_mmq_small_m_shape_make(
const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
const int64_t ncols_max, const int mmq_x_best, const bool use_stream_k) {
@@ -172,6 +242,24 @@ static inline int ggml_cuda_mmq_launch_shape_format(
shape.fixup_needed ? 1 : 0);
}
+static inline int ggml_cuda_mmid_route_shape_format(
+ char * buf, const size_t size, const ggml_cuda_mmid_route_shape & shape) {
+ return std::snprintf(buf, size,
+ "route=%s type=%d host_sync=%d ne2=%lld ne12=%lld n_experts=%lld "
+ "mmvq_max=%d use_mmq=%d use_mmf=%d is_amd=%d is_quantized=%d",
+ ggml_cuda_mmid_route_name(shape.route),
+ shape.type,
+ shape.host_sync ? 1 : 0,
+ (long long) shape.ne2,
+ (long long) shape.ne12,
+ (long long) shape.n_experts,
+ shape.mmvq_max,
+ shape.use_mmq ? 1 : 0,
+ shape.use_mmf ? 1 : 0,
+ shape.is_amd ? 1 : 0,
+ shape.is_quantized ? 1 : 0);
+}
+
static inline int ggml_cuda_mmq_small_m_shape_format(
char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
return std::snprintf(buf, size,
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
index f7863f03a..e190cf1ac 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
+++ b/tests/test-cuda-mmq-shape-trace.cpp
@@ -118,5 +118,87 @@ int main() {
ggml_cuda_mmq_small_m_shape_make(/* is_moe */ true, 4096, 256, 512, 128, true), 128, 16) == 128,
"small-M tile override excludes prefill-like shapes");
+ const ggml_cuda_mmid_route_shape mmvq = ggml_cuda_mmid_route_shape_make(
+ /* type */ 39,
+ /* ne2 */ 1,
+ /* ne12 */ 1,
+ /* ne02 */ 256,
+ /* mmvq_max */ 4,
+ /* use_mmq */ true,
+ /* use_mmf */ true,
+ /* is_amd */ false,
+ /* is_quantized */ true);
+
+ require(mmvq.route == GGML_CUDA_MMID_ROUTE_MMVQ, "MMVQ wins when within its batch cap");
+ require(!mmvq.host_sync, "MMVQ route is graph-safe");
+
+ const ggml_cuda_mmid_route_shape mmq = ggml_cuda_mmid_route_shape_make(
+ /* type */ 39,
+ /* ne2 */ 128,
+ /* ne12 */ 128,
+ /* ne02 */ 256,
+ /* mmvq_max */ 4,
+ /* use_mmq */ true,
+ /* use_mmf */ true,
+ /* is_amd */ false,
+ /* is_quantized */ true);
+
+ require(mmq.route == GGML_CUDA_MMID_ROUTE_MMQ, "grouped MMQ wins after MMVQ cap");
+ require(!mmq.host_sync, "grouped MMQ route is graph-safe");
+
+ const ggml_cuda_mmid_route_shape mmf = ggml_cuda_mmid_route_shape_make(
+ /* type */ 1,
+ /* ne2 */ 128,
+ /* ne12 */ 128,
+ /* ne02 */ 256,
+ /* mmvq_max */ 4,
+ /* use_mmq */ false,
+ /* use_mmf */ true,
+ /* is_amd */ false,
+ /* is_quantized */ false);
+
+ require(mmf.route == GGML_CUDA_MMID_ROUTE_MMF, "MMF wins when grouped MMQ is unavailable");
+ require(!mmf.host_sync, "MMF route is graph-safe");
+
+ const ggml_cuda_mmid_route_shape fallback = ggml_cuda_mmid_route_shape_make(
+ /* type */ 0,
+ /* ne2 */ 128,
+ /* ne12 */ 128,
+ /* ne02 */ 256,
+ /* mmvq_max */ 4,
+ /* use_mmq */ false,
+ /* use_mmf */ false,
+ /* is_amd */ false,
+ /* is_quantized */ false);
+
+ require(fallback.route == GGML_CUDA_MMID_ROUTE_FALLBACK, "fallback is used when device routes do not match");
+ require(fallback.host_sync, "fallback route requires host synchronization");
+
+ const ggml_cuda_mmid_route_shape amd_mmvf = ggml_cuda_mmid_route_shape_make(
+ /* type */ 1,
+ /* ne2 */ 1,
+ /* ne12 */ 1,
+ /* ne02 */ 256,
+ /* mmvq_max */ 4,
+ /* use_mmq */ false,
+ /* use_mmf */ false,
+ /* is_amd */ true,
+ /* is_quantized */ false);
+
+ require(amd_mmvf.route == GGML_CUDA_MMID_ROUTE_MMVF, "AMD float vector route wins within MMVQ cap");
+ require(!amd_mmvf.host_sync, "AMD float vector route is graph-safe");
+
+ const int route_n = ggml_cuda_mmid_route_shape_format(buf, sizeof(buf), mmq);
+
+ require(route_n > 0, "MMID route format returns byte count");
+ require(std::strstr(buf, "route=mmq") != nullptr, "MMID trace includes route name");
+ require(std::strstr(buf, "host_sync=0") != nullptr, "MMID trace includes host sync flag");
+ require(std::strstr(buf, "ne2=128") != nullptr, "MMID trace includes destination batch");
+ require(std::strstr(buf, "ne12=128") != nullptr, "MMID trace includes routed token count");
+ require(std::strstr(buf, "n_experts=256") != nullptr, "MMID trace includes expert count");
+ require(std::strstr(buf, "mmvq_max=4") != nullptr, "MMID trace includes MMVQ cap");
+ require(std::strstr(buf, "use_mmq=1") != nullptr, "MMID trace includes MMQ predicate");
+ require(std::strstr(buf, "use_mmf=1") != nullptr, "MMID trace includes MMF predicate");
+
return 0;
}
--
2.43.0

View File

@@ -1,345 +0,0 @@
From 486c28c63d5297afd06e5a2bdbd4fb89cad749cd Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 05:49:12 +0000
Subject: [PATCH] feat(cuda): trace mul mat routes
Add a default-off LLAMA_MUL_MAT_ROUTE_TRACE diagnostic for regular MUL_MAT dispatch routes.
The trace classifies projection-heavy calls without changing dispatch behavior.
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/ggml-cuda.cu | 44 +++++++++-
ggml/src/ggml-cuda/mmq-shape-trace.h | 117 +++++++++++++++++++++++++++
tests/test-cuda-mmq-shape-trace.cpp | 85 +++++++++++++++++++
3 files changed, 244 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index a1754df39..cd34aff13 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2580,6 +2580,32 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
return use_mul_mat_vec_q;
}
+static inline int ggml_cuda_mul_mat_route_trace_limit() {
+ static const int value = []() {
+ const char * s = getenv("LLAMA_MUL_MAT_ROUTE_TRACE");
+ return s ? atoi(s) : 0;
+ }();
+
+ return value;
+}
+
+static inline void ggml_cuda_mul_mat_route_trace(const ggml_cuda_mul_mat_route_shape & shape) {
+ const int trace_limit = ggml_cuda_mul_mat_route_trace_limit();
+ if (trace_limit <= 0) {
+ return;
+ }
+
+ static std::atomic<int> trace_count{0};
+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
+ if (trace_idx >= trace_limit) {
+ return;
+ }
+
+ char buf[256];
+ ggml_cuda_mul_mat_route_shape_format(buf, sizeof(buf), shape);
+ fprintf(stderr, "[LLAMA_MUL_MAT_ROUTE] %s\n", buf);
+}
+
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
@@ -2591,6 +2617,10 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
if (!split) {
const int cc_fp4 = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (ggml_cuda_fp4_prefill_should_engage(src0, src1, dst, cc_fp4)) {
+ ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make(
+ src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split,
+ /*use_vec_f=*/false, /*use_mat_f=*/false, /*use_vec_q=*/false, /*use_mmq=*/false,
+ /*use_batched_cublas=*/false, /*use_fp4_prefill=*/true, /*use_fwht=*/false));
ggml_cuda_mul_mat_fp4_large_m(ctx, src0, src1, dst);
return;
}
@@ -2654,12 +2684,23 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
+ bool use_batched_cublas = !split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
+ && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1;
const int32_t hint = ggml_get_op_params_i32(dst, 1);
if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) {
+ ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make(
+ src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split,
+ use_mul_mat_vec_f, use_mul_mat_f, use_mul_mat_vec_q, use_mul_mat_q,
+ use_batched_cublas, /*use_fp4_prefill=*/false, /*use_fwht=*/true));
return;
}
+ ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make(
+ src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split,
+ use_mul_mat_vec_f, use_mul_mat_f, use_mul_mat_vec_q, use_mul_mat_q,
+ use_batched_cublas, /*use_fp4_prefill=*/false, /*use_fwht=*/false));
+
if (!split && use_mul_mat_vec_f) {
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
@@ -2670,8 +2711,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
} else if (!split && use_mul_mat_q) {
ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
- } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
- && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+ } else if (use_batched_cublas) {
// general KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_mul_mat_vec_f) {
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
index da234a302..8ac373fd9 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
@@ -71,6 +71,37 @@ struct ggml_cuda_mmid_route_shape {
bool host_sync;
};
+enum ggml_cuda_mul_mat_route {
+ GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL,
+ GGML_CUDA_MUL_MAT_ROUTE_FWHT,
+ GGML_CUDA_MUL_MAT_ROUTE_VEC_F,
+ GGML_CUDA_MUL_MAT_ROUTE_MAT_F,
+ GGML_CUDA_MUL_MAT_ROUTE_VEC_Q,
+ GGML_CUDA_MUL_MAT_ROUTE_MMQ,
+ GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS,
+ GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F,
+ GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q,
+ GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ,
+ GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS,
+};
+
+struct ggml_cuda_mul_mat_route_shape {
+ ggml_cuda_mul_mat_route route;
+ int type;
+ int64_t ne1;
+ int64_t ne11;
+ int64_t ne12;
+ int64_t ne13;
+ bool split;
+ bool use_vec_f;
+ bool use_mat_f;
+ bool use_vec_q;
+ bool use_mmq;
+ bool use_batched_cublas;
+ bool use_fp4_prefill;
+ bool use_fwht;
+};
+
static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
switch (route) {
case GGML_CUDA_MMID_ROUTE_MMVQ: return "mmvq";
@@ -83,6 +114,24 @@ static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route
return "unknown";
}
+static inline const char * ggml_cuda_mul_mat_route_name(const ggml_cuda_mul_mat_route route) {
+ switch (route) {
+ case GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL: return "fp4_prefill";
+ case GGML_CUDA_MUL_MAT_ROUTE_FWHT: return "fwht";
+ case GGML_CUDA_MUL_MAT_ROUTE_VEC_F: return "vec_f";
+ case GGML_CUDA_MUL_MAT_ROUTE_MAT_F: return "mat_f";
+ case GGML_CUDA_MUL_MAT_ROUTE_VEC_Q: return "vec_q";
+ case GGML_CUDA_MUL_MAT_ROUTE_MMQ: return "mmq";
+ case GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS: return "batched_cublas";
+ case GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F: return "op_vec_f";
+ case GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q: return "op_vec_q";
+ case GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ: return "op_mmq";
+ case GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS: return "op_cublas";
+ }
+
+ return "unknown";
+}
+
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
@@ -110,6 +159,52 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
};
}
+static inline ggml_cuda_mul_mat_route_shape ggml_cuda_mul_mat_route_shape_make(
+ const int type, const int64_t ne1, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+ const bool split, const bool use_vec_f, const bool use_mat_f, const bool use_vec_q,
+ const bool use_mmq, const bool use_batched_cublas, const bool use_fp4_prefill,
+ const bool use_fwht) {
+ ggml_cuda_mul_mat_route route = GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS;
+ if (use_fp4_prefill) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL;
+ } else if (use_fwht) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_FWHT;
+ } else if (!split && use_vec_f) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_VEC_F;
+ } else if (!split && use_mat_f) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_MAT_F;
+ } else if (!split && use_vec_q) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_VEC_Q;
+ } else if (!split && use_mmq) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_MMQ;
+ } else if (!split && use_batched_cublas) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS;
+ } else if (use_vec_f) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F;
+ } else if (use_vec_q) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q;
+ } else if (use_mmq) {
+ route = GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ;
+ }
+
+ return {
+ route,
+ type,
+ ne1,
+ ne11,
+ ne12,
+ ne13,
+ split,
+ use_vec_f,
+ use_mat_f,
+ use_vec_q,
+ use_mmq,
+ use_batched_cublas,
+ use_fp4_prefill,
+ use_fwht,
+ };
+}
+
static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make(
const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts,
const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd,
@@ -260,6 +355,28 @@ static inline int ggml_cuda_mmid_route_shape_format(
shape.is_quantized ? 1 : 0);
}
+static inline int ggml_cuda_mul_mat_route_shape_format(
+ char * buf, const size_t size, const ggml_cuda_mul_mat_route_shape & shape) {
+ return std::snprintf(buf, size,
+ "route=%s type=%d ne1=%lld ne11=%lld ne12=%lld ne13=%lld split=%d "
+ "use_vec_f=%d use_mat_f=%d use_vec_q=%d use_mmq=%d use_batched_cublas=%d "
+ "use_fp4_prefill=%d use_fwht=%d",
+ ggml_cuda_mul_mat_route_name(shape.route),
+ shape.type,
+ (long long) shape.ne1,
+ (long long) shape.ne11,
+ (long long) shape.ne12,
+ (long long) shape.ne13,
+ shape.split ? 1 : 0,
+ shape.use_vec_f ? 1 : 0,
+ shape.use_mat_f ? 1 : 0,
+ shape.use_vec_q ? 1 : 0,
+ shape.use_mmq ? 1 : 0,
+ shape.use_batched_cublas ? 1 : 0,
+ shape.use_fp4_prefill ? 1 : 0,
+ shape.use_fwht ? 1 : 0);
+}
+
static inline int ggml_cuda_mmq_small_m_shape_format(
char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
return std::snprintf(buf, size,
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
index e190cf1ac..2bd41d1d8 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
+++ b/tests/test-cuda-mmq-shape-trace.cpp
@@ -200,5 +200,90 @@ int main() {
require(std::strstr(buf, "use_mmq=1") != nullptr, "MMID trace includes MMQ predicate");
require(std::strstr(buf, "use_mmf=1") != nullptr, "MMID trace includes MMF predicate");
+ const ggml_cuda_mul_mat_route_shape mat_f = ggml_cuda_mul_mat_route_shape_make(
+ /* type */ 30,
+ /* ne1 */ 128,
+ /* ne11 */ 128,
+ /* ne12 */ 1,
+ /* ne13 */ 1,
+ /* split */ false,
+ /* use_vec_f */ false,
+ /* use_mat_f */ true,
+ /* use_vec_q */ false,
+ /* use_mmq */ false,
+ /* use_batched_cublas */ false,
+ /* use_fp4_prefill */ false,
+ /* use_fwht */ false);
+
+ require(mat_f.route == GGML_CUDA_MUL_MAT_ROUTE_MAT_F, "regular MUL_MAT prefers direct mat_f when available");
+ require(!mat_f.split, "regular MUL_MAT trace records split flag");
+
+ const ggml_cuda_mul_mat_route_shape batched = ggml_cuda_mul_mat_route_shape_make(
+ /* type */ 31,
+ /* ne1 */ 128,
+ /* ne11 */ 128,
+ /* ne12 */ 4,
+ /* ne13 */ 1,
+ /* split */ false,
+ /* use_vec_f */ false,
+ /* use_mat_f */ false,
+ /* use_vec_q */ false,
+ /* use_mmq */ false,
+ /* use_batched_cublas */ true,
+ /* use_fp4_prefill */ false,
+ /* use_fwht */ false);
+
+ require(batched.route == GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS,
+ "regular MUL_MAT records batched cuBLAS route");
+
+ const ggml_cuda_mul_mat_route_shape op_cublas = ggml_cuda_mul_mat_route_shape_make(
+ /* type */ 0,
+ /* ne1 */ 16,
+ /* ne11 */ 16,
+ /* ne12 */ 1,
+ /* ne13 */ 1,
+ /* split */ true,
+ /* use_vec_f */ false,
+ /* use_mat_f */ false,
+ /* use_vec_q */ false,
+ /* use_mmq */ false,
+ /* use_batched_cublas */ false,
+ /* use_fp4_prefill */ false,
+ /* use_fwht */ false);
+
+ require(op_cublas.route == GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS,
+ "regular MUL_MAT fallback records op cublas route");
+
+ const ggml_cuda_mul_mat_route_shape fp4_prefill = ggml_cuda_mul_mat_route_shape_make(
+ /* type */ 39,
+ /* ne1 */ 512,
+ /* ne11 */ 512,
+ /* ne12 */ 1,
+ /* ne13 */ 1,
+ /* split */ false,
+ /* use_vec_f */ false,
+ /* use_mat_f */ false,
+ /* use_vec_q */ false,
+ /* use_mmq */ true,
+ /* use_batched_cublas */ false,
+ /* use_fp4_prefill */ true,
+ /* use_fwht */ false);
+
+ require(fp4_prefill.route == GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL,
+ "regular MUL_MAT records native FP4 prefill route before MMQ");
+
+ const int mul_mat_route_n = ggml_cuda_mul_mat_route_shape_format(buf, sizeof(buf), mat_f);
+
+ require(mul_mat_route_n > 0, "regular MUL_MAT route format returns byte count");
+ require(std::strstr(buf, "route=mat_f") != nullptr, "regular MUL_MAT trace includes route name");
+ require(std::strstr(buf, "type=30") != nullptr, "regular MUL_MAT trace includes type");
+ require(std::strstr(buf, "ne1=128") != nullptr, "regular MUL_MAT trace includes output columns");
+ require(std::strstr(buf, "ne11=128") != nullptr, "regular MUL_MAT trace includes src1 columns");
+ require(std::strstr(buf, "ne12=1") != nullptr, "regular MUL_MAT trace includes src1 batch dim");
+ require(std::strstr(buf, "split=0") != nullptr, "regular MUL_MAT trace includes split flag");
+ require(std::strstr(buf, "use_mat_f=1") != nullptr, "regular MUL_MAT trace includes mat_f predicate");
+ require(std::strstr(buf, "use_batched_cublas=0") != nullptr,
+ "regular MUL_MAT trace includes batched cuBLAS predicate");
+
return 0;
}
--
2.43.0

View File

@@ -1,332 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 06:20:31 +0000
Subject: [PATCH] feat(cuda): trace cublas routes
Add a default-off LLAMA_CUBLAS_ROUTE_TRACE diagnostic around the generic cuBLAS MUL_MAT path.
The trace classifies NVFP4/BF16/FP16/SGEMM subroutes without changing branch behavior, and extends the route helper test coverage.
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/ggml-cuda.cu | 53 +++++++++++--
ggml/src/ggml-cuda/mmq-shape-trace.h | 108 +++++++++++++++++++++++++++
tests/test-cuda-mmq-shape-trace.cpp | 62 +++++++++++++++
3 files changed, 216 insertions(+), 7 deletions(-)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cd34aff13..eff197818 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1627,6 +1627,32 @@ static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type
return compute_type;
}
+static inline int ggml_cuda_cublas_route_trace_limit() {
+ static const int value = []() {
+ const char * s = getenv("LLAMA_CUBLAS_ROUTE_TRACE");
+ return s ? atoi(s) : 0;
+ }();
+
+ return value;
+}
+
+static inline void ggml_cuda_cublas_route_trace(const ggml_cuda_cublas_route_shape & shape) {
+ const int trace_limit = ggml_cuda_cublas_route_trace_limit();
+ if (trace_limit <= 0) {
+ return;
+ }
+
+ static std::atomic<int> trace_count{0};
+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
+ if (trace_idx >= trace_limit) {
+ return;
+ }
+
+ char buf[320];
+ ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), shape);
+ fprintf(stderr, "[LLAMA_CUBLAS_ROUTE] %s\n", buf);
+}
+
static void ggml_cuda_op_mul_mat_cublas(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
@@ -1662,7 +1688,22 @@ static void ggml_cuda_op_mul_mat_cublas(
row_diff == src0->ne[1] &&
dst->op_params[0] == GGML_PREC_DEFAULT;
- if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+ const bool src0_contig = ggml_is_contiguous(src0);
+ const bool full_rows = row_diff == src0->ne[1];
+ const bool fast_fp16 = fast_fp16_hardware_available(cc);
+ bool force_fp32 = false;
+ bool force_fp16 = false;
+ if (fast_fp16 && use_fp16) {
+ const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
+ force_fp32 = force_compute_type.fp32;
+ force_fp16 = force_compute_type.fp16;
+ }
+ ggml_cuda_cublas_route_trace(ggml_cuda_cublas_route_shape_make(
+ src0->type, src1->type, row_diff, src1_ncols, ne00, ne10, ldc,
+ supports_bf16, use_fp16, fast_fp16, force_fp32, force_fp16, src0_contig, full_rows,
+ GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA));
+
+ if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && src0_contig && full_rows) {
// Paged prefill lever (patch 0033): NVFP4 only reaches cuBLAS when
// ggml_cuda_should_use_mmq() returned false (large-M dense prefill).
// Dequant the FP4 weights to a TRANSIENT bf16 pool buffer and run a
@@ -1702,7 +1743,7 @@ static void ggml_cuda_op_mul_mat_cublas(
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
- } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+ } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && src0_contig && full_rows) {
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
if (src1->type != GGML_TYPE_BF16) {
const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
@@ -1730,7 +1771,7 @@ static void ggml_cuda_op_mul_mat_cublas(
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
- } else if (fast_fp16_hardware_available(cc) && use_fp16) {
+ } else if (fast_fp16 && use_fp16) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
if (src0->type != GGML_TYPE_F16) {
@@ -1754,12 +1795,10 @@ static void ggml_cuda_op_mul_mat_cublas(
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
- const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
-
- if (!force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
+ if (!force_fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
|| GGML_CUDA_CC_IS_RDNA4(cc)
|| cc == GGML_CUDA_CC_VOLTA
- || force_compute_type.fp32))
+ || force_fp32))
{
const float alpha = 1.0f;
const float beta = 0.0f;
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
index 8ac373fd9..f5b4ecf2c 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
@@ -85,6 +85,14 @@ enum ggml_cuda_mul_mat_route {
GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS,
};
+enum ggml_cuda_cublas_route {
+ GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC,
+ GGML_CUDA_CUBLAS_ROUTE_BF16_TC,
+ GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F,
+ GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F,
+ GGML_CUDA_CUBLAS_ROUTE_SGEMM,
+};
+
struct ggml_cuda_mul_mat_route_shape {
ggml_cuda_mul_mat_route route;
int type;
@@ -102,6 +110,27 @@ struct ggml_cuda_mul_mat_route_shape {
bool use_fwht;
};
+struct ggml_cuda_cublas_route_shape {
+ ggml_cuda_cublas_route route;
+ int type;
+ int src1_type;
+ int64_t row_diff;
+ int64_t src1_ncols;
+ int64_t ne00;
+ int64_t ne10;
+ int64_t ldc;
+ bool supports_bf16;
+ bool use_fp16;
+ bool fast_fp16;
+ bool force_fp32;
+ bool force_fp16;
+ bool src0_contig;
+ bool full_rows;
+ bool is_cdna;
+ bool is_rdna4;
+ bool is_volta;
+};
+
static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
switch (route) {
case GGML_CUDA_MMID_ROUTE_MMVQ: return "mmvq";
@@ -132,6 +161,18 @@ static inline const char * ggml_cuda_mul_mat_route_name(const ggml_cuda_mul_mat_
return "unknown";
}
+static inline const char * ggml_cuda_cublas_route_name(const ggml_cuda_cublas_route route) {
+ switch (route) {
+ case GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC: return "nvfp4_bf16_tc";
+ case GGML_CUDA_CUBLAS_ROUTE_BF16_TC: return "bf16_tc";
+ case GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F: return "f16_tc_32f";
+ case GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F: return "f16_tc_16f";
+ case GGML_CUDA_CUBLAS_ROUTE_SGEMM: return "sgemm";
+ }
+
+ return "unknown";
+}
+
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
@@ -205,6 +246,47 @@ static inline ggml_cuda_mul_mat_route_shape ggml_cuda_mul_mat_route_shape_make(
};
}
+static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make(
+ const int type, const int src1_type, const int64_t row_diff, const int64_t src1_ncols,
+ const int64_t ne00, const int64_t ne10, const int64_t ldc, const bool supports_bf16,
+ const bool use_fp16, const bool fast_fp16, const bool force_fp32, const bool force_fp16,
+ const bool src0_contig, const bool full_rows, const bool is_cdna, const bool is_rdna4,
+ const bool is_volta) {
+ ggml_cuda_cublas_route route = GGML_CUDA_CUBLAS_ROUTE_SGEMM;
+ if (supports_bf16 && type == 40 && src0_contig && full_rows) {
+ route = GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC;
+ } else if (supports_bf16 && type == 30 && src0_contig && full_rows) {
+ route = GGML_CUDA_CUBLAS_ROUTE_BF16_TC;
+ } else if (fast_fp16 && use_fp16) {
+ if (!force_fp16 && (is_cdna || is_rdna4 || is_volta || force_fp32)) {
+ route = GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F;
+ } else {
+ route = GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F;
+ }
+ }
+
+ return {
+ route,
+ type,
+ src1_type,
+ row_diff,
+ src1_ncols,
+ ne00,
+ ne10,
+ ldc,
+ supports_bf16,
+ use_fp16,
+ fast_fp16,
+ force_fp32,
+ force_fp16,
+ src0_contig,
+ full_rows,
+ is_cdna,
+ is_rdna4,
+ is_volta,
+ };
+}
+
static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make(
const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts,
const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd,
@@ -377,6 +459,32 @@ static inline int ggml_cuda_mul_mat_route_shape_format(
shape.use_fwht ? 1 : 0);
}
+static inline int ggml_cuda_cublas_route_shape_format(
+ char * buf, const size_t size, const ggml_cuda_cublas_route_shape & shape) {
+ return std::snprintf(buf, size,
+ "route=%s type=%d src1_type=%d row_diff=%lld src1_ncols=%lld ne00=%lld ne10=%lld ldc=%lld "
+ "supports_bf16=%d use_fp16=%d fast_fp16=%d force_fp32=%d force_fp16=%d "
+ "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d",
+ ggml_cuda_cublas_route_name(shape.route),
+ shape.type,
+ shape.src1_type,
+ (long long) shape.row_diff,
+ (long long) shape.src1_ncols,
+ (long long) shape.ne00,
+ (long long) shape.ne10,
+ (long long) shape.ldc,
+ shape.supports_bf16 ? 1 : 0,
+ shape.use_fp16 ? 1 : 0,
+ shape.fast_fp16 ? 1 : 0,
+ shape.force_fp32 ? 1 : 0,
+ shape.force_fp16 ? 1 : 0,
+ shape.src0_contig ? 1 : 0,
+ shape.full_rows ? 1 : 0,
+ shape.is_cdna ? 1 : 0,
+ shape.is_rdna4 ? 1 : 0,
+ shape.is_volta ? 1 : 0);
+}
+
static inline int ggml_cuda_mmq_small_m_shape_format(
char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
return std::snprintf(buf, size,
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
index 2bd41d1d8..1443749c3 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
+++ b/tests/test-cuda-mmq-shape-trace.cpp
@@ -285,5 +285,67 @@ int main() {
require(std::strstr(buf, "use_batched_cublas=0") != nullptr,
"regular MUL_MAT trace includes batched cuBLAS predicate");
+ const ggml_cuda_cublas_route_shape bf16_tc = ggml_cuda_cublas_route_shape_make(
+ /* type */ 30,
+ /* src1_type */ 0,
+ /* row_diff */ 18,
+ /* src1_ncols */ 18,
+ /* ne00 */ 1024,
+ /* ne10 */ 1024,
+ /* ldc */ 18,
+ /* supports_bf16 */ true,
+ /* use_fp16 */ false,
+ /* fast_fp16 */ true,
+ /* force_fp32 */ false,
+ /* force_fp16 */ false,
+ /* src0_contig */ true,
+ /* full_rows */ true,
+ /* is_cdna */ false,
+ /* is_rdna4 */ false,
+ /* is_volta */ false);
+
+ require(bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_BF16_TC,
+ "cuBLAS records native BF16 tensor-core route");
+
+ const ggml_cuda_cublas_route_shape nvfp4_bf16_tc = ggml_cuda_cublas_route_shape_make(
+ /* type */ 40, 0, 128, 128, 1024, 1024, 128, true, false, true, false, false, true, true,
+ false, false, false);
+
+ require(nvfp4_bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC,
+ "cuBLAS records NVFP4 dequant-to-BF16 tensor-core route");
+
+ const ggml_cuda_cublas_route_shape f16_tc_16f = ggml_cuda_cublas_route_shape_make(
+ /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, false, false, true, true,
+ false, false, false);
+
+ require(f16_tc_16f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F,
+ "cuBLAS records default FP16 tensor-core 16F compute route");
+
+ const ggml_cuda_cublas_route_shape f16_tc_32f = ggml_cuda_cublas_route_shape_make(
+ /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, true, false, true, true,
+ false, false, false);
+
+ require(f16_tc_32f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F,
+ "cuBLAS records forced FP16 tensor-core 32F compute route");
+
+ const ggml_cuda_cublas_route_shape sgemm = ggml_cuda_cublas_route_shape_make(
+ /* type */ 0, 0, 12, 12, 1024, 1024, 12, false, false, true, false, false, true, true,
+ false, false, false);
+
+ require(sgemm.route == GGML_CUDA_CUBLAS_ROUTE_SGEMM,
+ "cuBLAS records SGEMM fallback route");
+
+ const int cublas_route_n = ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), bf16_tc);
+
+ require(cublas_route_n > 0, "cuBLAS route format returns byte count");
+ require(std::strstr(buf, "route=bf16_tc") != nullptr, "cuBLAS trace includes route name");
+ require(std::strstr(buf, "type=30") != nullptr, "cuBLAS trace includes src0 type");
+ require(std::strstr(buf, "src1_type=0") != nullptr, "cuBLAS trace includes src1 type");
+ require(std::strstr(buf, "row_diff=18") != nullptr, "cuBLAS trace includes row count");
+ require(std::strstr(buf, "src1_ncols=18") != nullptr, "cuBLAS trace includes source column count");
+ require(std::strstr(buf, "supports_bf16=1") != nullptr, "cuBLAS trace includes BF16 predicate");
+ require(std::strstr(buf, "force_fp32=0") != nullptr, "cuBLAS trace includes forced compute predicate");
+ require(std::strstr(buf, "src0_contig=1") != nullptr, "cuBLAS trace includes contiguity predicate");
+
return 0;
}
--
2.43.0

View File

@@ -1,162 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 06:38:11 +0000
Subject: [PATCH] feat(cuda): trace cublas tensor names
Extend LLAMA_CUBLAS_ROUTE_TRACE with src0/src1/dst tensor names so SGEMM and BF16 cuBLAS buckets can be tied back to graph nodes.
Assisted-by: Codex:gpt-5
---
ggml/src/ggml-cuda/ggml-cuda.cu | 5 +++--
ggml/src/ggml-cuda/mmq-shape-trace.h | 16 +++++++++++++---
tests/test-cuda-mmq-shape-trace.cpp | 18 ++++++++++++------
3 files changed, 28 insertions(+), 11 deletions(-)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index eff197818..1da67d2af 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1648,7 +1648,7 @@ static inline void ggml_cuda_cublas_route_trace(const ggml_cuda_cublas_route_sha
return;
}
- char buf[320];
+ char buf[512];
ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), shape);
fprintf(stderr, "[LLAMA_CUBLAS_ROUTE] %s\n", buf);
}
@@ -1701,7 +1701,8 @@ static void ggml_cuda_op_mul_mat_cublas(
ggml_cuda_cublas_route_trace(ggml_cuda_cublas_route_shape_make(
src0->type, src1->type, row_diff, src1_ncols, ne00, ne10, ldc,
supports_bf16, use_fp16, fast_fp16, force_fp32, force_fp16, src0_contig, full_rows,
- GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA));
+ GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA,
+ src0->name, src1->name, dst->name));
if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && src0_contig && full_rows) {
// Paged prefill lever (patch 0033): NVFP4 only reaches cuBLAS when
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
index f5b4ecf2c..b55c7467c 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
@@ -129,6 +129,9 @@ struct ggml_cuda_cublas_route_shape {
bool is_cdna;
bool is_rdna4;
bool is_volta;
+ const char * src0_name;
+ const char * src1_name;
+ const char * dst_name;
};
static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
@@ -251,7 +254,7 @@ static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make(
const int64_t ne00, const int64_t ne10, const int64_t ldc, const bool supports_bf16,
const bool use_fp16, const bool fast_fp16, const bool force_fp32, const bool force_fp16,
const bool src0_contig, const bool full_rows, const bool is_cdna, const bool is_rdna4,
- const bool is_volta) {
+ const bool is_volta, const char * src0_name, const char * src1_name, const char * dst_name) {
ggml_cuda_cublas_route route = GGML_CUDA_CUBLAS_ROUTE_SGEMM;
if (supports_bf16 && type == 40 && src0_contig && full_rows) {
route = GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC;
@@ -284,6 +287,9 @@ static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make(
is_cdna,
is_rdna4,
is_volta,
+ src0_name ? src0_name : "",
+ src1_name ? src1_name : "",
+ dst_name ? dst_name : "",
};
}
@@ -464,7 +470,8 @@ static inline int ggml_cuda_cublas_route_shape_format(
return std::snprintf(buf, size,
"route=%s type=%d src1_type=%d row_diff=%lld src1_ncols=%lld ne00=%lld ne10=%lld ldc=%lld "
"supports_bf16=%d use_fp16=%d fast_fp16=%d force_fp32=%d force_fp16=%d "
- "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d",
+ "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d "
+ "src0=%s src1=%s dst=%s",
ggml_cuda_cublas_route_name(shape.route),
shape.type,
shape.src1_type,
@@ -482,7 +489,10 @@ static inline int ggml_cuda_cublas_route_shape_format(
shape.full_rows ? 1 : 0,
shape.is_cdna ? 1 : 0,
shape.is_rdna4 ? 1 : 0,
- shape.is_volta ? 1 : 0);
+ shape.is_volta ? 1 : 0,
+ shape.src0_name,
+ shape.src1_name,
+ shape.dst_name);
}
static inline int ggml_cuda_mmq_small_m_shape_format(
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
index 1443749c3..2547193ce 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
+++ b/tests/test-cuda-mmq-shape-trace.cpp
@@ -27,7 +27,7 @@ int main() {
require(shape.n_active_est == 256, "active expert estimate is capped by expert count");
require(shape.density == 4, "density is ceil(assignments / active experts)");
- char buf[256];
+ char buf[512];
const int n = ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
require(n > 0, "format returns byte count");
@@ -302,35 +302,38 @@ int main() {
/* full_rows */ true,
/* is_cdna */ false,
/* is_rdna4 */ false,
- /* is_volta */ false);
+ /* is_volta */ false,
+ /* src0_name */ "blk.0.proj.weight",
+ /* src1_name */ "blk.0.proj.inp",
+ /* dst_name */ "blk.0.proj.out");
require(bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_BF16_TC,
"cuBLAS records native BF16 tensor-core route");
const ggml_cuda_cublas_route_shape nvfp4_bf16_tc = ggml_cuda_cublas_route_shape_make(
/* type */ 40, 0, 128, 128, 1024, 1024, 128, true, false, true, false, false, true, true,
- false, false, false);
+ false, false, false, "nvfp4.weight", "nvfp4.inp", "nvfp4.out");
require(nvfp4_bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC,
"cuBLAS records NVFP4 dequant-to-BF16 tensor-core route");
const ggml_cuda_cublas_route_shape f16_tc_16f = ggml_cuda_cublas_route_shape_make(
/* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, false, false, true, true,
- false, false, false);
+ false, false, false, "f16.weight", "f16.inp", "f16.out");
require(f16_tc_16f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F,
"cuBLAS records default FP16 tensor-core 16F compute route");
const ggml_cuda_cublas_route_shape f16_tc_32f = ggml_cuda_cublas_route_shape_make(
/* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, true, false, true, true,
- false, false, false);
+ false, false, false, "f16.weight", "f16.inp", "f16.out");
require(f16_tc_32f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F,
"cuBLAS records forced FP16 tensor-core 32F compute route");
const ggml_cuda_cublas_route_shape sgemm = ggml_cuda_cublas_route_shape_make(
/* type */ 0, 0, 12, 12, 1024, 1024, 12, false, false, true, false, false, true, true,
- false, false, false);
+ false, false, false, "f32.weight", "f32.inp", "f32.out");
require(sgemm.route == GGML_CUDA_CUBLAS_ROUTE_SGEMM,
"cuBLAS records SGEMM fallback route");
@@ -346,6 +349,9 @@ int main() {
require(std::strstr(buf, "supports_bf16=1") != nullptr, "cuBLAS trace includes BF16 predicate");
require(std::strstr(buf, "force_fp32=0") != nullptr, "cuBLAS trace includes forced compute predicate");
require(std::strstr(buf, "src0_contig=1") != nullptr, "cuBLAS trace includes contiguity predicate");
+ require(std::strstr(buf, "src0=blk.0.proj.weight") != nullptr, "cuBLAS trace includes src0 name");
+ require(std::strstr(buf, "src1=blk.0.proj.inp") != nullptr, "cuBLAS trace includes src1 name");
+ require(std::strstr(buf, "dst=blk.0.proj.out") != nullptr, "cuBLAS trace includes dst name");
return 0;
}
--
2.43.0