mirror of
https://github.com/mudler/LocalAI.git
synced 2026-07-02 20:37:03 -04:00
patches(paged): trim series to Phase135 routed-FFN line, sync to fork 1edddc8fe
The campaign patches 0048-0063 were added without matching fork commits.
After a keep/drop review, the series is trimmed and re-mirrored 1:1 onto the
fork branch mudler/llama.cpp:localai-paged (HEAD 1edddc8fe, on 51168c5ee).
Kept, renumbered from the fork (now carry Assisted-by + Signed-off-by):
- 0048 test(paged): cover MoE swiglu down chain (was 0051, fd920cf8a)
- 0049 test(paged): cover MoE weighted combine chain (was 0052, a85c1e098)
- 0050 test(paged): cover ragged MoE dispatch (was 0053, 2fed6aacf)
- 0051 fix(speculative): disable backend sampling for MTP drafts (was 0054, f1d976f06)
- 0052 feat(paged): whole-pattern MoE matcher + routed-FFN fused NVFP4-quant
down MMQ (new, 1edddc8fe)
Dropped (no fork commits, removed from the series):
- 0048-0050 W4A16 grouped-tile pack/tune/pad: dead line, W4A16 ~1.5x slower
than grouped-MMQ.
- 0055-0063 speculative/moe/mul-mat/cublas route traces + the rejected small-M
tile-policy knob (0059).
- All other 110-140 campaign markers not needed by Phase135 (GPU-sort,
W4A16-direct-A, boundary trace/timing, Phase133 sorted-F32, Phase134
fused-SWIGLU, Phase138 finalize) carry no code in this tree.
Tree-hash proof (the mirror invariant): a fresh detached worktree at
LLAMA_VERSION 0ed235ea2c17a19fc8238668653946721ed136fd with every on-disk
patches/paged/0*.patch applied in numeric order (git apply) stages to tree
097c862c6834b7d8b90419b305b8402155ef8373, byte-identical to fork HEAD
1edddc8fe's tree. Series is 43 patches (0001-0047 unchanged + 0048-0052).
Gated on GB10 sm_121a: default md5 MoE 8cb0ce23 / dense 5951a5b4 unchanged;
opt-in md5-clean; MUL_MAT 1146/1146, MUL_MAT_ID 806/806, GATED_DELTA_NET 46/46,
MOE_SWIGLU_DOWN 7/7, MUL_MAT_ID_RAGGED_MOE 6/6; six mmq_moe_quantized_raw
markers with zero sorted launches on the opt-in sentinel.
Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -1,121 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 21:19:07 +0000
|
||||
Subject: [PATCH] feat(paged): pack W4A16 grouped tile metadata
|
||||
|
||||
Collapse the grouped W4A16 tile metadata from three host vectors and three device copies into one aligned descriptor array and one H2D copy. This keeps the path default-off while reducing launch-side metadata overhead for the experimental W4A16 MoE prefill path.
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/w4a16-gemm.cu | 46 ++++++++++++++++++--------------
|
||||
1 file changed, 26 insertions(+), 20 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
index f348f31ad..899e1a23f 100644
|
||||
--- a/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
@@ -18,6 +18,14 @@ typedef tile<16, 8, nv_bfloat162> tile_A; // A operand: M=16, K=16
|
||||
typedef tile< 8, 8, nv_bfloat162> tile_B; // B operand: N=8, K=16
|
||||
typedef tile<16, 8, float> tile_C; // accumulator: M=16, N=8
|
||||
|
||||
+struct alignas(16) w4a16_tile_desc {
|
||||
+ int expert;
|
||||
+ int row0;
|
||||
+ int rows;
|
||||
+ int pad;
|
||||
+};
|
||||
+static_assert(sizeof(w4a16_tile_desc) == 16, "w4a16 tile descriptors must stay 16 bytes");
|
||||
+
|
||||
#ifndef LLAMA_W4A16_PREFILL_M
|
||||
#define LLAMA_W4A16_PREFILL_M 0
|
||||
#endif // LLAMA_W4A16_PREFILL_M
|
||||
@@ -92,9 +100,7 @@ static __global__ void w4a16_grouped_kernel(
|
||||
const nv_bfloat16 * __restrict__ Abf, // [pad_rows, K] bf16
|
||||
const block_nvfp4 * __restrict__ W0, // src0 base (expert 0)
|
||||
float * __restrict__ C, // [total_rows, N] f32
|
||||
- const int * __restrict__ g_tile_expert,
|
||||
- const int * __restrict__ g_tile_row0,
|
||||
- const int * __restrict__ g_tile_rows,
|
||||
+ const w4a16_tile_desc * __restrict__ g_tiles,
|
||||
int N, int K, int64_t expert_stride_blocks) {
|
||||
#if defined(AMPERE_MMA_AVAILABLE) && defined(CP_ASYNC_AVAILABLE)
|
||||
constexpr int BK = 64; // one nvfp4 block
|
||||
@@ -129,9 +135,10 @@ static __global__ void w4a16_grouped_kernel(
|
||||
const int tid = warp*32 + lane; // linear id for the cp.async strided copies
|
||||
const int wrow = warp / WARPS_N, wcol = warp % WARPS_N;
|
||||
|
||||
- const int e = g_tile_expert[blockIdx.y];
|
||||
- const int row0 = g_tile_row0[blockIdx.y];
|
||||
- const int rcount = g_tile_rows[blockIdx.y];
|
||||
+ const w4a16_tile_desc tile_desc = g_tiles[blockIdx.y];
|
||||
+ const int e = tile_desc.expert;
|
||||
+ const int row0 = tile_desc.row0;
|
||||
+ const int rcount = tile_desc.rows;
|
||||
const int blockCol = blockIdx.x*BN;
|
||||
const int Kb = K/64;
|
||||
const block_nvfp4 * We = W0 + (int64_t) e*expert_stride_blocks; // expert e weight base
|
||||
@@ -238,7 +245,7 @@ static __global__ void w4a16_grouped_kernel(
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(Abf); GGML_UNUSED(W0); GGML_UNUSED(C);
|
||||
- GGML_UNUSED(g_tile_expert); GGML_UNUSED(g_tile_row0); GGML_UNUSED(g_tile_rows);
|
||||
+ GGML_UNUSED(g_tiles);
|
||||
GGML_UNUSED(N); GGML_UNUSED(K); GGML_UNUSED(expert_stride_blocks);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMPERE_MMA_AVAILABLE && CP_ASYNC_AVAILABLE
|
||||
@@ -296,18 +303,21 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
return;
|
||||
}
|
||||
|
||||
- std::vector<int32_t> h_tile_expert, h_tile_row0, h_tile_rows;
|
||||
+ std::vector<w4a16_tile_desc> h_tiles;
|
||||
int64_t row = 0;
|
||||
for (int64_t e = 0; e < n_experts; e++) {
|
||||
const int t = tokens_per_expert[e];
|
||||
for (int off = 0; off < t; off += BM) {
|
||||
- h_tile_expert.push_back((int32_t) e);
|
||||
- h_tile_row0.push_back((int32_t) (row + off));
|
||||
- h_tile_rows.push_back((int32_t) std::min(BM, t - off));
|
||||
+ h_tiles.push_back({
|
||||
+ (int) e,
|
||||
+ (int) (row + off),
|
||||
+ (int) std::min(BM, t - off),
|
||||
+ 0,
|
||||
+ });
|
||||
}
|
||||
row += t;
|
||||
}
|
||||
- const int n_tiles = (int) h_tile_expert.size();
|
||||
+ const int n_tiles = (int) h_tiles.size();
|
||||
|
||||
if (getenv("LLAMA_W4A16_DEBUG")) {
|
||||
int max_tpe = 0, multi = 0;
|
||||
@@ -319,13 +329,9 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
(long long) total_rows, (long long) n_experts, (long long) K, (long long) N, n_tiles, max_tpe, multi);
|
||||
}
|
||||
|
||||
- // device: tile map
|
||||
- ggml_cuda_pool_alloc<int32_t> d_tile_expert(ctx.pool(), n_tiles);
|
||||
- ggml_cuda_pool_alloc<int32_t> d_tile_row0 (ctx.pool(), n_tiles);
|
||||
- ggml_cuda_pool_alloc<int32_t> d_tile_rows (ctx.pool(), n_tiles);
|
||||
- CUDA_CHECK(cudaMemcpyAsync(d_tile_expert.ptr, h_tile_expert.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
|
||||
- CUDA_CHECK(cudaMemcpyAsync(d_tile_row0.ptr, h_tile_row0.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
|
||||
- CUDA_CHECK(cudaMemcpyAsync(d_tile_rows.ptr, h_tile_rows.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
|
||||
+ // device: packed tile map; one pageable H2D copy instead of three tiny copies
|
||||
+ ggml_cuda_pool_alloc<w4a16_tile_desc> d_tiles(ctx.pool(), n_tiles);
|
||||
+ CUDA_CHECK(cudaMemcpyAsync(d_tiles.ptr, h_tiles.data(), n_tiles*sizeof(w4a16_tile_desc), cudaMemcpyHostToDevice, stream));
|
||||
|
||||
// activations: f32 -> bf16 (cheap cast, NO act-quant), zero-padded so every tile's BM-row read
|
||||
// stays in-bounds. A tile's row0 is generally NOT BM-aligned (experts start mid-buffer), and a
|
||||
@@ -353,7 +359,7 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
dim3 block(32, WARPS_M*WARPS_N); // 2D: threadIdx.x = warp lane, threadIdx.y = warp
|
||||
kern<<<grid, block, smem_bytes, stream>>>(
|
||||
Abf.get(), (const block_nvfp4 *) src0->data, dst_sorted,
|
||||
- d_tile_expert.ptr, d_tile_row0.ptr, d_tile_rows.ptr,
|
||||
+ d_tiles.ptr,
|
||||
(int) N, (int) K, expert_stride_blocks);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
From cd56cf037379b084d6bb0ed47db8b785c828be86 Mon Sep 17 00:00:00 2001
|
||||
From fd920cf8a7fe9cc7753cd0640411ce771edfeaca Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 23:18:38 +0000
|
||||
Subject: [PATCH] test(paged): cover MoE swiglu down chain
|
||||
Subject: [PATCH 48/52] test(paged): cover MoE swiglu down chain
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
tests/test-backend-ops.cpp | 92 ++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 92 insertions(+)
|
||||
@@ -1,93 +0,0 @@
|
||||
From 7dfa0e17548c5f04f83d2cc2a057b0a9941b599a Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 21:44:54 +0000
|
||||
Subject: [PATCH] feat(paged): tune W4A16 grouped tile shape
|
||||
|
||||
Select a BM=32 grouped W4A16 MoE prefill shape by default after the GB10 shape sweep, while keeping the prior 64x128 shape and additional diagnostics selectable through LLAMA_W4A16_SHAPE.
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/w4a16-gemm.cu | 42 +++++++++++++++++++++++++++++---
|
||||
1 file changed, 38 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
index 899e1a23f..ca8864292 100644
|
||||
--- a/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
+#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
// ===========================================================================
|
||||
@@ -281,7 +282,23 @@ bool ggml_cuda_w4a16_moe_grouped_should_engage(
|
||||
return true;
|
||||
}
|
||||
|
||||
-void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
+static bool ggml_cuda_w4a16_use_base_shape() {
|
||||
+ static const bool use_base = [] {
|
||||
+ const char * e = getenv("LLAMA_W4A16_SHAPE");
|
||||
+ if (e == nullptr || e[0] == '\0' || strcmp(e, "default") == 0 || strcmp(e, "bm32") == 0 || strcmp(e, "32x128") == 0) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (strcmp(e, "base") == 0 || strcmp(e, "64x128") == 0) {
|
||||
+ return true;
|
||||
+ }
|
||||
+ fprintf(stderr, "[w4a16] unknown LLAMA_W4A16_SHAPE=%s, using default bm32\n", e);
|
||||
+ return false;
|
||||
+ }();
|
||||
+ return use_base;
|
||||
+}
|
||||
+
|
||||
+template<int BM, int BN, int WARPS_M, int WARPS_N, int STAGES>
|
||||
+static void ggml_cuda_mul_mat_id_w4a16_grouped_impl(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0,
|
||||
const float * src1_sorted,
|
||||
@@ -290,9 +307,7 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
int64_t n_experts, int64_t K, int64_t N,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_NVFP4);
|
||||
- GGML_ASSERT(N % 128 == 0 && K % 64 == 0);
|
||||
-
|
||||
- constexpr int BM = 64, BN = 128, WARPS_M = 2, WARPS_N = 4, STAGES = 2;
|
||||
+ GGML_ASSERT(N % BN == 0 && K % 64 == 0);
|
||||
|
||||
// host: build the per-M-tile expert map (ragged, no tile crosses an expert boundary)
|
||||
int64_t total_rows = 0;
|
||||
@@ -327,6 +342,8 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
}
|
||||
fprintf(stderr, "[w4a16] engaged: total_rows=%lld n_experts=%lld K=%lld N=%lld n_tiles=%d max_tpe=%d multi_tile_experts=%d\n",
|
||||
(long long) total_rows, (long long) n_experts, (long long) K, (long long) N, n_tiles, max_tpe, multi);
|
||||
+ fprintf(stderr, "[w4a16] shape: BM=%d BN=%d WARPS_M=%d WARPS_N=%d STAGES=%d\n",
|
||||
+ BM, BN, WARPS_M, WARPS_N, STAGES);
|
||||
}
|
||||
|
||||
// device: packed tile map; one pageable H2D copy instead of three tiny copies
|
||||
@@ -363,3 +380,20 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
(int) N, (int) K, expert_stride_blocks);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
+
|
||||
+void ggml_cuda_mul_mat_id_w4a16_grouped(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ const ggml_tensor * src0,
|
||||
+ const float * src1_sorted,
|
||||
+ float * dst_sorted,
|
||||
+ const int * tokens_per_expert,
|
||||
+ int64_t n_experts, int64_t K, int64_t N,
|
||||
+ cudaStream_t stream) {
|
||||
+ if (ggml_cuda_w4a16_use_base_shape()) {
|
||||
+ ggml_cuda_mul_mat_id_w4a16_grouped_impl<64, 128, 2, 4, 2>(
|
||||
+ ctx, src0, src1_sorted, dst_sorted, tokens_per_expert, n_experts, K, N, stream);
|
||||
+ } else {
|
||||
+ ggml_cuda_mul_mat_id_w4a16_grouped_impl<32, 128, 1, 4, 2>(
|
||||
+ ctx, src0, src1_sorted, dst_sorted, tokens_per_expert, n_experts, K, N, stream);
|
||||
+ }
|
||||
+}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
From 3ef7eb9e412eb34f8656675862f6753c65d28ec9 Mon Sep 17 00:00:00 2001
|
||||
From a85c1e098e22eb587fd80220986f35a8d6e11300 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 23:50:33 +0000
|
||||
Subject: [PATCH] test(paged): cover MoE weighted combine chain
|
||||
Subject: [PATCH 49/52] test(paged): cover MoE weighted combine chain
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
tests/test-backend-ops.cpp | 90 ++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 90 insertions(+)
|
||||
@@ -1,56 +0,0 @@
|
||||
From d9b9be0bee3d7239132bfca05d5b057ff4ee4cc3 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 22:13:09 +0000
|
||||
Subject: [PATCH] feat(paged): pad W4A16 A shared tile stride
|
||||
|
||||
Pad the grouped W4A16 A operand shared-memory row stride to reduce bank pressure on GB10 while preserving the selected bm32/default and base launch shapes.
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/w4a16-gemm.cu | 9 +++++----
|
||||
1 file changed, 5 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
index ca8864292..b17e022b0 100644
|
||||
--- a/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
|
||||
@@ -111,7 +111,8 @@ static __global__ void w4a16_grouped_kernel(
|
||||
constexpr int MF = WM/16, NF = WN/8;
|
||||
|
||||
constexpr int AN = BK/2; // bf16 pairs per A smem row (nv_bfloat162)
|
||||
- constexpr int SZ_A = BM*AN; // nv_bfloat162 per stage
|
||||
+ constexpr int ASTR = AN + 4; // padded A smem row stride, in nv_bfloat162
|
||||
+ constexpr int SZ_A = BM*ASTR; // nv_bfloat162 per stage
|
||||
constexpr int SZ_WQ = BN*8; // u32 per stage (32 qs bytes/row)
|
||||
constexpr int SZ_WD = BN; // u32 per stage (4 scale bytes/row)
|
||||
|
||||
@@ -155,7 +156,7 @@ static __global__ void w4a16_grouped_kernel(
|
||||
const int c = idx % (BK/8); // 16B chunk in the row
|
||||
const int r = idx / (BK/8); // row in tile
|
||||
const nv_bfloat16 * src = Abf + (int64_t)(row0 + r)*K + (int64_t)kt*BK + c*8;
|
||||
- w4a16_cp_async16(((char *) sA[st]) + (r*AN + c*4)*sizeof(uint32_t), src);
|
||||
+ w4a16_cp_async16(((char *) sA[st]) + (r*ASTR + c*4)*sizeof(uint32_t), src);
|
||||
}
|
||||
// W qs: BN rows x 32 bytes = BN x 8 u32 (each block's qs at byte offset 4)
|
||||
#pragma unroll 1
|
||||
@@ -198,7 +199,7 @@ static __global__ void w4a16_grouped_kernel(
|
||||
#pragma unroll
|
||||
for (int mi = 0; mi < MF; mi++) {
|
||||
const int rb = wrow*WM + mi*16;
|
||||
- load_ldmatrix(A_frag[mi], sAcur + rb*AN + kk*8, AN);
|
||||
+ load_ldmatrix(A_frag[mi], sAcur + rb*ASTR + kk*8, ASTR);
|
||||
}
|
||||
// B fragments: in-register FP4->bf16 dequant (correct-by-construction via tile get_i/get_j)
|
||||
tile_B B_frag[NF];
|
||||
@@ -368,7 +369,7 @@ static void ggml_cuda_mul_mat_id_w4a16_grouped_impl(
|
||||
const int64_t expert_stride_blocks = (int64_t) (src0->nb[2] / sizeof(block_nvfp4));
|
||||
|
||||
auto kern = w4a16_grouped_kernel<BM, BN, WARPS_M, WARPS_N, STAGES>;
|
||||
- constexpr int STAGE_U32 = BM*(64/2) + BN*8 + BN;
|
||||
+ constexpr int STAGE_U32 = BM*((64/2) + 4) + BN*8 + BN;
|
||||
const int smem_bytes = STAGES * STAGE_U32 * (int) sizeof(uint32_t);
|
||||
CUDA_CHECK(cudaFuncSetAttribute(kern, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
|
||||
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
From e21732fc47206d5878e3b977bbd21858a3ba4ab0 Mon Sep 17 00:00:00 2001
|
||||
From 2fed6aacff14537864bbf754c7552740131d4eaf Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 00:39:52 +0000
|
||||
Subject: [PATCH] test(paged): cover ragged MoE dispatch
|
||||
Subject: [PATCH 50/52] test(paged): cover ragged MoE dispatch
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
tests/test-backend-ops.cpp | 118 +++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 118 insertions(+)
|
||||
@@ -1,8 +1,11 @@
|
||||
From 3eba64afff6ecaa25da11f0e394717224f221c9a Mon Sep 17 00:00:00 2001
|
||||
From f1d976f06fb92655106709256dd093ffefb85e2b Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 00:50:36 +0000
|
||||
Subject: [PATCH] fix(speculative): disable backend sampling for MTP drafts
|
||||
Subject: [PATCH 51/52] fix(speculative): disable backend sampling for MTP
|
||||
drafts
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
common/speculative.cpp | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
@@ -0,0 +1,933 @@
|
||||
From 1edddc8fe93bb2fec5f831bbde5df2b7480a7b05 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Thu, 2 Jul 2026 12:15:38 +0200
|
||||
Subject: [PATCH 52/52] feat(paged): whole-pattern MoE matcher + routed-FFN
|
||||
fused NVFP4-quant down MMQ
|
||||
|
||||
Add the routed-FFN fused-quant line for the paged NVFP4 MoE decode step,
|
||||
all default-off and md5-clean, gating a fused SwiGLU-to-NVFP4-quant plus a
|
||||
raw pre-quantized down-projection MMQ that skips the intermediate F32
|
||||
materialize + re-quantize of the standard gate_up -> SwiGLU -> down chain.
|
||||
|
||||
Pieces (all guarded, no effect unless explicitly enabled):
|
||||
|
||||
- Whole-pattern MoE matcher + executor hook in ggml_cuda_try_fuse
|
||||
(LLAMA_MOE_WHOLE_PATTERN_EXEC and the *_TRACE/_EARLY_TRACE diagnostics).
|
||||
Detects the gate_up(MUL_MAT_ID) -> view/view -> SwiGLU(GLU) -> down(MUL_MAT_ID)
|
||||
sub-graph early in the fusion pass and, when engaged, runs the whole chain
|
||||
through a single executor instead of node-by-node.
|
||||
|
||||
- Routed-FFN PoC scaffold ggml/src/ggml-cuda/moe-ffn.{cu,cuh} + a narrow hook
|
||||
(LLAMA_MOE_ROUTED_FFN_POC). ggml_cuda_compute_forward is de-static-ed so the
|
||||
executor translation unit can drive the standard op path for the fallback
|
||||
legs. The executor tries the fused-quant path first, else falls back to the
|
||||
stock compute_forward for glu + down (bit-identical to default).
|
||||
|
||||
- Fused SwiGLU-to-NVFP4-quant + raw down MMQ (LLAMA_MOE_ROUTED_FFN_FUSED_QUANT):
|
||||
moe_swiglu_nvfp4_quant_kernel writes block_fp4_mmq activations directly, then
|
||||
ggml_cuda_mul_mat_q_moe_quantized (with the local ggml_cuda_mmq_ids_meta
|
||||
refactor of the expert-sorted ids/bounds prep) runs the down GEMM on the
|
||||
pre-quantized rows. Native FP4 (Blackwell) only; NVFP4 down weights only.
|
||||
|
||||
Gated on GB10 (sm_121a), before/after this commit:
|
||||
- Canonical default-path greedy md5 unchanged: MoE q36-35b-a3b-nvfp4
|
||||
8cb0ce23777bf55f92f63d0292c756b0, dense q36-27b-nvfp4
|
||||
5951a5b4d624ce891e22ab5fca9bc439.
|
||||
- md5-clean opt-in: LLAMA_MOE_ROUTED_FFN_POC=1 LLAMA_MOE_ROUTED_FFN_FUSED_QUANT=1
|
||||
keeps the MoE md5 byte-identical (8cb0ce23...).
|
||||
- test-backend-ops: MUL_MAT 1146/1146, MUL_MAT_ID 806/806, GATED_DELTA_NET 46/46,
|
||||
and the MoE sentinels MOE_SWIGLU_DOWN 7/7 + MUL_MAT_ID_RAGGED_MOE 6/6 pass both
|
||||
default and opt-in. Opt-in emits exactly six route=mmq_moe_quantized_raw markers
|
||||
with zero mmq_moe_sorted_raw launches (fused path provably engaged).
|
||||
- Serving effect is flat-to-slightly-positive and not a shipped default:
|
||||
decode agg 326.9 -> 332.7 t/s, mmq_nvfp4 6009 -> 5915 ms, aggregate flat
|
||||
(~/bench/phase135_routed_ffn_fused_quant_serving/20260702_082102).
|
||||
|
||||
The rejected/neutral neighbours of this line (Phase133 sorted-F32 down,
|
||||
Phase134 fused-SWIGLU-only, Phase138 finalize/weighted-combine fusion, the
|
||||
W4A16 grouped-tile pack/tune/pad line, GPU-sort, boundary/layout/quant traces)
|
||||
are deliberately excluded and carry no markers in this tree.
|
||||
|
||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 304 +++++++++++++++++++++++++++++++-
|
||||
ggml/src/ggml-cuda/mmq.cu | 148 ++++++++++++++++
|
||||
ggml/src/ggml-cuda/mmq.cuh | 25 +++
|
||||
ggml/src/ggml-cuda/moe-ffn.cu | 296 +++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/moe-ffn.cuh | 24 +++
|
||||
5 files changed, 796 insertions(+), 1 deletion(-)
|
||||
create mode 100644 ggml/src/ggml-cuda/moe-ffn.cu
|
||||
create mode 100644 ggml/src/ggml-cuda/moe-ffn.cuh
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 374949f25..ef1bdc3b4 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "ggml-cuda/im2col.cuh"
|
||||
#include "ggml-cuda/mmf.cuh"
|
||||
#include "ggml-cuda/mmq.cuh"
|
||||
+#include "ggml-cuda/moe-ffn.cuh"
|
||||
#include "ggml-cuda/mmvf.cuh"
|
||||
#include "ggml-cuda/mmvq.cuh"
|
||||
#include "ggml-cuda/norm.cuh"
|
||||
@@ -2854,7 +2855,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
nb1, nb2, nb3, stream);
|
||||
}
|
||||
|
||||
-static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
|
||||
+bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
|
||||
switch (dst->op) {
|
||||
case GGML_OP_ARGMAX:
|
||||
ggml_cuda_argmax(ctx, dst);
|
||||
@@ -4032,6 +4033,201 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
return false;
|
||||
}
|
||||
|
||||
+static inline const char * ggml_cuda_moe_wp_trace_tensor_name(const ggml_tensor * t) {
|
||||
+ return t != nullptr && t->name[0] != '\0' ? t->name : "-";
|
||||
+}
|
||||
+
|
||||
+static inline int ggml_cuda_moe_whole_pattern_trace_limit() {
|
||||
+ static const int value = []() {
|
||||
+ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_TRACE");
|
||||
+ if (s == nullptr || strcmp(s, "0") == 0) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ const int parsed = atoi(s);
|
||||
+ return parsed > 0 ? parsed : 128;
|
||||
+ }();
|
||||
+
|
||||
+ return value;
|
||||
+}
|
||||
+
|
||||
+static inline bool ggml_cuda_moe_whole_pattern_trace_take(std::atomic<int> & counter) {
|
||||
+ const int trace_limit = ggml_cuda_moe_whole_pattern_trace_limit();
|
||||
+ if (trace_limit <= 0) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ const int trace_idx = counter.fetch_add(1, std::memory_order_relaxed);
|
||||
+ return trace_idx < trace_limit;
|
||||
+}
|
||||
+
|
||||
+static inline int ggml_cuda_moe_whole_pattern_early_trace_limit() {
|
||||
+ static const int value = []() {
|
||||
+ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EARLY_TRACE");
|
||||
+ if (s == nullptr || strcmp(s, "0") == 0) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ const int parsed = atoi(s);
|
||||
+ return parsed > 0 ? parsed : 128;
|
||||
+ }();
|
||||
+
|
||||
+ return value;
|
||||
+}
|
||||
+
|
||||
+static inline bool ggml_cuda_moe_whole_pattern_exec_enabled() {
|
||||
+ static const bool value = []() {
|
||||
+ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EXEC");
|
||||
+ return s != nullptr && atoi(s) != 0;
|
||||
+ }();
|
||||
+
|
||||
+ return value;
|
||||
+}
|
||||
+
|
||||
+static inline int ggml_cuda_moe_whole_pattern_exec_trace_limit() {
|
||||
+ static const int value = []() {
|
||||
+ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EXEC_TRACE");
|
||||
+ if (s == nullptr || strcmp(s, "0") == 0) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ const int parsed = atoi(s);
|
||||
+ return parsed > 0 ? parsed : 128;
|
||||
+ }();
|
||||
+
|
||||
+ return value;
|
||||
+}
|
||||
+
|
||||
+static inline bool ggml_cuda_moe_whole_pattern_exec_trace_take(std::atomic<int> & counter) {
|
||||
+ const int trace_limit = ggml_cuda_moe_whole_pattern_exec_trace_limit();
|
||||
+ if (trace_limit <= 0) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ const int trace_idx = counter.fetch_add(1, std::memory_order_relaxed);
|
||||
+ return trace_idx < trace_limit;
|
||||
+}
|
||||
+
|
||||
+struct ggml_cuda_moe_whole_pattern {
|
||||
+ const ggml_tensor * gate_up = nullptr;
|
||||
+ const ggml_tensor * gate = nullptr;
|
||||
+ const ggml_tensor * up = nullptr;
|
||||
+ const ggml_tensor * glu = nullptr;
|
||||
+ const ggml_tensor * down = nullptr;
|
||||
+ const ggml_tensor * ids = nullptr;
|
||||
+
|
||||
+ bool view_pair = false;
|
||||
+ bool ids_match = false;
|
||||
+ bool swiglu = false;
|
||||
+ bool supported_type = false;
|
||||
+ bool supported = false;
|
||||
+};
|
||||
+
|
||||
+static ggml_cuda_moe_whole_pattern ggml_cuda_moe_whole_pattern_detect(const ggml_tensor * glu, const ggml_tensor * down) {
|
||||
+ ggml_cuda_moe_whole_pattern pattern{};
|
||||
+ pattern.glu = glu;
|
||||
+ pattern.down = down;
|
||||
+
|
||||
+ if (glu == nullptr || down == nullptr || glu->op != GGML_OP_GLU || down->op != GGML_OP_MUL_MAT_ID) {
|
||||
+ return pattern;
|
||||
+ }
|
||||
+
|
||||
+ pattern.gate = glu->src[0];
|
||||
+ pattern.up = glu->src[1];
|
||||
+ pattern.ids = down->src[2];
|
||||
+
|
||||
+ pattern.view_pair = pattern.gate != nullptr && pattern.up != nullptr &&
|
||||
+ pattern.gate->op == GGML_OP_VIEW && pattern.up->op == GGML_OP_VIEW &&
|
||||
+ pattern.gate->view_src != nullptr && pattern.gate->view_src == pattern.up->view_src;
|
||||
+ if (!pattern.view_pair) {
|
||||
+ return pattern;
|
||||
+ }
|
||||
+
|
||||
+ pattern.gate_up = pattern.gate->view_src;
|
||||
+ if (pattern.gate_up == nullptr || pattern.gate_up->op != GGML_OP_MUL_MAT_ID) {
|
||||
+ return pattern;
|
||||
+ }
|
||||
+
|
||||
+ pattern.ids_match = pattern.gate_up->src[2] == pattern.ids;
|
||||
+ pattern.swiglu = ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU;
|
||||
+ pattern.supported_type = down->src[0] != nullptr &&
|
||||
+ (down->src[0]->type == GGML_TYPE_NVFP4 || down->src[0]->type == GGML_TYPE_MXFP4);
|
||||
+ pattern.supported = pattern.ids_match && pattern.swiglu && pattern.supported_type;
|
||||
+
|
||||
+ return pattern;
|
||||
+}
|
||||
+
|
||||
+static ggml_cuda_moe_whole_pattern ggml_cuda_moe_whole_pattern_detect_early(const ggml_cgraph * cgraph, int i) {
|
||||
+ ggml_cuda_moe_whole_pattern pattern{};
|
||||
+
|
||||
+ if (cgraph == nullptr || i + 4 >= cgraph->n_nodes) {
|
||||
+ return pattern;
|
||||
+ }
|
||||
+
|
||||
+ const ggml_tensor * gate_up = cgraph->nodes[i + 0];
|
||||
+ const ggml_tensor * view0 = cgraph->nodes[i + 1];
|
||||
+ const ggml_tensor * view1 = cgraph->nodes[i + 2];
|
||||
+ const ggml_tensor * glu = cgraph->nodes[i + 3];
|
||||
+ const ggml_tensor * down = cgraph->nodes[i + 4];
|
||||
+
|
||||
+ pattern.gate_up = gate_up;
|
||||
+ pattern.glu = glu;
|
||||
+ pattern.down = down;
|
||||
+
|
||||
+ if (gate_up == nullptr || view0 == nullptr || view1 == nullptr || glu == nullptr || down == nullptr ||
|
||||
+ gate_up->op != GGML_OP_MUL_MAT_ID || view0->op != GGML_OP_VIEW || view1->op != GGML_OP_VIEW ||
|
||||
+ glu->op != GGML_OP_GLU || down->op != GGML_OP_MUL_MAT_ID) {
|
||||
+ return pattern;
|
||||
+ }
|
||||
+
|
||||
+ pattern.view_pair = view0->view_src == gate_up && view1->view_src == gate_up;
|
||||
+ if (!pattern.view_pair) {
|
||||
+ return pattern;
|
||||
+ }
|
||||
+
|
||||
+ if (glu->src[0] == view0 && glu->src[1] == view1) {
|
||||
+ pattern.gate = view0;
|
||||
+ pattern.up = view1;
|
||||
+ } else if (glu->src[0] == view1 && glu->src[1] == view0) {
|
||||
+ pattern.gate = view1;
|
||||
+ pattern.up = view0;
|
||||
+ } else {
|
||||
+ return pattern;
|
||||
+ }
|
||||
+
|
||||
+ if (down->src[1] != glu) {
|
||||
+ return pattern;
|
||||
+ }
|
||||
+
|
||||
+ pattern.ids = down->src[2];
|
||||
+ pattern.ids_match = gate_up->src[2] == pattern.ids;
|
||||
+ pattern.swiglu = ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU;
|
||||
+ pattern.supported_type = down->src[0] != nullptr &&
|
||||
+ (down->src[0]->type == GGML_TYPE_NVFP4 || down->src[0]->type == GGML_TYPE_MXFP4);
|
||||
+ pattern.supported = pattern.ids_match && pattern.swiglu && pattern.supported_type;
|
||||
+
|
||||
+ return pattern;
|
||||
+}
|
||||
+
|
||||
+static bool ggml_cuda_moe_whole_pattern_exec_proof(
|
||||
+ ggml_backend_cuda_context * cuda_ctx,
|
||||
+ const ggml_cuda_moe_whole_pattern & pattern) {
|
||||
+ GGML_ASSERT(cuda_ctx != nullptr);
|
||||
+ GGML_ASSERT(pattern.supported);
|
||||
+ GGML_ASSERT(pattern.gate_up != nullptr);
|
||||
+ GGML_ASSERT(pattern.glu != nullptr);
|
||||
+ GGML_ASSERT(pattern.down != nullptr);
|
||||
+
|
||||
+ if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast<ggml_tensor *>(pattern.gate_up))) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast<ggml_tensor *>(pattern.glu))) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast<ggml_tensor *>(pattern.down))) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
// try and fuse nodes and return the number of nodes to skip
|
||||
static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, int i) {
|
||||
|
||||
@@ -4042,6 +4238,112 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
|
||||
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
+ static std::atomic<int> moe_whole_pattern_early_trace_count{0};
|
||||
+ const bool routed_ffn_poc = ggml_cuda_moe_routed_ffn_poc_enabled();
|
||||
+ const bool whole_pattern_exec = ggml_cuda_moe_whole_pattern_exec_enabled();
|
||||
+ const int whole_pattern_early_trace_limit = ggml_cuda_moe_whole_pattern_early_trace_limit();
|
||||
+ if (node->op == GGML_OP_MUL_MAT_ID &&
|
||||
+ (routed_ffn_poc || whole_pattern_exec || whole_pattern_early_trace_limit > 0)) {
|
||||
+ const ggml_cuda_moe_whole_pattern pattern = ggml_cuda_moe_whole_pattern_detect_early(cgraph, i);
|
||||
+ if (pattern.view_pair) {
|
||||
+ const int trace_idx = moe_whole_pattern_early_trace_count.fetch_add(1, std::memory_order_relaxed);
|
||||
+ if (trace_idx < whole_pattern_early_trace_limit) {
|
||||
+ const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr;
|
||||
+ const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr;
|
||||
+ fprintf(stderr,
|
||||
+ "[LLAMA_MOE_WHOLE_PATTERN_EARLY] supported=%d skip_ready=%d gate_up=%s gate=%s up=%s glu=%s down=%s ids=%s type=%s"
|
||||
+ " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64
|
||||
+ " n_embd=%" PRId64 " n_ff=%" PRId64
|
||||
+ " ids_match=%d swiglu=%d\n",
|
||||
+ pattern.supported ? 1 : 0,
|
||||
+ pattern.supported ? 4 : 0,
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.up),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.glu),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.down),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.ids),
|
||||
+ down_w != nullptr ? ggml_type_name(down_w->type) : "-",
|
||||
+ down_x != nullptr ? down_x->ne[2] : 0,
|
||||
+ pattern.ids != nullptr ? pattern.ids->ne[0] : 0,
|
||||
+ down_w != nullptr ? down_w->ne[2] : 0,
|
||||
+ down_w != nullptr ? down_w->ne[1] : 0,
|
||||
+ down_w != nullptr ? down_w->ne[0] : 0,
|
||||
+ pattern.ids_match ? 1 : 0,
|
||||
+ pattern.swiglu ? 1 : 0);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
+ const bool poc_supported = routed_ffn_poc && ggml_cuda_moe_routed_ffn_poc_should_engage(
|
||||
+ pattern.gate_up, pattern.gate, pattern.up, pattern.glu, pattern.down, pattern.ids, cc);
|
||||
+
|
||||
+ if ((poc_supported || (whole_pattern_exec && pattern.supported))) {
|
||||
+ const bool ok = poc_supported ?
|
||||
+ ggml_cuda_moe_routed_ffn_poc(
|
||||
+ *cuda_ctx,
|
||||
+ const_cast<ggml_tensor *>(pattern.gate_up),
|
||||
+ const_cast<ggml_tensor *>(pattern.gate),
|
||||
+ const_cast<ggml_tensor *>(pattern.up),
|
||||
+ const_cast<ggml_tensor *>(pattern.glu),
|
||||
+ const_cast<ggml_tensor *>(pattern.down)) :
|
||||
+ ggml_cuda_moe_whole_pattern_exec_proof(cuda_ctx, pattern);
|
||||
+ GGML_ASSERT(ok);
|
||||
+
|
||||
+ static std::atomic<int> moe_whole_pattern_exec_trace_count{0};
|
||||
+ if (ggml_cuda_moe_whole_pattern_exec_trace_take(moe_whole_pattern_exec_trace_count)) {
|
||||
+ const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr;
|
||||
+ const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr;
|
||||
+ fprintf(stderr,
|
||||
+ "[LLAMA_MOE_WHOLE_PATTERN_EXEC] skip=4 gate_up=%s glu=%s down=%s ids=%s"
|
||||
+ " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64 "\n",
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.glu),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.down),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.ids),
|
||||
+ down_x != nullptr ? down_x->ne[2] : 0,
|
||||
+ pattern.ids != nullptr ? pattern.ids->ne[0] : 0,
|
||||
+ down_w != nullptr ? down_w->ne[2] : 0);
|
||||
+ }
|
||||
+
|
||||
+ return 4;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (node->op == GGML_OP_GLU && i + 1 < cgraph->n_nodes && cgraph->nodes[i + 1]->op == GGML_OP_MUL_MAT_ID) {
|
||||
+ static std::atomic<int> moe_whole_pattern_trace_count{0};
|
||||
+ const bool whole_trace = ggml_cuda_moe_whole_pattern_trace_take(moe_whole_pattern_trace_count);
|
||||
+
|
||||
+ if (whole_trace) {
|
||||
+ const ggml_tensor * down = cgraph->nodes[i + 1];
|
||||
+ const ggml_cuda_moe_whole_pattern pattern = ggml_cuda_moe_whole_pattern_detect(node, down);
|
||||
+
|
||||
+ const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr;
|
||||
+ const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr;
|
||||
+ fprintf(stderr,
|
||||
+ "[LLAMA_MOE_WHOLE_PATTERN] supported=%d gate_up=%s gate=%s up=%s glu=%s down=%s ids=%s type=%s"
|
||||
+ " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64
|
||||
+ " n_embd=%" PRId64 " n_ff=%" PRId64
|
||||
+ " view_pair=%d ids_match=%d swiglu=%d\n",
|
||||
+ pattern.supported ? 1 : 0,
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.up),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.glu),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.down),
|
||||
+ ggml_cuda_moe_wp_trace_tensor_name(pattern.ids),
|
||||
+ down_w != nullptr ? ggml_type_name(down_w->type) : "-",
|
||||
+ down_x != nullptr ? down_x->ne[2] : 0,
|
||||
+ pattern.ids != nullptr ? pattern.ids->ne[0] : 0,
|
||||
+ down_w != nullptr ? down_w->ne[2] : 0,
|
||||
+ down_w != nullptr ? down_w->ne[1] : 0,
|
||||
+ down_w != nullptr ? down_w->ne[0] : 0,
|
||||
+ pattern.view_pair ? 1 : 0,
|
||||
+ pattern.ids_match ? 1 : 0,
|
||||
+ pattern.swiglu ? 1 : 0);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
//topk-moe
|
||||
if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
|
||||
cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
|
||||
index dc5c2d198..d8f39d395 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cu
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cu
|
||||
@@ -1,4 +1,7 @@
|
||||
#include <cstdlib>
|
||||
+#include <atomic>
|
||||
+#include <cinttypes>
|
||||
+#include <cstdio>
|
||||
#include "common.cuh"
|
||||
#include "mmq.cuh"
|
||||
#include "quantize.cuh"
|
||||
@@ -75,6 +78,151 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
|
||||
}
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_quant_trace_limit() {
|
||||
+ static const int value = []() {
|
||||
+ const char * s = getenv("LLAMA_QUANT_TRACE");
|
||||
+ return s ? atoi(s) : 0;
|
||||
+ }();
|
||||
+
|
||||
+ return value;
|
||||
+}
|
||||
+
|
||||
+static inline const char * ggml_cuda_quant_trace_tensor_name(const ggml_tensor * t) {
|
||||
+ return t != nullptr && t->name[0] != '\0' ? t->name : "-";
|
||||
+}
|
||||
+
|
||||
+static inline void ggml_cuda_quant_trace(
|
||||
+ const char * route, const ggml_tensor * src0, const ggml_tensor * src1,
|
||||
+ const ggml_tensor * ids, const ggml_tensor * dst, const int native_fp4,
|
||||
+ const int dedup, const int gathered, const int64_t ne10, const int64_t ne10_padded,
|
||||
+ const int64_t rows, const int64_t ne12, const int64_t n_expert_used) {
|
||||
+ const int trace_limit = ggml_cuda_quant_trace_limit();
|
||||
+ if (trace_limit <= 0) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ static std::atomic<int> trace_count{0};
|
||||
+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
|
||||
+ if (trace_idx >= trace_limit) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ fprintf(stderr,
|
||||
+ "[LLAMA_QUANT_TRACE] route=%s src0=%s src0_type=%s src1=%s src1_type=%s dst=%s dst_type=%s ids=%s "
|
||||
+ "native_fp4=%d dedup=%d gathered=%d K=%" PRId64 " Kpad=%" PRId64 " rows=%" PRId64
|
||||
+ " ne12=%" PRId64 " experts=%" PRId64 "\n",
|
||||
+ route, ggml_cuda_quant_trace_tensor_name(src0), src0 != nullptr ? ggml_type_name(src0->type) : "-",
|
||||
+ ggml_cuda_quant_trace_tensor_name(src1), src1 != nullptr ? ggml_type_name(src1->type) : "-",
|
||||
+ ggml_cuda_quant_trace_tensor_name(dst), dst != nullptr ? ggml_type_name(dst->type) : "-",
|
||||
+ ggml_cuda_quant_trace_tensor_name(ids), native_fp4, dedup, gathered,
|
||||
+ ne10, ne10_padded, rows, ne12, n_expert_used);
|
||||
+}
|
||||
+
|
||||
+ggml_cuda_mmq_ids_meta::ggml_cuda_mmq_ids_meta(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts)
|
||||
+ : ne_get_rows(ne_get_rows) {
|
||||
+ alloc(pool, ne_get_rows, n_experts);
|
||||
+}
|
||||
+
|
||||
+void ggml_cuda_mmq_ids_meta::alloc(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts) {
|
||||
+ ids_src1.alloc(pool, ne_get_rows);
|
||||
+ ids_dst.alloc(pool, ne_get_rows);
|
||||
+ expert_bounds.alloc(pool, n_experts + 1);
|
||||
+ this->ne_get_rows = ne_get_rows;
|
||||
+}
|
||||
+
|
||||
+void ggml_cuda_mmq_ids_meta::build(
|
||||
+ const ggml_tensor * ids, int64_t n_experts, int64_t n_tokens,
|
||||
+ int64_t n_expert_used, int64_t nchannels_y, int64_t sis1,
|
||||
+ cudaStream_t stream) {
|
||||
+ GGML_ASSERT(ids->nb[0] == ggml_element_size(ids));
|
||||
+ const int si1 = ids->nb[1] / ggml_element_size(ids);
|
||||
+
|
||||
+ ggml_cuda_launch_mm_ids_helper(
|
||||
+ (const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
|
||||
+ n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
|
||||
+ CUDA_CHECK(cudaGetLastError());
|
||||
+}
|
||||
+
|
||||
+static void ggml_cuda_mul_mat_q_moe_quantized_impl(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst,
|
||||
+ const int32_t * ids_dst, const int32_t * expert_bounds,
|
||||
+ int64_t n_tokens, int64_t n_expert_used, int64_t n_experts,
|
||||
+ int64_t ncols_src1_padded) {
|
||||
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT( src1_q != nullptr);
|
||||
+ GGML_ASSERT( ids_dst != nullptr);
|
||||
+ GGML_ASSERT( expert_bounds != nullptr);
|
||||
+ GGML_ASSERT( n_tokens > 0);
|
||||
+ GGML_ASSERT( n_expert_used > 0);
|
||||
+ GGML_ASSERT( n_experts > 0);
|
||||
+ GGML_ASSERT( ncols_src1_padded > 0);
|
||||
+
|
||||
+ const int64_t ne00 = src0->ne[0];
|
||||
+ const int64_t ne01 = src0->ne[1];
|
||||
+ const int64_t ne02 = src0->ne[2];
|
||||
+ const int64_t ne03 = src0->ne[3];
|
||||
+ const int64_t ne0 = dst->ne[0];
|
||||
+ const int64_t ne1 = dst->ne[1];
|
||||
+ const int64_t ne2 = dst->ne[2];
|
||||
+ const int64_t ne3 = dst->ne[3];
|
||||
+ const int64_t nb00 = src0->nb[0];
|
||||
+ const int64_t nb01 = src0->nb[1];
|
||||
+ const int64_t nb02 = src0->nb[2];
|
||||
+ const int64_t nb03 = src0->nb[3];
|
||||
+ const int64_t nb0 = dst->nb[0];
|
||||
+ const int64_t nb1 = dst->nb[1];
|
||||
+ const int64_t nb2 = dst->nb[2];
|
||||
+ const int64_t nb3 = dst->nb[3];
|
||||
+
|
||||
+ const size_t ts_src0 = ggml_type_size(src0->type);
|
||||
+ const size_t ts_dst = ggml_type_size(dst->type);
|
||||
+
|
||||
+ GGML_ASSERT(nb00 == (int64_t) ts_src0);
|
||||
+ GGML_ASSERT(nb0 == (int64_t) ts_dst);
|
||||
+ GGML_ASSERT(ne00 <= ncols_src1_padded);
|
||||
+ GGML_ASSERT(ne01 == ne0);
|
||||
+ GGML_ASSERT(ne02 == n_experts);
|
||||
+ GGML_ASSERT(ne1 == n_expert_used);
|
||||
+ GGML_ASSERT(ne2 == n_tokens);
|
||||
+
|
||||
+ cudaStream_t stream = ctx.stream();
|
||||
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
+ const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|
||||
+ || GGML_CUDA_CC_IS_CDNA(cc);
|
||||
+
|
||||
+ const int64_t ne_get_rows = n_tokens * n_expert_used;
|
||||
+ const int64_t s01 = nb01 / ts_src0;
|
||||
+ const int64_t s1 = nb1 / ts_dst;
|
||||
+ const int64_t s02 = nb02 / ts_src0;
|
||||
+ const int64_t s2 = nb2 / ts_dst;
|
||||
+ const int64_t s03 = nb03 / ts_src0;
|
||||
+ const int64_t s3 = nb3 / ts_dst;
|
||||
+ const int64_t s12 = ne_get_rows * ncols_src1_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int));
|
||||
+ const int64_t s13 = n_tokens * s12;
|
||||
+
|
||||
+ const mmq_args args = {
|
||||
+ (const char *) src0->data, src0->type, (const int *) src1_q, ids_dst, expert_bounds, (float *) dst->data,
|
||||
+ ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
|
||||
+ n_experts, n_experts, s02, s12, s2,
|
||||
+ ne03, ne3, s03, s13, s3,
|
||||
+ use_stream_k, n_tokens};
|
||||
+
|
||||
+ ggml_cuda_quant_trace("mmq_moe_quantized_raw", src0, nullptr, nullptr, dst, 1,
|
||||
+ 0, 0, ne00, ncols_src1_padded, ne_get_rows, n_tokens, n_expert_used);
|
||||
+ ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
|
||||
+}
|
||||
+
|
||||
+void ggml_cuda_mul_mat_q_moe_quantized(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst,
|
||||
+ const int32_t * ids_dst, const int32_t * expert_bounds,
|
||||
+ int64_t n_tokens, int64_t n_expert_used, int64_t n_experts,
|
||||
+ int64_t ncols_src1_padded) {
|
||||
+ ggml_cuda_mul_mat_q_moe_quantized_impl(ctx, src0, src1_q, dst, ids_dst, expert_bounds,
|
||||
+ n_tokens, n_expert_used, n_experts, ncols_src1_padded);
|
||||
+}
|
||||
+
|
||||
void ggml_cuda_mul_mat_q(
|
||||
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
||||
GGML_ASSERT( src1->type == GGML_TYPE_F32);
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
||||
index b53e38a8b..690dc694a 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
||||
@@ -4334,6 +4334,31 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
|
||||
void ggml_cuda_mul_mat_q(
|
||||
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
|
||||
|
||||
+struct ggml_cuda_mmq_ids_meta {
|
||||
+ ggml_cuda_pool_alloc<int32_t> ids_src1;
|
||||
+ ggml_cuda_pool_alloc<int32_t> ids_dst;
|
||||
+ ggml_cuda_pool_alloc<int32_t> expert_bounds;
|
||||
+
|
||||
+ int64_t ne_get_rows = 0;
|
||||
+
|
||||
+ ggml_cuda_mmq_ids_meta() = default;
|
||||
+ ggml_cuda_mmq_ids_meta(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts);
|
||||
+
|
||||
+ void alloc(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts);
|
||||
+
|
||||
+ void build(
|
||||
+ const ggml_tensor * ids, int64_t n_experts, int64_t n_tokens,
|
||||
+ int64_t n_expert_used, int64_t nchannels_y, int64_t sis1,
|
||||
+ cudaStream_t stream);
|
||||
+};
|
||||
+
|
||||
+void ggml_cuda_mul_mat_q_moe_quantized(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst,
|
||||
+ const int32_t * ids_dst, const int32_t * expert_bounds,
|
||||
+ int64_t n_tokens, int64_t n_expert_used, int64_t n_experts,
|
||||
+ int64_t ncols_src1_padded);
|
||||
+
|
||||
void ggml_cuda_op_mul_mat_q(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
diff --git a/ggml/src/ggml-cuda/moe-ffn.cu b/ggml/src/ggml-cuda/moe-ffn.cu
|
||||
new file mode 100644
|
||||
index 000000000..507390d65
|
||||
--- /dev/null
|
||||
+++ b/ggml/src/ggml-cuda/moe-ffn.cu
|
||||
@@ -0,0 +1,296 @@
|
||||
+#include "moe-ffn.cuh"
|
||||
+#include "getrows.cuh"
|
||||
+#include "mmq.cuh"
|
||||
+#include "unary.cuh"
|
||||
+
|
||||
+#include <cstdlib>
|
||||
+
|
||||
+bool ggml_cuda_moe_routed_ffn_poc_enabled() {
|
||||
+ static const bool enabled = [] {
|
||||
+ const char * value = getenv("LLAMA_MOE_ROUTED_FFN_POC");
|
||||
+ return value != nullptr && atoi(value) != 0;
|
||||
+ }();
|
||||
+ return enabled;
|
||||
+}
|
||||
+
|
||||
+bool ggml_cuda_moe_routed_ffn_poc_should_engage(
|
||||
+ const ggml_tensor * gate_up,
|
||||
+ const ggml_tensor * gate,
|
||||
+ const ggml_tensor * up,
|
||||
+ const ggml_tensor * glu,
|
||||
+ const ggml_tensor * down,
|
||||
+ const ggml_tensor * ids,
|
||||
+ int cc) {
|
||||
+ if (!blackwell_mma_available(cc)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (gate_up == nullptr || gate == nullptr || up == nullptr || glu == nullptr || down == nullptr || ids == nullptr) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (gate_up->op != GGML_OP_MUL_MAT_ID || down->op != GGML_OP_MUL_MAT_ID) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (gate->op != GGML_OP_VIEW || up->op != GGML_OP_VIEW || gate->view_src != gate_up || up->view_src != gate_up) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (glu->op != GGML_OP_GLU || ggml_get_glu_op(glu) != GGML_GLU_OP_SWIGLU || down->src[1] != glu) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (gate_up->src[2] != ids || down->src[2] != ids) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ const ggml_tensor * down_w = down->src[0];
|
||||
+ if (down_w == nullptr || (down_w->type != GGML_TYPE_NVFP4 && down_w->type != GGML_TYPE_MXFP4)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool ggml_cuda_moe_routed_ffn_fused_quant_enabled() {
|
||||
+ static const bool enabled = [] {
|
||||
+ const char * value = getenv("LLAMA_MOE_ROUTED_FFN_FUSED_QUANT");
|
||||
+ return value != nullptr && atoi(value) != 0;
|
||||
+ }();
|
||||
+ return enabled;
|
||||
+}
|
||||
+
|
||||
+static bool ggml_cuda_moe_routed_ffn_down_supported(const ggml_tensor * glu, const ggml_tensor * down) {
|
||||
+ const ggml_tensor * down_w = down != nullptr ? down->src[0] : nullptr;
|
||||
+ const ggml_tensor * ids = down != nullptr ? down->src[2] : nullptr;
|
||||
+ if (glu == nullptr || down == nullptr || down_w == nullptr || ids == nullptr) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (down_w->type != GGML_TYPE_NVFP4 && down_w->type != GGML_TYPE_MXFP4) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (glu->type != GGML_TYPE_F32 || down->type != GGML_TYPE_F32 || ids->type != GGML_TYPE_I32) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (glu->ne[3] != 1 || down->ne[3] != 1 || ids->ne[2] != 1 || ids->ne[3] != 1) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (down_w->ne[0] != glu->ne[0] || down_w->ne[1] != down->ne[0] || down_w->ne[2] <= 0) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (ids->ne[0] != glu->ne[1] || ids->ne[1] != glu->ne[2]) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (down->ne[1] != glu->ne[1] || down->ne[2] != glu->ne[2]) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (ids->nb[0] != ggml_element_size(ids)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (glu->nb[0] != sizeof(float) || down->nb[0] != sizeof(float)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (glu->nb[1] != (size_t) (glu->ne[0] * (int64_t) sizeof(float)) ||
|
||||
+ glu->nb[2] != (size_t) (glu->ne[1] * (int64_t) glu->nb[1])) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (down->nb[1] != (size_t) (down->ne[0] * (int64_t) sizeof(float)) ||
|
||||
+ down->nb[2] != (size_t) (down->ne[1] * (int64_t) down->nb[1])) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
+ return ggml_cuda_should_use_mmq(down_w->type, cc, glu->ne[2], down_w->ne[2]);
|
||||
+}
|
||||
+
|
||||
+static __global__ void moe_swiglu_nvfp4_quant_kernel(
|
||||
+ const float * __restrict__ gate,
|
||||
+ const float * __restrict__ up,
|
||||
+ const int32_t * __restrict__ ids_src1,
|
||||
+ void * __restrict__ vy,
|
||||
+ int64_t n_ff,
|
||||
+ int64_t n_ff_padded,
|
||||
+ int64_t n_rows,
|
||||
+ int64_t n_used,
|
||||
+ int64_t gate_s1,
|
||||
+ int64_t gate_s2,
|
||||
+ int64_t up_s1,
|
||||
+ int64_t up_s2) {
|
||||
+#if defined(BLACKWELL_MMA_AVAILABLE)
|
||||
+ const int64_t i0_base = ((int64_t) blockDim.x * blockIdx.y + threadIdx.x) * QK_NVFP4_SUB;
|
||||
+ if (i0_base >= n_ff_padded) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ const int64_t row = blockIdx.x;
|
||||
+ const int64_t src_row = ids_src1[row];
|
||||
+ const int64_t token = src_row / n_used;
|
||||
+ const int64_t used = src_row - token * n_used;
|
||||
+ const int64_t k_block = i0_base / QK_K;
|
||||
+ const int64_t blocks_per_col = (n_ff_padded + QK_K - 1) / QK_K;
|
||||
+ if (k_block >= blocks_per_col) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ block_fp4_mmq * y = (block_fp4_mmq *) vy;
|
||||
+ block_fp4_mmq * yb = y + k_block * n_rows + row;
|
||||
+ const int sub = (i0_base % QK_K) / QK_NVFP4_SUB;
|
||||
+
|
||||
+ float vals_raw[QK_NVFP4_SUB];
|
||||
+ float amax_raw = 0.0f;
|
||||
+#pragma unroll
|
||||
+ for (int k = 0; k < QK_NVFP4_SUB; k++) {
|
||||
+ const int64_t i0 = i0_base + k;
|
||||
+ if (i0 < n_ff) {
|
||||
+ const float g = gate[token * gate_s2 + used * gate_s1 + i0];
|
||||
+ const float u = up[token * up_s2 + used * up_s1 + i0];
|
||||
+ const float v = ggml_cuda_op_silu_single(g) * u;
|
||||
+ vals_raw[k] = v;
|
||||
+ amax_raw = fmaxf(amax_raw, fabsf(v));
|
||||
+ } else {
|
||||
+ vals_raw[k] = 0.0f;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2 };
|
||||
+ const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
|
||||
+
|
||||
+ float best_err = FLT_MAX;
|
||||
+ uint8_t fp8_code = 0;
|
||||
+ float subblock_scale = 0.0f;
|
||||
+
|
||||
+#pragma unroll
|
||||
+ for (int i = 0; i < 5; i++) {
|
||||
+ const int test_code = first_fp8_code + test_offsets[i];
|
||||
+ if (test_code < 0 || test_code > 0x7e) {
|
||||
+ continue;
|
||||
+ }
|
||||
+ const uint8_t code = (uint8_t) test_code;
|
||||
+ const float test_scale = ggml_cuda_ue4m3_to_fp32(code);
|
||||
+ const float test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f;
|
||||
+ float cur_err = 0.0f;
|
||||
+#pragma unroll
|
||||
+ for (int k = 0; k < QK_NVFP4_SUB; ++k) {
|
||||
+ const float v = vals_raw[k];
|
||||
+ const uint8_t q = ggml_cuda_float_to_fp4_e2m1(v, test_inv_scale);
|
||||
+ const float err_diff = fabsf(v) - fabsf(kvalues_mxfp4[q & 0x7]) * test_scale;
|
||||
+ cur_err = fmaf(err_diff, err_diff, cur_err);
|
||||
+ }
|
||||
+
|
||||
+ if (cur_err < best_err) {
|
||||
+ best_err = cur_err;
|
||||
+ fp8_code = test_code;
|
||||
+ subblock_scale = test_scale;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f;
|
||||
+ uint32_t q0 = 0;
|
||||
+ uint32_t q1 = 0;
|
||||
+#pragma unroll
|
||||
+ for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) {
|
||||
+ q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 0], inv_scale) << (8 * k);
|
||||
+ q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 8], inv_scale) << (8 * k + 4);
|
||||
+ q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 4], inv_scale) << (8 * k);
|
||||
+ q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 12], inv_scale) << (8 * k + 4);
|
||||
+ }
|
||||
+
|
||||
+ uint32_t * yqs = reinterpret_cast<uint32_t *>(yb->qs);
|
||||
+ yqs[2 * sub + 0] = q0;
|
||||
+ yqs[2 * sub + 1] = q1;
|
||||
+ reinterpret_cast<uint8_t *>(yb->d4)[sub] = fp8_code;
|
||||
+#else
|
||||
+ NO_DEVICE_CODE;
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+static bool ggml_cuda_moe_routed_ffn_fused_quant(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ ggml_tensor * gate,
|
||||
+ ggml_tensor * up,
|
||||
+ ggml_tensor * glu,
|
||||
+ ggml_tensor * down) {
|
||||
+ if (!ggml_cuda_moe_routed_ffn_fused_quant_enabled()) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (!ggml_cuda_moe_routed_ffn_down_supported(glu, down)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ const ggml_tensor * down_w = down->src[0];
|
||||
+ const ggml_tensor * ids = down->src[2];
|
||||
+ if (down_w->type != GGML_TYPE_NVFP4) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (gate == nullptr || up == nullptr || gate->type != GGML_TYPE_F32 || up->type != GGML_TYPE_F32) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (gate->ne[0] != glu->ne[0] || gate->ne[1] != glu->ne[1] || gate->ne[2] != glu->ne[2] || gate->ne[3] != glu->ne[3]) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (up->ne[0] != glu->ne[0] || up->ne[1] != glu->ne[1] || up->ne[2] != glu->ne[2] || up->ne[3] != glu->ne[3]) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (gate->nb[0] != sizeof(float) || up->nb[0] != sizeof(float)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
+ if (!blackwell_mma_available(cc)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ const int64_t n_ff = glu->ne[0];
|
||||
+ const int64_t n_ff_padded = GGML_PAD(n_ff, MATRIX_ROW_PADDING);
|
||||
+ if (n_ff % QK_NVFP4 != 0) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ const int64_t n_expert_used = ids->ne[0];
|
||||
+ const int64_t n_tokens = glu->ne[2];
|
||||
+ const int64_t n_experts = down_w->ne[2];
|
||||
+ const int64_t ne_get_rows = n_tokens * n_expert_used;
|
||||
+
|
||||
+ ggml_cuda_mmq_ids_meta ids_meta(ctx.pool(), ne_get_rows, n_experts);
|
||||
+ const int64_t sis1 = glu->nb[2] / glu->nb[1];
|
||||
+ ids_meta.build(ids, n_experts, n_tokens, n_expert_used, glu->ne[1], sis1, ctx.stream());
|
||||
+
|
||||
+ const size_t nbytes_src1_q = ne_get_rows * n_ff_padded * sizeof(block_fp4_mmq) / QK_K +
|
||||
+ get_mmq_x_max_host(cc) * sizeof(block_q8_1_mmq);
|
||||
+ ggml_cuda_pool_alloc<char> src1_q(ctx.pool(), nbytes_src1_q);
|
||||
+
|
||||
+ constexpr int nvfp4_block_size = 128;
|
||||
+ const int64_t block_num_y = (n_ff_padded + QK_NVFP4_SUB * nvfp4_block_size - 1) / (QK_NVFP4_SUB * nvfp4_block_size);
|
||||
+ const dim3 block_size(nvfp4_block_size, 1, 1);
|
||||
+ const dim3 num_blocks(ne_get_rows, block_num_y, 1);
|
||||
+ moe_swiglu_nvfp4_quant_kernel<<<num_blocks, block_size, 0, ctx.stream()>>>(
|
||||
+ (const float *) gate->data, (const float *) up->data, ids_meta.ids_src1.get(), src1_q.get(),
|
||||
+ n_ff, n_ff_padded, ne_get_rows, n_expert_used,
|
||||
+ gate->nb[1] / sizeof(float), gate->nb[2] / sizeof(float),
|
||||
+ up->nb[1] / sizeof(float), up->nb[2] / sizeof(float));
|
||||
+ CUDA_CHECK(cudaGetLastError());
|
||||
+
|
||||
+ ggml_cuda_mul_mat_q_moe_quantized(
|
||||
+ ctx, down_w, src1_q.get(), down,
|
||||
+ ids_meta.ids_dst.get(), ids_meta.expert_bounds.get(),
|
||||
+ n_tokens, n_expert_used, n_experts, n_ff_padded);
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+bool ggml_cuda_moe_routed_ffn_poc(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ ggml_tensor * gate_up,
|
||||
+ ggml_tensor * gate,
|
||||
+ ggml_tensor * up,
|
||||
+ ggml_tensor * glu,
|
||||
+ ggml_tensor * down) {
|
||||
+ if (!ggml_cuda_compute_forward(ctx, gate_up)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (ggml_cuda_moe_routed_ffn_fused_quant(ctx, gate, up, glu, down)) {
|
||||
+ return true;
|
||||
+ }
|
||||
+ if (!ggml_cuda_compute_forward(ctx, glu)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+ if (!ggml_cuda_compute_forward(ctx, down)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/moe-ffn.cuh b/ggml/src/ggml-cuda/moe-ffn.cuh
|
||||
new file mode 100644
|
||||
index 000000000..ce385d31a
|
||||
--- /dev/null
|
||||
+++ b/ggml/src/ggml-cuda/moe-ffn.cuh
|
||||
@@ -0,0 +1,24 @@
|
||||
+#pragma once
|
||||
+
|
||||
+#include "common.cuh"
|
||||
+
|
||||
+bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
+
|
||||
+bool ggml_cuda_moe_routed_ffn_poc_enabled();
|
||||
+
|
||||
+bool ggml_cuda_moe_routed_ffn_poc_should_engage(
|
||||
+ const ggml_tensor * gate_up,
|
||||
+ const ggml_tensor * gate,
|
||||
+ const ggml_tensor * up,
|
||||
+ const ggml_tensor * glu,
|
||||
+ const ggml_tensor * down,
|
||||
+ const ggml_tensor * ids,
|
||||
+ int cc);
|
||||
+
|
||||
+bool ggml_cuda_moe_routed_ffn_poc(
|
||||
+ ggml_backend_cuda_context & ctx,
|
||||
+ ggml_tensor * gate_up,
|
||||
+ ggml_tensor * gate,
|
||||
+ ggml_tensor * up,
|
||||
+ ggml_tensor * glu,
|
||||
+ ggml_tensor * down);
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
From fb9402661291e0488a3e2bf2f3948ebcd18e18c9 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 02:41:22 +0000
|
||||
Subject: [PATCH] feat(server): trace speculative batch shapes
|
||||
|
||||
Add an env-gated LLAMA_SPEC_SHAPE_TRACE log around the server batch rows emitted by normal decode and speculative verification slots. This keeps the instrumentation default-off while exposing the row/output shape entropy that prevents CUDA graph reuse under MTP serving.
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
tools/server/server-context.cpp | 20 ++++++++++++++++++--
|
||||
1 file changed, 18 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
||||
index a77e2676d..fd8348af6 100644
|
||||
--- a/tools/server/server-context.cpp
|
||||
+++ b/tools/server/server-context.cpp
|
||||
@@ -457,12 +457,22 @@ struct server_slot {
|
||||
|
||||
// add sampled token of this slot to the batch, optionally add the speculative draft tokens if any
|
||||
void handle_last_sampled_token(server_batch & batch) {
|
||||
+ static const bool spec_shape_trace = getenv("LLAMA_SPEC_SHAPE_TRACE") != nullptr;
|
||||
+ const int32_t batch_before = batch.size();
|
||||
+
|
||||
bool add_ok = true;
|
||||
if (spec_draft.empty()) {
|
||||
// no speculative decoding
|
||||
- i_batch = batch.size();
|
||||
+ i_batch = batch_before;
|
||||
+
|
||||
+ const int32_t pos0 = prompt.tokens.pos_next();
|
||||
+
|
||||
+ add_ok &= batch.add(id, sampled, pos0, true);
|
||||
|
||||
- add_ok &= batch.add(id, sampled, prompt.tokens.pos_next(), true);
|
||||
+ if (spec_shape_trace) {
|
||||
+ SLT_INF(*this, "spec shape: kind=decode batch_before=%d rows=1 outputs=1 draft=0 pos0=%d slot_tokens=%zu\n",
|
||||
+ batch_before, pos0, prompt.tokens.size());
|
||||
+ }
|
||||
|
||||
SLT_DBG(*this, "slot decode token, id=%d, n_ctx = %d, n_tokens = %d, truncated = %d\n",
|
||||
sampled, n_ctx, prompt.n_tokens(), truncated);
|
||||
@@ -479,6 +489,12 @@ struct server_slot {
|
||||
|
||||
auto pos0 = prompt.tokens.pos_next();
|
||||
|
||||
+ if (spec_shape_trace) {
|
||||
+ SLT_INF(*this, "spec shape: kind=verify batch_before=%d rows=%zu outputs=%zu draft=%zu spec_i_first=%d spec_i_last=%d pos0=%d slot_tokens=%zu\n",
|
||||
+ batch_before, spec_draft.size() + 1, spec_draft.size() + 1, spec_draft.size(),
|
||||
+ spec_i_batch.front(), spec_i_batch.back(), pos0, prompt.tokens.size());
|
||||
+ }
|
||||
+
|
||||
add_ok &= batch.add(id, sampled, pos0++, true);
|
||||
for (auto token : spec_draft) {
|
||||
add_ok &= batch.add(this->id, token, pos0++, true);
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -1,212 +0,0 @@
|
||||
From 20a99518a39acbb4474fa9c97121fc7b9f07c1ef Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 04:27:19 +0000
|
||||
Subject: [PATCH] feat(cuda): trace moe mmq batch shapes
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/mmq-shape-trace.h | 66 ++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/mmq.cuh | 31 ++++++++++++-
|
||||
tests/CMakeLists.txt | 2 +
|
||||
tests/test-cuda-mmq-shape-trace.cpp | 42 ++++++++++++++++++
|
||||
4 files changed, 140 insertions(+), 1 deletion(-)
|
||||
create mode 100644 ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
create mode 100644 tests/test-cuda-mmq-shape-trace.cpp
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
new file mode 100644
|
||||
index 000000000..9d41b7c80
|
||||
--- /dev/null
|
||||
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
@@ -0,0 +1,66 @@
|
||||
+#pragma once
|
||||
+
|
||||
+#include <cstddef>
|
||||
+#include <cstdint>
|
||||
+#include <cstdio>
|
||||
+
|
||||
+struct ggml_cuda_mmq_shape {
|
||||
+ int type;
|
||||
+ bool is_moe;
|
||||
+ int64_t ncols_dst;
|
||||
+ int64_t nchannels_x;
|
||||
+ int64_t ncols_max;
|
||||
+ int64_t n_active_est;
|
||||
+ int64_t density;
|
||||
+ int mmq_x_max;
|
||||
+ int mmq_x_lim;
|
||||
+ int mmq_x_best;
|
||||
+ int mmq_y;
|
||||
+ bool use_stream_k;
|
||||
+};
|
||||
+
|
||||
+static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
+ const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
|
||||
+ const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
|
||||
+ const int mmq_y, const bool use_stream_k) {
|
||||
+ int64_t n_active_est = 0;
|
||||
+ int64_t density = 0;
|
||||
+ if (is_moe && ncols_dst > 0 && nchannels_x > 0) {
|
||||
+ n_active_est = ncols_dst < nchannels_x ? ncols_dst : nchannels_x;
|
||||
+ density = (ncols_dst + n_active_est - 1) / n_active_est;
|
||||
+ }
|
||||
+
|
||||
+ return {
|
||||
+ type,
|
||||
+ is_moe,
|
||||
+ ncols_dst,
|
||||
+ nchannels_x,
|
||||
+ ncols_max,
|
||||
+ n_active_est,
|
||||
+ density,
|
||||
+ mmq_x_max,
|
||||
+ mmq_x_lim,
|
||||
+ mmq_x_best,
|
||||
+ mmq_y,
|
||||
+ use_stream_k,
|
||||
+ };
|
||||
+}
|
||||
+
|
||||
+static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, const ggml_cuda_mmq_shape & shape) {
|
||||
+ return std::snprintf(buf, size,
|
||||
+ "type=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld "
|
||||
+ "n_active_est=%lld density=%lld mmq_x_max=%d mmq_x_lim=%d "
|
||||
+ "mmq_x_best=%d mmq_y=%d stream_k=%d",
|
||||
+ shape.type,
|
||||
+ shape.is_moe ? 1 : 0,
|
||||
+ (long long) shape.ncols_dst,
|
||||
+ (long long) shape.nchannels_x,
|
||||
+ (long long) shape.ncols_max,
|
||||
+ (long long) shape.n_active_est,
|
||||
+ (long long) shape.density,
|
||||
+ shape.mmq_x_max,
|
||||
+ shape.mmq_x_lim,
|
||||
+ shape.mmq_x_best,
|
||||
+ shape.mmq_y,
|
||||
+ shape.use_stream_k ? 1 : 0);
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
||||
index b53e38a8b..6bc943738 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
||||
@@ -3,10 +3,14 @@
|
||||
#include "common.cuh"
|
||||
#include "vecdotq.cuh"
|
||||
#include "mma.cuh"
|
||||
+#include "mmq-shape-trace.h"
|
||||
|
||||
+#include <atomic>
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
+#include <cstdio>
|
||||
#include <cstdlib>
|
||||
+#include <cstring>
|
||||
|
||||
using namespace ggml_cuda_mma;
|
||||
|
||||
@@ -4163,6 +4167,18 @@ static inline int ggml_cuda_fp4_dense_mmq_x_cap() {
|
||||
return c;
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_moe_mmq_shape_trace_limit() {
|
||||
+ static const int limit = []() -> int {
|
||||
+ const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE");
|
||||
+ if (s == nullptr || strcmp(s, "0") == 0) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ const int parsed = atoi(s);
|
||||
+ return parsed > 0 ? parsed : 256;
|
||||
+ }();
|
||||
+ return limit;
|
||||
+}
|
||||
+
|
||||
template <ggml_type type>
|
||||
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
@@ -4249,6 +4265,20 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
}
|
||||
}
|
||||
|
||||
+ if (args.expert_bounds != nullptr) {
|
||||
+ static std::atomic<int> trace_count{0};
|
||||
+ const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit();
|
||||
+ const int trace_index = trace_limit > 0 ? trace_count.fetch_add(1, std::memory_order_relaxed) : trace_limit;
|
||||
+ if (trace_index >= 0 && trace_index < trace_limit) {
|
||||
+ char buf[256];
|
||||
+ const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make(
|
||||
+ (int) type, true, args.ncols_dst, args.nchannels_x, args.ncols_max,
|
||||
+ mmq_x_max, mmq_x_lim, mmq_x_best, mmq_y, args.use_stream_k);
|
||||
+ ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
|
||||
+ fprintf(stderr, "[LLAMA_MOE_MMQ_SHAPE] %s\n", buf);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
switch (mmq_x_best) {
|
||||
case 8:
|
||||
launch_mul_mat_q<type, 8>(ctx, args, stream);
|
||||
@@ -4341,4 +4371,3 @@ void ggml_cuda_op_mul_mat_q(
|
||||
const int64_t src1_padded_row_size, cudaStream_t stream);
|
||||
|
||||
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts);
|
||||
-
|
||||
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
|
||||
index 24592a279..0a5194c87 100644
|
||||
--- a/tests/CMakeLists.txt
|
||||
+++ b/tests/CMakeLists.txt
|
||||
@@ -234,6 +234,8 @@ llama_build_and_test(test-thread-safety.cpp ARGS -m "${MODEL_DEST}" -ngl 99 -p "
|
||||
set_tests_properties(test-thread-safety PROPERTIES FIXTURES_REQUIRED test-download-model)
|
||||
|
||||
llama_build_and_test(test-arg-parser.cpp)
|
||||
+llama_build_and_test(test-cuda-mmq-shape-trace.cpp)
|
||||
+target_include_directories(test-cuda-mmq-shape-trace PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
||||
|
||||
if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
|
||||
# TODO: repair known memory leaks
|
||||
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
new file mode 100644
|
||||
index 000000000..8620169c0
|
||||
--- /dev/null
|
||||
+++ b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
@@ -0,0 +1,42 @@
|
||||
+#include "ggml-cuda/mmq-shape-trace.h"
|
||||
+
|
||||
+#include <cstdio>
|
||||
+#include <cstdlib>
|
||||
+#include <cstring>
|
||||
+
|
||||
+static void require(bool ok, const char * what) {
|
||||
+ if (!ok) {
|
||||
+ std::fprintf(stderr, "require failed: %s\n", what);
|
||||
+ std::exit(1);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+int main() {
|
||||
+ const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make(
|
||||
+ /* type */ 39,
|
||||
+ /* is_moe */ true,
|
||||
+ /* ncols_dst */ 1024,
|
||||
+ /* nchannels_x */ 256,
|
||||
+ /* ncols_max */ 128,
|
||||
+ /* mmq_x_max */ 128,
|
||||
+ /* mmq_x_lim */ 64,
|
||||
+ /* mmq_x_best */ 64,
|
||||
+ /* mmq_y */ 128,
|
||||
+ /* use_stream_k */ true);
|
||||
+
|
||||
+ require(shape.n_active_est == 256, "active expert estimate is capped by expert count");
|
||||
+ require(shape.density == 4, "density is ceil(assignments / active experts)");
|
||||
+
|
||||
+ char buf[256];
|
||||
+ const int n = ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
|
||||
+
|
||||
+ require(n > 0, "format returns byte count");
|
||||
+ require(std::strstr(buf, "moe=1") != nullptr, "trace includes moe flag");
|
||||
+ require(std::strstr(buf, "ncols_dst=1024") != nullptr, "trace includes routed assignment count");
|
||||
+ require(std::strstr(buf, "n_active_est=256") != nullptr, "trace includes active estimate");
|
||||
+ require(std::strstr(buf, "density=4") != nullptr, "trace includes density");
|
||||
+ require(std::strstr(buf, "mmq_x_best=64") != nullptr, "trace includes selected tile");
|
||||
+ require(std::strstr(buf, "stream_k=1") != nullptr, "trace includes stream-k flag");
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
@@ -1,223 +0,0 @@
|
||||
From c78e537b56e3446f8aa645c6700aacf263639bd8 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 04:49:55 +0000
|
||||
Subject: [PATCH] feat(cuda): trace moe mmq launch shapes
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/mmq-shape-trace.h | 65 ++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/mmq.cuh | 47 +++++++++++++-------
|
||||
tests/test-cuda-mmq-shape-trace.cpp | 35 +++++++++++++++
|
||||
3 files changed, 132 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
index 9d41b7c80..98bc21f7f 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
@@ -19,6 +19,24 @@ struct ggml_cuda_mmq_shape {
|
||||
bool use_stream_k;
|
||||
};
|
||||
|
||||
+struct ggml_cuda_mmq_launch_shape {
|
||||
+ int type;
|
||||
+ bool is_moe;
|
||||
+ int64_t ncols_dst;
|
||||
+ int64_t ncols_max;
|
||||
+ int mmq_x;
|
||||
+ int mmq_y;
|
||||
+ int ntx;
|
||||
+ int nty;
|
||||
+ int ntzw;
|
||||
+ int ntiles_dst;
|
||||
+ int nsm;
|
||||
+ int tiles_nwaves;
|
||||
+ int tiles_efficiency_percent;
|
||||
+ int stream_k_blocks;
|
||||
+ bool fixup_needed;
|
||||
+};
|
||||
+
|
||||
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
|
||||
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
|
||||
@@ -46,6 +64,30 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
};
|
||||
}
|
||||
|
||||
+static inline ggml_cuda_mmq_launch_shape ggml_cuda_mmq_launch_shape_make(
|
||||
+ const int type, const bool is_moe, const int64_t ncols_dst, const int64_t ncols_max,
|
||||
+ const int mmq_x, const int mmq_y, const int ntx, const int nty, const int ntzw,
|
||||
+ const int ntiles_dst, const int nsm, const int tiles_nwaves, const int tiles_efficiency_percent,
|
||||
+ const int stream_k_blocks, const bool fixup_needed) {
|
||||
+ return {
|
||||
+ type,
|
||||
+ is_moe,
|
||||
+ ncols_dst,
|
||||
+ ncols_max,
|
||||
+ mmq_x,
|
||||
+ mmq_y,
|
||||
+ ntx,
|
||||
+ nty,
|
||||
+ ntzw,
|
||||
+ ntiles_dst,
|
||||
+ nsm,
|
||||
+ tiles_nwaves,
|
||||
+ tiles_efficiency_percent,
|
||||
+ stream_k_blocks,
|
||||
+ fixup_needed,
|
||||
+ };
|
||||
+}
|
||||
+
|
||||
static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, const ggml_cuda_mmq_shape & shape) {
|
||||
return std::snprintf(buf, size,
|
||||
"type=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld "
|
||||
@@ -64,3 +106,26 @@ static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, cons
|
||||
shape.mmq_y,
|
||||
shape.use_stream_k ? 1 : 0);
|
||||
}
|
||||
+
|
||||
+static inline int ggml_cuda_mmq_launch_shape_format(
|
||||
+ char * buf, const size_t size, const ggml_cuda_mmq_launch_shape & shape) {
|
||||
+ return std::snprintf(buf, size,
|
||||
+ "type=%d moe=%d ncols_dst=%lld ncols_max=%lld mmq_x=%d mmq_y=%d "
|
||||
+ "ntx=%d nty=%d ntzw=%d ntiles_dst=%d nsm=%d tiles_nwaves=%d "
|
||||
+ "tiles_efficiency=%d stream_k_blocks=%d fixup=%d",
|
||||
+ shape.type,
|
||||
+ shape.is_moe ? 1 : 0,
|
||||
+ (long long) shape.ncols_dst,
|
||||
+ (long long) shape.ncols_max,
|
||||
+ shape.mmq_x,
|
||||
+ shape.mmq_y,
|
||||
+ shape.ntx,
|
||||
+ shape.nty,
|
||||
+ shape.ntzw,
|
||||
+ shape.ntiles_dst,
|
||||
+ shape.nsm,
|
||||
+ shape.tiles_nwaves,
|
||||
+ shape.tiles_efficiency_percent,
|
||||
+ shape.stream_k_blocks,
|
||||
+ shape.fixup_needed ? 1 : 0);
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
||||
index 6bc943738..34002edf7 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
||||
@@ -3989,6 +3989,24 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
|
||||
return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_moe_mmq_shape_trace_limit() {
|
||||
+ static const int limit = []() -> int {
|
||||
+ const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE");
|
||||
+ if (s == nullptr || strcmp(s, "0") == 0) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ const int parsed = atoi(s);
|
||||
+ return parsed > 0 ? parsed : 256;
|
||||
+ }();
|
||||
+ return limit;
|
||||
+}
|
||||
+
|
||||
+static inline bool ggml_cuda_moe_mmq_trace_take(std::atomic<int> & counter) {
|
||||
+ const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit();
|
||||
+ const int trace_index = trace_limit > 0 ? counter.fetch_add(1, std::memory_order_relaxed) : trace_limit;
|
||||
+ return trace_index >= 0 && trace_index < trace_limit;
|
||||
+}
|
||||
+
|
||||
template <ggml_type type, int mmq_x>
|
||||
static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
@@ -4054,6 +4072,19 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
|
||||
const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;
|
||||
|
||||
+ if (args.expert_bounds != nullptr) {
|
||||
+ static std::atomic<int> trace_count{0};
|
||||
+ if (ggml_cuda_moe_mmq_trace_take(trace_count)) {
|
||||
+ char buf[256];
|
||||
+ const ggml_cuda_mmq_launch_shape shape = ggml_cuda_mmq_launch_shape_make(
|
||||
+ (int) type, true, args.ncols_dst, args.ncols_max, mmq_x, mmq_y,
|
||||
+ ntx, nty, ntzw, ntiles_dst, nsm, tiles_nwaves, tiles_efficiency_percent,
|
||||
+ block_nums_stream_k.x, fixup_needed);
|
||||
+ ggml_cuda_mmq_launch_shape_format(buf, sizeof(buf), shape);
|
||||
+ fprintf(stderr, "[LLAMA_MOE_MMQ_LAUNCH] %s\n", buf);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
ggml_cuda_pool & pool = ctx.pool(id);
|
||||
ggml_cuda_pool_alloc<float> tmp_fixup(pool);
|
||||
if (fixup_needed) {
|
||||
@@ -4167,18 +4198,6 @@ static inline int ggml_cuda_fp4_dense_mmq_x_cap() {
|
||||
return c;
|
||||
}
|
||||
|
||||
-static inline int ggml_cuda_moe_mmq_shape_trace_limit() {
|
||||
- static const int limit = []() -> int {
|
||||
- const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE");
|
||||
- if (s == nullptr || strcmp(s, "0") == 0) {
|
||||
- return 0;
|
||||
- }
|
||||
- const int parsed = atoi(s);
|
||||
- return parsed > 0 ? parsed : 256;
|
||||
- }();
|
||||
- return limit;
|
||||
-}
|
||||
-
|
||||
template <ggml_type type>
|
||||
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
@@ -4267,9 +4286,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
|
||||
if (args.expert_bounds != nullptr) {
|
||||
static std::atomic<int> trace_count{0};
|
||||
- const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit();
|
||||
- const int trace_index = trace_limit > 0 ? trace_count.fetch_add(1, std::memory_order_relaxed) : trace_limit;
|
||||
- if (trace_index >= 0 && trace_index < trace_limit) {
|
||||
+ if (ggml_cuda_moe_mmq_trace_take(trace_count)) {
|
||||
char buf[256];
|
||||
const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make(
|
||||
(int) type, true, args.ncols_dst, args.nchannels_x, args.ncols_max,
|
||||
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
index 8620169c0..86ee15e02 100644
|
||||
--- a/tests/test-cuda-mmq-shape-trace.cpp
|
||||
+++ b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
@@ -38,5 +38,40 @@ int main() {
|
||||
require(std::strstr(buf, "mmq_x_best=64") != nullptr, "trace includes selected tile");
|
||||
require(std::strstr(buf, "stream_k=1") != nullptr, "trace includes stream-k flag");
|
||||
|
||||
+ const ggml_cuda_mmq_launch_shape launch = ggml_cuda_mmq_launch_shape_make(
|
||||
+ /* type */ 39,
|
||||
+ /* is_moe */ true,
|
||||
+ /* ncols_dst */ 1024,
|
||||
+ /* ncols_max */ 128,
|
||||
+ /* mmq_x */ 64,
|
||||
+ /* mmq_y */ 128,
|
||||
+ /* ntx */ 2,
|
||||
+ /* nty */ 4,
|
||||
+ /* ntzw */ 3,
|
||||
+ /* ntiles_dst */ 24,
|
||||
+ /* nsm */ 16,
|
||||
+ /* tiles_nwaves */ 2,
|
||||
+ /* tiles_efficiency_percent */ 75,
|
||||
+ /* stream_k_blocks */ 16,
|
||||
+ /* fixup_needed */ true);
|
||||
+
|
||||
+ const int launch_n = ggml_cuda_mmq_launch_shape_format(buf, sizeof(buf), launch);
|
||||
+
|
||||
+ require(launch_n > 0, "launch format returns byte count");
|
||||
+ require(std::strstr(buf, "moe=1") != nullptr, "launch trace includes moe flag");
|
||||
+ require(std::strstr(buf, "ncols_dst=1024") != nullptr, "launch trace includes routed assignment count");
|
||||
+ require(std::strstr(buf, "ncols_max=128") != nullptr, "launch trace includes max column count");
|
||||
+ require(std::strstr(buf, "mmq_x=64") != nullptr, "launch trace includes compiled x tile");
|
||||
+ require(std::strstr(buf, "mmq_y=128") != nullptr, "launch trace includes compiled y tile");
|
||||
+ require(std::strstr(buf, "ntx=2") != nullptr, "launch trace includes x tile count");
|
||||
+ require(std::strstr(buf, "nty=4") != nullptr, "launch trace includes y tile count");
|
||||
+ require(std::strstr(buf, "ntzw=3") != nullptr, "launch trace includes batch tile count");
|
||||
+ require(std::strstr(buf, "ntiles_dst=24") != nullptr, "launch trace includes total tile count");
|
||||
+ require(std::strstr(buf, "nsm=16") != nullptr, "launch trace includes SM count");
|
||||
+ require(std::strstr(buf, "tiles_nwaves=2") != nullptr, "launch trace includes wave count");
|
||||
+ require(std::strstr(buf, "tiles_efficiency=75") != nullptr, "launch trace includes stream-k efficiency");
|
||||
+ require(std::strstr(buf, "stream_k_blocks=16") != nullptr, "launch trace includes actual stream-k block count");
|
||||
+ require(std::strstr(buf, "fixup=1") != nullptr, "launch trace includes fixup flag");
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
@@ -1,182 +0,0 @@
|
||||
From 2a9964d290a543d14db972d8d2927ee9d2974f7e Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 05:05:17 +0000
|
||||
Subject: [PATCH] feat(cuda): trace moe small-m mmq candidates
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/mmq-shape-trace.h | 58 ++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/mmq.cuh | 27 +++++++++++++
|
||||
tests/test-cuda-mmq-shape-trace.cpp | 35 +++++++++++++++++
|
||||
3 files changed, 120 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
index 98bc21f7f..47453d91f 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
@@ -37,6 +37,18 @@ struct ggml_cuda_mmq_launch_shape {
|
||||
bool fixup_needed;
|
||||
};
|
||||
|
||||
+struct ggml_cuda_mmq_small_m_shape {
|
||||
+ bool is_moe;
|
||||
+ int64_t ncols_dst;
|
||||
+ int64_t nchannels_x;
|
||||
+ int64_t ncols_max;
|
||||
+ int64_t n_active_est;
|
||||
+ int64_t density;
|
||||
+ int mmq_x_best;
|
||||
+ bool use_stream_k;
|
||||
+ bool is_candidate;
|
||||
+};
|
||||
+
|
||||
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
|
||||
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
|
||||
@@ -64,6 +76,36 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
};
|
||||
}
|
||||
|
||||
+static inline ggml_cuda_mmq_small_m_shape ggml_cuda_mmq_small_m_shape_make(
|
||||
+ const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
|
||||
+ const int64_t ncols_max, const int mmq_x_best, const bool use_stream_k) {
|
||||
+ int64_t n_active_est = 0;
|
||||
+ int64_t density = 0;
|
||||
+ if (is_moe && ncols_dst > 0 && nchannels_x > 0) {
|
||||
+ n_active_est = ncols_dst < nchannels_x ? ncols_dst : nchannels_x;
|
||||
+ density = (ncols_dst + n_active_est - 1) / n_active_est;
|
||||
+ }
|
||||
+
|
||||
+ const bool is_candidate =
|
||||
+ is_moe &&
|
||||
+ use_stream_k &&
|
||||
+ ncols_max > 0 && ncols_max <= 128 &&
|
||||
+ density > 0 && density <= 4 &&
|
||||
+ mmq_x_best > 0 && mmq_x_best <= 64;
|
||||
+
|
||||
+ return {
|
||||
+ is_moe,
|
||||
+ ncols_dst,
|
||||
+ nchannels_x,
|
||||
+ ncols_max,
|
||||
+ n_active_est,
|
||||
+ density,
|
||||
+ mmq_x_best,
|
||||
+ use_stream_k,
|
||||
+ is_candidate,
|
||||
+ };
|
||||
+}
|
||||
+
|
||||
static inline ggml_cuda_mmq_launch_shape ggml_cuda_mmq_launch_shape_make(
|
||||
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t ncols_max,
|
||||
const int mmq_x, const int mmq_y, const int ntx, const int nty, const int ntzw,
|
||||
@@ -129,3 +171,19 @@ static inline int ggml_cuda_mmq_launch_shape_format(
|
||||
shape.stream_k_blocks,
|
||||
shape.fixup_needed ? 1 : 0);
|
||||
}
|
||||
+
|
||||
+static inline int ggml_cuda_mmq_small_m_shape_format(
|
||||
+ char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
|
||||
+ return std::snprintf(buf, size,
|
||||
+ "candidate=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld "
|
||||
+ "n_active_est=%lld density=%lld mmq_x_best=%d stream_k=%d",
|
||||
+ shape.is_candidate ? 1 : 0,
|
||||
+ shape.is_moe ? 1 : 0,
|
||||
+ (long long) shape.ncols_dst,
|
||||
+ (long long) shape.nchannels_x,
|
||||
+ (long long) shape.ncols_max,
|
||||
+ (long long) shape.n_active_est,
|
||||
+ (long long) shape.density,
|
||||
+ shape.mmq_x_best,
|
||||
+ shape.use_stream_k ? 1 : 0);
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
||||
index 34002edf7..25ead9e7b 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
||||
@@ -4007,6 +4007,24 @@ static inline bool ggml_cuda_moe_mmq_trace_take(std::atomic<int> & counter) {
|
||||
return trace_index >= 0 && trace_index < trace_limit;
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_moe_mmq_small_m_trace_limit() {
|
||||
+ static const int limit = []() -> int {
|
||||
+ const char * s = getenv("LLAMA_MOE_MMQ_SMALL_M_TRACE");
|
||||
+ if (s == nullptr || strcmp(s, "0") == 0) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ const int parsed = atoi(s);
|
||||
+ return parsed > 0 ? parsed : 256;
|
||||
+ }();
|
||||
+ return limit;
|
||||
+}
|
||||
+
|
||||
+static inline bool ggml_cuda_moe_mmq_small_m_trace_take(std::atomic<int> & counter) {
|
||||
+ const int trace_limit = ggml_cuda_moe_mmq_small_m_trace_limit();
|
||||
+ const int trace_index = trace_limit > 0 ? counter.fetch_add(1, std::memory_order_relaxed) : trace_limit;
|
||||
+ return trace_index >= 0 && trace_index < trace_limit;
|
||||
+}
|
||||
+
|
||||
template <ggml_type type, int mmq_x>
|
||||
static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
const int id = ggml_cuda_get_device();
|
||||
@@ -4294,6 +4312,15 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
|
||||
fprintf(stderr, "[LLAMA_MOE_MMQ_SHAPE] %s\n", buf);
|
||||
}
|
||||
+
|
||||
+ static std::atomic<int> small_m_trace_count{0};
|
||||
+ const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make(
|
||||
+ true, args.ncols_dst, args.nchannels_x, args.ncols_max, mmq_x_best, args.use_stream_k);
|
||||
+ if (small_m.is_candidate && ggml_cuda_moe_mmq_small_m_trace_take(small_m_trace_count)) {
|
||||
+ char buf[256];
|
||||
+ ggml_cuda_mmq_small_m_shape_format(buf, sizeof(buf), small_m);
|
||||
+ fprintf(stderr, "[LLAMA_MOE_MMQ_SMALL_M] %s\n", buf);
|
||||
+ }
|
||||
}
|
||||
|
||||
switch (mmq_x_best) {
|
||||
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
index 86ee15e02..9f36ce1a1 100644
|
||||
--- a/tests/test-cuda-mmq-shape-trace.cpp
|
||||
+++ b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
@@ -73,5 +73,40 @@ int main() {
|
||||
require(std::strstr(buf, "stream_k_blocks=16") != nullptr, "launch trace includes actual stream-k block count");
|
||||
require(std::strstr(buf, "fixup=1") != nullptr, "launch trace includes fixup flag");
|
||||
|
||||
+ const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make(
|
||||
+ /* is_moe */ true,
|
||||
+ /* ncols_dst */ 1024,
|
||||
+ /* nchannels_x */ 256,
|
||||
+ /* ncols_max */ 128,
|
||||
+ /* mmq_x_best */ 64,
|
||||
+ /* use_stream_k */ true);
|
||||
+
|
||||
+ require(small_m.is_candidate, "decode-like low-density MoE shape is a small-M candidate");
|
||||
+ require(small_m.n_active_est == 256, "small-M active estimate is capped by expert count");
|
||||
+ require(small_m.density == 4, "small-M density is ceil(assignments / active experts)");
|
||||
+
|
||||
+ require(!ggml_cuda_mmq_small_m_shape_make(
|
||||
+ /* is_moe */ false, 1024, 256, 128, 64, true).is_candidate,
|
||||
+ "dense shape is excluded");
|
||||
+ require(!ggml_cuda_mmq_small_m_shape_make(
|
||||
+ /* is_moe */ true, 4096, 256, 512, 128, true).is_candidate,
|
||||
+ "prefill-like shape is excluded");
|
||||
+ require(!ggml_cuda_mmq_small_m_shape_make(
|
||||
+ /* is_moe */ true, 4096, 256, 128, 64, true).is_candidate,
|
||||
+ "high-density shape is excluded");
|
||||
+ require(!ggml_cuda_mmq_small_m_shape_make(
|
||||
+ /* is_moe */ true, 1024, 256, 128, 128, true).is_candidate,
|
||||
+ "large selected tile is excluded");
|
||||
+ require(!ggml_cuda_mmq_small_m_shape_make(
|
||||
+ /* is_moe */ true, 1024, 256, 128, 64, false).is_candidate,
|
||||
+ "non-stream-k shape is excluded");
|
||||
+
|
||||
+ const int small_m_n = ggml_cuda_mmq_small_m_shape_format(buf, sizeof(buf), small_m);
|
||||
+
|
||||
+ require(small_m_n > 0, "small-M format returns byte count");
|
||||
+ require(std::strstr(buf, "candidate=1") != nullptr, "small-M trace includes candidate flag");
|
||||
+ require(std::strstr(buf, "density=4") != nullptr, "small-M trace includes density");
|
||||
+ require(std::strstr(buf, "mmq_x_best=64") != nullptr, "small-M trace includes selected tile");
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
@@ -1,80 +0,0 @@
|
||||
From fbed2abaa9f5af8e500f95c8dda86b305450ceff Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 05:17:39 +0000
|
||||
Subject: [PATCH] feat(cuda): gate moe small-m mmq tile policy
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/mmq-shape-trace.h | 9 +++++++++
|
||||
ggml/src/ggml-cuda/mmq.cuh | 13 +++++++++++++
|
||||
tests/test-cuda-mmq-shape-trace.cpp | 10 ++++++++++
|
||||
3 files changed, 32 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
index 47453d91f..dfb4e898a 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
@@ -187,3 +187,12 @@ static inline int ggml_cuda_mmq_small_m_shape_format(
|
||||
shape.mmq_x_best,
|
||||
shape.use_stream_k ? 1 : 0);
|
||||
}
|
||||
+
|
||||
+static inline int ggml_cuda_mmq_small_m_tile_limit(
|
||||
+ const ggml_cuda_mmq_small_m_shape & shape, const int current_limit, const int requested_tile) {
|
||||
+ if (!shape.is_candidate || requested_tile < 8 || requested_tile >= current_limit) {
|
||||
+ return current_limit;
|
||||
+ }
|
||||
+
|
||||
+ return requested_tile;
|
||||
+}
|
||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
||||
index 25ead9e7b..16b3fcca4 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
||||
@@ -4201,6 +4201,15 @@ static inline int ggml_cuda_moe_density_max() {
|
||||
return d;
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_moe_small_m_tile() {
|
||||
+ static const int t = []() -> int {
|
||||
+ const char * s = getenv("LLAMA_MOE_SMALL_M_TILE");
|
||||
+ const int v = s ? atoi(s) : 0;
|
||||
+ return v >= 8 ? v : 0;
|
||||
+ }();
|
||||
+ return t;
|
||||
+}
|
||||
+
|
||||
// [paged patch 0017 / track B] DENSE NVFP4 decode mmq_x re-read occupancy DIAGNOSTIC (env, default off).
|
||||
// GGML_CUDA_FP4_DENSE_MMQ_X=<n> caps the dense (non-MoE) NVFP4 col-tile to <n>, splitting the M=128
|
||||
// decode ubatch into ceil(128/n) col-tiles. Each col-tile re-reads the full weight set (fatal cost
|
||||
@@ -4282,6 +4291,10 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
}
|
||||
}
|
||||
}
|
||||
+
|
||||
+ const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make(
|
||||
+ true, args.ncols_dst, args.nchannels_x, args.ncols_max, mmq_x_lim, args.use_stream_k);
|
||||
+ mmq_x_lim = ggml_cuda_mmq_small_m_tile_limit(small_m, mmq_x_lim, ggml_cuda_moe_small_m_tile());
|
||||
}
|
||||
|
||||
int mmq_x_best = 0;
|
||||
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
index 9f36ce1a1..f7863f03a 100644
|
||||
--- a/tests/test-cuda-mmq-shape-trace.cpp
|
||||
+++ b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
@@ -108,5 +108,15 @@ int main() {
|
||||
require(std::strstr(buf, "density=4") != nullptr, "small-M trace includes density");
|
||||
require(std::strstr(buf, "mmq_x_best=64") != nullptr, "small-M trace includes selected tile");
|
||||
|
||||
+ require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 0) == 64,
|
||||
+ "small-M tile override is default-off");
|
||||
+ require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 16) == 16,
|
||||
+ "small-M tile override caps candidate tile limit");
|
||||
+ require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 128) == 64,
|
||||
+ "small-M tile override ignores non-smaller tiles");
|
||||
+ require(ggml_cuda_mmq_small_m_tile_limit(
|
||||
+ ggml_cuda_mmq_small_m_shape_make(/* is_moe */ true, 4096, 256, 512, 128, true), 128, 16) == 128,
|
||||
+ "small-M tile override excludes prefill-like shapes");
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
@@ -1,292 +0,0 @@
|
||||
From 6c332094ca2fbb1e3211427c5f919adcaa89c588 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 05:32:27 +0000
|
||||
Subject: [PATCH] feat(cuda): trace moe mmid routes
|
||||
|
||||
Add a default-off LLAMA_MOE_MMID_ROUTE_TRACE diagnostic for MUL_MAT_ID dispatch routes.
|
||||
|
||||
The trace reports whether a call uses MMVQ, MMVF, grouped MMQ, MMF, or host-sync fallback while preserving the existing dispatch predicates.
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 34 +++++++++--
|
||||
ggml/src/ggml-cuda/mmq-shape-trace.h | 88 ++++++++++++++++++++++++++++
|
||||
tests/test-cuda-mmq-shape-trace.cpp | 82 ++++++++++++++++++++++++++
|
||||
3 files changed, 200 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 374949f25..a1754df39 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2685,6 +2685,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
}
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_moe_mmid_route_trace_limit() {
|
||||
+ static const int value = []() {
|
||||
+ const char * s = getenv("LLAMA_MOE_MMID_ROUTE_TRACE");
|
||||
+ return s ? atoi(s) : 0;
|
||||
+ }();
|
||||
+
|
||||
+ return value;
|
||||
+}
|
||||
+
|
||||
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
@@ -2697,13 +2706,30 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
+ const bool src0_quantized = ggml_is_quantized(src0->type);
|
||||
+ const int mmvq_mmid_max = src0_quantized ? get_mmvq_mmid_max_batch(src0->type, cc) : MMVQ_MAX_BATCH_SIZE;
|
||||
+ const bool use_mmq = ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02);
|
||||
+ const bool use_mmf = ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true);
|
||||
+
|
||||
+ const int mmid_trace_limit = ggml_cuda_moe_mmid_route_trace_limit();
|
||||
+ if (mmid_trace_limit > 0) {
|
||||
+ static std::atomic<int> trace_count{0};
|
||||
+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
|
||||
+ if (trace_idx < mmid_trace_limit) {
|
||||
+ const ggml_cuda_mmid_route_shape route_shape = ggml_cuda_mmid_route_shape_make(
|
||||
+ src0->type, ne2, ne12, /*n_experts=*/ne02, mmvq_mmid_max, use_mmq, use_mmf,
|
||||
+ GGML_CUDA_CC_IS_AMD(cc), src0_quantized);
|
||||
+ char buf[256];
|
||||
+ ggml_cuda_mmid_route_shape_format(buf, sizeof(buf), route_shape);
|
||||
+ fprintf(stderr, "[LLAMA_MOE_MMID_ROUTE] %s\n", buf);
|
||||
+ }
|
||||
+ }
|
||||
|
||||
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
|
||||
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
|
||||
if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
|
||||
- if (ggml_is_quantized(src0->type)) {
|
||||
- const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
|
||||
+ if (src0_quantized) {
|
||||
if (ne2 <= mmvq_mmid_max) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
@@ -2716,12 +2742,12 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
}
|
||||
}
|
||||
|
||||
- if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
|
||||
+ if (use_mmq) {
|
||||
ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
- if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
|
||||
+ if (use_mmf) {
|
||||
ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
index dfb4e898a..da234a302 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
@@ -49,6 +49,40 @@ struct ggml_cuda_mmq_small_m_shape {
|
||||
bool is_candidate;
|
||||
};
|
||||
|
||||
+enum ggml_cuda_mmid_route {
|
||||
+ GGML_CUDA_MMID_ROUTE_MMVQ,
|
||||
+ GGML_CUDA_MMID_ROUTE_MMVF,
|
||||
+ GGML_CUDA_MMID_ROUTE_MMQ,
|
||||
+ GGML_CUDA_MMID_ROUTE_MMF,
|
||||
+ GGML_CUDA_MMID_ROUTE_FALLBACK,
|
||||
+};
|
||||
+
|
||||
+struct ggml_cuda_mmid_route_shape {
|
||||
+ ggml_cuda_mmid_route route;
|
||||
+ int type;
|
||||
+ int64_t ne2;
|
||||
+ int64_t ne12;
|
||||
+ int64_t n_experts;
|
||||
+ int mmvq_max;
|
||||
+ bool use_mmq;
|
||||
+ bool use_mmf;
|
||||
+ bool is_amd;
|
||||
+ bool is_quantized;
|
||||
+ bool host_sync;
|
||||
+};
|
||||
+
|
||||
+static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
|
||||
+ switch (route) {
|
||||
+ case GGML_CUDA_MMID_ROUTE_MMVQ: return "mmvq";
|
||||
+ case GGML_CUDA_MMID_ROUTE_MMVF: return "mmvf";
|
||||
+ case GGML_CUDA_MMID_ROUTE_MMQ: return "mmq";
|
||||
+ case GGML_CUDA_MMID_ROUTE_MMF: return "mmf";
|
||||
+ case GGML_CUDA_MMID_ROUTE_FALLBACK: return "fallback";
|
||||
+ }
|
||||
+
|
||||
+ return "unknown";
|
||||
+}
|
||||
+
|
||||
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
|
||||
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
|
||||
@@ -76,6 +110,42 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
};
|
||||
}
|
||||
|
||||
+static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make(
|
||||
+ const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts,
|
||||
+ const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd,
|
||||
+ const bool is_quantized) {
|
||||
+ ggml_cuda_mmid_route route = GGML_CUDA_MMID_ROUTE_FALLBACK;
|
||||
+ if (ne2 <= mmvq_max) {
|
||||
+ if (is_quantized) {
|
||||
+ route = GGML_CUDA_MMID_ROUTE_MMVQ;
|
||||
+ } else if (is_amd) {
|
||||
+ route = GGML_CUDA_MMID_ROUTE_MMVF;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (route == GGML_CUDA_MMID_ROUTE_FALLBACK) {
|
||||
+ if (use_mmq) {
|
||||
+ route = GGML_CUDA_MMID_ROUTE_MMQ;
|
||||
+ } else if (use_mmf) {
|
||||
+ route = GGML_CUDA_MMID_ROUTE_MMF;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return {
|
||||
+ route,
|
||||
+ type,
|
||||
+ ne2,
|
||||
+ ne12,
|
||||
+ n_experts,
|
||||
+ mmvq_max,
|
||||
+ use_mmq,
|
||||
+ use_mmf,
|
||||
+ is_amd,
|
||||
+ is_quantized,
|
||||
+ route == GGML_CUDA_MMID_ROUTE_FALLBACK,
|
||||
+ };
|
||||
+}
|
||||
+
|
||||
static inline ggml_cuda_mmq_small_m_shape ggml_cuda_mmq_small_m_shape_make(
|
||||
const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
|
||||
const int64_t ncols_max, const int mmq_x_best, const bool use_stream_k) {
|
||||
@@ -172,6 +242,24 @@ static inline int ggml_cuda_mmq_launch_shape_format(
|
||||
shape.fixup_needed ? 1 : 0);
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_mmid_route_shape_format(
|
||||
+ char * buf, const size_t size, const ggml_cuda_mmid_route_shape & shape) {
|
||||
+ return std::snprintf(buf, size,
|
||||
+ "route=%s type=%d host_sync=%d ne2=%lld ne12=%lld n_experts=%lld "
|
||||
+ "mmvq_max=%d use_mmq=%d use_mmf=%d is_amd=%d is_quantized=%d",
|
||||
+ ggml_cuda_mmid_route_name(shape.route),
|
||||
+ shape.type,
|
||||
+ shape.host_sync ? 1 : 0,
|
||||
+ (long long) shape.ne2,
|
||||
+ (long long) shape.ne12,
|
||||
+ (long long) shape.n_experts,
|
||||
+ shape.mmvq_max,
|
||||
+ shape.use_mmq ? 1 : 0,
|
||||
+ shape.use_mmf ? 1 : 0,
|
||||
+ shape.is_amd ? 1 : 0,
|
||||
+ shape.is_quantized ? 1 : 0);
|
||||
+}
|
||||
+
|
||||
static inline int ggml_cuda_mmq_small_m_shape_format(
|
||||
char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
|
||||
return std::snprintf(buf, size,
|
||||
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
index f7863f03a..e190cf1ac 100644
|
||||
--- a/tests/test-cuda-mmq-shape-trace.cpp
|
||||
+++ b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
@@ -118,5 +118,87 @@ int main() {
|
||||
ggml_cuda_mmq_small_m_shape_make(/* is_moe */ true, 4096, 256, 512, 128, true), 128, 16) == 128,
|
||||
"small-M tile override excludes prefill-like shapes");
|
||||
|
||||
+ const ggml_cuda_mmid_route_shape mmvq = ggml_cuda_mmid_route_shape_make(
|
||||
+ /* type */ 39,
|
||||
+ /* ne2 */ 1,
|
||||
+ /* ne12 */ 1,
|
||||
+ /* ne02 */ 256,
|
||||
+ /* mmvq_max */ 4,
|
||||
+ /* use_mmq */ true,
|
||||
+ /* use_mmf */ true,
|
||||
+ /* is_amd */ false,
|
||||
+ /* is_quantized */ true);
|
||||
+
|
||||
+ require(mmvq.route == GGML_CUDA_MMID_ROUTE_MMVQ, "MMVQ wins when within its batch cap");
|
||||
+ require(!mmvq.host_sync, "MMVQ route is graph-safe");
|
||||
+
|
||||
+ const ggml_cuda_mmid_route_shape mmq = ggml_cuda_mmid_route_shape_make(
|
||||
+ /* type */ 39,
|
||||
+ /* ne2 */ 128,
|
||||
+ /* ne12 */ 128,
|
||||
+ /* ne02 */ 256,
|
||||
+ /* mmvq_max */ 4,
|
||||
+ /* use_mmq */ true,
|
||||
+ /* use_mmf */ true,
|
||||
+ /* is_amd */ false,
|
||||
+ /* is_quantized */ true);
|
||||
+
|
||||
+ require(mmq.route == GGML_CUDA_MMID_ROUTE_MMQ, "grouped MMQ wins after MMVQ cap");
|
||||
+ require(!mmq.host_sync, "grouped MMQ route is graph-safe");
|
||||
+
|
||||
+ const ggml_cuda_mmid_route_shape mmf = ggml_cuda_mmid_route_shape_make(
|
||||
+ /* type */ 1,
|
||||
+ /* ne2 */ 128,
|
||||
+ /* ne12 */ 128,
|
||||
+ /* ne02 */ 256,
|
||||
+ /* mmvq_max */ 4,
|
||||
+ /* use_mmq */ false,
|
||||
+ /* use_mmf */ true,
|
||||
+ /* is_amd */ false,
|
||||
+ /* is_quantized */ false);
|
||||
+
|
||||
+ require(mmf.route == GGML_CUDA_MMID_ROUTE_MMF, "MMF wins when grouped MMQ is unavailable");
|
||||
+ require(!mmf.host_sync, "MMF route is graph-safe");
|
||||
+
|
||||
+ const ggml_cuda_mmid_route_shape fallback = ggml_cuda_mmid_route_shape_make(
|
||||
+ /* type */ 0,
|
||||
+ /* ne2 */ 128,
|
||||
+ /* ne12 */ 128,
|
||||
+ /* ne02 */ 256,
|
||||
+ /* mmvq_max */ 4,
|
||||
+ /* use_mmq */ false,
|
||||
+ /* use_mmf */ false,
|
||||
+ /* is_amd */ false,
|
||||
+ /* is_quantized */ false);
|
||||
+
|
||||
+ require(fallback.route == GGML_CUDA_MMID_ROUTE_FALLBACK, "fallback is used when device routes do not match");
|
||||
+ require(fallback.host_sync, "fallback route requires host synchronization");
|
||||
+
|
||||
+ const ggml_cuda_mmid_route_shape amd_mmvf = ggml_cuda_mmid_route_shape_make(
|
||||
+ /* type */ 1,
|
||||
+ /* ne2 */ 1,
|
||||
+ /* ne12 */ 1,
|
||||
+ /* ne02 */ 256,
|
||||
+ /* mmvq_max */ 4,
|
||||
+ /* use_mmq */ false,
|
||||
+ /* use_mmf */ false,
|
||||
+ /* is_amd */ true,
|
||||
+ /* is_quantized */ false);
|
||||
+
|
||||
+ require(amd_mmvf.route == GGML_CUDA_MMID_ROUTE_MMVF, "AMD float vector route wins within MMVQ cap");
|
||||
+ require(!amd_mmvf.host_sync, "AMD float vector route is graph-safe");
|
||||
+
|
||||
+ const int route_n = ggml_cuda_mmid_route_shape_format(buf, sizeof(buf), mmq);
|
||||
+
|
||||
+ require(route_n > 0, "MMID route format returns byte count");
|
||||
+ require(std::strstr(buf, "route=mmq") != nullptr, "MMID trace includes route name");
|
||||
+ require(std::strstr(buf, "host_sync=0") != nullptr, "MMID trace includes host sync flag");
|
||||
+ require(std::strstr(buf, "ne2=128") != nullptr, "MMID trace includes destination batch");
|
||||
+ require(std::strstr(buf, "ne12=128") != nullptr, "MMID trace includes routed token count");
|
||||
+ require(std::strstr(buf, "n_experts=256") != nullptr, "MMID trace includes expert count");
|
||||
+ require(std::strstr(buf, "mmvq_max=4") != nullptr, "MMID trace includes MMVQ cap");
|
||||
+ require(std::strstr(buf, "use_mmq=1") != nullptr, "MMID trace includes MMQ predicate");
|
||||
+ require(std::strstr(buf, "use_mmf=1") != nullptr, "MMID trace includes MMF predicate");
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -1,345 +0,0 @@
|
||||
From 486c28c63d5297afd06e5a2bdbd4fb89cad749cd Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 05:49:12 +0000
|
||||
Subject: [PATCH] feat(cuda): trace mul mat routes
|
||||
|
||||
Add a default-off LLAMA_MUL_MAT_ROUTE_TRACE diagnostic for regular MUL_MAT dispatch routes.
|
||||
|
||||
The trace classifies projection-heavy calls without changing dispatch behavior.
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 44 +++++++++-
|
||||
ggml/src/ggml-cuda/mmq-shape-trace.h | 117 +++++++++++++++++++++++++++
|
||||
tests/test-cuda-mmq-shape-trace.cpp | 85 +++++++++++++++++++
|
||||
3 files changed, 244 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index a1754df39..cd34aff13 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2580,6 +2580,32 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
|
||||
return use_mul_mat_vec_q;
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_mul_mat_route_trace_limit() {
|
||||
+ static const int value = []() {
|
||||
+ const char * s = getenv("LLAMA_MUL_MAT_ROUTE_TRACE");
|
||||
+ return s ? atoi(s) : 0;
|
||||
+ }();
|
||||
+
|
||||
+ return value;
|
||||
+}
|
||||
+
|
||||
+static inline void ggml_cuda_mul_mat_route_trace(const ggml_cuda_mul_mat_route_shape & shape) {
|
||||
+ const int trace_limit = ggml_cuda_mul_mat_route_trace_limit();
|
||||
+ if (trace_limit <= 0) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ static std::atomic<int> trace_count{0};
|
||||
+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
|
||||
+ if (trace_idx >= trace_limit) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ char buf[256];
|
||||
+ ggml_cuda_mul_mat_route_shape_format(buf, sizeof(buf), shape);
|
||||
+ fprintf(stderr, "[LLAMA_MUL_MAT_ROUTE] %s\n", buf);
|
||||
+}
|
||||
+
|
||||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
|
||||
|
||||
@@ -2591,6 +2617,10 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
if (!split) {
|
||||
const int cc_fp4 = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
if (ggml_cuda_fp4_prefill_should_engage(src0, src1, dst, cc_fp4)) {
|
||||
+ ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make(
|
||||
+ src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split,
|
||||
+ /*use_vec_f=*/false, /*use_mat_f=*/false, /*use_vec_q=*/false, /*use_mmq=*/false,
|
||||
+ /*use_batched_cublas=*/false, /*use_fp4_prefill=*/true, /*use_fwht=*/false));
|
||||
ggml_cuda_mul_mat_fp4_large_m(ctx, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
@@ -2654,12 +2684,23 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
|
||||
bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
|
||||
bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
|
||||
+ bool use_batched_cublas = !split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
|
||||
+ && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1;
|
||||
|
||||
const int32_t hint = ggml_get_op_params_i32(dst, 1);
|
||||
if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) {
|
||||
+ ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make(
|
||||
+ src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split,
|
||||
+ use_mul_mat_vec_f, use_mul_mat_f, use_mul_mat_vec_q, use_mul_mat_q,
|
||||
+ use_batched_cublas, /*use_fp4_prefill=*/false, /*use_fwht=*/true));
|
||||
return;
|
||||
}
|
||||
|
||||
+ ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make(
|
||||
+ src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split,
|
||||
+ use_mul_mat_vec_f, use_mul_mat_f, use_mul_mat_vec_q, use_mul_mat_q,
|
||||
+ use_batched_cublas, /*use_fp4_prefill=*/false, /*use_fwht=*/false));
|
||||
+
|
||||
if (!split && use_mul_mat_vec_f) {
|
||||
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
|
||||
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
|
||||
@@ -2670,8 +2711,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
|
||||
} else if (!split && use_mul_mat_q) {
|
||||
ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
|
||||
- } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
|
||||
- && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
+ } else if (use_batched_cublas) {
|
||||
// general KQ + KQV multi-batch without FlashAttention
|
||||
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
||||
} else if (use_mul_mat_vec_f) {
|
||||
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
index da234a302..8ac373fd9 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
@@ -71,6 +71,37 @@ struct ggml_cuda_mmid_route_shape {
|
||||
bool host_sync;
|
||||
};
|
||||
|
||||
+enum ggml_cuda_mul_mat_route {
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_FWHT,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_VEC_F,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_MAT_F,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_VEC_Q,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_MMQ,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ,
|
||||
+ GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS,
|
||||
+};
|
||||
+
|
||||
+struct ggml_cuda_mul_mat_route_shape {
|
||||
+ ggml_cuda_mul_mat_route route;
|
||||
+ int type;
|
||||
+ int64_t ne1;
|
||||
+ int64_t ne11;
|
||||
+ int64_t ne12;
|
||||
+ int64_t ne13;
|
||||
+ bool split;
|
||||
+ bool use_vec_f;
|
||||
+ bool use_mat_f;
|
||||
+ bool use_vec_q;
|
||||
+ bool use_mmq;
|
||||
+ bool use_batched_cublas;
|
||||
+ bool use_fp4_prefill;
|
||||
+ bool use_fwht;
|
||||
+};
|
||||
+
|
||||
static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
|
||||
switch (route) {
|
||||
case GGML_CUDA_MMID_ROUTE_MMVQ: return "mmvq";
|
||||
@@ -83,6 +114,24 @@ static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
+static inline const char * ggml_cuda_mul_mat_route_name(const ggml_cuda_mul_mat_route route) {
|
||||
+ switch (route) {
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL: return "fp4_prefill";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_FWHT: return "fwht";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_VEC_F: return "vec_f";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_MAT_F: return "mat_f";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_VEC_Q: return "vec_q";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_MMQ: return "mmq";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS: return "batched_cublas";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F: return "op_vec_f";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q: return "op_vec_q";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ: return "op_mmq";
|
||||
+ case GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS: return "op_cublas";
|
||||
+ }
|
||||
+
|
||||
+ return "unknown";
|
||||
+}
|
||||
+
|
||||
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
|
||||
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
|
||||
@@ -110,6 +159,52 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
};
|
||||
}
|
||||
|
||||
+static inline ggml_cuda_mul_mat_route_shape ggml_cuda_mul_mat_route_shape_make(
|
||||
+ const int type, const int64_t ne1, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
||||
+ const bool split, const bool use_vec_f, const bool use_mat_f, const bool use_vec_q,
|
||||
+ const bool use_mmq, const bool use_batched_cublas, const bool use_fp4_prefill,
|
||||
+ const bool use_fwht) {
|
||||
+ ggml_cuda_mul_mat_route route = GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS;
|
||||
+ if (use_fp4_prefill) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL;
|
||||
+ } else if (use_fwht) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_FWHT;
|
||||
+ } else if (!split && use_vec_f) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_VEC_F;
|
||||
+ } else if (!split && use_mat_f) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_MAT_F;
|
||||
+ } else if (!split && use_vec_q) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_VEC_Q;
|
||||
+ } else if (!split && use_mmq) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_MMQ;
|
||||
+ } else if (!split && use_batched_cublas) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS;
|
||||
+ } else if (use_vec_f) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F;
|
||||
+ } else if (use_vec_q) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q;
|
||||
+ } else if (use_mmq) {
|
||||
+ route = GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ;
|
||||
+ }
|
||||
+
|
||||
+ return {
|
||||
+ route,
|
||||
+ type,
|
||||
+ ne1,
|
||||
+ ne11,
|
||||
+ ne12,
|
||||
+ ne13,
|
||||
+ split,
|
||||
+ use_vec_f,
|
||||
+ use_mat_f,
|
||||
+ use_vec_q,
|
||||
+ use_mmq,
|
||||
+ use_batched_cublas,
|
||||
+ use_fp4_prefill,
|
||||
+ use_fwht,
|
||||
+ };
|
||||
+}
|
||||
+
|
||||
static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make(
|
||||
const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts,
|
||||
const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd,
|
||||
@@ -260,6 +355,28 @@ static inline int ggml_cuda_mmid_route_shape_format(
|
||||
shape.is_quantized ? 1 : 0);
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_mul_mat_route_shape_format(
|
||||
+ char * buf, const size_t size, const ggml_cuda_mul_mat_route_shape & shape) {
|
||||
+ return std::snprintf(buf, size,
|
||||
+ "route=%s type=%d ne1=%lld ne11=%lld ne12=%lld ne13=%lld split=%d "
|
||||
+ "use_vec_f=%d use_mat_f=%d use_vec_q=%d use_mmq=%d use_batched_cublas=%d "
|
||||
+ "use_fp4_prefill=%d use_fwht=%d",
|
||||
+ ggml_cuda_mul_mat_route_name(shape.route),
|
||||
+ shape.type,
|
||||
+ (long long) shape.ne1,
|
||||
+ (long long) shape.ne11,
|
||||
+ (long long) shape.ne12,
|
||||
+ (long long) shape.ne13,
|
||||
+ shape.split ? 1 : 0,
|
||||
+ shape.use_vec_f ? 1 : 0,
|
||||
+ shape.use_mat_f ? 1 : 0,
|
||||
+ shape.use_vec_q ? 1 : 0,
|
||||
+ shape.use_mmq ? 1 : 0,
|
||||
+ shape.use_batched_cublas ? 1 : 0,
|
||||
+ shape.use_fp4_prefill ? 1 : 0,
|
||||
+ shape.use_fwht ? 1 : 0);
|
||||
+}
|
||||
+
|
||||
static inline int ggml_cuda_mmq_small_m_shape_format(
|
||||
char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
|
||||
return std::snprintf(buf, size,
|
||||
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
index e190cf1ac..2bd41d1d8 100644
|
||||
--- a/tests/test-cuda-mmq-shape-trace.cpp
|
||||
+++ b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
@@ -200,5 +200,90 @@ int main() {
|
||||
require(std::strstr(buf, "use_mmq=1") != nullptr, "MMID trace includes MMQ predicate");
|
||||
require(std::strstr(buf, "use_mmf=1") != nullptr, "MMID trace includes MMF predicate");
|
||||
|
||||
+ const ggml_cuda_mul_mat_route_shape mat_f = ggml_cuda_mul_mat_route_shape_make(
|
||||
+ /* type */ 30,
|
||||
+ /* ne1 */ 128,
|
||||
+ /* ne11 */ 128,
|
||||
+ /* ne12 */ 1,
|
||||
+ /* ne13 */ 1,
|
||||
+ /* split */ false,
|
||||
+ /* use_vec_f */ false,
|
||||
+ /* use_mat_f */ true,
|
||||
+ /* use_vec_q */ false,
|
||||
+ /* use_mmq */ false,
|
||||
+ /* use_batched_cublas */ false,
|
||||
+ /* use_fp4_prefill */ false,
|
||||
+ /* use_fwht */ false);
|
||||
+
|
||||
+ require(mat_f.route == GGML_CUDA_MUL_MAT_ROUTE_MAT_F, "regular MUL_MAT prefers direct mat_f when available");
|
||||
+ require(!mat_f.split, "regular MUL_MAT trace records split flag");
|
||||
+
|
||||
+ const ggml_cuda_mul_mat_route_shape batched = ggml_cuda_mul_mat_route_shape_make(
|
||||
+ /* type */ 31,
|
||||
+ /* ne1 */ 128,
|
||||
+ /* ne11 */ 128,
|
||||
+ /* ne12 */ 4,
|
||||
+ /* ne13 */ 1,
|
||||
+ /* split */ false,
|
||||
+ /* use_vec_f */ false,
|
||||
+ /* use_mat_f */ false,
|
||||
+ /* use_vec_q */ false,
|
||||
+ /* use_mmq */ false,
|
||||
+ /* use_batched_cublas */ true,
|
||||
+ /* use_fp4_prefill */ false,
|
||||
+ /* use_fwht */ false);
|
||||
+
|
||||
+ require(batched.route == GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS,
|
||||
+ "regular MUL_MAT records batched cuBLAS route");
|
||||
+
|
||||
+ const ggml_cuda_mul_mat_route_shape op_cublas = ggml_cuda_mul_mat_route_shape_make(
|
||||
+ /* type */ 0,
|
||||
+ /* ne1 */ 16,
|
||||
+ /* ne11 */ 16,
|
||||
+ /* ne12 */ 1,
|
||||
+ /* ne13 */ 1,
|
||||
+ /* split */ true,
|
||||
+ /* use_vec_f */ false,
|
||||
+ /* use_mat_f */ false,
|
||||
+ /* use_vec_q */ false,
|
||||
+ /* use_mmq */ false,
|
||||
+ /* use_batched_cublas */ false,
|
||||
+ /* use_fp4_prefill */ false,
|
||||
+ /* use_fwht */ false);
|
||||
+
|
||||
+ require(op_cublas.route == GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS,
|
||||
+ "regular MUL_MAT fallback records op cublas route");
|
||||
+
|
||||
+ const ggml_cuda_mul_mat_route_shape fp4_prefill = ggml_cuda_mul_mat_route_shape_make(
|
||||
+ /* type */ 39,
|
||||
+ /* ne1 */ 512,
|
||||
+ /* ne11 */ 512,
|
||||
+ /* ne12 */ 1,
|
||||
+ /* ne13 */ 1,
|
||||
+ /* split */ false,
|
||||
+ /* use_vec_f */ false,
|
||||
+ /* use_mat_f */ false,
|
||||
+ /* use_vec_q */ false,
|
||||
+ /* use_mmq */ true,
|
||||
+ /* use_batched_cublas */ false,
|
||||
+ /* use_fp4_prefill */ true,
|
||||
+ /* use_fwht */ false);
|
||||
+
|
||||
+ require(fp4_prefill.route == GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL,
|
||||
+ "regular MUL_MAT records native FP4 prefill route before MMQ");
|
||||
+
|
||||
+ const int mul_mat_route_n = ggml_cuda_mul_mat_route_shape_format(buf, sizeof(buf), mat_f);
|
||||
+
|
||||
+ require(mul_mat_route_n > 0, "regular MUL_MAT route format returns byte count");
|
||||
+ require(std::strstr(buf, "route=mat_f") != nullptr, "regular MUL_MAT trace includes route name");
|
||||
+ require(std::strstr(buf, "type=30") != nullptr, "regular MUL_MAT trace includes type");
|
||||
+ require(std::strstr(buf, "ne1=128") != nullptr, "regular MUL_MAT trace includes output columns");
|
||||
+ require(std::strstr(buf, "ne11=128") != nullptr, "regular MUL_MAT trace includes src1 columns");
|
||||
+ require(std::strstr(buf, "ne12=1") != nullptr, "regular MUL_MAT trace includes src1 batch dim");
|
||||
+ require(std::strstr(buf, "split=0") != nullptr, "regular MUL_MAT trace includes split flag");
|
||||
+ require(std::strstr(buf, "use_mat_f=1") != nullptr, "regular MUL_MAT trace includes mat_f predicate");
|
||||
+ require(std::strstr(buf, "use_batched_cublas=0") != nullptr,
|
||||
+ "regular MUL_MAT trace includes batched cuBLAS predicate");
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -1,332 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 06:20:31 +0000
|
||||
Subject: [PATCH] feat(cuda): trace cublas routes
|
||||
|
||||
Add a default-off LLAMA_CUBLAS_ROUTE_TRACE diagnostic around the generic cuBLAS MUL_MAT path.
|
||||
|
||||
The trace classifies NVFP4/BF16/FP16/SGEMM subroutes without changing branch behavior, and extends the route helper test coverage.
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 53 +++++++++++--
|
||||
ggml/src/ggml-cuda/mmq-shape-trace.h | 108 +++++++++++++++++++++++++++
|
||||
tests/test-cuda-mmq-shape-trace.cpp | 62 +++++++++++++++
|
||||
3 files changed, 216 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index cd34aff13..eff197818 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -1627,6 +1627,32 @@ static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type
|
||||
return compute_type;
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_cublas_route_trace_limit() {
|
||||
+ static const int value = []() {
|
||||
+ const char * s = getenv("LLAMA_CUBLAS_ROUTE_TRACE");
|
||||
+ return s ? atoi(s) : 0;
|
||||
+ }();
|
||||
+
|
||||
+ return value;
|
||||
+}
|
||||
+
|
||||
+static inline void ggml_cuda_cublas_route_trace(const ggml_cuda_cublas_route_shape & shape) {
|
||||
+ const int trace_limit = ggml_cuda_cublas_route_trace_limit();
|
||||
+ if (trace_limit <= 0) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ static std::atomic<int> trace_count{0};
|
||||
+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
|
||||
+ if (trace_idx >= trace_limit) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ char buf[320];
|
||||
+ ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), shape);
|
||||
+ fprintf(stderr, "[LLAMA_CUBLAS_ROUTE] %s\n", buf);
|
||||
+}
|
||||
+
|
||||
static void ggml_cuda_op_mul_mat_cublas(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
@@ -1662,7 +1688,22 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
row_diff == src0->ne[1] &&
|
||||
dst->op_params[0] == GGML_PREC_DEFAULT;
|
||||
|
||||
- if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
||||
+ const bool src0_contig = ggml_is_contiguous(src0);
|
||||
+ const bool full_rows = row_diff == src0->ne[1];
|
||||
+ const bool fast_fp16 = fast_fp16_hardware_available(cc);
|
||||
+ bool force_fp32 = false;
|
||||
+ bool force_fp16 = false;
|
||||
+ if (fast_fp16 && use_fp16) {
|
||||
+ const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
|
||||
+ force_fp32 = force_compute_type.fp32;
|
||||
+ force_fp16 = force_compute_type.fp16;
|
||||
+ }
|
||||
+ ggml_cuda_cublas_route_trace(ggml_cuda_cublas_route_shape_make(
|
||||
+ src0->type, src1->type, row_diff, src1_ncols, ne00, ne10, ldc,
|
||||
+ supports_bf16, use_fp16, fast_fp16, force_fp32, force_fp16, src0_contig, full_rows,
|
||||
+ GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA));
|
||||
+
|
||||
+ if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && src0_contig && full_rows) {
|
||||
// Paged prefill lever (patch 0033): NVFP4 only reaches cuBLAS when
|
||||
// ggml_cuda_should_use_mmq() returned false (large-M dense prefill).
|
||||
// Dequant the FP4 weights to a TRANSIENT bf16 pool buffer and run a
|
||||
@@ -1702,7 +1743,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
|
||||
to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||
- } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
||||
+ } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && src0_contig && full_rows) {
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
|
||||
if (src1->type != GGML_TYPE_BF16) {
|
||||
const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
|
||||
@@ -1730,7 +1771,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
|
||||
to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||
- } else if (fast_fp16_hardware_available(cc) && use_fp16) {
|
||||
+ } else if (fast_fp16 && use_fp16) {
|
||||
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
||||
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
|
||||
if (src0->type != GGML_TYPE_F16) {
|
||||
@@ -1754,12 +1795,10 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
||||
|
||||
- const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
|
||||
-
|
||||
- if (!force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
|
||||
+ if (!force_fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
|
||||
|| GGML_CUDA_CC_IS_RDNA4(cc)
|
||||
|| cc == GGML_CUDA_CC_VOLTA
|
||||
- || force_compute_type.fp32))
|
||||
+ || force_fp32))
|
||||
{
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
index 8ac373fd9..f5b4ecf2c 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
@@ -85,6 +85,14 @@ enum ggml_cuda_mul_mat_route {
|
||||
GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS,
|
||||
};
|
||||
|
||||
+enum ggml_cuda_cublas_route {
|
||||
+ GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC,
|
||||
+ GGML_CUDA_CUBLAS_ROUTE_BF16_TC,
|
||||
+ GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F,
|
||||
+ GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F,
|
||||
+ GGML_CUDA_CUBLAS_ROUTE_SGEMM,
|
||||
+};
|
||||
+
|
||||
struct ggml_cuda_mul_mat_route_shape {
|
||||
ggml_cuda_mul_mat_route route;
|
||||
int type;
|
||||
@@ -102,6 +110,27 @@ struct ggml_cuda_mul_mat_route_shape {
|
||||
bool use_fwht;
|
||||
};
|
||||
|
||||
+struct ggml_cuda_cublas_route_shape {
|
||||
+ ggml_cuda_cublas_route route;
|
||||
+ int type;
|
||||
+ int src1_type;
|
||||
+ int64_t row_diff;
|
||||
+ int64_t src1_ncols;
|
||||
+ int64_t ne00;
|
||||
+ int64_t ne10;
|
||||
+ int64_t ldc;
|
||||
+ bool supports_bf16;
|
||||
+ bool use_fp16;
|
||||
+ bool fast_fp16;
|
||||
+ bool force_fp32;
|
||||
+ bool force_fp16;
|
||||
+ bool src0_contig;
|
||||
+ bool full_rows;
|
||||
+ bool is_cdna;
|
||||
+ bool is_rdna4;
|
||||
+ bool is_volta;
|
||||
+};
|
||||
+
|
||||
static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
|
||||
switch (route) {
|
||||
case GGML_CUDA_MMID_ROUTE_MMVQ: return "mmvq";
|
||||
@@ -132,6 +161,18 @@ static inline const char * ggml_cuda_mul_mat_route_name(const ggml_cuda_mul_mat_
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
+static inline const char * ggml_cuda_cublas_route_name(const ggml_cuda_cublas_route route) {
|
||||
+ switch (route) {
|
||||
+ case GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC: return "nvfp4_bf16_tc";
|
||||
+ case GGML_CUDA_CUBLAS_ROUTE_BF16_TC: return "bf16_tc";
|
||||
+ case GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F: return "f16_tc_32f";
|
||||
+ case GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F: return "f16_tc_16f";
|
||||
+ case GGML_CUDA_CUBLAS_ROUTE_SGEMM: return "sgemm";
|
||||
+ }
|
||||
+
|
||||
+ return "unknown";
|
||||
+}
|
||||
+
|
||||
static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
|
||||
const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
|
||||
const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
|
||||
@@ -205,6 +246,47 @@ static inline ggml_cuda_mul_mat_route_shape ggml_cuda_mul_mat_route_shape_make(
|
||||
};
|
||||
}
|
||||
|
||||
+static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make(
|
||||
+ const int type, const int src1_type, const int64_t row_diff, const int64_t src1_ncols,
|
||||
+ const int64_t ne00, const int64_t ne10, const int64_t ldc, const bool supports_bf16,
|
||||
+ const bool use_fp16, const bool fast_fp16, const bool force_fp32, const bool force_fp16,
|
||||
+ const bool src0_contig, const bool full_rows, const bool is_cdna, const bool is_rdna4,
|
||||
+ const bool is_volta) {
|
||||
+ ggml_cuda_cublas_route route = GGML_CUDA_CUBLAS_ROUTE_SGEMM;
|
||||
+ if (supports_bf16 && type == 40 && src0_contig && full_rows) {
|
||||
+ route = GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC;
|
||||
+ } else if (supports_bf16 && type == 30 && src0_contig && full_rows) {
|
||||
+ route = GGML_CUDA_CUBLAS_ROUTE_BF16_TC;
|
||||
+ } else if (fast_fp16 && use_fp16) {
|
||||
+ if (!force_fp16 && (is_cdna || is_rdna4 || is_volta || force_fp32)) {
|
||||
+ route = GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F;
|
||||
+ } else {
|
||||
+ route = GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return {
|
||||
+ route,
|
||||
+ type,
|
||||
+ src1_type,
|
||||
+ row_diff,
|
||||
+ src1_ncols,
|
||||
+ ne00,
|
||||
+ ne10,
|
||||
+ ldc,
|
||||
+ supports_bf16,
|
||||
+ use_fp16,
|
||||
+ fast_fp16,
|
||||
+ force_fp32,
|
||||
+ force_fp16,
|
||||
+ src0_contig,
|
||||
+ full_rows,
|
||||
+ is_cdna,
|
||||
+ is_rdna4,
|
||||
+ is_volta,
|
||||
+ };
|
||||
+}
|
||||
+
|
||||
static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make(
|
||||
const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts,
|
||||
const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd,
|
||||
@@ -377,6 +459,32 @@ static inline int ggml_cuda_mul_mat_route_shape_format(
|
||||
shape.use_fwht ? 1 : 0);
|
||||
}
|
||||
|
||||
+static inline int ggml_cuda_cublas_route_shape_format(
|
||||
+ char * buf, const size_t size, const ggml_cuda_cublas_route_shape & shape) {
|
||||
+ return std::snprintf(buf, size,
|
||||
+ "route=%s type=%d src1_type=%d row_diff=%lld src1_ncols=%lld ne00=%lld ne10=%lld ldc=%lld "
|
||||
+ "supports_bf16=%d use_fp16=%d fast_fp16=%d force_fp32=%d force_fp16=%d "
|
||||
+ "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d",
|
||||
+ ggml_cuda_cublas_route_name(shape.route),
|
||||
+ shape.type,
|
||||
+ shape.src1_type,
|
||||
+ (long long) shape.row_diff,
|
||||
+ (long long) shape.src1_ncols,
|
||||
+ (long long) shape.ne00,
|
||||
+ (long long) shape.ne10,
|
||||
+ (long long) shape.ldc,
|
||||
+ shape.supports_bf16 ? 1 : 0,
|
||||
+ shape.use_fp16 ? 1 : 0,
|
||||
+ shape.fast_fp16 ? 1 : 0,
|
||||
+ shape.force_fp32 ? 1 : 0,
|
||||
+ shape.force_fp16 ? 1 : 0,
|
||||
+ shape.src0_contig ? 1 : 0,
|
||||
+ shape.full_rows ? 1 : 0,
|
||||
+ shape.is_cdna ? 1 : 0,
|
||||
+ shape.is_rdna4 ? 1 : 0,
|
||||
+ shape.is_volta ? 1 : 0);
|
||||
+}
|
||||
+
|
||||
static inline int ggml_cuda_mmq_small_m_shape_format(
|
||||
char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
|
||||
return std::snprintf(buf, size,
|
||||
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
index 2bd41d1d8..1443749c3 100644
|
||||
--- a/tests/test-cuda-mmq-shape-trace.cpp
|
||||
+++ b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
@@ -285,5 +285,67 @@ int main() {
|
||||
require(std::strstr(buf, "use_batched_cublas=0") != nullptr,
|
||||
"regular MUL_MAT trace includes batched cuBLAS predicate");
|
||||
|
||||
+ const ggml_cuda_cublas_route_shape bf16_tc = ggml_cuda_cublas_route_shape_make(
|
||||
+ /* type */ 30,
|
||||
+ /* src1_type */ 0,
|
||||
+ /* row_diff */ 18,
|
||||
+ /* src1_ncols */ 18,
|
||||
+ /* ne00 */ 1024,
|
||||
+ /* ne10 */ 1024,
|
||||
+ /* ldc */ 18,
|
||||
+ /* supports_bf16 */ true,
|
||||
+ /* use_fp16 */ false,
|
||||
+ /* fast_fp16 */ true,
|
||||
+ /* force_fp32 */ false,
|
||||
+ /* force_fp16 */ false,
|
||||
+ /* src0_contig */ true,
|
||||
+ /* full_rows */ true,
|
||||
+ /* is_cdna */ false,
|
||||
+ /* is_rdna4 */ false,
|
||||
+ /* is_volta */ false);
|
||||
+
|
||||
+ require(bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_BF16_TC,
|
||||
+ "cuBLAS records native BF16 tensor-core route");
|
||||
+
|
||||
+ const ggml_cuda_cublas_route_shape nvfp4_bf16_tc = ggml_cuda_cublas_route_shape_make(
|
||||
+ /* type */ 40, 0, 128, 128, 1024, 1024, 128, true, false, true, false, false, true, true,
|
||||
+ false, false, false);
|
||||
+
|
||||
+ require(nvfp4_bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC,
|
||||
+ "cuBLAS records NVFP4 dequant-to-BF16 tensor-core route");
|
||||
+
|
||||
+ const ggml_cuda_cublas_route_shape f16_tc_16f = ggml_cuda_cublas_route_shape_make(
|
||||
+ /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, false, false, true, true,
|
||||
+ false, false, false);
|
||||
+
|
||||
+ require(f16_tc_16f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F,
|
||||
+ "cuBLAS records default FP16 tensor-core 16F compute route");
|
||||
+
|
||||
+ const ggml_cuda_cublas_route_shape f16_tc_32f = ggml_cuda_cublas_route_shape_make(
|
||||
+ /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, true, false, true, true,
|
||||
+ false, false, false);
|
||||
+
|
||||
+ require(f16_tc_32f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F,
|
||||
+ "cuBLAS records forced FP16 tensor-core 32F compute route");
|
||||
+
|
||||
+ const ggml_cuda_cublas_route_shape sgemm = ggml_cuda_cublas_route_shape_make(
|
||||
+ /* type */ 0, 0, 12, 12, 1024, 1024, 12, false, false, true, false, false, true, true,
|
||||
+ false, false, false);
|
||||
+
|
||||
+ require(sgemm.route == GGML_CUDA_CUBLAS_ROUTE_SGEMM,
|
||||
+ "cuBLAS records SGEMM fallback route");
|
||||
+
|
||||
+ const int cublas_route_n = ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), bf16_tc);
|
||||
+
|
||||
+ require(cublas_route_n > 0, "cuBLAS route format returns byte count");
|
||||
+ require(std::strstr(buf, "route=bf16_tc") != nullptr, "cuBLAS trace includes route name");
|
||||
+ require(std::strstr(buf, "type=30") != nullptr, "cuBLAS trace includes src0 type");
|
||||
+ require(std::strstr(buf, "src1_type=0") != nullptr, "cuBLAS trace includes src1 type");
|
||||
+ require(std::strstr(buf, "row_diff=18") != nullptr, "cuBLAS trace includes row count");
|
||||
+ require(std::strstr(buf, "src1_ncols=18") != nullptr, "cuBLAS trace includes source column count");
|
||||
+ require(std::strstr(buf, "supports_bf16=1") != nullptr, "cuBLAS trace includes BF16 predicate");
|
||||
+ require(std::strstr(buf, "force_fp32=0") != nullptr, "cuBLAS trace includes forced compute predicate");
|
||||
+ require(std::strstr(buf, "src0_contig=1") != nullptr, "cuBLAS trace includes contiguity predicate");
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -1,162 +0,0 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Wed, 1 Jul 2026 06:38:11 +0000
|
||||
Subject: [PATCH] feat(cuda): trace cublas tensor names
|
||||
|
||||
Extend LLAMA_CUBLAS_ROUTE_TRACE with src0/src1/dst tensor names so SGEMM and BF16 cuBLAS buckets can be tied back to graph nodes.
|
||||
|
||||
Assisted-by: Codex:gpt-5
|
||||
---
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 5 +++--
|
||||
ggml/src/ggml-cuda/mmq-shape-trace.h | 16 +++++++++++++---
|
||||
tests/test-cuda-mmq-shape-trace.cpp | 18 ++++++++++++------
|
||||
3 files changed, 28 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index eff197818..1da67d2af 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -1648,7 +1648,7 @@ static inline void ggml_cuda_cublas_route_trace(const ggml_cuda_cublas_route_sha
|
||||
return;
|
||||
}
|
||||
|
||||
- char buf[320];
|
||||
+ char buf[512];
|
||||
ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), shape);
|
||||
fprintf(stderr, "[LLAMA_CUBLAS_ROUTE] %s\n", buf);
|
||||
}
|
||||
@@ -1701,7 +1701,8 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
ggml_cuda_cublas_route_trace(ggml_cuda_cublas_route_shape_make(
|
||||
src0->type, src1->type, row_diff, src1_ncols, ne00, ne10, ldc,
|
||||
supports_bf16, use_fp16, fast_fp16, force_fp32, force_fp16, src0_contig, full_rows,
|
||||
- GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA));
|
||||
+ GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA,
|
||||
+ src0->name, src1->name, dst->name));
|
||||
|
||||
if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && src0_contig && full_rows) {
|
||||
// Paged prefill lever (patch 0033): NVFP4 only reaches cuBLAS when
|
||||
diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
index f5b4ecf2c..b55c7467c 100644
|
||||
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
|
||||
@@ -129,6 +129,9 @@ struct ggml_cuda_cublas_route_shape {
|
||||
bool is_cdna;
|
||||
bool is_rdna4;
|
||||
bool is_volta;
|
||||
+ const char * src0_name;
|
||||
+ const char * src1_name;
|
||||
+ const char * dst_name;
|
||||
};
|
||||
|
||||
static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
|
||||
@@ -251,7 +254,7 @@ static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make(
|
||||
const int64_t ne00, const int64_t ne10, const int64_t ldc, const bool supports_bf16,
|
||||
const bool use_fp16, const bool fast_fp16, const bool force_fp32, const bool force_fp16,
|
||||
const bool src0_contig, const bool full_rows, const bool is_cdna, const bool is_rdna4,
|
||||
- const bool is_volta) {
|
||||
+ const bool is_volta, const char * src0_name, const char * src1_name, const char * dst_name) {
|
||||
ggml_cuda_cublas_route route = GGML_CUDA_CUBLAS_ROUTE_SGEMM;
|
||||
if (supports_bf16 && type == 40 && src0_contig && full_rows) {
|
||||
route = GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC;
|
||||
@@ -284,6 +287,9 @@ static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make(
|
||||
is_cdna,
|
||||
is_rdna4,
|
||||
is_volta,
|
||||
+ src0_name ? src0_name : "",
|
||||
+ src1_name ? src1_name : "",
|
||||
+ dst_name ? dst_name : "",
|
||||
};
|
||||
}
|
||||
|
||||
@@ -464,7 +470,8 @@ static inline int ggml_cuda_cublas_route_shape_format(
|
||||
return std::snprintf(buf, size,
|
||||
"route=%s type=%d src1_type=%d row_diff=%lld src1_ncols=%lld ne00=%lld ne10=%lld ldc=%lld "
|
||||
"supports_bf16=%d use_fp16=%d fast_fp16=%d force_fp32=%d force_fp16=%d "
|
||||
- "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d",
|
||||
+ "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d "
|
||||
+ "src0=%s src1=%s dst=%s",
|
||||
ggml_cuda_cublas_route_name(shape.route),
|
||||
shape.type,
|
||||
shape.src1_type,
|
||||
@@ -482,7 +489,10 @@ static inline int ggml_cuda_cublas_route_shape_format(
|
||||
shape.full_rows ? 1 : 0,
|
||||
shape.is_cdna ? 1 : 0,
|
||||
shape.is_rdna4 ? 1 : 0,
|
||||
- shape.is_volta ? 1 : 0);
|
||||
+ shape.is_volta ? 1 : 0,
|
||||
+ shape.src0_name,
|
||||
+ shape.src1_name,
|
||||
+ shape.dst_name);
|
||||
}
|
||||
|
||||
static inline int ggml_cuda_mmq_small_m_shape_format(
|
||||
diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
index 1443749c3..2547193ce 100644
|
||||
--- a/tests/test-cuda-mmq-shape-trace.cpp
|
||||
+++ b/tests/test-cuda-mmq-shape-trace.cpp
|
||||
@@ -27,7 +27,7 @@ int main() {
|
||||
require(shape.n_active_est == 256, "active expert estimate is capped by expert count");
|
||||
require(shape.density == 4, "density is ceil(assignments / active experts)");
|
||||
|
||||
- char buf[256];
|
||||
+ char buf[512];
|
||||
const int n = ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
|
||||
|
||||
require(n > 0, "format returns byte count");
|
||||
@@ -302,35 +302,38 @@ int main() {
|
||||
/* full_rows */ true,
|
||||
/* is_cdna */ false,
|
||||
/* is_rdna4 */ false,
|
||||
- /* is_volta */ false);
|
||||
+ /* is_volta */ false,
|
||||
+ /* src0_name */ "blk.0.proj.weight",
|
||||
+ /* src1_name */ "blk.0.proj.inp",
|
||||
+ /* dst_name */ "blk.0.proj.out");
|
||||
|
||||
require(bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_BF16_TC,
|
||||
"cuBLAS records native BF16 tensor-core route");
|
||||
|
||||
const ggml_cuda_cublas_route_shape nvfp4_bf16_tc = ggml_cuda_cublas_route_shape_make(
|
||||
/* type */ 40, 0, 128, 128, 1024, 1024, 128, true, false, true, false, false, true, true,
|
||||
- false, false, false);
|
||||
+ false, false, false, "nvfp4.weight", "nvfp4.inp", "nvfp4.out");
|
||||
|
||||
require(nvfp4_bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC,
|
||||
"cuBLAS records NVFP4 dequant-to-BF16 tensor-core route");
|
||||
|
||||
const ggml_cuda_cublas_route_shape f16_tc_16f = ggml_cuda_cublas_route_shape_make(
|
||||
/* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, false, false, true, true,
|
||||
- false, false, false);
|
||||
+ false, false, false, "f16.weight", "f16.inp", "f16.out");
|
||||
|
||||
require(f16_tc_16f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F,
|
||||
"cuBLAS records default FP16 tensor-core 16F compute route");
|
||||
|
||||
const ggml_cuda_cublas_route_shape f16_tc_32f = ggml_cuda_cublas_route_shape_make(
|
||||
/* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, true, false, true, true,
|
||||
- false, false, false);
|
||||
+ false, false, false, "f16.weight", "f16.inp", "f16.out");
|
||||
|
||||
require(f16_tc_32f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F,
|
||||
"cuBLAS records forced FP16 tensor-core 32F compute route");
|
||||
|
||||
const ggml_cuda_cublas_route_shape sgemm = ggml_cuda_cublas_route_shape_make(
|
||||
/* type */ 0, 0, 12, 12, 1024, 1024, 12, false, false, true, false, false, true, true,
|
||||
- false, false, false);
|
||||
+ false, false, false, "f32.weight", "f32.inp", "f32.out");
|
||||
|
||||
require(sgemm.route == GGML_CUDA_CUBLAS_ROUTE_SGEMM,
|
||||
"cuBLAS records SGEMM fallback route");
|
||||
@@ -346,6 +349,9 @@ int main() {
|
||||
require(std::strstr(buf, "supports_bf16=1") != nullptr, "cuBLAS trace includes BF16 predicate");
|
||||
require(std::strstr(buf, "force_fp32=0") != nullptr, "cuBLAS trace includes forced compute predicate");
|
||||
require(std::strstr(buf, "src0_contig=1") != nullptr, "cuBLAS trace includes contiguity predicate");
|
||||
+ require(std::strstr(buf, "src0=blk.0.proj.weight") != nullptr, "cuBLAS trace includes src0 name");
|
||||
+ require(std::strstr(buf, "src1=blk.0.proj.inp") != nullptr, "cuBLAS trace includes src1 name");
|
||||
+ require(std::strstr(buf, "dst=blk.0.proj.out") != nullptr, "cuBLAS trace includes dst name");
|
||||
|
||||
return 0;
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
Reference in New Issue
Block a user