From b529cc5420dc735f585283a16dc5f07db34f539e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 2 Jul 2026 10:19:10 +0000 Subject: [PATCH] patches(paged): trim series to Phase135 routed-FFN line, sync to fork 1edddc8fe The campaign patches 0048-0063 were added without matching fork commits. After a keep/drop review, the series is trimmed and re-mirrored 1:1 onto the fork branch mudler/llama.cpp:localai-paged (HEAD 1edddc8fe, on 51168c5ee). Kept, renumbered from the fork (now carry Assisted-by + Signed-off-by): - 0048 test(paged): cover MoE swiglu down chain (was 0051, fd920cf8a) - 0049 test(paged): cover MoE weighted combine chain (was 0052, a85c1e098) - 0050 test(paged): cover ragged MoE dispatch (was 0053, 2fed6aacf) - 0051 fix(speculative): disable backend sampling for MTP drafts (was 0054, f1d976f06) - 0052 feat(paged): whole-pattern MoE matcher + routed-FFN fused NVFP4-quant down MMQ (new, 1edddc8fe) Dropped (no fork commits, removed from the series): - 0048-0050 W4A16 grouped-tile pack/tune/pad: dead line, W4A16 ~1.5x slower than grouped-MMQ. - 0055-0063 speculative/moe/mul-mat/cublas route traces + the rejected small-M tile-policy knob (0059). - All other 110-140 campaign markers not needed by Phase135 (GPU-sort, W4A16-direct-A, boundary trace/timing, Phase133 sorted-F32, Phase134 fused-SWIGLU, Phase138 finalize) carry no code in this tree. Tree-hash proof (the mirror invariant): a fresh detached worktree at LLAMA_VERSION 0ed235ea2c17a19fc8238668653946721ed136fd with every on-disk patches/paged/0*.patch applied in numeric order (git apply) stages to tree 097c862c6834b7d8b90419b305b8402155ef8373, byte-identical to fork HEAD 1edddc8fe's tree. Series is 43 patches (0001-0047 unchanged + 0048-0052). Gated on GB10 sm_121a: default md5 MoE 8cb0ce23 / dense 5951a5b4 unchanged; opt-in md5-clean; MUL_MAT 1146/1146, MUL_MAT_ID 806/806, GATED_DELTA_NET 46/46, MOE_SWIGLU_DOWN 7/7, MUL_MAT_ID_RAGGED_MOE 6/6; six mmq_moe_quantized_raw markers with zero sorted launches on the opt-in sentinel. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- ...ged-pack-W4A16-grouped-tile-metadata.patch | 121 --- ...t-paged-cover-MoE-swiglu-down-chain.patch} | 6 +- ...-paged-tune-W4A16-grouped-tile-shape.patch | 93 -- ...ed-cover-MoE-weighted-combine-chain.patch} | 6 +- ...paged-pad-W4A16-A-shared-tile-stride.patch | 56 -- ...est-paged-cover-ragged-MoE-dispatch.patch} | 6 +- ...isable-backend-sampling-for-MTP-dra.patch} | 7 +- ...-pattern-MoE-matcher-routed-FFN-fuse.patch | 933 ++++++++++++++++++ ...erver-trace-speculative-batch-shapes.patch | 57 -- ...feat-cuda-trace-moe-mmq-batch-shapes.patch | 212 ---- ...eat-cuda-trace-moe-mmq-launch-shapes.patch | 223 ----- ...uda-trace-moe-small-m-mmq-candidates.patch | 182 ---- ...uda-gate-moe-small-m-mmq-tile-policy.patch | 80 -- ...0060-feat-cuda-trace-moe-mmid-routes.patch | 292 ------ .../0061-feat-cuda-trace-mul-mat-routes.patch | 345 ------- .../0062-feat-cuda-trace-cublas-routes.patch | 332 ------- ...-feat-cuda-trace-cublas-tensor-names.patch | 162 --- 17 files changed, 950 insertions(+), 2163 deletions(-) delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0048-feat-paged-pack-W4A16-grouped-tile-metadata.patch rename backend/cpp/llama-cpp-localai-paged/patches/paged/{0051-test-paged-cover-MoE-swiglu-down-chain.patch => 0048-test-paged-cover-MoE-swiglu-down-chain.patch} (95%) delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0049-feat-paged-tune-W4A16-grouped-tile-shape.patch rename backend/cpp/llama-cpp-localai-paged/patches/paged/{0052-test-paged-cover-MoE-weighted-combine-chain.patch => 0049-test-paged-cover-MoE-weighted-combine-chain.patch} (95%) delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0050-feat-paged-pad-W4A16-A-shared-tile-stride.patch rename backend/cpp/llama-cpp-localai-paged/patches/paged/{0053-test-paged-cover-ragged-MoE-dispatch.patch => 0050-test-paged-cover-ragged-MoE-dispatch.patch} (96%) rename backend/cpp/llama-cpp-localai-paged/patches/paged/{0054-fix-speculative-disable-backend-sampling-for-MTP-drafts.patch => 0051-fix-speculative-disable-backend-sampling-for-MTP-dra.patch} (81%) create mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0052-feat-paged-whole-pattern-MoE-matcher-routed-FFN-fuse.patch delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0055-feat-server-trace-speculative-batch-shapes.patch delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0056-feat-cuda-trace-moe-mmq-batch-shapes.patch delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0057-feat-cuda-trace-moe-mmq-launch-shapes.patch delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0058-feat-cuda-trace-moe-small-m-mmq-candidates.patch delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0059-feat-cuda-gate-moe-small-m-mmq-tile-policy.patch delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0060-feat-cuda-trace-moe-mmid-routes.patch delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0061-feat-cuda-trace-mul-mat-routes.patch delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0062-feat-cuda-trace-cublas-routes.patch delete mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0063-feat-cuda-trace-cublas-tensor-names.patch diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0048-feat-paged-pack-W4A16-grouped-tile-metadata.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0048-feat-paged-pack-W4A16-grouped-tile-metadata.patch deleted file mode 100644 index de720c697..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0048-feat-paged-pack-W4A16-grouped-tile-metadata.patch +++ /dev/null @@ -1,121 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Tue, 30 Jun 2026 21:19:07 +0000 -Subject: [PATCH] feat(paged): pack W4A16 grouped tile metadata - -Collapse the grouped W4A16 tile metadata from three host vectors and three device copies into one aligned descriptor array and one H2D copy. This keeps the path default-off while reducing launch-side metadata overhead for the experimental W4A16 MoE prefill path. - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/w4a16-gemm.cu | 46 ++++++++++++++++++-------------- - 1 file changed, 26 insertions(+), 20 deletions(-) - -diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu -index f348f31ad..899e1a23f 100644 ---- a/ggml/src/ggml-cuda/w4a16-gemm.cu -+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu -@@ -18,6 +18,14 @@ typedef tile<16, 8, nv_bfloat162> tile_A; // A operand: M=16, K=16 - typedef tile< 8, 8, nv_bfloat162> tile_B; // B operand: N=8, K=16 - typedef tile<16, 8, float> tile_C; // accumulator: M=16, N=8 - -+struct alignas(16) w4a16_tile_desc { -+ int expert; -+ int row0; -+ int rows; -+ int pad; -+}; -+static_assert(sizeof(w4a16_tile_desc) == 16, "w4a16 tile descriptors must stay 16 bytes"); -+ - #ifndef LLAMA_W4A16_PREFILL_M - #define LLAMA_W4A16_PREFILL_M 0 - #endif // LLAMA_W4A16_PREFILL_M -@@ -92,9 +100,7 @@ static __global__ void w4a16_grouped_kernel( - const nv_bfloat16 * __restrict__ Abf, // [pad_rows, K] bf16 - const block_nvfp4 * __restrict__ W0, // src0 base (expert 0) - float * __restrict__ C, // [total_rows, N] f32 -- const int * __restrict__ g_tile_expert, -- const int * __restrict__ g_tile_row0, -- const int * __restrict__ g_tile_rows, -+ const w4a16_tile_desc * __restrict__ g_tiles, - int N, int K, int64_t expert_stride_blocks) { - #if defined(AMPERE_MMA_AVAILABLE) && defined(CP_ASYNC_AVAILABLE) - constexpr int BK = 64; // one nvfp4 block -@@ -129,9 +135,10 @@ static __global__ void w4a16_grouped_kernel( - const int tid = warp*32 + lane; // linear id for the cp.async strided copies - const int wrow = warp / WARPS_N, wcol = warp % WARPS_N; - -- const int e = g_tile_expert[blockIdx.y]; -- const int row0 = g_tile_row0[blockIdx.y]; -- const int rcount = g_tile_rows[blockIdx.y]; -+ const w4a16_tile_desc tile_desc = g_tiles[blockIdx.y]; -+ const int e = tile_desc.expert; -+ const int row0 = tile_desc.row0; -+ const int rcount = tile_desc.rows; - const int blockCol = blockIdx.x*BN; - const int Kb = K/64; - const block_nvfp4 * We = W0 + (int64_t) e*expert_stride_blocks; // expert e weight base -@@ -238,7 +245,7 @@ static __global__ void w4a16_grouped_kernel( - } - #else - GGML_UNUSED(Abf); GGML_UNUSED(W0); GGML_UNUSED(C); -- GGML_UNUSED(g_tile_expert); GGML_UNUSED(g_tile_row0); GGML_UNUSED(g_tile_rows); -+ GGML_UNUSED(g_tiles); - GGML_UNUSED(N); GGML_UNUSED(K); GGML_UNUSED(expert_stride_blocks); - NO_DEVICE_CODE; - #endif // AMPERE_MMA_AVAILABLE && CP_ASYNC_AVAILABLE -@@ -296,18 +303,21 @@ void ggml_cuda_mul_mat_id_w4a16_grouped( - return; - } - -- std::vector h_tile_expert, h_tile_row0, h_tile_rows; -+ std::vector h_tiles; - int64_t row = 0; - for (int64_t e = 0; e < n_experts; e++) { - const int t = tokens_per_expert[e]; - for (int off = 0; off < t; off += BM) { -- h_tile_expert.push_back((int32_t) e); -- h_tile_row0.push_back((int32_t) (row + off)); -- h_tile_rows.push_back((int32_t) std::min(BM, t - off)); -+ h_tiles.push_back({ -+ (int) e, -+ (int) (row + off), -+ (int) std::min(BM, t - off), -+ 0, -+ }); - } - row += t; - } -- const int n_tiles = (int) h_tile_expert.size(); -+ const int n_tiles = (int) h_tiles.size(); - - if (getenv("LLAMA_W4A16_DEBUG")) { - int max_tpe = 0, multi = 0; -@@ -319,13 +329,9 @@ void ggml_cuda_mul_mat_id_w4a16_grouped( - (long long) total_rows, (long long) n_experts, (long long) K, (long long) N, n_tiles, max_tpe, multi); - } - -- // device: tile map -- ggml_cuda_pool_alloc d_tile_expert(ctx.pool(), n_tiles); -- ggml_cuda_pool_alloc d_tile_row0 (ctx.pool(), n_tiles); -- ggml_cuda_pool_alloc d_tile_rows (ctx.pool(), n_tiles); -- CUDA_CHECK(cudaMemcpyAsync(d_tile_expert.ptr, h_tile_expert.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); -- CUDA_CHECK(cudaMemcpyAsync(d_tile_row0.ptr, h_tile_row0.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); -- CUDA_CHECK(cudaMemcpyAsync(d_tile_rows.ptr, h_tile_rows.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); -+ // device: packed tile map; one pageable H2D copy instead of three tiny copies -+ ggml_cuda_pool_alloc d_tiles(ctx.pool(), n_tiles); -+ CUDA_CHECK(cudaMemcpyAsync(d_tiles.ptr, h_tiles.data(), n_tiles*sizeof(w4a16_tile_desc), cudaMemcpyHostToDevice, stream)); - - // activations: f32 -> bf16 (cheap cast, NO act-quant), zero-padded so every tile's BM-row read - // stays in-bounds. A tile's row0 is generally NOT BM-aligned (experts start mid-buffer), and a -@@ -353,7 +359,7 @@ void ggml_cuda_mul_mat_id_w4a16_grouped( - dim3 block(32, WARPS_M*WARPS_N); // 2D: threadIdx.x = warp lane, threadIdx.y = warp - kern<<>>( - Abf.get(), (const block_nvfp4 *) src0->data, dst_sorted, -- d_tile_expert.ptr, d_tile_row0.ptr, d_tile_rows.ptr, -+ d_tiles.ptr, - (int) N, (int) K, expert_stride_blocks); - CUDA_CHECK(cudaGetLastError()); - } --- -2.43.0 - diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0051-test-paged-cover-MoE-swiglu-down-chain.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0048-test-paged-cover-MoE-swiglu-down-chain.patch similarity index 95% rename from backend/cpp/llama-cpp-localai-paged/patches/paged/0051-test-paged-cover-MoE-swiglu-down-chain.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0048-test-paged-cover-MoE-swiglu-down-chain.patch index 469213a50..8a0f6ec8f 100644 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0051-test-paged-cover-MoE-swiglu-down-chain.patch +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0048-test-paged-cover-MoE-swiglu-down-chain.patch @@ -1,8 +1,10 @@ -From cd56cf037379b084d6bb0ed47db8b785c828be86 Mon Sep 17 00:00:00 2001 +From fd920cf8a7fe9cc7753cd0640411ce771edfeaca Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 30 Jun 2026 23:18:38 +0000 -Subject: [PATCH] test(paged): cover MoE swiglu down chain +Subject: [PATCH 48/52] test(paged): cover MoE swiglu down chain +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto --- tests/test-backend-ops.cpp | 92 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0049-feat-paged-tune-W4A16-grouped-tile-shape.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0049-feat-paged-tune-W4A16-grouped-tile-shape.patch deleted file mode 100644 index 4d2736e44..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0049-feat-paged-tune-W4A16-grouped-tile-shape.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 7dfa0e17548c5f04f83d2cc2a057b0a9941b599a Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Tue, 30 Jun 2026 21:44:54 +0000 -Subject: [PATCH] feat(paged): tune W4A16 grouped tile shape - -Select a BM=32 grouped W4A16 MoE prefill shape by default after the GB10 shape sweep, while keeping the prior 64x128 shape and additional diagnostics selectable through LLAMA_W4A16_SHAPE. - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/w4a16-gemm.cu | 42 +++++++++++++++++++++++++++++--- - 1 file changed, 38 insertions(+), 4 deletions(-) - -diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu -index 899e1a23f..ca8864292 100644 ---- a/ggml/src/ggml-cuda/w4a16-gemm.cu -+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu -@@ -4,6 +4,7 @@ - #include - #include - #include -+#include - #include - - // =========================================================================== -@@ -281,7 +282,23 @@ bool ggml_cuda_w4a16_moe_grouped_should_engage( - return true; - } - --void ggml_cuda_mul_mat_id_w4a16_grouped( -+static bool ggml_cuda_w4a16_use_base_shape() { -+ static const bool use_base = [] { -+ const char * e = getenv("LLAMA_W4A16_SHAPE"); -+ if (e == nullptr || e[0] == '\0' || strcmp(e, "default") == 0 || strcmp(e, "bm32") == 0 || strcmp(e, "32x128") == 0) { -+ return false; -+ } -+ if (strcmp(e, "base") == 0 || strcmp(e, "64x128") == 0) { -+ return true; -+ } -+ fprintf(stderr, "[w4a16] unknown LLAMA_W4A16_SHAPE=%s, using default bm32\n", e); -+ return false; -+ }(); -+ return use_base; -+} -+ -+template -+static void ggml_cuda_mul_mat_id_w4a16_grouped_impl( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, - const float * src1_sorted, -@@ -290,9 +307,7 @@ void ggml_cuda_mul_mat_id_w4a16_grouped( - int64_t n_experts, int64_t K, int64_t N, - cudaStream_t stream) { - GGML_ASSERT(src0->type == GGML_TYPE_NVFP4); -- GGML_ASSERT(N % 128 == 0 && K % 64 == 0); -- -- constexpr int BM = 64, BN = 128, WARPS_M = 2, WARPS_N = 4, STAGES = 2; -+ GGML_ASSERT(N % BN == 0 && K % 64 == 0); - - // host: build the per-M-tile expert map (ragged, no tile crosses an expert boundary) - int64_t total_rows = 0; -@@ -327,6 +342,8 @@ void ggml_cuda_mul_mat_id_w4a16_grouped( - } - fprintf(stderr, "[w4a16] engaged: total_rows=%lld n_experts=%lld K=%lld N=%lld n_tiles=%d max_tpe=%d multi_tile_experts=%d\n", - (long long) total_rows, (long long) n_experts, (long long) K, (long long) N, n_tiles, max_tpe, multi); -+ fprintf(stderr, "[w4a16] shape: BM=%d BN=%d WARPS_M=%d WARPS_N=%d STAGES=%d\n", -+ BM, BN, WARPS_M, WARPS_N, STAGES); - } - - // device: packed tile map; one pageable H2D copy instead of three tiny copies -@@ -363,3 +380,20 @@ void ggml_cuda_mul_mat_id_w4a16_grouped( - (int) N, (int) K, expert_stride_blocks); - CUDA_CHECK(cudaGetLastError()); - } -+ -+void ggml_cuda_mul_mat_id_w4a16_grouped( -+ ggml_backend_cuda_context & ctx, -+ const ggml_tensor * src0, -+ const float * src1_sorted, -+ float * dst_sorted, -+ const int * tokens_per_expert, -+ int64_t n_experts, int64_t K, int64_t N, -+ cudaStream_t stream) { -+ if (ggml_cuda_w4a16_use_base_shape()) { -+ ggml_cuda_mul_mat_id_w4a16_grouped_impl<64, 128, 2, 4, 2>( -+ ctx, src0, src1_sorted, dst_sorted, tokens_per_expert, n_experts, K, N, stream); -+ } else { -+ ggml_cuda_mul_mat_id_w4a16_grouped_impl<32, 128, 1, 4, 2>( -+ ctx, src0, src1_sorted, dst_sorted, tokens_per_expert, n_experts, K, N, stream); -+ } -+} --- -2.43.0 - diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-test-paged-cover-MoE-weighted-combine-chain.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0049-test-paged-cover-MoE-weighted-combine-chain.patch similarity index 95% rename from backend/cpp/llama-cpp-localai-paged/patches/paged/0052-test-paged-cover-MoE-weighted-combine-chain.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0049-test-paged-cover-MoE-weighted-combine-chain.patch index e68f1da91..2239f72c9 100644 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-test-paged-cover-MoE-weighted-combine-chain.patch +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0049-test-paged-cover-MoE-weighted-combine-chain.patch @@ -1,8 +1,10 @@ -From 3ef7eb9e412eb34f8656675862f6753c65d28ec9 Mon Sep 17 00:00:00 2001 +From a85c1e098e22eb587fd80220986f35a8d6e11300 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 30 Jun 2026 23:50:33 +0000 -Subject: [PATCH] test(paged): cover MoE weighted combine chain +Subject: [PATCH 49/52] test(paged): cover MoE weighted combine chain +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto --- tests/test-backend-ops.cpp | 90 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0050-feat-paged-pad-W4A16-A-shared-tile-stride.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0050-feat-paged-pad-W4A16-A-shared-tile-stride.patch deleted file mode 100644 index b69a895be..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0050-feat-paged-pad-W4A16-A-shared-tile-stride.patch +++ /dev/null @@ -1,56 +0,0 @@ -From d9b9be0bee3d7239132bfca05d5b057ff4ee4cc3 Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Tue, 30 Jun 2026 22:13:09 +0000 -Subject: [PATCH] feat(paged): pad W4A16 A shared tile stride - -Pad the grouped W4A16 A operand shared-memory row stride to reduce bank pressure on GB10 while preserving the selected bm32/default and base launch shapes. - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/w4a16-gemm.cu | 9 +++++---- - 1 file changed, 5 insertions(+), 4 deletions(-) - -diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu -index ca8864292..b17e022b0 100644 ---- a/ggml/src/ggml-cuda/w4a16-gemm.cu -+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu -@@ -111,7 +111,8 @@ static __global__ void w4a16_grouped_kernel( - constexpr int MF = WM/16, NF = WN/8; - - constexpr int AN = BK/2; // bf16 pairs per A smem row (nv_bfloat162) -- constexpr int SZ_A = BM*AN; // nv_bfloat162 per stage -+ constexpr int ASTR = AN + 4; // padded A smem row stride, in nv_bfloat162 -+ constexpr int SZ_A = BM*ASTR; // nv_bfloat162 per stage - constexpr int SZ_WQ = BN*8; // u32 per stage (32 qs bytes/row) - constexpr int SZ_WD = BN; // u32 per stage (4 scale bytes/row) - -@@ -155,7 +156,7 @@ static __global__ void w4a16_grouped_kernel( - const int c = idx % (BK/8); // 16B chunk in the row - const int r = idx / (BK/8); // row in tile - const nv_bfloat16 * src = Abf + (int64_t)(row0 + r)*K + (int64_t)kt*BK + c*8; -- w4a16_cp_async16(((char *) sA[st]) + (r*AN + c*4)*sizeof(uint32_t), src); -+ w4a16_cp_async16(((char *) sA[st]) + (r*ASTR + c*4)*sizeof(uint32_t), src); - } - // W qs: BN rows x 32 bytes = BN x 8 u32 (each block's qs at byte offset 4) - #pragma unroll 1 -@@ -198,7 +199,7 @@ static __global__ void w4a16_grouped_kernel( - #pragma unroll - for (int mi = 0; mi < MF; mi++) { - const int rb = wrow*WM + mi*16; -- load_ldmatrix(A_frag[mi], sAcur + rb*AN + kk*8, AN); -+ load_ldmatrix(A_frag[mi], sAcur + rb*ASTR + kk*8, ASTR); - } - // B fragments: in-register FP4->bf16 dequant (correct-by-construction via tile get_i/get_j) - tile_B B_frag[NF]; -@@ -368,7 +369,7 @@ static void ggml_cuda_mul_mat_id_w4a16_grouped_impl( - const int64_t expert_stride_blocks = (int64_t) (src0->nb[2] / sizeof(block_nvfp4)); - - auto kern = w4a16_grouped_kernel; -- constexpr int STAGE_U32 = BM*(64/2) + BN*8 + BN; -+ constexpr int STAGE_U32 = BM*((64/2) + 4) + BN*8 + BN; - const int smem_bytes = STAGES * STAGE_U32 * (int) sizeof(uint32_t); - CUDA_CHECK(cudaFuncSetAttribute(kern, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes)); - --- -2.43.0 - diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0050-test-paged-cover-ragged-MoE-dispatch.patch similarity index 96% rename from backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0050-test-paged-cover-ragged-MoE-dispatch.patch index cfc8e9736..ab9554ef4 100644 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0050-test-paged-cover-ragged-MoE-dispatch.patch @@ -1,8 +1,10 @@ -From e21732fc47206d5878e3b977bbd21858a3ba4ab0 Mon Sep 17 00:00:00 2001 +From 2fed6aacff14537864bbf754c7552740131d4eaf Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 1 Jul 2026 00:39:52 +0000 -Subject: [PATCH] test(paged): cover ragged MoE dispatch +Subject: [PATCH 50/52] test(paged): cover ragged MoE dispatch +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto --- tests/test-backend-ops.cpp | 118 +++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-fix-speculative-disable-backend-sampling-for-MTP-drafts.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0051-fix-speculative-disable-backend-sampling-for-MTP-dra.patch similarity index 81% rename from backend/cpp/llama-cpp-localai-paged/patches/paged/0054-fix-speculative-disable-backend-sampling-for-MTP-drafts.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0051-fix-speculative-disable-backend-sampling-for-MTP-dra.patch index 5c2c125aa..089e3c807 100644 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-fix-speculative-disable-backend-sampling-for-MTP-drafts.patch +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0051-fix-speculative-disable-backend-sampling-for-MTP-dra.patch @@ -1,8 +1,11 @@ -From 3eba64afff6ecaa25da11f0e394717224f221c9a Mon Sep 17 00:00:00 2001 +From f1d976f06fb92655106709256dd093ffefb85e2b Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 1 Jul 2026 00:50:36 +0000 -Subject: [PATCH] fix(speculative): disable backend sampling for MTP drafts +Subject: [PATCH 51/52] fix(speculative): disable backend sampling for MTP + drafts +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto --- common/speculative.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-feat-paged-whole-pattern-MoE-matcher-routed-FFN-fuse.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-feat-paged-whole-pattern-MoE-matcher-routed-FFN-fuse.patch new file mode 100644 index 000000000..10acac46f --- /dev/null +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-feat-paged-whole-pattern-MoE-matcher-routed-FFN-fuse.patch @@ -0,0 +1,933 @@ +From 1edddc8fe93bb2fec5f831bbde5df2b7480a7b05 Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Thu, 2 Jul 2026 12:15:38 +0200 +Subject: [PATCH 52/52] feat(paged): whole-pattern MoE matcher + routed-FFN + fused NVFP4-quant down MMQ + +Add the routed-FFN fused-quant line for the paged NVFP4 MoE decode step, +all default-off and md5-clean, gating a fused SwiGLU-to-NVFP4-quant plus a +raw pre-quantized down-projection MMQ that skips the intermediate F32 +materialize + re-quantize of the standard gate_up -> SwiGLU -> down chain. + +Pieces (all guarded, no effect unless explicitly enabled): + +- Whole-pattern MoE matcher + executor hook in ggml_cuda_try_fuse + (LLAMA_MOE_WHOLE_PATTERN_EXEC and the *_TRACE/_EARLY_TRACE diagnostics). + Detects the gate_up(MUL_MAT_ID) -> view/view -> SwiGLU(GLU) -> down(MUL_MAT_ID) + sub-graph early in the fusion pass and, when engaged, runs the whole chain + through a single executor instead of node-by-node. + +- Routed-FFN PoC scaffold ggml/src/ggml-cuda/moe-ffn.{cu,cuh} + a narrow hook + (LLAMA_MOE_ROUTED_FFN_POC). ggml_cuda_compute_forward is de-static-ed so the + executor translation unit can drive the standard op path for the fallback + legs. The executor tries the fused-quant path first, else falls back to the + stock compute_forward for glu + down (bit-identical to default). + +- Fused SwiGLU-to-NVFP4-quant + raw down MMQ (LLAMA_MOE_ROUTED_FFN_FUSED_QUANT): + moe_swiglu_nvfp4_quant_kernel writes block_fp4_mmq activations directly, then + ggml_cuda_mul_mat_q_moe_quantized (with the local ggml_cuda_mmq_ids_meta + refactor of the expert-sorted ids/bounds prep) runs the down GEMM on the + pre-quantized rows. Native FP4 (Blackwell) only; NVFP4 down weights only. + +Gated on GB10 (sm_121a), before/after this commit: +- Canonical default-path greedy md5 unchanged: MoE q36-35b-a3b-nvfp4 + 8cb0ce23777bf55f92f63d0292c756b0, dense q36-27b-nvfp4 + 5951a5b4d624ce891e22ab5fca9bc439. +- md5-clean opt-in: LLAMA_MOE_ROUTED_FFN_POC=1 LLAMA_MOE_ROUTED_FFN_FUSED_QUANT=1 + keeps the MoE md5 byte-identical (8cb0ce23...). +- test-backend-ops: MUL_MAT 1146/1146, MUL_MAT_ID 806/806, GATED_DELTA_NET 46/46, + and the MoE sentinels MOE_SWIGLU_DOWN 7/7 + MUL_MAT_ID_RAGGED_MOE 6/6 pass both + default and opt-in. Opt-in emits exactly six route=mmq_moe_quantized_raw markers + with zero mmq_moe_sorted_raw launches (fused path provably engaged). +- Serving effect is flat-to-slightly-positive and not a shipped default: + decode agg 326.9 -> 332.7 t/s, mmq_nvfp4 6009 -> 5915 ms, aggregate flat + (~/bench/phase135_routed_ffn_fused_quant_serving/20260702_082102). + +The rejected/neutral neighbours of this line (Phase133 sorted-F32 down, +Phase134 fused-SWIGLU-only, Phase138 finalize/weighted-combine fusion, the +W4A16 grouped-tile pack/tune/pad line, GPU-sort, boundary/layout/quant traces) +are deliberately excluded and carry no markers in this tree. + +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto +--- + ggml/src/ggml-cuda/ggml-cuda.cu | 304 +++++++++++++++++++++++++++++++- + ggml/src/ggml-cuda/mmq.cu | 148 ++++++++++++++++ + ggml/src/ggml-cuda/mmq.cuh | 25 +++ + ggml/src/ggml-cuda/moe-ffn.cu | 296 +++++++++++++++++++++++++++++++ + ggml/src/ggml-cuda/moe-ffn.cuh | 24 +++ + 5 files changed, 796 insertions(+), 1 deletion(-) + create mode 100644 ggml/src/ggml-cuda/moe-ffn.cu + create mode 100644 ggml/src/ggml-cuda/moe-ffn.cuh + +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 374949f25..ef1bdc3b4 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -32,6 +32,7 @@ + #include "ggml-cuda/im2col.cuh" + #include "ggml-cuda/mmf.cuh" + #include "ggml-cuda/mmq.cuh" ++#include "ggml-cuda/moe-ffn.cuh" + #include "ggml-cuda/mmvf.cuh" + #include "ggml-cuda/mmvq.cuh" + #include "ggml-cuda/norm.cuh" +@@ -2854,7 +2855,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * + nb1, nb2, nb3, stream); + } + +-static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { ++bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { + switch (dst->op) { + case GGML_OP_ARGMAX: + ggml_cuda_argmax(ctx, dst); +@@ -4032,6 +4033,201 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, + return false; + } + ++static inline const char * ggml_cuda_moe_wp_trace_tensor_name(const ggml_tensor * t) { ++ return t != nullptr && t->name[0] != '\0' ? t->name : "-"; ++} ++ ++static inline int ggml_cuda_moe_whole_pattern_trace_limit() { ++ static const int value = []() { ++ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_TRACE"); ++ if (s == nullptr || strcmp(s, "0") == 0) { ++ return 0; ++ } ++ const int parsed = atoi(s); ++ return parsed > 0 ? parsed : 128; ++ }(); ++ ++ return value; ++} ++ ++static inline bool ggml_cuda_moe_whole_pattern_trace_take(std::atomic & counter) { ++ const int trace_limit = ggml_cuda_moe_whole_pattern_trace_limit(); ++ if (trace_limit <= 0) { ++ return false; ++ } ++ ++ const int trace_idx = counter.fetch_add(1, std::memory_order_relaxed); ++ return trace_idx < trace_limit; ++} ++ ++static inline int ggml_cuda_moe_whole_pattern_early_trace_limit() { ++ static const int value = []() { ++ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EARLY_TRACE"); ++ if (s == nullptr || strcmp(s, "0") == 0) { ++ return 0; ++ } ++ const int parsed = atoi(s); ++ return parsed > 0 ? parsed : 128; ++ }(); ++ ++ return value; ++} ++ ++static inline bool ggml_cuda_moe_whole_pattern_exec_enabled() { ++ static const bool value = []() { ++ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EXEC"); ++ return s != nullptr && atoi(s) != 0; ++ }(); ++ ++ return value; ++} ++ ++static inline int ggml_cuda_moe_whole_pattern_exec_trace_limit() { ++ static const int value = []() { ++ const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EXEC_TRACE"); ++ if (s == nullptr || strcmp(s, "0") == 0) { ++ return 0; ++ } ++ const int parsed = atoi(s); ++ return parsed > 0 ? parsed : 128; ++ }(); ++ ++ return value; ++} ++ ++static inline bool ggml_cuda_moe_whole_pattern_exec_trace_take(std::atomic & counter) { ++ const int trace_limit = ggml_cuda_moe_whole_pattern_exec_trace_limit(); ++ if (trace_limit <= 0) { ++ return false; ++ } ++ ++ const int trace_idx = counter.fetch_add(1, std::memory_order_relaxed); ++ return trace_idx < trace_limit; ++} ++ ++struct ggml_cuda_moe_whole_pattern { ++ const ggml_tensor * gate_up = nullptr; ++ const ggml_tensor * gate = nullptr; ++ const ggml_tensor * up = nullptr; ++ const ggml_tensor * glu = nullptr; ++ const ggml_tensor * down = nullptr; ++ const ggml_tensor * ids = nullptr; ++ ++ bool view_pair = false; ++ bool ids_match = false; ++ bool swiglu = false; ++ bool supported_type = false; ++ bool supported = false; ++}; ++ ++static ggml_cuda_moe_whole_pattern ggml_cuda_moe_whole_pattern_detect(const ggml_tensor * glu, const ggml_tensor * down) { ++ ggml_cuda_moe_whole_pattern pattern{}; ++ pattern.glu = glu; ++ pattern.down = down; ++ ++ if (glu == nullptr || down == nullptr || glu->op != GGML_OP_GLU || down->op != GGML_OP_MUL_MAT_ID) { ++ return pattern; ++ } ++ ++ pattern.gate = glu->src[0]; ++ pattern.up = glu->src[1]; ++ pattern.ids = down->src[2]; ++ ++ pattern.view_pair = pattern.gate != nullptr && pattern.up != nullptr && ++ pattern.gate->op == GGML_OP_VIEW && pattern.up->op == GGML_OP_VIEW && ++ pattern.gate->view_src != nullptr && pattern.gate->view_src == pattern.up->view_src; ++ if (!pattern.view_pair) { ++ return pattern; ++ } ++ ++ pattern.gate_up = pattern.gate->view_src; ++ if (pattern.gate_up == nullptr || pattern.gate_up->op != GGML_OP_MUL_MAT_ID) { ++ return pattern; ++ } ++ ++ pattern.ids_match = pattern.gate_up->src[2] == pattern.ids; ++ pattern.swiglu = ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU; ++ pattern.supported_type = down->src[0] != nullptr && ++ (down->src[0]->type == GGML_TYPE_NVFP4 || down->src[0]->type == GGML_TYPE_MXFP4); ++ pattern.supported = pattern.ids_match && pattern.swiglu && pattern.supported_type; ++ ++ return pattern; ++} ++ ++static ggml_cuda_moe_whole_pattern ggml_cuda_moe_whole_pattern_detect_early(const ggml_cgraph * cgraph, int i) { ++ ggml_cuda_moe_whole_pattern pattern{}; ++ ++ if (cgraph == nullptr || i + 4 >= cgraph->n_nodes) { ++ return pattern; ++ } ++ ++ const ggml_tensor * gate_up = cgraph->nodes[i + 0]; ++ const ggml_tensor * view0 = cgraph->nodes[i + 1]; ++ const ggml_tensor * view1 = cgraph->nodes[i + 2]; ++ const ggml_tensor * glu = cgraph->nodes[i + 3]; ++ const ggml_tensor * down = cgraph->nodes[i + 4]; ++ ++ pattern.gate_up = gate_up; ++ pattern.glu = glu; ++ pattern.down = down; ++ ++ if (gate_up == nullptr || view0 == nullptr || view1 == nullptr || glu == nullptr || down == nullptr || ++ gate_up->op != GGML_OP_MUL_MAT_ID || view0->op != GGML_OP_VIEW || view1->op != GGML_OP_VIEW || ++ glu->op != GGML_OP_GLU || down->op != GGML_OP_MUL_MAT_ID) { ++ return pattern; ++ } ++ ++ pattern.view_pair = view0->view_src == gate_up && view1->view_src == gate_up; ++ if (!pattern.view_pair) { ++ return pattern; ++ } ++ ++ if (glu->src[0] == view0 && glu->src[1] == view1) { ++ pattern.gate = view0; ++ pattern.up = view1; ++ } else if (glu->src[0] == view1 && glu->src[1] == view0) { ++ pattern.gate = view1; ++ pattern.up = view0; ++ } else { ++ return pattern; ++ } ++ ++ if (down->src[1] != glu) { ++ return pattern; ++ } ++ ++ pattern.ids = down->src[2]; ++ pattern.ids_match = gate_up->src[2] == pattern.ids; ++ pattern.swiglu = ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU; ++ pattern.supported_type = down->src[0] != nullptr && ++ (down->src[0]->type == GGML_TYPE_NVFP4 || down->src[0]->type == GGML_TYPE_MXFP4); ++ pattern.supported = pattern.ids_match && pattern.swiglu && pattern.supported_type; ++ ++ return pattern; ++} ++ ++static bool ggml_cuda_moe_whole_pattern_exec_proof( ++ ggml_backend_cuda_context * cuda_ctx, ++ const ggml_cuda_moe_whole_pattern & pattern) { ++ GGML_ASSERT(cuda_ctx != nullptr); ++ GGML_ASSERT(pattern.supported); ++ GGML_ASSERT(pattern.gate_up != nullptr); ++ GGML_ASSERT(pattern.glu != nullptr); ++ GGML_ASSERT(pattern.down != nullptr); ++ ++ if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast(pattern.gate_up))) { ++ return false; ++ } ++ if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast(pattern.glu))) { ++ return false; ++ } ++ if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast(pattern.down))) { ++ return false; ++ } ++ ++ return true; ++} ++ + // try and fuse nodes and return the number of nodes to skip + static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, int i) { + +@@ -4042,6 +4238,112 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph + + ggml_tensor * node = cgraph->nodes[i]; + ++ static std::atomic moe_whole_pattern_early_trace_count{0}; ++ const bool routed_ffn_poc = ggml_cuda_moe_routed_ffn_poc_enabled(); ++ const bool whole_pattern_exec = ggml_cuda_moe_whole_pattern_exec_enabled(); ++ const int whole_pattern_early_trace_limit = ggml_cuda_moe_whole_pattern_early_trace_limit(); ++ if (node->op == GGML_OP_MUL_MAT_ID && ++ (routed_ffn_poc || whole_pattern_exec || whole_pattern_early_trace_limit > 0)) { ++ const ggml_cuda_moe_whole_pattern pattern = ggml_cuda_moe_whole_pattern_detect_early(cgraph, i); ++ if (pattern.view_pair) { ++ const int trace_idx = moe_whole_pattern_early_trace_count.fetch_add(1, std::memory_order_relaxed); ++ if (trace_idx < whole_pattern_early_trace_limit) { ++ const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr; ++ const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr; ++ fprintf(stderr, ++ "[LLAMA_MOE_WHOLE_PATTERN_EARLY] supported=%d skip_ready=%d gate_up=%s gate=%s up=%s glu=%s down=%s ids=%s type=%s" ++ " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64 ++ " n_embd=%" PRId64 " n_ff=%" PRId64 ++ " ids_match=%d swiglu=%d\n", ++ pattern.supported ? 1 : 0, ++ pattern.supported ? 4 : 0, ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.up), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.glu), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.down), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.ids), ++ down_w != nullptr ? ggml_type_name(down_w->type) : "-", ++ down_x != nullptr ? down_x->ne[2] : 0, ++ pattern.ids != nullptr ? pattern.ids->ne[0] : 0, ++ down_w != nullptr ? down_w->ne[2] : 0, ++ down_w != nullptr ? down_w->ne[1] : 0, ++ down_w != nullptr ? down_w->ne[0] : 0, ++ pattern.ids_match ? 1 : 0, ++ pattern.swiglu ? 1 : 0); ++ } ++ } ++ ++ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; ++ const bool poc_supported = routed_ffn_poc && ggml_cuda_moe_routed_ffn_poc_should_engage( ++ pattern.gate_up, pattern.gate, pattern.up, pattern.glu, pattern.down, pattern.ids, cc); ++ ++ if ((poc_supported || (whole_pattern_exec && pattern.supported))) { ++ const bool ok = poc_supported ? ++ ggml_cuda_moe_routed_ffn_poc( ++ *cuda_ctx, ++ const_cast(pattern.gate_up), ++ const_cast(pattern.gate), ++ const_cast(pattern.up), ++ const_cast(pattern.glu), ++ const_cast(pattern.down)) : ++ ggml_cuda_moe_whole_pattern_exec_proof(cuda_ctx, pattern); ++ GGML_ASSERT(ok); ++ ++ static std::atomic moe_whole_pattern_exec_trace_count{0}; ++ if (ggml_cuda_moe_whole_pattern_exec_trace_take(moe_whole_pattern_exec_trace_count)) { ++ const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr; ++ const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr; ++ fprintf(stderr, ++ "[LLAMA_MOE_WHOLE_PATTERN_EXEC] skip=4 gate_up=%s glu=%s down=%s ids=%s" ++ " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64 "\n", ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.glu), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.down), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.ids), ++ down_x != nullptr ? down_x->ne[2] : 0, ++ pattern.ids != nullptr ? pattern.ids->ne[0] : 0, ++ down_w != nullptr ? down_w->ne[2] : 0); ++ } ++ ++ return 4; ++ } ++ } ++ ++ if (node->op == GGML_OP_GLU && i + 1 < cgraph->n_nodes && cgraph->nodes[i + 1]->op == GGML_OP_MUL_MAT_ID) { ++ static std::atomic moe_whole_pattern_trace_count{0}; ++ const bool whole_trace = ggml_cuda_moe_whole_pattern_trace_take(moe_whole_pattern_trace_count); ++ ++ if (whole_trace) { ++ const ggml_tensor * down = cgraph->nodes[i + 1]; ++ const ggml_cuda_moe_whole_pattern pattern = ggml_cuda_moe_whole_pattern_detect(node, down); ++ ++ const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr; ++ const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr; ++ fprintf(stderr, ++ "[LLAMA_MOE_WHOLE_PATTERN] supported=%d gate_up=%s gate=%s up=%s glu=%s down=%s ids=%s type=%s" ++ " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64 ++ " n_embd=%" PRId64 " n_ff=%" PRId64 ++ " view_pair=%d ids_match=%d swiglu=%d\n", ++ pattern.supported ? 1 : 0, ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.gate), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.up), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.glu), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.down), ++ ggml_cuda_moe_wp_trace_tensor_name(pattern.ids), ++ down_w != nullptr ? ggml_type_name(down_w->type) : "-", ++ down_x != nullptr ? down_x->ne[2] : 0, ++ pattern.ids != nullptr ? pattern.ids->ne[0] : 0, ++ down_w != nullptr ? down_w->ne[2] : 0, ++ down_w != nullptr ? down_w->ne[1] : 0, ++ down_w != nullptr ? down_w->ne[0] : 0, ++ pattern.view_pair ? 1 : 0, ++ pattern.ids_match ? 1 : 0, ++ pattern.swiglu ? 1 : 0); ++ } ++ } ++ + //topk-moe + if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX || + cgraph->nodes[i]->op == GGML_OP_ARGSORT) { +diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu +index dc5c2d198..d8f39d395 100644 +--- a/ggml/src/ggml-cuda/mmq.cu ++++ b/ggml/src/ggml-cuda/mmq.cu +@@ -1,4 +1,7 @@ + #include ++#include ++#include ++#include + #include "common.cuh" + #include "mmq.cuh" + #include "quantize.cuh" +@@ -75,6 +78,151 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con + } + } + ++static inline int ggml_cuda_quant_trace_limit() { ++ static const int value = []() { ++ const char * s = getenv("LLAMA_QUANT_TRACE"); ++ return s ? atoi(s) : 0; ++ }(); ++ ++ return value; ++} ++ ++static inline const char * ggml_cuda_quant_trace_tensor_name(const ggml_tensor * t) { ++ return t != nullptr && t->name[0] != '\0' ? t->name : "-"; ++} ++ ++static inline void ggml_cuda_quant_trace( ++ const char * route, const ggml_tensor * src0, const ggml_tensor * src1, ++ const ggml_tensor * ids, const ggml_tensor * dst, const int native_fp4, ++ const int dedup, const int gathered, const int64_t ne10, const int64_t ne10_padded, ++ const int64_t rows, const int64_t ne12, const int64_t n_expert_used) { ++ const int trace_limit = ggml_cuda_quant_trace_limit(); ++ if (trace_limit <= 0) { ++ return; ++ } ++ ++ static std::atomic trace_count{0}; ++ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed); ++ if (trace_idx >= trace_limit) { ++ return; ++ } ++ ++ fprintf(stderr, ++ "[LLAMA_QUANT_TRACE] route=%s src0=%s src0_type=%s src1=%s src1_type=%s dst=%s dst_type=%s ids=%s " ++ "native_fp4=%d dedup=%d gathered=%d K=%" PRId64 " Kpad=%" PRId64 " rows=%" PRId64 ++ " ne12=%" PRId64 " experts=%" PRId64 "\n", ++ route, ggml_cuda_quant_trace_tensor_name(src0), src0 != nullptr ? ggml_type_name(src0->type) : "-", ++ ggml_cuda_quant_trace_tensor_name(src1), src1 != nullptr ? ggml_type_name(src1->type) : "-", ++ ggml_cuda_quant_trace_tensor_name(dst), dst != nullptr ? ggml_type_name(dst->type) : "-", ++ ggml_cuda_quant_trace_tensor_name(ids), native_fp4, dedup, gathered, ++ ne10, ne10_padded, rows, ne12, n_expert_used); ++} ++ ++ggml_cuda_mmq_ids_meta::ggml_cuda_mmq_ids_meta(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts) ++ : ne_get_rows(ne_get_rows) { ++ alloc(pool, ne_get_rows, n_experts); ++} ++ ++void ggml_cuda_mmq_ids_meta::alloc(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts) { ++ ids_src1.alloc(pool, ne_get_rows); ++ ids_dst.alloc(pool, ne_get_rows); ++ expert_bounds.alloc(pool, n_experts + 1); ++ this->ne_get_rows = ne_get_rows; ++} ++ ++void ggml_cuda_mmq_ids_meta::build( ++ const ggml_tensor * ids, int64_t n_experts, int64_t n_tokens, ++ int64_t n_expert_used, int64_t nchannels_y, int64_t sis1, ++ cudaStream_t stream) { ++ GGML_ASSERT(ids->nb[0] == ggml_element_size(ids)); ++ const int si1 = ids->nb[1] / ggml_element_size(ids); ++ ++ ggml_cuda_launch_mm_ids_helper( ++ (const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(), ++ n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream); ++ CUDA_CHECK(cudaGetLastError()); ++} ++ ++static void ggml_cuda_mul_mat_q_moe_quantized_impl( ++ ggml_backend_cuda_context & ctx, ++ const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst, ++ const int32_t * ids_dst, const int32_t * expert_bounds, ++ int64_t n_tokens, int64_t n_expert_used, int64_t n_experts, ++ int64_t ncols_src1_padded) { ++ GGML_ASSERT( dst->type == GGML_TYPE_F32); ++ GGML_ASSERT( src1_q != nullptr); ++ GGML_ASSERT( ids_dst != nullptr); ++ GGML_ASSERT( expert_bounds != nullptr); ++ GGML_ASSERT( n_tokens > 0); ++ GGML_ASSERT( n_expert_used > 0); ++ GGML_ASSERT( n_experts > 0); ++ GGML_ASSERT( ncols_src1_padded > 0); ++ ++ const int64_t ne00 = src0->ne[0]; ++ const int64_t ne01 = src0->ne[1]; ++ const int64_t ne02 = src0->ne[2]; ++ const int64_t ne03 = src0->ne[3]; ++ const int64_t ne0 = dst->ne[0]; ++ const int64_t ne1 = dst->ne[1]; ++ const int64_t ne2 = dst->ne[2]; ++ const int64_t ne3 = dst->ne[3]; ++ const int64_t nb00 = src0->nb[0]; ++ const int64_t nb01 = src0->nb[1]; ++ const int64_t nb02 = src0->nb[2]; ++ const int64_t nb03 = src0->nb[3]; ++ const int64_t nb0 = dst->nb[0]; ++ const int64_t nb1 = dst->nb[1]; ++ const int64_t nb2 = dst->nb[2]; ++ const int64_t nb3 = dst->nb[3]; ++ ++ const size_t ts_src0 = ggml_type_size(src0->type); ++ const size_t ts_dst = ggml_type_size(dst->type); ++ ++ GGML_ASSERT(nb00 == (int64_t) ts_src0); ++ GGML_ASSERT(nb0 == (int64_t) ts_dst); ++ GGML_ASSERT(ne00 <= ncols_src1_padded); ++ GGML_ASSERT(ne01 == ne0); ++ GGML_ASSERT(ne02 == n_experts); ++ GGML_ASSERT(ne1 == n_expert_used); ++ GGML_ASSERT(ne2 == n_tokens); ++ ++ cudaStream_t stream = ctx.stream(); ++ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; ++ const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ++ || GGML_CUDA_CC_IS_CDNA(cc); ++ ++ const int64_t ne_get_rows = n_tokens * n_expert_used; ++ const int64_t s01 = nb01 / ts_src0; ++ const int64_t s1 = nb1 / ts_dst; ++ const int64_t s02 = nb02 / ts_src0; ++ const int64_t s2 = nb2 / ts_dst; ++ const int64_t s03 = nb03 / ts_src0; ++ const int64_t s3 = nb3 / ts_dst; ++ const int64_t s12 = ne_get_rows * ncols_src1_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int)); ++ const int64_t s13 = n_tokens * s12; ++ ++ const mmq_args args = { ++ (const char *) src0->data, src0->type, (const int *) src1_q, ids_dst, expert_bounds, (float *) dst->data, ++ ne00, ne01, ne_get_rows, s01, ne_get_rows, s1, ++ n_experts, n_experts, s02, s12, s2, ++ ne03, ne3, s03, s13, s3, ++ use_stream_k, n_tokens}; ++ ++ ggml_cuda_quant_trace("mmq_moe_quantized_raw", src0, nullptr, nullptr, dst, 1, ++ 0, 0, ne00, ncols_src1_padded, ne_get_rows, n_tokens, n_expert_used); ++ ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); ++} ++ ++void ggml_cuda_mul_mat_q_moe_quantized( ++ ggml_backend_cuda_context & ctx, ++ const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst, ++ const int32_t * ids_dst, const int32_t * expert_bounds, ++ int64_t n_tokens, int64_t n_expert_used, int64_t n_experts, ++ int64_t ncols_src1_padded) { ++ ggml_cuda_mul_mat_q_moe_quantized_impl(ctx, src0, src1_q, dst, ids_dst, expert_bounds, ++ n_tokens, n_expert_used, n_experts, ncols_src1_padded); ++} ++ + void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); +diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh +index b53e38a8b..690dc694a 100644 +--- a/ggml/src/ggml-cuda/mmq.cuh ++++ b/ggml/src/ggml-cuda/mmq.cuh +@@ -4334,6 +4334,31 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS); + void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + ++struct ggml_cuda_mmq_ids_meta { ++ ggml_cuda_pool_alloc ids_src1; ++ ggml_cuda_pool_alloc ids_dst; ++ ggml_cuda_pool_alloc expert_bounds; ++ ++ int64_t ne_get_rows = 0; ++ ++ ggml_cuda_mmq_ids_meta() = default; ++ ggml_cuda_mmq_ids_meta(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts); ++ ++ void alloc(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts); ++ ++ void build( ++ const ggml_tensor * ids, int64_t n_experts, int64_t n_tokens, ++ int64_t n_expert_used, int64_t nchannels_y, int64_t sis1, ++ cudaStream_t stream); ++}; ++ ++void ggml_cuda_mul_mat_q_moe_quantized( ++ ggml_backend_cuda_context & ctx, ++ const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst, ++ const int32_t * ids_dst, const int32_t * expert_bounds, ++ int64_t n_tokens, int64_t n_expert_used, int64_t n_experts, ++ int64_t ncols_src1_padded); ++ + void ggml_cuda_op_mul_mat_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, +diff --git a/ggml/src/ggml-cuda/moe-ffn.cu b/ggml/src/ggml-cuda/moe-ffn.cu +new file mode 100644 +index 000000000..507390d65 +--- /dev/null ++++ b/ggml/src/ggml-cuda/moe-ffn.cu +@@ -0,0 +1,296 @@ ++#include "moe-ffn.cuh" ++#include "getrows.cuh" ++#include "mmq.cuh" ++#include "unary.cuh" ++ ++#include ++ ++bool ggml_cuda_moe_routed_ffn_poc_enabled() { ++ static const bool enabled = [] { ++ const char * value = getenv("LLAMA_MOE_ROUTED_FFN_POC"); ++ return value != nullptr && atoi(value) != 0; ++ }(); ++ return enabled; ++} ++ ++bool ggml_cuda_moe_routed_ffn_poc_should_engage( ++ const ggml_tensor * gate_up, ++ const ggml_tensor * gate, ++ const ggml_tensor * up, ++ const ggml_tensor * glu, ++ const ggml_tensor * down, ++ const ggml_tensor * ids, ++ int cc) { ++ if (!blackwell_mma_available(cc)) { ++ return false; ++ } ++ if (gate_up == nullptr || gate == nullptr || up == nullptr || glu == nullptr || down == nullptr || ids == nullptr) { ++ return false; ++ } ++ if (gate_up->op != GGML_OP_MUL_MAT_ID || down->op != GGML_OP_MUL_MAT_ID) { ++ return false; ++ } ++ if (gate->op != GGML_OP_VIEW || up->op != GGML_OP_VIEW || gate->view_src != gate_up || up->view_src != gate_up) { ++ return false; ++ } ++ if (glu->op != GGML_OP_GLU || ggml_get_glu_op(glu) != GGML_GLU_OP_SWIGLU || down->src[1] != glu) { ++ return false; ++ } ++ if (gate_up->src[2] != ids || down->src[2] != ids) { ++ return false; ++ } ++ ++ const ggml_tensor * down_w = down->src[0]; ++ if (down_w == nullptr || (down_w->type != GGML_TYPE_NVFP4 && down_w->type != GGML_TYPE_MXFP4)) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ggml_cuda_moe_routed_ffn_fused_quant_enabled() { ++ static const bool enabled = [] { ++ const char * value = getenv("LLAMA_MOE_ROUTED_FFN_FUSED_QUANT"); ++ return value != nullptr && atoi(value) != 0; ++ }(); ++ return enabled; ++} ++ ++static bool ggml_cuda_moe_routed_ffn_down_supported(const ggml_tensor * glu, const ggml_tensor * down) { ++ const ggml_tensor * down_w = down != nullptr ? down->src[0] : nullptr; ++ const ggml_tensor * ids = down != nullptr ? down->src[2] : nullptr; ++ if (glu == nullptr || down == nullptr || down_w == nullptr || ids == nullptr) { ++ return false; ++ } ++ if (down_w->type != GGML_TYPE_NVFP4 && down_w->type != GGML_TYPE_MXFP4) { ++ return false; ++ } ++ if (glu->type != GGML_TYPE_F32 || down->type != GGML_TYPE_F32 || ids->type != GGML_TYPE_I32) { ++ return false; ++ } ++ if (glu->ne[3] != 1 || down->ne[3] != 1 || ids->ne[2] != 1 || ids->ne[3] != 1) { ++ return false; ++ } ++ if (down_w->ne[0] != glu->ne[0] || down_w->ne[1] != down->ne[0] || down_w->ne[2] <= 0) { ++ return false; ++ } ++ if (ids->ne[0] != glu->ne[1] || ids->ne[1] != glu->ne[2]) { ++ return false; ++ } ++ if (down->ne[1] != glu->ne[1] || down->ne[2] != glu->ne[2]) { ++ return false; ++ } ++ if (ids->nb[0] != ggml_element_size(ids)) { ++ return false; ++ } ++ if (glu->nb[0] != sizeof(float) || down->nb[0] != sizeof(float)) { ++ return false; ++ } ++ if (glu->nb[1] != (size_t) (glu->ne[0] * (int64_t) sizeof(float)) || ++ glu->nb[2] != (size_t) (glu->ne[1] * (int64_t) glu->nb[1])) { ++ return false; ++ } ++ if (down->nb[1] != (size_t) (down->ne[0] * (int64_t) sizeof(float)) || ++ down->nb[2] != (size_t) (down->ne[1] * (int64_t) down->nb[1])) { ++ return false; ++ } ++ ++ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; ++ return ggml_cuda_should_use_mmq(down_w->type, cc, glu->ne[2], down_w->ne[2]); ++} ++ ++static __global__ void moe_swiglu_nvfp4_quant_kernel( ++ const float * __restrict__ gate, ++ const float * __restrict__ up, ++ const int32_t * __restrict__ ids_src1, ++ void * __restrict__ vy, ++ int64_t n_ff, ++ int64_t n_ff_padded, ++ int64_t n_rows, ++ int64_t n_used, ++ int64_t gate_s1, ++ int64_t gate_s2, ++ int64_t up_s1, ++ int64_t up_s2) { ++#if defined(BLACKWELL_MMA_AVAILABLE) ++ const int64_t i0_base = ((int64_t) blockDim.x * blockIdx.y + threadIdx.x) * QK_NVFP4_SUB; ++ if (i0_base >= n_ff_padded) { ++ return; ++ } ++ ++ const int64_t row = blockIdx.x; ++ const int64_t src_row = ids_src1[row]; ++ const int64_t token = src_row / n_used; ++ const int64_t used = src_row - token * n_used; ++ const int64_t k_block = i0_base / QK_K; ++ const int64_t blocks_per_col = (n_ff_padded + QK_K - 1) / QK_K; ++ if (k_block >= blocks_per_col) { ++ return; ++ } ++ ++ block_fp4_mmq * y = (block_fp4_mmq *) vy; ++ block_fp4_mmq * yb = y + k_block * n_rows + row; ++ const int sub = (i0_base % QK_K) / QK_NVFP4_SUB; ++ ++ float vals_raw[QK_NVFP4_SUB]; ++ float amax_raw = 0.0f; ++#pragma unroll ++ for (int k = 0; k < QK_NVFP4_SUB; k++) { ++ const int64_t i0 = i0_base + k; ++ if (i0 < n_ff) { ++ const float g = gate[token * gate_s2 + used * gate_s1 + i0]; ++ const float u = up[token * up_s2 + used * up_s1 + i0]; ++ const float v = ggml_cuda_op_silu_single(g) * u; ++ vals_raw[k] = v; ++ amax_raw = fmaxf(amax_raw, fabsf(v)); ++ } else { ++ vals_raw[k] = 0.0f; ++ } ++ } ++ ++ static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2 }; ++ const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f); ++ ++ float best_err = FLT_MAX; ++ uint8_t fp8_code = 0; ++ float subblock_scale = 0.0f; ++ ++#pragma unroll ++ for (int i = 0; i < 5; i++) { ++ const int test_code = first_fp8_code + test_offsets[i]; ++ if (test_code < 0 || test_code > 0x7e) { ++ continue; ++ } ++ const uint8_t code = (uint8_t) test_code; ++ const float test_scale = ggml_cuda_ue4m3_to_fp32(code); ++ const float test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f; ++ float cur_err = 0.0f; ++#pragma unroll ++ for (int k = 0; k < QK_NVFP4_SUB; ++k) { ++ const float v = vals_raw[k]; ++ const uint8_t q = ggml_cuda_float_to_fp4_e2m1(v, test_inv_scale); ++ const float err_diff = fabsf(v) - fabsf(kvalues_mxfp4[q & 0x7]) * test_scale; ++ cur_err = fmaf(err_diff, err_diff, cur_err); ++ } ++ ++ if (cur_err < best_err) { ++ best_err = cur_err; ++ fp8_code = test_code; ++ subblock_scale = test_scale; ++ } ++ } ++ ++ const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f; ++ uint32_t q0 = 0; ++ uint32_t q1 = 0; ++#pragma unroll ++ for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) { ++ q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 0], inv_scale) << (8 * k); ++ q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 8], inv_scale) << (8 * k + 4); ++ q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 4], inv_scale) << (8 * k); ++ q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 12], inv_scale) << (8 * k + 4); ++ } ++ ++ uint32_t * yqs = reinterpret_cast(yb->qs); ++ yqs[2 * sub + 0] = q0; ++ yqs[2 * sub + 1] = q1; ++ reinterpret_cast(yb->d4)[sub] = fp8_code; ++#else ++ NO_DEVICE_CODE; ++#endif ++} ++ ++static bool ggml_cuda_moe_routed_ffn_fused_quant( ++ ggml_backend_cuda_context & ctx, ++ ggml_tensor * gate, ++ ggml_tensor * up, ++ ggml_tensor * glu, ++ ggml_tensor * down) { ++ if (!ggml_cuda_moe_routed_ffn_fused_quant_enabled()) { ++ return false; ++ } ++ if (!ggml_cuda_moe_routed_ffn_down_supported(glu, down)) { ++ return false; ++ } ++ const ggml_tensor * down_w = down->src[0]; ++ const ggml_tensor * ids = down->src[2]; ++ if (down_w->type != GGML_TYPE_NVFP4) { ++ return false; ++ } ++ if (gate == nullptr || up == nullptr || gate->type != GGML_TYPE_F32 || up->type != GGML_TYPE_F32) { ++ return false; ++ } ++ if (gate->ne[0] != glu->ne[0] || gate->ne[1] != glu->ne[1] || gate->ne[2] != glu->ne[2] || gate->ne[3] != glu->ne[3]) { ++ return false; ++ } ++ if (up->ne[0] != glu->ne[0] || up->ne[1] != glu->ne[1] || up->ne[2] != glu->ne[2] || up->ne[3] != glu->ne[3]) { ++ return false; ++ } ++ if (gate->nb[0] != sizeof(float) || up->nb[0] != sizeof(float)) { ++ return false; ++ } ++ ++ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; ++ if (!blackwell_mma_available(cc)) { ++ return false; ++ } ++ ++ const int64_t n_ff = glu->ne[0]; ++ const int64_t n_ff_padded = GGML_PAD(n_ff, MATRIX_ROW_PADDING); ++ if (n_ff % QK_NVFP4 != 0) { ++ return false; ++ } ++ ++ const int64_t n_expert_used = ids->ne[0]; ++ const int64_t n_tokens = glu->ne[2]; ++ const int64_t n_experts = down_w->ne[2]; ++ const int64_t ne_get_rows = n_tokens * n_expert_used; ++ ++ ggml_cuda_mmq_ids_meta ids_meta(ctx.pool(), ne_get_rows, n_experts); ++ const int64_t sis1 = glu->nb[2] / glu->nb[1]; ++ ids_meta.build(ids, n_experts, n_tokens, n_expert_used, glu->ne[1], sis1, ctx.stream()); ++ ++ const size_t nbytes_src1_q = ne_get_rows * n_ff_padded * sizeof(block_fp4_mmq) / QK_K + ++ get_mmq_x_max_host(cc) * sizeof(block_q8_1_mmq); ++ ggml_cuda_pool_alloc src1_q(ctx.pool(), nbytes_src1_q); ++ ++ constexpr int nvfp4_block_size = 128; ++ const int64_t block_num_y = (n_ff_padded + QK_NVFP4_SUB * nvfp4_block_size - 1) / (QK_NVFP4_SUB * nvfp4_block_size); ++ const dim3 block_size(nvfp4_block_size, 1, 1); ++ const dim3 num_blocks(ne_get_rows, block_num_y, 1); ++ moe_swiglu_nvfp4_quant_kernel<<>>( ++ (const float *) gate->data, (const float *) up->data, ids_meta.ids_src1.get(), src1_q.get(), ++ n_ff, n_ff_padded, ne_get_rows, n_expert_used, ++ gate->nb[1] / sizeof(float), gate->nb[2] / sizeof(float), ++ up->nb[1] / sizeof(float), up->nb[2] / sizeof(float)); ++ CUDA_CHECK(cudaGetLastError()); ++ ++ ggml_cuda_mul_mat_q_moe_quantized( ++ ctx, down_w, src1_q.get(), down, ++ ids_meta.ids_dst.get(), ids_meta.expert_bounds.get(), ++ n_tokens, n_expert_used, n_experts, n_ff_padded); ++ return true; ++} ++ ++bool ggml_cuda_moe_routed_ffn_poc( ++ ggml_backend_cuda_context & ctx, ++ ggml_tensor * gate_up, ++ ggml_tensor * gate, ++ ggml_tensor * up, ++ ggml_tensor * glu, ++ ggml_tensor * down) { ++ if (!ggml_cuda_compute_forward(ctx, gate_up)) { ++ return false; ++ } ++ if (ggml_cuda_moe_routed_ffn_fused_quant(ctx, gate, up, glu, down)) { ++ return true; ++ } ++ if (!ggml_cuda_compute_forward(ctx, glu)) { ++ return false; ++ } ++ if (!ggml_cuda_compute_forward(ctx, down)) { ++ return false; ++ } ++ ++ return true; ++} +diff --git a/ggml/src/ggml-cuda/moe-ffn.cuh b/ggml/src/ggml-cuda/moe-ffn.cuh +new file mode 100644 +index 000000000..ce385d31a +--- /dev/null ++++ b/ggml/src/ggml-cuda/moe-ffn.cuh +@@ -0,0 +1,24 @@ ++#pragma once ++ ++#include "common.cuh" ++ ++bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, ggml_tensor * dst); ++ ++bool ggml_cuda_moe_routed_ffn_poc_enabled(); ++ ++bool ggml_cuda_moe_routed_ffn_poc_should_engage( ++ const ggml_tensor * gate_up, ++ const ggml_tensor * gate, ++ const ggml_tensor * up, ++ const ggml_tensor * glu, ++ const ggml_tensor * down, ++ const ggml_tensor * ids, ++ int cc); ++ ++bool ggml_cuda_moe_routed_ffn_poc( ++ ggml_backend_cuda_context & ctx, ++ ggml_tensor * gate_up, ++ ggml_tensor * gate, ++ ggml_tensor * up, ++ ggml_tensor * glu, ++ ggml_tensor * down); +-- +2.43.0 + diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-feat-server-trace-speculative-batch-shapes.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-feat-server-trace-speculative-batch-shapes.patch deleted file mode 100644 index 3d2081631..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-feat-server-trace-speculative-batch-shapes.patch +++ /dev/null @@ -1,57 +0,0 @@ -From fb9402661291e0488a3e2bf2f3948ebcd18e18c9 Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Wed, 1 Jul 2026 02:41:22 +0000 -Subject: [PATCH] feat(server): trace speculative batch shapes - -Add an env-gated LLAMA_SPEC_SHAPE_TRACE log around the server batch rows emitted by normal decode and speculative verification slots. This keeps the instrumentation default-off while exposing the row/output shape entropy that prevents CUDA graph reuse under MTP serving. - -Assisted-by: Codex:gpt-5 ---- - tools/server/server-context.cpp | 20 ++++++++++++++++++-- - 1 file changed, 18 insertions(+), 2 deletions(-) - -diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp -index a77e2676d..fd8348af6 100644 ---- a/tools/server/server-context.cpp -+++ b/tools/server/server-context.cpp -@@ -457,12 +457,22 @@ struct server_slot { - - // add sampled token of this slot to the batch, optionally add the speculative draft tokens if any - void handle_last_sampled_token(server_batch & batch) { -+ static const bool spec_shape_trace = getenv("LLAMA_SPEC_SHAPE_TRACE") != nullptr; -+ const int32_t batch_before = batch.size(); -+ - bool add_ok = true; - if (spec_draft.empty()) { - // no speculative decoding -- i_batch = batch.size(); -+ i_batch = batch_before; -+ -+ const int32_t pos0 = prompt.tokens.pos_next(); -+ -+ add_ok &= batch.add(id, sampled, pos0, true); - -- add_ok &= batch.add(id, sampled, prompt.tokens.pos_next(), true); -+ if (spec_shape_trace) { -+ SLT_INF(*this, "spec shape: kind=decode batch_before=%d rows=1 outputs=1 draft=0 pos0=%d slot_tokens=%zu\n", -+ batch_before, pos0, prompt.tokens.size()); -+ } - - SLT_DBG(*this, "slot decode token, id=%d, n_ctx = %d, n_tokens = %d, truncated = %d\n", - sampled, n_ctx, prompt.n_tokens(), truncated); -@@ -479,6 +489,12 @@ struct server_slot { - - auto pos0 = prompt.tokens.pos_next(); - -+ if (spec_shape_trace) { -+ SLT_INF(*this, "spec shape: kind=verify batch_before=%d rows=%zu outputs=%zu draft=%zu spec_i_first=%d spec_i_last=%d pos0=%d slot_tokens=%zu\n", -+ batch_before, spec_draft.size() + 1, spec_draft.size() + 1, spec_draft.size(), -+ spec_i_batch.front(), spec_i_batch.back(), pos0, prompt.tokens.size()); -+ } -+ - add_ok &= batch.add(id, sampled, pos0++, true); - for (auto token : spec_draft) { - add_ok &= batch.add(this->id, token, pos0++, true); --- -2.43.0 - diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0056-feat-cuda-trace-moe-mmq-batch-shapes.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0056-feat-cuda-trace-moe-mmq-batch-shapes.patch deleted file mode 100644 index 53e797224..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0056-feat-cuda-trace-moe-mmq-batch-shapes.patch +++ /dev/null @@ -1,212 +0,0 @@ -From 20a99518a39acbb4474fa9c97121fc7b9f07c1ef Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Wed, 1 Jul 2026 04:27:19 +0000 -Subject: [PATCH] feat(cuda): trace moe mmq batch shapes - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/mmq-shape-trace.h | 66 ++++++++++++++++++++++++++++ - ggml/src/ggml-cuda/mmq.cuh | 31 ++++++++++++- - tests/CMakeLists.txt | 2 + - tests/test-cuda-mmq-shape-trace.cpp | 42 ++++++++++++++++++ - 4 files changed, 140 insertions(+), 1 deletion(-) - create mode 100644 ggml/src/ggml-cuda/mmq-shape-trace.h - create mode 100644 tests/test-cuda-mmq-shape-trace.cpp - -diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h -new file mode 100644 -index 000000000..9d41b7c80 ---- /dev/null -+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h -@@ -0,0 +1,66 @@ -+#pragma once -+ -+#include -+#include -+#include -+ -+struct ggml_cuda_mmq_shape { -+ int type; -+ bool is_moe; -+ int64_t ncols_dst; -+ int64_t nchannels_x; -+ int64_t ncols_max; -+ int64_t n_active_est; -+ int64_t density; -+ int mmq_x_max; -+ int mmq_x_lim; -+ int mmq_x_best; -+ int mmq_y; -+ bool use_stream_k; -+}; -+ -+static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( -+ const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x, -+ const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best, -+ const int mmq_y, const bool use_stream_k) { -+ int64_t n_active_est = 0; -+ int64_t density = 0; -+ if (is_moe && ncols_dst > 0 && nchannels_x > 0) { -+ n_active_est = ncols_dst < nchannels_x ? ncols_dst : nchannels_x; -+ density = (ncols_dst + n_active_est - 1) / n_active_est; -+ } -+ -+ return { -+ type, -+ is_moe, -+ ncols_dst, -+ nchannels_x, -+ ncols_max, -+ n_active_est, -+ density, -+ mmq_x_max, -+ mmq_x_lim, -+ mmq_x_best, -+ mmq_y, -+ use_stream_k, -+ }; -+} -+ -+static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, const ggml_cuda_mmq_shape & shape) { -+ return std::snprintf(buf, size, -+ "type=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld " -+ "n_active_est=%lld density=%lld mmq_x_max=%d mmq_x_lim=%d " -+ "mmq_x_best=%d mmq_y=%d stream_k=%d", -+ shape.type, -+ shape.is_moe ? 1 : 0, -+ (long long) shape.ncols_dst, -+ (long long) shape.nchannels_x, -+ (long long) shape.ncols_max, -+ (long long) shape.n_active_est, -+ (long long) shape.density, -+ shape.mmq_x_max, -+ shape.mmq_x_lim, -+ shape.mmq_x_best, -+ shape.mmq_y, -+ shape.use_stream_k ? 1 : 0); -+} -diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh -index b53e38a8b..6bc943738 100644 ---- a/ggml/src/ggml-cuda/mmq.cuh -+++ b/ggml/src/ggml-cuda/mmq.cuh -@@ -3,10 +3,14 @@ - #include "common.cuh" - #include "vecdotq.cuh" - #include "mma.cuh" -+#include "mmq-shape-trace.h" - -+#include - #include - #include -+#include - #include -+#include - - using namespace ggml_cuda_mma; - -@@ -4163,6 +4167,18 @@ static inline int ggml_cuda_fp4_dense_mmq_x_cap() { - return c; - } - -+static inline int ggml_cuda_moe_mmq_shape_trace_limit() { -+ static const int limit = []() -> int { -+ const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE"); -+ if (s == nullptr || strcmp(s, "0") == 0) { -+ return 0; -+ } -+ const int parsed = atoi(s); -+ return parsed > 0 ? parsed : 256; -+ }(); -+ return limit; -+} -+ - template - void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { - const int id = ggml_cuda_get_device(); -@@ -4249,6 +4265,20 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda - } - } - -+ if (args.expert_bounds != nullptr) { -+ static std::atomic trace_count{0}; -+ const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit(); -+ const int trace_index = trace_limit > 0 ? trace_count.fetch_add(1, std::memory_order_relaxed) : trace_limit; -+ if (trace_index >= 0 && trace_index < trace_limit) { -+ char buf[256]; -+ const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make( -+ (int) type, true, args.ncols_dst, args.nchannels_x, args.ncols_max, -+ mmq_x_max, mmq_x_lim, mmq_x_best, mmq_y, args.use_stream_k); -+ ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape); -+ fprintf(stderr, "[LLAMA_MOE_MMQ_SHAPE] %s\n", buf); -+ } -+ } -+ - switch (mmq_x_best) { - case 8: - launch_mul_mat_q(ctx, args, stream); -@@ -4341,4 +4371,3 @@ void ggml_cuda_op_mul_mat_q( - const int64_t src1_padded_row_size, cudaStream_t stream); - - bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts); -- -diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt -index 24592a279..0a5194c87 100644 ---- a/tests/CMakeLists.txt -+++ b/tests/CMakeLists.txt -@@ -234,6 +234,8 @@ llama_build_and_test(test-thread-safety.cpp ARGS -m "${MODEL_DEST}" -ngl 99 -p " - set_tests_properties(test-thread-safety PROPERTIES FIXTURES_REQUIRED test-download-model) - - llama_build_and_test(test-arg-parser.cpp) -+llama_build_and_test(test-cuda-mmq-shape-trace.cpp) -+target_include_directories(test-cuda-mmq-shape-trace PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) - - if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC) - # TODO: repair known memory leaks -diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp -new file mode 100644 -index 000000000..8620169c0 ---- /dev/null -+++ b/tests/test-cuda-mmq-shape-trace.cpp -@@ -0,0 +1,42 @@ -+#include "ggml-cuda/mmq-shape-trace.h" -+ -+#include -+#include -+#include -+ -+static void require(bool ok, const char * what) { -+ if (!ok) { -+ std::fprintf(stderr, "require failed: %s\n", what); -+ std::exit(1); -+ } -+} -+ -+int main() { -+ const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make( -+ /* type */ 39, -+ /* is_moe */ true, -+ /* ncols_dst */ 1024, -+ /* nchannels_x */ 256, -+ /* ncols_max */ 128, -+ /* mmq_x_max */ 128, -+ /* mmq_x_lim */ 64, -+ /* mmq_x_best */ 64, -+ /* mmq_y */ 128, -+ /* use_stream_k */ true); -+ -+ require(shape.n_active_est == 256, "active expert estimate is capped by expert count"); -+ require(shape.density == 4, "density is ceil(assignments / active experts)"); -+ -+ char buf[256]; -+ const int n = ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape); -+ -+ require(n > 0, "format returns byte count"); -+ require(std::strstr(buf, "moe=1") != nullptr, "trace includes moe flag"); -+ require(std::strstr(buf, "ncols_dst=1024") != nullptr, "trace includes routed assignment count"); -+ require(std::strstr(buf, "n_active_est=256") != nullptr, "trace includes active estimate"); -+ require(std::strstr(buf, "density=4") != nullptr, "trace includes density"); -+ require(std::strstr(buf, "mmq_x_best=64") != nullptr, "trace includes selected tile"); -+ require(std::strstr(buf, "stream_k=1") != nullptr, "trace includes stream-k flag"); -+ -+ return 0; -+} diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0057-feat-cuda-trace-moe-mmq-launch-shapes.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0057-feat-cuda-trace-moe-mmq-launch-shapes.patch deleted file mode 100644 index a44ba7be6..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0057-feat-cuda-trace-moe-mmq-launch-shapes.patch +++ /dev/null @@ -1,223 +0,0 @@ -From c78e537b56e3446f8aa645c6700aacf263639bd8 Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Wed, 1 Jul 2026 04:49:55 +0000 -Subject: [PATCH] feat(cuda): trace moe mmq launch shapes - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/mmq-shape-trace.h | 65 ++++++++++++++++++++++++++++ - ggml/src/ggml-cuda/mmq.cuh | 47 +++++++++++++------- - tests/test-cuda-mmq-shape-trace.cpp | 35 +++++++++++++++ - 3 files changed, 132 insertions(+), 15 deletions(-) - -diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h -index 9d41b7c80..98bc21f7f 100644 ---- a/ggml/src/ggml-cuda/mmq-shape-trace.h -+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h -@@ -19,6 +19,24 @@ struct ggml_cuda_mmq_shape { - bool use_stream_k; - }; - -+struct ggml_cuda_mmq_launch_shape { -+ int type; -+ bool is_moe; -+ int64_t ncols_dst; -+ int64_t ncols_max; -+ int mmq_x; -+ int mmq_y; -+ int ntx; -+ int nty; -+ int ntzw; -+ int ntiles_dst; -+ int nsm; -+ int tiles_nwaves; -+ int tiles_efficiency_percent; -+ int stream_k_blocks; -+ bool fixup_needed; -+}; -+ - static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( - const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x, - const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best, -@@ -46,6 +64,30 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( - }; - } - -+static inline ggml_cuda_mmq_launch_shape ggml_cuda_mmq_launch_shape_make( -+ const int type, const bool is_moe, const int64_t ncols_dst, const int64_t ncols_max, -+ const int mmq_x, const int mmq_y, const int ntx, const int nty, const int ntzw, -+ const int ntiles_dst, const int nsm, const int tiles_nwaves, const int tiles_efficiency_percent, -+ const int stream_k_blocks, const bool fixup_needed) { -+ return { -+ type, -+ is_moe, -+ ncols_dst, -+ ncols_max, -+ mmq_x, -+ mmq_y, -+ ntx, -+ nty, -+ ntzw, -+ ntiles_dst, -+ nsm, -+ tiles_nwaves, -+ tiles_efficiency_percent, -+ stream_k_blocks, -+ fixup_needed, -+ }; -+} -+ - static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, const ggml_cuda_mmq_shape & shape) { - return std::snprintf(buf, size, - "type=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld " -@@ -64,3 +106,26 @@ static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, cons - shape.mmq_y, - shape.use_stream_k ? 1 : 0); - } -+ -+static inline int ggml_cuda_mmq_launch_shape_format( -+ char * buf, const size_t size, const ggml_cuda_mmq_launch_shape & shape) { -+ return std::snprintf(buf, size, -+ "type=%d moe=%d ncols_dst=%lld ncols_max=%lld mmq_x=%d mmq_y=%d " -+ "ntx=%d nty=%d ntzw=%d ntiles_dst=%d nsm=%d tiles_nwaves=%d " -+ "tiles_efficiency=%d stream_k_blocks=%d fixup=%d", -+ shape.type, -+ shape.is_moe ? 1 : 0, -+ (long long) shape.ncols_dst, -+ (long long) shape.ncols_max, -+ shape.mmq_x, -+ shape.mmq_y, -+ shape.ntx, -+ shape.nty, -+ shape.ntzw, -+ shape.ntiles_dst, -+ shape.nsm, -+ shape.tiles_nwaves, -+ shape.tiles_efficiency_percent, -+ shape.stream_k_blocks, -+ shape.fixup_needed ? 1 : 0); -+} -diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh -index 6bc943738..34002edf7 100644 ---- a/ggml/src/ggml-cuda/mmq.cuh -+++ b/ggml/src/ggml-cuda/mmq.cuh -@@ -3989,6 +3989,24 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int - return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int)); - } - -+static inline int ggml_cuda_moe_mmq_shape_trace_limit() { -+ static const int limit = []() -> int { -+ const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE"); -+ if (s == nullptr || strcmp(s, "0") == 0) { -+ return 0; -+ } -+ const int parsed = atoi(s); -+ return parsed > 0 ? parsed : 256; -+ }(); -+ return limit; -+} -+ -+static inline bool ggml_cuda_moe_mmq_trace_take(std::atomic & counter) { -+ const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit(); -+ const int trace_index = trace_limit > 0 ? counter.fetch_add(1, std::memory_order_relaxed) : trace_limit; -+ return trace_index >= 0 && trace_index < trace_limit; -+} -+ - template - static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { - const int id = ggml_cuda_get_device(); -@@ -4054,6 +4072,19 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a - - const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0; - -+ if (args.expert_bounds != nullptr) { -+ static std::atomic trace_count{0}; -+ if (ggml_cuda_moe_mmq_trace_take(trace_count)) { -+ char buf[256]; -+ const ggml_cuda_mmq_launch_shape shape = ggml_cuda_mmq_launch_shape_make( -+ (int) type, true, args.ncols_dst, args.ncols_max, mmq_x, mmq_y, -+ ntx, nty, ntzw, ntiles_dst, nsm, tiles_nwaves, tiles_efficiency_percent, -+ block_nums_stream_k.x, fixup_needed); -+ ggml_cuda_mmq_launch_shape_format(buf, sizeof(buf), shape); -+ fprintf(stderr, "[LLAMA_MOE_MMQ_LAUNCH] %s\n", buf); -+ } -+ } -+ - ggml_cuda_pool & pool = ctx.pool(id); - ggml_cuda_pool_alloc tmp_fixup(pool); - if (fixup_needed) { -@@ -4167,18 +4198,6 @@ static inline int ggml_cuda_fp4_dense_mmq_x_cap() { - return c; - } - --static inline int ggml_cuda_moe_mmq_shape_trace_limit() { -- static const int limit = []() -> int { -- const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE"); -- if (s == nullptr || strcmp(s, "0") == 0) { -- return 0; -- } -- const int parsed = atoi(s); -- return parsed > 0 ? parsed : 256; -- }(); -- return limit; --} -- - template - void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { - const int id = ggml_cuda_get_device(); -@@ -4267,9 +4286,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda - - if (args.expert_bounds != nullptr) { - static std::atomic trace_count{0}; -- const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit(); -- const int trace_index = trace_limit > 0 ? trace_count.fetch_add(1, std::memory_order_relaxed) : trace_limit; -- if (trace_index >= 0 && trace_index < trace_limit) { -+ if (ggml_cuda_moe_mmq_trace_take(trace_count)) { - char buf[256]; - const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make( - (int) type, true, args.ncols_dst, args.nchannels_x, args.ncols_max, -diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp -index 8620169c0..86ee15e02 100644 ---- a/tests/test-cuda-mmq-shape-trace.cpp -+++ b/tests/test-cuda-mmq-shape-trace.cpp -@@ -38,5 +38,40 @@ int main() { - require(std::strstr(buf, "mmq_x_best=64") != nullptr, "trace includes selected tile"); - require(std::strstr(buf, "stream_k=1") != nullptr, "trace includes stream-k flag"); - -+ const ggml_cuda_mmq_launch_shape launch = ggml_cuda_mmq_launch_shape_make( -+ /* type */ 39, -+ /* is_moe */ true, -+ /* ncols_dst */ 1024, -+ /* ncols_max */ 128, -+ /* mmq_x */ 64, -+ /* mmq_y */ 128, -+ /* ntx */ 2, -+ /* nty */ 4, -+ /* ntzw */ 3, -+ /* ntiles_dst */ 24, -+ /* nsm */ 16, -+ /* tiles_nwaves */ 2, -+ /* tiles_efficiency_percent */ 75, -+ /* stream_k_blocks */ 16, -+ /* fixup_needed */ true); -+ -+ const int launch_n = ggml_cuda_mmq_launch_shape_format(buf, sizeof(buf), launch); -+ -+ require(launch_n > 0, "launch format returns byte count"); -+ require(std::strstr(buf, "moe=1") != nullptr, "launch trace includes moe flag"); -+ require(std::strstr(buf, "ncols_dst=1024") != nullptr, "launch trace includes routed assignment count"); -+ require(std::strstr(buf, "ncols_max=128") != nullptr, "launch trace includes max column count"); -+ require(std::strstr(buf, "mmq_x=64") != nullptr, "launch trace includes compiled x tile"); -+ require(std::strstr(buf, "mmq_y=128") != nullptr, "launch trace includes compiled y tile"); -+ require(std::strstr(buf, "ntx=2") != nullptr, "launch trace includes x tile count"); -+ require(std::strstr(buf, "nty=4") != nullptr, "launch trace includes y tile count"); -+ require(std::strstr(buf, "ntzw=3") != nullptr, "launch trace includes batch tile count"); -+ require(std::strstr(buf, "ntiles_dst=24") != nullptr, "launch trace includes total tile count"); -+ require(std::strstr(buf, "nsm=16") != nullptr, "launch trace includes SM count"); -+ require(std::strstr(buf, "tiles_nwaves=2") != nullptr, "launch trace includes wave count"); -+ require(std::strstr(buf, "tiles_efficiency=75") != nullptr, "launch trace includes stream-k efficiency"); -+ require(std::strstr(buf, "stream_k_blocks=16") != nullptr, "launch trace includes actual stream-k block count"); -+ require(std::strstr(buf, "fixup=1") != nullptr, "launch trace includes fixup flag"); -+ - return 0; - } diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0058-feat-cuda-trace-moe-small-m-mmq-candidates.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0058-feat-cuda-trace-moe-small-m-mmq-candidates.patch deleted file mode 100644 index c0b56b503..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0058-feat-cuda-trace-moe-small-m-mmq-candidates.patch +++ /dev/null @@ -1,182 +0,0 @@ -From 2a9964d290a543d14db972d8d2927ee9d2974f7e Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Wed, 1 Jul 2026 05:05:17 +0000 -Subject: [PATCH] feat(cuda): trace moe small-m mmq candidates - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/mmq-shape-trace.h | 58 ++++++++++++++++++++++++++++ - ggml/src/ggml-cuda/mmq.cuh | 27 +++++++++++++ - tests/test-cuda-mmq-shape-trace.cpp | 35 +++++++++++++++++ - 3 files changed, 120 insertions(+) - -diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h -index 98bc21f7f..47453d91f 100644 ---- a/ggml/src/ggml-cuda/mmq-shape-trace.h -+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h -@@ -37,6 +37,18 @@ struct ggml_cuda_mmq_launch_shape { - bool fixup_needed; - }; - -+struct ggml_cuda_mmq_small_m_shape { -+ bool is_moe; -+ int64_t ncols_dst; -+ int64_t nchannels_x; -+ int64_t ncols_max; -+ int64_t n_active_est; -+ int64_t density; -+ int mmq_x_best; -+ bool use_stream_k; -+ bool is_candidate; -+}; -+ - static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( - const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x, - const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best, -@@ -64,6 +76,36 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( - }; - } - -+static inline ggml_cuda_mmq_small_m_shape ggml_cuda_mmq_small_m_shape_make( -+ const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x, -+ const int64_t ncols_max, const int mmq_x_best, const bool use_stream_k) { -+ int64_t n_active_est = 0; -+ int64_t density = 0; -+ if (is_moe && ncols_dst > 0 && nchannels_x > 0) { -+ n_active_est = ncols_dst < nchannels_x ? ncols_dst : nchannels_x; -+ density = (ncols_dst + n_active_est - 1) / n_active_est; -+ } -+ -+ const bool is_candidate = -+ is_moe && -+ use_stream_k && -+ ncols_max > 0 && ncols_max <= 128 && -+ density > 0 && density <= 4 && -+ mmq_x_best > 0 && mmq_x_best <= 64; -+ -+ return { -+ is_moe, -+ ncols_dst, -+ nchannels_x, -+ ncols_max, -+ n_active_est, -+ density, -+ mmq_x_best, -+ use_stream_k, -+ is_candidate, -+ }; -+} -+ - static inline ggml_cuda_mmq_launch_shape ggml_cuda_mmq_launch_shape_make( - const int type, const bool is_moe, const int64_t ncols_dst, const int64_t ncols_max, - const int mmq_x, const int mmq_y, const int ntx, const int nty, const int ntzw, -@@ -129,3 +171,19 @@ static inline int ggml_cuda_mmq_launch_shape_format( - shape.stream_k_blocks, - shape.fixup_needed ? 1 : 0); - } -+ -+static inline int ggml_cuda_mmq_small_m_shape_format( -+ char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) { -+ return std::snprintf(buf, size, -+ "candidate=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld " -+ "n_active_est=%lld density=%lld mmq_x_best=%d stream_k=%d", -+ shape.is_candidate ? 1 : 0, -+ shape.is_moe ? 1 : 0, -+ (long long) shape.ncols_dst, -+ (long long) shape.nchannels_x, -+ (long long) shape.ncols_max, -+ (long long) shape.n_active_est, -+ (long long) shape.density, -+ shape.mmq_x_best, -+ shape.use_stream_k ? 1 : 0); -+} -diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh -index 34002edf7..25ead9e7b 100644 ---- a/ggml/src/ggml-cuda/mmq.cuh -+++ b/ggml/src/ggml-cuda/mmq.cuh -@@ -4007,6 +4007,24 @@ static inline bool ggml_cuda_moe_mmq_trace_take(std::atomic & counter) { - return trace_index >= 0 && trace_index < trace_limit; - } - -+static inline int ggml_cuda_moe_mmq_small_m_trace_limit() { -+ static const int limit = []() -> int { -+ const char * s = getenv("LLAMA_MOE_MMQ_SMALL_M_TRACE"); -+ if (s == nullptr || strcmp(s, "0") == 0) { -+ return 0; -+ } -+ const int parsed = atoi(s); -+ return parsed > 0 ? parsed : 256; -+ }(); -+ return limit; -+} -+ -+static inline bool ggml_cuda_moe_mmq_small_m_trace_take(std::atomic & counter) { -+ const int trace_limit = ggml_cuda_moe_mmq_small_m_trace_limit(); -+ const int trace_index = trace_limit > 0 ? counter.fetch_add(1, std::memory_order_relaxed) : trace_limit; -+ return trace_index >= 0 && trace_index < trace_limit; -+} -+ - template - static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { - const int id = ggml_cuda_get_device(); -@@ -4294,6 +4312,15 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda - ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape); - fprintf(stderr, "[LLAMA_MOE_MMQ_SHAPE] %s\n", buf); - } -+ -+ static std::atomic small_m_trace_count{0}; -+ const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make( -+ true, args.ncols_dst, args.nchannels_x, args.ncols_max, mmq_x_best, args.use_stream_k); -+ if (small_m.is_candidate && ggml_cuda_moe_mmq_small_m_trace_take(small_m_trace_count)) { -+ char buf[256]; -+ ggml_cuda_mmq_small_m_shape_format(buf, sizeof(buf), small_m); -+ fprintf(stderr, "[LLAMA_MOE_MMQ_SMALL_M] %s\n", buf); -+ } - } - - switch (mmq_x_best) { -diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp -index 86ee15e02..9f36ce1a1 100644 ---- a/tests/test-cuda-mmq-shape-trace.cpp -+++ b/tests/test-cuda-mmq-shape-trace.cpp -@@ -73,5 +73,40 @@ int main() { - require(std::strstr(buf, "stream_k_blocks=16") != nullptr, "launch trace includes actual stream-k block count"); - require(std::strstr(buf, "fixup=1") != nullptr, "launch trace includes fixup flag"); - -+ const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make( -+ /* is_moe */ true, -+ /* ncols_dst */ 1024, -+ /* nchannels_x */ 256, -+ /* ncols_max */ 128, -+ /* mmq_x_best */ 64, -+ /* use_stream_k */ true); -+ -+ require(small_m.is_candidate, "decode-like low-density MoE shape is a small-M candidate"); -+ require(small_m.n_active_est == 256, "small-M active estimate is capped by expert count"); -+ require(small_m.density == 4, "small-M density is ceil(assignments / active experts)"); -+ -+ require(!ggml_cuda_mmq_small_m_shape_make( -+ /* is_moe */ false, 1024, 256, 128, 64, true).is_candidate, -+ "dense shape is excluded"); -+ require(!ggml_cuda_mmq_small_m_shape_make( -+ /* is_moe */ true, 4096, 256, 512, 128, true).is_candidate, -+ "prefill-like shape is excluded"); -+ require(!ggml_cuda_mmq_small_m_shape_make( -+ /* is_moe */ true, 4096, 256, 128, 64, true).is_candidate, -+ "high-density shape is excluded"); -+ require(!ggml_cuda_mmq_small_m_shape_make( -+ /* is_moe */ true, 1024, 256, 128, 128, true).is_candidate, -+ "large selected tile is excluded"); -+ require(!ggml_cuda_mmq_small_m_shape_make( -+ /* is_moe */ true, 1024, 256, 128, 64, false).is_candidate, -+ "non-stream-k shape is excluded"); -+ -+ const int small_m_n = ggml_cuda_mmq_small_m_shape_format(buf, sizeof(buf), small_m); -+ -+ require(small_m_n > 0, "small-M format returns byte count"); -+ require(std::strstr(buf, "candidate=1") != nullptr, "small-M trace includes candidate flag"); -+ require(std::strstr(buf, "density=4") != nullptr, "small-M trace includes density"); -+ require(std::strstr(buf, "mmq_x_best=64") != nullptr, "small-M trace includes selected tile"); -+ - return 0; - } diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0059-feat-cuda-gate-moe-small-m-mmq-tile-policy.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0059-feat-cuda-gate-moe-small-m-mmq-tile-policy.patch deleted file mode 100644 index 1f03781fd..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0059-feat-cuda-gate-moe-small-m-mmq-tile-policy.patch +++ /dev/null @@ -1,80 +0,0 @@ -From fbed2abaa9f5af8e500f95c8dda86b305450ceff Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Wed, 1 Jul 2026 05:17:39 +0000 -Subject: [PATCH] feat(cuda): gate moe small-m mmq tile policy - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/mmq-shape-trace.h | 9 +++++++++ - ggml/src/ggml-cuda/mmq.cuh | 13 +++++++++++++ - tests/test-cuda-mmq-shape-trace.cpp | 10 ++++++++++ - 3 files changed, 32 insertions(+) - -diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h -index 47453d91f..dfb4e898a 100644 ---- a/ggml/src/ggml-cuda/mmq-shape-trace.h -+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h -@@ -187,3 +187,12 @@ static inline int ggml_cuda_mmq_small_m_shape_format( - shape.mmq_x_best, - shape.use_stream_k ? 1 : 0); - } -+ -+static inline int ggml_cuda_mmq_small_m_tile_limit( -+ const ggml_cuda_mmq_small_m_shape & shape, const int current_limit, const int requested_tile) { -+ if (!shape.is_candidate || requested_tile < 8 || requested_tile >= current_limit) { -+ return current_limit; -+ } -+ -+ return requested_tile; -+} -diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh -index 25ead9e7b..16b3fcca4 100644 ---- a/ggml/src/ggml-cuda/mmq.cuh -+++ b/ggml/src/ggml-cuda/mmq.cuh -@@ -4201,6 +4201,15 @@ static inline int ggml_cuda_moe_density_max() { - return d; - } - -+static inline int ggml_cuda_moe_small_m_tile() { -+ static const int t = []() -> int { -+ const char * s = getenv("LLAMA_MOE_SMALL_M_TILE"); -+ const int v = s ? atoi(s) : 0; -+ return v >= 8 ? v : 0; -+ }(); -+ return t; -+} -+ - // [paged patch 0017 / track B] DENSE NVFP4 decode mmq_x re-read occupancy DIAGNOSTIC (env, default off). - // GGML_CUDA_FP4_DENSE_MMQ_X= caps the dense (non-MoE) NVFP4 col-tile to , splitting the M=128 - // decode ubatch into ceil(128/n) col-tiles. Each col-tile re-reads the full weight set (fatal cost -@@ -4282,6 +4291,10 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda - } - } - } -+ -+ const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make( -+ true, args.ncols_dst, args.nchannels_x, args.ncols_max, mmq_x_lim, args.use_stream_k); -+ mmq_x_lim = ggml_cuda_mmq_small_m_tile_limit(small_m, mmq_x_lim, ggml_cuda_moe_small_m_tile()); - } - - int mmq_x_best = 0; -diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp -index 9f36ce1a1..f7863f03a 100644 ---- a/tests/test-cuda-mmq-shape-trace.cpp -+++ b/tests/test-cuda-mmq-shape-trace.cpp -@@ -108,5 +108,15 @@ int main() { - require(std::strstr(buf, "density=4") != nullptr, "small-M trace includes density"); - require(std::strstr(buf, "mmq_x_best=64") != nullptr, "small-M trace includes selected tile"); - -+ require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 0) == 64, -+ "small-M tile override is default-off"); -+ require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 16) == 16, -+ "small-M tile override caps candidate tile limit"); -+ require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 128) == 64, -+ "small-M tile override ignores non-smaller tiles"); -+ require(ggml_cuda_mmq_small_m_tile_limit( -+ ggml_cuda_mmq_small_m_shape_make(/* is_moe */ true, 4096, 256, 512, 128, true), 128, 16) == 128, -+ "small-M tile override excludes prefill-like shapes"); -+ - return 0; - } diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0060-feat-cuda-trace-moe-mmid-routes.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0060-feat-cuda-trace-moe-mmid-routes.patch deleted file mode 100644 index 2e5b79ad2..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0060-feat-cuda-trace-moe-mmid-routes.patch +++ /dev/null @@ -1,292 +0,0 @@ -From 6c332094ca2fbb1e3211427c5f919adcaa89c588 Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Wed, 1 Jul 2026 05:32:27 +0000 -Subject: [PATCH] feat(cuda): trace moe mmid routes - -Add a default-off LLAMA_MOE_MMID_ROUTE_TRACE diagnostic for MUL_MAT_ID dispatch routes. - -The trace reports whether a call uses MMVQ, MMVF, grouped MMQ, MMF, or host-sync fallback while preserving the existing dispatch predicates. - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/ggml-cuda.cu | 34 +++++++++-- - ggml/src/ggml-cuda/mmq-shape-trace.h | 88 ++++++++++++++++++++++++++++ - tests/test-cuda-mmq-shape-trace.cpp | 82 ++++++++++++++++++++++++++ - 3 files changed, 200 insertions(+), 4 deletions(-) - -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 374949f25..a1754df39 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2685,6 +2685,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor - } - } - -+static inline int ggml_cuda_moe_mmid_route_trace_limit() { -+ static const int value = []() { -+ const char * s = getenv("LLAMA_MOE_MMID_ROUTE_TRACE"); -+ return s ? atoi(s) : 0; -+ }(); -+ -+ return value; -+} -+ - static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; -@@ -2697,13 +2706,30 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * - GGML_TENSOR_BINARY_OP_LOCALS - - const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; -+ const bool src0_quantized = ggml_is_quantized(src0->type); -+ const int mmvq_mmid_max = src0_quantized ? get_mmvq_mmid_max_batch(src0->type, cc) : MMVQ_MAX_BATCH_SIZE; -+ const bool use_mmq = ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02); -+ const bool use_mmf = ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true); -+ -+ const int mmid_trace_limit = ggml_cuda_moe_mmid_route_trace_limit(); -+ if (mmid_trace_limit > 0) { -+ static std::atomic trace_count{0}; -+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed); -+ if (trace_idx < mmid_trace_limit) { -+ const ggml_cuda_mmid_route_shape route_shape = ggml_cuda_mmid_route_shape_make( -+ src0->type, ne2, ne12, /*n_experts=*/ne02, mmvq_mmid_max, use_mmq, use_mmf, -+ GGML_CUDA_CC_IS_AMD(cc), src0_quantized); -+ char buf[256]; -+ ggml_cuda_mmid_route_shape_format(buf, sizeof(buf), route_shape); -+ fprintf(stderr, "[LLAMA_MOE_MMID_ROUTE] %s\n", buf); -+ } -+ } - - // [TAG_MUL_MAT_ID_CUDA_GRAPHS] - if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE); - if (ne2 <= MMVQ_MAX_BATCH_SIZE) { -- if (ggml_is_quantized(src0->type)) { -- const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc); -+ if (src0_quantized) { - if (ne2 <= mmvq_mmid_max) { - ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); - return; -@@ -2716,12 +2742,12 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * - } - } - -- if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) { -+ if (use_mmq) { - ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst); - return; - } - -- if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) { -+ if (use_mmf) { - ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst); - return; - } -diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h -index dfb4e898a..da234a302 100644 ---- a/ggml/src/ggml-cuda/mmq-shape-trace.h -+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h -@@ -49,6 +49,40 @@ struct ggml_cuda_mmq_small_m_shape { - bool is_candidate; - }; - -+enum ggml_cuda_mmid_route { -+ GGML_CUDA_MMID_ROUTE_MMVQ, -+ GGML_CUDA_MMID_ROUTE_MMVF, -+ GGML_CUDA_MMID_ROUTE_MMQ, -+ GGML_CUDA_MMID_ROUTE_MMF, -+ GGML_CUDA_MMID_ROUTE_FALLBACK, -+}; -+ -+struct ggml_cuda_mmid_route_shape { -+ ggml_cuda_mmid_route route; -+ int type; -+ int64_t ne2; -+ int64_t ne12; -+ int64_t n_experts; -+ int mmvq_max; -+ bool use_mmq; -+ bool use_mmf; -+ bool is_amd; -+ bool is_quantized; -+ bool host_sync; -+}; -+ -+static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) { -+ switch (route) { -+ case GGML_CUDA_MMID_ROUTE_MMVQ: return "mmvq"; -+ case GGML_CUDA_MMID_ROUTE_MMVF: return "mmvf"; -+ case GGML_CUDA_MMID_ROUTE_MMQ: return "mmq"; -+ case GGML_CUDA_MMID_ROUTE_MMF: return "mmf"; -+ case GGML_CUDA_MMID_ROUTE_FALLBACK: return "fallback"; -+ } -+ -+ return "unknown"; -+} -+ - static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( - const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x, - const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best, -@@ -76,6 +110,42 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( - }; - } - -+static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make( -+ const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts, -+ const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd, -+ const bool is_quantized) { -+ ggml_cuda_mmid_route route = GGML_CUDA_MMID_ROUTE_FALLBACK; -+ if (ne2 <= mmvq_max) { -+ if (is_quantized) { -+ route = GGML_CUDA_MMID_ROUTE_MMVQ; -+ } else if (is_amd) { -+ route = GGML_CUDA_MMID_ROUTE_MMVF; -+ } -+ } -+ -+ if (route == GGML_CUDA_MMID_ROUTE_FALLBACK) { -+ if (use_mmq) { -+ route = GGML_CUDA_MMID_ROUTE_MMQ; -+ } else if (use_mmf) { -+ route = GGML_CUDA_MMID_ROUTE_MMF; -+ } -+ } -+ -+ return { -+ route, -+ type, -+ ne2, -+ ne12, -+ n_experts, -+ mmvq_max, -+ use_mmq, -+ use_mmf, -+ is_amd, -+ is_quantized, -+ route == GGML_CUDA_MMID_ROUTE_FALLBACK, -+ }; -+} -+ - static inline ggml_cuda_mmq_small_m_shape ggml_cuda_mmq_small_m_shape_make( - const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x, - const int64_t ncols_max, const int mmq_x_best, const bool use_stream_k) { -@@ -172,6 +242,24 @@ static inline int ggml_cuda_mmq_launch_shape_format( - shape.fixup_needed ? 1 : 0); - } - -+static inline int ggml_cuda_mmid_route_shape_format( -+ char * buf, const size_t size, const ggml_cuda_mmid_route_shape & shape) { -+ return std::snprintf(buf, size, -+ "route=%s type=%d host_sync=%d ne2=%lld ne12=%lld n_experts=%lld " -+ "mmvq_max=%d use_mmq=%d use_mmf=%d is_amd=%d is_quantized=%d", -+ ggml_cuda_mmid_route_name(shape.route), -+ shape.type, -+ shape.host_sync ? 1 : 0, -+ (long long) shape.ne2, -+ (long long) shape.ne12, -+ (long long) shape.n_experts, -+ shape.mmvq_max, -+ shape.use_mmq ? 1 : 0, -+ shape.use_mmf ? 1 : 0, -+ shape.is_amd ? 1 : 0, -+ shape.is_quantized ? 1 : 0); -+} -+ - static inline int ggml_cuda_mmq_small_m_shape_format( - char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) { - return std::snprintf(buf, size, -diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp -index f7863f03a..e190cf1ac 100644 ---- a/tests/test-cuda-mmq-shape-trace.cpp -+++ b/tests/test-cuda-mmq-shape-trace.cpp -@@ -118,5 +118,87 @@ int main() { - ggml_cuda_mmq_small_m_shape_make(/* is_moe */ true, 4096, 256, 512, 128, true), 128, 16) == 128, - "small-M tile override excludes prefill-like shapes"); - -+ const ggml_cuda_mmid_route_shape mmvq = ggml_cuda_mmid_route_shape_make( -+ /* type */ 39, -+ /* ne2 */ 1, -+ /* ne12 */ 1, -+ /* ne02 */ 256, -+ /* mmvq_max */ 4, -+ /* use_mmq */ true, -+ /* use_mmf */ true, -+ /* is_amd */ false, -+ /* is_quantized */ true); -+ -+ require(mmvq.route == GGML_CUDA_MMID_ROUTE_MMVQ, "MMVQ wins when within its batch cap"); -+ require(!mmvq.host_sync, "MMVQ route is graph-safe"); -+ -+ const ggml_cuda_mmid_route_shape mmq = ggml_cuda_mmid_route_shape_make( -+ /* type */ 39, -+ /* ne2 */ 128, -+ /* ne12 */ 128, -+ /* ne02 */ 256, -+ /* mmvq_max */ 4, -+ /* use_mmq */ true, -+ /* use_mmf */ true, -+ /* is_amd */ false, -+ /* is_quantized */ true); -+ -+ require(mmq.route == GGML_CUDA_MMID_ROUTE_MMQ, "grouped MMQ wins after MMVQ cap"); -+ require(!mmq.host_sync, "grouped MMQ route is graph-safe"); -+ -+ const ggml_cuda_mmid_route_shape mmf = ggml_cuda_mmid_route_shape_make( -+ /* type */ 1, -+ /* ne2 */ 128, -+ /* ne12 */ 128, -+ /* ne02 */ 256, -+ /* mmvq_max */ 4, -+ /* use_mmq */ false, -+ /* use_mmf */ true, -+ /* is_amd */ false, -+ /* is_quantized */ false); -+ -+ require(mmf.route == GGML_CUDA_MMID_ROUTE_MMF, "MMF wins when grouped MMQ is unavailable"); -+ require(!mmf.host_sync, "MMF route is graph-safe"); -+ -+ const ggml_cuda_mmid_route_shape fallback = ggml_cuda_mmid_route_shape_make( -+ /* type */ 0, -+ /* ne2 */ 128, -+ /* ne12 */ 128, -+ /* ne02 */ 256, -+ /* mmvq_max */ 4, -+ /* use_mmq */ false, -+ /* use_mmf */ false, -+ /* is_amd */ false, -+ /* is_quantized */ false); -+ -+ require(fallback.route == GGML_CUDA_MMID_ROUTE_FALLBACK, "fallback is used when device routes do not match"); -+ require(fallback.host_sync, "fallback route requires host synchronization"); -+ -+ const ggml_cuda_mmid_route_shape amd_mmvf = ggml_cuda_mmid_route_shape_make( -+ /* type */ 1, -+ /* ne2 */ 1, -+ /* ne12 */ 1, -+ /* ne02 */ 256, -+ /* mmvq_max */ 4, -+ /* use_mmq */ false, -+ /* use_mmf */ false, -+ /* is_amd */ true, -+ /* is_quantized */ false); -+ -+ require(amd_mmvf.route == GGML_CUDA_MMID_ROUTE_MMVF, "AMD float vector route wins within MMVQ cap"); -+ require(!amd_mmvf.host_sync, "AMD float vector route is graph-safe"); -+ -+ const int route_n = ggml_cuda_mmid_route_shape_format(buf, sizeof(buf), mmq); -+ -+ require(route_n > 0, "MMID route format returns byte count"); -+ require(std::strstr(buf, "route=mmq") != nullptr, "MMID trace includes route name"); -+ require(std::strstr(buf, "host_sync=0") != nullptr, "MMID trace includes host sync flag"); -+ require(std::strstr(buf, "ne2=128") != nullptr, "MMID trace includes destination batch"); -+ require(std::strstr(buf, "ne12=128") != nullptr, "MMID trace includes routed token count"); -+ require(std::strstr(buf, "n_experts=256") != nullptr, "MMID trace includes expert count"); -+ require(std::strstr(buf, "mmvq_max=4") != nullptr, "MMID trace includes MMVQ cap"); -+ require(std::strstr(buf, "use_mmq=1") != nullptr, "MMID trace includes MMQ predicate"); -+ require(std::strstr(buf, "use_mmf=1") != nullptr, "MMID trace includes MMF predicate"); -+ - return 0; - } --- -2.43.0 - diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0061-feat-cuda-trace-mul-mat-routes.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0061-feat-cuda-trace-mul-mat-routes.patch deleted file mode 100644 index 2099568d0..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0061-feat-cuda-trace-mul-mat-routes.patch +++ /dev/null @@ -1,345 +0,0 @@ -From 486c28c63d5297afd06e5a2bdbd4fb89cad749cd Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Wed, 1 Jul 2026 05:49:12 +0000 -Subject: [PATCH] feat(cuda): trace mul mat routes - -Add a default-off LLAMA_MUL_MAT_ROUTE_TRACE diagnostic for regular MUL_MAT dispatch routes. - -The trace classifies projection-heavy calls without changing dispatch behavior. - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/ggml-cuda.cu | 44 +++++++++- - ggml/src/ggml-cuda/mmq-shape-trace.h | 117 +++++++++++++++++++++++++++ - tests/test-cuda-mmq-shape-trace.cpp | 85 +++++++++++++++++++ - 3 files changed, 244 insertions(+), 2 deletions(-) - -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index a1754df39..cd34aff13 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2580,6 +2580,32 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) { - return use_mul_mat_vec_q; - } - -+static inline int ggml_cuda_mul_mat_route_trace_limit() { -+ static const int value = []() { -+ const char * s = getenv("LLAMA_MUL_MAT_ROUTE_TRACE"); -+ return s ? atoi(s) : 0; -+ }(); -+ -+ return value; -+} -+ -+static inline void ggml_cuda_mul_mat_route_trace(const ggml_cuda_mul_mat_route_shape & shape) { -+ const int trace_limit = ggml_cuda_mul_mat_route_trace_limit(); -+ if (trace_limit <= 0) { -+ return; -+ } -+ -+ static std::atomic trace_count{0}; -+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed); -+ if (trace_idx >= trace_limit) { -+ return; -+ } -+ -+ char buf[256]; -+ ggml_cuda_mul_mat_route_shape_format(buf, sizeof(buf), shape); -+ fprintf(stderr, "[LLAMA_MUL_MAT_ROUTE] %s\n", buf); -+} -+ - static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); - -@@ -2591,6 +2617,10 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor - if (!split) { - const int cc_fp4 = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - if (ggml_cuda_fp4_prefill_should_engage(src0, src1, dst, cc_fp4)) { -+ ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make( -+ src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split, -+ /*use_vec_f=*/false, /*use_mat_f=*/false, /*use_vec_q=*/false, /*use_mmq=*/false, -+ /*use_batched_cublas=*/false, /*use_fp4_prefill=*/true, /*use_fwht=*/false)); - ggml_cuda_mul_mat_fp4_large_m(ctx, src0, src1, dst); - return; - } -@@ -2654,12 +2684,23 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor - bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16); - bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc); - bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32; -+ bool use_batched_cublas = !split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32) -+ && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1; - - const int32_t hint = ggml_get_op_params_i32(dst, 1); - if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) { -+ ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make( -+ src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split, -+ use_mul_mat_vec_f, use_mul_mat_f, use_mul_mat_vec_q, use_mul_mat_q, -+ use_batched_cublas, /*use_fp4_prefill=*/false, /*use_fwht=*/true)); - return; - } - -+ ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make( -+ src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split, -+ use_mul_mat_vec_f, use_mul_mat_f, use_mul_mat_vec_q, use_mul_mat_q, -+ use_batched_cublas, /*use_fp4_prefill=*/false, /*use_fwht=*/false)); -+ - if (!split && use_mul_mat_vec_f) { - // the custom F16 vector kernel can be used over batched cuBLAS GEMM - // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) -@@ -2670,8 +2711,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor - ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); - } else if (!split && use_mul_mat_q) { - ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst); -- } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32) -- && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { -+ } else if (use_batched_cublas) { - // general KQ + KQV multi-batch without FlashAttention - ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); - } else if (use_mul_mat_vec_f) { -diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h -index da234a302..8ac373fd9 100644 ---- a/ggml/src/ggml-cuda/mmq-shape-trace.h -+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h -@@ -71,6 +71,37 @@ struct ggml_cuda_mmid_route_shape { - bool host_sync; - }; - -+enum ggml_cuda_mul_mat_route { -+ GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL, -+ GGML_CUDA_MUL_MAT_ROUTE_FWHT, -+ GGML_CUDA_MUL_MAT_ROUTE_VEC_F, -+ GGML_CUDA_MUL_MAT_ROUTE_MAT_F, -+ GGML_CUDA_MUL_MAT_ROUTE_VEC_Q, -+ GGML_CUDA_MUL_MAT_ROUTE_MMQ, -+ GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS, -+ GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F, -+ GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q, -+ GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ, -+ GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS, -+}; -+ -+struct ggml_cuda_mul_mat_route_shape { -+ ggml_cuda_mul_mat_route route; -+ int type; -+ int64_t ne1; -+ int64_t ne11; -+ int64_t ne12; -+ int64_t ne13; -+ bool split; -+ bool use_vec_f; -+ bool use_mat_f; -+ bool use_vec_q; -+ bool use_mmq; -+ bool use_batched_cublas; -+ bool use_fp4_prefill; -+ bool use_fwht; -+}; -+ - static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) { - switch (route) { - case GGML_CUDA_MMID_ROUTE_MMVQ: return "mmvq"; -@@ -83,6 +114,24 @@ static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route - return "unknown"; - } - -+static inline const char * ggml_cuda_mul_mat_route_name(const ggml_cuda_mul_mat_route route) { -+ switch (route) { -+ case GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL: return "fp4_prefill"; -+ case GGML_CUDA_MUL_MAT_ROUTE_FWHT: return "fwht"; -+ case GGML_CUDA_MUL_MAT_ROUTE_VEC_F: return "vec_f"; -+ case GGML_CUDA_MUL_MAT_ROUTE_MAT_F: return "mat_f"; -+ case GGML_CUDA_MUL_MAT_ROUTE_VEC_Q: return "vec_q"; -+ case GGML_CUDA_MUL_MAT_ROUTE_MMQ: return "mmq"; -+ case GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS: return "batched_cublas"; -+ case GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F: return "op_vec_f"; -+ case GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q: return "op_vec_q"; -+ case GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ: return "op_mmq"; -+ case GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS: return "op_cublas"; -+ } -+ -+ return "unknown"; -+} -+ - static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( - const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x, - const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best, -@@ -110,6 +159,52 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( - }; - } - -+static inline ggml_cuda_mul_mat_route_shape ggml_cuda_mul_mat_route_shape_make( -+ const int type, const int64_t ne1, const int64_t ne11, const int64_t ne12, const int64_t ne13, -+ const bool split, const bool use_vec_f, const bool use_mat_f, const bool use_vec_q, -+ const bool use_mmq, const bool use_batched_cublas, const bool use_fp4_prefill, -+ const bool use_fwht) { -+ ggml_cuda_mul_mat_route route = GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS; -+ if (use_fp4_prefill) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL; -+ } else if (use_fwht) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_FWHT; -+ } else if (!split && use_vec_f) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_VEC_F; -+ } else if (!split && use_mat_f) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_MAT_F; -+ } else if (!split && use_vec_q) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_VEC_Q; -+ } else if (!split && use_mmq) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_MMQ; -+ } else if (!split && use_batched_cublas) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS; -+ } else if (use_vec_f) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F; -+ } else if (use_vec_q) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q; -+ } else if (use_mmq) { -+ route = GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ; -+ } -+ -+ return { -+ route, -+ type, -+ ne1, -+ ne11, -+ ne12, -+ ne13, -+ split, -+ use_vec_f, -+ use_mat_f, -+ use_vec_q, -+ use_mmq, -+ use_batched_cublas, -+ use_fp4_prefill, -+ use_fwht, -+ }; -+} -+ - static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make( - const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts, - const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd, -@@ -260,6 +355,28 @@ static inline int ggml_cuda_mmid_route_shape_format( - shape.is_quantized ? 1 : 0); - } - -+static inline int ggml_cuda_mul_mat_route_shape_format( -+ char * buf, const size_t size, const ggml_cuda_mul_mat_route_shape & shape) { -+ return std::snprintf(buf, size, -+ "route=%s type=%d ne1=%lld ne11=%lld ne12=%lld ne13=%lld split=%d " -+ "use_vec_f=%d use_mat_f=%d use_vec_q=%d use_mmq=%d use_batched_cublas=%d " -+ "use_fp4_prefill=%d use_fwht=%d", -+ ggml_cuda_mul_mat_route_name(shape.route), -+ shape.type, -+ (long long) shape.ne1, -+ (long long) shape.ne11, -+ (long long) shape.ne12, -+ (long long) shape.ne13, -+ shape.split ? 1 : 0, -+ shape.use_vec_f ? 1 : 0, -+ shape.use_mat_f ? 1 : 0, -+ shape.use_vec_q ? 1 : 0, -+ shape.use_mmq ? 1 : 0, -+ shape.use_batched_cublas ? 1 : 0, -+ shape.use_fp4_prefill ? 1 : 0, -+ shape.use_fwht ? 1 : 0); -+} -+ - static inline int ggml_cuda_mmq_small_m_shape_format( - char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) { - return std::snprintf(buf, size, -diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp -index e190cf1ac..2bd41d1d8 100644 ---- a/tests/test-cuda-mmq-shape-trace.cpp -+++ b/tests/test-cuda-mmq-shape-trace.cpp -@@ -200,5 +200,90 @@ int main() { - require(std::strstr(buf, "use_mmq=1") != nullptr, "MMID trace includes MMQ predicate"); - require(std::strstr(buf, "use_mmf=1") != nullptr, "MMID trace includes MMF predicate"); - -+ const ggml_cuda_mul_mat_route_shape mat_f = ggml_cuda_mul_mat_route_shape_make( -+ /* type */ 30, -+ /* ne1 */ 128, -+ /* ne11 */ 128, -+ /* ne12 */ 1, -+ /* ne13 */ 1, -+ /* split */ false, -+ /* use_vec_f */ false, -+ /* use_mat_f */ true, -+ /* use_vec_q */ false, -+ /* use_mmq */ false, -+ /* use_batched_cublas */ false, -+ /* use_fp4_prefill */ false, -+ /* use_fwht */ false); -+ -+ require(mat_f.route == GGML_CUDA_MUL_MAT_ROUTE_MAT_F, "regular MUL_MAT prefers direct mat_f when available"); -+ require(!mat_f.split, "regular MUL_MAT trace records split flag"); -+ -+ const ggml_cuda_mul_mat_route_shape batched = ggml_cuda_mul_mat_route_shape_make( -+ /* type */ 31, -+ /* ne1 */ 128, -+ /* ne11 */ 128, -+ /* ne12 */ 4, -+ /* ne13 */ 1, -+ /* split */ false, -+ /* use_vec_f */ false, -+ /* use_mat_f */ false, -+ /* use_vec_q */ false, -+ /* use_mmq */ false, -+ /* use_batched_cublas */ true, -+ /* use_fp4_prefill */ false, -+ /* use_fwht */ false); -+ -+ require(batched.route == GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS, -+ "regular MUL_MAT records batched cuBLAS route"); -+ -+ const ggml_cuda_mul_mat_route_shape op_cublas = ggml_cuda_mul_mat_route_shape_make( -+ /* type */ 0, -+ /* ne1 */ 16, -+ /* ne11 */ 16, -+ /* ne12 */ 1, -+ /* ne13 */ 1, -+ /* split */ true, -+ /* use_vec_f */ false, -+ /* use_mat_f */ false, -+ /* use_vec_q */ false, -+ /* use_mmq */ false, -+ /* use_batched_cublas */ false, -+ /* use_fp4_prefill */ false, -+ /* use_fwht */ false); -+ -+ require(op_cublas.route == GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS, -+ "regular MUL_MAT fallback records op cublas route"); -+ -+ const ggml_cuda_mul_mat_route_shape fp4_prefill = ggml_cuda_mul_mat_route_shape_make( -+ /* type */ 39, -+ /* ne1 */ 512, -+ /* ne11 */ 512, -+ /* ne12 */ 1, -+ /* ne13 */ 1, -+ /* split */ false, -+ /* use_vec_f */ false, -+ /* use_mat_f */ false, -+ /* use_vec_q */ false, -+ /* use_mmq */ true, -+ /* use_batched_cublas */ false, -+ /* use_fp4_prefill */ true, -+ /* use_fwht */ false); -+ -+ require(fp4_prefill.route == GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL, -+ "regular MUL_MAT records native FP4 prefill route before MMQ"); -+ -+ const int mul_mat_route_n = ggml_cuda_mul_mat_route_shape_format(buf, sizeof(buf), mat_f); -+ -+ require(mul_mat_route_n > 0, "regular MUL_MAT route format returns byte count"); -+ require(std::strstr(buf, "route=mat_f") != nullptr, "regular MUL_MAT trace includes route name"); -+ require(std::strstr(buf, "type=30") != nullptr, "regular MUL_MAT trace includes type"); -+ require(std::strstr(buf, "ne1=128") != nullptr, "regular MUL_MAT trace includes output columns"); -+ require(std::strstr(buf, "ne11=128") != nullptr, "regular MUL_MAT trace includes src1 columns"); -+ require(std::strstr(buf, "ne12=1") != nullptr, "regular MUL_MAT trace includes src1 batch dim"); -+ require(std::strstr(buf, "split=0") != nullptr, "regular MUL_MAT trace includes split flag"); -+ require(std::strstr(buf, "use_mat_f=1") != nullptr, "regular MUL_MAT trace includes mat_f predicate"); -+ require(std::strstr(buf, "use_batched_cublas=0") != nullptr, -+ "regular MUL_MAT trace includes batched cuBLAS predicate"); -+ - return 0; - } --- -2.43.0 - diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0062-feat-cuda-trace-cublas-routes.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0062-feat-cuda-trace-cublas-routes.patch deleted file mode 100644 index 4bac09e4e..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0062-feat-cuda-trace-cublas-routes.patch +++ /dev/null @@ -1,332 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Wed, 1 Jul 2026 06:20:31 +0000 -Subject: [PATCH] feat(cuda): trace cublas routes - -Add a default-off LLAMA_CUBLAS_ROUTE_TRACE diagnostic around the generic cuBLAS MUL_MAT path. - -The trace classifies NVFP4/BF16/FP16/SGEMM subroutes without changing branch behavior, and extends the route helper test coverage. - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/ggml-cuda.cu | 53 +++++++++++-- - ggml/src/ggml-cuda/mmq-shape-trace.h | 108 +++++++++++++++++++++++++++ - tests/test-cuda-mmq-shape-trace.cpp | 62 +++++++++++++++ - 3 files changed, 216 insertions(+), 7 deletions(-) - -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index cd34aff13..eff197818 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -1627,6 +1627,32 @@ static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type - return compute_type; - } - -+static inline int ggml_cuda_cublas_route_trace_limit() { -+ static const int value = []() { -+ const char * s = getenv("LLAMA_CUBLAS_ROUTE_TRACE"); -+ return s ? atoi(s) : 0; -+ }(); -+ -+ return value; -+} -+ -+static inline void ggml_cuda_cublas_route_trace(const ggml_cuda_cublas_route_shape & shape) { -+ const int trace_limit = ggml_cuda_cublas_route_trace_limit(); -+ if (trace_limit <= 0) { -+ return; -+ } -+ -+ static std::atomic trace_count{0}; -+ const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed); -+ if (trace_idx >= trace_limit) { -+ return; -+ } -+ -+ char buf[320]; -+ ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), shape); -+ fprintf(stderr, "[LLAMA_CUBLAS_ROUTE] %s\n", buf); -+} -+ - static void ggml_cuda_op_mul_mat_cublas( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, -@@ -1662,7 +1688,22 @@ static void ggml_cuda_op_mul_mat_cublas( - row_diff == src0->ne[1] && - dst->op_params[0] == GGML_PREC_DEFAULT; - -- if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { -+ const bool src0_contig = ggml_is_contiguous(src0); -+ const bool full_rows = row_diff == src0->ne[1]; -+ const bool fast_fp16 = fast_fp16_hardware_available(cc); -+ bool force_fp32 = false; -+ bool force_fp16 = false; -+ if (fast_fp16 && use_fp16) { -+ const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type(); -+ force_fp32 = force_compute_type.fp32; -+ force_fp16 = force_compute_type.fp16; -+ } -+ ggml_cuda_cublas_route_trace(ggml_cuda_cublas_route_shape_make( -+ src0->type, src1->type, row_diff, src1_ncols, ne00, ne10, ldc, -+ supports_bf16, use_fp16, fast_fp16, force_fp32, force_fp16, src0_contig, full_rows, -+ GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA)); -+ -+ if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && src0_contig && full_rows) { - // Paged prefill lever (patch 0033): NVFP4 only reaches cuBLAS when - // ggml_cuda_should_use_mmq() returned false (large-M dense prefill). - // Dequant the FP4 weights to a TRANSIENT bf16 pool buffer and run a -@@ -1702,7 +1743,7 @@ static void ggml_cuda_op_mul_mat_cublas( - - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); - to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); -- } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { -+ } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && src0_contig && full_rows) { - ggml_cuda_pool_alloc src1_as_bf16(ctx.pool(id)); - if (src1->type != GGML_TYPE_BF16) { - const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type); -@@ -1730,7 +1771,7 @@ static void ggml_cuda_op_mul_mat_cublas( - - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); - to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); -- } else if (fast_fp16_hardware_available(cc) && use_fp16) { -+ } else if (fast_fp16 && use_fp16) { - // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 - ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); - if (src0->type != GGML_TYPE_F16) { -@@ -1754,12 +1795,10 @@ static void ggml_cuda_op_mul_mat_cublas( - - CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); - -- const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type(); -- -- if (!force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc) -+ if (!force_fp16 && (GGML_CUDA_CC_IS_CDNA(cc) - || GGML_CUDA_CC_IS_RDNA4(cc) - || cc == GGML_CUDA_CC_VOLTA -- || force_compute_type.fp32)) -+ || force_fp32)) - { - const float alpha = 1.0f; - const float beta = 0.0f; -diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h -index 8ac373fd9..f5b4ecf2c 100644 ---- a/ggml/src/ggml-cuda/mmq-shape-trace.h -+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h -@@ -85,6 +85,14 @@ enum ggml_cuda_mul_mat_route { - GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS, - }; - -+enum ggml_cuda_cublas_route { -+ GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC, -+ GGML_CUDA_CUBLAS_ROUTE_BF16_TC, -+ GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F, -+ GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F, -+ GGML_CUDA_CUBLAS_ROUTE_SGEMM, -+}; -+ - struct ggml_cuda_mul_mat_route_shape { - ggml_cuda_mul_mat_route route; - int type; -@@ -102,6 +110,27 @@ struct ggml_cuda_mul_mat_route_shape { - bool use_fwht; - }; - -+struct ggml_cuda_cublas_route_shape { -+ ggml_cuda_cublas_route route; -+ int type; -+ int src1_type; -+ int64_t row_diff; -+ int64_t src1_ncols; -+ int64_t ne00; -+ int64_t ne10; -+ int64_t ldc; -+ bool supports_bf16; -+ bool use_fp16; -+ bool fast_fp16; -+ bool force_fp32; -+ bool force_fp16; -+ bool src0_contig; -+ bool full_rows; -+ bool is_cdna; -+ bool is_rdna4; -+ bool is_volta; -+}; -+ - static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) { - switch (route) { - case GGML_CUDA_MMID_ROUTE_MMVQ: return "mmvq"; -@@ -132,6 +161,18 @@ static inline const char * ggml_cuda_mul_mat_route_name(const ggml_cuda_mul_mat_ - return "unknown"; - } - -+static inline const char * ggml_cuda_cublas_route_name(const ggml_cuda_cublas_route route) { -+ switch (route) { -+ case GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC: return "nvfp4_bf16_tc"; -+ case GGML_CUDA_CUBLAS_ROUTE_BF16_TC: return "bf16_tc"; -+ case GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F: return "f16_tc_32f"; -+ case GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F: return "f16_tc_16f"; -+ case GGML_CUDA_CUBLAS_ROUTE_SGEMM: return "sgemm"; -+ } -+ -+ return "unknown"; -+} -+ - static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make( - const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x, - const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best, -@@ -205,6 +246,47 @@ static inline ggml_cuda_mul_mat_route_shape ggml_cuda_mul_mat_route_shape_make( - }; - } - -+static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make( -+ const int type, const int src1_type, const int64_t row_diff, const int64_t src1_ncols, -+ const int64_t ne00, const int64_t ne10, const int64_t ldc, const bool supports_bf16, -+ const bool use_fp16, const bool fast_fp16, const bool force_fp32, const bool force_fp16, -+ const bool src0_contig, const bool full_rows, const bool is_cdna, const bool is_rdna4, -+ const bool is_volta) { -+ ggml_cuda_cublas_route route = GGML_CUDA_CUBLAS_ROUTE_SGEMM; -+ if (supports_bf16 && type == 40 && src0_contig && full_rows) { -+ route = GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC; -+ } else if (supports_bf16 && type == 30 && src0_contig && full_rows) { -+ route = GGML_CUDA_CUBLAS_ROUTE_BF16_TC; -+ } else if (fast_fp16 && use_fp16) { -+ if (!force_fp16 && (is_cdna || is_rdna4 || is_volta || force_fp32)) { -+ route = GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F; -+ } else { -+ route = GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F; -+ } -+ } -+ -+ return { -+ route, -+ type, -+ src1_type, -+ row_diff, -+ src1_ncols, -+ ne00, -+ ne10, -+ ldc, -+ supports_bf16, -+ use_fp16, -+ fast_fp16, -+ force_fp32, -+ force_fp16, -+ src0_contig, -+ full_rows, -+ is_cdna, -+ is_rdna4, -+ is_volta, -+ }; -+} -+ - static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make( - const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts, - const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd, -@@ -377,6 +459,32 @@ static inline int ggml_cuda_mul_mat_route_shape_format( - shape.use_fwht ? 1 : 0); - } - -+static inline int ggml_cuda_cublas_route_shape_format( -+ char * buf, const size_t size, const ggml_cuda_cublas_route_shape & shape) { -+ return std::snprintf(buf, size, -+ "route=%s type=%d src1_type=%d row_diff=%lld src1_ncols=%lld ne00=%lld ne10=%lld ldc=%lld " -+ "supports_bf16=%d use_fp16=%d fast_fp16=%d force_fp32=%d force_fp16=%d " -+ "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d", -+ ggml_cuda_cublas_route_name(shape.route), -+ shape.type, -+ shape.src1_type, -+ (long long) shape.row_diff, -+ (long long) shape.src1_ncols, -+ (long long) shape.ne00, -+ (long long) shape.ne10, -+ (long long) shape.ldc, -+ shape.supports_bf16 ? 1 : 0, -+ shape.use_fp16 ? 1 : 0, -+ shape.fast_fp16 ? 1 : 0, -+ shape.force_fp32 ? 1 : 0, -+ shape.force_fp16 ? 1 : 0, -+ shape.src0_contig ? 1 : 0, -+ shape.full_rows ? 1 : 0, -+ shape.is_cdna ? 1 : 0, -+ shape.is_rdna4 ? 1 : 0, -+ shape.is_volta ? 1 : 0); -+} -+ - static inline int ggml_cuda_mmq_small_m_shape_format( - char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) { - return std::snprintf(buf, size, -diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp -index 2bd41d1d8..1443749c3 100644 ---- a/tests/test-cuda-mmq-shape-trace.cpp -+++ b/tests/test-cuda-mmq-shape-trace.cpp -@@ -285,5 +285,67 @@ int main() { - require(std::strstr(buf, "use_batched_cublas=0") != nullptr, - "regular MUL_MAT trace includes batched cuBLAS predicate"); - -+ const ggml_cuda_cublas_route_shape bf16_tc = ggml_cuda_cublas_route_shape_make( -+ /* type */ 30, -+ /* src1_type */ 0, -+ /* row_diff */ 18, -+ /* src1_ncols */ 18, -+ /* ne00 */ 1024, -+ /* ne10 */ 1024, -+ /* ldc */ 18, -+ /* supports_bf16 */ true, -+ /* use_fp16 */ false, -+ /* fast_fp16 */ true, -+ /* force_fp32 */ false, -+ /* force_fp16 */ false, -+ /* src0_contig */ true, -+ /* full_rows */ true, -+ /* is_cdna */ false, -+ /* is_rdna4 */ false, -+ /* is_volta */ false); -+ -+ require(bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_BF16_TC, -+ "cuBLAS records native BF16 tensor-core route"); -+ -+ const ggml_cuda_cublas_route_shape nvfp4_bf16_tc = ggml_cuda_cublas_route_shape_make( -+ /* type */ 40, 0, 128, 128, 1024, 1024, 128, true, false, true, false, false, true, true, -+ false, false, false); -+ -+ require(nvfp4_bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC, -+ "cuBLAS records NVFP4 dequant-to-BF16 tensor-core route"); -+ -+ const ggml_cuda_cublas_route_shape f16_tc_16f = ggml_cuda_cublas_route_shape_make( -+ /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, false, false, true, true, -+ false, false, false); -+ -+ require(f16_tc_16f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F, -+ "cuBLAS records default FP16 tensor-core 16F compute route"); -+ -+ const ggml_cuda_cublas_route_shape f16_tc_32f = ggml_cuda_cublas_route_shape_make( -+ /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, true, false, true, true, -+ false, false, false); -+ -+ require(f16_tc_32f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F, -+ "cuBLAS records forced FP16 tensor-core 32F compute route"); -+ -+ const ggml_cuda_cublas_route_shape sgemm = ggml_cuda_cublas_route_shape_make( -+ /* type */ 0, 0, 12, 12, 1024, 1024, 12, false, false, true, false, false, true, true, -+ false, false, false); -+ -+ require(sgemm.route == GGML_CUDA_CUBLAS_ROUTE_SGEMM, -+ "cuBLAS records SGEMM fallback route"); -+ -+ const int cublas_route_n = ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), bf16_tc); -+ -+ require(cublas_route_n > 0, "cuBLAS route format returns byte count"); -+ require(std::strstr(buf, "route=bf16_tc") != nullptr, "cuBLAS trace includes route name"); -+ require(std::strstr(buf, "type=30") != nullptr, "cuBLAS trace includes src0 type"); -+ require(std::strstr(buf, "src1_type=0") != nullptr, "cuBLAS trace includes src1 type"); -+ require(std::strstr(buf, "row_diff=18") != nullptr, "cuBLAS trace includes row count"); -+ require(std::strstr(buf, "src1_ncols=18") != nullptr, "cuBLAS trace includes source column count"); -+ require(std::strstr(buf, "supports_bf16=1") != nullptr, "cuBLAS trace includes BF16 predicate"); -+ require(std::strstr(buf, "force_fp32=0") != nullptr, "cuBLAS trace includes forced compute predicate"); -+ require(std::strstr(buf, "src0_contig=1") != nullptr, "cuBLAS trace includes contiguity predicate"); -+ - return 0; - } --- -2.43.0 - diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0063-feat-cuda-trace-cublas-tensor-names.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0063-feat-cuda-trace-cublas-tensor-names.patch deleted file mode 100644 index dbf1cb23f..000000000 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0063-feat-cuda-trace-cublas-tensor-names.patch +++ /dev/null @@ -1,162 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ettore Di Giacinto -Date: Wed, 1 Jul 2026 06:38:11 +0000 -Subject: [PATCH] feat(cuda): trace cublas tensor names - -Extend LLAMA_CUBLAS_ROUTE_TRACE with src0/src1/dst tensor names so SGEMM and BF16 cuBLAS buckets can be tied back to graph nodes. - -Assisted-by: Codex:gpt-5 ---- - ggml/src/ggml-cuda/ggml-cuda.cu | 5 +++-- - ggml/src/ggml-cuda/mmq-shape-trace.h | 16 +++++++++++++--- - tests/test-cuda-mmq-shape-trace.cpp | 18 ++++++++++++------ - 3 files changed, 28 insertions(+), 11 deletions(-) - -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index eff197818..1da67d2af 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -1648,7 +1648,7 @@ static inline void ggml_cuda_cublas_route_trace(const ggml_cuda_cublas_route_sha - return; - } - -- char buf[320]; -+ char buf[512]; - ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), shape); - fprintf(stderr, "[LLAMA_CUBLAS_ROUTE] %s\n", buf); - } -@@ -1701,7 +1701,8 @@ static void ggml_cuda_op_mul_mat_cublas( - ggml_cuda_cublas_route_trace(ggml_cuda_cublas_route_shape_make( - src0->type, src1->type, row_diff, src1_ncols, ne00, ne10, ldc, - supports_bf16, use_fp16, fast_fp16, force_fp32, force_fp16, src0_contig, full_rows, -- GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA)); -+ GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA, -+ src0->name, src1->name, dst->name)); - - if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && src0_contig && full_rows) { - // Paged prefill lever (patch 0033): NVFP4 only reaches cuBLAS when -diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h -index f5b4ecf2c..b55c7467c 100644 ---- a/ggml/src/ggml-cuda/mmq-shape-trace.h -+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h -@@ -129,6 +129,9 @@ struct ggml_cuda_cublas_route_shape { - bool is_cdna; - bool is_rdna4; - bool is_volta; -+ const char * src0_name; -+ const char * src1_name; -+ const char * dst_name; - }; - - static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) { -@@ -251,7 +254,7 @@ static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make( - const int64_t ne00, const int64_t ne10, const int64_t ldc, const bool supports_bf16, - const bool use_fp16, const bool fast_fp16, const bool force_fp32, const bool force_fp16, - const bool src0_contig, const bool full_rows, const bool is_cdna, const bool is_rdna4, -- const bool is_volta) { -+ const bool is_volta, const char * src0_name, const char * src1_name, const char * dst_name) { - ggml_cuda_cublas_route route = GGML_CUDA_CUBLAS_ROUTE_SGEMM; - if (supports_bf16 && type == 40 && src0_contig && full_rows) { - route = GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC; -@@ -284,6 +287,9 @@ static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make( - is_cdna, - is_rdna4, - is_volta, -+ src0_name ? src0_name : "", -+ src1_name ? src1_name : "", -+ dst_name ? dst_name : "", - }; - } - -@@ -464,7 +470,8 @@ static inline int ggml_cuda_cublas_route_shape_format( - return std::snprintf(buf, size, - "route=%s type=%d src1_type=%d row_diff=%lld src1_ncols=%lld ne00=%lld ne10=%lld ldc=%lld " - "supports_bf16=%d use_fp16=%d fast_fp16=%d force_fp32=%d force_fp16=%d " -- "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d", -+ "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d " -+ "src0=%s src1=%s dst=%s", - ggml_cuda_cublas_route_name(shape.route), - shape.type, - shape.src1_type, -@@ -482,7 +489,10 @@ static inline int ggml_cuda_cublas_route_shape_format( - shape.full_rows ? 1 : 0, - shape.is_cdna ? 1 : 0, - shape.is_rdna4 ? 1 : 0, -- shape.is_volta ? 1 : 0); -+ shape.is_volta ? 1 : 0, -+ shape.src0_name, -+ shape.src1_name, -+ shape.dst_name); - } - - static inline int ggml_cuda_mmq_small_m_shape_format( -diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp -index 1443749c3..2547193ce 100644 ---- a/tests/test-cuda-mmq-shape-trace.cpp -+++ b/tests/test-cuda-mmq-shape-trace.cpp -@@ -27,7 +27,7 @@ int main() { - require(shape.n_active_est == 256, "active expert estimate is capped by expert count"); - require(shape.density == 4, "density is ceil(assignments / active experts)"); - -- char buf[256]; -+ char buf[512]; - const int n = ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape); - - require(n > 0, "format returns byte count"); -@@ -302,35 +302,38 @@ int main() { - /* full_rows */ true, - /* is_cdna */ false, - /* is_rdna4 */ false, -- /* is_volta */ false); -+ /* is_volta */ false, -+ /* src0_name */ "blk.0.proj.weight", -+ /* src1_name */ "blk.0.proj.inp", -+ /* dst_name */ "blk.0.proj.out"); - - require(bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_BF16_TC, - "cuBLAS records native BF16 tensor-core route"); - - const ggml_cuda_cublas_route_shape nvfp4_bf16_tc = ggml_cuda_cublas_route_shape_make( - /* type */ 40, 0, 128, 128, 1024, 1024, 128, true, false, true, false, false, true, true, -- false, false, false); -+ false, false, false, "nvfp4.weight", "nvfp4.inp", "nvfp4.out"); - - require(nvfp4_bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC, - "cuBLAS records NVFP4 dequant-to-BF16 tensor-core route"); - - const ggml_cuda_cublas_route_shape f16_tc_16f = ggml_cuda_cublas_route_shape_make( - /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, false, false, true, true, -- false, false, false); -+ false, false, false, "f16.weight", "f16.inp", "f16.out"); - - require(f16_tc_16f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F, - "cuBLAS records default FP16 tensor-core 16F compute route"); - - const ggml_cuda_cublas_route_shape f16_tc_32f = ggml_cuda_cublas_route_shape_make( - /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, true, false, true, true, -- false, false, false); -+ false, false, false, "f16.weight", "f16.inp", "f16.out"); - - require(f16_tc_32f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F, - "cuBLAS records forced FP16 tensor-core 32F compute route"); - - const ggml_cuda_cublas_route_shape sgemm = ggml_cuda_cublas_route_shape_make( - /* type */ 0, 0, 12, 12, 1024, 1024, 12, false, false, true, false, false, true, true, -- false, false, false); -+ false, false, false, "f32.weight", "f32.inp", "f32.out"); - - require(sgemm.route == GGML_CUDA_CUBLAS_ROUTE_SGEMM, - "cuBLAS records SGEMM fallback route"); -@@ -346,6 +349,9 @@ int main() { - require(std::strstr(buf, "supports_bf16=1") != nullptr, "cuBLAS trace includes BF16 predicate"); - require(std::strstr(buf, "force_fp32=0") != nullptr, "cuBLAS trace includes forced compute predicate"); - require(std::strstr(buf, "src0_contig=1") != nullptr, "cuBLAS trace includes contiguity predicate"); -+ require(std::strstr(buf, "src0=blk.0.proj.weight") != nullptr, "cuBLAS trace includes src0 name"); -+ require(std::strstr(buf, "src1=blk.0.proj.inp") != nullptr, "cuBLAS trace includes src1 name"); -+ require(std::strstr(buf, "dst=blk.0.proj.out") != nullptr, "cuBLAS trace includes dst name"); - - return 0; - } --- -2.43.0 -