patches(paged): trim series to Phase135 routed-FFN line, sync to fork 1edddc8fe

The campaign patches 0048-0063 were added without matching fork commits. After a keep/drop review, the series is trimmed and re-mirrored 1:1 onto the fork branch mudler/llama.cpp:localai-paged (HEAD 1edddc8fe, on 51168c5ee). Kept, renumbered from the fork (now carry Assisted-by + Signed-off-by): - 0048 test(paged): cover MoE swiglu down chain (was 0051, fd920cf8a) - 0049 test(paged): cover MoE weighted combine chain (was 0052, a85c1e098) - 0050 test(paged): cover ragged MoE dispatch (was 0053, 2fed6aacf) - 0051 fix(speculative): disable backend sampling for MTP drafts (was 0054, f1d976f06) - 0052 feat(paged): whole-pattern MoE matcher + routed-FFN fused NVFP4-quant down MMQ (new, 1edddc8fe) Dropped (no fork commits, removed from the series): - 0048-0050 W4A16 grouped-tile pack/tune/pad: dead line, W4A16 ~1.5x slower than grouped-MMQ. - 0055-0063 speculative/moe/mul-mat/cublas route traces + the rejected small-M tile-policy knob (0059). - All other 110-140 campaign markers not needed by Phase135 (GPU-sort, W4A16-direct-A, boundary trace/timing, Phase133 sorted-F32, Phase134 fused-SWIGLU, Phase138 finalize) carry no code in this tree. Tree-hash proof (the mirror invariant): a fresh detached worktree at LLAMA_VERSION 0ed235ea2c17a19fc8238668653946721ed136fd with every on-disk patches/paged/0*.patch applied in numeric order (git apply) stages to tree 097c862c6834b7d8b90419b305b8402155ef8373, byte-identical to fork HEAD 1edddc8fe's tree. Series is 43 patches (0001-0047 unchanged + 0048-0052). Gated on GB10 sm_121a: default md5 MoE 8cb0ce23 / dense 5951a5b4 unchanged; opt-in md5-clean; MUL_MAT 1146/1146, MUL_MAT_ID 806/806, GATED_DELTA_NET 46/46, MOE_SWIGLU_DOWN 7/7, MUL_MAT_ID_RAGGED_MOE 6/6; six mmq_moe_quantized_raw markers with zero sorted launches on the opt-in sentinel. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-07-02 20:37:03 -04:00 · 2026-07-02 10:19:10 +00:00
parent 1aba41082b
commit b529cc5420
17 changed files with 950 additions and 2163 deletions
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0048-feat-paged-pack-W4A16-grouped-tile-metadata.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0048-feat-paged-pack-W4A16-grouped-tile-metadata.patch
@@ -1,121 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Tue, 30 Jun 2026 21:19:07 +0000
-Subject: [PATCH] feat(paged): pack W4A16 grouped tile metadata
-
-Collapse the grouped W4A16 tile metadata from three host vectors and three device copies into one aligned descriptor array and one H2D copy. This keeps the path default-off while reducing launch-side metadata overhead for the experimental W4A16 MoE prefill path.
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/w4a16-gemm.cu | 46 ++++++++++++++++++--------------
- 1 file changed, 26 insertions(+), 20 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
-index f348f31ad..899e1a23f 100644
--- a/ggml/src/ggml-cuda/w4a16-gemm.cu
-+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
-@@ -18,6 +18,14 @@ typedef tile<16, 8, nv_bfloat162> tile_A; // A operand: M=16, K=16
- typedef tile< 8, 8, nv_bfloat162> tile_B; // B operand: N=8,  K=16
- typedef tile<16, 8, float>        tile_C; // accumulator: M=16, N=8
- 
-+struct alignas(16) w4a16_tile_desc {
-+    int expert;
-+    int row0;
-+    int rows;
-+    int pad;
-+};
-+static_assert(sizeof(w4a16_tile_desc) == 16, "w4a16 tile descriptors must stay 16 bytes");
-+
- #ifndef LLAMA_W4A16_PREFILL_M
- #define LLAMA_W4A16_PREFILL_M 0
- #endif // LLAMA_W4A16_PREFILL_M
-@@ -92,9 +100,7 @@ static __global__ void w4a16_grouped_kernel(
-         const nv_bfloat16 * __restrict__ Abf,            // [pad_rows, K] bf16
-         const block_nvfp4 * __restrict__ W0,             // src0 base (expert 0)
-         float * __restrict__ C,                          // [total_rows, N] f32
-        const int * __restrict__ g_tile_expert,
-        const int * __restrict__ g_tile_row0,
-        const int * __restrict__ g_tile_rows,
-+        const w4a16_tile_desc * __restrict__ g_tiles,
-         int N, int K, int64_t expert_stride_blocks) {
- #if defined(AMPERE_MMA_AVAILABLE) && defined(CP_ASYNC_AVAILABLE)
-     constexpr int BK   = 64;                 // one nvfp4 block
-@@ -129,9 +135,10 @@ static __global__ void w4a16_grouped_kernel(
-     const int tid  = warp*32 + lane;         // linear id for the cp.async strided copies
-     const int wrow = warp / WARPS_N, wcol = warp % WARPS_N;
- 
-    const int    e        = g_tile_expert[blockIdx.y];
-    const int    row0     = g_tile_row0[blockIdx.y];
-    const int    rcount   = g_tile_rows[blockIdx.y];
-+    const w4a16_tile_desc tile_desc = g_tiles[blockIdx.y];
-+    const int    e        = tile_desc.expert;
-+    const int    row0     = tile_desc.row0;
-+    const int    rcount   = tile_desc.rows;
-     const int    blockCol = blockIdx.x*BN;
-     const int    Kb       = K/64;
-     const block_nvfp4 * We = W0 + (int64_t) e*expert_stride_blocks; // expert e weight base
-@@ -238,7 +245,7 @@ static __global__ void w4a16_grouped_kernel(
-         }
- #else
-     GGML_UNUSED(Abf); GGML_UNUSED(W0); GGML_UNUSED(C);
-    GGML_UNUSED(g_tile_expert); GGML_UNUSED(g_tile_row0); GGML_UNUSED(g_tile_rows);
-+    GGML_UNUSED(g_tiles);
-     GGML_UNUSED(N); GGML_UNUSED(K); GGML_UNUSED(expert_stride_blocks);
-     NO_DEVICE_CODE;
- #endif // AMPERE_MMA_AVAILABLE && CP_ASYNC_AVAILABLE
-@@ -296,18 +303,21 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
-         return;
-     }
- 
-    std::vector<int32_t> h_tile_expert, h_tile_row0, h_tile_rows;
-+    std::vector<w4a16_tile_desc> h_tiles;
-     int64_t row = 0;
-     for (int64_t e = 0; e < n_experts; e++) {
-         const int t = tokens_per_expert[e];
-         for (int off = 0; off < t; off += BM) {
-            h_tile_expert.push_back((int32_t) e);
-            h_tile_row0.push_back((int32_t) (row + off));
-            h_tile_rows.push_back((int32_t) std::min(BM, t - off));
-+            h_tiles.push_back({
-+                (int) e,
-+                (int) (row + off),
-+                (int) std::min(BM, t - off),
-+                0,
-+            });
-         }
-         row += t;
-     }
-    const int n_tiles = (int) h_tile_expert.size();
-+    const int n_tiles = (int) h_tiles.size();
- 
-     if (getenv("LLAMA_W4A16_DEBUG")) {
-         int max_tpe = 0, multi = 0;
-@@ -319,13 +329,9 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
-                 (long long) total_rows, (long long) n_experts, (long long) K, (long long) N, n_tiles, max_tpe, multi);
-     }
- 
-    // device: tile map
-    ggml_cuda_pool_alloc<int32_t> d_tile_expert(ctx.pool(), n_tiles);
-    ggml_cuda_pool_alloc<int32_t> d_tile_row0  (ctx.pool(), n_tiles);
-    ggml_cuda_pool_alloc<int32_t> d_tile_rows  (ctx.pool(), n_tiles);
-    CUDA_CHECK(cudaMemcpyAsync(d_tile_expert.ptr, h_tile_expert.data(), n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
-    CUDA_CHECK(cudaMemcpyAsync(d_tile_row0.ptr,   h_tile_row0.data(),   n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
-    CUDA_CHECK(cudaMemcpyAsync(d_tile_rows.ptr,   h_tile_rows.data(),   n_tiles*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
-+    // device: packed tile map; one pageable H2D copy instead of three tiny copies
-+    ggml_cuda_pool_alloc<w4a16_tile_desc> d_tiles(ctx.pool(), n_tiles);
-+    CUDA_CHECK(cudaMemcpyAsync(d_tiles.ptr, h_tiles.data(), n_tiles*sizeof(w4a16_tile_desc), cudaMemcpyHostToDevice, stream));
- 
-     // activations: f32 -> bf16 (cheap cast, NO act-quant), zero-padded so every tile's BM-row read
-     // stays in-bounds. A tile's row0 is generally NOT BM-aligned (experts start mid-buffer), and a
-@@ -353,7 +359,7 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
-     dim3 block(32, WARPS_M*WARPS_N);   // 2D: threadIdx.x = warp lane, threadIdx.y = warp
-     kern<<<grid, block, smem_bytes, stream>>>(
-         Abf.get(), (const block_nvfp4 *) src0->data, dst_sorted,
-        d_tile_expert.ptr, d_tile_row0.ptr, d_tile_rows.ptr,
-+        d_tiles.ptr,
-         (int) N, (int) K, expert_stride_blocks);
-     CUDA_CHECK(cudaGetLastError());
- }
-- 
-2.43.0
-
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0048-test-paged-cover-MoE-swiglu-down-chain.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0048-test-paged-cover-MoE-swiglu-down-chain.patch
@@ -1,8 +1,10 @@
-From cd56cf037379b084d6bb0ed47db8b785c828be86 Mon Sep 17 00:00:00 2001
+From fd920cf8a7fe9cc7753cd0640411ce771edfeaca Mon Sep 17 00:00:00 2001
 From: Ettore Di Giacinto <mudler@localai.io>
 Date: Tue, 30 Jun 2026 23:18:38 +0000
-Subject: [PATCH] test(paged): cover MoE swiglu down chain
+Subject: [PATCH 48/52] test(paged): cover MoE swiglu down chain

+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
 ---
 tests/test-backend-ops.cpp | 92 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0049-feat-paged-tune-W4A16-grouped-tile-shape.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0049-feat-paged-tune-W4A16-grouped-tile-shape.patch
@@ -1,93 +0,0 @@
-From 7dfa0e17548c5f04f83d2cc2a057b0a9941b599a Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Tue, 30 Jun 2026 21:44:54 +0000
-Subject: [PATCH] feat(paged): tune W4A16 grouped tile shape
-
-Select a BM=32 grouped W4A16 MoE prefill shape by default after the GB10 shape sweep, while keeping the prior 64x128 shape and additional diagnostics selectable through LLAMA_W4A16_SHAPE.
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/w4a16-gemm.cu | 42 +++++++++++++++++++++++++++++---
- 1 file changed, 38 insertions(+), 4 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
-index 899e1a23f..ca8864292 100644
--- a/ggml/src/ggml-cuda/w4a16-gemm.cu
-+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
-@@ -4,6 +4,7 @@
- #include <algorithm>
- #include <cstdint>
- #include <cstdlib>
-+#include <cstring>
- #include <vector>
- 
- // ===========================================================================
-@@ -281,7 +282,23 @@ bool ggml_cuda_w4a16_moe_grouped_should_engage(
-     return true;
- }
- 
-void ggml_cuda_mul_mat_id_w4a16_grouped(
-+static bool ggml_cuda_w4a16_use_base_shape() {
-+    static const bool use_base = [] {
-+        const char * e = getenv("LLAMA_W4A16_SHAPE");
-+        if (e == nullptr || e[0] == '\0' || strcmp(e, "default") == 0 || strcmp(e, "bm32") == 0 || strcmp(e, "32x128") == 0) {
-+            return false;
-+        }
-+        if (strcmp(e, "base") == 0 || strcmp(e, "64x128") == 0) {
-+            return true;
-+        }
-+        fprintf(stderr, "[w4a16] unknown LLAMA_W4A16_SHAPE=%s, using default bm32\n", e);
-+        return false;
-+    }();
-+    return use_base;
-+}
-+
-+template<int BM, int BN, int WARPS_M, int WARPS_N, int STAGES>
-+static void ggml_cuda_mul_mat_id_w4a16_grouped_impl(
-         ggml_backend_cuda_context & ctx,
-         const ggml_tensor * src0,
-         const float * src1_sorted,
-@@ -290,9 +307,7 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
-         int64_t n_experts, int64_t K, int64_t N,
-         cudaStream_t stream) {
-     GGML_ASSERT(src0->type == GGML_TYPE_NVFP4);
-    GGML_ASSERT(N % 128 == 0 && K % 64 == 0);
-
-    constexpr int BM = 64, BN = 128, WARPS_M = 2, WARPS_N = 4, STAGES = 2;
-+    GGML_ASSERT(N % BN == 0 && K % 64 == 0);
- 
-     // host: build the per-M-tile expert map (ragged, no tile crosses an expert boundary)
-     int64_t total_rows = 0;
-@@ -327,6 +342,8 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
-         }
-         fprintf(stderr, "[w4a16] engaged: total_rows=%lld n_experts=%lld K=%lld N=%lld n_tiles=%d max_tpe=%d multi_tile_experts=%d\n",
-                 (long long) total_rows, (long long) n_experts, (long long) K, (long long) N, n_tiles, max_tpe, multi);
-+        fprintf(stderr, "[w4a16] shape: BM=%d BN=%d WARPS_M=%d WARPS_N=%d STAGES=%d\n",
-+                BM, BN, WARPS_M, WARPS_N, STAGES);
-     }
- 
-     // device: packed tile map; one pageable H2D copy instead of three tiny copies
-@@ -363,3 +380,20 @@ void ggml_cuda_mul_mat_id_w4a16_grouped(
-         (int) N, (int) K, expert_stride_blocks);
-     CUDA_CHECK(cudaGetLastError());
- }
-+
-+void ggml_cuda_mul_mat_id_w4a16_grouped(
-+        ggml_backend_cuda_context & ctx,
-+        const ggml_tensor * src0,
-+        const float * src1_sorted,
-+        float * dst_sorted,
-+        const int * tokens_per_expert,
-+        int64_t n_experts, int64_t K, int64_t N,
-+        cudaStream_t stream) {
-+    if (ggml_cuda_w4a16_use_base_shape()) {
-+        ggml_cuda_mul_mat_id_w4a16_grouped_impl<64, 128, 2, 4, 2>(
-+                ctx, src0, src1_sorted, dst_sorted, tokens_per_expert, n_experts, K, N, stream);
-+    } else {
-+        ggml_cuda_mul_mat_id_w4a16_grouped_impl<32, 128, 1, 4, 2>(
-+                ctx, src0, src1_sorted, dst_sorted, tokens_per_expert, n_experts, K, N, stream);
-+    }
-+}
-- 
-2.43.0
-
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0049-test-paged-cover-MoE-weighted-combine-chain.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0049-test-paged-cover-MoE-weighted-combine-chain.patch
@@ -1,8 +1,10 @@
-From 3ef7eb9e412eb34f8656675862f6753c65d28ec9 Mon Sep 17 00:00:00 2001
+From a85c1e098e22eb587fd80220986f35a8d6e11300 Mon Sep 17 00:00:00 2001
 From: Ettore Di Giacinto <mudler@localai.io>
 Date: Tue, 30 Jun 2026 23:50:33 +0000
-Subject: [PATCH] test(paged): cover MoE weighted combine chain
+Subject: [PATCH 49/52] test(paged): cover MoE weighted combine chain

+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
 ---
 tests/test-backend-ops.cpp | 90 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0050-feat-paged-pad-W4A16-A-shared-tile-stride.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0050-feat-paged-pad-W4A16-A-shared-tile-stride.patch
@@ -1,56 +0,0 @@
-From d9b9be0bee3d7239132bfca05d5b057ff4ee4cc3 Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Tue, 30 Jun 2026 22:13:09 +0000
-Subject: [PATCH] feat(paged): pad W4A16 A shared tile stride
-
-Pad the grouped W4A16 A operand shared-memory row stride to reduce bank pressure on GB10 while preserving the selected bm32/default and base launch shapes.
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/w4a16-gemm.cu | 9 +++++----
- 1 file changed, 5 insertions(+), 4 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/w4a16-gemm.cu b/ggml/src/ggml-cuda/w4a16-gemm.cu
-index ca8864292..b17e022b0 100644
--- a/ggml/src/ggml-cuda/w4a16-gemm.cu
-+++ b/ggml/src/ggml-cuda/w4a16-gemm.cu
-@@ -111,7 +111,8 @@ static __global__ void w4a16_grouped_kernel(
-     constexpr int MF = WM/16, NF = WN/8;
- 
-     constexpr int AN  = BK/2;                 // bf16 pairs per A smem row (nv_bfloat162)
-    constexpr int SZ_A   = BM*AN;             // nv_bfloat162 per stage
-+    constexpr int ASTR = AN + 4;              // padded A smem row stride, in nv_bfloat162
-+    constexpr int SZ_A   = BM*ASTR;           // nv_bfloat162 per stage
-     constexpr int SZ_WQ  = BN*8;              // u32 per stage (32 qs bytes/row)
-     constexpr int SZ_WD  = BN;                // u32 per stage (4 scale bytes/row)
- 
-@@ -155,7 +156,7 @@ static __global__ void w4a16_grouped_kernel(
-             const int c = idx % (BK/8);          // 16B chunk in the row
-             const int r = idx / (BK/8);          // row in tile
-             const nv_bfloat16 * src = Abf + (int64_t)(row0 + r)*K + (int64_t)kt*BK + c*8;
-            w4a16_cp_async16(((char *) sA[st]) + (r*AN + c*4)*sizeof(uint32_t), src);
-+            w4a16_cp_async16(((char *) sA[st]) + (r*ASTR + c*4)*sizeof(uint32_t), src);
-         }
-         // W qs: BN rows x 32 bytes = BN x 8 u32 (each block's qs at byte offset 4)
- #pragma unroll 1
-@@ -198,7 +199,7 @@ static __global__ void w4a16_grouped_kernel(
- #pragma unroll
-             for (int mi = 0; mi < MF; mi++) {
-                 const int rb = wrow*WM + mi*16;
-                load_ldmatrix(A_frag[mi], sAcur + rb*AN + kk*8, AN);
-+                load_ldmatrix(A_frag[mi], sAcur + rb*ASTR + kk*8, ASTR);
-             }
-             // B fragments: in-register FP4->bf16 dequant (correct-by-construction via tile get_i/get_j)
-             tile_B B_frag[NF];
-@@ -368,7 +369,7 @@ static void ggml_cuda_mul_mat_id_w4a16_grouped_impl(
-     const int64_t expert_stride_blocks = (int64_t) (src0->nb[2] / sizeof(block_nvfp4));
- 
-     auto kern = w4a16_grouped_kernel<BM, BN, WARPS_M, WARPS_N, STAGES>;
-    constexpr int STAGE_U32 = BM*(64/2) + BN*8 + BN;
-+    constexpr int STAGE_U32 = BM*((64/2) + 4) + BN*8 + BN;
-     const int smem_bytes = STAGES * STAGE_U32 * (int) sizeof(uint32_t);
-     CUDA_CHECK(cudaFuncSetAttribute(kern, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
- 
-- 
-2.43.0
-
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0050-test-paged-cover-ragged-MoE-dispatch.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0050-test-paged-cover-ragged-MoE-dispatch.patch
@@ -1,8 +1,10 @@
-From e21732fc47206d5878e3b977bbd21858a3ba4ab0 Mon Sep 17 00:00:00 2001
+From 2fed6aacff14537864bbf754c7552740131d4eaf Mon Sep 17 00:00:00 2001
 From: Ettore Di Giacinto <mudler@localai.io>
 Date: Wed, 1 Jul 2026 00:39:52 +0000
-Subject: [PATCH] test(paged): cover ragged MoE dispatch
+Subject: [PATCH 50/52] test(paged): cover ragged MoE dispatch

+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
 ---
 tests/test-backend-ops.cpp | 118 +++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-fix-speculative-disable-backend-sampling-for-MTP-drafts.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-fix-speculative-disable-backend-sampling-for-MTP-drafts.patch
@@ -1,8 +1,11 @@
-From 3eba64afff6ecaa25da11f0e394717224f221c9a Mon Sep 17 00:00:00 2001
+From f1d976f06fb92655106709256dd093ffefb85e2b Mon Sep 17 00:00:00 2001
 From: Ettore Di Giacinto <mudler@localai.io>
 Date: Wed, 1 Jul 2026 00:50:36 +0000
-Subject: [PATCH] fix(speculative): disable backend sampling for MTP drafts
+Subject: [PATCH 51/52] fix(speculative): disable backend sampling for MTP
+ drafts

+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
 ---
 common/speculative.cpp | 6 ++++++
 1 file changed, 6 insertions(+)
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-feat-paged-whole-pattern-MoE-matcher-routed-FFN-fuse.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-feat-paged-whole-pattern-MoE-matcher-routed-FFN-fuse.patch
@@ -0,0 +1,933 @@
+From 1edddc8fe93bb2fec5f831bbde5df2b7480a7b05 Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Thu, 2 Jul 2026 12:15:38 +0200
+Subject: [PATCH 52/52] feat(paged): whole-pattern MoE matcher + routed-FFN
+ fused NVFP4-quant down MMQ
+
+Add the routed-FFN fused-quant line for the paged NVFP4 MoE decode step,
+all default-off and md5-clean, gating a fused SwiGLU-to-NVFP4-quant plus a
+raw pre-quantized down-projection MMQ that skips the intermediate F32
+materialize + re-quantize of the standard gate_up -> SwiGLU -> down chain.
+
+Pieces (all guarded, no effect unless explicitly enabled):
+
+- Whole-pattern MoE matcher + executor hook in ggml_cuda_try_fuse
+  (LLAMA_MOE_WHOLE_PATTERN_EXEC and the *_TRACE/_EARLY_TRACE diagnostics).
+  Detects the gate_up(MUL_MAT_ID) -> view/view -> SwiGLU(GLU) -> down(MUL_MAT_ID)
+  sub-graph early in the fusion pass and, when engaged, runs the whole chain
+  through a single executor instead of node-by-node.
+
+- Routed-FFN PoC scaffold ggml/src/ggml-cuda/moe-ffn.{cu,cuh} + a narrow hook
+  (LLAMA_MOE_ROUTED_FFN_POC). ggml_cuda_compute_forward is de-static-ed so the
+  executor translation unit can drive the standard op path for the fallback
+  legs. The executor tries the fused-quant path first, else falls back to the
+  stock compute_forward for glu + down (bit-identical to default).
+
+- Fused SwiGLU-to-NVFP4-quant + raw down MMQ (LLAMA_MOE_ROUTED_FFN_FUSED_QUANT):
+  moe_swiglu_nvfp4_quant_kernel writes block_fp4_mmq activations directly, then
+  ggml_cuda_mul_mat_q_moe_quantized (with the local ggml_cuda_mmq_ids_meta
+  refactor of the expert-sorted ids/bounds prep) runs the down GEMM on the
+  pre-quantized rows. Native FP4 (Blackwell) only; NVFP4 down weights only.
+
+Gated on GB10 (sm_121a), before/after this commit:
+- Canonical default-path greedy md5 unchanged: MoE q36-35b-a3b-nvfp4
+  8cb0ce23777bf55f92f63d0292c756b0, dense q36-27b-nvfp4
+  5951a5b4d624ce891e22ab5fca9bc439.
+- md5-clean opt-in: LLAMA_MOE_ROUTED_FFN_POC=1 LLAMA_MOE_ROUTED_FFN_FUSED_QUANT=1
+  keeps the MoE md5 byte-identical (8cb0ce23...).
+- test-backend-ops: MUL_MAT 1146/1146, MUL_MAT_ID 806/806, GATED_DELTA_NET 46/46,
+  and the MoE sentinels MOE_SWIGLU_DOWN 7/7 + MUL_MAT_ID_RAGGED_MOE 6/6 pass both
+  default and opt-in. Opt-in emits exactly six route=mmq_moe_quantized_raw markers
+  with zero mmq_moe_sorted_raw launches (fused path provably engaged).
+- Serving effect is flat-to-slightly-positive and not a shipped default:
+  decode agg 326.9 -> 332.7 t/s, mmq_nvfp4 6009 -> 5915 ms, aggregate flat
+  (~/bench/phase135_routed_ffn_fused_quant_serving/20260702_082102).
+
+The rejected/neutral neighbours of this line (Phase133 sorted-F32 down,
+Phase134 fused-SWIGLU-only, Phase138 finalize/weighted-combine fusion, the
+W4A16 grouped-tile pack/tune/pad line, GPU-sort, boundary/layout/quant traces)
+are deliberately excluded and carry no markers in this tree.
+
+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu | 304 +++++++++++++++++++++++++++++++-
+ ggml/src/ggml-cuda/mmq.cu       | 148 ++++++++++++++++
+ ggml/src/ggml-cuda/mmq.cuh      |  25 +++
+ ggml/src/ggml-cuda/moe-ffn.cu   | 296 +++++++++++++++++++++++++++++++
+ ggml/src/ggml-cuda/moe-ffn.cuh  |  24 +++
+ 5 files changed, 796 insertions(+), 1 deletion(-)
+ create mode 100644 ggml/src/ggml-cuda/moe-ffn.cu
+ create mode 100644 ggml/src/ggml-cuda/moe-ffn.cuh
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index 374949f25..ef1bdc3b4 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -32,6 +32,7 @@
+ #include "ggml-cuda/im2col.cuh"
+ #include "ggml-cuda/mmf.cuh"
+ #include "ggml-cuda/mmq.cuh"
+#include "ggml-cuda/moe-ffn.cuh"
+ #include "ggml-cuda/mmvf.cuh"
+ #include "ggml-cuda/mmvq.cuh"
+ #include "ggml-cuda/norm.cuh"
+@@ -2854,7 +2855,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
+         nb1, nb2, nb3, stream);
+ }
+ 
+-static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
+bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
+     switch (dst->op) {
+         case GGML_OP_ARGMAX:
+             ggml_cuda_argmax(ctx, dst);
+@@ -4032,6 +4033,201 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph *                cgraph,
+     return false;
+ }
+ 
+static inline const char * ggml_cuda_moe_wp_trace_tensor_name(const ggml_tensor * t) {
+    return t != nullptr && t->name[0] != '\0' ? t->name : "-";
+}
+
+static inline int ggml_cuda_moe_whole_pattern_trace_limit() {
+    static const int value = []() {
+        const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_TRACE");
+        if (s == nullptr || strcmp(s, "0") == 0) {
+            return 0;
+        }
+        const int parsed = atoi(s);
+        return parsed > 0 ? parsed : 128;
+    }();
+
+    return value;
+}
+
+static inline bool ggml_cuda_moe_whole_pattern_trace_take(std::atomic<int> & counter) {
+    const int trace_limit = ggml_cuda_moe_whole_pattern_trace_limit();
+    if (trace_limit <= 0) {
+        return false;
+    }
+
+    const int trace_idx = counter.fetch_add(1, std::memory_order_relaxed);
+    return trace_idx < trace_limit;
+}
+
+static inline int ggml_cuda_moe_whole_pattern_early_trace_limit() {
+    static const int value = []() {
+        const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EARLY_TRACE");
+        if (s == nullptr || strcmp(s, "0") == 0) {
+            return 0;
+        }
+        const int parsed = atoi(s);
+        return parsed > 0 ? parsed : 128;
+    }();
+
+    return value;
+}
+
+static inline bool ggml_cuda_moe_whole_pattern_exec_enabled() {
+    static const bool value = []() {
+        const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EXEC");
+        return s != nullptr && atoi(s) != 0;
+    }();
+
+    return value;
+}
+
+static inline int ggml_cuda_moe_whole_pattern_exec_trace_limit() {
+    static const int value = []() {
+        const char * s = getenv("LLAMA_MOE_WHOLE_PATTERN_EXEC_TRACE");
+        if (s == nullptr || strcmp(s, "0") == 0) {
+            return 0;
+        }
+        const int parsed = atoi(s);
+        return parsed > 0 ? parsed : 128;
+    }();
+
+    return value;
+}
+
+static inline bool ggml_cuda_moe_whole_pattern_exec_trace_take(std::atomic<int> & counter) {
+    const int trace_limit = ggml_cuda_moe_whole_pattern_exec_trace_limit();
+    if (trace_limit <= 0) {
+        return false;
+    }
+
+    const int trace_idx = counter.fetch_add(1, std::memory_order_relaxed);
+    return trace_idx < trace_limit;
+}
+
+struct ggml_cuda_moe_whole_pattern {
+    const ggml_tensor * gate_up = nullptr;
+    const ggml_tensor * gate    = nullptr;
+    const ggml_tensor * up      = nullptr;
+    const ggml_tensor * glu     = nullptr;
+    const ggml_tensor * down    = nullptr;
+    const ggml_tensor * ids     = nullptr;
+
+    bool view_pair      = false;
+    bool ids_match      = false;
+    bool swiglu         = false;
+    bool supported_type = false;
+    bool supported      = false;
+};
+
+static ggml_cuda_moe_whole_pattern ggml_cuda_moe_whole_pattern_detect(const ggml_tensor * glu, const ggml_tensor * down) {
+    ggml_cuda_moe_whole_pattern pattern{};
+    pattern.glu  = glu;
+    pattern.down = down;
+
+    if (glu == nullptr || down == nullptr || glu->op != GGML_OP_GLU || down->op != GGML_OP_MUL_MAT_ID) {
+        return pattern;
+    }
+
+    pattern.gate = glu->src[0];
+    pattern.up   = glu->src[1];
+    pattern.ids  = down->src[2];
+
+    pattern.view_pair = pattern.gate != nullptr && pattern.up != nullptr &&
+                        pattern.gate->op == GGML_OP_VIEW && pattern.up->op == GGML_OP_VIEW &&
+                        pattern.gate->view_src != nullptr && pattern.gate->view_src == pattern.up->view_src;
+    if (!pattern.view_pair) {
+        return pattern;
+    }
+
+    pattern.gate_up = pattern.gate->view_src;
+    if (pattern.gate_up == nullptr || pattern.gate_up->op != GGML_OP_MUL_MAT_ID) {
+        return pattern;
+    }
+
+    pattern.ids_match      = pattern.gate_up->src[2] == pattern.ids;
+    pattern.swiglu         = ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU;
+    pattern.supported_type = down->src[0] != nullptr &&
+                             (down->src[0]->type == GGML_TYPE_NVFP4 || down->src[0]->type == GGML_TYPE_MXFP4);
+    pattern.supported      = pattern.ids_match && pattern.swiglu && pattern.supported_type;
+
+    return pattern;
+}
+
+static ggml_cuda_moe_whole_pattern ggml_cuda_moe_whole_pattern_detect_early(const ggml_cgraph * cgraph, int i) {
+    ggml_cuda_moe_whole_pattern pattern{};
+
+    if (cgraph == nullptr || i + 4 >= cgraph->n_nodes) {
+        return pattern;
+    }
+
+    const ggml_tensor * gate_up = cgraph->nodes[i + 0];
+    const ggml_tensor * view0   = cgraph->nodes[i + 1];
+    const ggml_tensor * view1   = cgraph->nodes[i + 2];
+    const ggml_tensor * glu     = cgraph->nodes[i + 3];
+    const ggml_tensor * down    = cgraph->nodes[i + 4];
+
+    pattern.gate_up = gate_up;
+    pattern.glu     = glu;
+    pattern.down    = down;
+
+    if (gate_up == nullptr || view0 == nullptr || view1 == nullptr || glu == nullptr || down == nullptr ||
+        gate_up->op != GGML_OP_MUL_MAT_ID || view0->op != GGML_OP_VIEW || view1->op != GGML_OP_VIEW ||
+        glu->op != GGML_OP_GLU || down->op != GGML_OP_MUL_MAT_ID) {
+        return pattern;
+    }
+
+    pattern.view_pair = view0->view_src == gate_up && view1->view_src == gate_up;
+    if (!pattern.view_pair) {
+        return pattern;
+    }
+
+    if (glu->src[0] == view0 && glu->src[1] == view1) {
+        pattern.gate = view0;
+        pattern.up   = view1;
+    } else if (glu->src[0] == view1 && glu->src[1] == view0) {
+        pattern.gate = view1;
+        pattern.up   = view0;
+    } else {
+        return pattern;
+    }
+
+    if (down->src[1] != glu) {
+        return pattern;
+    }
+
+    pattern.ids            = down->src[2];
+    pattern.ids_match      = gate_up->src[2] == pattern.ids;
+    pattern.swiglu         = ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU;
+    pattern.supported_type = down->src[0] != nullptr &&
+                             (down->src[0]->type == GGML_TYPE_NVFP4 || down->src[0]->type == GGML_TYPE_MXFP4);
+    pattern.supported      = pattern.ids_match && pattern.swiglu && pattern.supported_type;
+
+    return pattern;
+}
+
+static bool ggml_cuda_moe_whole_pattern_exec_proof(
+        ggml_backend_cuda_context * cuda_ctx,
+        const ggml_cuda_moe_whole_pattern & pattern) {
+    GGML_ASSERT(cuda_ctx != nullptr);
+    GGML_ASSERT(pattern.supported);
+    GGML_ASSERT(pattern.gate_up != nullptr);
+    GGML_ASSERT(pattern.glu != nullptr);
+    GGML_ASSERT(pattern.down != nullptr);
+
+    if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast<ggml_tensor *>(pattern.gate_up))) {
+        return false;
+    }
+    if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast<ggml_tensor *>(pattern.glu))) {
+        return false;
+    }
+    if (!ggml_cuda_compute_forward(*cuda_ctx, const_cast<ggml_tensor *>(pattern.down))) {
+        return false;
+    }
+
+    return true;
+}
+
+ // try and fuse nodes and return the number of nodes to skip
+ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, int i) {
+ 
+@@ -4042,6 +4238,112 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
+ 
+     ggml_tensor * node = cgraph->nodes[i];
+ 
+    static std::atomic<int> moe_whole_pattern_early_trace_count{0};
+    const bool routed_ffn_poc = ggml_cuda_moe_routed_ffn_poc_enabled();
+    const bool whole_pattern_exec = ggml_cuda_moe_whole_pattern_exec_enabled();
+    const int whole_pattern_early_trace_limit = ggml_cuda_moe_whole_pattern_early_trace_limit();
+    if (node->op == GGML_OP_MUL_MAT_ID &&
+            (routed_ffn_poc || whole_pattern_exec || whole_pattern_early_trace_limit > 0)) {
+        const ggml_cuda_moe_whole_pattern pattern = ggml_cuda_moe_whole_pattern_detect_early(cgraph, i);
+        if (pattern.view_pair) {
+            const int trace_idx = moe_whole_pattern_early_trace_count.fetch_add(1, std::memory_order_relaxed);
+            if (trace_idx < whole_pattern_early_trace_limit) {
+                const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr;
+                const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr;
+                fprintf(stderr,
+                    "[LLAMA_MOE_WHOLE_PATTERN_EARLY] supported=%d skip_ready=%d gate_up=%s gate=%s up=%s glu=%s down=%s ids=%s type=%s"
+                    " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64
+                    " n_embd=%" PRId64 " n_ff=%" PRId64
+                    " ids_match=%d swiglu=%d\n",
+                    pattern.supported ? 1 : 0,
+                    pattern.supported ? 4 : 0,
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up),
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.gate),
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.up),
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.glu),
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.down),
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.ids),
+                    down_w != nullptr ? ggml_type_name(down_w->type) : "-",
+                    down_x != nullptr ? down_x->ne[2] : 0,
+                    pattern.ids != nullptr ? pattern.ids->ne[0] : 0,
+                    down_w != nullptr ? down_w->ne[2] : 0,
+                    down_w != nullptr ? down_w->ne[1] : 0,
+                    down_w != nullptr ? down_w->ne[0] : 0,
+                    pattern.ids_match ? 1 : 0,
+                    pattern.swiglu ? 1 : 0);
+            }
+        }
+
+        const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+        const bool poc_supported = routed_ffn_poc && ggml_cuda_moe_routed_ffn_poc_should_engage(
+            pattern.gate_up, pattern.gate, pattern.up, pattern.glu, pattern.down, pattern.ids, cc);
+
+        if ((poc_supported || (whole_pattern_exec && pattern.supported))) {
+            const bool ok = poc_supported ?
+                ggml_cuda_moe_routed_ffn_poc(
+                    *cuda_ctx,
+                    const_cast<ggml_tensor *>(pattern.gate_up),
+                    const_cast<ggml_tensor *>(pattern.gate),
+                    const_cast<ggml_tensor *>(pattern.up),
+                    const_cast<ggml_tensor *>(pattern.glu),
+                    const_cast<ggml_tensor *>(pattern.down)) :
+                ggml_cuda_moe_whole_pattern_exec_proof(cuda_ctx, pattern);
+            GGML_ASSERT(ok);
+
+            static std::atomic<int> moe_whole_pattern_exec_trace_count{0};
+            if (ggml_cuda_moe_whole_pattern_exec_trace_take(moe_whole_pattern_exec_trace_count)) {
+                const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr;
+                const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr;
+                fprintf(stderr,
+                    "[LLAMA_MOE_WHOLE_PATTERN_EXEC] skip=4 gate_up=%s glu=%s down=%s ids=%s"
+                    " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64 "\n",
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up),
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.glu),
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.down),
+                    ggml_cuda_moe_wp_trace_tensor_name(pattern.ids),
+                    down_x != nullptr ? down_x->ne[2] : 0,
+                    pattern.ids != nullptr ? pattern.ids->ne[0] : 0,
+                    down_w != nullptr ? down_w->ne[2] : 0);
+            }
+
+            return 4;
+        }
+    }
+
+    if (node->op == GGML_OP_GLU && i + 1 < cgraph->n_nodes && cgraph->nodes[i + 1]->op == GGML_OP_MUL_MAT_ID) {
+        static std::atomic<int> moe_whole_pattern_trace_count{0};
+        const bool whole_trace = ggml_cuda_moe_whole_pattern_trace_take(moe_whole_pattern_trace_count);
+
+        if (whole_trace) {
+            const ggml_tensor * down = cgraph->nodes[i + 1];
+            const ggml_cuda_moe_whole_pattern pattern = ggml_cuda_moe_whole_pattern_detect(node, down);
+
+            const ggml_tensor * down_w = pattern.down != nullptr ? pattern.down->src[0] : nullptr;
+            const ggml_tensor * down_x = pattern.down != nullptr ? pattern.down->src[1] : nullptr;
+            fprintf(stderr,
+                "[LLAMA_MOE_WHOLE_PATTERN] supported=%d gate_up=%s gate=%s up=%s glu=%s down=%s ids=%s type=%s"
+                " n_tokens=%" PRId64 " n_used=%" PRId64 " experts=%" PRId64
+                " n_embd=%" PRId64 " n_ff=%" PRId64
+                " view_pair=%d ids_match=%d swiglu=%d\n",
+                pattern.supported ? 1 : 0,
+                ggml_cuda_moe_wp_trace_tensor_name(pattern.gate_up),
+                ggml_cuda_moe_wp_trace_tensor_name(pattern.gate),
+                ggml_cuda_moe_wp_trace_tensor_name(pattern.up),
+                ggml_cuda_moe_wp_trace_tensor_name(pattern.glu),
+                ggml_cuda_moe_wp_trace_tensor_name(pattern.down),
+                ggml_cuda_moe_wp_trace_tensor_name(pattern.ids),
+                down_w != nullptr ? ggml_type_name(down_w->type) : "-",
+                down_x != nullptr ? down_x->ne[2] : 0,
+                pattern.ids != nullptr ? pattern.ids->ne[0] : 0,
+                down_w != nullptr ? down_w->ne[2] : 0,
+                down_w != nullptr ? down_w->ne[1] : 0,
+                down_w != nullptr ? down_w->ne[0] : 0,
+                pattern.view_pair ? 1 : 0,
+                pattern.ids_match ? 1 : 0,
+                pattern.swiglu ? 1 : 0);
+        }
+    }
+
+     //topk-moe
+     if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
+             cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
+diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
+index dc5c2d198..d8f39d395 100644
+--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
+@@ -1,4 +1,7 @@
+ #include <cstdlib>
+#include <atomic>
+#include <cinttypes>
+#include <cstdio>
+ #include "common.cuh"
+ #include "mmq.cuh"
+ #include "quantize.cuh"
+@@ -75,6 +78,151 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
+     }
+ }
+ 
+static inline int ggml_cuda_quant_trace_limit() {
+    static const int value = []() {
+        const char * s = getenv("LLAMA_QUANT_TRACE");
+        return s ? atoi(s) : 0;
+    }();
+
+    return value;
+}
+
+static inline const char * ggml_cuda_quant_trace_tensor_name(const ggml_tensor * t) {
+    return t != nullptr && t->name[0] != '\0' ? t->name : "-";
+}
+
+static inline void ggml_cuda_quant_trace(
+        const char * route, const ggml_tensor * src0, const ggml_tensor * src1,
+        const ggml_tensor * ids, const ggml_tensor * dst, const int native_fp4,
+        const int dedup, const int gathered, const int64_t ne10, const int64_t ne10_padded,
+        const int64_t rows, const int64_t ne12, const int64_t n_expert_used) {
+    const int trace_limit = ggml_cuda_quant_trace_limit();
+    if (trace_limit <= 0) {
+        return;
+    }
+
+    static std::atomic<int> trace_count{0};
+    const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
+    if (trace_idx >= trace_limit) {
+        return;
+    }
+
+    fprintf(stderr,
+        "[LLAMA_QUANT_TRACE] route=%s src0=%s src0_type=%s src1=%s src1_type=%s dst=%s dst_type=%s ids=%s "
+        "native_fp4=%d dedup=%d gathered=%d K=%" PRId64 " Kpad=%" PRId64 " rows=%" PRId64
+        " ne12=%" PRId64 " experts=%" PRId64 "\n",
+        route, ggml_cuda_quant_trace_tensor_name(src0), src0 != nullptr ? ggml_type_name(src0->type) : "-",
+        ggml_cuda_quant_trace_tensor_name(src1), src1 != nullptr ? ggml_type_name(src1->type) : "-",
+        ggml_cuda_quant_trace_tensor_name(dst), dst != nullptr ? ggml_type_name(dst->type) : "-",
+        ggml_cuda_quant_trace_tensor_name(ids), native_fp4, dedup, gathered,
+        ne10, ne10_padded, rows, ne12, n_expert_used);
+}
+
+ggml_cuda_mmq_ids_meta::ggml_cuda_mmq_ids_meta(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts)
+    : ne_get_rows(ne_get_rows) {
+    alloc(pool, ne_get_rows, n_experts);
+}
+
+void ggml_cuda_mmq_ids_meta::alloc(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts) {
+    ids_src1.alloc(pool, ne_get_rows);
+    ids_dst.alloc(pool, ne_get_rows);
+    expert_bounds.alloc(pool, n_experts + 1);
+    this->ne_get_rows = ne_get_rows;
+}
+
+void ggml_cuda_mmq_ids_meta::build(
+        const ggml_tensor * ids, int64_t n_experts, int64_t n_tokens,
+        int64_t n_expert_used, int64_t nchannels_y, int64_t sis1,
+        cudaStream_t stream) {
+    GGML_ASSERT(ids->nb[0] == ggml_element_size(ids));
+    const int si1 = ids->nb[1] / ggml_element_size(ids);
+
+    ggml_cuda_launch_mm_ids_helper(
+        (const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
+        n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
+    CUDA_CHECK(cudaGetLastError());
+}
+
+static void ggml_cuda_mul_mat_q_moe_quantized_impl(
+        ggml_backend_cuda_context & ctx,
+        const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst,
+        const int32_t * ids_dst, const int32_t * expert_bounds,
+        int64_t n_tokens, int64_t n_expert_used, int64_t n_experts,
+        int64_t ncols_src1_padded) {
+    GGML_ASSERT(        dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(        src1_q != nullptr);
+    GGML_ASSERT(        ids_dst != nullptr);
+    GGML_ASSERT(        expert_bounds != nullptr);
+    GGML_ASSERT(        n_tokens > 0);
+    GGML_ASSERT(        n_expert_used > 0);
+    GGML_ASSERT(        n_experts > 0);
+    GGML_ASSERT(        ncols_src1_padded > 0);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t ne0  = dst->ne[0];
+    const int64_t ne1  = dst->ne[1];
+    const int64_t ne2  = dst->ne[2];
+    const int64_t ne3  = dst->ne[3];
+    const int64_t nb00 = src0->nb[0];
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+    const int64_t nb03 = src0->nb[3];
+    const int64_t nb0  = dst->nb[0];
+    const int64_t nb1  = dst->nb[1];
+    const int64_t nb2  = dst->nb[2];
+    const int64_t nb3  = dst->nb[3];
+
+    const size_t ts_src0 = ggml_type_size(src0->type);
+    const size_t ts_dst  = ggml_type_size(dst->type);
+
+    GGML_ASSERT(nb00 == (int64_t) ts_src0);
+    GGML_ASSERT(nb0  == (int64_t) ts_dst);
+    GGML_ASSERT(ne00 <= ncols_src1_padded);
+    GGML_ASSERT(ne01 == ne0);
+    GGML_ASSERT(ne02 == n_experts);
+    GGML_ASSERT(ne1  == n_expert_used);
+    GGML_ASSERT(ne2  == n_tokens);
+
+    cudaStream_t stream = ctx.stream();
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+                            || GGML_CUDA_CC_IS_CDNA(cc);
+
+    const int64_t ne_get_rows = n_tokens * n_expert_used;
+    const int64_t s01 = nb01 / ts_src0;
+    const int64_t s1  = nb1  / ts_dst;
+    const int64_t s02 = nb02 / ts_src0;
+    const int64_t s2  = nb2  / ts_dst;
+    const int64_t s03 = nb03 / ts_src0;
+    const int64_t s3  = nb3  / ts_dst;
+    const int64_t s12 = ne_get_rows * ncols_src1_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int));
+    const int64_t s13 = n_tokens * s12;
+
+    const mmq_args args = {
+        (const char *) src0->data, src0->type, (const int *) src1_q, ids_dst, expert_bounds, (float *) dst->data,
+        ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
+        n_experts, n_experts, s02, s12, s2,
+        ne03, ne3, s03, s13, s3,
+        use_stream_k, n_tokens};
+
+    ggml_cuda_quant_trace("mmq_moe_quantized_raw", src0, nullptr, nullptr, dst, 1,
+        0, 0, ne00, ncols_src1_padded, ne_get_rows, n_tokens, n_expert_used);
+    ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
+}
+
+void ggml_cuda_mul_mat_q_moe_quantized(
+        ggml_backend_cuda_context & ctx,
+        const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst,
+        const int32_t * ids_dst, const int32_t * expert_bounds,
+        int64_t n_tokens, int64_t n_expert_used, int64_t n_experts,
+        int64_t ncols_src1_padded) {
+    ggml_cuda_mul_mat_q_moe_quantized_impl(ctx, src0, src1_q, dst, ids_dst, expert_bounds,
+        n_tokens, n_expert_used, n_experts, ncols_src1_padded);
+}
+
+ void ggml_cuda_mul_mat_q(
+         ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
+     GGML_ASSERT(        src1->type == GGML_TYPE_F32);
+diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
+index b53e38a8b..690dc694a 100644
+--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
+@@ -4334,6 +4334,31 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
+ void ggml_cuda_mul_mat_q(
+         ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
+ 
+struct ggml_cuda_mmq_ids_meta {
+    ggml_cuda_pool_alloc<int32_t> ids_src1;
+    ggml_cuda_pool_alloc<int32_t> ids_dst;
+    ggml_cuda_pool_alloc<int32_t> expert_bounds;
+
+    int64_t ne_get_rows = 0;
+
+    ggml_cuda_mmq_ids_meta() = default;
+    ggml_cuda_mmq_ids_meta(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts);
+
+    void alloc(ggml_cuda_pool & pool, int64_t ne_get_rows, int64_t n_experts);
+
+    void build(
+        const ggml_tensor * ids, int64_t n_experts, int64_t n_tokens,
+        int64_t n_expert_used, int64_t nchannels_y, int64_t sis1,
+        cudaStream_t stream);
+};
+
+void ggml_cuda_mul_mat_q_moe_quantized(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const void * src1_q, ggml_tensor * dst,
+    const int32_t * ids_dst, const int32_t * expert_bounds,
+    int64_t n_tokens, int64_t n_expert_used, int64_t n_experts,
+    int64_t ncols_src1_padded);
+
+ void ggml_cuda_op_mul_mat_q(
+     ggml_backend_cuda_context & ctx,
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+diff --git a/ggml/src/ggml-cuda/moe-ffn.cu b/ggml/src/ggml-cuda/moe-ffn.cu
+new file mode 100644
+index 000000000..507390d65
+--- /dev/null
+++ b/ggml/src/ggml-cuda/moe-ffn.cu
+@@ -0,0 +1,296 @@
+#include "moe-ffn.cuh"
+#include "getrows.cuh"
+#include "mmq.cuh"
+#include "unary.cuh"
+
+#include <cstdlib>
+
+bool ggml_cuda_moe_routed_ffn_poc_enabled() {
+    static const bool enabled = [] {
+        const char * value = getenv("LLAMA_MOE_ROUTED_FFN_POC");
+        return value != nullptr && atoi(value) != 0;
+    }();
+    return enabled;
+}
+
+bool ggml_cuda_moe_routed_ffn_poc_should_engage(
+        const ggml_tensor * gate_up,
+        const ggml_tensor * gate,
+        const ggml_tensor * up,
+        const ggml_tensor * glu,
+        const ggml_tensor * down,
+        const ggml_tensor * ids,
+        int cc) {
+    if (!blackwell_mma_available(cc)) {
+        return false;
+    }
+    if (gate_up == nullptr || gate == nullptr || up == nullptr || glu == nullptr || down == nullptr || ids == nullptr) {
+        return false;
+    }
+    if (gate_up->op != GGML_OP_MUL_MAT_ID || down->op != GGML_OP_MUL_MAT_ID) {
+        return false;
+    }
+    if (gate->op != GGML_OP_VIEW || up->op != GGML_OP_VIEW || gate->view_src != gate_up || up->view_src != gate_up) {
+        return false;
+    }
+    if (glu->op != GGML_OP_GLU || ggml_get_glu_op(glu) != GGML_GLU_OP_SWIGLU || down->src[1] != glu) {
+        return false;
+    }
+    if (gate_up->src[2] != ids || down->src[2] != ids) {
+        return false;
+    }
+
+    const ggml_tensor * down_w = down->src[0];
+    if (down_w == nullptr || (down_w->type != GGML_TYPE_NVFP4 && down_w->type != GGML_TYPE_MXFP4)) {
+        return false;
+    }
+
+    return true;
+}
+
+static bool ggml_cuda_moe_routed_ffn_fused_quant_enabled() {
+    static const bool enabled = [] {
+        const char * value = getenv("LLAMA_MOE_ROUTED_FFN_FUSED_QUANT");
+        return value != nullptr && atoi(value) != 0;
+    }();
+    return enabled;
+}
+
+static bool ggml_cuda_moe_routed_ffn_down_supported(const ggml_tensor * glu, const ggml_tensor * down) {
+    const ggml_tensor * down_w = down != nullptr ? down->src[0] : nullptr;
+    const ggml_tensor * ids    = down != nullptr ? down->src[2] : nullptr;
+    if (glu == nullptr || down == nullptr || down_w == nullptr || ids == nullptr) {
+        return false;
+    }
+    if (down_w->type != GGML_TYPE_NVFP4 && down_w->type != GGML_TYPE_MXFP4) {
+        return false;
+    }
+    if (glu->type != GGML_TYPE_F32 || down->type != GGML_TYPE_F32 || ids->type != GGML_TYPE_I32) {
+        return false;
+    }
+    if (glu->ne[3] != 1 || down->ne[3] != 1 || ids->ne[2] != 1 || ids->ne[3] != 1) {
+        return false;
+    }
+    if (down_w->ne[0] != glu->ne[0] || down_w->ne[1] != down->ne[0] || down_w->ne[2] <= 0) {
+        return false;
+    }
+    if (ids->ne[0] != glu->ne[1] || ids->ne[1] != glu->ne[2]) {
+        return false;
+    }
+    if (down->ne[1] != glu->ne[1] || down->ne[2] != glu->ne[2]) {
+        return false;
+    }
+    if (ids->nb[0] != ggml_element_size(ids)) {
+        return false;
+    }
+    if (glu->nb[0] != sizeof(float) || down->nb[0] != sizeof(float)) {
+        return false;
+    }
+    if (glu->nb[1] != (size_t) (glu->ne[0] * (int64_t) sizeof(float)) ||
+        glu->nb[2] != (size_t) (glu->ne[1] * (int64_t) glu->nb[1])) {
+        return false;
+    }
+    if (down->nb[1] != (size_t) (down->ne[0] * (int64_t) sizeof(float)) ||
+        down->nb[2] != (size_t) (down->ne[1] * (int64_t) down->nb[1])) {
+        return false;
+    }
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    return ggml_cuda_should_use_mmq(down_w->type, cc, glu->ne[2], down_w->ne[2]);
+}
+
+static __global__ void moe_swiglu_nvfp4_quant_kernel(
+        const float * __restrict__ gate,
+        const float * __restrict__ up,
+        const int32_t * __restrict__ ids_src1,
+        void * __restrict__ vy,
+        int64_t n_ff,
+        int64_t n_ff_padded,
+        int64_t n_rows,
+        int64_t n_used,
+        int64_t gate_s1,
+        int64_t gate_s2,
+        int64_t up_s1,
+        int64_t up_s2) {
+#if defined(BLACKWELL_MMA_AVAILABLE)
+    const int64_t i0_base = ((int64_t) blockDim.x * blockIdx.y + threadIdx.x) * QK_NVFP4_SUB;
+    if (i0_base >= n_ff_padded) {
+        return;
+    }
+
+    const int64_t row = blockIdx.x;
+    const int64_t src_row = ids_src1[row];
+    const int64_t token = src_row / n_used;
+    const int64_t used = src_row - token * n_used;
+    const int64_t k_block = i0_base / QK_K;
+    const int64_t blocks_per_col = (n_ff_padded + QK_K - 1) / QK_K;
+    if (k_block >= blocks_per_col) {
+        return;
+    }
+
+    block_fp4_mmq * y = (block_fp4_mmq *) vy;
+    block_fp4_mmq * yb = y + k_block * n_rows + row;
+    const int sub = (i0_base % QK_K) / QK_NVFP4_SUB;
+
+    float vals_raw[QK_NVFP4_SUB];
+    float amax_raw = 0.0f;
+#pragma unroll
+    for (int k = 0; k < QK_NVFP4_SUB; k++) {
+        const int64_t i0 = i0_base + k;
+        if (i0 < n_ff) {
+            const float g = gate[token * gate_s2 + used * gate_s1 + i0];
+            const float u = up[token * up_s2 + used * up_s1 + i0];
+            const float v = ggml_cuda_op_silu_single(g) * u;
+            vals_raw[k] = v;
+            amax_raw = fmaxf(amax_raw, fabsf(v));
+        } else {
+            vals_raw[k] = 0.0f;
+        }
+    }
+
+    static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2 };
+    const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
+
+    float best_err = FLT_MAX;
+    uint8_t fp8_code = 0;
+    float subblock_scale = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < 5; i++) {
+        const int test_code = first_fp8_code + test_offsets[i];
+        if (test_code < 0 || test_code > 0x7e) {
+            continue;
+        }
+        const uint8_t code = (uint8_t) test_code;
+        const float test_scale = ggml_cuda_ue4m3_to_fp32(code);
+        const float test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f;
+        float cur_err = 0.0f;
+#pragma unroll
+        for (int k = 0; k < QK_NVFP4_SUB; ++k) {
+            const float v = vals_raw[k];
+            const uint8_t q = ggml_cuda_float_to_fp4_e2m1(v, test_inv_scale);
+            const float err_diff = fabsf(v) - fabsf(kvalues_mxfp4[q & 0x7]) * test_scale;
+            cur_err = fmaf(err_diff, err_diff, cur_err);
+        }
+
+        if (cur_err < best_err) {
+            best_err = cur_err;
+            fp8_code = test_code;
+            subblock_scale = test_scale;
+        }
+    }
+
+    const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f;
+    uint32_t q0 = 0;
+    uint32_t q1 = 0;
+#pragma unroll
+    for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) {
+        q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k +  0], inv_scale) << (8 * k);
+        q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k +  8], inv_scale) << (8 * k + 4);
+        q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k +  4], inv_scale) << (8 * k);
+        q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 12], inv_scale) << (8 * k + 4);
+    }
+
+    uint32_t * yqs = reinterpret_cast<uint32_t *>(yb->qs);
+    yqs[2 * sub + 0] = q0;
+    yqs[2 * sub + 1] = q1;
+    reinterpret_cast<uint8_t *>(yb->d4)[sub] = fp8_code;
+#else
+    NO_DEVICE_CODE;
+#endif
+}
+
+static bool ggml_cuda_moe_routed_ffn_fused_quant(
+        ggml_backend_cuda_context & ctx,
+        ggml_tensor * gate,
+        ggml_tensor * up,
+        ggml_tensor * glu,
+        ggml_tensor * down) {
+    if (!ggml_cuda_moe_routed_ffn_fused_quant_enabled()) {
+        return false;
+    }
+    if (!ggml_cuda_moe_routed_ffn_down_supported(glu, down)) {
+        return false;
+    }
+    const ggml_tensor * down_w = down->src[0];
+    const ggml_tensor * ids = down->src[2];
+    if (down_w->type != GGML_TYPE_NVFP4) {
+        return false;
+    }
+    if (gate == nullptr || up == nullptr || gate->type != GGML_TYPE_F32 || up->type != GGML_TYPE_F32) {
+        return false;
+    }
+    if (gate->ne[0] != glu->ne[0] || gate->ne[1] != glu->ne[1] || gate->ne[2] != glu->ne[2] || gate->ne[3] != glu->ne[3]) {
+        return false;
+    }
+    if (up->ne[0] != glu->ne[0] || up->ne[1] != glu->ne[1] || up->ne[2] != glu->ne[2] || up->ne[3] != glu->ne[3]) {
+        return false;
+    }
+    if (gate->nb[0] != sizeof(float) || up->nb[0] != sizeof(float)) {
+        return false;
+    }
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    if (!blackwell_mma_available(cc)) {
+        return false;
+    }
+
+    const int64_t n_ff = glu->ne[0];
+    const int64_t n_ff_padded = GGML_PAD(n_ff, MATRIX_ROW_PADDING);
+    if (n_ff % QK_NVFP4 != 0) {
+        return false;
+    }
+
+    const int64_t n_expert_used = ids->ne[0];
+    const int64_t n_tokens = glu->ne[2];
+    const int64_t n_experts = down_w->ne[2];
+    const int64_t ne_get_rows = n_tokens * n_expert_used;
+
+    ggml_cuda_mmq_ids_meta ids_meta(ctx.pool(), ne_get_rows, n_experts);
+    const int64_t sis1 = glu->nb[2] / glu->nb[1];
+    ids_meta.build(ids, n_experts, n_tokens, n_expert_used, glu->ne[1], sis1, ctx.stream());
+
+    const size_t nbytes_src1_q = ne_get_rows * n_ff_padded * sizeof(block_fp4_mmq) / QK_K +
+        get_mmq_x_max_host(cc) * sizeof(block_q8_1_mmq);
+    ggml_cuda_pool_alloc<char> src1_q(ctx.pool(), nbytes_src1_q);
+
+    constexpr int nvfp4_block_size = 128;
+    const int64_t block_num_y = (n_ff_padded + QK_NVFP4_SUB * nvfp4_block_size - 1) / (QK_NVFP4_SUB * nvfp4_block_size);
+    const dim3 block_size(nvfp4_block_size, 1, 1);
+    const dim3 num_blocks(ne_get_rows, block_num_y, 1);
+    moe_swiglu_nvfp4_quant_kernel<<<num_blocks, block_size, 0, ctx.stream()>>>(
+        (const float *) gate->data, (const float *) up->data, ids_meta.ids_src1.get(), src1_q.get(),
+        n_ff, n_ff_padded, ne_get_rows, n_expert_used,
+        gate->nb[1] / sizeof(float), gate->nb[2] / sizeof(float),
+        up->nb[1] / sizeof(float), up->nb[2] / sizeof(float));
+    CUDA_CHECK(cudaGetLastError());
+
+    ggml_cuda_mul_mat_q_moe_quantized(
+        ctx, down_w, src1_q.get(), down,
+        ids_meta.ids_dst.get(), ids_meta.expert_bounds.get(),
+        n_tokens, n_expert_used, n_experts, n_ff_padded);
+    return true;
+}
+
+bool ggml_cuda_moe_routed_ffn_poc(
+        ggml_backend_cuda_context & ctx,
+        ggml_tensor * gate_up,
+        ggml_tensor * gate,
+        ggml_tensor * up,
+        ggml_tensor * glu,
+        ggml_tensor * down) {
+    if (!ggml_cuda_compute_forward(ctx, gate_up)) {
+        return false;
+    }
+    if (ggml_cuda_moe_routed_ffn_fused_quant(ctx, gate, up, glu, down)) {
+        return true;
+    }
+    if (!ggml_cuda_compute_forward(ctx, glu)) {
+        return false;
+    }
+    if (!ggml_cuda_compute_forward(ctx, down)) {
+        return false;
+    }
+
+    return true;
+}
+diff --git a/ggml/src/ggml-cuda/moe-ffn.cuh b/ggml/src/ggml-cuda/moe-ffn.cuh
+new file mode 100644
+index 000000000..ce385d31a
+--- /dev/null
+++ b/ggml/src/ggml-cuda/moe-ffn.cuh
+@@ -0,0 +1,24 @@
+#pragma once
+
+#include "common.cuh"
+
+bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+bool ggml_cuda_moe_routed_ffn_poc_enabled();
+
+bool ggml_cuda_moe_routed_ffn_poc_should_engage(
+    const ggml_tensor * gate_up,
+    const ggml_tensor * gate,
+    const ggml_tensor * up,
+    const ggml_tensor * glu,
+    const ggml_tensor * down,
+    const ggml_tensor * ids,
+    int cc);
+
+bool ggml_cuda_moe_routed_ffn_poc(
+    ggml_backend_cuda_context & ctx,
+    ggml_tensor * gate_up,
+    ggml_tensor * gate,
+    ggml_tensor * up,
+    ggml_tensor * glu,
+    ggml_tensor * down);
+-- 
+2.43.0
+
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-feat-server-trace-speculative-batch-shapes.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-feat-server-trace-speculative-batch-shapes.patch
@@ -1,57 +0,0 @@
-From fb9402661291e0488a3e2bf2f3948ebcd18e18c9 Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Wed, 1 Jul 2026 02:41:22 +0000
-Subject: [PATCH] feat(server): trace speculative batch shapes
-
-Add an env-gated LLAMA_SPEC_SHAPE_TRACE log around the server batch rows emitted by normal decode and speculative verification slots. This keeps the instrumentation default-off while exposing the row/output shape entropy that prevents CUDA graph reuse under MTP serving.
-
-Assisted-by: Codex:gpt-5
---
- tools/server/server-context.cpp | 20 ++++++++++++++++++--
- 1 file changed, 18 insertions(+), 2 deletions(-)
-
-diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
-index a77e2676d..fd8348af6 100644
--- a/tools/server/server-context.cpp
-+++ b/tools/server/server-context.cpp
-@@ -457,12 +457,22 @@ struct server_slot {
- 
-     // add sampled token of this slot to the batch, optionally add the speculative draft tokens if any
-     void handle_last_sampled_token(server_batch & batch) {
-+        static const bool spec_shape_trace = getenv("LLAMA_SPEC_SHAPE_TRACE") != nullptr;
-+        const int32_t batch_before = batch.size();
-+
-         bool add_ok = true;
-         if (spec_draft.empty()) {
-             // no speculative decoding
-            i_batch = batch.size();
-+            i_batch = batch_before;
-+
-+            const int32_t pos0 = prompt.tokens.pos_next();
-+
-+            add_ok &= batch.add(id, sampled, pos0, true);
- 
-            add_ok &= batch.add(id, sampled, prompt.tokens.pos_next(), true);
-+            if (spec_shape_trace) {
-+                SLT_INF(*this, "spec shape: kind=decode batch_before=%d rows=1 outputs=1 draft=0 pos0=%d slot_tokens=%zu\n",
-+                        batch_before, pos0, prompt.tokens.size());
-+            }
- 
-             SLT_DBG(*this, "slot decode token, id=%d, n_ctx = %d, n_tokens = %d, truncated = %d\n",
-                     sampled, n_ctx, prompt.n_tokens(), truncated);
-@@ -479,6 +489,12 @@ struct server_slot {
- 
-             auto pos0 = prompt.tokens.pos_next();
- 
-+            if (spec_shape_trace) {
-+                SLT_INF(*this, "spec shape: kind=verify batch_before=%d rows=%zu outputs=%zu draft=%zu spec_i_first=%d spec_i_last=%d pos0=%d slot_tokens=%zu\n",
-+                        batch_before, spec_draft.size() + 1, spec_draft.size() + 1, spec_draft.size(),
-+                        spec_i_batch.front(), spec_i_batch.back(), pos0, prompt.tokens.size());
-+            }
-+
-             add_ok &= batch.add(id, sampled, pos0++, true);
-             for (auto token : spec_draft) {
-                 add_ok &= batch.add(this->id, token, pos0++, true);
-- 
-2.43.0
-
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0056-feat-cuda-trace-moe-mmq-batch-shapes.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0056-feat-cuda-trace-moe-mmq-batch-shapes.patch
@@ -1,212 +0,0 @@
-From 20a99518a39acbb4474fa9c97121fc7b9f07c1ef Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Wed, 1 Jul 2026 04:27:19 +0000
-Subject: [PATCH] feat(cuda): trace moe mmq batch shapes
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/mmq-shape-trace.h | 66 ++++++++++++++++++++++++++++
- ggml/src/ggml-cuda/mmq.cuh           | 31 ++++++++++++-
- tests/CMakeLists.txt                 |  2 +
- tests/test-cuda-mmq-shape-trace.cpp  | 42 ++++++++++++++++++
- 4 files changed, 140 insertions(+), 1 deletion(-)
- create mode 100644 ggml/src/ggml-cuda/mmq-shape-trace.h
- create mode 100644 tests/test-cuda-mmq-shape-trace.cpp
-
-diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
-new file mode 100644
-index 000000000..9d41b7c80
--- /dev/null
-+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
-@@ -0,0 +1,66 @@
-+#pragma once
-+
-+#include <cstddef>
-+#include <cstdint>
-+#include <cstdio>
-+
-+struct ggml_cuda_mmq_shape {
-+    int type;
-+    bool is_moe;
-+    int64_t ncols_dst;
-+    int64_t nchannels_x;
-+    int64_t ncols_max;
-+    int64_t n_active_est;
-+    int64_t density;
-+    int mmq_x_max;
-+    int mmq_x_lim;
-+    int mmq_x_best;
-+    int mmq_y;
-+    bool use_stream_k;
-+};
-+
-+static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-+        const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
-+        const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
-+        const int mmq_y, const bool use_stream_k) {
-+    int64_t n_active_est = 0;
-+    int64_t density = 0;
-+    if (is_moe && ncols_dst > 0 && nchannels_x > 0) {
-+        n_active_est = ncols_dst < nchannels_x ? ncols_dst : nchannels_x;
-+        density = (ncols_dst + n_active_est - 1) / n_active_est;
-+    }
-+
-+    return {
-+        type,
-+        is_moe,
-+        ncols_dst,
-+        nchannels_x,
-+        ncols_max,
-+        n_active_est,
-+        density,
-+        mmq_x_max,
-+        mmq_x_lim,
-+        mmq_x_best,
-+        mmq_y,
-+        use_stream_k,
-+    };
-+}
-+
-+static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, const ggml_cuda_mmq_shape & shape) {
-+    return std::snprintf(buf, size,
-+        "type=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld "
-+        "n_active_est=%lld density=%lld mmq_x_max=%d mmq_x_lim=%d "
-+        "mmq_x_best=%d mmq_y=%d stream_k=%d",
-+        shape.type,
-+        shape.is_moe ? 1 : 0,
-+        (long long) shape.ncols_dst,
-+        (long long) shape.nchannels_x,
-+        (long long) shape.ncols_max,
-+        (long long) shape.n_active_est,
-+        (long long) shape.density,
-+        shape.mmq_x_max,
-+        shape.mmq_x_lim,
-+        shape.mmq_x_best,
-+        shape.mmq_y,
-+        shape.use_stream_k ? 1 : 0);
-+}
-diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
-index b53e38a8b..6bc943738 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
-+++ b/ggml/src/ggml-cuda/mmq.cuh
-@@ -3,10 +3,14 @@
- #include "common.cuh"
- #include "vecdotq.cuh"
- #include "mma.cuh"
-+#include "mmq-shape-trace.h"
- 
-+#include <atomic>
- #include <climits>
- #include <cstdint>
-+#include <cstdio>
- #include <cstdlib>
-+#include <cstring>
- 
- using namespace ggml_cuda_mma;
- 
-@@ -4163,6 +4167,18 @@ static inline int ggml_cuda_fp4_dense_mmq_x_cap() {
-     return c;
- }
- 
-+static inline int ggml_cuda_moe_mmq_shape_trace_limit() {
-+    static const int limit = []() -> int {
-+        const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE");
-+        if (s == nullptr || strcmp(s, "0") == 0) {
-+            return 0;
-+        }
-+        const int parsed = atoi(s);
-+        return parsed > 0 ? parsed : 256;
-+    }();
-+    return limit;
-+}
-+
- template <ggml_type type>
- void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
-     const int    id     = ggml_cuda_get_device();
-@@ -4249,6 +4265,20 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
-         }
-     }
- 
-+    if (args.expert_bounds != nullptr) {
-+        static std::atomic<int> trace_count{0};
-+        const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit();
-+        const int trace_index = trace_limit > 0 ? trace_count.fetch_add(1, std::memory_order_relaxed) : trace_limit;
-+        if (trace_index >= 0 && trace_index < trace_limit) {
-+            char buf[256];
-+            const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make(
-+                (int) type, true, args.ncols_dst, args.nchannels_x, args.ncols_max,
-+                mmq_x_max, mmq_x_lim, mmq_x_best, mmq_y, args.use_stream_k);
-+            ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
-+            fprintf(stderr, "[LLAMA_MOE_MMQ_SHAPE] %s\n", buf);
-+        }
-+    }
-+
-     switch (mmq_x_best) {
-         case   8:
-             launch_mul_mat_q<type,   8>(ctx, args, stream);
-@@ -4341,4 +4371,3 @@ void ggml_cuda_op_mul_mat_q(
-     const int64_t src1_padded_row_size, cudaStream_t stream);
- 
- bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts);
-
-diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
-index 24592a279..0a5194c87 100644
--- a/tests/CMakeLists.txt
-+++ b/tests/CMakeLists.txt
-@@ -234,6 +234,8 @@ llama_build_and_test(test-thread-safety.cpp ARGS -m "${MODEL_DEST}" -ngl 99 -p "
- set_tests_properties(test-thread-safety PROPERTIES FIXTURES_REQUIRED test-download-model)
- 
- llama_build_and_test(test-arg-parser.cpp)
-+llama_build_and_test(test-cuda-mmq-shape-trace.cpp)
-+target_include_directories(test-cuda-mmq-shape-trace PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
- 
- if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
-   # TODO: repair known memory leaks
-diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
-new file mode 100644
-index 000000000..8620169c0
--- /dev/null
-+++ b/tests/test-cuda-mmq-shape-trace.cpp
-@@ -0,0 +1,42 @@
-+#include "ggml-cuda/mmq-shape-trace.h"
-+
-+#include <cstdio>
-+#include <cstdlib>
-+#include <cstring>
-+
-+static void require(bool ok, const char * what) {
-+    if (!ok) {
-+        std::fprintf(stderr, "require failed: %s\n", what);
-+        std::exit(1);
-+    }
-+}
-+
-+int main() {
-+    const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make(
-+        /* type */ 39,
-+        /* is_moe */ true,
-+        /* ncols_dst */ 1024,
-+        /* nchannels_x */ 256,
-+        /* ncols_max */ 128,
-+        /* mmq_x_max */ 128,
-+        /* mmq_x_lim */ 64,
-+        /* mmq_x_best */ 64,
-+        /* mmq_y */ 128,
-+        /* use_stream_k */ true);
-+
-+    require(shape.n_active_est == 256, "active expert estimate is capped by expert count");
-+    require(shape.density == 4, "density is ceil(assignments / active experts)");
-+
-+    char buf[256];
-+    const int n = ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
-+
-+    require(n > 0, "format returns byte count");
-+    require(std::strstr(buf, "moe=1") != nullptr, "trace includes moe flag");
-+    require(std::strstr(buf, "ncols_dst=1024") != nullptr, "trace includes routed assignment count");
-+    require(std::strstr(buf, "n_active_est=256") != nullptr, "trace includes active estimate");
-+    require(std::strstr(buf, "density=4") != nullptr, "trace includes density");
-+    require(std::strstr(buf, "mmq_x_best=64") != nullptr, "trace includes selected tile");
-+    require(std::strstr(buf, "stream_k=1") != nullptr, "trace includes stream-k flag");
-+
-+    return 0;
-+}
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0057-feat-cuda-trace-moe-mmq-launch-shapes.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0057-feat-cuda-trace-moe-mmq-launch-shapes.patch
@@ -1,223 +0,0 @@
-From c78e537b56e3446f8aa645c6700aacf263639bd8 Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Wed, 1 Jul 2026 04:49:55 +0000
-Subject: [PATCH] feat(cuda): trace moe mmq launch shapes
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/mmq-shape-trace.h | 65 ++++++++++++++++++++++++++++
- ggml/src/ggml-cuda/mmq.cuh           | 47 +++++++++++++-------
- tests/test-cuda-mmq-shape-trace.cpp  | 35 +++++++++++++++
- 3 files changed, 132 insertions(+), 15 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
-index 9d41b7c80..98bc21f7f 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
-+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
-@@ -19,6 +19,24 @@ struct ggml_cuda_mmq_shape {
-     bool use_stream_k;
- };
- 
-+struct ggml_cuda_mmq_launch_shape {
-+    int type;
-+    bool is_moe;
-+    int64_t ncols_dst;
-+    int64_t ncols_max;
-+    int mmq_x;
-+    int mmq_y;
-+    int ntx;
-+    int nty;
-+    int ntzw;
-+    int ntiles_dst;
-+    int nsm;
-+    int tiles_nwaves;
-+    int tiles_efficiency_percent;
-+    int stream_k_blocks;
-+    bool fixup_needed;
-+};
-+
- static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-         const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
-         const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
-@@ -46,6 +64,30 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-     };
- }
- 
-+static inline ggml_cuda_mmq_launch_shape ggml_cuda_mmq_launch_shape_make(
-+        const int type, const bool is_moe, const int64_t ncols_dst, const int64_t ncols_max,
-+        const int mmq_x, const int mmq_y, const int ntx, const int nty, const int ntzw,
-+        const int ntiles_dst, const int nsm, const int tiles_nwaves, const int tiles_efficiency_percent,
-+        const int stream_k_blocks, const bool fixup_needed) {
-+    return {
-+        type,
-+        is_moe,
-+        ncols_dst,
-+        ncols_max,
-+        mmq_x,
-+        mmq_y,
-+        ntx,
-+        nty,
-+        ntzw,
-+        ntiles_dst,
-+        nsm,
-+        tiles_nwaves,
-+        tiles_efficiency_percent,
-+        stream_k_blocks,
-+        fixup_needed,
-+    };
-+}
-+
- static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, const ggml_cuda_mmq_shape & shape) {
-     return std::snprintf(buf, size,
-         "type=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld "
-@@ -64,3 +106,26 @@ static inline int ggml_cuda_mmq_shape_format(char * buf, const size_t size, cons
-         shape.mmq_y,
-         shape.use_stream_k ? 1 : 0);
- }
-+
-+static inline int ggml_cuda_mmq_launch_shape_format(
-+        char * buf, const size_t size, const ggml_cuda_mmq_launch_shape & shape) {
-+    return std::snprintf(buf, size,
-+        "type=%d moe=%d ncols_dst=%lld ncols_max=%lld mmq_x=%d mmq_y=%d "
-+        "ntx=%d nty=%d ntzw=%d ntiles_dst=%d nsm=%d tiles_nwaves=%d "
-+        "tiles_efficiency=%d stream_k_blocks=%d fixup=%d",
-+        shape.type,
-+        shape.is_moe ? 1 : 0,
-+        (long long) shape.ncols_dst,
-+        (long long) shape.ncols_max,
-+        shape.mmq_x,
-+        shape.mmq_y,
-+        shape.ntx,
-+        shape.nty,
-+        shape.ntzw,
-+        shape.ntiles_dst,
-+        shape.nsm,
-+        shape.tiles_nwaves,
-+        shape.tiles_efficiency_percent,
-+        shape.stream_k_blocks,
-+        shape.fixup_needed ? 1 : 0);
-+}
-diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
-index 6bc943738..34002edf7 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
-+++ b/ggml/src/ggml-cuda/mmq.cuh
-@@ -3989,6 +3989,24 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
-     return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
- }
- 
-+static inline int ggml_cuda_moe_mmq_shape_trace_limit() {
-+    static const int limit = []() -> int {
-+        const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE");
-+        if (s == nullptr || strcmp(s, "0") == 0) {
-+            return 0;
-+        }
-+        const int parsed = atoi(s);
-+        return parsed > 0 ? parsed : 256;
-+    }();
-+    return limit;
-+}
-+
-+static inline bool ggml_cuda_moe_mmq_trace_take(std::atomic<int> & counter) {
-+    const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit();
-+    const int trace_index = trace_limit > 0 ? counter.fetch_add(1, std::memory_order_relaxed) : trace_limit;
-+    return trace_index >= 0 && trace_index < trace_limit;
-+}
-+
- template <ggml_type type, int mmq_x>
- static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
-     const int id = ggml_cuda_get_device();
-@@ -4054,6 +4072,19 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
- 
-     const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;
- 
-+    if (args.expert_bounds != nullptr) {
-+        static std::atomic<int> trace_count{0};
-+        if (ggml_cuda_moe_mmq_trace_take(trace_count)) {
-+            char buf[256];
-+            const ggml_cuda_mmq_launch_shape shape = ggml_cuda_mmq_launch_shape_make(
-+                (int) type, true, args.ncols_dst, args.ncols_max, mmq_x, mmq_y,
-+                ntx, nty, ntzw, ntiles_dst, nsm, tiles_nwaves, tiles_efficiency_percent,
-+                block_nums_stream_k.x, fixup_needed);
-+            ggml_cuda_mmq_launch_shape_format(buf, sizeof(buf), shape);
-+            fprintf(stderr, "[LLAMA_MOE_MMQ_LAUNCH] %s\n", buf);
-+        }
-+    }
-+
-     ggml_cuda_pool & pool = ctx.pool(id);
-     ggml_cuda_pool_alloc<float> tmp_fixup(pool);
-     if (fixup_needed) {
-@@ -4167,18 +4198,6 @@ static inline int ggml_cuda_fp4_dense_mmq_x_cap() {
-     return c;
- }
- 
-static inline int ggml_cuda_moe_mmq_shape_trace_limit() {
-    static const int limit = []() -> int {
-        const char * s = getenv("LLAMA_MOE_MMQ_SHAPE_TRACE");
-        if (s == nullptr || strcmp(s, "0") == 0) {
-            return 0;
-        }
-        const int parsed = atoi(s);
-        return parsed > 0 ? parsed : 256;
-    }();
-    return limit;
-}
-
- template <ggml_type type>
- void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
-     const int    id     = ggml_cuda_get_device();
-@@ -4267,9 +4286,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
- 
-     if (args.expert_bounds != nullptr) {
-         static std::atomic<int> trace_count{0};
-        const int trace_limit = ggml_cuda_moe_mmq_shape_trace_limit();
-        const int trace_index = trace_limit > 0 ? trace_count.fetch_add(1, std::memory_order_relaxed) : trace_limit;
-        if (trace_index >= 0 && trace_index < trace_limit) {
-+        if (ggml_cuda_moe_mmq_trace_take(trace_count)) {
-             char buf[256];
-             const ggml_cuda_mmq_shape shape = ggml_cuda_mmq_shape_make(
-                 (int) type, true, args.ncols_dst, args.nchannels_x, args.ncols_max,
-diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
-index 8620169c0..86ee15e02 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
-+++ b/tests/test-cuda-mmq-shape-trace.cpp
-@@ -38,5 +38,40 @@ int main() {
-     require(std::strstr(buf, "mmq_x_best=64") != nullptr, "trace includes selected tile");
-     require(std::strstr(buf, "stream_k=1") != nullptr, "trace includes stream-k flag");
- 
-+    const ggml_cuda_mmq_launch_shape launch = ggml_cuda_mmq_launch_shape_make(
-+        /* type */ 39,
-+        /* is_moe */ true,
-+        /* ncols_dst */ 1024,
-+        /* ncols_max */ 128,
-+        /* mmq_x */ 64,
-+        /* mmq_y */ 128,
-+        /* ntx */ 2,
-+        /* nty */ 4,
-+        /* ntzw */ 3,
-+        /* ntiles_dst */ 24,
-+        /* nsm */ 16,
-+        /* tiles_nwaves */ 2,
-+        /* tiles_efficiency_percent */ 75,
-+        /* stream_k_blocks */ 16,
-+        /* fixup_needed */ true);
-+
-+    const int launch_n = ggml_cuda_mmq_launch_shape_format(buf, sizeof(buf), launch);
-+
-+    require(launch_n > 0, "launch format returns byte count");
-+    require(std::strstr(buf, "moe=1") != nullptr, "launch trace includes moe flag");
-+    require(std::strstr(buf, "ncols_dst=1024") != nullptr, "launch trace includes routed assignment count");
-+    require(std::strstr(buf, "ncols_max=128") != nullptr, "launch trace includes max column count");
-+    require(std::strstr(buf, "mmq_x=64") != nullptr, "launch trace includes compiled x tile");
-+    require(std::strstr(buf, "mmq_y=128") != nullptr, "launch trace includes compiled y tile");
-+    require(std::strstr(buf, "ntx=2") != nullptr, "launch trace includes x tile count");
-+    require(std::strstr(buf, "nty=4") != nullptr, "launch trace includes y tile count");
-+    require(std::strstr(buf, "ntzw=3") != nullptr, "launch trace includes batch tile count");
-+    require(std::strstr(buf, "ntiles_dst=24") != nullptr, "launch trace includes total tile count");
-+    require(std::strstr(buf, "nsm=16") != nullptr, "launch trace includes SM count");
-+    require(std::strstr(buf, "tiles_nwaves=2") != nullptr, "launch trace includes wave count");
-+    require(std::strstr(buf, "tiles_efficiency=75") != nullptr, "launch trace includes stream-k efficiency");
-+    require(std::strstr(buf, "stream_k_blocks=16") != nullptr, "launch trace includes actual stream-k block count");
-+    require(std::strstr(buf, "fixup=1") != nullptr, "launch trace includes fixup flag");
-+
-     return 0;
- }
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0058-feat-cuda-trace-moe-small-m-mmq-candidates.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0058-feat-cuda-trace-moe-small-m-mmq-candidates.patch
@@ -1,182 +0,0 @@
-From 2a9964d290a543d14db972d8d2927ee9d2974f7e Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Wed, 1 Jul 2026 05:05:17 +0000
-Subject: [PATCH] feat(cuda): trace moe small-m mmq candidates
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/mmq-shape-trace.h | 58 ++++++++++++++++++++++++++++
- ggml/src/ggml-cuda/mmq.cuh           | 27 +++++++++++++
- tests/test-cuda-mmq-shape-trace.cpp  | 35 +++++++++++++++++
- 3 files changed, 120 insertions(+)
-
-diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
-index 98bc21f7f..47453d91f 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
-+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
-@@ -37,6 +37,18 @@ struct ggml_cuda_mmq_launch_shape {
-     bool fixup_needed;
- };
- 
-+struct ggml_cuda_mmq_small_m_shape {
-+    bool is_moe;
-+    int64_t ncols_dst;
-+    int64_t nchannels_x;
-+    int64_t ncols_max;
-+    int64_t n_active_est;
-+    int64_t density;
-+    int mmq_x_best;
-+    bool use_stream_k;
-+    bool is_candidate;
-+};
-+
- static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-         const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
-         const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
-@@ -64,6 +76,36 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-     };
- }
- 
-+static inline ggml_cuda_mmq_small_m_shape ggml_cuda_mmq_small_m_shape_make(
-+        const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
-+        const int64_t ncols_max, const int mmq_x_best, const bool use_stream_k) {
-+    int64_t n_active_est = 0;
-+    int64_t density = 0;
-+    if (is_moe && ncols_dst > 0 && nchannels_x > 0) {
-+        n_active_est = ncols_dst < nchannels_x ? ncols_dst : nchannels_x;
-+        density = (ncols_dst + n_active_est - 1) / n_active_est;
-+    }
-+
-+    const bool is_candidate =
-+        is_moe &&
-+        use_stream_k &&
-+        ncols_max > 0 && ncols_max <= 128 &&
-+        density > 0 && density <= 4 &&
-+        mmq_x_best > 0 && mmq_x_best <= 64;
-+
-+    return {
-+        is_moe,
-+        ncols_dst,
-+        nchannels_x,
-+        ncols_max,
-+        n_active_est,
-+        density,
-+        mmq_x_best,
-+        use_stream_k,
-+        is_candidate,
-+    };
-+}
-+
- static inline ggml_cuda_mmq_launch_shape ggml_cuda_mmq_launch_shape_make(
-         const int type, const bool is_moe, const int64_t ncols_dst, const int64_t ncols_max,
-         const int mmq_x, const int mmq_y, const int ntx, const int nty, const int ntzw,
-@@ -129,3 +171,19 @@ static inline int ggml_cuda_mmq_launch_shape_format(
-         shape.stream_k_blocks,
-         shape.fixup_needed ? 1 : 0);
- }
-+
-+static inline int ggml_cuda_mmq_small_m_shape_format(
-+        char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
-+    return std::snprintf(buf, size,
-+        "candidate=%d moe=%d ncols_dst=%lld nchannels_x=%lld ncols_max=%lld "
-+        "n_active_est=%lld density=%lld mmq_x_best=%d stream_k=%d",
-+        shape.is_candidate ? 1 : 0,
-+        shape.is_moe ? 1 : 0,
-+        (long long) shape.ncols_dst,
-+        (long long) shape.nchannels_x,
-+        (long long) shape.ncols_max,
-+        (long long) shape.n_active_est,
-+        (long long) shape.density,
-+        shape.mmq_x_best,
-+        shape.use_stream_k ? 1 : 0);
-+}
-diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
-index 34002edf7..25ead9e7b 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
-+++ b/ggml/src/ggml-cuda/mmq.cuh
-@@ -4007,6 +4007,24 @@ static inline bool ggml_cuda_moe_mmq_trace_take(std::atomic<int> & counter) {
-     return trace_index >= 0 && trace_index < trace_limit;
- }
- 
-+static inline int ggml_cuda_moe_mmq_small_m_trace_limit() {
-+    static const int limit = []() -> int {
-+        const char * s = getenv("LLAMA_MOE_MMQ_SMALL_M_TRACE");
-+        if (s == nullptr || strcmp(s, "0") == 0) {
-+            return 0;
-+        }
-+        const int parsed = atoi(s);
-+        return parsed > 0 ? parsed : 256;
-+    }();
-+    return limit;
-+}
-+
-+static inline bool ggml_cuda_moe_mmq_small_m_trace_take(std::atomic<int> & counter) {
-+    const int trace_limit = ggml_cuda_moe_mmq_small_m_trace_limit();
-+    const int trace_index = trace_limit > 0 ? counter.fetch_add(1, std::memory_order_relaxed) : trace_limit;
-+    return trace_index >= 0 && trace_index < trace_limit;
-+}
-+
- template <ggml_type type, int mmq_x>
- static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
-     const int id = ggml_cuda_get_device();
-@@ -4294,6 +4312,15 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
-             ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
-             fprintf(stderr, "[LLAMA_MOE_MMQ_SHAPE] %s\n", buf);
-         }
-+
-+        static std::atomic<int> small_m_trace_count{0};
-+        const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make(
-+            true, args.ncols_dst, args.nchannels_x, args.ncols_max, mmq_x_best, args.use_stream_k);
-+        if (small_m.is_candidate && ggml_cuda_moe_mmq_small_m_trace_take(small_m_trace_count)) {
-+            char buf[256];
-+            ggml_cuda_mmq_small_m_shape_format(buf, sizeof(buf), small_m);
-+            fprintf(stderr, "[LLAMA_MOE_MMQ_SMALL_M] %s\n", buf);
-+        }
-     }
- 
-     switch (mmq_x_best) {
-diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
-index 86ee15e02..9f36ce1a1 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
-+++ b/tests/test-cuda-mmq-shape-trace.cpp
-@@ -73,5 +73,40 @@ int main() {
-     require(std::strstr(buf, "stream_k_blocks=16") != nullptr, "launch trace includes actual stream-k block count");
-     require(std::strstr(buf, "fixup=1") != nullptr, "launch trace includes fixup flag");
- 
-+    const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make(
-+        /* is_moe */ true,
-+        /* ncols_dst */ 1024,
-+        /* nchannels_x */ 256,
-+        /* ncols_max */ 128,
-+        /* mmq_x_best */ 64,
-+        /* use_stream_k */ true);
-+
-+    require(small_m.is_candidate, "decode-like low-density MoE shape is a small-M candidate");
-+    require(small_m.n_active_est == 256, "small-M active estimate is capped by expert count");
-+    require(small_m.density == 4, "small-M density is ceil(assignments / active experts)");
-+
-+    require(!ggml_cuda_mmq_small_m_shape_make(
-+        /* is_moe */ false, 1024, 256, 128, 64, true).is_candidate,
-+        "dense shape is excluded");
-+    require(!ggml_cuda_mmq_small_m_shape_make(
-+        /* is_moe */ true, 4096, 256, 512, 128, true).is_candidate,
-+        "prefill-like shape is excluded");
-+    require(!ggml_cuda_mmq_small_m_shape_make(
-+        /* is_moe */ true, 4096, 256, 128, 64, true).is_candidate,
-+        "high-density shape is excluded");
-+    require(!ggml_cuda_mmq_small_m_shape_make(
-+        /* is_moe */ true, 1024, 256, 128, 128, true).is_candidate,
-+        "large selected tile is excluded");
-+    require(!ggml_cuda_mmq_small_m_shape_make(
-+        /* is_moe */ true, 1024, 256, 128, 64, false).is_candidate,
-+        "non-stream-k shape is excluded");
-+
-+    const int small_m_n = ggml_cuda_mmq_small_m_shape_format(buf, sizeof(buf), small_m);
-+
-+    require(small_m_n > 0, "small-M format returns byte count");
-+    require(std::strstr(buf, "candidate=1") != nullptr, "small-M trace includes candidate flag");
-+    require(std::strstr(buf, "density=4") != nullptr, "small-M trace includes density");
-+    require(std::strstr(buf, "mmq_x_best=64") != nullptr, "small-M trace includes selected tile");
-+
-     return 0;
- }
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0059-feat-cuda-gate-moe-small-m-mmq-tile-policy.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0059-feat-cuda-gate-moe-small-m-mmq-tile-policy.patch
@@ -1,80 +0,0 @@
-From fbed2abaa9f5af8e500f95c8dda86b305450ceff Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Wed, 1 Jul 2026 05:17:39 +0000
-Subject: [PATCH] feat(cuda): gate moe small-m mmq tile policy
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/mmq-shape-trace.h |  9 +++++++++
- ggml/src/ggml-cuda/mmq.cuh           | 13 +++++++++++++
- tests/test-cuda-mmq-shape-trace.cpp  | 10 ++++++++++
- 3 files changed, 32 insertions(+)
-
-diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
-index 47453d91f..dfb4e898a 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
-+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
-@@ -187,3 +187,12 @@ static inline int ggml_cuda_mmq_small_m_shape_format(
-         shape.mmq_x_best,
-         shape.use_stream_k ? 1 : 0);
- }
-+
-+static inline int ggml_cuda_mmq_small_m_tile_limit(
-+        const ggml_cuda_mmq_small_m_shape & shape, const int current_limit, const int requested_tile) {
-+    if (!shape.is_candidate || requested_tile < 8 || requested_tile >= current_limit) {
-+        return current_limit;
-+    }
-+
-+    return requested_tile;
-+}
-diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
-index 25ead9e7b..16b3fcca4 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
-+++ b/ggml/src/ggml-cuda/mmq.cuh
-@@ -4201,6 +4201,15 @@ static inline int ggml_cuda_moe_density_max() {
-     return d;
- }
- 
-+static inline int ggml_cuda_moe_small_m_tile() {
-+    static const int t = []() -> int {
-+        const char * s = getenv("LLAMA_MOE_SMALL_M_TILE");
-+        const int v = s ? atoi(s) : 0;
-+        return v >= 8 ? v : 0;
-+    }();
-+    return t;
-+}
-+
- // [paged patch 0017 / track B] DENSE NVFP4 decode mmq_x re-read occupancy DIAGNOSTIC (env, default off).
- // GGML_CUDA_FP4_DENSE_MMQ_X=<n> caps the dense (non-MoE) NVFP4 col-tile to <n>, splitting the M=128
- // decode ubatch into ceil(128/n) col-tiles. Each col-tile re-reads the full weight set (fatal cost
-@@ -4282,6 +4291,10 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
-                 }
-             }
-         }
-+
-+        const ggml_cuda_mmq_small_m_shape small_m = ggml_cuda_mmq_small_m_shape_make(
-+            true, args.ncols_dst, args.nchannels_x, args.ncols_max, mmq_x_lim, args.use_stream_k);
-+        mmq_x_lim = ggml_cuda_mmq_small_m_tile_limit(small_m, mmq_x_lim, ggml_cuda_moe_small_m_tile());
-     }
- 
-     int mmq_x_best  = 0;
-diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
-index 9f36ce1a1..f7863f03a 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
-+++ b/tests/test-cuda-mmq-shape-trace.cpp
-@@ -108,5 +108,15 @@ int main() {
-     require(std::strstr(buf, "density=4") != nullptr, "small-M trace includes density");
-     require(std::strstr(buf, "mmq_x_best=64") != nullptr, "small-M trace includes selected tile");
- 
-+    require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 0) == 64,
-+        "small-M tile override is default-off");
-+    require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 16) == 16,
-+        "small-M tile override caps candidate tile limit");
-+    require(ggml_cuda_mmq_small_m_tile_limit(small_m, 64, 128) == 64,
-+        "small-M tile override ignores non-smaller tiles");
-+    require(ggml_cuda_mmq_small_m_tile_limit(
-+        ggml_cuda_mmq_small_m_shape_make(/* is_moe */ true, 4096, 256, 512, 128, true), 128, 16) == 128,
-+        "small-M tile override excludes prefill-like shapes");
-+
-     return 0;
- }
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0060-feat-cuda-trace-moe-mmid-routes.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0060-feat-cuda-trace-moe-mmid-routes.patch
@@ -1,292 +0,0 @@
-From 6c332094ca2fbb1e3211427c5f919adcaa89c588 Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Wed, 1 Jul 2026 05:32:27 +0000
-Subject: [PATCH] feat(cuda): trace moe mmid routes
-
-Add a default-off LLAMA_MOE_MMID_ROUTE_TRACE diagnostic for MUL_MAT_ID dispatch routes.
-
-The trace reports whether a call uses MMVQ, MMVF, grouped MMQ, MMF, or host-sync fallback while preserving the existing dispatch predicates.
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/ggml-cuda.cu      | 34 +++++++++--
- ggml/src/ggml-cuda/mmq-shape-trace.h | 88 ++++++++++++++++++++++++++++
- tests/test-cuda-mmq-shape-trace.cpp  | 82 ++++++++++++++++++++++++++
- 3 files changed, 200 insertions(+), 4 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 374949f25..a1754df39 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2685,6 +2685,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
-     }
- }
- 
-+static inline int ggml_cuda_moe_mmid_route_trace_limit() {
-+    static const int value = []() {
-+        const char * s = getenv("LLAMA_MOE_MMID_ROUTE_TRACE");
-+        return s ? atoi(s) : 0;
-+    }();
-+
-+    return value;
-+}
-+
- static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-     const ggml_tensor * src0 = dst->src[0];
-     const ggml_tensor * src1 = dst->src[1];
-@@ -2697,13 +2706,30 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
-     GGML_TENSOR_BINARY_OP_LOCALS
- 
-     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-+    const bool src0_quantized = ggml_is_quantized(src0->type);
-+    const int mmvq_mmid_max = src0_quantized ? get_mmvq_mmid_max_batch(src0->type, cc) : MMVQ_MAX_BATCH_SIZE;
-+    const bool use_mmq = ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02);
-+    const bool use_mmf = ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true);
-+
-+    const int mmid_trace_limit = ggml_cuda_moe_mmid_route_trace_limit();
-+    if (mmid_trace_limit > 0) {
-+        static std::atomic<int> trace_count{0};
-+        const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
-+        if (trace_idx < mmid_trace_limit) {
-+            const ggml_cuda_mmid_route_shape route_shape = ggml_cuda_mmid_route_shape_make(
-+                src0->type, ne2, ne12, /*n_experts=*/ne02, mmvq_mmid_max, use_mmq, use_mmf,
-+                GGML_CUDA_CC_IS_AMD(cc), src0_quantized);
-+            char buf[256];
-+            ggml_cuda_mmid_route_shape_format(buf, sizeof(buf), route_shape);
-+            fprintf(stderr, "[LLAMA_MOE_MMID_ROUTE] %s\n", buf);
-+        }
-+    }
- 
-     // [TAG_MUL_MAT_ID_CUDA_GRAPHS]
-     if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-         static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
-         if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
-            if (ggml_is_quantized(src0->type)) {
-                const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
-+            if (src0_quantized) {
-                 if (ne2 <= mmvq_mmid_max) {
-                     ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
-                     return;
-@@ -2716,12 +2742,12 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
-             }
-         }
- 
-        if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
-+        if (use_mmq) {
-             ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
-             return;
-         }
- 
-        if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
-+        if (use_mmf) {
-             ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
-             return;
-         }
-diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
-index dfb4e898a..da234a302 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
-+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
-@@ -49,6 +49,40 @@ struct ggml_cuda_mmq_small_m_shape {
-     bool is_candidate;
- };
- 
-+enum ggml_cuda_mmid_route {
-+    GGML_CUDA_MMID_ROUTE_MMVQ,
-+    GGML_CUDA_MMID_ROUTE_MMVF,
-+    GGML_CUDA_MMID_ROUTE_MMQ,
-+    GGML_CUDA_MMID_ROUTE_MMF,
-+    GGML_CUDA_MMID_ROUTE_FALLBACK,
-+};
-+
-+struct ggml_cuda_mmid_route_shape {
-+    ggml_cuda_mmid_route route;
-+    int type;
-+    int64_t ne2;
-+    int64_t ne12;
-+    int64_t n_experts;
-+    int mmvq_max;
-+    bool use_mmq;
-+    bool use_mmf;
-+    bool is_amd;
-+    bool is_quantized;
-+    bool host_sync;
-+};
-+
-+static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
-+    switch (route) {
-+        case GGML_CUDA_MMID_ROUTE_MMVQ:     return "mmvq";
-+        case GGML_CUDA_MMID_ROUTE_MMVF:     return "mmvf";
-+        case GGML_CUDA_MMID_ROUTE_MMQ:      return "mmq";
-+        case GGML_CUDA_MMID_ROUTE_MMF:      return "mmf";
-+        case GGML_CUDA_MMID_ROUTE_FALLBACK: return "fallback";
-+    }
-+
-+    return "unknown";
-+}
-+
- static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-         const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
-         const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
-@@ -76,6 +110,42 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-     };
- }
- 
-+static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make(
-+        const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts,
-+        const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd,
-+        const bool is_quantized) {
-+    ggml_cuda_mmid_route route = GGML_CUDA_MMID_ROUTE_FALLBACK;
-+    if (ne2 <= mmvq_max) {
-+        if (is_quantized) {
-+            route = GGML_CUDA_MMID_ROUTE_MMVQ;
-+        } else if (is_amd) {
-+            route = GGML_CUDA_MMID_ROUTE_MMVF;
-+        }
-+    }
-+
-+    if (route == GGML_CUDA_MMID_ROUTE_FALLBACK) {
-+        if (use_mmq) {
-+            route = GGML_CUDA_MMID_ROUTE_MMQ;
-+        } else if (use_mmf) {
-+            route = GGML_CUDA_MMID_ROUTE_MMF;
-+        }
-+    }
-+
-+    return {
-+        route,
-+        type,
-+        ne2,
-+        ne12,
-+        n_experts,
-+        mmvq_max,
-+        use_mmq,
-+        use_mmf,
-+        is_amd,
-+        is_quantized,
-+        route == GGML_CUDA_MMID_ROUTE_FALLBACK,
-+    };
-+}
-+
- static inline ggml_cuda_mmq_small_m_shape ggml_cuda_mmq_small_m_shape_make(
-         const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
-         const int64_t ncols_max, const int mmq_x_best, const bool use_stream_k) {
-@@ -172,6 +242,24 @@ static inline int ggml_cuda_mmq_launch_shape_format(
-         shape.fixup_needed ? 1 : 0);
- }
- 
-+static inline int ggml_cuda_mmid_route_shape_format(
-+        char * buf, const size_t size, const ggml_cuda_mmid_route_shape & shape) {
-+    return std::snprintf(buf, size,
-+        "route=%s type=%d host_sync=%d ne2=%lld ne12=%lld n_experts=%lld "
-+        "mmvq_max=%d use_mmq=%d use_mmf=%d is_amd=%d is_quantized=%d",
-+        ggml_cuda_mmid_route_name(shape.route),
-+        shape.type,
-+        shape.host_sync ? 1 : 0,
-+        (long long) shape.ne2,
-+        (long long) shape.ne12,
-+        (long long) shape.n_experts,
-+        shape.mmvq_max,
-+        shape.use_mmq ? 1 : 0,
-+        shape.use_mmf ? 1 : 0,
-+        shape.is_amd ? 1 : 0,
-+        shape.is_quantized ? 1 : 0);
-+}
-+
- static inline int ggml_cuda_mmq_small_m_shape_format(
-         char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
-     return std::snprintf(buf, size,
-diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
-index f7863f03a..e190cf1ac 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
-+++ b/tests/test-cuda-mmq-shape-trace.cpp
-@@ -118,5 +118,87 @@ int main() {
-         ggml_cuda_mmq_small_m_shape_make(/* is_moe */ true, 4096, 256, 512, 128, true), 128, 16) == 128,
-         "small-M tile override excludes prefill-like shapes");
- 
-+    const ggml_cuda_mmid_route_shape mmvq = ggml_cuda_mmid_route_shape_make(
-+        /* type */ 39,
-+        /* ne2 */ 1,
-+        /* ne12 */ 1,
-+        /* ne02 */ 256,
-+        /* mmvq_max */ 4,
-+        /* use_mmq */ true,
-+        /* use_mmf */ true,
-+        /* is_amd */ false,
-+        /* is_quantized */ true);
-+
-+    require(mmvq.route == GGML_CUDA_MMID_ROUTE_MMVQ, "MMVQ wins when within its batch cap");
-+    require(!mmvq.host_sync, "MMVQ route is graph-safe");
-+
-+    const ggml_cuda_mmid_route_shape mmq = ggml_cuda_mmid_route_shape_make(
-+        /* type */ 39,
-+        /* ne2 */ 128,
-+        /* ne12 */ 128,
-+        /* ne02 */ 256,
-+        /* mmvq_max */ 4,
-+        /* use_mmq */ true,
-+        /* use_mmf */ true,
-+        /* is_amd */ false,
-+        /* is_quantized */ true);
-+
-+    require(mmq.route == GGML_CUDA_MMID_ROUTE_MMQ, "grouped MMQ wins after MMVQ cap");
-+    require(!mmq.host_sync, "grouped MMQ route is graph-safe");
-+
-+    const ggml_cuda_mmid_route_shape mmf = ggml_cuda_mmid_route_shape_make(
-+        /* type */ 1,
-+        /* ne2 */ 128,
-+        /* ne12 */ 128,
-+        /* ne02 */ 256,
-+        /* mmvq_max */ 4,
-+        /* use_mmq */ false,
-+        /* use_mmf */ true,
-+        /* is_amd */ false,
-+        /* is_quantized */ false);
-+
-+    require(mmf.route == GGML_CUDA_MMID_ROUTE_MMF, "MMF wins when grouped MMQ is unavailable");
-+    require(!mmf.host_sync, "MMF route is graph-safe");
-+
-+    const ggml_cuda_mmid_route_shape fallback = ggml_cuda_mmid_route_shape_make(
-+        /* type */ 0,
-+        /* ne2 */ 128,
-+        /* ne12 */ 128,
-+        /* ne02 */ 256,
-+        /* mmvq_max */ 4,
-+        /* use_mmq */ false,
-+        /* use_mmf */ false,
-+        /* is_amd */ false,
-+        /* is_quantized */ false);
-+
-+    require(fallback.route == GGML_CUDA_MMID_ROUTE_FALLBACK, "fallback is used when device routes do not match");
-+    require(fallback.host_sync, "fallback route requires host synchronization");
-+
-+    const ggml_cuda_mmid_route_shape amd_mmvf = ggml_cuda_mmid_route_shape_make(
-+        /* type */ 1,
-+        /* ne2 */ 1,
-+        /* ne12 */ 1,
-+        /* ne02 */ 256,
-+        /* mmvq_max */ 4,
-+        /* use_mmq */ false,
-+        /* use_mmf */ false,
-+        /* is_amd */ true,
-+        /* is_quantized */ false);
-+
-+    require(amd_mmvf.route == GGML_CUDA_MMID_ROUTE_MMVF, "AMD float vector route wins within MMVQ cap");
-+    require(!amd_mmvf.host_sync, "AMD float vector route is graph-safe");
-+
-+    const int route_n = ggml_cuda_mmid_route_shape_format(buf, sizeof(buf), mmq);
-+
-+    require(route_n > 0, "MMID route format returns byte count");
-+    require(std::strstr(buf, "route=mmq") != nullptr, "MMID trace includes route name");
-+    require(std::strstr(buf, "host_sync=0") != nullptr, "MMID trace includes host sync flag");
-+    require(std::strstr(buf, "ne2=128") != nullptr, "MMID trace includes destination batch");
-+    require(std::strstr(buf, "ne12=128") != nullptr, "MMID trace includes routed token count");
-+    require(std::strstr(buf, "n_experts=256") != nullptr, "MMID trace includes expert count");
-+    require(std::strstr(buf, "mmvq_max=4") != nullptr, "MMID trace includes MMVQ cap");
-+    require(std::strstr(buf, "use_mmq=1") != nullptr, "MMID trace includes MMQ predicate");
-+    require(std::strstr(buf, "use_mmf=1") != nullptr, "MMID trace includes MMF predicate");
-+
-     return 0;
- }
-- 
-2.43.0
-
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0061-feat-cuda-trace-mul-mat-routes.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0061-feat-cuda-trace-mul-mat-routes.patch
@@ -1,345 +0,0 @@
-From 486c28c63d5297afd06e5a2bdbd4fb89cad749cd Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Wed, 1 Jul 2026 05:49:12 +0000
-Subject: [PATCH] feat(cuda): trace mul mat routes
-
-Add a default-off LLAMA_MUL_MAT_ROUTE_TRACE diagnostic for regular MUL_MAT dispatch routes.
-
-The trace classifies projection-heavy calls without changing dispatch behavior.
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/ggml-cuda.cu      |  44 +++++++++-
- ggml/src/ggml-cuda/mmq-shape-trace.h | 117 +++++++++++++++++++++++++++
- tests/test-cuda-mmq-shape-trace.cpp  |  85 +++++++++++++++++++
- 3 files changed, 244 insertions(+), 2 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index a1754df39..cd34aff13 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2580,6 +2580,32 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
-     return use_mul_mat_vec_q;
- }
- 
-+static inline int ggml_cuda_mul_mat_route_trace_limit() {
-+    static const int value = []() {
-+        const char * s = getenv("LLAMA_MUL_MAT_ROUTE_TRACE");
-+        return s ? atoi(s) : 0;
-+    }();
-+
-+    return value;
-+}
-+
-+static inline void ggml_cuda_mul_mat_route_trace(const ggml_cuda_mul_mat_route_shape & shape) {
-+    const int trace_limit = ggml_cuda_mul_mat_route_trace_limit();
-+    if (trace_limit <= 0) {
-+        return;
-+    }
-+
-+    static std::atomic<int> trace_count{0};
-+    const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
-+    if (trace_idx >= trace_limit) {
-+        return;
-+    }
-+
-+    char buf[256];
-+    ggml_cuda_mul_mat_route_shape_format(buf, sizeof(buf), shape);
-+    fprintf(stderr, "[LLAMA_MUL_MAT_ROUTE] %s\n", buf);
-+}
-+
- static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
- 
-@@ -2591,6 +2617,10 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
-     if (!split) {
-         const int cc_fp4 = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-         if (ggml_cuda_fp4_prefill_should_engage(src0, src1, dst, cc_fp4)) {
-+            ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make(
-+                src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split,
-+                /*use_vec_f=*/false, /*use_mat_f=*/false, /*use_vec_q=*/false, /*use_mmq=*/false,
-+                /*use_batched_cublas=*/false, /*use_fp4_prefill=*/true, /*use_fwht=*/false));
-             ggml_cuda_mul_mat_fp4_large_m(ctx, src0, src1, dst);
-             return;
-         }
-@@ -2654,12 +2684,23 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
-     bool use_batched_cublas_f16  = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
-     bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
-     bool use_batched_cublas_f32  = src0->type == GGML_TYPE_F32;
-+    bool use_batched_cublas      = !split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
-+        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1;
- 
-     const int32_t hint = ggml_get_op_params_i32(dst, 1);
-     if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) {
-+        ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make(
-+            src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split,
-+            use_mul_mat_vec_f, use_mul_mat_f, use_mul_mat_vec_q, use_mul_mat_q,
-+            use_batched_cublas, /*use_fp4_prefill=*/false, /*use_fwht=*/true));
-         return;
-     }
- 
-+    ggml_cuda_mul_mat_route_trace(ggml_cuda_mul_mat_route_shape_make(
-+        src0->type, dst->ne[1], src1->ne[1], src1->ne[2], src1->ne[3], split,
-+        use_mul_mat_vec_f, use_mul_mat_f, use_mul_mat_vec_q, use_mul_mat_q,
-+        use_batched_cublas, /*use_fp4_prefill=*/false, /*use_fwht=*/false));
-+
-     if (!split && use_mul_mat_vec_f) {
-         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
-         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-@@ -2670,8 +2711,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
-         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
-     } else if (!split && use_mul_mat_q) {
-         ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
-    } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
-        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-+    } else if (use_batched_cublas) {
-         // general KQ + KQV multi-batch without FlashAttention
-         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
-     } else if (use_mul_mat_vec_f) {
-diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
-index da234a302..8ac373fd9 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
-+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
-@@ -71,6 +71,37 @@ struct ggml_cuda_mmid_route_shape {
-     bool host_sync;
- };
- 
-+enum ggml_cuda_mul_mat_route {
-+    GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL,
-+    GGML_CUDA_MUL_MAT_ROUTE_FWHT,
-+    GGML_CUDA_MUL_MAT_ROUTE_VEC_F,
-+    GGML_CUDA_MUL_MAT_ROUTE_MAT_F,
-+    GGML_CUDA_MUL_MAT_ROUTE_VEC_Q,
-+    GGML_CUDA_MUL_MAT_ROUTE_MMQ,
-+    GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS,
-+    GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F,
-+    GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q,
-+    GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ,
-+    GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS,
-+};
-+
-+struct ggml_cuda_mul_mat_route_shape {
-+    ggml_cuda_mul_mat_route route;
-+    int type;
-+    int64_t ne1;
-+    int64_t ne11;
-+    int64_t ne12;
-+    int64_t ne13;
-+    bool split;
-+    bool use_vec_f;
-+    bool use_mat_f;
-+    bool use_vec_q;
-+    bool use_mmq;
-+    bool use_batched_cublas;
-+    bool use_fp4_prefill;
-+    bool use_fwht;
-+};
-+
- static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
-     switch (route) {
-         case GGML_CUDA_MMID_ROUTE_MMVQ:     return "mmvq";
-@@ -83,6 +114,24 @@ static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route
-     return "unknown";
- }
- 
-+static inline const char * ggml_cuda_mul_mat_route_name(const ggml_cuda_mul_mat_route route) {
-+    switch (route) {
-+        case GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL:    return "fp4_prefill";
-+        case GGML_CUDA_MUL_MAT_ROUTE_FWHT:           return "fwht";
-+        case GGML_CUDA_MUL_MAT_ROUTE_VEC_F:          return "vec_f";
-+        case GGML_CUDA_MUL_MAT_ROUTE_MAT_F:          return "mat_f";
-+        case GGML_CUDA_MUL_MAT_ROUTE_VEC_Q:          return "vec_q";
-+        case GGML_CUDA_MUL_MAT_ROUTE_MMQ:            return "mmq";
-+        case GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS: return "batched_cublas";
-+        case GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F:       return "op_vec_f";
-+        case GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q:       return "op_vec_q";
-+        case GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ:         return "op_mmq";
-+        case GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS:      return "op_cublas";
-+    }
-+
-+    return "unknown";
-+}
-+
- static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-         const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
-         const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
-@@ -110,6 +159,52 @@ static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-     };
- }
- 
-+static inline ggml_cuda_mul_mat_route_shape ggml_cuda_mul_mat_route_shape_make(
-+        const int type, const int64_t ne1, const int64_t ne11, const int64_t ne12, const int64_t ne13,
-+        const bool split, const bool use_vec_f, const bool use_mat_f, const bool use_vec_q,
-+        const bool use_mmq, const bool use_batched_cublas, const bool use_fp4_prefill,
-+        const bool use_fwht) {
-+    ggml_cuda_mul_mat_route route = GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS;
-+    if (use_fp4_prefill) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL;
-+    } else if (use_fwht) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_FWHT;
-+    } else if (!split && use_vec_f) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_VEC_F;
-+    } else if (!split && use_mat_f) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_MAT_F;
-+    } else if (!split && use_vec_q) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_VEC_Q;
-+    } else if (!split && use_mmq) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_MMQ;
-+    } else if (!split && use_batched_cublas) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS;
-+    } else if (use_vec_f) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_F;
-+    } else if (use_vec_q) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_OP_VEC_Q;
-+    } else if (use_mmq) {
-+        route = GGML_CUDA_MUL_MAT_ROUTE_OP_MMQ;
-+    }
-+
-+    return {
-+        route,
-+        type,
-+        ne1,
-+        ne11,
-+        ne12,
-+        ne13,
-+        split,
-+        use_vec_f,
-+        use_mat_f,
-+        use_vec_q,
-+        use_mmq,
-+        use_batched_cublas,
-+        use_fp4_prefill,
-+        use_fwht,
-+    };
-+}
-+
- static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make(
-         const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts,
-         const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd,
-@@ -260,6 +355,28 @@ static inline int ggml_cuda_mmid_route_shape_format(
-         shape.is_quantized ? 1 : 0);
- }
- 
-+static inline int ggml_cuda_mul_mat_route_shape_format(
-+        char * buf, const size_t size, const ggml_cuda_mul_mat_route_shape & shape) {
-+    return std::snprintf(buf, size,
-+        "route=%s type=%d ne1=%lld ne11=%lld ne12=%lld ne13=%lld split=%d "
-+        "use_vec_f=%d use_mat_f=%d use_vec_q=%d use_mmq=%d use_batched_cublas=%d "
-+        "use_fp4_prefill=%d use_fwht=%d",
-+        ggml_cuda_mul_mat_route_name(shape.route),
-+        shape.type,
-+        (long long) shape.ne1,
-+        (long long) shape.ne11,
-+        (long long) shape.ne12,
-+        (long long) shape.ne13,
-+        shape.split ? 1 : 0,
-+        shape.use_vec_f ? 1 : 0,
-+        shape.use_mat_f ? 1 : 0,
-+        shape.use_vec_q ? 1 : 0,
-+        shape.use_mmq ? 1 : 0,
-+        shape.use_batched_cublas ? 1 : 0,
-+        shape.use_fp4_prefill ? 1 : 0,
-+        shape.use_fwht ? 1 : 0);
-+}
-+
- static inline int ggml_cuda_mmq_small_m_shape_format(
-         char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
-     return std::snprintf(buf, size,
-diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
-index e190cf1ac..2bd41d1d8 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
-+++ b/tests/test-cuda-mmq-shape-trace.cpp
-@@ -200,5 +200,90 @@ int main() {
-     require(std::strstr(buf, "use_mmq=1") != nullptr, "MMID trace includes MMQ predicate");
-     require(std::strstr(buf, "use_mmf=1") != nullptr, "MMID trace includes MMF predicate");
- 
-+    const ggml_cuda_mul_mat_route_shape mat_f = ggml_cuda_mul_mat_route_shape_make(
-+        /* type */ 30,
-+        /* ne1 */ 128,
-+        /* ne11 */ 128,
-+        /* ne12 */ 1,
-+        /* ne13 */ 1,
-+        /* split */ false,
-+        /* use_vec_f */ false,
-+        /* use_mat_f */ true,
-+        /* use_vec_q */ false,
-+        /* use_mmq */ false,
-+        /* use_batched_cublas */ false,
-+        /* use_fp4_prefill */ false,
-+        /* use_fwht */ false);
-+
-+    require(mat_f.route == GGML_CUDA_MUL_MAT_ROUTE_MAT_F, "regular MUL_MAT prefers direct mat_f when available");
-+    require(!mat_f.split, "regular MUL_MAT trace records split flag");
-+
-+    const ggml_cuda_mul_mat_route_shape batched = ggml_cuda_mul_mat_route_shape_make(
-+        /* type */ 31,
-+        /* ne1 */ 128,
-+        /* ne11 */ 128,
-+        /* ne12 */ 4,
-+        /* ne13 */ 1,
-+        /* split */ false,
-+        /* use_vec_f */ false,
-+        /* use_mat_f */ false,
-+        /* use_vec_q */ false,
-+        /* use_mmq */ false,
-+        /* use_batched_cublas */ true,
-+        /* use_fp4_prefill */ false,
-+        /* use_fwht */ false);
-+
-+    require(batched.route == GGML_CUDA_MUL_MAT_ROUTE_BATCHED_CUBLAS,
-+        "regular MUL_MAT records batched cuBLAS route");
-+
-+    const ggml_cuda_mul_mat_route_shape op_cublas = ggml_cuda_mul_mat_route_shape_make(
-+        /* type */ 0,
-+        /* ne1 */ 16,
-+        /* ne11 */ 16,
-+        /* ne12 */ 1,
-+        /* ne13 */ 1,
-+        /* split */ true,
-+        /* use_vec_f */ false,
-+        /* use_mat_f */ false,
-+        /* use_vec_q */ false,
-+        /* use_mmq */ false,
-+        /* use_batched_cublas */ false,
-+        /* use_fp4_prefill */ false,
-+        /* use_fwht */ false);
-+
-+    require(op_cublas.route == GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS,
-+        "regular MUL_MAT fallback records op cublas route");
-+
-+    const ggml_cuda_mul_mat_route_shape fp4_prefill = ggml_cuda_mul_mat_route_shape_make(
-+        /* type */ 39,
-+        /* ne1 */ 512,
-+        /* ne11 */ 512,
-+        /* ne12 */ 1,
-+        /* ne13 */ 1,
-+        /* split */ false,
-+        /* use_vec_f */ false,
-+        /* use_mat_f */ false,
-+        /* use_vec_q */ false,
-+        /* use_mmq */ true,
-+        /* use_batched_cublas */ false,
-+        /* use_fp4_prefill */ true,
-+        /* use_fwht */ false);
-+
-+    require(fp4_prefill.route == GGML_CUDA_MUL_MAT_ROUTE_FP4_PREFILL,
-+        "regular MUL_MAT records native FP4 prefill route before MMQ");
-+
-+    const int mul_mat_route_n = ggml_cuda_mul_mat_route_shape_format(buf, sizeof(buf), mat_f);
-+
-+    require(mul_mat_route_n > 0, "regular MUL_MAT route format returns byte count");
-+    require(std::strstr(buf, "route=mat_f") != nullptr, "regular MUL_MAT trace includes route name");
-+    require(std::strstr(buf, "type=30") != nullptr, "regular MUL_MAT trace includes type");
-+    require(std::strstr(buf, "ne1=128") != nullptr, "regular MUL_MAT trace includes output columns");
-+    require(std::strstr(buf, "ne11=128") != nullptr, "regular MUL_MAT trace includes src1 columns");
-+    require(std::strstr(buf, "ne12=1") != nullptr, "regular MUL_MAT trace includes src1 batch dim");
-+    require(std::strstr(buf, "split=0") != nullptr, "regular MUL_MAT trace includes split flag");
-+    require(std::strstr(buf, "use_mat_f=1") != nullptr, "regular MUL_MAT trace includes mat_f predicate");
-+    require(std::strstr(buf, "use_batched_cublas=0") != nullptr,
-+        "regular MUL_MAT trace includes batched cuBLAS predicate");
-+
-     return 0;
- }
-- 
-2.43.0
-
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0062-feat-cuda-trace-cublas-routes.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0062-feat-cuda-trace-cublas-routes.patch
@@ -1,332 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Wed, 1 Jul 2026 06:20:31 +0000
-Subject: [PATCH] feat(cuda): trace cublas routes
-
-Add a default-off LLAMA_CUBLAS_ROUTE_TRACE diagnostic around the generic cuBLAS MUL_MAT path.
-
-The trace classifies NVFP4/BF16/FP16/SGEMM subroutes without changing branch behavior, and extends the route helper test coverage.
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/ggml-cuda.cu      |  53 +++++++++++--
- ggml/src/ggml-cuda/mmq-shape-trace.h | 108 +++++++++++++++++++++++++++
- tests/test-cuda-mmq-shape-trace.cpp  |  62 +++++++++++++++
- 3 files changed, 216 insertions(+), 7 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index cd34aff13..eff197818 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -1627,6 +1627,32 @@ static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type
-     return compute_type;
- }
- 
-+static inline int ggml_cuda_cublas_route_trace_limit() {
-+    static const int value = []() {
-+        const char * s = getenv("LLAMA_CUBLAS_ROUTE_TRACE");
-+        return s ? atoi(s) : 0;
-+    }();
-+
-+    return value;
-+}
-+
-+static inline void ggml_cuda_cublas_route_trace(const ggml_cuda_cublas_route_shape & shape) {
-+    const int trace_limit = ggml_cuda_cublas_route_trace_limit();
-+    if (trace_limit <= 0) {
-+        return;
-+    }
-+
-+    static std::atomic<int> trace_count{0};
-+    const int trace_idx = trace_count.fetch_add(1, std::memory_order_relaxed);
-+    if (trace_idx >= trace_limit) {
-+        return;
-+    }
-+
-+    char buf[320];
-+    ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), shape);
-+    fprintf(stderr, "[LLAMA_CUBLAS_ROUTE] %s\n", buf);
-+}
-+
- static void ggml_cuda_op_mul_mat_cublas(
-     ggml_backend_cuda_context & ctx,
-     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-@@ -1662,7 +1688,22 @@ static void ggml_cuda_op_mul_mat_cublas(
-         row_diff == src0->ne[1] &&
-         dst->op_params[0] == GGML_PREC_DEFAULT;
- 
-    if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
-+    const bool src0_contig = ggml_is_contiguous(src0);
-+    const bool full_rows = row_diff == src0->ne[1];
-+    const bool fast_fp16 = fast_fp16_hardware_available(cc);
-+    bool force_fp32 = false;
-+    bool force_fp16 = false;
-+    if (fast_fp16 && use_fp16) {
-+        const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
-+        force_fp32 = force_compute_type.fp32;
-+        force_fp16 = force_compute_type.fp16;
-+    }
-+    ggml_cuda_cublas_route_trace(ggml_cuda_cublas_route_shape_make(
-+        src0->type, src1->type, row_diff, src1_ncols, ne00, ne10, ldc,
-+        supports_bf16, use_fp16, fast_fp16, force_fp32, force_fp16, src0_contig, full_rows,
-+        GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA));
-+
-+    if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && src0_contig && full_rows) {
-         // Paged prefill lever (patch 0033): NVFP4 only reaches cuBLAS when
-         // ggml_cuda_should_use_mmq() returned false (large-M dense prefill).
-         // Dequant the FP4 weights to a TRANSIENT bf16 pool buffer and run a
-@@ -1702,7 +1743,7 @@ static void ggml_cuda_op_mul_mat_cublas(
- 
-         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
-         to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
-+    } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && src0_contig && full_rows) {
-         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
-         if (src1->type != GGML_TYPE_BF16) {
-             const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
-@@ -1730,7 +1771,7 @@ static void ggml_cuda_op_mul_mat_cublas(
- 
-         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
-         to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    } else if (fast_fp16_hardware_available(cc) && use_fp16) {
-+    } else if (fast_fp16 && use_fp16) {
-         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
-         if (src0->type != GGML_TYPE_F16) {
-@@ -1754,12 +1795,10 @@ static void ggml_cuda_op_mul_mat_cublas(
- 
-         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
- 
-        const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
-
-        if (!force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
-+        if (!force_fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
-                                         || GGML_CUDA_CC_IS_RDNA4(cc)
-                                         || cc == GGML_CUDA_CC_VOLTA
-                                        || force_compute_type.fp32))
-+                                        || force_fp32))
-         {
-             const float alpha = 1.0f;
-             const float beta = 0.0f;
-diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
-index 8ac373fd9..f5b4ecf2c 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
-+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
-@@ -85,6 +85,14 @@ enum ggml_cuda_mul_mat_route {
-     GGML_CUDA_MUL_MAT_ROUTE_OP_CUBLAS,
- };
- 
-+enum ggml_cuda_cublas_route {
-+    GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC,
-+    GGML_CUDA_CUBLAS_ROUTE_BF16_TC,
-+    GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F,
-+    GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F,
-+    GGML_CUDA_CUBLAS_ROUTE_SGEMM,
-+};
-+
- struct ggml_cuda_mul_mat_route_shape {
-     ggml_cuda_mul_mat_route route;
-     int type;
-@@ -102,6 +110,27 @@ struct ggml_cuda_mul_mat_route_shape {
-     bool use_fwht;
- };
- 
-+struct ggml_cuda_cublas_route_shape {
-+    ggml_cuda_cublas_route route;
-+    int type;
-+    int src1_type;
-+    int64_t row_diff;
-+    int64_t src1_ncols;
-+    int64_t ne00;
-+    int64_t ne10;
-+    int64_t ldc;
-+    bool supports_bf16;
-+    bool use_fp16;
-+    bool fast_fp16;
-+    bool force_fp32;
-+    bool force_fp16;
-+    bool src0_contig;
-+    bool full_rows;
-+    bool is_cdna;
-+    bool is_rdna4;
-+    bool is_volta;
-+};
-+
- static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
-     switch (route) {
-         case GGML_CUDA_MMID_ROUTE_MMVQ:     return "mmvq";
-@@ -132,6 +161,18 @@ static inline const char * ggml_cuda_mul_mat_route_name(const ggml_cuda_mul_mat_
-     return "unknown";
- }
- 
-+static inline const char * ggml_cuda_cublas_route_name(const ggml_cuda_cublas_route route) {
-+    switch (route) {
-+        case GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC: return "nvfp4_bf16_tc";
-+        case GGML_CUDA_CUBLAS_ROUTE_BF16_TC:       return "bf16_tc";
-+        case GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F:    return "f16_tc_32f";
-+        case GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F:    return "f16_tc_16f";
-+        case GGML_CUDA_CUBLAS_ROUTE_SGEMM:         return "sgemm";
-+    }
-+
-+    return "unknown";
-+}
-+
- static inline ggml_cuda_mmq_shape ggml_cuda_mmq_shape_make(
-         const int type, const bool is_moe, const int64_t ncols_dst, const int64_t nchannels_x,
-         const int64_t ncols_max, const int mmq_x_max, const int mmq_x_lim, const int mmq_x_best,
-@@ -205,6 +246,47 @@ static inline ggml_cuda_mul_mat_route_shape ggml_cuda_mul_mat_route_shape_make(
-     };
- }
- 
-+static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make(
-+        const int type, const int src1_type, const int64_t row_diff, const int64_t src1_ncols,
-+        const int64_t ne00, const int64_t ne10, const int64_t ldc, const bool supports_bf16,
-+        const bool use_fp16, const bool fast_fp16, const bool force_fp32, const bool force_fp16,
-+        const bool src0_contig, const bool full_rows, const bool is_cdna, const bool is_rdna4,
-+        const bool is_volta) {
-+    ggml_cuda_cublas_route route = GGML_CUDA_CUBLAS_ROUTE_SGEMM;
-+    if (supports_bf16 && type == 40 && src0_contig && full_rows) {
-+        route = GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC;
-+    } else if (supports_bf16 && type == 30 && src0_contig && full_rows) {
-+        route = GGML_CUDA_CUBLAS_ROUTE_BF16_TC;
-+    } else if (fast_fp16 && use_fp16) {
-+        if (!force_fp16 && (is_cdna || is_rdna4 || is_volta || force_fp32)) {
-+            route = GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F;
-+        } else {
-+            route = GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F;
-+        }
-+    }
-+
-+    return {
-+        route,
-+        type,
-+        src1_type,
-+        row_diff,
-+        src1_ncols,
-+        ne00,
-+        ne10,
-+        ldc,
-+        supports_bf16,
-+        use_fp16,
-+        fast_fp16,
-+        force_fp32,
-+        force_fp16,
-+        src0_contig,
-+        full_rows,
-+        is_cdna,
-+        is_rdna4,
-+        is_volta,
-+    };
-+}
-+
- static inline ggml_cuda_mmid_route_shape ggml_cuda_mmid_route_shape_make(
-         const int type, const int64_t ne2, const int64_t ne12, const int64_t n_experts,
-         const int mmvq_max, const bool use_mmq, const bool use_mmf, const bool is_amd,
-@@ -377,6 +459,32 @@ static inline int ggml_cuda_mul_mat_route_shape_format(
-         shape.use_fwht ? 1 : 0);
- }
- 
-+static inline int ggml_cuda_cublas_route_shape_format(
-+        char * buf, const size_t size, const ggml_cuda_cublas_route_shape & shape) {
-+    return std::snprintf(buf, size,
-+        "route=%s type=%d src1_type=%d row_diff=%lld src1_ncols=%lld ne00=%lld ne10=%lld ldc=%lld "
-+        "supports_bf16=%d use_fp16=%d fast_fp16=%d force_fp32=%d force_fp16=%d "
-+        "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d",
-+        ggml_cuda_cublas_route_name(shape.route),
-+        shape.type,
-+        shape.src1_type,
-+        (long long) shape.row_diff,
-+        (long long) shape.src1_ncols,
-+        (long long) shape.ne00,
-+        (long long) shape.ne10,
-+        (long long) shape.ldc,
-+        shape.supports_bf16 ? 1 : 0,
-+        shape.use_fp16 ? 1 : 0,
-+        shape.fast_fp16 ? 1 : 0,
-+        shape.force_fp32 ? 1 : 0,
-+        shape.force_fp16 ? 1 : 0,
-+        shape.src0_contig ? 1 : 0,
-+        shape.full_rows ? 1 : 0,
-+        shape.is_cdna ? 1 : 0,
-+        shape.is_rdna4 ? 1 : 0,
-+        shape.is_volta ? 1 : 0);
-+}
-+
- static inline int ggml_cuda_mmq_small_m_shape_format(
-         char * buf, const size_t size, const ggml_cuda_mmq_small_m_shape & shape) {
-     return std::snprintf(buf, size,
-diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
-index 2bd41d1d8..1443749c3 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
-+++ b/tests/test-cuda-mmq-shape-trace.cpp
-@@ -285,5 +285,67 @@ int main() {
-     require(std::strstr(buf, "use_batched_cublas=0") != nullptr,
-         "regular MUL_MAT trace includes batched cuBLAS predicate");
- 
-+    const ggml_cuda_cublas_route_shape bf16_tc = ggml_cuda_cublas_route_shape_make(
-+        /* type */ 30,
-+        /* src1_type */ 0,
-+        /* row_diff */ 18,
-+        /* src1_ncols */ 18,
-+        /* ne00 */ 1024,
-+        /* ne10 */ 1024,
-+        /* ldc */ 18,
-+        /* supports_bf16 */ true,
-+        /* use_fp16 */ false,
-+        /* fast_fp16 */ true,
-+        /* force_fp32 */ false,
-+        /* force_fp16 */ false,
-+        /* src0_contig */ true,
-+        /* full_rows */ true,
-+        /* is_cdna */ false,
-+        /* is_rdna4 */ false,
-+        /* is_volta */ false);
-+
-+    require(bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_BF16_TC,
-+        "cuBLAS records native BF16 tensor-core route");
-+
-+    const ggml_cuda_cublas_route_shape nvfp4_bf16_tc = ggml_cuda_cublas_route_shape_make(
-+        /* type */ 40, 0, 128, 128, 1024, 1024, 128, true, false, true, false, false, true, true,
-+        false, false, false);
-+
-+    require(nvfp4_bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC,
-+        "cuBLAS records NVFP4 dequant-to-BF16 tensor-core route");
-+
-+    const ggml_cuda_cublas_route_shape f16_tc_16f = ggml_cuda_cublas_route_shape_make(
-+        /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, false, false, true, true,
-+        false, false, false);
-+
-+    require(f16_tc_16f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F,
-+        "cuBLAS records default FP16 tensor-core 16F compute route");
-+
-+    const ggml_cuda_cublas_route_shape f16_tc_32f = ggml_cuda_cublas_route_shape_make(
-+        /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, true, false, true, true,
-+        false, false, false);
-+
-+    require(f16_tc_32f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F,
-+        "cuBLAS records forced FP16 tensor-core 32F compute route");
-+
-+    const ggml_cuda_cublas_route_shape sgemm = ggml_cuda_cublas_route_shape_make(
-+        /* type */ 0, 0, 12, 12, 1024, 1024, 12, false, false, true, false, false, true, true,
-+        false, false, false);
-+
-+    require(sgemm.route == GGML_CUDA_CUBLAS_ROUTE_SGEMM,
-+        "cuBLAS records SGEMM fallback route");
-+
-+    const int cublas_route_n = ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), bf16_tc);
-+
-+    require(cublas_route_n > 0, "cuBLAS route format returns byte count");
-+    require(std::strstr(buf, "route=bf16_tc") != nullptr, "cuBLAS trace includes route name");
-+    require(std::strstr(buf, "type=30") != nullptr, "cuBLAS trace includes src0 type");
-+    require(std::strstr(buf, "src1_type=0") != nullptr, "cuBLAS trace includes src1 type");
-+    require(std::strstr(buf, "row_diff=18") != nullptr, "cuBLAS trace includes row count");
-+    require(std::strstr(buf, "src1_ncols=18") != nullptr, "cuBLAS trace includes source column count");
-+    require(std::strstr(buf, "supports_bf16=1") != nullptr, "cuBLAS trace includes BF16 predicate");
-+    require(std::strstr(buf, "force_fp32=0") != nullptr, "cuBLAS trace includes forced compute predicate");
-+    require(std::strstr(buf, "src0_contig=1") != nullptr, "cuBLAS trace includes contiguity predicate");
-+
-     return 0;
- }
-- 
-2.43.0
-
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0063-feat-cuda-trace-cublas-tensor-names.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0063-feat-cuda-trace-cublas-tensor-names.patch
@@ -1,162 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Ettore Di Giacinto <mudler@localai.io>
-Date: Wed, 1 Jul 2026 06:38:11 +0000
-Subject: [PATCH] feat(cuda): trace cublas tensor names
-
-Extend LLAMA_CUBLAS_ROUTE_TRACE with src0/src1/dst tensor names so SGEMM and BF16 cuBLAS buckets can be tied back to graph nodes.
-
-Assisted-by: Codex:gpt-5
---
- ggml/src/ggml-cuda/ggml-cuda.cu      |  5 +++--
- ggml/src/ggml-cuda/mmq-shape-trace.h | 16 +++++++++++++---
- tests/test-cuda-mmq-shape-trace.cpp  | 18 ++++++++++++------
- 3 files changed, 28 insertions(+), 11 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index eff197818..1da67d2af 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -1648,7 +1648,7 @@ static inline void ggml_cuda_cublas_route_trace(const ggml_cuda_cublas_route_sha
-         return;
-     }
- 
-    char buf[320];
-+    char buf[512];
-     ggml_cuda_cublas_route_shape_format(buf, sizeof(buf), shape);
-     fprintf(stderr, "[LLAMA_CUBLAS_ROUTE] %s\n", buf);
- }
-@@ -1701,7 +1701,8 @@ static void ggml_cuda_op_mul_mat_cublas(
-     ggml_cuda_cublas_route_trace(ggml_cuda_cublas_route_shape_make(
-         src0->type, src1->type, row_diff, src1_ncols, ne00, ne10, ldc,
-         supports_bf16, use_fp16, fast_fp16, force_fp32, force_fp16, src0_contig, full_rows,
-        GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA));
-+        GGML_CUDA_CC_IS_CDNA(cc), GGML_CUDA_CC_IS_RDNA4(cc), cc == GGML_CUDA_CC_VOLTA,
-+        src0->name, src1->name, dst->name));
- 
-     if (supports_bf16 && src0->type == GGML_TYPE_NVFP4 && src0_contig && full_rows) {
-         // Paged prefill lever (patch 0033): NVFP4 only reaches cuBLAS when
-diff --git a/ggml/src/ggml-cuda/mmq-shape-trace.h b/ggml/src/ggml-cuda/mmq-shape-trace.h
-index f5b4ecf2c..b55c7467c 100644
--- a/ggml/src/ggml-cuda/mmq-shape-trace.h
-+++ b/ggml/src/ggml-cuda/mmq-shape-trace.h
-@@ -129,6 +129,9 @@ struct ggml_cuda_cublas_route_shape {
-     bool is_cdna;
-     bool is_rdna4;
-     bool is_volta;
-+    const char * src0_name;
-+    const char * src1_name;
-+    const char * dst_name;
- };
- 
- static inline const char * ggml_cuda_mmid_route_name(const ggml_cuda_mmid_route route) {
-@@ -251,7 +254,7 @@ static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make(
-         const int64_t ne00, const int64_t ne10, const int64_t ldc, const bool supports_bf16,
-         const bool use_fp16, const bool fast_fp16, const bool force_fp32, const bool force_fp16,
-         const bool src0_contig, const bool full_rows, const bool is_cdna, const bool is_rdna4,
-        const bool is_volta) {
-+        const bool is_volta, const char * src0_name, const char * src1_name, const char * dst_name) {
-     ggml_cuda_cublas_route route = GGML_CUDA_CUBLAS_ROUTE_SGEMM;
-     if (supports_bf16 && type == 40 && src0_contig && full_rows) {
-         route = GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC;
-@@ -284,6 +287,9 @@ static inline ggml_cuda_cublas_route_shape ggml_cuda_cublas_route_shape_make(
-         is_cdna,
-         is_rdna4,
-         is_volta,
-+        src0_name ? src0_name : "",
-+        src1_name ? src1_name : "",
-+        dst_name ? dst_name : "",
-     };
- }
- 
-@@ -464,7 +470,8 @@ static inline int ggml_cuda_cublas_route_shape_format(
-     return std::snprintf(buf, size,
-         "route=%s type=%d src1_type=%d row_diff=%lld src1_ncols=%lld ne00=%lld ne10=%lld ldc=%lld "
-         "supports_bf16=%d use_fp16=%d fast_fp16=%d force_fp32=%d force_fp16=%d "
-        "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d",
-+        "src0_contig=%d full_rows=%d is_cdna=%d is_rdna4=%d is_volta=%d "
-+        "src0=%s src1=%s dst=%s",
-         ggml_cuda_cublas_route_name(shape.route),
-         shape.type,
-         shape.src1_type,
-@@ -482,7 +489,10 @@ static inline int ggml_cuda_cublas_route_shape_format(
-         shape.full_rows ? 1 : 0,
-         shape.is_cdna ? 1 : 0,
-         shape.is_rdna4 ? 1 : 0,
-        shape.is_volta ? 1 : 0);
-+        shape.is_volta ? 1 : 0,
-+        shape.src0_name,
-+        shape.src1_name,
-+        shape.dst_name);
- }
- 
- static inline int ggml_cuda_mmq_small_m_shape_format(
-diff --git a/tests/test-cuda-mmq-shape-trace.cpp b/tests/test-cuda-mmq-shape-trace.cpp
-index 1443749c3..2547193ce 100644
--- a/tests/test-cuda-mmq-shape-trace.cpp
-+++ b/tests/test-cuda-mmq-shape-trace.cpp
-@@ -27,7 +27,7 @@ int main() {
-     require(shape.n_active_est == 256, "active expert estimate is capped by expert count");
-     require(shape.density == 4, "density is ceil(assignments / active experts)");
- 
-    char buf[256];
-+    char buf[512];
-     const int n = ggml_cuda_mmq_shape_format(buf, sizeof(buf), shape);
- 
-     require(n > 0, "format returns byte count");
-@@ -302,35 +302,38 @@ int main() {
-         /* full_rows */ true,
-         /* is_cdna */ false,
-         /* is_rdna4 */ false,
-        /* is_volta */ false);
-+        /* is_volta */ false,
-+        /* src0_name */ "blk.0.proj.weight",
-+        /* src1_name */ "blk.0.proj.inp",
-+        /* dst_name */ "blk.0.proj.out");
- 
-     require(bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_BF16_TC,
-         "cuBLAS records native BF16 tensor-core route");
- 
-     const ggml_cuda_cublas_route_shape nvfp4_bf16_tc = ggml_cuda_cublas_route_shape_make(
-         /* type */ 40, 0, 128, 128, 1024, 1024, 128, true, false, true, false, false, true, true,
-        false, false, false);
-+        false, false, false, "nvfp4.weight", "nvfp4.inp", "nvfp4.out");
- 
-     require(nvfp4_bf16_tc.route == GGML_CUDA_CUBLAS_ROUTE_NVFP4_BF16_TC,
-         "cuBLAS records NVFP4 dequant-to-BF16 tensor-core route");
- 
-     const ggml_cuda_cublas_route_shape f16_tc_16f = ggml_cuda_cublas_route_shape_make(
-         /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, false, false, true, true,
-        false, false, false);
-+        false, false, false, "f16.weight", "f16.inp", "f16.out");
- 
-     require(f16_tc_16f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_16F,
-         "cuBLAS records default FP16 tensor-core 16F compute route");
- 
-     const ggml_cuda_cublas_route_shape f16_tc_32f = ggml_cuda_cublas_route_shape_make(
-         /* type */ 1, 0, 64, 64, 1024, 1024, 64, false, true, true, true, false, true, true,
-        false, false, false);
-+        false, false, false, "f16.weight", "f16.inp", "f16.out");
- 
-     require(f16_tc_32f.route == GGML_CUDA_CUBLAS_ROUTE_F16_TC_32F,
-         "cuBLAS records forced FP16 tensor-core 32F compute route");
- 
-     const ggml_cuda_cublas_route_shape sgemm = ggml_cuda_cublas_route_shape_make(
-         /* type */ 0, 0, 12, 12, 1024, 1024, 12, false, false, true, false, false, true, true,
-        false, false, false);
-+        false, false, false, "f32.weight", "f32.inp", "f32.out");
- 
-     require(sgemm.route == GGML_CUDA_CUBLAS_ROUTE_SGEMM,
-         "cuBLAS records SGEMM fallback route");
-@@ -346,6 +349,9 @@ int main() {
-     require(std::strstr(buf, "supports_bf16=1") != nullptr, "cuBLAS trace includes BF16 predicate");
-     require(std::strstr(buf, "force_fp32=0") != nullptr, "cuBLAS trace includes forced compute predicate");
-     require(std::strstr(buf, "src0_contig=1") != nullptr, "cuBLAS trace includes contiguity predicate");
-+    require(std::strstr(buf, "src0=blk.0.proj.weight") != nullptr, "cuBLAS trace includes src0 name");
-+    require(std::strstr(buf, "src1=blk.0.proj.inp") != nullptr, "cuBLAS trace includes src1 name");
-+    require(std::strstr(buf, "dst=blk.0.proj.out") != nullptr, "cuBLAS trace includes dst name");
- 
-     return 0;
- }
-- 
-2.43.0
-