From b009de0ee0f5b15ba7e6dbfc204220f33a3cee74 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Wed, 1 Jul 2026 00:41:21 +0000
Subject: [PATCH] test(paged): mirror ragged MoE dispatch gate

Assisted-by: Codex:gpt-5
---
 .../docs/GB10_PARITY_PHASE0_RESULTS.md        |  29 ++++
 ...test-paged-cover-ragged-MoE-dispatch.patch | 148 ++++++++++++++++++
 .../2026-07-01-serving-ragged-moe-phase8.md   |  32 +++-
 3 files changed, 206 insertions(+), 3 deletions(-)
 create mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch

diff --git a/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md b/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md
index 09c2a8c25..6b1973f5e 100644
--- a/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md
+++ b/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md
@@ -746,3 +746,32 @@ Decision:
 - Do not implement fused dispatch yet. Standalone `mm_ids`/`gather_mmq` helper
   time is small; a source patch must reduce the larger grouped-MMQ/activation
   movement bucket and still beat the `+5%` serving A/B gate.
+
+## Phase 8 Ragged MoE Dispatch Test Gate
+
+Fork commit `e21732fc4` is mirrored into LocalAI as patch
+`0053-test-paged-cover-ragged-MoE-dispatch.patch`. This is a test-only patch;
+it does not change the production inference path.
+
+The new `MUL_MAT_ID_RAGGED_MOE` gate covers:
+
+- one small F32 wiring case,
+- NVFP4 with `n_mats=256`, `n_used=8`, `m=768`, `k=2048`,
+  `n in {1, 8, 33, 128, 257}`,
+- deterministic unique top-k ids skewed toward hot experts, including expert
+  `255`, leaving many experts empty.
+
+DGX artifact:
+
+- `/home/mudler/bench/phase8_ragged_moe_dispatch/test_backend_ops_mul_mat_id_ragged_moe_fixed.txt`
+
+DGX result:
+
+- `test-backend-ops test -b CUDA0 -o MUL_MAT_ID_RAGGED_MOE -j 1`: `6/6` tests passed.
+
+Debug note:
+
+- The first version of the gate failed because the deterministic IDs produced
+  duplicate expert IDs within token 0. That is not a valid top-k routing shape
+  and caused a CPU/CUDA mismatch followed by a CUDA fault. The committed gate
+  preserves unique expert IDs per token while keeping cross-token load skew.
diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch
new file mode 100644
index 000000000..cfc8e9736
--- /dev/null
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch
@@ -0,0 +1,148 @@
+From e21732fc47206d5878e3b977bbd21858a3ba4ab0 Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Wed, 1 Jul 2026 00:39:52 +0000
+Subject: [PATCH] test(paged): cover ragged MoE dispatch
+
+---
+ tests/test-backend-ops.cpp | 118 +++++++++++++++++++++++++++++++++++++
+ 1 file changed, 118 insertions(+)
+
+diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
+index 71740ce9f..8c41ae56a 100644
+--- a/tests/test-backend-ops.cpp
++++ b/tests/test-backend-ops.cpp
+@@ -4615,6 +4615,115 @@ struct test_moe_weighted_combine : public test_case {
+     }
+ };
+ 
++// Ragged 256-expert MoE dispatch gate for serving decode.
++struct test_mul_mat_id_ragged_moe : public test_case {
++    const ggml_type type_a;
++    const int n_mats;
++    const int n_used;
++    const int64_t m;
++    const int64_t n;
++    const int64_t k;
++
++    std::string vars() override {
++        return VARS_TO_STR6(type_a, n_mats, n_used, m, n, k);
++    }
++
++    double max_nmse_err() override {
++        return 5e-4;
++    }
++
++    double max_nmse_err(ggml_backend_t backend) override {
++        if ((type_a == GGML_TYPE_MXFP4 || type_a == GGML_TYPE_NVFP4) && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
++            return 2e-2;
++        }
++        return max_nmse_err();
++    }
++
++    uint64_t op_flops(ggml_tensor * t) override {
++        GGML_UNUSED(t);
++        return 2 * m * k * n * n_used;
++    }
++
++    test_mul_mat_id_ragged_moe(ggml_type type_a = GGML_TYPE_NVFP4, int n_mats = 256, int n_used = 8,
++            int64_t m = 768, int64_t n = 128, int64_t k = 2048)
++        : type_a(type_a), n_mats(n_mats), n_used(n_used), m(m), n(n), k(k) {
++            GGML_ASSERT(n_used <= n_mats);
++        }
++
++    ggml_tensor * build_graph(ggml_context * ctx) override {
++        ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
++        ggml_set_name(as, "as");
++
++        ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
++        ggml_set_name(ids, "ids");
++        if (n_used != n_mats) {
++            ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
++            ggml_set_name(ids, "view_of_ids");
++        }
++
++        ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, k, n_used, n);
++        ggml_set_name(b, "b");
++
++        ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
++        ggml_set_name(out, "out");
++
++        return out;
++    }
++
++    void initialize_tensors(ggml_context * ctx) override {
++        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
++            if (ggml_is_view_op(t->op)) {
++                continue;
++            }
++            if (t->type != GGML_TYPE_I32) {
++                init_tensor_uniform(t);
++                continue;
++            }
++
++            std::vector<int32_t> data(t->ne[0]);
++            for (int64_t token = 0; token < ggml_nrows(t); ++token) {
++                for (int64_t r = 0; r < t->ne[0]; ++r) {
++                    data[r] = (int32_t) ((token * 17 + r * 31) % n_mats);
++                }
++
++                if (n_used >= 8) {
++                    // Skew rank 0 heavily to expert 0, exercise max expert id,
++                    // leave many experts empty, and preserve unique top-k ids.
++                    std::vector<bool> used(n_mats, false);
++                    const int64_t seeds[8] = {
++                        0,
++                        1 + token % 4,
++                        4 + (token * 3) % 8,
++                        n_mats - 1,
++                        token * 5 + 7,
++                        token * 7 + 11,
++                        token * 13 + 19,
++                        token * 29 + 23,
++                    };
++
++                    for (int64_t r = 0; r < 8; ++r) {
++                        int32_t id = (int32_t) (seeds[r] % n_mats);
++                        while (used[id]) {
++                            id = (id + 1) % n_mats;
++                        }
++                        data[r] = id;
++                        used[id] = true;
++                    }
++                }
++
++                ggml_backend_tensor_set(t, data.data(), token * t->nb[1], t->ne[0] * sizeof(int32_t));
++            }
++        }
++    }
++
++    bool run_whole_graph() override { return true; }
++
++    std::string op_desc(ggml_tensor * t) override {
++        GGML_UNUSED(t);
++        return "MUL_MAT_ID_RAGGED_MOE";
++    }
++};
++
+ // GGML_OP_OUT_PROD
+ struct test_out_prod : public test_case {
+     const ggml_type type_a;
+@@ -8941,6 +9050,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
+         test_cases.emplace_back(new test_moe_weighted_combine(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048));
+     }
+ 
++    // [paged Phase 8] Ragged 256-expert MoE dispatch gate for live serving decode.
++    // Deterministic ids skew many tokens into a few hot experts, include expert 255,
++    // and leave many experts empty. n=1 covers single-token decode; n=257 crosses
++    // the MMVQ/MMID batch cutoff while preserving top-8 routing.
++    test_cases.emplace_back(new test_mul_mat_id_ragged_moe(GGML_TYPE_F32, 16, 8, 32, 8, 64));
++    for (int n : {1, 8, 33, 128, 257}) {
++        test_cases.emplace_back(new test_mul_mat_id_ragged_moe(GGML_TYPE_NVFP4, 256, 8, 768, n, 2048));
++    }
++
+     // [paged P0 / track B] NVFP4/MXFP4 dense decode-shape mmq_y-down bit-exact gate.
+     // The dense FP4 weight GEMM is the track-B target; P1 lowers mmq_y (the weight-row tile) on the
+     // NVFP4 decode path to raise resident-CTA occupancy. mmq_y is a pure N-row tiling knob, so a
+-- 
+2.43.0
+
diff --git a/docs/superpowers/plans/2026-07-01-serving-ragged-moe-phase8.md b/docs/superpowers/plans/2026-07-01-serving-ragged-moe-phase8.md
index aa8ef17ac..8a4f1f5f7 100644
--- a/docs/superpowers/plans/2026-07-01-serving-ragged-moe-phase8.md
+++ b/docs/superpowers/plans/2026-07-01-serving-ragged-moe-phase8.md
@@ -282,7 +282,7 @@ Selected Phase 8 candidate:
 - Mirror patch under:
   `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/patches/paged/`
 
-- [ ] **Step 1: Add a test-only fork patch**
+- [x] **Step 1: Add a test-only fork patch**
 
   Add a `MUL_MAT_ID_RAGGED_MOE` whole-graph test that exercises:
 
@@ -292,7 +292,19 @@ Selected Phase 8 candidate:
   - `n_tokens in {1, 8, 33, 128, 257}`
   - explicitly empty experts and high skew into 1-row experts
 
-- [ ] **Step 2: Run red/green if the test exposes a missing path**
+  Result:
+
+  - Fork commit: `e21732fc4` (`test(paged): cover ragged MoE dispatch`).
+  - LocalAI patch:
+    `0053-test-paged-cover-ragged-MoE-dispatch.patch`.
+  - Coverage:
+    - one small F32 wiring case,
+    - NVFP4 with `n_mats=256`, `n_used=8`, `m=768`, `k=2048`,
+      `n in {1, 8, 33, 128, 257}`,
+    - deterministic unique top-k ids skewed toward hot experts, including
+      expert `255`, with many empty experts.
+
+- [x] **Step 2: Run red/green if the test exposes a missing path**
 
   Run:
 
@@ -305,7 +317,16 @@ Selected Phase 8 candidate:
   - Existing path should pass. If it fails, stop and debug before production
     code.
 
-- [ ] **Step 3: Mirror the test patch**
+  Result:
+
+  - Initial test failed because the first deterministic ID pattern created
+    duplicate expert IDs within the same token, which is not a valid top-k
+    routing shape. The corrected gate preserves unique expert IDs per token.
+  - DGX artifact:
+    `/home/mudler/bench/phase8_ragged_moe_dispatch/test_backend_ops_mul_mat_id_ragged_moe_fixed.txt`.
+  - Result: `MUL_MAT_ID_RAGGED_MOE` passed `6/6` tests on CUDA0.
+
+- [x] **Step 3: Mirror the test patch**
 
   Generate with:
 
@@ -315,6 +336,11 @@ Selected Phase 8 candidate:
 
   Copy into LocalAI only after checking patch order.
 
+  Result:
+
+  - Patch `0053-test-paged-cover-ragged-MoE-dispatch.patch` added after
+    `0052-test-paged-cover-MoE-weighted-combine-chain.patch`.
+
 ## Task 3: Default-Off Fused Dispatch Prototype If Promoted
 
 **Files:**