From b009de0ee0f5b15ba7e6dbfc204220f33a3cee74 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 1 Jul 2026 00:41:21 +0000 Subject: [PATCH] test(paged): mirror ragged MoE dispatch gate Assisted-by: Codex:gpt-5 --- .../docs/GB10_PARITY_PHASE0_RESULTS.md | 29 ++++ ...test-paged-cover-ragged-MoE-dispatch.patch | 148 ++++++++++++++++++ .../2026-07-01-serving-ragged-moe-phase8.md | 32 +++- 3 files changed, 206 insertions(+), 3 deletions(-) create mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch diff --git a/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md b/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md index 09c2a8c25..6b1973f5e 100644 --- a/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md +++ b/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md @@ -746,3 +746,32 @@ Decision: - Do not implement fused dispatch yet. Standalone `mm_ids`/`gather_mmq` helper time is small; a source patch must reduce the larger grouped-MMQ/activation movement bucket and still beat the `+5%` serving A/B gate. + +## Phase 8 Ragged MoE Dispatch Test Gate + +Fork commit `e21732fc4` added patch +`0053-test-paged-cover-ragged-MoE-dispatch.patch`. This is a test-only patch; +it does not change the production inference path. + +The new `MUL_MAT_ID_RAGGED_MOE` gate covers: + +- one small F32 wiring case, +- NVFP4 with `n_mats=256`, `n_used=8`, `m=768`, `k=2048`, + `n in {1, 8, 33, 128, 257}`, +- deterministic unique top-k ids skewed toward hot experts, including expert + `255`, leaving many experts empty. + +DGX artifact: + +- `/home/mudler/bench/phase8_ragged_moe_dispatch/test_backend_ops_mul_mat_id_ragged_moe_fixed.txt` + +DGX result: + +- `test-backend-ops test -b CUDA0 -o MUL_MAT_ID_RAGGED_MOE -j 1`: `6/6`. + +Debug note: + +- The first version of the gate failed because the deterministic IDs produced + duplicate expert IDs within token 0. 
That is not a valid top-k routing shape + and caused a CPU/CUDA mismatch followed by a CUDA fault. The committed gate + preserves unique expert IDs per token while keeping cross-token load skew. diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch new file mode 100644 index 000000000..cfc8e9736 --- /dev/null +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-test-paged-cover-ragged-MoE-dispatch.patch @@ -0,0 +1,148 @@ +From e21732fc47206d5878e3b977bbd21858a3ba4ab0 Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Wed, 1 Jul 2026 00:39:52 +0000 +Subject: [PATCH] test(paged): cover ragged MoE dispatch + +--- + tests/test-backend-ops.cpp | 118 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 118 insertions(+) + +diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp +index 71740ce9f..8c41ae56a 100644 +--- a/tests/test-backend-ops.cpp ++++ b/tests/test-backend-ops.cpp +@@ -4615,6 +4615,115 @@ struct test_moe_weighted_combine : public test_case { + } + }; + ++// Ragged 256-expert MoE dispatch gate for serving decode. 
++struct test_mul_mat_id_ragged_moe : public test_case { ++ const ggml_type type_a; ++ const int n_mats; ++ const int n_used; ++ const int64_t m; ++ const int64_t n; ++ const int64_t k; ++ ++ std::string vars() override { ++ return VARS_TO_STR6(type_a, n_mats, n_used, m, n, k); ++ } ++ ++ double max_nmse_err() override { ++ return 5e-4; ++ } ++ ++ double max_nmse_err(ggml_backend_t backend) override { ++ if ((type_a == GGML_TYPE_MXFP4 || type_a == GGML_TYPE_NVFP4) && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) { ++ return 2e-2; ++ } ++ return max_nmse_err(); ++ } ++ ++ uint64_t op_flops(ggml_tensor * t) override { ++ GGML_UNUSED(t); ++ return 2 * m * k * n * n_used; ++ } ++ ++ test_mul_mat_id_ragged_moe(ggml_type type_a = GGML_TYPE_NVFP4, int n_mats = 256, int n_used = 8, ++ int64_t m = 768, int64_t n = 128, int64_t k = 2048) ++ : type_a(type_a), n_mats(n_mats), n_used(n_used), m(m), n(n), k(k) { ++ GGML_ASSERT(n_used <= n_mats); ++ } ++ ++ ggml_tensor * build_graph(ggml_context * ctx) override { ++ ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); ++ ggml_set_name(as, "as"); ++ ++ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); ++ ggml_set_name(ids, "ids"); ++ if (n_used != n_mats) { ++ ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0); ++ ggml_set_name(ids, "view_of_ids"); ++ } ++ ++ ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, k, n_used, n); ++ ggml_set_name(b, "b"); ++ ++ ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids); ++ ggml_set_name(out, "out"); ++ ++ return out; ++ } ++ ++ void initialize_tensors(ggml_context * ctx) override { ++ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { ++ if (ggml_is_view_op(t->op)) { ++ continue; ++ } ++ if (t->type != GGML_TYPE_I32) { ++ init_tensor_uniform(t); ++ continue; ++ } ++ ++ std::vector<int32_t> data(t->ne[0]); ++ for (int64_t token = 0; token < ggml_nrows(t); ++token) { ++ for (int64_t r = 0; r < t->ne[0]; 
++r) { ++ data[r] = (int32_t) ((token * 17 + r * 31) % n_mats); ++ } ++ ++ if (n_used >= 8) { ++ // Skew rank 0 heavily to expert 0, exercise max expert id, ++ // leave many experts empty, and preserve unique top-k ids. ++ std::vector<bool> used(n_mats, false); ++ const int64_t seeds[8] = { ++ 0, ++ 1 + token % 4, ++ 4 + (token * 3) % 8, ++ n_mats - 1, ++ token * 5 + 7, ++ token * 7 + 11, ++ token * 13 + 19, ++ token * 29 + 23, ++ }; ++ ++ for (int64_t r = 0; r < 8; ++r) { ++ int32_t id = (int32_t) (seeds[r] % n_mats); ++ while (used[id]) { ++ id = (id + 1) % n_mats; ++ } ++ data[r] = id; ++ used[id] = true; ++ } ++ } ++ ++ ggml_backend_tensor_set(t, data.data(), token * t->nb[1], t->ne[0] * sizeof(int32_t)); ++ } ++ } ++ } ++ ++ bool run_whole_graph() override { return true; } ++ ++ std::string op_desc(ggml_tensor * t) override { ++ GGML_UNUSED(t); ++ return "MUL_MAT_ID_RAGGED_MOE"; ++ } ++}; ++ + // GGML_OP_OUT_PROD + struct test_out_prod : public test_case { + const ggml_type type_a; +@@ -8941,6 +9050,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { + test_cases.emplace_back(new test_moe_weighted_combine(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048)); + } + ++ // [paged Phase 8] Ragged 256-expert MoE dispatch gate for live serving decode. ++ // Deterministic ids skew many tokens into a few hot experts, include expert 255, ++ // and leave many experts empty. n=1 covers single-token decode; n=257 crosses ++ // the MMVQ/MMID batch cutoff while preserving top-8 routing. ++ test_cases.emplace_back(new test_mul_mat_id_ragged_moe(GGML_TYPE_F32, 16, 8, 32, 8, 64)); ++ for (int n : {1, 8, 33, 128, 257}) { ++ test_cases.emplace_back(new test_mul_mat_id_ragged_moe(GGML_TYPE_NVFP4, 256, 8, 768, n, 2048)); ++ } ++ + // [paged P0 / track B] NVFP4/MXFP4 dense decode-shape mmq_y-down bit-exact gate. + // The dense FP4 weight GEMM is the track-B target; P1 lowers mmq_y (the weight-row tile) on the + // NVFP4 decode path to raise resident-CTA occupancy. 
mmq_y is a pure N-row tiling knob, so a +-- +2.43.0 + diff --git a/docs/superpowers/plans/2026-07-01-serving-ragged-moe-phase8.md b/docs/superpowers/plans/2026-07-01-serving-ragged-moe-phase8.md index aa8ef17ac..8a4f1f5f7 100644 --- a/docs/superpowers/plans/2026-07-01-serving-ragged-moe-phase8.md +++ b/docs/superpowers/plans/2026-07-01-serving-ragged-moe-phase8.md @@ -282,7 +282,7 @@ Selected Phase 8 candidate: - Mirror patch under: `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention/backend/cpp/llama-cpp-localai-paged/patches/paged/` -- [ ] **Step 1: Add a test-only fork patch** +- [x] **Step 1: Add a test-only fork patch** Add a `MUL_MAT_ID_RAGGED_MOE` whole-graph test that exercises: @@ -292,7 +292,19 @@ Selected Phase 8 candidate: - `n_tokens in {1, 8, 33, 128, 257}` - explicitly empty experts and high skew into 1-row experts -- [ ] **Step 2: Run red/green if the test exposes a missing path** + Result: + + - Fork commit: `e21732fc4` (`test(paged): cover ragged MoE dispatch`). + - LocalAI patch: + `0053-test-paged-cover-ragged-MoE-dispatch.patch`. + - Coverage: + - one small F32 wiring case, + - NVFP4 with `n_mats=256`, `n_used=8`, `m=768`, `k=2048`, + `n in {1, 8, 33, 128, 257}`. + - deterministic unique top-k ids skewed toward hot experts, including + expert `255`, with many empty experts. + +- [x] **Step 2: Run red/green if the test exposes a missing path** Run: @@ -305,7 +317,16 @@ Selected Phase 8 candidate: - Existing path should pass. If it fails, stop and debug before production code. -- [ ] **Step 3: Mirror the test patch** + Result: + + - Initial test failed because the first deterministic ID pattern created + duplicate expert IDs within the same token, which is not a valid top-k + routing shape. The corrected gate preserves unique expert IDs per token. + - DGX artifact: + `/home/mudler/bench/phase8_ragged_moe_dispatch/test_backend_ops_mul_mat_id_ragged_moe_fixed.txt`. + - Result: `MUL_MAT_ID_RAGGED_MOE` `6/6` on CUDA0. 
+ +- [x] **Step 3: Mirror the test patch** Generate with: @@ -315,6 +336,11 @@ Selected Phase 8 candidate: Copy into LocalAI only after checking patch order. + Result: + + - Patch `0053-test-paged-cover-ragged-MoE-dispatch.patch` added after + `0052-test-paged-cover-MoE-weighted-combine-chain.patch`. + ## Task 3: Default-Off Fused Dispatch Prototype If Promoted **Files:**