From 4b6fc0fa1cf6c3314710673f350754b4d0145cd7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 30 Jun 2026 23:51:52 +0000 Subject: [PATCH] test(paged): mirror MoE weighted combine gate Assisted-by: Codex:gpt-5 --- .../docs/GB10_PARITY_PHASE0_RESULTS.md | 23 ++++ ...ged-cover-MoE-weighted-combine-chain.patch | 120 ++++++++++++++++++ .../plans/2026-06-30-serving-source-phase7.md | 24 +++- 3 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0052-test-paged-cover-MoE-weighted-combine-chain.patch diff --git a/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md b/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md index cf48a35f5..6cf174634 100644 --- a/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md +++ b/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md @@ -620,3 +620,26 @@ Result: the fused quantizer used compact GLU-output strides to read split `gate`/`up` views. Split views stride over the merged gate/up tensor; using source-view strides fixed the op gate but not the end-to-end md5 drift. + +## Phase 7 Weighted-Combine Test Gate + +Fork commit `3ef7eb9e41` added patch +`0052-test-paged-cover-MoE-weighted-combine-chain.patch`. This is a test-only +patch; it does not change the production inference path. + +The new `MOE_WEIGHTED_COMBINE` whole-graph gate covers: + +`down MUL_MAT_ID -> router-weight ggml_mul -> rank-ordered expert views/adds`. + +DGX artifact: + +- `/home/mudler/bench/phase7_source_scope/test_backend_ops_moe_weighted_combine_green.txt` + +DGX result: + +- `test-backend-ops test -b CUDA0 -o MOE_WEIGHTED_COMBINE -j 1`: `7/7`. + +This gate is the correctness target for the next candidate: a deterministic +post-down MoE weighted-combine fusion that preserves current f32 product and +rank-order add semantics while avoiding the rejected SWIGLU/FP4-quantization +shortcut. 
diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-test-paged-cover-MoE-weighted-combine-chain.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-test-paged-cover-MoE-weighted-combine-chain.patch new file mode 100644 index 000000000..e68f1da91 --- /dev/null +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0052-test-paged-cover-MoE-weighted-combine-chain.patch @@ -0,0 +1,120 @@ +From 3ef7eb9e412eb34f8656675862f6753c65d28ec9 Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Tue, 30 Jun 2026 23:50:33 +0000 +Subject: [PATCH] test(paged): cover MoE weighted combine chain + +--- + tests/test-backend-ops.cpp | 90 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 90 insertions(+) + +diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp +index aeca64802..71740ce9f 100644 +--- a/tests/test-backend-ops.cpp ++++ b/tests/test-backend-ops.cpp +@@ -4532,6 +4532,89 @@ struct test_moe_swiglu_down : public test_case { + } + }; + ++// MoE down projection -> router-weight multiply -> rank-ordered expert add. 
++struct test_moe_weighted_combine : public test_case { ++ const ggml_type type_a; ++ const int n_mats; ++ const int n_used; ++ const int64_t n_ff; ++ const int64_t n_tokens; ++ const int64_t n_embd; ++ ++ std::string vars() override { ++ return VARS_TO_STR6(type_a, n_mats, n_used, n_ff, n_tokens, n_embd); ++ } ++ ++ double max_nmse_err() override { ++ return 5e-4; ++ } ++ ++ double max_nmse_err(ggml_backend_t backend) override { ++ if ((type_a == GGML_TYPE_MXFP4 || type_a == GGML_TYPE_NVFP4) && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) { ++ return 2e-2; ++ } ++ return max_nmse_err(); ++ } ++ ++ uint64_t op_flops(ggml_tensor * t) override { ++ GGML_UNUSED(t); ++ return 2 * n_ff * n_embd * n_tokens * n_used + 2 * n_embd * n_tokens * n_used; ++ } ++ ++ test_moe_weighted_combine(ggml_type type_a = GGML_TYPE_F32, int n_mats = 128, int n_used = 8, ++ int64_t n_ff = 768, int64_t n_tokens = 128, int64_t n_embd = 2048) ++ : type_a(type_a), n_mats(n_mats), n_used(n_used), n_ff(n_ff), n_tokens(n_tokens), n_embd(n_embd) { ++ GGML_ASSERT(n_used <= n_mats); ++ } ++ ++ ggml_tensor * build_graph(ggml_context * ctx) override { ++ ggml_tensor * down = ggml_new_tensor_3d(ctx, type_a, n_ff, n_embd, n_mats); ++ ggml_set_name(down, "down"); ++ ++ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n_tokens); ++ ggml_set_name(ids, "ids"); ++ if (n_used != n_mats) { ++ ids = ggml_view_2d(ctx, ids, n_used, n_tokens, ids->nb[1], 0); ++ ggml_set_name(ids, "view_of_ids"); ++ } ++ ++ ggml_tensor * act = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_ff, n_used, n_tokens); ++ ggml_set_name(act, "act"); ++ ++ ggml_tensor * weights = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, n_used, n_tokens); ++ ggml_set_name(weights, "weights"); ++ ++ ggml_tensor * experts = ggml_mul_mat_id(ctx, down, act, ids); ++ ggml_set_name(experts, "down_out"); ++ ++ experts = ggml_mul(ctx, experts, weights); ++ ggml_set_name(experts, "weighted"); ++ ++ ggml_tensor * out = ggml_view_2d(ctx, experts, 
n_embd, n_tokens, experts->nb[2], 0); ++ ggml_set_name(out, "rank_0"); ++ ++ for (int i = 1; i < n_used; ++i) { ++ ggml_tensor * rank = ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]); ++ ggml_set_name(rank, "rank_i"); ++ out = ggml_add(ctx, out, rank); ++ ggml_set_name(out, "rank_sum"); ++ } ++ ++ return out; ++ } ++ ++ void initialize_tensors(ggml_context * ctx) override { ++ init_mul_mat_id_tensors(ctx, n_mats); ++ } ++ ++ bool run_whole_graph() override { return true; } ++ ++ std::string op_desc(ggml_tensor * t) override { ++ GGML_UNUSED(t); ++ return "MOE_WEIGHTED_COMBINE"; ++ } ++}; ++ + // GGML_OP_OUT_PROD + struct test_out_prod : public test_case { + const ggml_type type_a; +@@ -8851,6 +8934,13 @@ static std::vector> make_test_cases_eval() { + test_cases.emplace_back(new test_moe_swiglu_down(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048)); + } + ++ // [paged Phase 7] MoE down projection -> router-weight multiply -> rank-ordered ++ // expert add gate for the weighted-combine fusion candidate. ++ test_cases.emplace_back(new test_moe_weighted_combine(GGML_TYPE_F32, 8, 2, 32, 8, 64)); ++ for (int n : {16, 33, 64, 128, 130, 200}) { ++ test_cases.emplace_back(new test_moe_weighted_combine(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048)); ++ } ++ + // [paged P0 / track B] NVFP4/MXFP4 dense decode-shape mmq_y-down bit-exact gate. + // The dense FP4 weight GEMM is the track-B target; P1 lowers mmq_y (the weight-row tile) on the + // NVFP4 decode path to raise resident-CTA occupancy. 
mmq_y is a pure N-row tiling knob, so a +-- +2.43.0 + diff --git a/docs/superpowers/plans/2026-06-30-serving-source-phase7.md b/docs/superpowers/plans/2026-06-30-serving-source-phase7.md index 2da200db7..977b6ecf2 100644 --- a/docs/superpowers/plans/2026-06-30-serving-source-phase7.md +++ b/docs/superpowers/plans/2026-06-30-serving-source-phase7.md @@ -216,7 +216,10 @@ to implementation when all are true: - vLLM confirms GEMM1 -> activation -> GEMM2 -> reduce; no SWIGLU-down shortcut to copy. - Next candidate: deterministic post-down MoE weighted-combine fusion. -- [ ] Add `MOE_WEIGHTED_COMBINE` test gate in the fork before production code. +- [x] Add `MOE_WEIGHTED_COMBINE` test gate in the fork before production code. + - Fork commit: `3ef7eb9e4` (`test(paged): cover MoE weighted combine chain`). + - LocalAI patch: `0052-test-paged-cover-MoE-weighted-combine-chain.patch`. + - DGX gate: `MOE_WEIGHTED_COMBINE` `7/7` on CUDA0. - [ ] Implement weighted-combine fusion only if the test gate is stable. - [ ] Run op/md5 gates before serving A/B. @@ -305,3 +308,22 @@ allows a new paged-MoE md5 namespace and a profile shows a material bucket win. target is non-greedy. - Run existing server completion tests covering backend sampling probabilities and logit-bias behavior. + +## Patch 0052 Result + +Patch `0052` adds a whole-graph test named `MOE_WEIGHTED_COMBINE`. It covers the +post-down MoE combine candidate: + +`down MUL_MAT_ID -> router-weight ggml_mul -> rank-ordered expert views/adds`. + +Coverage: + +- one small F32 wiring case, +- NVFP4 Qwen-style cases with `n_mats=128`, `n_used=8`, `n_ff=768`, + `n_embd=2048`, and `n_tokens in {16, 33, 64, 128, 130, 200}`. + +DGX result: + +- `test-backend-ops test -b CUDA0 -o MOE_WEIGHTED_COMBINE -j 1`: `7/7`. + +This is a test-only patch and does not change the production inference path.