mirror of
https://github.com/mudler/LocalAI.git
synced 2026-07-02 20:37:03 -04:00
test(paged): mirror MoE weighted combine gate
Assisted-by: Codex:gpt-5
This commit is contained in:
@@ -620,3 +620,26 @@ Result:
|
||||
the fused quantizer used compact GLU-output strides to read split `gate`/`up`
|
||||
views. Split views stride over the merged gate/up tensor; using source-view
|
||||
strides fixed the op gate but not the end-to-end md5 drift.
|
||||
|
||||
## Phase 7 Weighted-Combine Test Gate
|
||||
|
||||
Fork commit `3ef7eb9e41` added patch
|
||||
`0052-test-paged-cover-MoE-weighted-combine-chain.patch`. This is a test-only
|
||||
patch; it does not change the production inference path.
|
||||
|
||||
The new `MOE_WEIGHTED_COMBINE` whole-graph gate covers:
|
||||
|
||||
`down MUL_MAT_ID -> router-weight ggml_mul -> rank-ordered expert views/adds`.
|
||||
|
||||
DGX artifact:
|
||||
|
||||
- `/home/mudler/bench/phase7_source_scope/test_backend_ops_moe_weighted_combine_green.txt`
|
||||
|
||||
DGX result:
|
||||
|
||||
- `test-backend-ops test -b CUDA0 -o MOE_WEIGHTED_COMBINE -j 1`: `7/7`.
|
||||
|
||||
This gate is the correctness target for the next candidate: a deterministic
|
||||
post-down MoE weighted-combine fusion that preserves current f32 product and
|
||||
rank-order add semantics while avoiding the rejected SWIGLU/FP4-quantization
|
||||
shortcut.
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
From 3ef7eb9e412eb34f8656675862f6753c65d28ec9 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 23:50:33 +0000
|
||||
Subject: [PATCH] test(paged): cover MoE weighted combine chain
|
||||
|
||||
---
|
||||
tests/test-backend-ops.cpp | 90 ++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 90 insertions(+)
|
||||
|
||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
||||
index aeca64802..71740ce9f 100644
|
||||
--- a/tests/test-backend-ops.cpp
|
||||
+++ b/tests/test-backend-ops.cpp
|
||||
@@ -4532,6 +4532,89 @@ struct test_moe_swiglu_down : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
+// MoE down projection -> router-weight multiply -> rank-ordered expert add.
|
||||
+struct test_moe_weighted_combine : public test_case {
|
||||
+ const ggml_type type_a;
|
||||
+ const int n_mats;
|
||||
+ const int n_used;
|
||||
+ const int64_t n_ff;
|
||||
+ const int64_t n_tokens;
|
||||
+ const int64_t n_embd;
|
||||
+
|
||||
+ std::string vars() override {
|
||||
+ return VARS_TO_STR6(type_a, n_mats, n_used, n_ff, n_tokens, n_embd);
|
||||
+ }
|
||||
+
|
||||
+ double max_nmse_err() override {
|
||||
+ return 5e-4;
|
||||
+ }
|
||||
+
|
||||
+ double max_nmse_err(ggml_backend_t backend) override {
|
||||
+ if ((type_a == GGML_TYPE_MXFP4 || type_a == GGML_TYPE_NVFP4) && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
|
||||
+ return 2e-2;
|
||||
+ }
|
||||
+ return max_nmse_err();
|
||||
+ }
|
||||
+
|
||||
+ uint64_t op_flops(ggml_tensor * t) override {
|
||||
+ GGML_UNUSED(t);
|
||||
+ return 2 * n_ff * n_embd * n_tokens * n_used + 2 * n_embd * n_tokens * n_used;
|
||||
+ }
|
||||
+
|
||||
+ test_moe_weighted_combine(ggml_type type_a = GGML_TYPE_F32, int n_mats = 128, int n_used = 8,
|
||||
+ int64_t n_ff = 768, int64_t n_tokens = 128, int64_t n_embd = 2048)
|
||||
+ : type_a(type_a), n_mats(n_mats), n_used(n_used), n_ff(n_ff), n_tokens(n_tokens), n_embd(n_embd) {
|
||||
+ GGML_ASSERT(n_used <= n_mats);
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
+ ggml_tensor * down = ggml_new_tensor_3d(ctx, type_a, n_ff, n_embd, n_mats);
|
||||
+ ggml_set_name(down, "down");
|
||||
+
|
||||
+ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n_tokens);
|
||||
+ ggml_set_name(ids, "ids");
|
||||
+ if (n_used != n_mats) {
|
||||
+ ids = ggml_view_2d(ctx, ids, n_used, n_tokens, ids->nb[1], 0);
|
||||
+ ggml_set_name(ids, "view_of_ids");
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * act = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_ff, n_used, n_tokens);
|
||||
+ ggml_set_name(act, "act");
|
||||
+
|
||||
+ ggml_tensor * weights = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, n_used, n_tokens);
|
||||
+ ggml_set_name(weights, "weights");
|
||||
+
|
||||
+ ggml_tensor * experts = ggml_mul_mat_id(ctx, down, act, ids);
|
||||
+ ggml_set_name(experts, "down_out");
|
||||
+
|
||||
+ experts = ggml_mul(ctx, experts, weights);
|
||||
+ ggml_set_name(experts, "weighted");
|
||||
+
|
||||
+ ggml_tensor * out = ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0);
|
||||
+ ggml_set_name(out, "rank_0");
|
||||
+
|
||||
+ for (int i = 1; i < n_used; ++i) {
|
||||
+ ggml_tensor * rank = ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
|
||||
+ ggml_set_name(rank, "rank_i");
|
||||
+ out = ggml_add(ctx, out, rank);
|
||||
+ ggml_set_name(out, "rank_sum");
|
||||
+ }
|
||||
+
|
||||
+ return out;
|
||||
+ }
|
||||
+
|
||||
+ void initialize_tensors(ggml_context * ctx) override {
|
||||
+ init_mul_mat_id_tensors(ctx, n_mats);
|
||||
+ }
|
||||
+
|
||||
+ bool run_whole_graph() override { return true; }
|
||||
+
|
||||
+ std::string op_desc(ggml_tensor * t) override {
|
||||
+ GGML_UNUSED(t);
|
||||
+ return "MOE_WEIGHTED_COMBINE";
|
||||
+ }
|
||||
+};
|
||||
+
|
||||
// GGML_OP_OUT_PROD
|
||||
struct test_out_prod : public test_case {
|
||||
const ggml_type type_a;
|
||||
@@ -8851,6 +8934,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_moe_swiglu_down(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048));
|
||||
}
|
||||
|
||||
+ // [paged Phase 7] MoE down projection -> router-weight multiply -> rank-ordered
|
||||
+ // expert add gate for the weighted-combine fusion candidate.
|
||||
+ test_cases.emplace_back(new test_moe_weighted_combine(GGML_TYPE_F32, 8, 2, 32, 8, 64));
|
||||
+ for (int n : {16, 33, 64, 128, 130, 200}) {
|
||||
+ test_cases.emplace_back(new test_moe_weighted_combine(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048));
|
||||
+ }
|
||||
+
|
||||
// [paged P0 / track B] NVFP4/MXFP4 dense decode-shape mmq_y-down bit-exact gate.
|
||||
// The dense FP4 weight GEMM is the track-B target; P1 lowers mmq_y (the weight-row tile) on the
|
||||
// NVFP4 decode path to raise resident-CTA occupancy. mmq_y is a pure N-row tiling knob, so a
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@@ -216,7 +216,10 @@ to implementation when all are true:
|
||||
- vLLM confirms GEMM1 -> activation -> GEMM2 -> reduce; no SWIGLU-down
|
||||
shortcut to copy.
|
||||
- Next candidate: deterministic post-down MoE weighted-combine fusion.
|
||||
- [ ] Add `MOE_WEIGHTED_COMBINE` test gate in the fork before production code.
|
||||
- [x] Add `MOE_WEIGHTED_COMBINE` test gate in the fork before production code.
|
||||
- Fork commit: `3ef7eb9e4` (`test(paged): cover MoE weighted combine chain`).
|
||||
- LocalAI patch: `0052-test-paged-cover-MoE-weighted-combine-chain.patch`.
|
||||
- DGX gate: `MOE_WEIGHTED_COMBINE` `7/7` on CUDA0.
|
||||
- [ ] Implement weighted-combine fusion only if the test gate is stable.
|
||||
- [ ] Run op/md5 gates before serving A/B.
|
||||
|
||||
@@ -305,3 +308,22 @@ allows a new paged-MoE md5 namespace and a profile shows a material bucket win.
|
||||
target is non-greedy.
|
||||
- Run existing server completion tests covering backend sampling probabilities
|
||||
and logit-bias behavior.
|
||||
|
||||
## Patch 0052 Result
|
||||
|
||||
Patch `0052` adds a whole-graph test named `MOE_WEIGHTED_COMBINE`. It covers the
|
||||
post-down MoE combine candidate:
|
||||
|
||||
`down MUL_MAT_ID -> router-weight ggml_mul -> rank-ordered expert views/adds`.
|
||||
|
||||
Coverage:
|
||||
|
||||
- one small F32 wiring case,
|
||||
- NVFP4 Qwen-style cases with `n_mats=128`, `n_used=8`, `n_ff=768`,
|
||||
`n_embd=2048`, and `n_tokens in {16, 33, 64, 128, 130, 200}`.
|
||||
|
||||
DGX result:
|
||||
|
||||
- `test-backend-ops test -b CUDA0 -o MOE_WEIGHTED_COMBINE -j 1`: `7/7`.
|
||||
|
||||
This is a test-only patch and does not change the production inference path.
|
||||
|
||||
Reference in New Issue
Block a user