mirror of
https://github.com/mudler/LocalAI.git
synced 2026-07-03 04:46:54 -04:00
test(paged): mirror MoE swiglu down gate
Mirror the llama.cpp Phase 7 test gate for the merged MoE gate_up/SWIGLU/down chain and record the DGX md5/op gate evidence.

Assisted-by: Codex:gpt-5
This commit is contained in:
@@ -340,7 +340,7 @@ Performance:
|
||||
|
||||
Result:
|
||||
|
||||
-- Rejected. No fork commit and no LocalAI patch `0051`.
|
||||
+- Rejected. No fork commit and no LocalAI patch was created for that experiment.
|
||||
- The local fork experiment was reverted.
|
||||
- Do not ship Wq padding alone; the measured `+0.4%` / `+0.6%` default-shape
|
||||
gain is below the maintenance threshold.
|
||||
@@ -388,14 +388,14 @@ Second clean build attempt:
|
||||
- Branch: `localai-paged`
|
||||
- Working tree: clean after fork commit `d9b9be0bee3d7239132bfca05d5b057ff4ee4cc3`
|
||||
- Phase 0 HEAD: `51168c5eee2e35348d9006f0b2fab3dc6e7c01cc`
|
||||
-- Current HEAD: `d9b9be0bee3d7239132bfca05d5b057ff4ee4cc3`
|
||||
+- Current HEAD: `cd56cf037379b084d6bb0ed47db8b785c828be86`
|
||||
- Base pin: `0ed235ea2c17a19fc8238668653946721ed136fd`
|
||||
- Merge-base with base pin: `0ed235ea2c17a19fc8238668653946721ed136fd`
|
||||
-- LocalAI patch count: `38` at Phase 0; current mirror count is `41` after
|
||||
-patch `0050`.
|
||||
+- LocalAI patch count: `38` at Phase 0; current mirror count is `42` after
|
||||
+patch `0051`.
|
||||
- LocalAI patch mirror: applies cleanly to the base pin and tree-matches fork
|
||||
HEAD.
|
||||
-- Tree hash after patch application: `8fcb151e0620fd0fc82b80c04318e5c34320b087`
|
||||
+- Tree hash after patch application: `623b7cb008a929455ca3d9deae35494c02622fef`
|
||||
|
||||
## Existing Artifact Gap Review
|
||||
|
||||
@@ -562,3 +562,19 @@ Result:
|
||||
input/sampler uploads, with a workload that proves the target bucket matters.
|
||||
- Phase 7 must keep the canonical MoE and dense md5 gates as the first
|
||||
inference-safety check before any performance result is accepted.
|
||||
|
||||
## Phase 7 Source-Candidate Test Gate
|
||||
|
||||
Fork commit `cd56cf037379b084d6bb0ed47db8b785c828be86` added patch
|
||||
`0051-test-paged-cover-MoE-swiglu-down-chain.patch`. This is a test-only patch;
|
||||
it does not change the production inference path.
|
||||
|
||||
Fresh DGX gates from `/home/mudler/bench/phase7_source_scope/`:
|
||||
|
||||
- MoE greedy md5: `8cb0ce23777bf55f92f63d0292c756b0`.
|
||||
- Dense greedy md5: `5951a5b4d624ce891e22ab5fca9bc439`.
|
||||
- Baseline `MUL_MAT_ID`: `806/806`.
|
||||
- New `MOE_SWIGLU_DOWN`: `7/7`.
|
||||
|
||||
The new gate covers the merged MoE gate_up -> SWIGLU -> down-projection graph
|
||||
shape needed before attempting a batched NVFP4 down-input quantization fusion.
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
From cd56cf037379b084d6bb0ed47db8b785c828be86 Mon Sep 17 00:00:00 2001
|
||||
From: Ettore Di Giacinto <mudler@localai.io>
|
||||
Date: Tue, 30 Jun 2026 23:18:38 +0000
|
||||
Subject: [PATCH] test(paged): cover MoE swiglu down chain
|
||||
|
||||
---
|
||||
tests/test-backend-ops.cpp | 92 ++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 92 insertions(+)
|
||||
|
||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
||||
index 817069860..aeca64802 100644
|
||||
--- a/tests/test-backend-ops.cpp
|
||||
+++ b/tests/test-backend-ops.cpp
|
||||
@@ -4447,6 +4447,91 @@ struct test_mul_mat_id_fusion : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
+// Merged MoE gate_up -> SWIGLU -> down MUL_MAT_ID chain.
|
||||
+struct test_moe_swiglu_down : public test_case {
|
||||
+ const ggml_type type_a;
|
||||
+ const int n_mats;
|
||||
+ const int n_used;
|
||||
+ const int64_t n_ff;
|
||||
+ const int64_t n_tokens;
|
||||
+ const int64_t n_embd;
|
||||
+
|
||||
+ std::string vars() override {
|
||||
+ return VARS_TO_STR6(type_a, n_mats, n_used, n_ff, n_tokens, n_embd);
|
||||
+ }
|
||||
+
|
||||
+ double max_nmse_err() override {
|
||||
+ return 5e-4;
|
||||
+ }
|
||||
+
|
||||
+ double max_nmse_err(ggml_backend_t backend) override {
|
||||
+ if ((type_a == GGML_TYPE_MXFP4 || type_a == GGML_TYPE_NVFP4) && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
|
||||
+ // This whole-graph gate compounds two native-FP4 MUL_MAT_ID ops with
|
||||
+ // SWIGLU between them, so it needs slightly more room than the
|
||||
+ // single-op FP4 MUL_MAT_ID gate.
|
||||
+ return 2.5e-2;
|
||||
+ }
|
||||
+ return max_nmse_err();
|
||||
+ }
|
||||
+
|
||||
+ uint64_t op_flops(ggml_tensor * t) override {
|
||||
+ GGML_UNUSED(t);
|
||||
+ return 2 * n_ff * n_embd * n_tokens * n_used * 3;
|
||||
+ }
|
||||
+
|
||||
+ test_moe_swiglu_down(ggml_type type_a = GGML_TYPE_F32, int n_mats = 128, int n_used = 8,
|
||||
+ int64_t n_ff = 768, int64_t n_tokens = 128, int64_t n_embd = 2048)
|
||||
+ : type_a(type_a), n_mats(n_mats), n_used(n_used), n_ff(n_ff), n_tokens(n_tokens), n_embd(n_embd) {
|
||||
+ GGML_ASSERT(n_used <= n_mats);
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
+ ggml_tensor * gate_up = ggml_new_tensor_3d(ctx, type_a, n_embd, 2 * n_ff, n_mats);
|
||||
+ ggml_set_name(gate_up, "gate_up");
|
||||
+
|
||||
+ ggml_tensor * down = ggml_new_tensor_3d(ctx, type_a, n_ff, n_embd, n_mats);
|
||||
+ ggml_set_name(down, "down");
|
||||
+
|
||||
+ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n_tokens);
|
||||
+ ggml_set_name(ids, "ids");
|
||||
+ if (n_used != n_mats) {
|
||||
+ ids = ggml_view_2d(ctx, ids, n_used, n_tokens, ids->nb[1], 0);
|
||||
+ ggml_set_name(ids, "view_of_ids");
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_used, n_tokens);
|
||||
+ ggml_set_name(cur, "cur");
|
||||
+
|
||||
+ ggml_tensor * gate_up_out = ggml_mul_mat_id(ctx, gate_up, cur, ids);
|
||||
+ ggml_set_name(gate_up_out, "gate_up_out");
|
||||
+
|
||||
+ ggml_tensor * gate = ggml_view_3d(ctx, gate_up_out, n_ff, n_used, n_tokens, gate_up_out->nb[1], gate_up_out->nb[2], 0);
|
||||
+ ggml_set_name(gate, "gate");
|
||||
+
|
||||
+ ggml_tensor * up = ggml_view_3d(ctx, gate_up_out, n_ff, n_used, n_tokens, gate_up_out->nb[1], gate_up_out->nb[2], n_ff * gate_up_out->nb[0]);
|
||||
+ ggml_set_name(up, "up");
|
||||
+
|
||||
+ ggml_tensor * act = ggml_swiglu_split(ctx, gate, up);
|
||||
+ ggml_set_name(act, "swiglu");
|
||||
+
|
||||
+ ggml_tensor * out = ggml_mul_mat_id(ctx, down, act, ids);
|
||||
+ ggml_set_name(out, "out");
|
||||
+
|
||||
+ return out;
|
||||
+ }
|
||||
+
|
||||
+ void initialize_tensors(ggml_context * ctx) override {
|
||||
+ init_mul_mat_id_tensors(ctx, n_mats);
|
||||
+ }
|
||||
+
|
||||
+ bool run_whole_graph() override { return true; }
|
||||
+
|
||||
+ std::string op_desc(ggml_tensor * t) override {
|
||||
+ GGML_UNUSED(t);
|
||||
+ return "MOE_SWIGLU_DOWN";
|
||||
+ }
|
||||
+};
|
||||
+
|
||||
// GGML_OP_OUT_PROD
|
||||
struct test_out_prod : public test_case {
|
||||
const ggml_type type_a;
|
||||
@@ -8759,6 +8844,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
|
||||
+ // [paged Phase 7] Merged MoE gate_up -> SWIGLU -> down projection gate for the
|
||||
+ // serving candidate that fuses SWIGLU into NVFP4 down-input quantization.
|
||||
+ test_cases.emplace_back(new test_moe_swiglu_down(GGML_TYPE_F32, 8, 2, 32, 8, 64));
|
||||
+ for (int n : {16, 33, 64, 128, 130, 200}) {
|
||||
+ test_cases.emplace_back(new test_moe_swiglu_down(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048));
|
||||
+ }
|
||||
+
|
||||
// [paged P0 / track B] NVFP4/MXFP4 dense decode-shape mmq_y-down bit-exact gate.
|
||||
// The dense FP4 weight GEMM is the track-B target; P1 lowers mmq_y (the weight-row tile) on the
|
||||
// NVFP4 decode path to raise resident-CTA occupancy. mmq_y is a pure N-row tiling knob, so a
|
||||
--
|
||||
2.43.0
|
||||
|
||||
Reference in New Issue
Block a user