diff --git a/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md b/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md index a8084b7f1..ea26edf29 100644 --- a/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md +++ b/backend/cpp/llama-cpp-localai-paged/docs/GB10_PARITY_PHASE0_RESULTS.md @@ -340,7 +340,7 @@ Performance: Result: -- Rejected. No fork commit and no LocalAI patch `0051`. +- Rejected. No fork commit and no LocalAI patch was created for that experiment. - The local fork experiment was reverted. - Do not ship Wq padding alone; the measured `+0.4%` / `+0.6%` default-shape gain is below the maintenance threshold. @@ -388,14 +388,14 @@ Second clean build attempt: - Branch: `localai-paged` - Working tree: clean after fork commit `d9b9be0bee3d7239132bfca05d5b057ff4ee4cc3` - Phase 0 HEAD: `51168c5eee2e35348d9006f0b2fab3dc6e7c01cc` -- Current HEAD: `d9b9be0bee3d7239132bfca05d5b057ff4ee4cc3` +- Current HEAD: `cd56cf037379b084d6bb0ed47db8b785c828be86` - Base pin: `0ed235ea2c17a19fc8238668653946721ed136fd` - Merge-base with base pin: `0ed235ea2c17a19fc8238668653946721ed136fd` -- LocalAI patch count: `38` at Phase 0; current mirror count is `41` after - patch `0050`. +- LocalAI patch count: `38` at Phase 0; current mirror count is `42` after + patch `0051`. - LocalAI patch mirror: applies cleanly to the base pin and tree-matches fork HEAD. -- Tree hash after patch application: `8fcb151e0620fd0fc82b80c04318e5c34320b087` +- Tree hash after patch application: `623b7cb008a929455ca3d9deae35494c02622fef` ## Existing Artifact Gap Review @@ -562,3 +562,19 @@ Result: input/sampler uploads, with a workload that proves the target bucket matters. - Phase 7 must keep the canonical MoE and dense md5 gates as the first inference-safety check before any performance result is accepted. 
+ +## Phase 7 Source-Candidate Test Gate + +Fork commit `cd56cf037379b084d6bb0ed47db8b785c828be86` is mirrored as LocalAI patch +`0051-test-paged-cover-MoE-swiglu-down-chain.patch`. This is a test-only patch; +it does not change the production inference path. + +Fresh DGX gate results (artifacts in `/home/mudler/bench/phase7_source_scope/`): + +- MoE greedy md5: `8cb0ce23777bf55f92f63d0292c756b0`. +- Dense greedy md5: `5951a5b4d624ce891e22ab5fca9bc439`. +- Baseline `MUL_MAT_ID`: `806/806`. +- New `MOE_SWIGLU_DOWN`: `7/7`. + +The new gate covers the merged MoE gate_up -> SWIGLU -> down-projection graph +shape needed before attempting a batched NVFP4 down-input quantization fusion. diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0051-test-paged-cover-MoE-swiglu-down-chain.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0051-test-paged-cover-MoE-swiglu-down-chain.patch new file mode 100644 index 000000000..469213a50 --- /dev/null +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0051-test-paged-cover-MoE-swiglu-down-chain.patch @@ -0,0 +1,122 @@ +From cd56cf037379b084d6bb0ed47db8b785c828be86 Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Tue, 30 Jun 2026 23:18:38 +0000 +Subject: [PATCH] test(paged): cover MoE swiglu down chain + +--- + tests/test-backend-ops.cpp | 92 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 92 insertions(+) + +diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp +index 817069860..aeca64802 100644 +--- a/tests/test-backend-ops.cpp ++++ b/tests/test-backend-ops.cpp +@@ -4447,6 +4447,91 @@ struct test_mul_mat_id_fusion : public test_case { + } + }; + ++// Merged MoE gate_up -> SWIGLU -> down MUL_MAT_ID chain.
++struct test_moe_swiglu_down : public test_case { ++ const ggml_type type_a; ++ const int n_mats; ++ const int n_used; ++ const int64_t n_ff; ++ const int64_t n_tokens; ++ const int64_t n_embd; ++ ++ std::string vars() override { ++ return VARS_TO_STR6(type_a, n_mats, n_used, n_ff, n_tokens, n_embd); ++ } ++ ++ double max_nmse_err() override { ++ return 5e-4; ++ } ++ ++ double max_nmse_err(ggml_backend_t backend) override { ++ if ((type_a == GGML_TYPE_MXFP4 || type_a == GGML_TYPE_NVFP4) && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) { ++ // This whole-graph gate compounds two native-FP4 MUL_MAT_ID ops with ++ // SWIGLU between them, so it needs slightly more room than the ++ // single-op FP4 MUL_MAT_ID gate. ++ return 2.5e-2; ++ } ++ return max_nmse_err(); ++ } ++ ++ uint64_t op_flops(ggml_tensor * t) override { ++ GGML_UNUSED(t); ++ return 2 * n_ff * n_embd * n_tokens * n_used * 3; ++ } ++ ++ test_moe_swiglu_down(ggml_type type_a = GGML_TYPE_F32, int n_mats = 128, int n_used = 8, ++ int64_t n_ff = 768, int64_t n_tokens = 128, int64_t n_embd = 2048) ++ : type_a(type_a), n_mats(n_mats), n_used(n_used), n_ff(n_ff), n_tokens(n_tokens), n_embd(n_embd) { ++ GGML_ASSERT(n_used <= n_mats); ++ } ++ ++ ggml_tensor * build_graph(ggml_context * ctx) override { ++ ggml_tensor * gate_up = ggml_new_tensor_3d(ctx, type_a, n_embd, 2 * n_ff, n_mats); ++ ggml_set_name(gate_up, "gate_up"); ++ ++ ggml_tensor * down = ggml_new_tensor_3d(ctx, type_a, n_ff, n_embd, n_mats); ++ ggml_set_name(down, "down"); ++ ++ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n_tokens); ++ ggml_set_name(ids, "ids"); ++ if (n_used != n_mats) { ++ ids = ggml_view_2d(ctx, ids, n_used, n_tokens, ids->nb[1], 0); ++ ggml_set_name(ids, "view_of_ids"); ++ } ++ ++ ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_used, n_tokens); ++ ggml_set_name(cur, "cur"); ++ ++ ggml_tensor * gate_up_out = ggml_mul_mat_id(ctx, gate_up, cur, ids); ++ ggml_set_name(gate_up_out, 
"gate_up_out"); ++ ++ ggml_tensor * gate = ggml_view_3d(ctx, gate_up_out, n_ff, n_used, n_tokens, gate_up_out->nb[1], gate_up_out->nb[2], 0); ++ ggml_set_name(gate, "gate"); ++ ++ ggml_tensor * up = ggml_view_3d(ctx, gate_up_out, n_ff, n_used, n_tokens, gate_up_out->nb[1], gate_up_out->nb[2], n_ff * gate_up_out->nb[0]); ++ ggml_set_name(up, "up"); ++ ++ ggml_tensor * act = ggml_swiglu_split(ctx, gate, up); ++ ggml_set_name(act, "swiglu"); ++ ++ ggml_tensor * out = ggml_mul_mat_id(ctx, down, act, ids); ++ ggml_set_name(out, "out"); ++ ++ return out; ++ } ++ ++ void initialize_tensors(ggml_context * ctx) override { ++ init_mul_mat_id_tensors(ctx, n_mats); ++ } ++ ++ bool run_whole_graph() override { return true; } ++ ++ std::string op_desc(ggml_tensor * t) override { ++ GGML_UNUSED(t); ++ return "MOE_SWIGLU_DOWN"; ++ } ++}; ++ + // GGML_OP_OUT_PROD + struct test_out_prod : public test_case { + const ggml_type type_a; +@@ -8759,6 +8844,13 @@ static std::vector> make_test_cases_eval() { + } + } + ++ // [paged Phase 7] Merged MoE gate_up -> SWIGLU -> down projection gate for the ++ // serving candidate that fuses SWIGLU into NVFP4 down-input quantization. ++ test_cases.emplace_back(new test_moe_swiglu_down(GGML_TYPE_F32, 8, 2, 32, 8, 64)); ++ for (int n : {16, 33, 64, 128, 130, 200}) { ++ test_cases.emplace_back(new test_moe_swiglu_down(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048)); ++ } ++ + // [paged P0 / track B] NVFP4/MXFP4 dense decode-shape mmq_y-down bit-exact gate. + // The dense FP4 weight GEMM is the track-B target; P1 lowers mmq_y (the weight-row tile) on the + // NVFP4 decode path to raise resident-CTA occupancy. 
mmq_y is a pure N-row tiling knob, so a +-- +2.43.0 + diff --git a/docs/superpowers/plans/2026-06-30-serving-source-phase7.md b/docs/superpowers/plans/2026-06-30-serving-source-phase7.md index 26dfeea54..2d0845133 100644 --- a/docs/superpowers/plans/2026-06-30-serving-source-phase7.md +++ b/docs/superpowers/plans/2026-06-30-serving-source-phase7.md @@ -1,6 +1,6 @@ # Phase 7: Serving Source Candidate Scope -**Status:** Scoped. Code implementation not started. +**Status:** Test-gate patch landed. Production CUDA fusion not started. **Goal:** Select one maintainable source candidate for the remaining GB10 MoE serving gap, then implement only if it can be gated for inference correctness and @@ -130,9 +130,18 @@ to implementation when all are true: - [x] Pick Track A or Track B from concrete code evidence. - Primary: Track A, batched MoE SWIGLU -> NVFP4 down-input quantization. - Secondary: Track B, backend logit-bias upload cache for non-greedy workloads. -- [ ] Run baseline gates from the clean candidate build. -- [ ] Implement one fork-first incremental patch. -- [ ] Run md5/op gates before serving A/B. +- [x] Run baseline gates from the clean candidate build. + - Artifact: `/home/mudler/bench/phase7_source_scope/`. + - MoE md5: `8cb0ce23777bf55f92f63d0292c756b0`. + - Dense md5: `5951a5b4d624ce891e22ab5fca9bc439`. + - Baseline `MUL_MAT_ID`: `806/806`. +- [x] Implement one fork-first incremental patch. + - Fork commit: `cd56cf037` (`test(paged): cover MoE swiglu down chain`). + - LocalAI patch: `0051-test-paged-cover-MoE-swiglu-down-chain.patch`. + - Scope: test gate only; no production inference path changed. +- [x] Run md5/op gates before serving A/B. + - `MOE_SWIGLU_DOWN`: `7/7` on CUDA0. + - Serving A/B is not applicable to this test-only patch. - [ ] Keep only if the serving bucket and h2h result improve materially. - [ ] Regenerate LocalAI patch stack and update docs if kept. 
@@ -141,14 +150,39 @@ to implementation when all are true: - Add or extend a whole-graph op test for the batched MoE gate_up/SWIGLU/down chain. Shapes must include `type_a=NVFP4`, `n_mats=128`, `n_used=8`, `m=768`, `k=2048`, and `n in {16, 33, 64, 128, 130, 200}`. + - Done in fork commit `cd56cf037`. - Run `test-backend-ops test -b CUDA0 -o MUL_MAT_ID -j 1` and require `806/806` until a more specific op name is available. + - Baseline done before the test-gate patch. - Run canonical MoE and dense greedy md5 gates before serving A/B: - MoE `8cb0ce23777bf55f92f63d0292c756b0`. - Dense `5951a5b4d624ce891e22ab5fca9bc439`. + - Baseline done before the test-gate patch. - Run a mixed prompt/decode md5 gate (`ptok=512`, `gen=32`) because graph reuse can hide bugs that a decode-only gate misses. +## Patch 0051 Result + +Patch `0051` adds a whole-graph test named `MOE_SWIGLU_DOWN`. It covers the +merged MoE gate_up -> SWIGLU -> down projection chain and includes: + +- one small F32 wiring case, +- NVFP4 Qwen-style cases with `n_mats=128`, `n_used=8`, `n_ff=768`, + `n_embd=2048`, and `n_tokens in {16, 33, 64, 128, 130, 200}`. + +The first run used the inherited single-FP4-op tolerance (`2e-2`) and failed +consistently at roughly `0.0213-0.0218` NMSE. Root cause: this whole-graph gate +compounds the error of two native-FP4 `MUL_MAT_ID` ops with SWIGLU between them. +The test therefore uses `2.5e-2` on Blackwell native-FP4 backends and keeps the +F32 wiring case at the stricter default tolerance (`5e-4`). + +DGX result after the adjustment: + +- `test-backend-ops test -b CUDA0 -o MOE_SWIGLU_DOWN -j 1`: `7/7`. +- Patch mirror applies cleanly to base pin `0ed235ea2c17a19fc8238668653946721ed136fd` + and tree-matches fork head `cd56cf037`. +- Mirrored tree hash: `623b7cb008a929455ca3d9deae35494c02622fef`. + ## Required Tests Before Track B Source Patch - Establish fixed-seed baseline output md5 and token-id parity for a