bench: add multi-node M3 Ultra benchmark specs for 2, 3, and 4 nodes

The existing benchmark spec only covered single-node M3 Ultra configurations. Multi-node benchmarks are needed to test sharded inference across clusters of 2, 3, and 4 80-core M3 Ultra nodes connected via Thunderbolt 5 with RDMA. Added 2x-, 3x-, and 4x-m3-ultra.toml benchmark specs with all_to_all topology (min Thunderbolt version 5), an All(Rdma) constraint, min_nodes matching the host count, and skip_tensor_ring enabled. Models are tiered by per-node memory (>=96GiB and >=256GiB); the >=512GiB tier appears only in the 2-node spec and is commented out for now. Renamed single-m3-ultra.toml to 1x-m3-ultra.toml for consistent naming. Test plan: CI
2026-02-25 10:48:26 -05:00 · 2026-02-23 18:28:04 +00:00
5 changed files with 657 additions and 1 deletions
--- a/bench/single-m3-ultra.toml
+++ b/bench/1x-m3-ultra.toml
--- a/bench/2x-m3-ultra.toml
+++ b/bench/2x-m3-ultra.toml
@@ -0,0 +1,219 @@
+# 2-node M3 Ultra benchmarks (2 × 96 GiB = 192 GiB total, or 2 × 256 GiB = 512 GiB total)
+#
+# Shared constraints applied to ALL benchmarks in this file (each benchmark's extra_constraints sets its per-node memory tier).
+constraints = [
+    "All(MacOsBuild(=25D125))",  # exact-build match; NOTE(review): needs a bump whenever the hosts change macOS build
+    "Hosts(=2)",  # keep in sync with min_nodes under [defaults]
+    "All(Chip(m3_ultra))",
+    "All(GpuCores(=80))",  # 80-core M3 Ultra variant
+    "All(Rdma)",  # RDMA required on every host
+]
+
+[topology]
+type = "all_to_all"
+min_version = 5  # minimum Thunderbolt version
+
+# Default args merged into each benchmark's args (benchmark-level args win).
+[defaults]
+pp = [512, 2048, 8192, 16384]  # presumably prompt lengths in tokens — TODO confirm against the bench runner
+tg = 128  # presumably tokens generated per run — TODO confirm against the bench runner
+min_nodes = 2  # matches Hosts(=2) above
+skip_tensor_ring = true
+
+# ── 96 GiB per-node models (total storage < 192 GiB) ─────────────────────────
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/gpt-oss-120b-MXFP4-Q8"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-30B-A3B-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-0.6B-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-0.6B-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.2-1B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.2-3B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.2-3B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/gpt-oss-20b-MXFP4-Q8"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-30B-A3B-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-5bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.3-70B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-5bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.3-70B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/llama-3.3-70b-instruct-fp16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.5-Air-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/MiniMax-M2.1-3bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-bf16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Step-3.5-Flash-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Step-3.5-Flash-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+# ── 256 GiB per-node models (192 GiB ≤ total storage < 512 GiB) ──────────────
+
+[[benchmark]]
+model = "mlx-community/Step-3.5-Flash-8Bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.5-Air-bf16"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/MiniMax-M2.1-8bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-6bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-8bit-gs32"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/DeepSeek-V3.1-4bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+# ── 512 GiB per-node models (total storage ≥ 512 GiB) — disabled for now ─────
+
+# [[benchmark]]
+# model = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
+# extra_constraints = ["All(Memory(>=512GiB))"]
+
+# [[benchmark]]
+# model = "mlx-community/Kimi-K2-Instruct-4bit"
+# extra_constraints = ["All(Memory(>=512GiB))"]
+
+# [[benchmark]]
+# model = "mlx-community/Kimi-K2.5"
+# extra_constraints = ["All(Memory(>=512GiB))"]
+
+# [[benchmark]]
+# model = "mlx-community/Kimi-K2-Thinking"
+# extra_constraints = ["All(Memory(>=512GiB))"]
+
+# [[benchmark]]
+# model = "mlx-community/DeepSeek-V3.1-8bit"
+# extra_constraints = ["All(Memory(>=512GiB))"]
--- a/bench/3x-m3-ultra.toml
+++ b/bench/3x-m3-ultra.toml
@@ -0,0 +1,217 @@
+# 3-node M3 Ultra benchmarks (3 × 96 GiB = 288 GiB total, or 3 × 256 GiB = 768 GiB total)
+#
+# Shared constraints applied to ALL benchmarks in this file (each benchmark's extra_constraints sets its per-node memory tier).
+constraints = [
+    "All(MacOsBuild(=25D125))",  # exact-build match; NOTE(review): needs a bump whenever the hosts change macOS build
+    "Hosts(=3)",  # keep in sync with min_nodes under [defaults]
+    "All(Chip(m3_ultra))",
+    "All(GpuCores(=80))",  # 80-core M3 Ultra variant
+    "All(Rdma)",  # RDMA required on every host
+]
+
+[topology]
+type = "all_to_all"
+min_version = 5  # minimum Thunderbolt version
+
+# Default args merged into each benchmark's args (benchmark-level args win).
+[defaults]
+pp = [512, 2048, 8192, 16384]  # presumably prompt lengths in tokens — TODO confirm against the bench runner
+tg = 128  # presumably tokens generated per run — TODO confirm against the bench runner
+min_nodes = 3  # matches Hosts(=3) above
+skip_tensor_ring = true
+
+# ── 96 GiB per-node models (total storage < 288 GiB) ─────────────────────────
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/gpt-oss-120b-MXFP4-Q8"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-30B-A3B-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-0.6B-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-0.6B-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.2-1B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.2-3B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.2-3B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/gpt-oss-20b-MXFP4-Q8"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-30B-A3B-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-5bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.3-70B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-5bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.3-70B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/llama-3.3-70b-instruct-fp16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.5-Air-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/MiniMax-M2.1-3bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-bf16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Step-3.5-Flash-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Step-3.5-Flash-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Step-3.5-Flash-8Bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.5-Air-bf16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/MiniMax-M2.1-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+# ── 256 GiB per-node models (288 GiB ≤ total storage < 768 GiB) ──────────────
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-8bit-gs32"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/DeepSeek-V3.1-4bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Kimi-K2-Instruct-4bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Kimi-K2.5"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Kimi-K2-Thinking"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/DeepSeek-V3.1-8bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
--- a/bench/4x-m3-ultra.toml
+++ b/bench/4x-m3-ultra.toml
@@ -0,0 +1,217 @@
+# 4-node M3 Ultra benchmarks (4 × 96 GiB = 384 GiB total, or 4 × 256 GiB = 1024 GiB total)
+#
+# Shared constraints applied to ALL benchmarks in this file (each benchmark's extra_constraints sets its per-node memory tier).
+constraints = [
+    "All(MacOsBuild(=25D125))",  # exact-build match; NOTE(review): needs a bump whenever the hosts change macOS build
+    "Hosts(=4)",  # keep in sync with min_nodes under [defaults]
+    "All(Chip(m3_ultra))",
+    "All(GpuCores(=80))",  # 80-core M3 Ultra variant
+    "All(Rdma)",  # RDMA required on every host
+]
+
+[topology]
+type = "all_to_all"
+min_version = 5  # minimum Thunderbolt version
+
+# Default args merged into each benchmark's args (benchmark-level args win).
+[defaults]
+pp = [512, 2048, 8192, 16384]  # presumably prompt lengths in tokens — TODO confirm against the bench runner
+tg = 128  # presumably tokens generated per run — TODO confirm against the bench runner
+min_nodes = 4  # matches Hosts(=4) above
+skip_tensor_ring = true
+
+# ── 96 GiB per-node models (total storage < 384 GiB) ─────────────────────────
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/gpt-oss-120b-MXFP4-Q8"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-30B-A3B-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-0.6B-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-0.6B-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.2-1B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.2-3B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.2-3B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/gpt-oss-20b-MXFP4-Q8"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-30B-A3B-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-5bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-Flash-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.3-70B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-5bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Llama-3.3-70B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/llama-3.3-70b-instruct-fp16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.5-Air-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/MiniMax-M2.1-3bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-Next-bf16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Step-3.5-Flash-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Step-3.5-Flash-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Step-3.5-Flash-8Bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.5-Air-bf16"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/MiniMax-M2.1-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-6bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/GLM-4.7-8bit-gs32"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+[[benchmark]]
+model = "mlx-community/DeepSeek-V3.1-4bit"
+extra_constraints = ["All(Memory(>=96GiB))"]
+
+# ── 256 GiB per-node models (384 GiB ≤ total storage < 1024 GiB) ─────────────
+
+[[benchmark]]
+model = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Kimi-K2-Instruct-4bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Kimi-K2.5"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/Kimi-K2-Thinking"
+extra_constraints = ["All(Memory(>=256GiB))"]
+
+[[benchmark]]
+model = "mlx-community/DeepSeek-V3.1-8bit"
+extra_constraints = ["All(Memory(>=256GiB))"]
--- a/bench/bench.toml
+++ b/bench/bench.toml
@@ -3,5 +3,8 @@
 # Lists the suite files to include. Each file defines benchmarks
 # with shared constraints, topology, and default args.
 include = [
-    "single-m3-ultra.toml",
+    "1x-m3-ultra.toml",
+    "2x-m3-ultra.toml",
+    "3x-m3-ultra.toml",
+    "4x-m3-ultra.toml",
 ]