Compare commits

...

2 Commits

Author: dmcc73
SHA1:   6018a9c97c
Date:   2026-02-02 19:16:43 +00:00

    Point mlx-lm to davidmcc73 fork with context parallelism support

Author: dmcc73
SHA1:   07b8405d3e
Date:   2026-02-02 18:35:20 +00:00

    Add context parallelism support to DeepSeek sharding

    Store pre-shard head count and distributed group on each attention
    layer during sharding, enabling automatic TP→CP switching at runtime
    when context length exceeds a threshold.

    Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2 changed files with 8 additions and 0 deletions
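The second commit's message summarizes the mechanism: each attention layer keeps its pre-shard head count and a handle to the distributed group, so the runtime can switch from tensor parallelism (TP) to context parallelism (CP) once the context length passes a threshold. Below is a minimal sketch of how such a switch could be driven from those attributes; the threshold constant and the choose_parallelism helper are assumptions for illustration, not code from the mlx-lm fork.

# Illustrative sketch only, not code from the davidmcc73/mlx-lm fork.
# The threshold value and helper name are assumptions.
CP_CONTEXT_THRESHOLD = 8192  # assumed switch-over point, not from the diff

def choose_parallelism(attn, context_length: int) -> str:
    """Pick TP or CP for one attention layer using the metadata
    attached by the sharding strategy in this change."""
    group = getattr(attn, "_cp_group", None)
    total_heads = getattr(attn, "context_parallel_total_heads", None)

    # Without a distributed group or the pre-shard head count there is
    # nothing to switch to, so stay with plain tensor parallelism.
    if group is None or total_heads is None:
        return "tp"

    # Long contexts: run attention with all heads over this rank's slice of
    # the sequence (CP); short contexts keep the per-rank head split (TP).
    return "cp" if context_length > CP_CONTEXT_THRESHOLD else "tp"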


@@ -71,6 +71,7 @@ exo_pyo3_bindings = { workspace = true }
 # Uncomment to use local mlx/mlx-lm development versions:
 # mlx = { path = "/Users/Shared/mlx", editable=true }
 # mlx-lm = { path = "/Users/Shared/mlx-lm", editable=true }
+mlx-lm = { git = "https://github.com/davidmcc73/mlx-lm.git", branch = "main" }
 
 [build-system]
 requires = ["uv_build>=0.8.9,<0.9.0"]


@@ -520,6 +520,9 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
                 layer.self_attn.kv_b_proj
             )
             layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
+            # Store pre-shard head count and group for context parallelism
+            layer.self_attn.context_parallel_total_heads = layer.self_attn.num_heads
+            layer.self_attn._cp_group = self.group
             layer.self_attn.num_heads //= self.N
             # Shard the MLP
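
The bookkeeping in the hunk above is small: num_heads is divided across the N tensor-parallel ranks, while context_parallel_total_heads keeps the original value so CP mode can still see the full head count. A toy illustration with example numbers (not taken from any particular DeepSeek configuration):

# Toy numbers for illustration only.
class Attn:
    pass

attn = Attn()
attn.num_heads = 128   # pre-shard head count (example value)
N = 4                  # number of tensor-parallel ranks (example value)

attn.context_parallel_total_heads = attn.num_heads  # remember the full count
attn.num_heads //= N                                # per-rank heads for TP

assert attn.num_heads == 32
assert attn.context_parallel_total_heads == 128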
@@ -542,6 +545,10 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
             mx.eval(layer)
+        # Store group for context parallelism
+        if hasattr(model, "model"):
+            model.model._cp_group = self.group
         return model
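
Attaching the group to model.model (guarded with hasattr in case the wrapper has no inner model attribute) lets later code recover it without passing the group through every call. A hypothetical lookup helper, with attribute names mirroring the diff:

# Illustrative helper, not part of the diff; attribute names mirror the change.
def get_cp_group(model):
    inner = getattr(model, "model", model)    # unwrap if there is an inner model
    return getattr(inner, "_cp_group", None)  # None when sharding never stored a group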