@@ -13,17 +13,11 @@ from mlx.nn.layers.distributed import (
    shard_linear,
    sum_gradients,
)
from mlx_lm.models.deepseek_v3 import DeepseekV3MLP
from mlx_lm.models.deepseek_v3 import Model as DeepseekV3Model
from mlx_lm.models.deepseek_v32 import DeepseekV32MLP
from mlx_lm.models.deepseek_v32 import Model as DeepseekV32Model
from mlx_lm.models.glm4_moe import Model as Glm4MoeModel
from mlx_lm.models.glm4_moe import MoE
from mlx_lm.models.gpt_oss import GptOssMoeModel
from mlx_lm.models.gpt_oss import Model as GptOssModel
from mlx_lm.models.llama import Model as LlamaModel
from mlx_lm.models.minimax import Model as MiniMaxModel
from mlx_lm.models.ministral3 import Model as Ministral3Model
from mlx_lm.models.qwen3_moe import Model as Qwen3MoeModel
from mlx_lm.models.qwen3_moe import Qwen3MoeSparseMoeBlock
from mlx_lm.models.qwen3_next import Model as Qwen3NextModel
@@ -341,33 +335,7 @@ def tensor_auto_parallel(
    except (AttributeError, TypeError, NameError):
        pass

    if isinstance(model, (LlamaModel, Ministral3Model)):
        logger.warning("shouldn't be hit - upstream sharding exists")
        tensor_parallel_sharding_strategy = LlamaShardingStrategy(
            group,
            all_to_sharded_linear,
            sharded_to_all_linear,
            all_to_sharded_linear_in_place,
            sharded_to_all_linear_in_place,
        )
    elif isinstance(model, (DeepseekV3Model, DeepseekV32Model)):
        logger.warning("shouldn't be hit - upstream sharding exists")
        tensor_parallel_sharding_strategy = DeepSeekShardingStrategy(
            group,
            all_to_sharded_linear,
            sharded_to_all_linear,
            all_to_sharded_linear_in_place,
            sharded_to_all_linear_in_place,
        )
    elif isinstance(model, MiniMaxModel):
        tensor_parallel_sharding_strategy = MiniMaxShardingStrategy(
            group,
            all_to_sharded_linear,
            sharded_to_all_linear,
            all_to_sharded_linear_in_place,
            sharded_to_all_linear_in_place,
        )
    elif isinstance(model, (Qwen3MoeModel, Glm4MoeModel, Qwen3NextModel)):
        tensor_parallel_sharding_strategy = QwenShardingStrategy(
            group,
            all_to_sharded_linear,
@@ -375,15 +343,6 @@ def tensor_auto_parallel(
            all_to_sharded_linear_in_place,
            sharded_to_all_linear_in_place,
        )
    elif isinstance(model, GptOssModel):
        tensor_parallel_sharding_strategy = GptOssShardingStrategy(
            group,
            all_to_sharded_linear,
            sharded_to_all_linear,
            all_to_sharded_linear_in_place,
            sharded_to_all_linear_in_place,
        )

    else:
        raise ValueError(f"Unsupported model type: {type(model)}")
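    # Each branch above wires the same five collaborators (the communication
    # group plus the four shard-linear helpers) into an architecture-specific
    # strategy; only the strategy class changes per model family. Unsupported
    # architectures fail fast here instead of falling through to a partially
    # sharded model.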
@@ -418,34 +377,6 @@ class TensorParallelShardingStrategy(ABC):
    ) -> nn.Module: ...


class LlamaShardingStrategy(TensorParallelShardingStrategy):
    def shard_model(
        self,
        model: nn.Module,
        timeout_seconds: float,
        on_timeout: TimeoutCallback | None,
    ) -> nn.Module:
        model = cast(LlamaModel, model)
        for layer in model.layers:
            # Force load weights before sharding to avoid FAST_SYNCH deadlock
            eval_with_timeout(
                layer.parameters(), timeout_seconds / len(model.layers), on_timeout
            )
            layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
            layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
            layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
            layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
            layer.self_attn.n_heads //= self.N
            if layer.self_attn.n_kv_heads is not None:
                layer.self_attn.n_kv_heads //= self.N

            layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
            layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
            layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)

        return model
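
# The projections follow the standard tensor-parallel split: q/k/v and
# gate/up are sharded across output features (all_to_sharded), so each rank
# serves a 1/N slice of the heads and hidden units, while o_proj and
# down_proj are sharded across input features (sharded_to_all), which reduces
# the partial results back into full-width activations. Dividing the head
# counts by N keeps the attention reshapes consistent with the narrower
# projections; with N=2 and 32 query heads, for example, each rank is left
# with 16 heads and half-width q/k/v outputs.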


def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
    inner_model_instance = _inner_model(model)
    if hasattr(inner_model_instance, "layers"):
@@ -472,105 +403,6 @@ def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
        raise ValueError("Model must have either a 'layers' or 'h' attribute")


class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
    def shard_model(
        self,
        model: nn.Module,
        timeout_seconds: float,
        on_timeout: TimeoutCallback | None,
    ) -> nn.Module:
        model = cast(DeepseekV3Model, model)
        for layer in model.layers:
            eval_with_timeout(
                layer.parameters(), timeout_seconds / len(model.layers), on_timeout
            )
            # Shard the self attention
            if layer.self_attn.q_lora_rank is None:
                layer.self_attn.q_proj = self.all_to_sharded_linear(
                    layer.self_attn.q_proj
                )
            else:
                layer.self_attn.q_b_proj = self.all_to_sharded_linear(
                    layer.self_attn.q_b_proj
                )
            layer.self_attn.kv_b_proj = self.all_to_sharded_linear(
                layer.self_attn.kv_b_proj
            )
            layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
            layer.self_attn.num_heads //= self.N

            # Shard the MLP
            if isinstance(layer.mlp, (DeepseekV3MLP, DeepseekV32MLP)):
                layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
                layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
                layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)

            # Shard the MoE. Shard in place since the MoE should be responsible
            # for aggregating the results.
            else:
                self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.gate_proj)
                self.sharded_to_all_linear_in_place(layer.mlp.shared_experts.down_proj)
                self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.up_proj)
                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
                self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
                layer.mlp = ShardedDeepseekV3MoE(layer.mlp)  # type: ignore
                layer.mlp.sharding_group = self.group

        return model
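
# When q_lora_rank is set, DeepSeek attention factors the query projection
# into a low-rank pair (the MLA path), so only the up-projection q_b_proj is
# width-sharded; otherwise the plain q_proj is sharded. kv_b_proj and o_proj
# are treated the same way as in the dense attention strategies.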


class ShardedDeepseekV3MoE(CustomMlxLayer):
    def __init__(self, layer: _LayerCallable):
        super().__init__(layer)
        self.sharding_group: mx.distributed.Group | None = None

    def __call__(self, x: mx.array) -> mx.array:
        if self.sharding_group is not None:
            x = sum_gradients(self.sharding_group)(x)
        y = self.original_layer.__call__(x)
        if self.sharding_group is not None:
            y = mx.distributed.all_sum(y, group=self.sharding_group)
        return y
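
# The wrapper leaves the routed MoE untouched: every rank holds a 1/N slice
# of each expert (the in-place shards above), so its output is only a partial
# sum, and all_sum reduces those partials across the group. sum_gradients is,
# as the name suggests, the backward-pass counterpart, accumulating gradients
# across ranks while acting as an identity on the forward activations.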


class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
    def shard_model(
        self,
        model: nn.Module,
        timeout_seconds: float,
        on_timeout: TimeoutCallback | None,
    ) -> nn.Module:
        model = cast(MiniMaxModel, model)
        for layer in model.layers:
            eval_with_timeout(
                layer.parameters(), timeout_seconds / len(model.layers), on_timeout
            )
            # Shard the self attention
            layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
            layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
            layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
            layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
            layer.self_attn.num_attention_heads //= self.N
            layer.self_attn.num_key_value_heads //= self.N

            # Shard the MoE. Shard in place since the MoE should be responsible
            # for aggregating the results.
            self.all_to_sharded_linear_in_place(
                layer.block_sparse_moe.switch_mlp.gate_proj
            )
            self.sharded_to_all_linear_in_place(
                layer.block_sparse_moe.switch_mlp.down_proj
            )
            self.all_to_sharded_linear_in_place(
                layer.block_sparse_moe.switch_mlp.up_proj
            )
            layer.block_sparse_moe = ShardedQwenMoE(layer.block_sparse_moe) # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
            layer.block_sparse_moe.sharding_group = self.group # pyright: ignore[reportAttributeAccessIssue]

        return model
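
# MiniMax keeps its routed experts under block_sparse_moe.switch_mlp, which
# is close enough to the Qwen layout that ShardedQwenMoE is reused for the
# output all-reduce; the pyright ignores only paper over the nominal type
# difference.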


class QwenShardingStrategy(TensorParallelShardingStrategy):
    def shard_model(
        self,
@@ -623,58 +455,3 @@ class ShardedQwenMoE(CustomMlxLayer):
        if self.sharding_group is not None:
            y = mx.distributed.all_sum(y, group=self.sharding_group)
        return y


class GptOssShardingStrategy(TensorParallelShardingStrategy):
    def shard_model(
        self,
        model: nn.Module,
        timeout_seconds: float,
        on_timeout: TimeoutCallback | None,
    ) -> nn.Module:
        model = cast(GptOssMoeModel, model)

        for layer in model.layers:
            eval_with_timeout(
                layer.parameters(), timeout_seconds / len(model.layers), on_timeout
            )
            layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
            layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
            layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
            layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)

            layer.self_attn.num_attention_heads //= self.N
            layer.self_attn.num_key_value_heads //= self.N
            layer.self_attn.num_key_value_groups = (
                layer.self_attn.num_attention_heads
                // layer.self_attn.num_key_value_heads
            )

            layer.self_attn.sinks = layer.self_attn.sinks[
                layer.self_attn.num_attention_heads
                * self.group.rank() : layer.self_attn.num_attention_heads
                * (self.group.rank() + 1)
            ]

            self.all_to_sharded_linear_in_place(layer.mlp.experts.gate_proj)
            self.sharded_to_all_linear_in_place(layer.mlp.experts.down_proj)
            self.all_to_sharded_linear_in_place(layer.mlp.experts.up_proj)

            layer.mlp = ShardedGptOssMoE(layer.mlp) # type: ignore
            layer.mlp.sharding_group = self.group # pyright: ignore[reportAttributeAccessIssue]

        return model
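
# gpt-oss keeps one attention-sink logit per head, so after the head counts
# are divided by N the sinks vector is sliced to this rank's contiguous block
# of heads, and num_key_value_groups is recomputed so grouped-query attention
# still sees the right query-to-KV head ratio on each rank.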


class ShardedGptOssMoE(CustomMlxLayer):
    def __init__(self, layer: nn.Module):
        super().__init__(layer)
        self.sharding_group: mx.distributed.Group | None = None

    def __call__(self, x: mx.array) -> mx.array:
        if self.sharding_group is not None:
            x = sum_gradients(self.sharding_group)(x)
        y = self.original_layer(x)
        if self.sharding_group is not None:
            y = mx.distributed.all_sum(y, group=self.sharding_group)
        return y