Add fixes

foo
Remove dead local paths code from download_shard (#1227)
2026-01-20 20:10:10 -05:00 · 2026-01-20 17:13:02 +00:00 · 2026-01-20 17:12:31 +00:00 · 2026-01-20 17:07:27 +00:00 · 2026-01-20 16:57:05 +00:00 · 2026-01-20 15:03:46 +00:00
11 changed files with 142 additions and 189 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,8 +17,8 @@ dependencies = [
    "loguru>=0.7.3",
    "exo_pyo3_bindings", # rust bindings
    "anyio==4.11.0",
-    "mlx; sys_platform == 'darwin'",
-    "mlx[cpu]; sys_platform == 'linux'",
+    "mlx==0.30.3; sys_platform == 'darwin'",
+    "mlx[cpu]==0.30.3; sys_platform == 'linux'",
    "mlx-lm @ git+https://github.com/AlexCheema/mlx-lm.git@fix-transformers-5.0.0rc2",
    "tiktoken>=0.12.0", # required for kimi k2 tokenizer
    "hypercorn>=0.18.0",
@@ -59,7 +59,6 @@ members = [

 [tool.uv.sources]
 exo_pyo3_bindings = { workspace = true }
-mlx = { git = "https://github.com/ml-explore/mlx.git", branch = "main" }
 # Uncomment to use local mlx/mlx-lm development versions:
 # mlx = { path = "/Users/Shared/mlx", editable=true }
 # mlx-lm = { path = "/Users/Shared/mlx-lm", editable=true }
--- a/src/exo/master/tests/test_placement.py
+++ b/src/exo/master/tests/test_placement.py
@@ -276,9 +276,7 @@ def test_placement_selects_leaf_nodes(
    # arrange
    topology = Topology()

-    # Model requires more than any single node but fits within a 3-node cycle
-    model_card.storage_size.in_bytes = 1500
-    model_card.n_layers = 12
+    model_card.storage_size = Memory.from_bytes(1000)

    node_id_a = NodeId()
    node_id_b = NodeId()
--- a/src/exo/worker/download/download_utils.py
+++ b/src/exo/worker/download/download_utils.py
@@ -477,53 +477,6 @@ async def get_downloaded_size(path: Path) -> int:
    return 0


-async def download_progress_for_local_path(
-    repo_id: str, shard: ShardMetadata, local_path: Path
-) -> RepoDownloadProgress:
-    file_progress: dict[str, RepoFileDownloadProgress] = {}
-    total_files = 0
-    total_bytes = 0
-
-    if await aios.path.isdir(local_path):
-        for root, _, files in os.walk(local_path):
-            for f in files:
-                if f.endswith((".safetensors", ".bin", ".pt", ".gguf", ".json")):
-                    file_path = Path(root) / f
-                    size = (await aios.stat(file_path)).st_size
-                    rel_path = str(file_path.relative_to(local_path))
-                    file_progress[rel_path] = RepoFileDownloadProgress(
-                        repo_id=repo_id,
-                        repo_revision="local",
-                        file_path=rel_path,
-                        downloaded=Memory.from_bytes(size),
-                        downloaded_this_session=Memory.from_bytes(0),
-                        total=Memory.from_bytes(size),
-                        speed=0,
-                        eta=timedelta(0),
-                        status="complete",
-                        start_time=time.time(),
-                    )
-                    total_files += 1
-                    total_bytes += size
-    else:
-        raise ValueError(f"Local path {local_path} is not a directory")
-
-    return RepoDownloadProgress(
-        repo_id=repo_id,
-        repo_revision="local",
-        shard=shard,
-        completed_files=total_files,
-        total_files=total_files,
-        downloaded_bytes=Memory.from_bytes(total_bytes),
-        downloaded_bytes_this_session=Memory.from_bytes(0),
-        total_bytes=Memory.from_bytes(total_bytes),
-        overall_speed=0,
-        overall_eta=timedelta(0),
-        status="complete",
-        file_progress=file_progress,
-    )
-
-
 async def download_shard(
    shard: ShardMetadata,
    on_progress: Callable[[ShardMetadata, RepoDownloadProgress], Awaitable[None]],
@@ -534,14 +487,6 @@ async def download_shard(
    if not skip_download:
        logger.info(f"Downloading {shard.model_card.model_id=}")

-    # Handle local paths
-    if await aios.path.exists(str(shard.model_card.model_id)):
-        logger.info(f"Using local model path {shard.model_card.model_id}")
-        local_path = Path(str(shard.model_card.model_id))
-        return local_path, await download_progress_for_local_path(
-            str(shard.model_card.model_id), shard, local_path
-        )
-
    revision = "main"
    target_dir = await ensure_models_dir() / str(shard.model_card.model_id).replace(
        "/", "--"
@@ -552,7 +497,8 @@ async def download_shard(
    if not allow_patterns:
        allow_patterns = await resolve_allow_patterns(shard)

-    logger.info(f"Downloading {shard.model_card.model_id=} with {allow_patterns=}")
+    if not skip_download:
+        logger.info(f"Downloading {shard.model_card.model_id=} with {allow_patterns=}")

    all_start_time = time.time()
    # TODO: currently not recursive. Some models might require subdirectories - thus this will need to be changed.
--- a/src/exo/worker/engines/mlx/auto_parallel.py
+++ b/src/exo/worker/engines/mlx/auto_parallel.py
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
 from collections.abc import Callable
 from functools import partial
 from inspect import signature
-from typing import TYPE_CHECKING, Any, Protocol, cast
+from typing import TYPE_CHECKING, Any, cast

 import mlx.core as mx
 import mlx.nn as nn
@@ -67,27 +67,16 @@ def eval_with_timeout(
        completed.set()


-class _LayerCallable(Protocol):
-    """Structural type that any compatible layer must satisfy.
-
-    We require a single positional input of type ``mx.array`` and an
-    ``mx.array`` output, while permitting arbitrary *args / **kwargs so this
-    protocol matches the vast majority of `mlx.nn.Module` subclasses.
-    """
-
-    def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array: ...
-
-
 class CustomMlxLayer(nn.Module):
    """Base class for replacing an MLX layer with a custom implementation."""

-    def __init__(self, original_layer: _LayerCallable):
+    def __init__(self, original_layer: nn.Module):
        super().__init__()
        object.__setattr__(self, "_original_layer", original_layer)

    @property
-    def original_layer(self) -> _LayerCallable:
-        return cast(_LayerCallable, object.__getattribute__(self, "_original_layer"))
+    def original_layer(self) -> nn.Module:
+        return cast(nn.Module, object.__getattribute__(self, "_original_layer"))

    # Calls __getattr__ for any attributes not found on nn.Module (e.g. use_sliding)
    if not TYPE_CHECKING:
@@ -100,52 +89,53 @@ class CustomMlxLayer(nn.Module):
                return getattr(original_layer, name)


-class PipelineFirstLayer(CustomMlxLayer):
-    def __init__(
-        self,
-        original_layer: _LayerCallable,
-        r: int,
-        group: mx.distributed.Group,
-    ):
-        super().__init__(original_layer)
-        self.r: int = r
-        self.group = group
+def patch_pipeline_first_layer(
+    pipeline_layer: nn.Module, group: mx.distributed.Group
+) -> nn.Module:
+    cls = type(pipeline_layer)
+    orig_call = cast(Callable[..., mx.array], cls.__call__)

-    def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array:
-        if self.r != 0:
-            x = mx.distributed.recv_like(x, (self.r - 1), group=self.group)
-        return self.original_layer(x, *args, **kwargs)
+    rank = group.rank()
+
+    class PatchedFirstLayer(cls):
+        def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array:
+            if rank != 0:
+                x = mx.distributed.recv_like(x, (rank - 1), group=group)
+            return orig_call(self, x, *args, **kwargs)
+
+    pipeline_layer.__class__ = PatchedFirstLayer
+
+    return pipeline_layer


-class PipelineLastLayer(CustomMlxLayer):
-    def __init__(
-        self,
-        original_layer: _LayerCallable,
-        r: int,
-        s: int,
-        group: mx.distributed.Group,
-    ):
-        super().__init__(original_layer)
-        self.r: int = r
-        self.s: int = s
-        self.group = group
-        self.original_layer_signature = signature(self.original_layer.__call__)
+def patch_pipeline_last_layer(
+    pipeline_layer: nn.Module, group: mx.distributed.Group
+) -> nn.Module:
+    cls = type(pipeline_layer)
+    orig_call = cast(Callable[..., mx.array], cls.__call__)
+    orig_call_sig = signature(orig_call)

-    def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array:
-        cache = self.original_layer_signature.bind_partial(
-            x, *args, **kwargs
-        ).arguments.get("cache", None)
+    rank = group.rank()
+    size = group.size()

-        output: mx.array = self.original_layer(x, *args, **kwargs)
-
-        if self.r != self.s - 1:
-            output = mx.distributed.send(
-                output, (self.r + 1) % self.s, group=self.group
+    class PatchedLastLayer(cls):
+        def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array:
+            cache = orig_call_sig.bind_partial(x, *args, **kwargs).arguments.get(
+                "cache", None
            )
-            if cache is not None:
-                cache.keys = mx.depends(cache.keys, output)  # type: ignore[reportUnknownMemberType]

-        return output
+            output: mx.array = orig_call(self, x, *args, **kwargs)
+
+            if rank != size - 1:
+                output = mx.distributed.send(output, (rank + 1) % size, group=group)
+                if cache is not None:
+                    cache.keys = mx.depends(cache.keys, output)  # type: ignore[reportUnknownMemberType]
+
+            return output
+
+    pipeline_layer.__class__ = PatchedLastLayer
+
+    return pipeline_layer


 def _inner_model(model: nn.Module) -> nn.Module:
@@ -160,13 +150,13 @@ def _inner_model(model: nn.Module) -> nn.Module:
    raise ValueError("Model must either have a 'model' or 'transformer' attribute")


-def _get_layers(inner_model_instance: nn.Module) -> list[_LayerCallable]:
+def _get_layers(inner_model_instance: nn.Module) -> list[nn.Module]:
    # Handle both model.layers and model.h cases
-    layers: list[_LayerCallable]
+    layers: list[nn.Module]
    if hasattr(inner_model_instance, "layers"):
-        layers = cast(list[_LayerCallable], inner_model_instance.layers)
+        layers = cast(list[nn.Module], inner_model_instance.layers)
    elif hasattr(inner_model_instance, "h"):
-        layers = cast(list[_LayerCallable], inner_model_instance.h)
+        layers = cast(list[nn.Module], inner_model_instance.h)
    else:
        raise ValueError("Model must have either a 'layers' or 'h' attribute")

@@ -191,16 +181,12 @@ def pipeline_auto_parallel(
    layers = _get_layers(inner_model_instance)

    start_layer, end_layer = model_shard_meta.start_layer, model_shard_meta.end_layer
-    device_rank, world_size = model_shard_meta.device_rank, model_shard_meta.world_size

    layers = layers[start_layer:end_layer]
-
-    layers[0] = PipelineFirstLayer(layers[0], device_rank, group=group)
-    layers[-1] = PipelineLastLayer(
+    layers[0] = patch_pipeline_first_layer(layers[0], group)
+    layers[-1] = patch_pipeline_last_layer(
        layers[-1],
-        device_rank,
-        world_size,
-        group=group,
+        group,
    )

    if isinstance(inner_model_instance, GptOssMoeModel):
@@ -230,10 +216,10 @@ def pipeline_auto_parallel(
        "Expected a list of layers after auto-parallel initialisation"
    )

-    return patch_distributed_model(model)
+    return patch_pipeline_model(model, group)


-def patch_distributed_model[T](model: T) -> T:
+def patch_pipeline_model[T](model: T, group: mx.distributed.Group) -> T:
    # Patch __call__ on the model's class
    cls = model.__class__
    original_call = cls.__call__  # type :ignore
@@ -253,6 +239,36 @@ def patch_distributed_model[T](model: T) -> T:
        if cache is not None:
            cache[-1].state = mx.depends(cache[-1].state, logits)  # type: ignore

+        logits = mx.distributed.all_gather(logits, group=group)[
+            -logits.shape[0] :
+        ]  # type :ignore
+
+        return logits
+
+    cls.__call__ = patched_call
+    return model
+
+
+def patch_tensor_model[T](model: T) -> T:
+    """Patch model's __call__ to ensure distributed ops sync during inference."""
+    cls = model.__class__
+    original_call = cls.__call__
+    call_signature = signature(original_call)
+
+    def patched_call(
+        self: T,
+        *args: object,
+        **kwargs: object,
+    ) -> mx.array:
+        logits: mx.array = original_call(self, *args, **kwargs)  # pyright: ignore[reportAny]
+        cache = call_signature.bind_partial(self, *args, **kwargs).arguments.get(
+            "cache", None
+        )
+
+        # Add dependency to last cache entry to ensure distributed ops are evaluated
+        if cache is not None and len(cache) > 0:  # pyright: ignore[reportAny]
+            cache[-1].state = mx.depends(cache[-1].state, logits)  # pyright: ignore[reportAny,reportUnknownMemberType]
+
        return logits

    cls.__call__ = patched_call
@@ -305,23 +321,14 @@ def tensor_auto_parallel(
        group=group,
    )

-    if isinstance(model, GptOssModel):
-        tensor_parallel_sharding_strategy = GptOssShardingStrategy(
-            group,
-            all_to_sharded_linear,
-            sharded_to_all_linear,
-            all_to_sharded_linear_in_place,
-            sharded_to_all_linear_in_place,
-        )
+    if hasattr(model, "shard"):
+        try:
+            model.shard(group)  # type: ignore
+            return patch_tensor_model(model)
+        except (AttributeError, TypeError, NameError):
+            pass

-    # elif hasattr(model, "shard"):
-    #     try:
-    #         model.shard(group)  # type: ignore
-    #         return model
-    #     except (AttributeError, TypeError, NameError):
-    #         pass
-
-    elif isinstance(model, (LlamaModel, Ministral3Model)):
+    if isinstance(model, (LlamaModel, Ministral3Model)):
        logger.warning("shouldn't be hit - upstream sharding exists")
        tensor_parallel_sharding_strategy = LlamaShardingStrategy(
            group,
@@ -355,13 +362,22 @@ def tensor_auto_parallel(
            all_to_sharded_linear_in_place,
            sharded_to_all_linear_in_place,
        )
+    elif isinstance(model, GptOssModel):
+        tensor_parallel_sharding_strategy = GptOssShardingStrategy(
+            group,
+            all_to_sharded_linear,
+            sharded_to_all_linear,
+            all_to_sharded_linear_in_place,
+            sharded_to_all_linear_in_place,
+        )
+
    else:
        raise ValueError(f"Unsupported model type: {type(model)}")

    model = tensor_parallel_sharding_strategy.shard_model(
        model, timeout_seconds, on_timeout
    )
-    return patch_distributed_model(model)
+    return patch_tensor_model(model)


 class TensorParallelShardingStrategy(ABC):
@@ -417,7 +433,7 @@ class LlamaShardingStrategy(TensorParallelShardingStrategy):
        return model


-def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
+def _set_layers(model: nn.Module, layers: list[nn.Module]) -> None:
    inner_model_instance = _inner_model(model)
    if hasattr(inner_model_instance, "layers"):
        inner_model_instance.layers = layers
@@ -492,17 +508,17 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):


 class ShardedDeepseekV3MoE(CustomMlxLayer):
-    def __init__(self, layer: _LayerCallable):
+    def __init__(self, layer: nn.Module):
        super().__init__(layer)
        self.sharding_group: mx.distributed.Group | None = None

    def __call__(self, x: mx.array) -> mx.array:
        if self.sharding_group is not None:
            x = sum_gradients(self.sharding_group)(x)
-        y = self.original_layer.__call__(x)
+        y = self.original_layer.__call__(x)  # type: ignore
        if self.sharding_group is not None:
-            y = mx.distributed.all_sum(y, group=self.sharding_group)
-        return y
+            y = mx.distributed.all_sum(y, group=self.sharding_group)  # type: ignore
+        return y  # type: ignore


 class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
@@ -536,7 +552,7 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
            self.all_to_sharded_linear_in_place(
                layer.block_sparse_moe.switch_mlp.up_proj
            )
-            layer.block_sparse_moe = ShardedQwenMoE(layer.block_sparse_moe)  # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
+            layer.block_sparse_moe = ShardedQwenMoE(layer.block_sparse_moe)  # pyright: ignore[reportAttributeAccessIssue]
            layer.block_sparse_moe.sharding_group = self.group  # pyright: ignore[reportAttributeAccessIssue]

        return model
@@ -570,7 +586,7 @@ class QwenShardingStrategy(TensorParallelShardingStrategy):
                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
                self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
-                layer.mlp = ShardedQwenMoE(layer.mlp)  # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
+                layer.mlp = ShardedQwenMoE(layer.mlp)  # pyright: ignore[reportAttributeAccessIssue]
                layer.mlp.sharding_group = self.group

            # Shard the MLP
@@ -583,17 +599,17 @@ class QwenShardingStrategy(TensorParallelShardingStrategy):


 class ShardedQwenMoE(CustomMlxLayer):
-    def __init__(self, layer: _LayerCallable):
+    def __init__(self, layer: nn.Module):
        super().__init__(layer)
        self.sharding_group: mx.distributed.Group | None = None

    def __call__(self, x: mx.array) -> mx.array:
        if self.sharding_group is not None:
            x = sum_gradients(self.sharding_group)(x)
-        y = self.original_layer.__call__(x)
+        y = self.original_layer.__call__(x)  # type: ignore
        if self.sharding_group is not None:
-            y = mx.distributed.all_sum(y, group=self.sharding_group)
-        return y
+            y = mx.distributed.all_sum(y, group=self.sharding_group)  # type: ignore
+        return y  # type: ignore


 class GptOssShardingStrategy(TensorParallelShardingStrategy):
@@ -645,7 +661,7 @@ class ShardedGptOssMoE(CustomMlxLayer):
    def __call__(self, x: mx.array) -> mx.array:
        if self.sharding_group is not None:
            x = sum_gradients(self.sharding_group)(x)
-        y = self.original_layer(x)
+        y = self.original_layer(x)  # type: ignore
        if self.sharding_group is not None:
-            y = mx.distributed.all_sum(y, group=self.sharding_group)
-        return y
+            y = mx.distributed.all_sum(y, group=self.sharding_group)  # type: ignore
+        return y  # type: ignore
--- a/src/exo/worker/engines/mlx/utils_mlx.py
+++ b/src/exo/worker/engines/mlx/utils_mlx.py
@@ -169,10 +169,10 @@ def mlx_distributed_init(

                # TODO: update once upstream fixes
                logger.info(
-                    f"rank {rank} MLX_IBV_DEVICES: {coordination_file} with devices: {jaccl_devices_json}"
+                    f"rank {rank} MLX_JACCL_DEVICES: {coordination_file} with devices: {jaccl_devices_json}"
                )
                logger.info(f"rank {rank} MLX_JACCL_COORDINATOR: {jaccl_coordinator}")
-                os.environ["MLX_IBV_DEVICES"] = coordination_file
+                os.environ["MLX_JACCL_DEVICES"] = coordination_file
                os.environ["MLX_RANK"] = str(rank)
                os.environ["MLX_JACCL_COORDINATOR"] = jaccl_coordinator
                group = mx.distributed.init(backend="jaccl", strict=True)
--- a/src/exo/worker/main.py
+++ b/src/exo/worker/main.py
@@ -413,11 +413,6 @@ class Worker:
            )
            for nid in conns:
                for ip in conns[nid]:
-                    if "127.0.0.1" in ip or "localhost" in ip:
-                        logger.warning(
-                            f"Loopback connection should not happen: {ip=} for {nid=}"
-                        )
-
                    edge = SocketConnection(
                        # nonsense multiaddr
                        sink_multiaddr=Multiaddr(address=f"/ip4/{ip}/tcp/52415")
@@ -438,6 +433,9 @@ class Worker:
            for conn in self.state.topology.out_edges(self.node_id):
                if not isinstance(conn.edge, SocketConnection):
                    continue
+                # ignore mDNS discovered connections
+                if conn.edge.sink_multiaddr.port != 52415:
+                    continue
                if (
                    conn.sink not in conns
                    or conn.edge.sink_multiaddr.ip_address
@@ -451,7 +449,7 @@ class Worker:
    async def _emit_existing_download_progress(self) -> None:
        try:
            while True:
-                logger.info("Fetching and emitting existing download progress...")
+                logger.debug("Fetching and emitting existing download progress...")
                async for (
                    _,
                    progress,
@@ -482,7 +480,7 @@ class Worker:
                    await self.event_sender.send(
                        NodeDownloadProgress(download_progress=status)
                    )
-                logger.info("Done emitting existing download progress.")
+                logger.debug("Done emitting existing download progress.")
                await anyio.sleep(5 * 60)  # 5 minutes
        except Exception as e:
            logger.error(f"Error emitting existing download progress: {e}")
--- a/src/exo/worker/runner/runner.py
+++ b/src/exo/worker/runner/runner.py
@@ -71,7 +71,6 @@ def main(
        bound_instance.bound_shard,
    )
    device_rank = shard_metadata.device_rank
-    world_size = shard_metadata.world_size
    logger.info("hello from the runner")
    if getattr(shard_metadata, "immediate_exception", False):
        raise Exception("Fake exception - runner failed to spin up.")
@@ -208,7 +207,7 @@ def main(
                        for response in mlx_generator:
                            match response:
                                case GenerationResponse():
-                                    if device_rank == world_size - 1:
+                                    if device_rank == 0:
                                        event_sender.send(
                                            ChunkGenerated(
                                                command_id=command_id,
--- a/src/exo/worker/tests/unittests/test_mlx/conftest.py
+++ b/src/exo/worker/tests/unittests/test_mlx/conftest.py
@@ -18,7 +18,7 @@ from exo.shared.types.tasks import ChatCompletionTaskParams
 from exo.shared.types.worker.shards import PipelineShardMetadata, TensorShardMetadata
 from exo.worker.engines.mlx import Model
 from exo.worker.engines.mlx.generator.generate import mlx_generate
-from exo.worker.engines.mlx.utils_mlx import apply_chat_template, shard_and_load
+from exo.worker.engines.mlx.utils_mlx import shard_and_load, apply_chat_template


 class MockLayer(nn.Module):
@@ -116,7 +116,6 @@ def run_gpt_oss_pipeline_device(
            messages=[ChatCompletionMessage(role="user", content=prompt_text)],
            max_tokens=max_tokens,
        )
-
        prompt = apply_chat_template(tokenizer, task)

        generated_text = ""
--- a/src/exo/worker/tests/unittests/test_mlx/test_auto_parallel.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_auto_parallel.py
@@ -10,9 +10,9 @@ import pytest

 from exo.worker.engines.mlx.auto_parallel import (
    CustomMlxLayer,
-    PipelineFirstLayer,
-    PipelineLastLayer,
-    patch_distributed_model,
+    patch_pipeline_first_layer,
+    patch_pipeline_last_layer,
+    patch_pipeline_model,
 )
 from exo.worker.tests.unittests.test_mlx.conftest import MockLayer

@@ -50,12 +50,12 @@ def run_pipeline_device(
        group = mx.distributed.init(backend="ring", strict=True)

        mock = MockLayerInner()
-        first = PipelineFirstLayer(mock, r=rank, group=group)
-        composed = PipelineLastLayer(first, r=rank, s=world_size, group=group)
+        first = patch_pipeline_first_layer(mock, group)
+        composed = patch_pipeline_last_layer(first, group)

        # Wrap in a mock model, then wrap in PipelineParallelModel for all_gather
        inner_model = MockModel([composed])
-        model = patch_distributed_model(inner_model)
+        model = patch_pipeline_model(inner_model, group)

        x = mx.ones((1, 4))
        result = model(x)
@@ -78,8 +78,8 @@ def test_composed_wrappers_delegate_attributes() -> None:
    mock = MockLayer()
    group = mx.distributed.init()

-    first = PipelineFirstLayer(mock, r=0, group=group)
-    composed = PipelineLastLayer(first, r=0, s=1, group=group)
+    first = patch_pipeline_first_layer(mock, group)
+    composed = patch_pipeline_last_layer(first, group)

    assert composed.custom_attr == "test_value"  # type: ignore[attr-defined]
    assert composed.use_sliding is True  # type: ignore[attr-defined]
@@ -138,14 +138,9 @@ def test_composed_call_works() -> None:
                f"Device {rank} failed: {errors.get(rank, 'unknown')}"
            )
            result_array = results[rank]
-            # Each device sees its local result: intermediate ranks return their
-            # computed output (before sending), last rank returns the final result.
-            # With world_size=2 and each layer doing x*2:
-            #   - Rank 0: 1.0 * 2 = 2.0 (sends to rank 1)
-            #   - Rank 1: 2.0 * 2 = 4.0 (last rank, final result)
-            expected = 2.0 * (2**rank)  # 2.0 for rank 0, 4.0 for rank 1
-            assert (result_array == expected).all(), (
-                f"Device {rank}: expected {expected}, got {result_array}"
+            # Both devices see the final result (4.0) after all_gather
+            assert (result_array == 4.0).all(), (
+                f"Device {rank}: expected 4.0, got {result_array}"
            )
    finally:
        os.unlink(hostfile_path)
--- a/src/exo/worker/tests/unittests/test_mlx/test_distributed_fix.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_distributed_fix.py
@@ -18,6 +18,7 @@ def _check_model_exists() -> bool:


 pytestmark = [
+    pytest.mark.slow,
    pytest.mark.skipif(
        not _check_model_exists(),
        reason=f"GPT-OSS model not found at {DEFAULT_GPT_OSS_CONFIG.model_path}",
--- a/src/exo/worker/tests/unittests/test_mlx/test_tokenizers.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_tokenizers.py
@@ -89,6 +89,8 @@ def get_test_models() -> list[tuple[str, ModelCard]]:

 TEST_MODELS: list[tuple[str, ModelCard]] = get_test_models()

+pytestmark = pytest.mark.slow
+

@pytest.fixture(scope="module")
 def event_loop():
Author	SHA1	Message	Date
Ryuichi Leo Takashige	8f6f2f3065	Add fixes	2026-01-20 17:13:02 +00:00
Evan	e6af53c2ae	foo	2026-01-20 17:12:31 +00:00
Alex Cheema	ea9c6d6bdf	Remove dead local paths code from download_shard (#1227 ) ## Motivation The `download_progress_for_local_path` function and the "Handle local paths" code block in `download_shard` are dead code that cannot be reached in normal usage. The code checks if `model_id` (e.g., "mlx-community/Llama-3.2-3B-Instruct-4bit") exists as a filesystem path, but model IDs are constrained to HuggingFace repo format and there's no API pathway to pass local paths. ## Changes - Removed `download_progress_for_local_path()` function (45 lines) - Removed the "Handle local paths" block in `download_shard()` (7 lines) ## Why It Works This code was added in PR #669 as part of a "feature-local-models" branch, but the feature was never fully integrated. The check `aios.path.exists(str(shard.model_card.model_id))` would only return true if a directory literally named "mlx-community/Llama-3.2-3B-Instruct-4bit" existed in the cwd, which doesn't happen in practice. Offline caching is already handled by `fetch_file_list_with_cache`. ## Test Plan ### Manual Testing - Run exo normally and verify downloads still work ### Automated Testing - Existing tests pass (this code had no test coverage) Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>	2026-01-20 17:07:27 +00:00
Alex Cheema	4ea66d427b	Reduce download log spam (#1225 ) ## Motivation When `skip_download=True`, exo was logging a lot of unnecessary messages during periodic download status checks. This resulted in spammy logs that made it hard to see important messages. ## Changes - Only log "Downloading ... with allow_patterns=..." when actually downloading (not when skip_download is true) - Changed periodic download progress check logs from INFO to DEBUG level ## Why It Works The `skip_download=True` parameter is used when checking download status without actually downloading. By guarding the log behind `if not skip_download:`, we avoid logging on every status check. Changing the periodic emitting logs to DEBUG level reduces noise while still keeping them available for debugging. ## Test Plan ### Manual Testing - Run exo and observe that logs are less spammy during normal operation - Use -v or -vv flags to see DEBUG logs when needed ### Automated Testing - Existing tests cover this code path	2026-01-20 16:57:05 +00:00
rltakashige	8b709e68b2	Mark slow tests as slow (#1220 ) ## Motivation <!-- Why is this change needed? What problem does it solve? --> <!-- If it fixes an open issue, please link to the issue here --> ## Changes <!-- Describe what you changed in detail --> ## Why It Works <!-- Explain why your approach solves the problem --> ## Test Plan ### Manual Testing <!-- Hardware: (e.g., MacBook Pro M1 Max 32GB, Mac Mini M2 16GB, connected via Thunderbolt 4) --> <!-- What you did: --> <!-- - --> ### Automated Testing <!-- Describe changes to automated tests, or how existing tests cover this change --> <!-- - -->	2026-01-20 15:03:46 +00:00
Evan Quiney	4da6eeb11f	fix a test broken by #1204 (#1219): bad merge broke a test - fix it	2026-01-20 14:56:20 +00:00
Evan	3d2eee4884	quiet localhost log: this log is just noise - remove it	2026-01-20 14:51:26 +00:00
Evan	116558839e	don't clear mdns discovered connections: pingers currently remove mdns discovered connections - these systems should be independent	2026-01-20 14:46:20 +00:00