fix: --no-downloads no longer blocks model loading (#1510)

When --no-downloads is passed, skip download checks in plan() so pre-staged models can be loaded without DownloadCompleted events. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Leo/address rdma gpu locks 2 (#1515)
2026-02-19 07:17:30 -05:00 · 2026-02-17 14:40:12 -08:00 · 2026-02-17 14:00:52 -08:00 · 2026-02-17 18:18:54 +00:00 · 2026-02-17 18:11:47 +00:00 · 2026-02-17 18:02:32 +00:00
11 changed files with 155 additions and 54 deletions
--- a/README.md
+++ b/README.md
@@ -72,16 +72,23 @@ There are two ways to run exo:

 ### Run from Source (macOS)

+If you have [Nix](https://nixos.org/) installed, you can skip most of the steps below and run exo directly (after accepting the Cachix cache):
+
+```bash
+nix run .#exo
+```
+
 **Prerequisites:**
+- [Xcode](https://developer.apple.com/xcode/) (provides the Metal ToolChain required for MLX compilation)
 - [brew](https://github.com/Homebrew/brew) (for simple package management on macOS)
-  
+
  ```bash
  /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
  ```
 - [uv](https://github.com/astral-sh/uv) (for Python dependency management)
 - [macmon](https://github.com/vladkens/macmon) (for hardware monitoring on Apple Silicon)
 - [node](https://github.com/nodejs/node) (for building the dashboard)
-  
+
  ```bash
  brew install uv macmon node
  ```
--- a/app/EXO/EXO/ExoProcessController.swift
+++ b/app/EXO/EXO/ExoProcessController.swift
@@ -126,11 +126,37 @@ final class ExoProcessController: ObservableObject {
            return
        }
        process.terminationHandler = nil
-        if process.isRunning {
-            process.terminate()
-        }
-        self.process = nil
        status = .stopped
+
+        guard process.isRunning else {
+            self.process = nil
+            return
+        }
+
+        let proc = process
+        self.process = nil
+
+        Task.detached {
+            proc.interrupt()
+
+            for _ in 0..<50 {
+                if !proc.isRunning { return }
+                try? await Task.sleep(nanoseconds: 100_000_000)
+            }
+
+            if proc.isRunning {
+                proc.terminate()
+            }
+
+            for _ in 0..<30 {
+                if !proc.isRunning { return }
+                try? await Task.sleep(nanoseconds: 100_000_000)
+            }
+
+            if proc.isRunning {
+                kill(proc.processIdentifier, SIGKILL)
+            }
+        }
    }

    func restart() {
--- a/flake.nix
+++ b/flake.nix
@@ -115,7 +115,7 @@
          packages = lib.optionalAttrs pkgs.stdenv.hostPlatform.isDarwin (
            let
              uvLock = builtins.fromTOML (builtins.readFile ./uv.lock);
-              mlxPackage = builtins.head (builtins.filter (p: p.name == "mlx") uvLock.package);
+              mlxPackage = builtins.head (builtins.filter (p: p.name == "mlx" && p.source ? git) uvLock.package);
              uvLockMlxVersion = mlxPackage.version;
            in
            {
--- a/nix/mlx.nix
+++ b/nix/mlx.nix
@@ -41,16 +41,16 @@ let

  mlx = stdenv.mkDerivation rec {
    pname = "mlx";
-    version = let v = "0.30.6"; in
+    version = let v = "0.30.7.dev20260217+50487b41"; in
      assert v == uvLockMlxVersion || throw "MLX version mismatch: nix/mlx.nix has ${v} but uv.lock has ${uvLockMlxVersion}. Update both the version and hash in nix/mlx.nix.";
      v;
    pyproject = true;

    src = fetchFromGitHub {
-      owner = "ml-explore";
-      repo = "mlx";
-      tag = "v${version}";
-      hash = "sha256-avD5EGhwgmPdXLAyQSqTO6AXk/W3ziH+f6AetjK3Sdo=";
+      owner = "rltakashige";
+      repo = "mlx-jaccl-fix-small-recv";
+      rev = "50487b4141f3c951122655db3b83df5146c1fbeb";
+      hash = "sha256-IL4a9vMX5nocgJU1WG4zE8hArHkHJtnh4sdYh3od5zU=";
    };

    patches = [
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ dependencies = [
    "loguru>=0.7.3",
    "exo_pyo3_bindings", # rust bindings
    "anyio==4.11.0",
-    "mlx==0.30.6; sys_platform == 'darwin'",
+    "mlx; sys_platform == 'darwin'",
    "mlx[cpu]==0.30.6; sys_platform == 'linux'",
    "mlx-lm==0.30.6",
    "tiktoken>=0.12.0", # required for kimi k2 tokenizer
@@ -64,6 +64,7 @@ members = [

 [tool.uv.sources]
 exo_pyo3_bindings = { workspace = true }
+mlx = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git", branch = "address-rdma-gpu-locks", marker = "sys_platform == 'darwin'" }
 #mlx-lm = { git = "https://github.com/davidmcc73/mlx-lm", branch = "stable" }
 # Uncomment to use local mlx/mlx-lm development versions:
 # mlx = { path = "/Users/Shared/mlx", editable=true }
--- a/python/parts.nix
+++ b/python/parts.nix
@@ -58,6 +58,21 @@
        lib.optionalAttrs pkgs.stdenv.hostPlatform.isLinux (
          (lib.mapAttrs (_: ignoreMissing) nvidiaPackages) // {
            mlx = ignoreMissing prev.mlx;
+            mlx-cuda-13 = prev.mlx-cuda-13.overrideAttrs (old: {
+              buildInputs = (old.buildInputs or [ ]) ++ [
+                final.nvidia-cublas
+                final.nvidia-cuda-nvrtc
+                final.nvidia-cudnn-cu13
+                final.nvidia-nccl-cu13
+              ];
+              preFixup = ''
+                addAutoPatchelfSearchPath ${final.nvidia-cublas}
+                addAutoPatchelfSearchPath ${final.nvidia-cuda-nvrtc}
+                addAutoPatchelfSearchPath ${final.nvidia-cudnn-cu13}
+                addAutoPatchelfSearchPath ${final.nvidia-nccl-cu13}
+              '';
+              autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" ];
+            });
            torch = ignoreMissing prev.torch;
            triton = ignoreMissing prev.triton;
          }
@@ -74,14 +89,25 @@
          linuxOverlay
        ]
      );
-      exoVenv = pythonSet.mkVirtualEnv "exo-env" workspace.deps.default;
+      # mlx-cpu and mlx-cuda-13 both ship mlx/ site-packages files; keep first.
+      # mlx-cpu/mlx-cuda-13 and nvidia-cudnn-cu12/cu13 ship overlapping files.
+      venvCollisionPaths = lib.optionals pkgs.stdenv.hostPlatform.isLinux [
+        "lib/python3.13/site-packages/mlx*"
+        "lib/python3.13/site-packages/nvidia*"
+      ];
+
+      exoVenv = (pythonSet.mkVirtualEnv "exo-env" workspace.deps.default).overrideAttrs {
+        venvIgnoreCollisions = venvCollisionPaths;
+      };

      # Virtual environment with dev dependencies for testing
-      testVenv = pythonSet.mkVirtualEnv "exo-test-env" (
+      testVenv = (pythonSet.mkVirtualEnv "exo-test-env" (
        workspace.deps.default // {
          exo = [ "dev" ]; # Include pytest, pytest-asyncio, pytest-env
        }
-      );
+      )).overrideAttrs {
+        venvIgnoreCollisions = venvCollisionPaths;
+      };

      mkPythonScript = name: path: pkgs.writeShellApplication {
        inherit name;
--- a/src/exo/main.py
+++ b/src/exo/main.py
@@ -94,6 +94,7 @@ class Node:
                command_sender=router.sender(topics.COMMANDS),
                download_command_sender=router.sender(topics.DOWNLOAD_COMMANDS),
                event_index_counter=event_index_counter,
+                no_downloads=args.no_downloads,
            )
        else:
            worker = None
@@ -136,6 +137,8 @@ class Node:

    async def run(self):
        async with self._tg as tg:
+            signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
+            signal.signal(signal.SIGTERM, lambda _, __: self.shutdown())
            tg.start_soon(self.router.run)
            tg.start_soon(self.election.run)
            if self.download_coordinator:
@@ -147,8 +150,6 @@ class Node:
            if self.api:
                tg.start_soon(self.api.run)
            tg.start_soon(self._elect_loop)
-            signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
-            signal.signal(signal.SIGTERM, lambda _, __: self.shutdown())

    def shutdown(self):
        # if this is our second call to shutdown, just sys.exit
@@ -225,6 +226,7 @@ class Node:
                        )
                        self._tg.start_soon(self.download_coordinator.run)
                    if self.worker:
+                        no_downloads = self.worker.no_downloads
                        self.worker.shutdown()
                        # TODO: add profiling etc to resource monitor
                        self.worker = Worker(
@@ -239,6 +241,7 @@ class Node:
                                topics.DOWNLOAD_COMMANDS
                            ),
                            event_index_counter=self.event_index_counter,
+                            no_downloads=no_downloads,
                        )
                        self._tg.start_soon(self.worker.run)
                    if self.api:
--- a/src/exo/worker/main.py
+++ b/src/exo/worker/main.py
@@ -65,6 +65,7 @@ class Worker:
        command_sender: Sender[ForwarderCommand],
        download_command_sender: Sender[ForwarderDownloadCommand],
        event_index_counter: Iterator[int],
+        no_downloads: bool = False,
    ):
        self.node_id: NodeId = node_id
        self.session_id: SessionId = session_id
@@ -74,6 +75,7 @@ class Worker:
        self.event_index_counter = event_index_counter
        self.command_sender = command_sender
        self.download_command_sender = download_command_sender
+        self.no_downloads = no_downloads
        self.event_buffer = OrderedBuffer[Event]()
        self.out_for_delivery: dict[EventId, ForwarderEvent] = {}

@@ -182,6 +184,7 @@ class Worker:
                self.state.tasks,
                self.input_chunk_buffer,
                self.input_chunk_counts,
+                no_downloads=self.no_downloads,
            )
            if task is None:
                continue
--- a/src/exo/worker/plan.py
+++ b/src/exo/worker/plan.py
@@ -51,15 +51,20 @@ def plan(
    tasks: Mapping[TaskId, Task],
    input_chunk_buffer: Mapping[CommandId, dict[int, str]] | None = None,
    input_chunk_counts: Mapping[CommandId, int] | None = None,
+    no_downloads: bool = False,
 ) -> Task | None:
    # Python short circuiting OR logic should evaluate these sequentially.
    return (
        _cancel_tasks(runners, tasks)
        or _kill_runner(runners, all_runners, instances)
        or _create_runner(node_id, runners, instances)
-        or _model_needs_download(node_id, runners, global_download_status)
+        or (
+            None
+            if no_downloads
+            else _model_needs_download(node_id, runners, global_download_status)
+        )
        or _init_distributed_backend(runners, all_runners)
-        or _load_model(runners, all_runners, global_download_status)
+        or _load_model(runners, all_runners, global_download_status, no_downloads)
        or _ready_to_warmup(runners, all_runners)
        or _pending_tasks(runners, tasks, all_runners, input_chunk_buffer or {})
    )
@@ -192,22 +197,25 @@ def _load_model(
    runners: Mapping[RunnerId, RunnerSupervisor],
    all_runners: Mapping[RunnerId, RunnerStatus],
    global_download_status: Mapping[NodeId, Sequence[DownloadProgress]],
+    no_downloads: bool = False,
 ) -> LoadModel | None:
    for runner in runners.values():
        instance = runner.bound_instance.instance
        shard_assignments = instance.shard_assignments

-        all_local_downloads_complete = all(
-            nid in global_download_status
-            and any(
-                isinstance(dp, DownloadCompleted)
-                and dp.shard_metadata.model_card.model_id == shard_assignments.model_id
-                for dp in global_download_status[nid]
+        if not no_downloads:
+            all_local_downloads_complete = all(
+                nid in global_download_status
+                and any(
+                    isinstance(dp, DownloadCompleted)
+                    and dp.shard_metadata.model_card.model_id
+                    == shard_assignments.model_id
+                    for dp in global_download_status[nid]
+                )
+                for nid in shard_assignments.node_to_runner
            )
-            for nid in shard_assignments.node_to_runner
-        )
-        if not all_local_downloads_complete:
-            continue
+            if not all_local_downloads_complete:
+                continue

        is_single_node_instance = len(instance.shard_assignments.runner_to_shard) == 1
        if is_single_node_instance and isinstance(runner.status, RunnerIdle):
--- a/src/exo/worker/tests/unittests/test_plan/test_download_and_loading.py
+++ b/src/exo/worker/tests/unittests/test_plan/test_download_and_loading.py
@@ -157,6 +157,41 @@ def test_plan_does_not_request_download_when_shard_already_downloaded():
    assert not isinstance(result, plan_mod.DownloadModel)


+def test_plan_loads_model_with_no_downloads_flag():
+    """
+    When no_downloads=True, plan() should skip download checks and proceed
+    to load the model even with empty global_download_status.
+    """
+    shard = get_pipeline_shard_metadata(model_id=MODEL_A_ID, device_rank=0)
+    instance = get_mlx_ring_instance(
+        instance_id=INSTANCE_1_ID,
+        model_id=MODEL_A_ID,
+        node_to_runner={NODE_A: RUNNER_1_ID},
+        runner_to_shard={RUNNER_1_ID: shard},
+    )
+    bound_instance = BoundInstance(
+        instance=instance, bound_runner_id=RUNNER_1_ID, bound_node_id=NODE_A
+    )
+    runner = FakeRunnerSupervisor(bound_instance=bound_instance, status=RunnerIdle())
+
+    runners = {RUNNER_1_ID: runner}
+    instances = {INSTANCE_1_ID: instance}
+    all_runners = {RUNNER_1_ID: RunnerIdle()}
+
+    result = plan_mod.plan(
+        node_id=NODE_A,
+        runners=runners,  # type: ignore
+        global_download_status={},
+        instances=instances,
+        all_runners=all_runners,
+        tasks={},
+        no_downloads=True,
+    )
+
+    assert isinstance(result, LoadModel)
+    assert result.instance_id == INSTANCE_1_ID
+
+
 def test_plan_does_not_load_model_until_all_shards_downloaded_globally():
    """
    LoadModel should not be emitted while some shards are still missing from
--- a/uv.lock
+++ b/uv.lock
@@ -377,8 +377,8 @@ dependencies = [
    { name = "hypercorn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "loguru", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "mflux", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "mlx", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "mlx", extra = ["cpu"], marker = "sys_platform == 'linux'" },
+    { name = "mlx", version = "0.30.6", source = { registry = "https://pypi.org/simple" }, extra = ["cpu"], marker = "sys_platform == 'linux'" },
+    { name = "mlx", version = "0.30.7.dev20260217+50487b41", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#50487b4141f3c951122655db3b83df5146c1fbeb" }, marker = "sys_platform == 'darwin'" },
    { name = "mlx-lm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "msgspec", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "openai-harmony", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -416,7 +416,7 @@ requires-dist = [
    { name = "hypercorn", specifier = ">=0.18.0" },
    { name = "loguru", specifier = ">=0.7.3" },
    { name = "mflux", specifier = "==0.15.5" },
-    { name = "mlx", marker = "sys_platform == 'darwin'", specifier = "==0.30.6" },
+    { name = "mlx", marker = "sys_platform == 'darwin'", git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks" },
    { name = "mlx", extras = ["cpu"], marker = "sys_platform == 'linux'", specifier = "==0.30.6" },
    { name = "mlx-lm", specifier = "==0.30.6" },
    { name = "msgspec", specifier = ">=0.19.0" },
@@ -1020,8 +1020,8 @@ dependencies = [
    { name = "fonttools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "huggingface-hub", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "matplotlib", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "mlx", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "mlx", extra = ["cuda13"], marker = "sys_platform == 'linux'" },
+    { name = "mlx", version = "0.30.6", source = { registry = "https://pypi.org/simple" }, extra = ["cuda13"], marker = "sys_platform == 'linux'" },
+    { name = "mlx", version = "0.30.7.dev20260217+50487b41", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#50487b4141f3c951122655db3b83df5146c1fbeb" }, marker = "sys_platform == 'darwin'" },
    { name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "opencv-python", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "piexif", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -1048,18 +1048,12 @@ wheels = [
 name = "mlx"
 version = "0.30.6"
 source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "mlx-metal", marker = "sys_platform == 'darwin'" },
+resolution-markers = [
+    "sys_platform == 'linux'",
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ae/5b/e460e144a34d5529e010056cccf50b538d56ed001473bc6b246018fd58cb/mlx-0.30.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ed86f8bffc174c2f259ca589ea25464c96cf69d1bb457074a2bf2ef53737e54f", size = 573515, upload-time = "2026-02-06T03:45:23.405Z" },
-    { url = "https://files.pythonhosted.org/packages/60/25/69833fefb9a3fef30b56792b1bcd022496c4fea83e45411d289b77ef7546/mlx-0.30.6-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:c52294958269e20f300639a17c1900ca8fc737d859ddda737f9811e94bd040e5", size = 573516, upload-time = "2026-02-06T03:45:24.618Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/6a/7e7fbeebc5cb51b6a5eba96b263a6298707bcbdc059f4b0b73e088bc3dea/mlx-0.30.6-cp313-cp313-macosx_26_0_arm64.whl", hash = "sha256:b5b6636f7c49a4d86d8ec82643b972f45a144a7a9f3a967b27b2e6e22cf71e6a", size = 573592, upload-time = "2026-02-06T03:45:25.928Z" },
    { url = "https://files.pythonhosted.org/packages/93/06/280f6f2ba80520a7109730425eda0d966658793aa0d02d8be8d351f75253/mlx-0.30.6-cp313-cp313-manylinux_2_35_aarch64.whl", hash = "sha256:67e6c9e30a9faeacc209917ef5523177cf9b086914b6b5d83ff886e4294b727d", size = 622011, upload-time = "2026-02-06T03:45:28.165Z" },
    { url = "https://files.pythonhosted.org/packages/fe/35/f872afbee9c079cc69924d9e9c46f5663adb7da58cba3511db082dd307c1/mlx-0.30.6-cp313-cp313-manylinux_2_35_x86_64.whl", hash = "sha256:47db8b16fcb6f6c5a47c0bdb24ed377b41237017ac93aa6cb6aa206c9bdf82e4", size = 663650, upload-time = "2026-02-06T03:45:30.315Z" },
-    { url = "https://files.pythonhosted.org/packages/60/23/361dc7a5797634e4d7e9bdd6564c6b28f9b1246672632def2f91bf066b18/mlx-0.30.6-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:78804a89dcff4a838f7c2da72392fe87a523e95122a3c840e53df019122aad45", size = 575028, upload-time = "2026-02-06T03:45:31.549Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/69/1854484d414171586814dfbe8def95f75c4ea2c7341ba13ba8ee675f7c62/mlx-0.30.6-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:ec13584ab069665cc7ad34a05494d9291cd623aef6ae96be48875fc87cfc25d6", size = 575026, upload-time = "2026-02-06T03:45:33.072Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/b8/3adbc441924209a7e4c568308b2a0b54bd09aee6a68db5bae85304791e54/mlx-0.30.6-cp314-cp314-macosx_26_0_arm64.whl", hash = "sha256:b2c5e8a090a753ef99a1380a4d059c983083f36198864f6df9faaf1223d083df", size = 575041, upload-time = "2026-02-06T03:45:34.814Z" },
    { url = "https://files.pythonhosted.org/packages/3f/54/9d9e06804fb2088202a2cdf60458e00b221f71420bea285720b60f9e82b5/mlx-0.30.6-cp314-cp314-manylinux_2_35_aarch64.whl", hash = "sha256:9ceddede4af0de31d1f6b3099f70e5469d60cd7c546975dedbdbeab3519cab3f", size = 624002, upload-time = "2026-02-06T03:45:36Z" },
    { url = "https://files.pythonhosted.org/packages/42/92/3140a15a50cb1f9267a6552171e1dfa577861de53e093124bc43707f2a0e/mlx-0.30.6-cp314-cp314-manylinux_2_35_x86_64.whl", hash = "sha256:4a6ffd2d16728cf95f63a1b555d7c2eaeea686a0e6b73228bd265411cb5d77a4", size = 663569, upload-time = "2026-02-06T03:45:37.242Z" },
 ]
@@ -1072,6 +1066,14 @@ cuda13 = [
    { name = "mlx-cuda-13", marker = "sys_platform == 'linux'" },
 ]

+[[package]]
+name = "mlx"
+version = "0.30.7.dev20260217+50487b41"
+source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#50487b4141f3c951122655db3b83df5146c1fbeb" }
+resolution-markers = [
+    "sys_platform == 'darwin'",
+]
+
 [[package]]
 name = "mlx-cpu"
 version = "0.30.6"
@@ -1102,7 +1104,7 @@ version = "0.30.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "mlx", marker = "sys_platform == 'darwin'" },
+    { name = "mlx", version = "0.30.7.dev20260217+50487b41", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#50487b4141f3c951122655db3b83df5146c1fbeb" }, marker = "sys_platform == 'darwin'" },
    { name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -1114,16 +1116,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/20/5f/01d281f1fa8a1521d5936659beb4f5ab1f32b463d059263cf9d4cef969d9/mlx_lm-0.30.6-py3-none-any.whl", hash = "sha256:a7405bd581eacc4bf8209d7a6b7f23629585a0d7c6740c2a97e51fee35b3b0e1", size = 379451, upload-time = "2026-02-04T21:27:43.222Z" },
 ]

-[[package]]
-name = "mlx-metal"
-version = "0.30.6"
-source = { registry = "https://pypi.org/simple" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f3/85/44406b521f920248fad621334d4dc15e77660a494edf890e7cbee33bf38d/mlx_metal-0.30.6-py3-none-macosx_14_0_arm64.whl", hash = "sha256:ea6d0c973def9a5b4f652cc77036237db3f88c9d0af63701d76b5fddde99b820", size = 38437818, upload-time = "2026-02-06T03:44:56.19Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/cb/10a516995f7d0c154b0d7e633c54b51e96977a86a355105b6474cfcbe0d0/mlx_metal-0.30.6-py3-none-macosx_15_0_arm64.whl", hash = "sha256:0f8cb94634d07e06a372d6ad9a090f38a18bab1ff19a140aede60eacf707bb94", size = 38433701, upload-time = "2026-02-06T03:44:59.678Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/7d/70cb272f7373c334709f210ed8420511fc9d64d05a7a646c0b3b94c29c04/mlx_metal-0.30.6-py3-none-macosx_26_0_arm64.whl", hash = "sha256:d761ae26304f2c4b454eeea7f612a56919d9e5e57dbb1dc0788f8e34aa6f41c2", size = 47718448, upload-time = "2026-02-06T03:45:03.133Z" },
-]
-
 [[package]]
 name = "more-itertools"
 version = "10.8.0"
Author	SHA1	Message	Date
Alex Cheema	ce45af58e3	fix: --no-downloads no longer blocks model loading (#1510) When --no-downloads is passed, skip download checks in plan() so pre-staged models can be loaded without DownloadCompleted events. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-17 14:40:12 -08:00
rltakashige	f2be929211	Leo/address rdma gpu locks 2 (#1515) Same as #1489. Had to revert and redo thanks to Claude. --------- Co-authored-by: Jake Hillion <jake@hillion.co.uk> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-17 14:00:52 -08:00
rltakashige	83af8c63fa	Revert "Use custom fork that resolves GPU locks" (#1502) Reverts exo-explore/exo#1489 Goddammit Claude...	2026-02-17 18:18:54 +00:00
Evan Quiney	eccc6298d1	Revert "Add MetaInstance declarative layer (#1447)" This reverts commit `a962a28afc`.	2026-02-17 18:11:47 +00:00
Evan Quiney	c8997217cf	Revert "feat: better onboarding UX for new users (#1479)" This reverts commit `490d2e46ba`.	2026-02-17 18:02:32 +00:00
Alex Cheema	490d2e46ba	feat: better onboarding UX for new users (#1479 ) ## Summary - Auto-open dashboard in browser on first launch (uses `~/.exo/.dashboard_opened` marker) - Welcome overlay with "Choose a Model" CTA button when no model instance is running - Tutorial progress messages during model download → loading → ready lifecycle stages - Fix conversation sidebar text contrast — bumped to white text, added active state background - Simplify technical jargon — sharding/instance type/min nodes hidden behind collapsible "Advanced Options" toggle; strategy display hidden behind debug mode - Polished DMG installer with drag-to-Applications layout, custom branded background, and AppleScript-configured window positioning ## Test plan - [ ] Launch exo for the first time (delete `~/.exo/.dashboard_opened` to simulate) — browser should auto-open - [ ] Verify welcome overlay appears on topology when no model is loaded - [ ] Launch a model and verify download/loading/ready messages appear in instance cards - [ ] Check conversation sidebar text is readable (white on dark, yellow when active) - [ ] Verify "Advanced Options" toggle hides/shows sharding controls - [ ] Build DMG with `packaging/dmg/create-dmg.sh` and verify drag-to-Applications layout 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-17 17:52:49 +00:00
rltakashige	facf2d4d03	Use custom fork that resolves GPU locks (#1489 ) ## Motivation There is an issue on Macs that means that an explicit synchronization is necessary for memory to be updated from L1 cache. This means that GPU locks can occur when a spin wait does not see the updated timestamp. ## Changes Updated in my own personal fork. ## Why It Works https://github.com/ARM-software/acle/releases ## Test Plan ### Manual Testing Tested manually that no GPU locks occur (even with multiple simultaneous instances running) and that the performance differential is negligible (267 vs 269 tps on Llama 3.2 1B at an approx 10k context.) ------------------------------------------------------ I have seen a GPU lock, specifically when sending a particularly large chat completion while the model was loading. However, I have since been unable to reproduce and this may be something I did wrong. Please do create an issue and tag me if any GPU locks do occur. --------- Co-authored-by: Jake Hillion <jake@hillion.co.uk> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-17 17:48:43 +00:00
Alex Cheema	a962a28afc	Add MetaInstance declarative layer (#1447 ) ## Motivation Users currently manage instances directly, which means if a node disconnects or connections break, the instance dies and nothing recreates it. MetaInstance is a declarative primitive: "ensure an instance matching these parameters always exists." The reconciler watches for unhealthy or missing backing instances and re-places them automatically. ## Changes - MetaInstance type (`meta_instance.py`): declarative constraint with `model_id`, `min_nodes`, optional `node_ids`, and `sharding` - Reconciler (`reconcile.py`): `find_unsatisfied_meta_instances` checks which MetaInstances lack a healthy backing instance, `try_place_for_meta_instance` creates one - Master loop (`main.py`): periodically reconciles unsatisfied MetaInstances; immediate placement on `CreateMetaInstance` command - API (`api.py`): `create_meta_instance` / `delete_meta_instance` / `GET /meta_instances` endpoints; delete cascades to backing instances with task cancellation - Binding via `meta_instance_id` on Instance (`instances.py`): no separate binding event or backing map — the instance carries its parent MetaInstance ID directly, eliminating race conditions in the reconciler - Dashboard: sidebar shows MetaInstances with their backing instance status; orphan instances (created directly) still shown separately - Tests: constraint matching, connection health, unsatisfied detection, exclusive binding, cascade delete with task cancellation ### Recent improvements - fix: cancel active tasks on cascade delete — `DeleteMetaInstance` now emits `TaskStatusUpdated(Cancelled)` for any Pending/Running tasks on backing instances before emitting `InstanceDeleted`. Previously, cascade-deleting backing instances left orphaned task references in state. 
- Lifecycle logging — added `logger.info`/`logger.warning` for: `CreateMetaInstance` (model, min_nodes, sharding), `DeleteMetaInstance` (with cascade count), reconciler placement success/failure, and retry decisions with attempt counts in `InstanceHealthReconciler`. - GET `/meta_instances` endpoint — lists all meta-instances without needing to fetch full state. - 2 regression tests — `test_cascade_delete_cancels_active_tasks` and `test_cascade_delete_skips_completed_tasks` verify the cascade-delete event sequence. ## Why It Works Putting `meta_instance_id` on `BaseInstance` makes binding inherent to instance creation. When the reconciler creates an instance for a MetaInstance, it tags it via `model_copy`. When the instance is deleted, the binding disappears with it. This avoids the two bugs that a separate binding mechanism would introduce: 1. Stale exclusion sets — the reconciler loop can't accidentally bind two MetaInstances to the same instance 2. Delete ordering race — no window between deleting an instance and its binding where the reconciler could re-place ## Test Plan ### Manual Testing <!-- Hardware: (e.g., MacBook Pro M1 Max 32GB, Mac Mini M2 16GB, connected via Thunderbolt 4) --> - Created MetaInstance via dashboard, verified instance placed - Verified delete cascades (deleting MetaInstance removes backing instance) - Verified orphan instances still work independently ### Automated Testing - 30 tests in `test_meta_instance_edge_cases.py`: lifecycle, retry logic, error handling, concurrent operations, cascade delete with task cancellation - 24 tests in `test_reconcile.py`: constraint matching, connection health (single/multi-node, edge removal, IP changes), unsatisfied detection, exclusive binding, idempotency - All 261 tests pass - basedpyright 0 errors, ruff clean, dashboard builds --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-17 09:48:19 -08:00
Alex Cheema	db79c350c1	Fix graceful process shutdown in macOS app (#1372 ) ## Motivation Fixes #1370 When the macOS app stops exo, GPU/system memory isn't released. This happens because: 1. The macOS app calls `process.terminate()` (SIGTERM) but the Python process only registers a graceful shutdown handler for SIGINT, not SIGTERM. SIGTERM's default Python behavior raises `SystemExit` which bypasses the cleanup cascade (runner subprocess MLX cleanup via `mx.clear_cache()`, channel closing, etc.). 2. The app doesn't wait for the process to actually finish cleanup — it immediately nils out the process reference. ## Changes `src/exo/main.py`: Register SIGTERM handler alongside SIGINT so the graceful shutdown cascade (`Node.shutdown()` → cancel task group → worker/runner cleanup → `mx.clear_cache()` + `gc.collect()`) runs regardless of which signal is received. `app/EXO/EXO/ExoProcessController.swift`: Replace immediate `process.terminate()` with escalating shutdown per @Evanev7's suggestion: 1. Send SIGINT via `process.interrupt()` — triggers the registered Python handler for graceful cleanup 2. Wait up to 5 seconds for the process to exit 3. If still running, escalate to SIGTERM via `process.terminate()` 4. Wait up to 3 seconds 5. If still running, force kill via SIGKILL The escalation runs in a detached `Task` so the UI updates immediately (status → stopped) without blocking. ## Why It Works The root cause is that SIGTERM wasn't triggering the graceful shutdown path. By registering a SIGTERM handler in Python and sending SIGINT first from the macOS app, the process gets a chance to run the full cleanup cascade: cancelling the task group, shutting down runners (which call `del model; mx.clear_cache(); gc.collect()`), closing channels, and flushing logs. The escalation to SIGTERM and SIGKILL ensures the process always terminates even if graceful shutdown hangs. 
## Test Plan ### Manual Testing <!-- Hardware: Mac Studio M4 Max 128GB --> - Start exo via macOS app, load a model, run inference - Stop via the toggle switch, verify memory is released without requiring a system restart - Test rapid stop/start (restart) to ensure no race conditions ### Automated Testing - `uv run basedpyright` — 0 errors - `uv run ruff check` — passes - `nix fmt` — no changes --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> Co-authored-by: Evan Quiney <evanev7@gmail.com>	2026-02-17 09:03:54 -08:00