Compare commits

..

44 Commits

Author SHA1 Message Date
Alex Cheema
17cb9e5425 fix: resolve post-rebase type errors (duplicate decls, missing tasks param)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
b72a5497e3 fix: cancel active tasks on meta-instance cascade delete
Previously, DeleteMetaInstance cascade-deleted backing instances without
cancelling their active tasks, leaving orphaned task references. Now emits
TaskStatusUpdated(Cancelled) for Pending/Running tasks before InstanceDeleted.

Also adds lifecycle logging for meta-instance operations, a GET /meta_instances
endpoint, and 2 regression tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
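A minimal sketch of the event ordering this commit describes, using the event names from the message (TaskStatusUpdated, InstanceDeleted, MetaInstanceDeleted); the state shape and the helper itself are assumptions, not the repo's actual handler:

```python
# Hypothetical helper: collect cascade-delete events for one meta-instance,
# cancelling live tasks on each backing instance before deleting it.
def cascade_delete_events(state, meta_instance_id):
    events = []
    for instance_id, instance in state.instances.items():
        if instance.meta_instance_id != meta_instance_id:
            continue
        for task in state.tasks.values():
            if task.instance_id == instance_id and task.task_status in (
                TaskStatus.Pending,
                TaskStatus.Running,
            ):
                # Cancel before InstanceDeleted so no task reference is orphaned.
                events.append(TaskStatusUpdated(
                    task_id=task.task_id, task_status=TaskStatus.Cancelled
                ))
        events.append(InstanceDeleted(instance_id=instance_id))
    events.append(MetaInstanceDeleted(meta_instance_id=meta_instance_id))
    return events
```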
Alex Cheema
d379aba2a3 fix: dashboard derived reactivity bug and tautological check in meta-instance UI
1. DERIVED REACTIVITY BUG: `unifiedDisplayItems` used `$derived(fn)` which
   made the derived value the function itself instead of its result. Svelte
   never tracked reactive dependencies in the function body, so the instance
   list didn't update when metaInstances or instances changed. Fixed by using
   `$derived.by(fn)` and removing the `()` call-sites in the template.

2. TAUTOLOGICAL CHECK: In `getMetaInstancePlacingStatus`, the `lastError ? ...
   : null` guard inside the `failures > 0` branch was always true because
   `lastFailureError` and `consecutiveFailures` are always set together in
   `apply_instance_retrying` and `apply_instance_deleted`. Removed the dead
   `: null` branch.

Also fixes pyright errors in test file by using proper pytest.MonkeyPatch type.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
5db1d023ec fix: add timeout and error handling for ModelCard.load in MetaInstanceReconciler
ModelCard.load() does async I/O inside the 1-second reconcile loop. A slow
or failing load blocked all reconciliation (health checks, node timeouts,
other meta-instances). Adds a 10-second timeout, per-meta-instance error
handling with MetaInstancePlacementFailed events, and documents the
intentional early return in apply_instance_retrying.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
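A sketch of the timeout pattern, assuming anyio (which the API code in the diffs below already uses) and that ModelCard.load takes a model id; the MetaInstancePlacementFailed field names are illustrative:

```python
import anyio

async def load_card_with_timeout(meta_instance, events):
    # Bound the async I/O so one slow or failing load cannot stall the
    # 1-second reconcile loop for every other meta-instance.
    try:
        with anyio.fail_after(10):
            return await ModelCard.load(meta_instance.model_id)
    except TimeoutError:
        events.append(MetaInstancePlacementFailed(
            meta_instance_id=meta_instance.id,
            error="timed out loading model card after 10s",
        ))
    except Exception as exc:
        # Per-meta-instance handling: one bad model card does not block
        # placement of the others.
        events.append(MetaInstancePlacementFailed(
            meta_instance_id=meta_instance.id, error=str(exc)
        ))
    return None
```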
Alex Cheema
bd148d815a fix: validate placement before sending command to prevent silent failures (BUG-001c)
The place_instance API endpoint used fire-and-forget: it sent the command
and returned HTTP 200 immediately. On a fresh cluster start, the master's
state often lacks topology/memory data, so placement raises ValueError
which was silently caught and logged. The caller never learned it failed.

Two fixes:
- API: validate placement locally before sending, return HTTP 400 on
  failure instead of silently accepting an unprocessable command
- Master: emit MetaInstancePlacementFailed on immediate placement error
  in CreateMetaInstance handler so the error surfaces in state right away

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
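A hypothetical sketch of the API-side check; `validate_placement`, `http_400`/`http_ok`, and the state fields stand in for the repo's real placement helper, web framework, and state object:

```python
async def place_instance(request):
    try:
        # Dry-run the same placement logic against the API's local view of
        # state; a fresh cluster with no topology/memory data fails here.
        validate_placement(
            request.meta_instance,
            instances=state.instances,
            node_memory=state.node_memory,
            node_network=state.node_network,
        )
    except ValueError as exc:
        return http_400(f"placement failed: {exc}")
    # Only send the command once we know the master can process it.
    await command_sender.send(CreateMetaInstance(meta_instance=request.meta_instance))
    return http_ok()
```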
Alex Cheema
fe508320e3 fix: place_instance now uses all available nodes to prevent OOM (BUG-001d)
The placement algorithm previously selected the smallest viable cycle,
causing large models to be distributed across too few nodes and running
out of memory. Changed get_smallest_cycles to get_largest_cycles so that
all healthy nodes are utilized, spreading layers more evenly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
2d37a2be16 feat: add task cancellation for client disconnect handling (BUG-001)
- Add TaskCancelled command and Cancelled task status
- Detect API client disconnects in master/api.py
- Handle TaskCancelled in master state machine
- Add _cancel_tasks to worker for graceful task cleanup
- Add cancel_receiver to runner for inference abort
- Add mx_any helper in MLX utils for cancellable operations
- Guard instance lookup in worker to prevent KeyError
- Update tests for cancellation flow

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
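The disconnect path mirrors the api.py hunks in the diff below; `stream_tokens` is a hypothetical stand-in for the real token-streaming loop:

```python
import anyio

async def stream_response(command_id, command_sender, node_id):
    try:
        async for chunk in stream_tokens(command_id):
            yield chunk
    except anyio.get_cancelled_exc_class():
        # The client disconnected: ask the master to cancel the backing task,
        # from a shielded scope so the send itself cannot be cancelled.
        command = TaskCancelled(cancelled_command_id=command_id)
        with anyio.CancelScope(shield=True):
            await command_sender.send(
                ForwarderCommand(origin=node_id, command=command)
            )
        raise
```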
Alex Cheema
ce06eab63b Fix model discovery reporting DownloadPending for fully-downloaded models
On startup, _emit_existing_download_progress() used
downloaded_bytes_this_session to decide between DownloadPending and
DownloadOngoing. Since downloaded_bytes_this_session is always 0 on
startup (it tracks the current session only), fully-downloaded models
were incorrectly reported as DownloadPending.

Now checks actual disk state: if downloaded_bytes >= total_bytes, emit
DownloadCompleted regardless of session bytes. This fixes the UI showing
models as pending when they're already available.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
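A sketch of the corrected check; the progress field names follow the DownloadCoordinator hunk further down, and the DownloadCompleted/DownloadOngoing keyword arguments are assumed to mirror DownloadPending's:

```python
if progress.downloaded_bytes.in_bytes >= progress.total_bytes.in_bytes:
    # Everything is already on disk, so report completed even though this
    # session has downloaded zero bytes.
    status = DownloadCompleted(node_id=self.node_id, shard_metadata=progress.shard)
elif progress.downloaded_bytes.in_bytes == 0:
    status = DownloadPending(node_id=self.node_id, shard_metadata=progress.shard)
else:
    status = DownloadOngoing(node_id=self.node_id, shard_metadata=progress.shard)
```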
Alex Cheema
b59a5ade51 fix: use force=True for multiprocessing set_start_method
Prevents RuntimeError when the context has already been set,
e.g. when Terminal.app reuses a tab or the process restarts.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
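For reference, the stdlib call in question; "spawn" is shown here because it is the macOS default the surrounding commits rely on:

```python
import multiprocessing

# force=True replaces any start method already set in this interpreter,
# instead of raising "RuntimeError: context has already been set".
multiprocessing.set_start_method("spawn", force=True)
```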
Alex Cheema
010e4fe96d fix: eliminate command/reconciler interleaving race in meta-instance
Two race conditions existed in the meta-instance lifecycle:

1. CreateMetaInstance buffered MetaInstanceCreated but didn't apply it
   before awaiting ModelCard.load(). The reconciler could interleave
   during the await, leading to duplicate placements.

   Fix: apply MetaInstanceCreated eagerly via _apply_and_broadcast,
   then re-check satisfaction after the await so placement uses fresh
   state and skips if the reconciler already handled it.

2. delete_meta_instance (API handler) sent DeleteMetaInstance then
   read self.state.instances for cascade deletion. State was stale,
   so backing instances created between the send and the read were
   missed — permanently orphaning them.

   Fix: move cascade delete into the command processor's
   DeleteMetaInstance handler, where InstanceDeleted events are
   generated atomically with MetaInstanceDeleted.

Reproduced on a 4-node Mac Mini cluster: 28K anomalies in a stress test,
including 21 permanently orphaned instances. After the fix, the cascade
delete and placement are race-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
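A sketch of the reordered CreateMetaInstance handler described in point 1; `meta_instance_is_satisfied` and `place_events` are illustrative stand-ins and the method signatures are assumed:

```python
async def handle_create_meta_instance(self, cmd):
    # Apply eagerly so the reconciler sees the meta-instance if it ticks
    # while we are awaiting the model card load.
    self._apply_and_broadcast([MetaInstanceCreated(meta_instance=cmd.meta_instance)])
    card = await ModelCard.load(cmd.meta_instance.model_id)  # reconciler may interleave here
    # Re-check against fresh state: skip placement if the reconciler already
    # created a backing instance during the await.
    if meta_instance_is_satisfied(self.state, cmd.meta_instance.id):
        return
    self._apply_and_broadcast(place_events(self.state, cmd.meta_instance, card))
```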
Alex Cheema
2301dde064 test: add 25 edge-case tests for MetaInstance lifecycle
Cover retry logic, error handling, backward compatibility,
concurrent scenarios, placement error tracking, and serialization.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
5b1ec42632 Fix JACCL SideChannel bytes serialization for JSON round-trip
TaggedModel's wrap validator converts JSON→Python validation context,
which breaks strict-mode bytes deserialization from JSON strings.
Use Base64Bytes type to encode/decode bytes as base64 strings in JSON.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
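A self-contained illustration of the Base64Bytes round-trip (the model and field names here are made up, not the actual SideChannel types):

```python
from pydantic import Base64Bytes, BaseModel

class SideChannelFrame(BaseModel):
    # Validation decodes a base64 JSON string into bytes; serialization
    # encodes back to base64, which sidesteps the strict-mode
    # bytes-from-JSON-string problem described above.
    payload: Base64Bytes

frame = SideChannelFrame.model_validate_json('{"payload": "AAEC/w=="}')
assert frame.payload == b"\x00\x01\x02\xff"
assert "AAEC" in frame.model_dump_json()
```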
Alex Cheema
b7d64c94b4 Use named pipes (FIFOs) for JACCL SideChannel relay
Anonymous pipes from os.pipe() don't survive multiprocessing.Process
spawn on macOS (default since Python 3.8). The FD numbers are passed
but the actual file descriptors don't exist in the child process,
causing EBADF errors.

Switch to named pipes (FIFOs) which the child opens by path in the
spawned process, getting valid FDs for the C++ SideChannel.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
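A self-contained sketch of the FIFO approach: the child opens the pipe by path after spawn instead of relying on inherited FD numbers (paths and names here are illustrative):

```python
import multiprocessing
import os
import tempfile

def child(path: str) -> None:
    # The spawned process gets a valid FD by opening the FIFO by path; an
    # inherited os.pipe() FD number would be dangling here (EBADF).
    fd = os.open(path, os.O_RDONLY)
    try:
        print("child read:", os.read(fd, 5))
    finally:
        os.close(fd)

if __name__ == "__main__":
    multiprocessing.set_start_method("spawn", force=True)
    fifo_path = os.path.join(tempfile.mkdtemp(), "side_channel.fifo")
    os.mkfifo(fifo_path)
    p = multiprocessing.Process(target=child, args=(fifo_path,))
    p.start()
    wfd = os.open(fifo_path, os.O_WRONLY)  # blocks until the child opens the read end
    os.write(wfd, b"hello")
    os.close(wfd)
    p.join()
```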
Alex Cheema
84ecd01a98 Add pipe-based JACCL SideChannel relay via exo control plane
Replace fragile TCP SideChannel with anonymous pipes relayed through
exo's event-sourced control plane. RunnerSupervisor creates pipe pairs
for MlxJaccl instances and relays all_gather rounds via JacclSideChannelData/
JacclSideChannelGathered events through the master, eliminating errno=57
crashes from Thunderbolt RDMA driver instability.

Also includes dashboard RDMA warning improvements and instance retry fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
0f28c7280d Preserve last_failure_error across instance recreation, fix RDMA banner wording
- apply_instance_created no longer clears last_failure_error so the
  error context persists while the new instance starts up
- Dashboard retryError shows the error without (N/3) prefix when
  consecutiveFailures is 0 (instance was recreated)
- Jaccl warning tooltip now says "experimental RDMA driver in macOS"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
6f22f38df5 chore: remove temporary screenshot files
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
d97bce053e temp: add jaccl warning screenshots for PR comment
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
6a12efb669 dashboard: show warning banner for [jaccl] RDMA driver errors
Detect errors containing "[jaccl]" in MetaInstance failure errors and
display a red dismissible alert banner. The tooltip explains this is a
macOS RDMA driver issue and that the affected machine needs to be
restarted. Alert re-appears if a new error arrives after dismissal.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
388e28eace Retry runners within the same Instance instead of recreating
When runners fail for a MetaInstance-backed Instance, retry up to 3
times by restarting runners within the same Instance rather than
deleting and recreating it each time. After 3 failures, delete the
Instance so MetaInstanceReconciler can create a fresh one.

- Add InstanceRetrying event that removes runners from state (signaling
  workers to restart) and increments consecutive_failures on MetaInstance
- InstanceHealthReconciler emits InstanceRetrying when under retry limit,
  InstanceDeleted when exhausted or no MetaInstance
- Worker _kill_runner detects retry signal (runner deleted from state +
  terminal supervisor) and cleans up for _create_runner to recreate
- Worker _create_runner guards against oscillation by blocking creation
  while any peer runner has explicit terminal status
- InstanceCreated resets consecutive_failures for fresh starts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
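A sketch of the retry-or-delete decision this commit describes (event constructors and field names are assumptions):

```python
MAX_RETRIES = 3

def on_runner_failure(instance, meta_instance):
    if meta_instance is None:
        # Not backed by a MetaInstance: nothing will recreate it, just delete.
        return InstanceDeleted(instance_id=instance.id)
    if meta_instance.consecutive_failures < MAX_RETRIES:
        # Remove runners from state so workers restart them in place.
        return InstanceRetrying(instance_id=instance.id)
    # Retries exhausted: delete so MetaInstanceReconciler places a fresh instance.
    return InstanceDeleted(instance_id=instance.id)
```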
Alex Cheema
675041748a Remove timestamp-based retry cooldown
Remove last_failure_at field and RETRY_COOLDOWN_SECONDS logic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
1a69e14e79 Consolidate failure state onto MetaInstance, add 5s retry cooldown
Move placement_error, consecutive_failures, last_failure_error, and
last_failure_at directly onto the MetaInstance model instead of keeping
them as separate State mappings (meta_instance_errors, InstanceFailureInfo,
meta_instance_failure_info). Adds a 5-second cooldown between retry attempts
to prevent rapid instance churn when runners fail instantly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
0a8446cbcc Show retry attempt count with error message, e.g. (2/3)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
3a998410e5 Include node friendly names in runner error messages
Each error in the combined message is now prefixed with the node's friendly
name (e.g. "MacBook Pro: OOM; Mac Studio: connection reset") so the root
cause node is easily identifiable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
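A minimal sketch of the combined-message format matching the example in the message (the helper name and argument shapes are illustrative):

```python
def combine_runner_errors(
    errors_by_node: dict[str, str], friendly_names: dict[str, str]
) -> str:
    # e.g. "MacBook Pro: OOM; Mac Studio: connection reset"
    return "; ".join(
        f"{friendly_names.get(node_id, node_id)}: {message}"
        for node_id, message in errors_by_node.items()
    )
```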
Alex Cheema
4af9b32f19 Remove permanent retry blocking, allow continuous retry batches
The dashboard % 3 logic already handles displaying retry progress in batches
(RETRYING 1/3, 2/3, 3/3, then PLACING with error, repeat). No need to
permanently block placement after 3 failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
972cd53521 Show retry count in exceeded retry limit message (3/3)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
ab5c86d478 Collect all runner error messages instead of just the last one
When multiple runners fail, concatenate all error messages with "; " so the
real error isn't hidden by generic side-effect failures from other runners.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
4cff623b84 Stop infinite retries after 3 failures, show errors persistently in dashboard
MetaInstanceReconciler now checks failure count before placement — after 3
consecutive failures it emits MetaInstancePlacementFailed instead of retrying
forever. Dashboard shows "Retrying after error: <msg>" in orange throughout
the retry cycle, not just during the brief window with no backing instance.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
7c90c43509 Add instance retry logic with max 3 retries and failure tracking
- Extend InstanceDeleted with failure_error field for runner crash info
- Add InstanceFailureInfo model tracking consecutive failures per MetaInstance
- InstanceHealthReconciler now detects runner failures (all terminal with
  at least one RunnerFailed) in addition to connection failures
- apply_instance_deleted increments failure counter for meta-bound instances
- Dashboard shows RETRYING (N/3) status with error messages, and
  "Instance re-created due to failure" after 3 consecutive failures
- Extract and display RunnerFailed error messages in instance status

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
fba47b802d Fix MetaInstance.node_ids frozenset failing JSON deserialization
frozenset serializes to a JSON array but cannot be deserialized back
in strict mode through the TaggedModel wrap validator (list → frozenset
coercion is rejected). Changed to list[NodeId] since the model is
already frozen/immutable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
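A self-contained repro of the strict-mode coercion failure described here; the model and field names are illustrative, and in the repo the python-mode re-validation comes from the TaggedModel wrap validator:

```python
from pydantic import BaseModel, ConfigDict, ValidationError

class StrictFrozenset(BaseModel):
    model_config = ConfigDict(strict=True, frozen=True)
    node_ids: frozenset[str]

class StrictList(BaseModel):
    model_config = ConfigDict(strict=True, frozen=True)
    node_ids: list[str]

try:
    # A JSON array arrives as a Python list; strict mode refuses to coerce it.
    StrictFrozenset.model_validate({"node_ids": ["node-a", "node-b"]})
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # strict frozenset error

# list[str] accepts it, and the frozen model keeps it effectively immutable.
StrictList.model_validate({"node_ids": ["node-a", "node-b"]})
```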
Alex Cheema
8dd19c9817 Send node_ids from placement preview when launching instances
The dashboard now extracts node IDs from the selected preview's
memory_delta_by_node, ensuring the backend places on exactly the
nodes the user was shown. Also reverts incorrect RDMA min_nodes >= 2
enforcement since single-node RDMA is valid.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
22026fd31f Enforce min_nodes >= 2 for RDMA (MlxJaccl) instances
RDMA requires at least 2 nodes — a single-node RDMA instance is
nonsensical. Enforce this in both the dashboard (when building the
launch request) and the backend placement (when filtering cycles).
Previously, selecting RDMA would still place on 1 node because
min_nodes defaulted to 1 and the placement silently switched to Ring.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
d7066e7531 Ensure min_nodes >= node filter size when launching
When user selects specific nodes via the filter, min_nodes should be at
least the number of filtered nodes to prevent placement from picking a
smaller cycle.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
8840dfad1d Send node_ids from dashboard, error on RDMA when unavailable
Dashboard was not including the user's node filter in the POST to
/meta_instance, so placement ignored which nodes the user selected.
Also, placement silently fell back to Ring when RDMA was requested but
no RDMA-connected cycles were available — now raises an error that
surfaces via MetaInstancePlacementFailed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
e29c2ba170 Fix use_default validator silently ignoring sharding/instance_meta
The mode="plain" validator bypassed Pydantic's string-to-enum coercion,
so JSON strings like "Tensor" and "MlxJaccl" from the dashboard failed
the isinstance check and silently fell back to Pipeline/MlxRing defaults.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
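A minimal reproduction of the mode="plain" pitfall, assuming the validator looked roughly like this (class and field names are illustrative, not the repo's exact models):

```python
from enum import Enum
from pydantic import BaseModel, field_validator

class Sharding(str, Enum):
    Pipeline = "Pipeline"
    Tensor = "Tensor"

class LaunchRequest(BaseModel):
    sharding: Sharding = Sharding.Pipeline

    # mode="plain" replaces Pydantic's own validation, so the JSON string
    # "Tensor" is never coerced to Sharding.Tensor before this function runs.
    @field_validator("sharding", mode="plain")
    @classmethod
    def use_default(cls, value: object) -> Sharding:
        return value if isinstance(value, Sharding) else Sharding.Pipeline

print(LaunchRequest.model_validate({"sharding": "Tensor"}).sharding)
# Sharding.Pipeline: the user's selection was silently dropped
```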
Alex Cheema
d303df2848 Add placement error feedback and per-node loading status
Show why MetaInstance placement fails instead of leaving the status stuck at
"PLACING", and show per-node runner status while multi-node instances load.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
d79e1243e3 Show MetaInstance sharding/type while PLACING, fix MlxIbv references
When a MetaInstance has no backing instance yet, derive the strategy
display from the MetaInstance's own sharding and instanceMeta fields
rather than showing "Unknown (Unknown)".

Also clean up all stale MlxIbv references across the dashboard —
the backend enum is MlxJaccl.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
00e7f7942b Extract reconciler into ProcessManager protocol, fix RDMA instance type
- Replace inline _plan() with ProcessManager loop (_reconcile), tick
  every 1s instead of 10s — safe because all PMs are idempotent
- Fix dashboard sending "MlxIbv" instead of "MlxJaccl" for RDMA
  instance type, which silently fell back to MlxRing default
- Remove all stale MlxIbv references from dashboard

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
74d0577b2f Extract reconciler into ProcessManager protocol
Replace inline _plan() steps with a list of ProcessManagers, each
implementing async reconcile(State) -> Sequence[Event]. Tick every
1s instead of 10s — safe because all PMs are idempotent against state.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
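The protocol in question, sketched from the description in these two commits; State and Event are the repo's types, referenced here only as forward declarations:

```python
from collections.abc import Sequence
from typing import Protocol

class ProcessManager(Protocol):
    # Each PM inspects current state and returns the events needed to move it
    # toward the desired state; ticking every 1s is safe because a reconcile
    # that finds nothing to do simply returns no events (idempotent).
    async def reconcile(self, state: "State") -> Sequence["Event"]: ...
```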
Alex Cheema
ad8db70915 Simplify MetaInstance binding: put meta_instance_id on Instance
The separate MetaInstanceBound event + meta_instance_backing map
introduced two bugs: stale exclusion sets in the reconciler loop and
a delete ordering race. Embedding meta_instance_id directly on
BaseInstance eliminates the binding mechanism entirely — when an
instance is created for a MetaInstance it carries the ID, when
deleted the binding is gone. No separate map, no cleanup, no races.

Also fixes delete_meta_instance to cascade-delete backing instances.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:10 -08:00
Alex Cheema
eebd087977 Add explicit MetaInstance binding, slim MetaInstance to use ModelId
- Add MetaInstanceBound event and meta_instance_backing State field
  for explicit MetaInstance → Instance binding (prevents ambiguous
  linking when two MetaInstances have identical constraints)
- Replace model_card: ModelCard with model_id: ModelId on MetaInstance
  (load ModelCard on-demand at placement time)
- Add MetaInstance API endpoints (POST /meta_instance, DELETE)
- Update dashboard to use MetaInstances as primary primitive with
  unified display items merging MetaInstances and orphan instances
- Dashboard launches via MetaInstance instead of direct Instance creation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:09 -08:00
Alex Cheema
35c992d3b1 Add MetaInstance declarative layer with connection health checking
Introduces MetaInstance as a declarative constraint ensuring an instance
matching given parameters (model, sharding, min_nodes) always exists.
The master's reconciliation loop continuously checks for unsatisfied
meta-instances and attempts placement. Connection health checking
verifies that specific IPs (MlxRing) and RDMA interfaces (MlxJaccl)
stored on instances still exist as topology edges, enabling automatic
recovery when cables are swapped or interfaces change.

Also eliminates the master's loopback event path, unifying all event
emission through _apply_and_broadcast for simpler control flow.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:09:09 -08:00
rltakashige
f2be929211 Leo/address rdma gpu locks 2 (#1515)
Same as #1489. Had to revert and redo thanks to Claude.

---------

Co-authored-by: Jake Hillion <jake@hillion.co.uk>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:00:52 -08:00
rltakashige
83af8c63fa Revert "Use custom fork that resolves GPU locks" (#1502)
Reverts exo-explore/exo#1489

Goddammit Claude...
2026-02-17 18:18:54 +00:00
Evan Quiney
eccc6298d1 Revert "Add MetaInstance declarative layer (#1447)"
This reverts commit a962a28afc.
2026-02-17 18:11:47 +00:00
13 changed files with 58 additions and 52 deletions

View File

@@ -324,7 +324,7 @@ class DownloadCoordinator:
shard_metadata=progress.shard,
total_bytes=progress.total_bytes,
)
elif progress.downloaded_bytes.in_bytes == 0:
elif progress.downloaded_bytes_this_session.in_bytes == 0:
status = DownloadPending(
node_id=self.node_id,
shard_metadata=progress.shard,

View File

@@ -603,10 +603,10 @@ class API:
break
except anyio.get_cancelled_exc_class():
cancel_command = TaskCancelled(cancelled_command_id=command_id)
command = TaskCancelled(cancelled_command_id=command_id)
with anyio.CancelScope(shield=True):
await self.command_sender.send(
ForwarderCommand(origin=self.node_id, command=cancel_command)
ForwarderCommand(origin=self.node_id, command=command)
)
raise
finally:
@@ -946,10 +946,10 @@ class API:
del image_metadata[key]
except anyio.get_cancelled_exc_class():
cancel_command = TaskCancelled(cancelled_command_id=command_id)
command = TaskCancelled(cancelled_command_id=command_id)
with anyio.CancelScope(shield=True):
await self.command_sender.send(
ForwarderCommand(origin=self.node_id, command=cancel_command)
ForwarderCommand(origin=self.node_id, command=command)
)
raise
finally:
@@ -1032,10 +1032,10 @@ class API:
return (images, stats if capture_stats else None)
except anyio.get_cancelled_exc_class():
cancel_command = TaskCancelled(cancelled_command_id=command_id)
command = TaskCancelled(cancelled_command_id=command_id)
with anyio.CancelScope(shield=True):
await self.command_sender.send(
ForwarderCommand(origin=self.node_id, command=cancel_command)
ForwarderCommand(origin=self.node_id, command=command)
)
raise
finally:

View File

@@ -417,19 +417,16 @@ class Master:
)
case TaskCancelled():
if (
command.cancelled_command_id
in self.command_task_mapping
):
task_id := self.command_task_mapping.get(
command.cancelled_command_id
)
) is not None:
generated_events.append(
TaskDeleted(
task_id=self.command_task_mapping[
command.cancelled_command_id
]
TaskStatusUpdated(
task_status=TaskStatus.Cancelled,
task_id=task_id,
)
)
del self.command_task_mapping[
command.cancelled_command_id
]
case TaskFinished():
generated_events.append(
TaskDeleted(
@@ -438,10 +435,9 @@ class Master:
]
)
)
if command.finished_command_id in self.command_task_mapping:
del self.command_task_mapping[
command.finished_command_id
]
self.command_task_mapping.pop(
command.finished_command_id, None
)
case RequestEventLog():
# We should just be able to send everything, since other buffers will ignore old messages
# rate limit to 1000 at a time

View File

@@ -200,7 +200,7 @@ def try_place_for_meta_instance(
current_instances: Mapping[InstanceId, Instance],
node_memory: Mapping[NodeId, MemoryUsage],
node_network: Mapping[NodeId, NodeNetworkInfo],
tasks: Mapping[TaskId, Task],
tasks: Mapping[TaskId, Task] | None = None,
) -> PlacementResult:
"""Try to place an instance satisfying the meta-instance constraints.
@@ -233,7 +233,7 @@ def try_place_for_meta_instance(
)
return PlacementResult(
events=list(
get_transition_events(current_instances, target_instances, tasks)
get_transition_events(current_instances, target_instances, tasks or {})
),
error=None,
)

View File

@@ -105,6 +105,7 @@ Command = (
| TaskCancelled
| CreateMetaInstance
| DeleteMetaInstance
| TaskCancelled
| TaskFinished
| SendInputChunk
)

View File

@@ -61,7 +61,7 @@ class TextGeneration(BaseTask): # emitted by Master
error_message: str | None = Field(default=None)
class CancelTask(BaseTask): # emitted by Worker when master cancels a task
class CancelTask(BaseTask):
cancelled_task_id: TaskId
runner_id: RunnerId

View File

@@ -125,7 +125,9 @@ class MpSender[T]:
self._state.buffer.put(item, block=True)
async def send_async(self, item: T) -> None:
await to_thread.run_sync(self.send, item, limiter=CapacityLimiter(1))
await to_thread.run_sync(
self.send, item, limiter=CapacityLimiter(1), abandon_on_cancel=True
)
def close(self) -> None:
if not self._state.closed.is_set():

View File

@@ -34,6 +34,7 @@ from exo.shared.types.events import (
from exo.shared.types.multiaddr import Multiaddr
from exo.shared.types.state import State
from exo.shared.types.tasks import (
CancelTask,
CreateRunner,
DownloadModel,
ImageEdits,
@@ -234,15 +235,22 @@ class Worker:
)
)
case Shutdown(runner_id=runner_id):
runner = self.runners.pop(runner_id)
try:
with fail_after(3):
await self.runners.pop(runner_id).start_task(task)
await runner.start_task(task)
except TimeoutError:
await self.event_sender.send(
TaskStatusUpdated(
task_id=task.task_id, task_status=TaskStatus.TimedOut
)
)
finally:
runner.shutdown()
case CancelTask(
cancelled_task_id=cancelled_task_id, runner_id=runner_id
):
await self.runners[runner_id].cancel_task(cancelled_task_id)
case ImageEdits() if task.task_params.total_input_chunks > 0:
# Assemble image from chunks and inject into task
cmd_id = task.command_id
@@ -280,18 +288,18 @@ class Worker:
del self.input_chunk_buffer[cmd_id]
if cmd_id in self.input_chunk_counts:
del self.input_chunk_counts[cmd_id]
await self.runners[self._task_to_runner_id(task)].start_task(
modified_task
)
await self._start_runner_task(modified_task)
case task:
await self.runners[self._task_to_runner_id(task)].start_task(task)
await self._start_runner_task(task)
def shutdown(self):
self._tg.cancel_scope.cancel()
def _task_to_runner_id(self, task: Task):
instance = self.state.instances[task.instance_id]
return instance.shard_assignments.node_to_runner[self.node_id]
async def _start_runner_task(self, task: Task):
if (instance := self.state.instances.get(task.instance_id)) is not None:
await self.runners[
instance.shard_assignments.node_to_runner[self.node_id]
].start_task(task)
async def _nack_request(self, since_idx: int) -> None:
# We request all events after (and including) the missing index.

View File

@@ -328,8 +328,7 @@ def _pending_tasks(
def _cancel_tasks(
runners: Mapping[RunnerId, RunnerSupervisor],
tasks: Mapping[TaskId, Task],
) -> CancelTask | None:
"""Find a cancelled task that hasn't been sent to the runner yet."""
) -> Task | None:
for task in tasks.values():
if task.task_status != TaskStatus.Cancelled:
continue

View File

@@ -67,9 +67,7 @@ def entrypoint(
try:
event_sender.close()
task_receiver.close()
cancel_receiver.close()
finally:
event_sender.join()
task_receiver.join()
cancel_receiver.join()
logger.info("bye from the runner")

View File

@@ -243,7 +243,7 @@ def main(
assert inference_model
assert tokenizer
t = time.perf_counter()
t = time.monotonic()
toks = warmup_inference(
model=inference_model,
tokenizer=tokenizer,
@@ -251,7 +251,7 @@ def main(
)
logger.info(f"warmed up by generating {toks} tokens")
check_for_cancel_every = min(
math.ceil(toks / max(time.perf_counter() - t, 0.001)), 100
math.ceil(toks / min(time.monotonic() - t, 0.001)), 100
)
if group is not None:
check_for_cancel_every = int(

View File

@@ -72,8 +72,8 @@ class RunnerSupervisor:
initialize_timeout: float
_ev_recv: MpReceiver[Event]
_task_sender: MpSender[Task]
_cancel_sender: MpSender[TaskId]
_event_sender: Sender[Event]
_cancel_sender: MpSender[TaskId]
_pipe_read_fd: int | None = None # Python reads runner's pipe output
_pipe_write_fd: int | None = None # Python writes gathered data to runner
_child_pipe_fds: tuple[int, int] | None = None # fds to close after fork
@@ -188,6 +188,8 @@ class RunnerSupervisor:
self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
self._cancel_sender.close()
self._event_sender.close()
self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
self._cancel_sender.close()
self._close_pipe_fds()
self.runner_process.join(1)
if not self.runner_process.is_alive():

View File

@@ -1,7 +1,9 @@
# Check tasks are complete before runner is ever ready.
import unittest.mock
from collections.abc import Iterable
from typing import Callable
import mlx.core as mx
import pytest
import exo.worker.runner.runner as mlx_runner
@@ -115,12 +117,6 @@ def patch_out_mlx(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setattr(mlx_runner, "warmup_inference", make_nothin(1))
monkeypatch.setattr(mlx_runner, "_check_for_debug_prompts", nothin)
monkeypatch.setattr(mlx_runner, "mx_any", make_nothin(False))
# Mock mx.distributed.all_gather so MockGroup doesn't hit real MLX C++ bindings.
def _mock_all_gather(x: object, **_kw: object) -> object:
return x
monkeypatch.setattr(mlx_runner.mx.distributed, "all_gather", _mock_all_gather)
# Mock apply_chat_template since we're using a fake tokenizer (integer 1).
# Returns a prompt without thinking tag so detect_thinking_prompt_suffix returns None.
monkeypatch.setattr(mlx_runner, "apply_chat_template", make_nothin("test prompt"))
@@ -185,12 +181,16 @@ def _run(tasks: Iterable[Task]):
cancel_receiver.close = nothin
cancel_receiver.join = nothin
mlx_runner.main(
bound_instance,
event_sender, # pyright: ignore[reportArgumentType]
task_receiver,
cancel_receiver,
)
with unittest.mock.patch(
"exo.worker.runner.runner.mx.distributed.all_gather",
make_nothin(mx.array([1])),
):
mlx_runner.main(
bound_instance,
event_sender, # pyright: ignore[reportArgumentType]
task_receiver,
cancel_receiver,
)
return event_sender.events