api cancellation

Closing the HTTP request to the API now: - sends a cancellation from the API - writes that cancellation in the master - the worker plans off the cancellation - the runner observes that cancellation after every generation step (+1 communication per token) - cancellation happens synchronously to prevent GPU locks
2026-02-05 03:33:30 -05:00 · 2026-02-04 16:45:50 +00:00
38 changed files with 680 additions and 1092 deletions
--- a/MISSED_THINGS.md
+++ b/MISSED_THINGS.md
@@ -5,21 +5,21 @@
 [X] Fetching download status of all models on start
 [X] Deduplication of tasks in plan_step.
 [X] resolve_allow_patterns should just be wildcard now.
-[] no mx_barrier in genreate.py mlx_generate at the end.
+[X] no mx_barrier in genreate.py mlx_generate at the end.
 [] cache assertion not needed in auto_parallel.py PipelineLastLayer.
-[] GPTOSS support dropped in auto_parallel.py.
-[] sharding changed "all-to-sharded" became _all_to_sharded in auto_parallel.py.
-[] same as above with "sharded-to-all" became _sharded_to_all in auto_parallel.py.
-[] Dropped support for Ministral3Model, DeepseekV32Model, Glm4MoeModel, Qwen3NextModel, GptOssMode in auto_parallel.py.
+[X] GPTOSS support dropped in auto_parallel.py.
+[X] sharding changed "all-to-sharded" became _all_to_sharded in auto_parallel.py.
+[X] same as above with "sharded-to-all" became _sharded_to_all in auto_parallel.py.
+[X] Dropped support for Ministral3Model, DeepseekV32Model, Glm4MoeModel, Qwen3NextModel, GptOssMode in auto_parallel.py.
 [] Dropped prefill/decode code in auto_parallel.py and utils_mlx.py.
 [X] KV_CACHE_BITS should be None to disable quantized KV cache.
-[] Dropped _set_nofile_limit in utils_mlx.py.
-[] We have group optional in load_mlx_items in utils_mlx.py.
-[] Dropped add_missing_chat_templates for GptOss in load_mlx_items in utils_mlx.py.
-[] Dropped model.make_cache in make_kv_cache in utils_mlx.py.
+[X] Dropped _set_nofile_limit in utils_mlx.py.
+[X] We have group optional in load_mlx_items in utils_mlx.py.
+[X] Dropped add_missing_chat_templates for GptOss in load_mlx_items in utils_mlx.py.
+[X] Dropped model.make_cache in make_kv_cache in utils_mlx.py.
 [X] We put cache limit back in utils_mlx.py.
-[] topology.py remove_node removes the connections after checking if node is is in self._node_id_to_rx_id_map. on beta_1 it checks after, so would remove stale connections I guess?
-[] Missing Glm 4.7 model cards (this isn't ready yet but should be picked up, probably create an issue... the blocker is transforemrs version doesn't support the tokenizer for Glm 4.7. rc-1 does but we can't upgrade as it breaks other things.)
+[X] topology.py remove_node removes the connections after checking if node is is in self._node_id_to_rx_id_map. on beta_1 it checks after, so would remove stale connections I guess?
+[X] Missing Glm 4.7 model cards (this isn't ready yet but should be picked up, probably create an issue... the blocker is transforemrs version doesn't support the tokenizer for Glm 4.7. rc-1 does but we can't upgrade as it breaks other things.)
 [] try-except in _command_processor only excepts ValueError. This was silently failing leading to un-debuggable errors (we had a KeyError that was happening ). Changed this to catch Exception instead of ValueError. See exo-v2 89ae38405e0052e3c22405daf094b065878aa873 and fb99fea69b5a39017efc90c5dad0072e677455f0.
 [X] In placement.py, place_instance no longer looks at model_meta.supports_tensor and check if this tensor parallel number of nodes is supported by the model's tensor dimensions.
 [X] In placement.py, place_instanec, we no longer have the special case to exclude DeepSeek v3.1 pipeline parallel (it doesn't work).
--- a/resources/image_model_cards/exolabs--Qwen-Image-4bit.toml
+++ b/resources/image_model_cards/exolabs--Qwen-Image-4bit.toml
@@ -3,7 +3,6 @@ n_layers = 60
 hidden_size = 1
 supports_tensor = false
 tasks = ["TextToImage"]
-uses_cfg = true

 [storage_size]
 in_bytes = 26799533856
--- a/resources/image_model_cards/exolabs--Qwen-Image-8bit.toml
+++ b/resources/image_model_cards/exolabs--Qwen-Image-8bit.toml
@@ -3,7 +3,6 @@ n_layers = 60
 hidden_size = 1
 supports_tensor = false
 tasks = ["TextToImage"]
-uses_cfg = true

 [storage_size]
 in_bytes = 37014734400
--- a/resources/image_model_cards/exolabs--Qwen-Image-Edit-2509-4bit.toml
+++ b/resources/image_model_cards/exolabs--Qwen-Image-Edit-2509-4bit.toml
@@ -3,7 +3,6 @@ n_layers = 60
 hidden_size = 1
 supports_tensor = false
 tasks = ["ImageToImage"]
-uses_cfg = true

 [storage_size]
 in_bytes = 26799533856
--- a/resources/image_model_cards/exolabs--Qwen-Image-Edit-2509-8bit.toml
+++ b/resources/image_model_cards/exolabs--Qwen-Image-Edit-2509-8bit.toml
@@ -3,7 +3,6 @@ n_layers = 60
 hidden_size = 1
 supports_tensor = false
 tasks = ["ImageToImage"]
-uses_cfg = true

 [storage_size]
 in_bytes = 37014734400
--- a/resources/image_model_cards/exolabs--Qwen-Image-Edit-2509.toml
+++ b/resources/image_model_cards/exolabs--Qwen-Image-Edit-2509.toml
@@ -3,7 +3,6 @@ n_layers = 60
 hidden_size = 1
 supports_tensor = false
 tasks = ["ImageToImage"]
-uses_cfg = true

 [storage_size]
 in_bytes = 57445135488
--- a/resources/image_model_cards/exolabs--Qwen-Image.toml
+++ b/resources/image_model_cards/exolabs--Qwen-Image.toml
@@ -3,7 +3,6 @@ n_layers = 60
 hidden_size = 1
 supports_tensor = false
 tasks = ["TextToImage"]
-uses_cfg = true

 [storage_size]
 in_bytes = 57445135488
--- a/src/exo/download/coordinator.py
+++ b/src/exo/download/coordinator.py
@@ -53,10 +53,11 @@ class DownloadCoordinator:
    # Internal event channel for forwarding (initialized in __post_init__)
    event_sender: Sender[Event] = field(init=False)
    event_receiver: Receiver[Event] = field(init=False)
-    _tg: TaskGroup = field(init=False, default_factory=anyio.create_task_group)
+    _tg: TaskGroup = field(init=False)

    def __post_init__(self) -> None:
        self.event_sender, self.event_receiver = channel[Event]()
+        self._tg = anyio.create_task_group()

    async def run(self) -> None:
        logger.info("Starting DownloadCoordinator")
--- a/src/exo/main.py
+++ b/src/exo/main.py
@@ -135,22 +135,19 @@ class Node:
        )

    async def run(self):
-        with anyio.open_signal_receiver(signal.SIGINT, signal.SIGTERM) as signals:
-            async with self._tg as tg:
-                tg.start_soon(self.router.run)
-                tg.start_soon(self.election.run)
-                if self.download_coordinator:
-                    tg.start_soon(self.download_coordinator.run)
-                if self.worker:
-                    tg.start_soon(self.worker.run)
-                if self.master:
-                    tg.start_soon(self.master.run)
-                if self.api:
-                    tg.start_soon(self.api.run)
-                tg.start_soon(self._elect_loop)
-                async for sig in signals:
-                    self.shutdown()
-
+        async with self._tg as tg:
+            signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
+            tg.start_soon(self.router.run)
+            tg.start_soon(self.election.run)
+            if self.download_coordinator:
+                tg.start_soon(self.download_coordinator.run)
+            if self.worker:
+                tg.start_soon(self.worker.run)
+            if self.master:
+                tg.start_soon(self.master.run)
+            if self.api:
+                tg.start_soon(self.api.run)
+            tg.start_soon(self._elect_loop)

    def shutdown(self):
        # if this is our second call to shutdown, just sys.exit
--- a/src/exo/master/adapters/chat_completions.py
+++ b/src/exo/master/adapters/chat_completions.py
@@ -158,7 +158,7 @@ async def generate_chat_stream(
 async def collect_chat_response(
    command_id: CommandId,
    chunk_stream: AsyncGenerator[ErrorChunk | ToolCallChunk | TokenChunk, None],
-) -> ChatCompletionResponse:
+) -> AsyncGenerator[str]:
    """Collect all token chunks and return a single ChatCompletionResponse."""
    text_parts: list[str] = []
    tool_calls: list[ToolCall] = []
@@ -196,7 +196,7 @@ async def collect_chat_response(
    combined_text = "".join(text_parts)
    assert model is not None

-    return ChatCompletionResponse(
+    yield ChatCompletionResponse(
        id=command_id,
        created=int(time.time()),
        model=model,
@@ -211,4 +211,5 @@ async def collect_chat_response(
                finish_reason=finish_reason,
            )
        ],
-    )
+    ).model_dump_json()
+    return
--- a/src/exo/master/api.py
+++ b/src/exo/master/api.py
@@ -1,7 +1,6 @@
 import base64
 import contextlib
 import json
-import random
 import time
 from collections.abc import AsyncGenerator, Awaitable, Callable
 from datetime import datetime, timezone
@@ -123,6 +122,7 @@ from exo.shared.types.commands import (
    PlaceInstance,
    SendInputChunk,
    StartDownload,
+    TaskCancelled,
    TaskFinished,
    TextGeneration,
 )
@@ -151,15 +151,6 @@ def _format_to_content_type(image_format: Literal["png", "jpeg", "webp"] | None)
    return f"image/{image_format or 'png'}"


-def _ensure_seed(params: AdvancedImageParams | None) -> AdvancedImageParams:
-    """Ensure advanced params has a seed set for distributed consistency."""
-    if params is None:
-        return AdvancedImageParams(seed=random.randint(0, 2**32 - 1))
-    if params.seed is None:
-        return params.model_copy(update={"seed": random.randint(0, 2**32 - 1)})
-    return params
-
-
 class API:
    def __init__(
        self,
@@ -529,16 +520,14 @@ class API:
                        break

        except anyio.get_cancelled_exc_class():
-            # TODO: TaskCancelled
-            """
-            self.command_sender.send_nowait(
-                ForwarderCommand(origin=self.node_id, command=command)
-            )
-            """
+            command = TaskCancelled(cancelled_command_id=command_id)
+            with anyio.CancelScope(shield=True):
+                await self.command_sender.send(
+                    ForwarderCommand(origin=self.node_id, command=command)
+                )
            raise
        finally:
-            command = TaskFinished(finished_command_id=command_id)
-            await self._send(command)
+            await self._send(TaskFinished(finished_command_id=command_id))
            if command_id in self._text_generation_queues:
                del self._text_generation_queues[command_id]

@@ -628,11 +617,14 @@ class API:
                ),
                media_type="text/event-stream",
            )
-
-        return await collect_chat_response(
-            command.command_id,
-            self._token_chunk_stream(command.command_id),
-        )
+        else:
+            return StreamingResponse(
+                collect_chat_response(
+                    command.command_id,
+                    self._token_chunk_stream(command.command_id),
+                ),
+                media_type="application/json",
+            )

    async def bench_chat_completions(
        self, payload: BenchChatCompletionRequest
@@ -648,8 +640,7 @@ class API:
        command = TextGeneration(task_params=task_params)
        await self._send(command)

-        response = await self._collect_text_generation_with_stats(command.command_id)
-        return response
+        return await self._collect_text_generation_with_stats(command.command_id)

    async def _resolve_and_validate_text_model(self, model_id: ModelId) -> ModelId:
        """Validate a text model exists and return the resolved model ID.
@@ -719,9 +710,6 @@ class API:
        with SSE-formatted events for partial and final images.
        """
        payload.model = await self._validate_image_model(ModelId(payload.model))
-        payload = payload.model_copy(
-            update={"advanced_params": _ensure_seed(payload.advanced_params)}
-        )

        command = ImageGeneration(
            task_params=payload,
@@ -851,6 +839,11 @@ class API:
                        del image_metadata[key]

        except anyio.get_cancelled_exc_class():
+            command = TaskCancelled(cancelled_command_id=command_id)
+            with anyio.CancelScope(shield=True):
+                await self.command_sender.send(
+                    ForwarderCommand(origin=self.node_id, command=command)
+                )
            raise
        finally:
            await self._send(TaskFinished(finished_command_id=command_id))
@@ -932,6 +925,11 @@ class API:

            return (images, stats if capture_stats else None)
        except anyio.get_cancelled_exc_class():
+            command = TaskCancelled(cancelled_command_id=command_id)
+            with anyio.CancelScope(shield=True):
+                await self.command_sender.send(
+                    ForwarderCommand(origin=self.node_id, command=command)
+                )
            raise
        finally:
            await self._send(TaskFinished(finished_command_id=command_id))
@@ -970,9 +968,6 @@ class API:

        payload.stream = False
        payload.partial_images = 0
-        payload = payload.model_copy(
-            update={"advanced_params": _ensure_seed(payload.advanced_params)}
-        )

        command = ImageGeneration(
            task_params=payload,
@@ -1004,7 +999,6 @@ class API:
    ) -> ImageEdits:
        """Prepare and send an image edits command with chunked image upload."""
        resolved_model = await self._validate_image_model(model)
-        advanced_params = _ensure_seed(advanced_params)

        image_content = await image.read()
        image_data = base64.b64encode(image_content).decode("utf-8")
--- a/src/exo/master/main.py
+++ b/src/exo/master/main.py
@@ -21,6 +21,7 @@ from exo.shared.types.commands import (
    PlaceInstance,
    RequestEventLog,
    SendInputChunk,
+    TaskCancelled,
    TaskFinished,
    TestCommand,
    TextGeneration,
@@ -36,6 +37,7 @@ from exo.shared.types.events import (
    NodeTimedOut,
    TaskCreated,
    TaskDeleted,
+    TaskStatusUpdated,
    TraceEventData,
    TracesCollected,
    TracesMerged,
@@ -96,18 +98,16 @@ class Master:
    async def run(self):
        logger.info("Starting Master")

-        try:
-            async with self._tg as tg:
-                tg.start_soon(self._event_processor)
-                tg.start_soon(self._command_processor)
-                tg.start_soon(self._loopback_processor)
-                tg.start_soon(self._plan)
-        finally:
-            self.global_event_sender.close()
-            self.local_event_receiver.close()
-            self.command_receiver.close()
-            self._loopback_event_sender.close()
-            self._loopback_event_receiver.close()
+        async with self._tg as tg:
+            tg.start_soon(self._event_processor)
+            tg.start_soon(self._command_processor)
+            tg.start_soon(self._loopback_processor)
+            tg.start_soon(self._plan)
+        self.global_event_sender.close()
+        self.local_event_receiver.close()
+        self.command_receiver.close()
+        self._loopback_event_sender.close()
+        self._loopback_event_receiver.close()

    async def shutdown(self):
        logger.info("Stopping Master")
@@ -278,7 +278,7 @@ class Master:
                        case DeleteInstance():
                            placement = delete_instance(command, self.state.instances)
                            transition_events = get_transition_events(
-                                self.state.instances, placement
+                                self.state.instances, placement, self.state.tasks
                            )
                            generated_events.extend(transition_events)
                        case PlaceInstance():
@@ -290,7 +290,7 @@ class Master:
                                self.state.node_network,
                            )
                            transition_events = get_transition_events(
-                                self.state.instances, placement
+                                self.state.instances, placement, self.state.tasks
                            )
                            generated_events.extend(transition_events)
                        case CreateInstance():
@@ -300,7 +300,7 @@ class Master:
                                self.state.instances,
                            )
                            transition_events = get_transition_events(
-                                self.state.instances, placement
+                                self.state.instances, placement, self.state.tasks
                            )
                            generated_events.extend(transition_events)
                        case SendInputChunk(chunk=chunk):
@@ -310,6 +310,18 @@ class Master:
                                    chunk=chunk,
                                )
                            )
+                        case TaskCancelled():
+                            if (
+                                task_id := self.command_task_mapping.get(
+                                    command.cancelled_command_id
+                                )
+                            ) is not None:
+                                generated_events.append(
+                                    TaskStatusUpdated(
+                                        task_status=TaskStatus.Cancelled,
+                                        task_id=task_id,
+                                    )
+                                )
                        case TaskFinished():
                            generated_events.append(
                                TaskDeleted(
@@ -318,10 +330,9 @@ class Master:
                                    ]
                                )
                            )
-                            if command.finished_command_id in self.command_task_mapping:
-                                del self.command_task_mapping[
-                                    command.finished_command_id
-                                ]
+                            self.command_task_mapping.pop(
+                                command.finished_command_id, None
+                            )
                        case RequestEventLog():
                            # We should just be able to send everything, since other buffers will ignore old messages
                            for i in range(command.since_idx, len(self._event_log)):
--- a/src/exo/master/placement.py
+++ b/src/exo/master/placement.py
@@ -20,9 +20,15 @@ from exo.shared.types.commands import (
    PlaceInstance,
 )
 from exo.shared.types.common import NodeId
-from exo.shared.types.events import Event, InstanceCreated, InstanceDeleted
+from exo.shared.types.events import (
+    Event,
+    InstanceCreated,
+    InstanceDeleted,
+    TaskStatusUpdated,
+)
 from exo.shared.types.memory import Memory
 from exo.shared.types.profiling import MemoryUsage, NodeNetworkInfo
+from exo.shared.types.tasks import Task, TaskId, TaskStatus
 from exo.shared.types.worker.instances import (
    Instance,
    InstanceId,
@@ -180,6 +186,7 @@ def delete_instance(
 def get_transition_events(
    current_instances: Mapping[InstanceId, Instance],
    target_instances: Mapping[InstanceId, Instance],
+    tasks: Mapping[TaskId, Task],
 ) -> Sequence[Event]:
    events: list[Event] = []

@@ -195,6 +202,18 @@ def get_transition_events(
    # find instances to delete
    for instance_id in current_instances:
        if instance_id not in target_instances:
+            for task in tasks.values():
+                if task.instance_id == instance_id and task.task_status in [
+                    TaskStatus.Pending,
+                    TaskStatus.Running,
+                ]:
+                    events.append(
+                        TaskStatusUpdated(
+                            task_status=TaskStatus.Cancelled,
+                            task_id=task.task_id,
+                        )
+                    )
+
            events.append(
                InstanceDeleted(
                    instance_id=instance_id,
--- a/src/exo/master/placement_utils.py
+++ b/src/exo/master/placement_utils.py
@@ -10,7 +10,6 @@ from exo.shared.types.profiling import MemoryUsage, NodeNetworkInfo
 from exo.shared.types.topology import Cycle, RDMAConnection, SocketConnection
 from exo.shared.types.worker.runners import RunnerId, ShardAssignments
 from exo.shared.types.worker.shards import (
-    CfgShardMetadata,
    PipelineShardMetadata,
    Sharding,
    ShardMetadata,
@@ -75,43 +74,40 @@ def allocate_layers_proportionally(
    return result


-def _validate_cycle(cycle: Cycle) -> None:
+def get_shard_assignments_for_pipeline_parallel(
+    model_card: ModelCard,
+    cycle: Cycle,
+    node_memory: Mapping[NodeId, MemoryUsage],
+):
    if not cycle.node_ids:
        raise ValueError("Cannot create shard assignments for empty node cycle")

-
-def _compute_total_memory(
-    node_ids: list[NodeId],
-    node_memory: Mapping[NodeId, MemoryUsage],
-) -> Memory:
-    total_memory = sum(
-        (node_memory[node_id].ram_available for node_id in node_ids),
+    cycle_memory = sum(
+        (node_memory[node_id].ram_available for node_id in cycle.node_ids),
        start=Memory(),
    )
-    if total_memory.in_bytes == 0:
+    if cycle_memory.in_bytes == 0:
        raise ValueError("Cannot create shard assignments: total available memory is 0")
-    return total_memory

+    total_layers = model_card.n_layers
+    world_size = len(cycle)
+    runner_to_shard: dict[RunnerId, ShardMetadata] = {}
+    node_to_runner: dict[NodeId, RunnerId] = {}

-def _allocate_and_validate_layers(
-    node_ids: list[NodeId],
-    node_memory: Mapping[NodeId, MemoryUsage],
-    total_memory: Memory,
-    model_card: ModelCard,
-) -> list[int]:
    layer_allocations = allocate_layers_proportionally(
-        total_layers=model_card.n_layers,
+        total_layers=total_layers,
        memory_fractions=[
-            node_memory[node_id].ram_available.in_bytes / total_memory.in_bytes
-            for node_id in node_ids
+            node_memory[node_id].ram_available.in_bytes / cycle_memory.in_bytes
+            for node_id in cycle.node_ids
        ],
    )

-    total_storage_bytes = model_card.storage_size.in_bytes
-    total_layers = model_card.n_layers
-    for i, node_id in enumerate(node_ids):
-        node_layers = layer_allocations[i]
-        required_memory = (total_storage_bytes * node_layers) // total_layers
+    # Validate each node has sufficient memory for its assigned layers
+    memory_per_layer = model_card.storage_size.in_bytes / total_layers
+    for i, (node_id, node_layers) in enumerate(
+        zip(cycle.node_ids, layer_allocations, strict=True)
+    ):
+        required_memory = node_layers * memory_per_layer
        available_memory = node_memory[node_id].ram_available.in_bytes
        if required_memory > available_memory:
            raise ValueError(
@@ -120,126 +116,33 @@ def _allocate_and_validate_layers(
                f"but only has {available_memory / (1024**3):.2f} GB available"
            )

-    return layer_allocations
-
-
-def get_shard_assignments_for_pipeline_parallel(
-    model_card: ModelCard,
-    cycle: Cycle,
-    node_memory: Mapping[NodeId, MemoryUsage],
-) -> ShardAssignments:
-    """Create shard assignments for pipeline parallel execution."""
-    world_size = len(cycle)
-    use_cfg_parallel = model_card.uses_cfg and world_size >= 2 and world_size % 2 == 0
-
-    if use_cfg_parallel:
-        return _get_shard_assignments_for_cfg_parallel(model_card, cycle, node_memory)
-    else:
-        return _get_shard_assignments_for_pure_pipeline(model_card, cycle, node_memory)
-
-
-def _get_shard_assignments_for_cfg_parallel(
-    model_card: ModelCard,
-    cycle: Cycle,
-    node_memory: Mapping[NodeId, MemoryUsage],
-) -> ShardAssignments:
-    """Create shard assignments for CFG parallel execution.
-
-    CFG parallel runs two independent pipelines. Group 0 processes the positive
-    prompt, group 1 processes the negative prompt. The ring topology places
-    group 1's ranks in reverse order so both "last stages" are neighbors for
-    efficient CFG exchange.
-    """
-    _validate_cycle(cycle)
-
-    world_size = len(cycle)
-    cfg_world_size = 2
-    pipeline_world_size = world_size // cfg_world_size
-
-    # Allocate layers for one pipeline group (both groups run the same layers)
-    pipeline_node_ids = cycle.node_ids[:pipeline_world_size]
-    pipeline_memory = _compute_total_memory(pipeline_node_ids, node_memory)
-    layer_allocations = _allocate_and_validate_layers(
-        pipeline_node_ids, node_memory, pipeline_memory, model_card
-    )
-
-    # Ring topology: group 0 ascending [0,1,2,...], group 1 descending [...,2,1,0]
-    # This places both last stages as neighbors for CFG exchange.
-    position_to_cfg_pipeline = [(0, r) for r in range(pipeline_world_size)] + [
-        (1, r) for r in reversed(range(pipeline_world_size))
-    ]
-
-    runner_to_shard: dict[RunnerId, ShardMetadata] = {}
-    node_to_runner: dict[NodeId, RunnerId] = {}
-
-    for device_rank, node_id in enumerate(cycle.node_ids):
-        cfg_rank, pipeline_rank = position_to_cfg_pipeline[device_rank]
-        layers_before = sum(layer_allocations[:pipeline_rank])
-        node_layers = layer_allocations[pipeline_rank]
-
-        shard = CfgShardMetadata(
-            model_card=model_card,
-            device_rank=device_rank,
-            world_size=world_size,
-            start_layer=layers_before,
-            end_layer=layers_before + node_layers,
-            n_layers=model_card.n_layers,
-            cfg_rank=cfg_rank,
-            cfg_world_size=cfg_world_size,
-            pipeline_rank=pipeline_rank,
-            pipeline_world_size=pipeline_world_size,
-        )
-
+    layers_assigned = 0
+    for i, (node_id, node_layers) in enumerate(
+        zip(cycle.node_ids, layer_allocations, strict=True)
+    ):
        runner_id = RunnerId()
-        runner_to_shard[runner_id] = shard
-        node_to_runner[node_id] = runner_id
-
-    return ShardAssignments(
-        model_id=model_card.model_id,
-        runner_to_shard=runner_to_shard,
-        node_to_runner=node_to_runner,
-    )
-
-
-def _get_shard_assignments_for_pure_pipeline(
-    model_card: ModelCard,
-    cycle: Cycle,
-    node_memory: Mapping[NodeId, MemoryUsage],
-) -> ShardAssignments:
-    """Create shard assignments for pure pipeline execution."""
-    _validate_cycle(cycle)
-    total_memory = _compute_total_memory(cycle.node_ids, node_memory)
-
-    layer_allocations = _allocate_and_validate_layers(
-        cycle.node_ids, node_memory, total_memory, model_card
-    )
-
-    runner_to_shard: dict[RunnerId, ShardMetadata] = {}
-    node_to_runner: dict[NodeId, RunnerId] = {}
-
-    for pipeline_rank, node_id in enumerate(cycle.node_ids):
-        layers_before = sum(layer_allocations[:pipeline_rank])
-        node_layers = layer_allocations[pipeline_rank]

        shard = PipelineShardMetadata(
            model_card=model_card,
-            device_rank=pipeline_rank,
-            world_size=len(cycle),
-            start_layer=layers_before,
-            end_layer=layers_before + node_layers,
-            n_layers=model_card.n_layers,
+            device_rank=i,
+            world_size=world_size,
+            start_layer=layers_assigned,
+            end_layer=layers_assigned + node_layers,
+            n_layers=total_layers,
        )

-        runner_id = RunnerId()
        runner_to_shard[runner_id] = shard
        node_to_runner[node_id] = runner_id
+        layers_assigned += node_layers

-    return ShardAssignments(
+    shard_assignments = ShardAssignments(
        model_id=model_card.model_id,
        runner_to_shard=runner_to_shard,
        node_to_runner=node_to_runner,
    )

+    return shard_assignments
+

 def get_shard_assignments_for_tensor_parallel(
    model_card: ModelCard,
--- a/src/exo/master/tests/test_placement.py
+++ b/src/exo/master/tests/test_placement.py
@@ -239,7 +239,7 @@ def test_get_transition_events_no_change(instance: Instance):
    target_instances = {instance_id: instance}

    # act
-    events = get_transition_events(current_instances, target_instances)
+    events = get_transition_events(current_instances, target_instances, {})

    # assert
    assert len(events) == 0
@@ -252,7 +252,7 @@ def test_get_transition_events_create_instance(instance: Instance):
    target_instances: dict[InstanceId, Instance] = {instance_id: instance}

    # act
-    events = get_transition_events(current_instances, target_instances)
+    events = get_transition_events(current_instances, target_instances, {})

    # assert
    assert len(events) == 1
@@ -266,7 +266,7 @@ def test_get_transition_events_delete_instance(instance: Instance):
    target_instances: dict[InstanceId, Instance] = {}

    # act
-    events = get_transition_events(current_instances, target_instances)
+    events = get_transition_events(current_instances, target_instances, {})

    # assert
    assert len(events) == 1
--- a/src/exo/master/tests/test_placement_utils.py
+++ b/src/exo/master/tests/test_placement_utils.py
@@ -5,7 +5,6 @@ from exo.master.placement_utils import (
    filter_cycles_by_memory,
    get_mlx_jaccl_coordinators,
    get_shard_assignments,
-    get_shard_assignments_for_pipeline_parallel,
    get_smallest_cycles,
 )
 from exo.master.tests.conftest import (
@@ -21,11 +20,7 @@ from exo.shared.types.profiling import (
    NodeNetworkInfo,
 )
 from exo.shared.types.topology import Connection, SocketConnection
-from exo.shared.types.worker.shards import (
-    CfgShardMetadata,
-    PipelineShardMetadata,
-    Sharding,
-)
+from exo.shared.types.worker.shards import Sharding


 def test_filter_cycles_by_memory():
@@ -492,193 +487,3 @@ def test_get_shard_assignments_insufficient_memory_raises():
        get_shard_assignments(
            model_card, selected_cycle, Sharding.Pipeline, node_memory
        )
-
-
-class TestCfgParallelPlacement:
-    def _create_ring_topology(self, node_ids: list[NodeId]) -> Topology:
-        topology = Topology()
-        for node_id in node_ids:
-            topology.add_node(node_id)
-
-        for i, node_id in enumerate(node_ids):
-            next_node = node_ids[(i + 1) % len(node_ids)]
-            conn = Connection(
-                source=node_id,
-                sink=next_node,
-                edge=create_socket_connection(i + 1),
-            )
-            topology.add_connection(conn)
-
-        return topology
-
-    def test_two_nodes_cfg_model_uses_cfg_parallel(self):
-        """Two nodes with CFG model should use CFG parallel (no pipeline)."""
-        node_a = NodeId()
-        node_b = NodeId()
-
-        topology = self._create_ring_topology([node_a, node_b])
-        cycles = [c for c in topology.get_cycles() if len(c) == 2]
-        cycle = cycles[0]
-
-        node_memory = {
-            node_a: create_node_memory(1000 * 1024),
-            node_b: create_node_memory(1000 * 1024),
-        }
-
-        model_card = ModelCard(
-            model_id=ModelId("qwen-image-test"),
-            n_layers=60,
-            storage_size=Memory.from_kb(1000),
-            hidden_size=1,
-            supports_tensor=False,
-            uses_cfg=True,
-            tasks=[ModelTask.TextToImage],
-        )
-
-        assignments = get_shard_assignments_for_pipeline_parallel(
-            model_card, cycle, node_memory
-        )
-
-        shards = list(assignments.runner_to_shard.values())
-        assert len(shards) == 2
-
-        # CFG models should get CfgShardMetadata
-        for shard in shards:
-            assert isinstance(shard, CfgShardMetadata)
-            # Both nodes should have all layers (no pipeline split)
-            assert shard.start_layer == 0
-            assert shard.end_layer == 60
-            assert shard.cfg_world_size == 2
-            # Each node is the only stage in its pipeline group
-            assert shard.pipeline_world_size == 1
-            assert shard.pipeline_rank == 0
-
-        cfg_ranks = sorted(
-            s.cfg_rank for s in shards if isinstance(s, CfgShardMetadata)
-        )
-        assert cfg_ranks == [0, 1]
-
-    def test_four_nodes_cfg_model_uses_hybrid(self):
-        """Four nodes with CFG model should use 2 CFG groups x 2 pipeline stages."""
-        nodes = [NodeId() for _ in range(4)]
-
-        topology = self._create_ring_topology(nodes)
-        cycles = [c for c in topology.get_cycles() if len(c) == 4]
-        cycle = cycles[0]
-
-        node_memory = {n: create_node_memory(1000 * 1024) for n in nodes}
-
-        model_card = ModelCard(
-            model_id=ModelId("qwen-image-test"),
-            n_layers=60,
-            storage_size=Memory.from_kb(1000),
-            hidden_size=1,
-            supports_tensor=False,
-            uses_cfg=True,
-            tasks=[ModelTask.TextToImage],
-        )
-
-        assignments = get_shard_assignments_for_pipeline_parallel(
-            model_card, cycle, node_memory
-        )
-
-        shards = list(assignments.runner_to_shard.values())
-        assert len(shards) == 4
-
-        # CFG models should get CfgShardMetadata
-        for shard in shards:
-            assert isinstance(shard, CfgShardMetadata)
-            assert shard.cfg_world_size == 2
-            assert shard.pipeline_world_size == 2
-            assert shard.pipeline_rank in [0, 1]
-
-        # Check we have 2 nodes in each CFG group
-        cfg_0_shards = [
-            s for s in shards if isinstance(s, CfgShardMetadata) and s.cfg_rank == 0
-        ]
-        cfg_1_shards = [
-            s for s in shards if isinstance(s, CfgShardMetadata) and s.cfg_rank == 1
-        ]
-        assert len(cfg_0_shards) == 2
-        assert len(cfg_1_shards) == 2
-
-        # Both CFG groups should have the same layer assignments
-        cfg_0_layers = [(s.start_layer, s.end_layer) for s in cfg_0_shards]
-        cfg_1_layers = [(s.start_layer, s.end_layer) for s in cfg_1_shards]
-        assert sorted(cfg_0_layers) == sorted(cfg_1_layers)
-
-    def test_three_nodes_cfg_model_uses_sequential_cfg(self):
-        """Three nodes (odd) with CFG model should use sequential CFG (PipelineShardMetadata)."""
-        nodes = [NodeId() for _ in range(3)]
-
-        topology = self._create_ring_topology(nodes)
-        cycles = [c for c in topology.get_cycles() if len(c) == 3]
-        cycle = cycles[0]
-
-        node_memory = {n: create_node_memory(1000 * 1024) for n in nodes}
-
-        model_card = ModelCard(
-            model_id=ModelId("qwen-image-test"),
-            n_layers=60,
-            storage_size=Memory.from_kb(1000),
-            hidden_size=1,
-            supports_tensor=False,
-            uses_cfg=True,
-            tasks=[ModelTask.TextToImage],
-        )
-
-        assignments = get_shard_assignments_for_pipeline_parallel(
-            model_card, cycle, node_memory
-        )
-
-        shards = list(assignments.runner_to_shard.values())
-        assert len(shards) == 3
-
-        # Odd node count with CFG model falls back to PipelineShardMetadata (sequential CFG)
-        for shard in shards:
-            assert isinstance(shard, PipelineShardMetadata)
-
-    def test_two_nodes_non_cfg_model_uses_pipeline(self):
-        """Two nodes with non-CFG model should use pure pipeline (PipelineShardMetadata)."""
-        node_a = NodeId()
-        node_b = NodeId()
-
-        topology = self._create_ring_topology([node_a, node_b])
-        cycles = [c for c in topology.get_cycles() if len(c) == 2]
-        cycle = cycles[0]
-
-        node_memory = {
-            node_a: create_node_memory(1000 * 1024),
-            node_b: create_node_memory(1000 * 1024),
-        }
-
-        model_card = ModelCard(
-            model_id=ModelId("flux-test"),
-            n_layers=57,
-            storage_size=Memory.from_kb(1000),
-            hidden_size=1,
-            supports_tensor=False,
-            uses_cfg=False,  # Non-CFG model
-            tasks=[ModelTask.TextToImage],
-        )
-
-        assignments = get_shard_assignments_for_pipeline_parallel(
-            model_card, cycle, node_memory
-        )
-
-        shards = list(assignments.runner_to_shard.values())
-        assert len(shards) == 2
-
-        # Non-CFG models should get PipelineShardMetadata
-        for shard in shards:
-            assert isinstance(shard, PipelineShardMetadata)
-
-        # Should have actual layer sharding (pipeline)
-        layer_ranges = sorted(
-            (s.start_layer, s.end_layer)
-            for s in shards
-            if isinstance(s, PipelineShardMetadata)
-        )
-        # First shard starts at 0, last shard ends at 57
-        assert layer_ranges[0][0] == 0
-        assert layer_ranges[-1][1] == 57
--- a/src/exo/routing/router.py
+++ b/src/exo/routing/router.py
@@ -10,7 +10,6 @@ from anyio import (
    ClosedResourceError,
    create_task_group,
    sleep_forever,
-    move_on_after
 )
 from anyio.abc import TaskGroup
 from exo_pyo3_bindings import (
@@ -147,21 +146,18 @@ class Router:

    async def run(self):
        logger.debug("Starting Router")
-        try:
-            async with create_task_group() as tg:
-                self._tg = tg
-                for topic in self.topic_routers:
-                    router = self.topic_routers[topic]
-                    tg.start_soon(router.run)
-                tg.start_soon(self._networking_recv)
-                tg.start_soon(self._networking_recv_connection_messages)
-                tg.start_soon(self._networking_publish)
-                # Router only shuts down if you cancel it.
-                await sleep_forever()
-        finally:
-            with move_on_after(1, shield=True):
-                for topic in self.topic_routers:
-                    await self._networking_unsubscribe(str(topic))
+        async with create_task_group() as tg:
+            self._tg = tg
+            for topic in self.topic_routers:
+                router = self.topic_routers[topic]
+                tg.start_soon(router.run)
+            tg.start_soon(self._networking_recv)
+            tg.start_soon(self._networking_recv_connection_messages)
+            tg.start_soon(self._networking_publish)
+            # Router only shuts down if you cancel it.
+            await sleep_forever()
+        for topic in self.topic_routers:
+            await self._networking_unsubscribe(str(topic))

    async def shutdown(self):
        logger.debug("Shutting down Router")
@@ -170,12 +166,12 @@ class Router:
        self._tg.cancel_scope.cancel()

    async def _networking_subscribe(self, topic: str):
+        logger.info(f"Subscribing to {topic}")
        await self._net.gossipsub_subscribe(topic)
-        logger.info(f"Subscribed to {topic}")

    async def _networking_unsubscribe(self, topic: str):
+        logger.info(f"Unsubscribing from {topic}")
        await self._net.gossipsub_unsubscribe(topic)
-        logger.info(f"Unsubscribed from {topic}")

    async def _networking_recv(self):
        while True:
--- a/src/exo/shared/election.py
+++ b/src/exo/shared/election.py
@@ -86,29 +86,28 @@ class Election:

    async def run(self):
        logger.info("Starting Election")
-        try:
-            async with create_task_group() as tg:
-                self._tg = tg
-                tg.start_soon(self._election_receiver)
-                tg.start_soon(self._connection_receiver)
-                tg.start_soon(self._command_counter)
+        async with create_task_group() as tg:
+            self._tg = tg
+            tg.start_soon(self._election_receiver)
+            tg.start_soon(self._connection_receiver)
+            tg.start_soon(self._command_counter)

-                # And start an election immediately, that instantly resolves
-                candidates: list[ElectionMessage] = []
-                logger.debug("Starting initial campaign")
-                self._candidates = candidates
-                await self._campaign(candidates, campaign_timeout=0.0)
-                logger.debug("Initial campaign finished")
-        finally:
-            # Cancel and wait for the last election to end
-            if self._campaign_cancel_scope is not None:
-                logger.debug("Cancelling campaign")
-                self._campaign_cancel_scope.cancel()
-            if self._campaign_done is not None:
-                logger.debug("Waiting for campaign to finish")
-                await self._campaign_done.wait()
-            logger.debug("Campaign cancelled and finished")
-            logger.info("Election shutdown")
+            # And start an election immediately, that instantly resolves
+            candidates: list[ElectionMessage] = []
+            logger.debug("Starting initial campaign")
+            self._candidates = candidates
+            await self._campaign(candidates, campaign_timeout=0.0)
+            logger.debug("Initial campaign finished")
+
+        # Cancel and wait for the last election to end
+        if self._campaign_cancel_scope is not None:
+            logger.debug("Cancelling campaign")
+            self._campaign_cancel_scope.cancel()
+        if self._campaign_done is not None:
+            logger.debug("Waiting for campaign to finish")
+            await self._campaign_done.wait()
+        logger.debug("Campaign cancelled and finished")
+        logger.info("Election finished")

    async def elect(self, em: ElectionMessage) -> None:
        logger.debug(f"Electing: {em}")
--- a/src/exo/shared/models/model_cards.py
+++ b/src/exo/shared/models/model_cards.py
@@ -65,9 +65,9 @@ class ComponentInfo(CamelCaseModel):
    component_name: str
    component_path: str
    storage_size: Memory
-    n_layers: PositiveInt | None = None
+    n_layers: PositiveInt | None
    can_shard: bool
-    safetensors_index_filename: str | None = None
+    safetensors_index_filename: str | None


 class ModelCard(CamelCaseModel):
@@ -82,7 +82,6 @@ class ModelCard(CamelCaseModel):
    quantization: str = ""
    base_model: str = ""
    capabilities: list[str] = []
-    uses_cfg: bool = False

    @field_validator("tasks", mode="before")
    @classmethod
@@ -156,6 +155,87 @@ def is_custom_card(model_id: ModelId) -> bool:
    return os.path.isfile(str(card_path))


+# TODO: quantizing and dynamically creating model cards
+def _generate_image_model_quant_variants(  # pyright: ignore[reportUnusedFunction]
+    base_name: str,
+    base_card: ModelCard,
+) -> dict[str, ModelCard]:
+    """Create quantized variants of an image model card.
+
+    Only the transformer component is quantized; other components (e.g. text
+    encoders) keep their size. Sizes assume the base card is 16-bit (quant/16).
+    """
+    if base_card.components is None:
+        raise ValueError(f"Image model {base_name} must have components defined")
+
+    # quantizations = [8, 6, 5, 4, 3]
+    quantizations = [8, 4]
+
+    num_transformer_bytes = next(
+        c.storage_size.in_bytes
+        for c in base_card.components
+        if c.component_name == "transformer"
+    )
+
+    transformer_bytes = Memory.from_bytes(num_transformer_bytes)
+
+    remaining_bytes = Memory.from_bytes(
+        sum(
+            c.storage_size.in_bytes
+            for c in base_card.components
+            if c.component_name != "transformer"
+        )
+    )
+
+    def with_transformer_size(new_size: Memory) -> list[ComponentInfo]:
+        assert base_card.components is not None
+        return [
+            ComponentInfo(
+                component_name=c.component_name,
+                component_path=c.component_path,
+                storage_size=new_size
+                if c.component_name == "transformer"
+                else c.storage_size,
+                n_layers=c.n_layers,
+                can_shard=c.can_shard,
+                safetensors_index_filename=c.safetensors_index_filename,
+            )
+            for c in base_card.components
+        ]
+
+    variants = {
+        base_name: ModelCard(
+            model_id=base_card.model_id,
+            storage_size=transformer_bytes + remaining_bytes,
+            n_layers=base_card.n_layers,
+            hidden_size=base_card.hidden_size,
+            supports_tensor=base_card.supports_tensor,
+            tasks=base_card.tasks,
+            components=with_transformer_size(transformer_bytes),
+        )
+    }
+
+    for quant in quantizations:
+        quant_transformer_bytes = Memory.from_bytes(
+            (num_transformer_bytes * quant) // 16
+        )
+        total_bytes = remaining_bytes + quant_transformer_bytes
+
+        model_id = ModelId(base_card.model_id + f"-{quant}bit")
+
+        variants[f"{base_name}-{quant}bit"] = ModelCard(
+            model_id=model_id,
+            storage_size=total_bytes,
+            n_layers=base_card.n_layers,
+            hidden_size=base_card.hidden_size,
+            supports_tensor=base_card.supports_tensor,
+            tasks=base_card.tasks,
+            components=with_transformer_size(quant_transformer_bytes),
+        )
+
+    return variants
+
+
 class ConfigData(BaseModel):
    model_config = {"extra": "ignore"}  # Allow unknown fields

--- a/src/exo/shared/types/commands.py
+++ b/src/exo/shared/types/commands.py
@@ -48,6 +48,10 @@ class DeleteInstance(BaseCommand):
    instance_id: InstanceId


+class TaskCancelled(BaseCommand):
+    cancelled_command_id: CommandId
+
+
 class TaskFinished(BaseCommand):
    finished_command_id: CommandId

@@ -84,6 +88,7 @@ Command = (
    | PlaceInstance
    | CreateInstance
    | DeleteInstance
+    | TaskCancelled
    | TaskFinished
    | SendInputChunk
 )
--- a/src/exo/shared/types/tasks.py
+++ b/src/exo/shared/types/tasks.py
@@ -24,6 +24,7 @@ class TaskStatus(str, Enum):
    Complete = "Complete"
    TimedOut = "TimedOut"
    Failed = "Failed"
+    Cancelled = "Cancelled"


 class BaseTask(TaggedModel):
@@ -60,6 +61,10 @@ class TextGeneration(BaseTask):  # emitted by Master
    error_message: str | None = Field(default=None)


+class CancelTask(BaseTask):
+    cancelled_task_id: TaskId
+
+
 class ImageGeneration(BaseTask):  # emitted by Master
    command_id: CommandId
    task_params: ImageGenerationTaskParams
@@ -87,6 +92,7 @@ Task = (
    | LoadModel
    | StartWarmup
    | TextGeneration
+    | CancelTask
    | ImageGeneration
    | ImageEdits
    | Shutdown
--- a/src/exo/shared/types/worker/shards.py
+++ b/src/exo/shared/types/worker/shards.py
@@ -1,5 +1,4 @@
 from enum import Enum
-from typing import TypeAlias, final

 from pydantic import Field

@@ -52,7 +51,6 @@ class BaseShardMetadata(TaggedModel):
        )


-@final
 class PipelineShardMetadata(BaseShardMetadata):
    """
    Pipeline parallelism shard meta.
@@ -62,23 +60,8 @@ class PipelineShardMetadata(BaseShardMetadata):
    """


-@final
-class CfgShardMetadata(BaseShardMetadata):
-    """Shard metadata for CFG-parallel image generation models."""
-
-    cfg_rank: int  # 0 = positive branch, 1 = negative branch
-    cfg_world_size: int = 2
-
-    # Pipeline-relative coordinates (computed at placement time)
-    pipeline_rank: int  # rank within the pipeline group (0, 1, 2, ...)
-    pipeline_world_size: int  # number of nodes per pipeline group
-
-
-@final
 class TensorShardMetadata(BaseShardMetadata):
    pass


-ShardMetadata: TypeAlias = (
-    PipelineShardMetadata | CfgShardMetadata | TensorShardMetadata
-)
+ShardMetadata = PipelineShardMetadata | TensorShardMetadata
--- a/src/exo/worker/engines/image/distributed_model.py
+++ b/src/exo/worker/engines/image/distributed_model.py
@@ -9,7 +9,7 @@ from PIL import Image
 from exo.download.download_utils import build_model_path
 from exo.shared.types.api import AdvancedImageParams
 from exo.shared.types.worker.instances import BoundInstance
-from exo.shared.types.worker.shards import CfgShardMetadata, PipelineShardMetadata
+from exo.shared.types.worker.shards import PipelineShardMetadata
 from exo.worker.engines.image.config import ImageModelConfig
 from exo.worker.engines.image.models import (
    create_adapter_for_model,
@@ -30,19 +30,14 @@ class DistributedImageModel:
        self,
        model_id: str,
        local_path: Path,
-        shard_metadata: PipelineShardMetadata | CfgShardMetadata,
+        shard_metadata: PipelineShardMetadata,
        group: Optional[mx.distributed.Group] = None,
        quantize: int | None = None,
    ):
        config = get_config_for_model(model_id)
        adapter = create_adapter_for_model(config, model_id, local_path, quantize)

-        has_layer_sharding = (
-            shard_metadata.start_layer != 0
-            or shard_metadata.end_layer != shard_metadata.n_layers
-        )
-
-        if group is not None and has_layer_sharding:
+        if group is not None:
            adapter.slice_transformer_blocks(
                start_layer=shard_metadata.start_layer,
                end_layer=shard_metadata.end_layer,
@@ -80,10 +75,8 @@ class DistributedImageModel:
        model_path = build_model_path(model_id)

        shard_metadata = bound_instance.bound_shard
-        if not isinstance(shard_metadata, (PipelineShardMetadata, CfgShardMetadata)):
-            raise ValueError(
-                "Expected PipelineShardMetadata or CfgShardMetadata for image generation"
-            )
+        if not isinstance(shard_metadata, PipelineShardMetadata):
+            raise ValueError("Expected PipelineShardMetadata for image generation")

        is_distributed = (
            len(bound_instance.instance.shard_assignments.node_to_runner) > 1
--- a/src/exo/worker/engines/image/models/base.py
+++ b/src/exo/worker/engines/image/models/base.py
@@ -86,27 +86,6 @@ class PromptData(ABC):
        """
        ...

-    @abstractmethod
-    def get_cfg_branch_data(
-        self, positive: bool
-    ) -> tuple[mx.array, mx.array | None, mx.array | None, mx.array | None]:
-        """Get embeddings for a single CFG branch (positive or negative).
-
-        Used for sequential CFG and CFG parallel modes where we process
-        one branch at a time instead of batching.
-
-        Args:
-            positive: True for positive prompt, False for negative prompt
-
-        Returns:
-            Tuple of:
-            - embeds: [1, seq, hidden] prompt embeddings
-            - mask: [1, seq] attention mask or None
-            - pooled: [1, hidden] pooled embeddings or None
-            - conditioning_latents: [1, latent_seq, latent_dim] or None
-        """
-        ...
-

 class ModelAdapter(ABC, Generic[ModelT, TransformerT]):
    _config: ImageModelConfig
--- a/src/exo/worker/engines/image/models/flux/adapter.py
+++ b/src/exo/worker/engines/image/models/flux/adapter.py
@@ -64,12 +64,6 @@ class FluxPromptData(PromptData):
    ) -> tuple[mx.array, mx.array, mx.array | None, mx.array | None] | None:
        return None

-    def get_cfg_branch_data(
-        self, positive: bool
-    ) -> tuple[mx.array, mx.array | None, mx.array | None, mx.array | None]:
-        """Flux doesn't use CFG, but we return positive data for compatibility."""
-        return (self._prompt_embeds, None, self._pooled_prompt_embeds, None)
-

 class FluxModelAdapter(ModelAdapter[Flux1, Transformer]):
    def __init__(
--- a/src/exo/worker/engines/image/models/qwen/adapter.py
+++ b/src/exo/worker/engines/image/models/qwen/adapter.py
@@ -133,24 +133,6 @@ class QwenPromptData(PromptData):

        return batched_embeds, batched_mask, None, cond_latents

-    def get_cfg_branch_data(
-        self, positive: bool
-    ) -> tuple[mx.array, mx.array | None, mx.array | None, mx.array | None]:
-        if positive:
-            return (
-                self._prompt_embeds,
-                self._prompt_mask,
-                None,
-                self.conditioning_latents,
-            )
-        else:
-            return (
-                self._negative_prompt_embeds,
-                self._negative_prompt_mask,
-                None,
-                self.conditioning_latents,
-            )
-

 class QwenModelAdapter(ModelAdapter[QwenImage, QwenTransformer]):
    """Adapter for Qwen-Image model.
--- a/src/exo/worker/engines/image/models/qwen/config.py
+++ b/src/exo/worker/engines/image/models/qwen/config.py
@@ -12,7 +12,7 @@ QWEN_IMAGE_CONFIG = ImageModelConfig(
        ),
    ),
    default_steps={"low": 10, "medium": 25, "high": 50},
-    num_sync_steps_factor=0.25,
+    num_sync_steps_factor=0.125,  # ~3 sync steps for medium (25 steps)
    guidance_scale=3.5,  # Set to None or < 1.0 to disable CFG
 )

@@ -24,6 +24,6 @@ QWEN_IMAGE_EDIT_CONFIG = ImageModelConfig(
        ),
    ),
    default_steps={"low": 10, "medium": 25, "high": 50},
-    num_sync_steps_factor=0.25,
+    num_sync_steps_factor=0.125,
    guidance_scale=3.5,
 )
--- a/src/exo/worker/engines/image/models/qwen/edit_adapter.py
+++ b/src/exo/worker/engines/image/models/qwen/edit_adapter.py
@@ -153,24 +153,6 @@ class QwenEditPromptData(PromptData):

        return batched_embeds, batched_mask, None, batched_cond_latents

-    def get_cfg_branch_data(
-        self, positive: bool
-    ) -> tuple[mx.array, mx.array | None, mx.array | None, mx.array | None]:
-        if positive:
-            return (
-                self._prompt_embeds,
-                self._prompt_mask,
-                None,
-                self._conditioning_latents,
-            )
-        else:
-            return (
-                self._negative_prompt_embeds,
-                self._negative_prompt_mask,
-                None,
-                self._conditioning_latents,
-            )
-

 class QwenEditModelAdapter(ModelAdapter[QwenImageEdit, QwenTransformer]):
    """Adapter for Qwen-Image-Edit model.
--- a/src/exo/worker/engines/image/pipeline/runner.py
+++ b/src/exo/worker/engines/image/pipeline/runner.py
@@ -1,7 +1,5 @@
-from collections.abc import Iterator
-from dataclasses import dataclass
 from math import ceil
-from typing import Any, Optional, final
+from typing import Any, Optional

 import mlx.core as mx
 from mflux.models.common.config.config import Config
@@ -13,7 +11,7 @@ from exo.shared.tracing import (
    clear_trace_buffer,
    trace,
 )
-from exo.shared.types.worker.shards import CfgShardMetadata, PipelineShardMetadata
+from exo.shared.types.worker.shards import PipelineShardMetadata
 from exo.worker.engines.image.config import ImageModelConfig
 from exo.worker.engines.image.models.base import (
    ModelAdapter,
@@ -27,16 +25,6 @@ from exo.worker.engines.image.pipeline.block_wrapper import (
 )


-@final
-@dataclass(frozen=True)
-class CfgBranch:
-    positive: bool
-    embeds: mx.array
-    mask: mx.array | None
-    pooled: mx.array | None
-    cond_latents: mx.array | None
-
-
 def calculate_patch_heights(
    latent_height: int, num_patches: int
 ) -> tuple[list[int], int]:
@@ -82,18 +70,29 @@ class DiffusionRunner:
        config: ImageModelConfig,
        adapter: ModelAdapter[Any, Any],
        group: Optional[mx.distributed.Group],
-        shard_metadata: PipelineShardMetadata | CfgShardMetadata,
+        shard_metadata: PipelineShardMetadata,
        num_patches: Optional[int] = None,
    ):
        self.config = config
        self.adapter = adapter
        self.group = group

-        self._init_cfg_topology(shard_metadata)
+        if group is None:
+            self.rank = 0
+            self.world_size = 1
+            self.next_rank = 0
+            self.prev_rank = 0
+            self.start_layer = 0
+            self.end_layer = config.total_blocks
+        else:
+            self.rank = shard_metadata.device_rank
+            self.world_size = shard_metadata.world_size
+            self.next_rank = (self.rank + 1) % self.world_size
+            self.prev_rank = (self.rank - 1 + self.world_size) % self.world_size
+            self.start_layer = shard_metadata.start_layer
+            self.end_layer = shard_metadata.end_layer

-        self.num_patches = (
-            num_patches if num_patches else max(1, self.pipeline_world_size)
-        )
+        self.num_patches = num_patches if num_patches else max(1, self.world_size)

        self.total_joint = config.joint_block_count
        self.total_single = config.single_block_count
@@ -103,97 +102,6 @@ class DiffusionRunner:

        self._compute_assigned_blocks()

-    def _init_cfg_topology(
-        self, shard_metadata: PipelineShardMetadata | CfgShardMetadata
-    ) -> None:
-        """Initialize CFG and pipeline topology from shard metadata.
-
-        Both CfgShardMetadata and PipelineShardMetadata represent pipeline parallel
-        execution. CFG adds a second parallel pipeline for negative prompt processing,
-        but within each pipeline group the communication pattern is identical.
-        """
-        if self.group is None:
-            # Single node - no distributed communication
-            self.rank = 0
-            self.world_size = 1
-            self.start_layer = 0
-            self.end_layer = self.config.total_blocks
-            self.cfg_rank = 0
-            self.cfg_world_size = 1
-            self.cfg_parallel = False
-            self.pipeline_rank = 0
-            self.pipeline_world_size = 1
-            self.next_pipeline_rank: int | None = None
-            self.prev_pipeline_rank: int | None = None
-            self.cfg_peer_rank: int | None = None
-            self.first_pipeline_rank: int = 0
-            self.last_pipeline_rank: int = 0
-            return
-
-        # Common fields from base metadata
-        self.rank = shard_metadata.device_rank
-        self.world_size = shard_metadata.world_size
-        self.start_layer = shard_metadata.start_layer
-        self.end_layer = shard_metadata.end_layer
-
-        if isinstance(shard_metadata, CfgShardMetadata):
-            # CFG parallel: two independent pipelines
-            self.cfg_rank = shard_metadata.cfg_rank
-            self.cfg_world_size = shard_metadata.cfg_world_size
-            self.cfg_parallel = True
-            self.pipeline_rank = shard_metadata.pipeline_rank
-            self.pipeline_world_size = shard_metadata.pipeline_world_size
-        else:
-            # Pure pipeline: single pipeline group, sequential CFG
-            self.cfg_rank = 0
-            self.cfg_world_size = 1
-            self.cfg_parallel = False
-            self.pipeline_rank = shard_metadata.device_rank
-            self.pipeline_world_size = shard_metadata.world_size
-
-        # Pipeline neighbor computation (same logic for both types)
-        is_first = self.pipeline_rank == 0
-        is_last = self.pipeline_rank == self.pipeline_world_size - 1
-
-        self.next_pipeline_rank = (
-            None
-            if is_last
-            else self._device_rank_for(self.cfg_rank, self.pipeline_rank + 1)
-        )
-        self.prev_pipeline_rank = (
-            None
-            if is_first
-            else self._device_rank_for(self.cfg_rank, self.pipeline_rank - 1)
-        )
-
-        # CFG peer is the corresponding last stage in the other CFG group
-        if self.cfg_parallel and is_last:
-            other_cfg_rank = 1 - self.cfg_rank
-            self.cfg_peer_rank = self._device_rank_for(
-                other_cfg_rank, self.pipeline_rank
-            )
-        else:
-            self.cfg_peer_rank = None
-
-        # First/last pipeline ranks for ring communication (latent broadcast)
-        self.first_pipeline_rank = self._device_rank_for(self.cfg_rank, 0)
-        self.last_pipeline_rank = self._device_rank_for(
-            self.cfg_rank, self.pipeline_world_size - 1
-        )
-
-    def _device_rank_for(self, cfg_rank: int, pipeline_rank: int) -> int:
-        """Convert (cfg_rank, pipeline_rank) to device_rank in the ring topology.
-
-        Ring layout: [cfg0_pipe0, cfg0_pipe1, ..., cfg1_pipeN-1, cfg1_pipeN-2, ..., cfg1_pipe0]
-        Group 0 is in ascending order, group 1 is reversed so last stages are neighbors.
-        """
-        if not self.cfg_parallel:
-            return pipeline_rank
-        if cfg_rank == 0:
-            return pipeline_rank
-        else:
-            return self.world_size - 1 - pipeline_rank
-
    def _compute_assigned_blocks(self) -> None:
        """Determine which joint/single blocks this stage owns."""
        start = self.start_layer
@@ -230,11 +138,11 @@ class DiffusionRunner:

    @property
    def is_first_stage(self) -> bool:
-        return self.pipeline_rank == 0
+        return self.rank == 0

    @property
    def is_last_stage(self) -> bool:
-        return self.pipeline_rank == self.pipeline_world_size - 1
+        return self.rank == self.world_size - 1

    @property
    def is_distributed(self) -> bool:
@@ -245,97 +153,6 @@ class DiffusionRunner:
            return self._guidance_override
        return self.config.guidance_scale

-    def _get_cfg_branches(self, prompt_data: PromptData) -> Iterator[CfgBranch]:
-        """Yield the CFG branches this node should process.
-
-        - No CFG: yields one branch (positive)
-        - CFG parallel: yields one branch (our assigned branch)
-        - Sequential CFG: yields two branches (positive, then negative)
-        """
-        if not self.adapter.needs_cfg:
-            embeds, mask, pooled, cond = prompt_data.get_cfg_branch_data(positive=True)
-            yield CfgBranch(
-                positive=True,
-                embeds=embeds,
-                mask=mask,
-                pooled=pooled,
-                cond_latents=cond,
-            )
-        elif self.cfg_parallel:
-            positive = self.cfg_rank == 0
-            embeds, mask, pooled, cond = prompt_data.get_cfg_branch_data(positive)
-            yield CfgBranch(
-                positive=positive,
-                embeds=embeds,
-                mask=mask,
-                pooled=pooled,
-                cond_latents=cond,
-            )
-        else:
-            pos_embeds, pos_mask, pos_pooled, pos_cond = (
-                prompt_data.get_cfg_branch_data(positive=True)
-            )
-            yield CfgBranch(
-                positive=True,
-                embeds=pos_embeds,
-                mask=pos_mask,
-                pooled=pos_pooled,
-                cond_latents=pos_cond,
-            )
-            neg_embeds, neg_mask, neg_pooled, neg_cond = (
-                prompt_data.get_cfg_branch_data(positive=False)
-            )
-            yield CfgBranch(
-                positive=False,
-                embeds=neg_embeds,
-                mask=neg_mask,
-                pooled=neg_pooled,
-                cond_latents=neg_cond,
-            )
-
-    def _combine_cfg_results(self, results: list[tuple[bool, mx.array]]) -> mx.array:
-        if len(results) == 1:
-            positive, noise = results[0]
-            if self.cfg_parallel and self.is_last_stage:
-                # TODO(ciaran): try to remove
-                mx.eval(noise)
-                return self._exchange_and_apply_guidance(noise, positive)
-            return noise
-
-        noise_neg = next(n for p, n in results if not p)
-        noise_pos = next(n for p, n in results if p)
-        return self._apply_guidance(noise_pos, noise_neg)
-
-    def _exchange_and_apply_guidance(
-        self, noise: mx.array, is_positive: bool
-    ) -> mx.array:
-        assert self.group is not None
-        assert self.cfg_peer_rank is not None
-
-        if is_positive:
-            noise = mx.distributed.send(noise, self.cfg_peer_rank, group=self.group)
-            mx.async_eval(noise)
-            noise_neg = mx.distributed.recv_like(
-                noise, self.cfg_peer_rank, group=self.group
-            )
-            mx.eval(noise_neg)
-            noise_pos = noise
-        else:
-            noise_pos = mx.distributed.recv_like(
-                noise, self.cfg_peer_rank, group=self.group
-            )
-            mx.eval(noise_pos)
-            noise = mx.distributed.send(noise, self.cfg_peer_rank, group=self.group)
-            mx.async_eval(noise)
-            noise_neg = noise
-
-        return self._apply_guidance(noise_pos, noise_neg)
-
-    def _apply_guidance(self, noise_pos: mx.array, noise_neg: mx.array) -> mx.array:
-        scale = self._get_effective_guidance_scale()
-        assert scale is not None
-        return self.adapter.apply_guidance(noise_pos, noise_neg, scale)
-
    def _ensure_wrappers(
        self,
        text_seq_len: int,
@@ -653,9 +470,7 @@ class DiffusionRunner:
    ) -> mx.array:
        if self.group is None:
            return self._single_node_step(t, config, latents, prompt_data)
-        elif (
-            self.pipeline_world_size == 1 or t < config.init_time_step + num_sync_steps
-        ):
+        elif t < config.init_time_step + num_sync_steps:
            with trace(name=f"sync {t}", rank=self.rank, category="sync"):
                return self._sync_pipeline_step(
                    t,
@@ -681,29 +496,42 @@ class DiffusionRunner:
        prompt_data: PromptData,
    ) -> mx.array:
        cond_image_grid = prompt_data.cond_image_grid
-        results: list[tuple[bool, mx.array]] = []
-
-        for branch in self._get_cfg_branches(prompt_data):
-            # Reset caches before each branch to ensure no state contamination
-            self._reset_all_caches()
+        needs_cfg = self.adapter.needs_cfg

+        if needs_cfg:
+            batched_data = prompt_data.get_batched_cfg_data()
+            assert batched_data is not None, "CFG model must provide batched data"
+            prompt_embeds, encoder_mask, batched_pooled, cond_latents = batched_data
            pooled_embeds = (
-                branch.pooled if branch.pooled is not None else branch.embeds
+                batched_pooled if batched_pooled is not None else prompt_embeds
+            )
+            step_latents = mx.concatenate([latents, latents], axis=0)
+        else:
+            prompt_embeds = prompt_data.prompt_embeds
+            pooled_embeds = prompt_data.pooled_prompt_embeds
+            encoder_mask = prompt_data.get_encoder_hidden_states_mask(positive=True)
+            cond_latents = prompt_data.conditioning_latents
+            step_latents = latents
+
+        noise = self._forward_pass(
+            step_latents,
+            prompt_embeds,
+            pooled_embeds,
+            t=t,
+            config=config,
+            encoder_hidden_states_mask=encoder_mask,
+            cond_image_grid=cond_image_grid,
+            conditioning_latents=cond_latents,
+        )
+
+        if needs_cfg:
+            noise_pos, noise_neg = mx.split(noise, 2, axis=0)
+            guidance_scale = self._get_effective_guidance_scale()
+            assert guidance_scale is not None
+            noise = self.adapter.apply_guidance(
+                noise_pos, noise_neg, guidance_scale=guidance_scale
            )

-            noise = self._forward_pass(
-                latents,
-                branch.embeds,
-                pooled_embeds,
-                t=t,
-                config=config,
-                encoder_hidden_states_mask=branch.mask,
-                cond_image_grid=cond_image_grid,
-                conditioning_latents=branch.cond_latents,
-            )
-            results.append((branch.positive, noise))
-
-        noise = self._combine_cfg_results(results)
        return config.scheduler.step(noise=noise, timestep=t, latents=latents)  # pyright: ignore[reportAny]

    def _create_patches(
@@ -754,7 +582,7 @@ class DiffusionRunner:
            )

        text_embeddings = self.adapter.compute_text_embeddings(
-            t, config, pooled_prompt_embeds, hidden_states=hidden_states
+            t, config, pooled_prompt_embeds
        )
        image_rotary_embeddings = self.adapter.compute_rotary_embeddings(
            prompt_embeds,
@@ -766,22 +594,19 @@ class DiffusionRunner:

        if self.has_joint_blocks:
            if not self.is_first_stage:
-                assert self.prev_pipeline_rank is not None
                with trace(
-                    name=f"recv {self.prev_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"recv {self.prev_rank}", rank=self.rank, category="comms"
                ):
                    hidden_states = mx.distributed.recv(
                        (batch_size, num_img_tokens, hidden_dim),
                        dtype,
-                        self.prev_pipeline_rank,
+                        self.prev_rank,
                        group=self.group,
                    )
                    encoder_hidden_states = mx.distributed.recv(
                        (batch_size, text_seq_len, hidden_dim),
                        dtype,
-                        self.prev_pipeline_rank,
+                        self.prev_rank,
                        group=self.group,
                    )
                    mx.eval(hidden_states, encoder_hidden_states)
@@ -814,45 +639,34 @@ class DiffusionRunner:
            if self.has_single_blocks or self.is_last_stage:
                hidden_states = concatenated
            else:
-                assert self.next_pipeline_rank is not None
                with trace(
-                    name=f"send {self.next_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"send {self.next_rank}", rank=self.rank, category="comms"
                ):
                    concatenated = mx.distributed.send(
-                        concatenated, self.next_pipeline_rank, group=self.group
+                        concatenated, self.next_rank, group=self.group
                    )
                    mx.async_eval(concatenated)

        elif self.has_joint_blocks and not self.is_last_stage:
            assert encoder_hidden_states is not None
-            assert self.next_pipeline_rank is not None
-            with trace(
-                name=f"send {self.next_pipeline_rank}",
-                rank=self.rank,
-                category="comms",
-            ):
+            with trace(name=f"send {self.next_rank}", rank=self.rank, category="comms"):
                hidden_states = mx.distributed.send(
-                    hidden_states, self.next_pipeline_rank, group=self.group
+                    hidden_states, self.next_rank, group=self.group
                )
                encoder_hidden_states = mx.distributed.send(
-                    encoder_hidden_states, self.next_pipeline_rank, group=self.group
+                    encoder_hidden_states, self.next_rank, group=self.group
                )
                mx.async_eval(hidden_states, encoder_hidden_states)

        if self.has_single_blocks:
            if not self.owns_concat_stage and not self.is_first_stage:
-                assert self.prev_pipeline_rank is not None
                with trace(
-                    name=f"recv {self.prev_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"recv {self.prev_rank}", rank=self.rank, category="comms"
                ):
                    hidden_states = mx.distributed.recv(
                        (batch_size, text_seq_len + num_img_tokens, hidden_dim),
                        dtype,
-                        self.prev_pipeline_rank,
+                        self.prev_rank,
                        group=self.group,
                    )
                    mx.eval(hidden_states)
@@ -875,14 +689,11 @@ class DiffusionRunner:
                    mx.eval(hidden_states)

            if not self.is_last_stage:
-                assert self.next_pipeline_rank is not None
                with trace(
-                    name=f"send {self.next_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"send {self.next_rank}", rank=self.rank, category="comms"
                ):
                    hidden_states = mx.distributed.send(
-                        hidden_states, self.next_pipeline_rank, group=self.group
+                        hidden_states, self.next_rank, group=self.group
                    )
                    mx.async_eval(hidden_states)

@@ -905,67 +716,83 @@ class DiffusionRunner:
        kontext_image_ids: mx.array | None = None,
    ) -> mx.array:
        prev_latents = hidden_states
+        needs_cfg = self.adapter.needs_cfg
        cond_image_grid = prompt_data.cond_image_grid

        scaled_hidden_states = config.scheduler.scale_model_input(hidden_states, t)  # pyright: ignore[reportAny]
        original_latent_tokens: int = scaled_hidden_states.shape[1]  # pyright: ignore[reportAny]

-        results: list[tuple[bool, mx.array]] = []
-
-        for branch in self._get_cfg_branches(prompt_data):
+        if needs_cfg:
+            batched_data = prompt_data.get_batched_cfg_data()
+            assert batched_data is not None, "CFG model must provide batched data"
+            prompt_embeds, encoder_mask, batched_pooled, cond_latents = batched_data
            pooled_embeds = (
-                branch.pooled if branch.pooled is not None else branch.embeds
+                batched_pooled if batched_pooled is not None else prompt_embeds
            )
-
-            cond_latents = branch.cond_latents
-            if cond_latents is not None:
-                num_img_tokens: int = original_latent_tokens + cond_latents.shape[1]
-            else:
-                num_img_tokens = original_latent_tokens
-
-            step_latents: mx.array = scaled_hidden_states  # pyright: ignore[reportAny]
-            if self.is_first_stage and cond_latents is not None:
-                step_latents = mx.concatenate([step_latents, cond_latents], axis=1)
-
-            text_seq_len = branch.embeds.shape[1]
-            self._ensure_wrappers(text_seq_len, branch.mask)
-
-            noise = self._run_sync_pass(
-                t,
-                config,
-                step_latents,
-                branch.embeds,
-                pooled_embeds,
-                branch.mask,
-                cond_image_grid,
-                kontext_image_ids,
-                num_img_tokens,
-                original_latent_tokens,
-                cond_latents,
+            step_latents = mx.concatenate(
+                [scaled_hidden_states, scaled_hidden_states], axis=0
            )
+        else:
+            prompt_embeds = prompt_data.prompt_embeds
+            pooled_embeds = prompt_data.pooled_prompt_embeds
+            encoder_mask = prompt_data.get_encoder_hidden_states_mask(positive=True)
+            cond_latents = prompt_data.conditioning_latents
+            step_latents = scaled_hidden_states  # pyright: ignore[reportAny]

-            if self.is_last_stage:
-                assert noise is not None
-                results.append((branch.positive, noise))
+        if cond_latents is not None:
+            num_img_tokens: int = original_latent_tokens + cond_latents.shape[1]
+        else:
+            num_img_tokens = original_latent_tokens
+
+        if self.is_first_stage and cond_latents is not None:
+            step_latents = mx.concatenate([step_latents, cond_latents], axis=1)
+
+        text_seq_len = prompt_embeds.shape[1]
+        self._ensure_wrappers(text_seq_len, encoder_mask)
+
+        noise = self._run_sync_pass(
+            t,
+            config,
+            step_latents,
+            prompt_embeds,
+            pooled_embeds,
+            encoder_mask,
+            cond_image_grid,
+            kontext_image_ids,
+            num_img_tokens,
+            original_latent_tokens,
+            cond_latents,
+        )

        if self.is_last_stage:
-            noise = self._combine_cfg_results(results)
+            assert noise is not None
+            if needs_cfg:
+                noise_pos, noise_neg = mx.split(noise, 2, axis=0)
+                guidance_scale = self._get_effective_guidance_scale()
+                assert guidance_scale is not None
+                noise = self.adapter.apply_guidance(
+                    noise_pos, noise_neg, guidance_scale
+                )

            hidden_states = config.scheduler.step(  # pyright: ignore[reportAny]
                noise=noise, timestep=t, latents=prev_latents
            )

            if not self.is_first_stage:
-                hidden_states = mx.distributed.send(
-                    hidden_states, self.first_pipeline_rank, group=self.group
-                )
-                mx.async_eval(hidden_states)
+                with trace(name="send 0", rank=self.rank, category="comms"):
+                    hidden_states = mx.distributed.send(
+                        hidden_states, 0, group=self.group
+                    )
+                    mx.async_eval(hidden_states)

        elif self.is_first_stage:
-            hidden_states = mx.distributed.recv_like(
-                prev_latents, src=self.last_pipeline_rank, group=self.group
-            )
-            mx.eval(hidden_states)
+            with trace(
+                name=f"recv {self.world_size - 1}", rank=self.rank, category="comms"
+            ):
+                hidden_states = mx.distributed.recv_like(
+                    prev_latents, src=self.world_size - 1, group=self.group
+                )
+                mx.eval(hidden_states)

        else:
            hidden_states = prev_latents
@@ -982,10 +809,39 @@ class DiffusionRunner:
        kontext_image_ids: mx.array | None = None,
    ) -> mx.array:
        patch_latents, token_indices = self._create_patches(latents, config)
+        needs_cfg = self.adapter.needs_cfg
        cond_image_grid = prompt_data.cond_image_grid

-        prev_patch_latents = [p for p in patch_latents]
+        if needs_cfg:
+            batched_data = prompt_data.get_batched_cfg_data()
+            assert batched_data is not None, "CFG model must provide batched data"
+            prompt_embeds, encoder_mask, batched_pooled, _ = batched_data
+            pooled_embeds = (
+                batched_pooled if batched_pooled is not None else prompt_embeds
+            )
+        else:
+            prompt_embeds = prompt_data.prompt_embeds
+            pooled_embeds = prompt_data.pooled_prompt_embeds
+            encoder_mask = prompt_data.get_encoder_hidden_states_mask(positive=True)

+        text_seq_len = prompt_embeds.shape[1]
+        self._ensure_wrappers(text_seq_len, encoder_mask)
+        self._set_text_seq_len(text_seq_len)
+
+        if self.joint_block_wrappers:
+            for wrapper in self.joint_block_wrappers:
+                wrapper.set_encoder_mask(encoder_mask)
+
+        text_embeddings = self.adapter.compute_text_embeddings(t, config, pooled_embeds)
+        image_rotary_embeddings = self.adapter.compute_rotary_embeddings(
+            prompt_embeds,
+            config,
+            encoder_hidden_states_mask=encoder_mask,
+            cond_image_grid=cond_image_grid,
+            kontext_image_ids=kontext_image_ids,
+        )
+
+        prev_patch_latents = [p for p in patch_latents]
        encoder_hidden_states: mx.array | None = None

        for patch_idx in range(len(patch_latents)):
@@ -997,57 +853,34 @@ class DiffusionRunner:
                and not is_first_async_step
            ):
                with trace(
-                    name=f"recv {self.last_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"recv {self.prev_rank}", rank=self.rank, category="comms"
                ):
                    patch = mx.distributed.recv_like(
-                        patch, src=self.last_pipeline_rank, group=self.group
+                        patch, src=self.prev_rank, group=self.group
                    )
                    mx.eval(patch)

-            results: list[tuple[bool, mx.array]] = []
+            step_patch = mx.concatenate([patch, patch], axis=0) if needs_cfg else patch

-            for branch in self._get_cfg_branches(prompt_data):
-                pooled_embeds = (
-                    branch.pooled if branch.pooled is not None else branch.embeds
-                )
-
-                text_seq_len = branch.embeds.shape[1]
-                self._ensure_wrappers(text_seq_len, branch.mask)
-                self._set_text_seq_len(text_seq_len)
-
-                if self.joint_block_wrappers:
-                    for wrapper in self.joint_block_wrappers:
-                        wrapper.set_encoder_mask(branch.mask)
-
-                text_embeddings = self.adapter.compute_text_embeddings(
-                    t, config, pooled_embeds
-                )
-                image_rotary_embeddings = self.adapter.compute_rotary_embeddings(
-                    branch.embeds,
-                    config,
-                    encoder_hidden_states_mask=branch.mask,
-                    cond_image_grid=cond_image_grid,
-                    kontext_image_ids=kontext_image_ids,
-                )
-
-                noise, encoder_hidden_states = self._run_single_patch_pass(
-                    patch=patch,
-                    patch_idx=patch_idx,
-                    token_indices=token_indices[patch_idx],
-                    prompt_embeds=branch.embeds,
-                    text_embeddings=text_embeddings,
-                    image_rotary_embeddings=image_rotary_embeddings,
-                    encoder_hidden_states=encoder_hidden_states,
-                )
-
-                if self.is_last_stage:
-                    assert noise is not None
-                    results.append((branch.positive, noise))
+            noise, encoder_hidden_states = self._run_single_patch_pass(
+                patch=step_patch,
+                patch_idx=patch_idx,
+                token_indices=token_indices[patch_idx],
+                prompt_embeds=prompt_embeds,
+                text_embeddings=text_embeddings,
+                image_rotary_embeddings=image_rotary_embeddings,
+                encoder_hidden_states=encoder_hidden_states,
+            )

            if self.is_last_stage:
-                noise = self._combine_cfg_results(results)
+                assert noise is not None
+                if needs_cfg:
+                    noise_pos, noise_neg = mx.split(noise, 2, axis=0)
+                    guidance_scale = self._get_effective_guidance_scale()
+                    assert guidance_scale is not None
+                    noise = self.adapter.apply_guidance(
+                        noise_pos, noise_neg, guidance_scale
+                    )

                patch_latents[patch_idx] = config.scheduler.step(  # pyright: ignore[reportAny]
                    noise=noise,
@@ -1057,14 +890,10 @@ class DiffusionRunner:

                if not self.is_first_stage and t != config.num_inference_steps - 1:
                    with trace(
-                        name=f"send {self.first_pipeline_rank}",
-                        rank=self.rank,
-                        category="comms",
+                        name=f"send {self.next_rank}", rank=self.rank, category="comms"
                    ):
                        patch_latents[patch_idx] = mx.distributed.send(
-                            patch_latents[patch_idx],
-                            self.first_pipeline_rank,
-                            group=self.group,
+                            patch_latents[patch_idx], self.next_rank, group=self.group
                        )
                        mx.async_eval(patch_latents[patch_idx])

@@ -1104,31 +933,26 @@ class DiffusionRunner:

        if self.has_joint_blocks:
            if not self.is_first_stage:
-                assert self.prev_pipeline_rank is not None
                patch_len = patch.shape[1]
                with trace(
-                    name=f"recv {self.prev_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"recv {self.prev_rank}", rank=self.rank, category="comms"
                ):
                    patch = mx.distributed.recv(
                        (batch_size, patch_len, hidden_dim),
                        patch.dtype,
-                        self.prev_pipeline_rank,
+                        self.prev_rank,
                        group=self.group,
                    )
                    mx.eval(patch)

                if patch_idx == 0:
                    with trace(
-                        name=f"recv {self.prev_pipeline_rank}",
-                        rank=self.rank,
-                        category="comms",
+                        name=f"recv {self.prev_rank}", rank=self.rank, category="comms"
                    ):
                        encoder_hidden_states = mx.distributed.recv(
                            (batch_size, text_seq_len, hidden_dim),
                            patch.dtype,
-                            self.prev_pipeline_rank,
+                            self.prev_rank,
                            group=self.group,
                        )
                        mx.eval(encoder_hidden_states)
@@ -1164,54 +988,39 @@ class DiffusionRunner:
            if self.has_single_blocks or self.is_last_stage:
                patch = patch_concat
            else:
-                assert self.next_pipeline_rank is not None
                with trace(
-                    name=f"send {self.next_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"send {self.next_rank}", rank=self.rank, category="comms"
                ):
                    patch_concat = mx.distributed.send(
-                        patch_concat, self.next_pipeline_rank, group=self.group
+                        patch_concat, self.next_rank, group=self.group
                    )
                    mx.async_eval(patch_concat)

        elif self.has_joint_blocks and not self.is_last_stage:
-            assert self.next_pipeline_rank is not None
-            with trace(
-                name=f"send {self.next_pipeline_rank}",
-                rank=self.rank,
-                category="comms",
-            ):
-                patch = mx.distributed.send(
-                    patch, self.next_pipeline_rank, group=self.group
-                )
+            with trace(name=f"send {self.next_rank}", rank=self.rank, category="comms"):
+                patch = mx.distributed.send(patch, self.next_rank, group=self.group)
                mx.async_eval(patch)

            if patch_idx == 0:
                assert encoder_hidden_states is not None
                with trace(
-                    name=f"send {self.next_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"send {self.next_rank}", rank=self.rank, category="comms"
                ):
                    encoder_hidden_states = mx.distributed.send(
-                        encoder_hidden_states, self.next_pipeline_rank, group=self.group
+                        encoder_hidden_states, self.next_rank, group=self.group
                    )
                    mx.async_eval(encoder_hidden_states)

        if self.has_single_blocks:
            if not self.owns_concat_stage and not self.is_first_stage:
-                assert self.prev_pipeline_rank is not None
                patch_len = patch.shape[1]
                with trace(
-                    name=f"recv {self.prev_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"recv {self.prev_rank}", rank=self.rank, category="comms"
                ):
                    patch = mx.distributed.recv(
                        (batch_size, text_seq_len + patch_len, hidden_dim),
                        patch.dtype,
-                        self.prev_pipeline_rank,
+                        self.prev_rank,
                        group=self.group,
                    )
                    mx.eval(patch)
@@ -1234,20 +1043,15 @@ class DiffusionRunner:
                    mx.eval(patch)

            if not self.is_last_stage:
-                assert self.next_pipeline_rank is not None
                with trace(
-                    name=f"send {self.next_pipeline_rank}",
-                    rank=self.rank,
-                    category="comms",
+                    name=f"send {self.next_rank}", rank=self.rank, category="comms"
                ):
-                    patch = mx.distributed.send(
-                        patch, self.next_pipeline_rank, group=self.group
-                    )
+                    patch = mx.distributed.send(patch, self.next_rank, group=self.group)
                    mx.async_eval(patch)

        noise: mx.array | None = None
        if self.is_last_stage:
-            patch_img_only = patch[:, text_seq_len:, :]
-            noise = self.adapter.final_projection(patch_img_only, text_embeddings)
+            patch = patch[:, text_seq_len:, :]
+            noise = self.adapter.final_projection(patch, text_embeddings)

        return noise, encoder_hidden_states
--- a/src/exo/worker/engines/mlx/generator/generate.py
+++ b/src/exo/worker/engines/mlx/generator/generate.py
@@ -26,7 +26,6 @@ from exo.worker.engines.mlx.cache import KVPrefixCache, encode_prompt, make_kv_c
 from exo.worker.engines.mlx.constants import KV_BITS, KV_GROUP_SIZE, MAX_TOKENS
 from exo.worker.engines.mlx.utils_mlx import (
    apply_chat_template,
-    mx_barrier,
 )
 from exo.worker.runner.bootstrap import logger

@@ -130,10 +129,6 @@ def warmup_inference(

    logger.info("Generated ALL warmup tokens")

-    # TODO: Do we want an mx_barrier?
-    #  At least this version is actively incorrect, as it should use mx_barrier(group)
-    mx_barrier()
-
    return tokens_generated


@@ -330,5 +325,3 @@ def mlx_generate(
        # Limit accumulated_text to what's needed for stop sequence detection
        if max_stop_len > 0 and len(accumulated_text) > max_stop_len:
            accumulated_text = accumulated_text[-max_stop_len:]
-
-        # TODO: Do we want an mx_barrier?
--- a/src/exo/worker/engines/mlx/utils_mlx.py
+++ b/src/exo/worker/engines/mlx/utils_mlx.py
@@ -48,7 +48,6 @@ from exo.shared.types.worker.instances import (
    MlxRingInstance,
 )
 from exo.shared.types.worker.shards import (
-    CfgShardMetadata,
    PipelineShardMetadata,
    ShardMetadata,
    TensorShardMetadata,
@@ -67,8 +66,6 @@ Group = mx.distributed.Group
 resource.setrlimit(resource.RLIMIT_NOFILE, (2048, 4096))


-# TODO: Test this
-#  ALSO https://github.com/exo-explore/exo/pull/233#discussion_r2549683673
 def get_weights_size(model_shard_meta: ShardMetadata) -> Memory:
    return Memory.from_float_kb(
        (model_shard_meta.end_layer - model_shard_meta.start_layer)
@@ -86,30 +83,6 @@ class ModelLoadingTimeoutError(Exception):
    pass


-def mx_barrier(group: Group | None = None):
-    mx.eval(
-        mx.distributed.all_sum(
-            mx.array(1.0),
-            stream=mx.default_stream(mx.Device(mx.cpu)),
-            group=group,
-        )
-    )
-
-
-def broadcast_from_zero(value: int, group: Group | None = None):
-    if group is None:
-        return value
-
-    if group.rank() == 0:
-        a = mx.array([value], dtype=mx.int32)
-    else:
-        a = mx.array([0], dtype=mx.int32)
-
-    m = mx.distributed.all_sum(a, stream=mx.Device(mx.DeviceType.cpu), group=group)
-    mx.eval(m)
-    return int(m.item())
-
-
 class HostList(RootModel[list[str]]):
    @classmethod
    def from_hosts(cls, hosts: list[Host]) -> "HostList":
@@ -275,11 +248,6 @@ def shard_and_load(
            logger.info(f"loading model from {model_path} with pipeline parallelism")
            model = pipeline_auto_parallel(model, group, shard_metadata)
            eval_with_timeout(model.parameters(), timeout_seconds, on_timeout)
-        case CfgShardMetadata():
-            raise ValueError(
-                "CfgShardMetadata is not supported for text model loading - "
-                "this metadata type is only for image generation models"
-            )

    # TODO: Do we need this?
    mx.eval(model)
@@ -553,3 +521,23 @@ def mlx_cleanup(
    import gc

    gc.collect()
+
+
+def mx_any(bool_: bool, group: Group | None) -> bool:  # True if the reduced flag is > 0
+    if group is None:  # no group: the local flag is returned as-is
+        return bool_
+    num_true = mx.distributed.all_sum(  # reduce the flag over `group`
+        mx.array(bool_), group=group, stream=mx.default_stream(mx.Device(mx.cpu))
+    )
+    mx.eval(num_true)  # evaluate the reduction before reading it back
+    return num_true.item() > 0
+
+
+def mx_barrier(group: Group | None):  # all_sum of a dummy scalar, used as a barrier (per the name)
+    if group is None:  # no group: nothing to wait for
+        return
+    mx.eval(  # evaluate the all_sum now; its value is discarded
+        mx.distributed.all_sum(
+            mx.array(1.0), group=group, stream=mx.default_stream(mx.Device(mx.cpu))
+        )
+    )
--- a/src/exo/worker/main.py
+++ b/src/exo/worker/main.py
@@ -32,6 +32,7 @@ from exo.shared.types.events import (
 from exo.shared.types.multiaddr import Multiaddr
 from exo.shared.types.state import State
 from exo.shared.types.tasks import (
+    CancelTask,
    CreateRunner,
    DownloadModel,
    ImageEdits,
@@ -98,23 +99,22 @@ class Worker:
        info_send, info_recv = channel[GatheredInfo]()
        info_gatherer: InfoGatherer = InfoGatherer(info_send)

-        try:
-            async with self._tg as tg:
-                tg.start_soon(info_gatherer.run)
-                tg.start_soon(self._forward_info, info_recv)
-                tg.start_soon(self.plan_step)
-                tg.start_soon(self._resend_out_for_delivery)
-                tg.start_soon(self._event_applier)
-                tg.start_soon(self._forward_events)
-                tg.start_soon(self._poll_connection_updates)
-        finally:
-            # Actual shutdown code - waits for all tasks to complete before executing.
-            logger.info("Stopping Worker")
-            self.local_event_sender.close()
-            self.command_sender.close()
-            self.download_command_sender.close()
+        async with self._tg as tg:
+            tg.start_soon(info_gatherer.run)
+            tg.start_soon(self._forward_info, info_recv)
+            tg.start_soon(self.plan_step)
+            tg.start_soon(self._resend_out_for_delivery)
+            tg.start_soon(self._event_applier)
+            tg.start_soon(self._forward_events)
+            tg.start_soon(self._poll_connection_updates)
+
+        # Actual shutdown code - waits for all tasks to complete before executing.
+        self.local_event_sender.close()
+        self.command_sender.close()
+        self.download_command_sender.close()
+        async with create_task_group() as tg:
            for runner in self.runners.values():
-                runner.shutdown()
+                tg.start_soon(runner.shutdown)

    async def _forward_info(self, recv: Receiver[GatheredInfo]):
        with recv as info_stream:
@@ -218,15 +218,22 @@ class Worker:
                        )
                    )
                case Shutdown(runner_id=runner_id):
+                    runner = self.runners.pop(runner_id)
                    try:
                        with fail_after(3):
-                            await self.runners.pop(runner_id).start_task(task)
+                            await runner.start_task(task)
                    except TimeoutError:
                        await self.event_sender.send(
                            TaskStatusUpdated(
                                task_id=task.task_id, task_status=TaskStatus.TimedOut
                            )
                        )
+                    finally:
+                        await runner.shutdown()
+                case CancelTask(cancelled_task_id=cancelled_task_id):
+                    await self.runners[self._task_to_runner_id(task)].cancel_task(
+                        cancelled_task_id
+                    )
                case ImageEdits() if task.task_params.total_input_chunks > 0:
                    # Assemble image from chunks and inject into task
                    cmd_id = task.command_id
@@ -314,8 +321,6 @@ class Worker:
            for event in self.out_for_delivery.copy().values():
                await self.local_event_sender.send(event)

-    ## Op Executors
-
    def _create_supervisor(self, task: CreateRunner) -> RunnerSupervisor:
        """Creates and stores a new AssignedRunner with initial downloading status."""
        runner = RunnerSupervisor.create(
--- a/src/exo/worker/plan.py
+++ b/src/exo/worker/plan.py
@@ -4,6 +4,7 @@ from collections.abc import Mapping, Sequence

 from exo.shared.types.common import CommandId, NodeId
 from exo.shared.types.tasks import (
+    CancelTask,
    ConnectToGroup,
    CreateRunner,
    DownloadModel,
@@ -53,13 +54,14 @@ def plan(
 ) -> Task | None:
    # Python short circuiting OR logic should evaluate these sequentially.
    return (
-        _kill_runner(runners, all_runners, instances)
+        _cancel_tasks(runners, tasks)
+        or _kill_runner(runners, all_runners, instances)
        or _create_runner(node_id, runners, instances)
        or _model_needs_download(node_id, runners, global_download_status)
        or _init_distributed_backend(runners, all_runners)
        or _load_model(runners, all_runners, global_download_status)
        or _ready_to_warmup(runners, all_runners)
-        or _pending_tasks(runners, tasks, all_runners, input_chunk_buffer)
+        or _pending_tasks(runners, tasks, all_runners, input_chunk_buffer or {})
    )


@@ -270,7 +272,7 @@ def _pending_tasks(
    runners: Mapping[RunnerId, RunnerSupervisor],
    tasks: Mapping[TaskId, Task],
    all_runners: Mapping[RunnerId, RunnerStatus],
-    input_chunk_buffer: Mapping[CommandId, dict[int, str]] | None = None,
+    input_chunk_buffer: Mapping[CommandId, dict[int, str]],
 ) -> Task | None:
    for task in tasks.values():
        # for now, just forward chat completions
@@ -284,7 +286,7 @@ def _pending_tasks(
        if isinstance(task, ImageEdits) and task.task_params.total_input_chunks > 0:
            cmd_id = task.command_id
            expected = task.task_params.total_input_chunks
-            received = len((input_chunk_buffer or {}).get(cmd_id, {}))
+            received = len(input_chunk_buffer.get(cmd_id, {}))
            if received < expected:
                continue  # Wait for all chunks to arrive

@@ -292,16 +294,31 @@ def _pending_tasks(
            if task.instance_id != runner.bound_instance.instance.instance_id:
                continue

-            # I have a design point here; this is a state race in disguise as the task status doesn't get updated to completed fast enough
-            # however, realistically the task status should be set to completed by the LAST runner, so this is a true race
-            # the actual solution is somewhat deeper than this bypass - TODO!
+            # the task status _should_ be set to completed by the LAST runner
+            # it is currently set by the first
+            # this is definitely a hack
            if task.task_id in runner.completed:
                continue

-            # TODO: Check ordering aligns with MLX distributeds expectations.
-
            if isinstance(runner.status, RunnerReady) and all(
                isinstance(all_runners[global_runner_id], (RunnerReady, RunnerRunning))
                for global_runner_id in runner.bound_instance.instance.shard_assignments.runner_to_shard
            ):
                return task
+
+
+def _cancel_tasks(
+    runners: Mapping[RunnerId, RunnerSupervisor],
+    tasks: Mapping[TaskId, Task],
+) -> Task | None:  # CancelTask for the first Cancelled task a local runner has not handled yet
+    for task in tasks.values():
+        if task.task_status != TaskStatus.Cancelled:
+            continue
+        for runner in runners.values():
+            if task.instance_id != runner.bound_instance.instance.instance_id:
+                continue
+            if task.task_id in runner.cancelled or task.task_id in runner.completed:
+                continue  # completed ids never enter runner.cancelled; skipping avoids re-planning forever
+            return CancelTask(
+                instance_id=task.instance_id, cancelled_task_id=task.task_id
+            )
--- a/src/exo/worker/runner/bootstrap.py
+++ b/src/exo/worker/runner/bootstrap.py
@@ -3,7 +3,7 @@ import os
 import loguru

 from exo.shared.types.events import Event, RunnerStatusUpdated
-from exo.shared.types.tasks import Task
+from exo.shared.types.tasks import Task, TaskId
 from exo.shared.types.worker.instances import BoundInstance, MlxJacclInstance
 from exo.shared.types.worker.runners import RunnerFailed
 from exo.utils.channels import ClosedResourceError, MpReceiver, MpSender
@@ -15,6 +15,7 @@ def entrypoint(
    bound_instance: BoundInstance,
    event_sender: MpSender[Event],
    task_receiver: MpReceiver[Task],
+    cancel_receiver: MpReceiver[TaskId],
    _logger: "loguru.Logger",
 ) -> None:
    fast_synch_override = os.environ.get("EXO_FAST_SYNCH")
@@ -38,7 +39,7 @@ def entrypoint(
    try:
        from exo.worker.runner.runner import main

-        main(bound_instance, event_sender, task_receiver)
+        main(bound_instance, event_sender, task_receiver, cancel_receiver)
    except ClosedResourceError:
        logger.warning("Runner communication closed unexpectedly")
    except Exception as e:
--- a/src/exo/worker/runner/runner.py
+++ b/src/exo/worker/runner/runner.py
@@ -1,5 +1,6 @@
 import base64
 import json
+import math
 import time
 from collections.abc import Generator
 from functools import cache
@@ -66,11 +67,7 @@ from exo.shared.types.worker.runners import (
    RunnerStatus,
    RunnerWarmingUp,
 )
-from exo.shared.types.worker.shards import (
-    CfgShardMetadata,
-    PipelineShardMetadata,
-    ShardMetadata,
-)
+from exo.shared.types.worker.shards import ShardMetadata
 from exo.utils.channels import MpReceiver, MpSender
 from exo.worker.engines.image import (
    DistributedImageModel,
@@ -87,30 +84,16 @@ from exo.worker.engines.mlx.utils_mlx import (
    initialize_mlx,
    load_mlx_items,
    mlx_force_oom,
+    mx_any,
 )
 from exo.worker.runner.bootstrap import logger


-def _is_primary_output_node(shard_metadata: ShardMetadata) -> bool:
-    """Check if this node is the primary output node for image generation.
-
-    For CFG models: the last pipeline stage in CFG group 0 (positive prompt).
-    For non-CFG models: the last pipeline stage.
-    """
-    if isinstance(shard_metadata, CfgShardMetadata):
-        is_pipeline_last = (
-            shard_metadata.pipeline_rank == shard_metadata.pipeline_world_size - 1
-        )
-        return is_pipeline_last and shard_metadata.cfg_rank == 0
-    elif isinstance(shard_metadata, PipelineShardMetadata):
-        return shard_metadata.device_rank == shard_metadata.world_size - 1
-    return False
-
-
 def main(
    bound_instance: BoundInstance,
    event_sender: MpSender[Event],
    task_receiver: MpReceiver[Task],
+    cancel_receiver: MpReceiver[TaskId],
 ):
    instance, runner_id, shard_metadata = (
        bound_instance.instance,
@@ -125,11 +108,15 @@ def main(
        time.sleep(timeout)

    setup_start_time = time.time()
+    cancelled_tasks = set[TaskId]()

-    model: Model | DistributedImageModel | None = None
+    # type checker was unhappy with me - splitting these fixed it
+    inference_model: Model | None = None
+    image_model: DistributedImageModel | None = None
    tokenizer = None
    group = None
    kv_prefix_cache: KVPrefixCache | None = None
+    check_for_cancel_every: int | None = None

    current_status: RunnerStatus = RunnerIdle()
    logger.info("runner created")
@@ -142,6 +129,7 @@ def main(
            if task.task_id in seen:
                logger.warning("repeat task - potential error")
            seen.add(task.task_id)
+            cancelled_tasks.discard(TaskId("CANCEL_CURRENT_TASK"))
            event_sender.send(
                TaskStatusUpdated(task_id=task.task_id, task_status=TaskStatus.Running)
            )
@@ -186,7 +174,7 @@ def main(
                        time.sleep(0.5)

                    if ModelTask.TextGeneration in shard_metadata.model_card.tasks:
-                        model, tokenizer = load_mlx_items(
+                        inference_model, tokenizer = load_mlx_items(
                            bound_instance, group, on_timeout=on_model_load_timeout
                        )
                        logger.info(
@@ -198,7 +186,7 @@ def main(
                        ModelTask.TextToImage in shard_metadata.model_card.tasks
                        or ModelTask.ImageToImage in shard_metadata.model_card.tasks
                    ):
-                        model = initialize_image_model(bound_instance)
+                        image_model = initialize_image_model(bound_instance)
                    else:
                        raise ValueError(
                            f"Unknown model task(s): {shard_metadata.model_card.tasks}"
@@ -206,8 +194,6 @@ def main(
                    current_status = RunnerLoaded()
                    logger.info("runner loaded")
                case StartWarmup() if isinstance(current_status, RunnerLoaded):
-                    assert model
-
                    current_status = RunnerWarmingUp()
                    logger.info("runner warming up")
                    event_sender.send(
@@ -218,15 +204,30 @@ def main(

                     logger.info(f"warming up inference for instance: {instance}")
                     if ModelTask.TextGeneration in shard_metadata.model_card.tasks:
-                        assert not isinstance(model, DistributedImageModel)
+                        assert inference_model
                         assert tokenizer

+                        t = time.perf_counter()
                         toks = warmup_inference(
-                            model=model,
+                            model=inference_model,
                             tokenizer=tokenizer,
-                            # kv_prefix_cache=kv_prefix_cache,  # supply for warmup-time prefix caching
                         )
                         logger.info(f"warmed up by generating {toks} tokens")
+                        check_for_cancel_every = max(  # never 0: asserted truthy before generation starts
+                            1, min(math.ceil(toks / max(time.perf_counter() - t, 1e-9)), 100)
+                        )
+                        if group is not None:
+                            check_for_cancel_every = int(
+                                mx.max(
+                                    mx.distributed.all_gather(
+                                        mx.array([check_for_cancel_every]), group=group
+                                    )
+                                ).item()
+                            )
+
+                        logger.info(
+                            f"runner checking for cancellation every {check_for_cancel_every} tokens"
+                        )
                         logger.info(
                             f"runner initialized in {time.time() - setup_start_time} seconds"
                         )
@@ -234,8 +235,8 @@ def main(
                        ModelTask.TextToImage in shard_metadata.model_card.tasks
                        or ModelTask.ImageToImage in shard_metadata.model_card.tasks
                    ):
-                        assert isinstance(model, DistributedImageModel)
-                        image = warmup_image_generator(model=model)
+                        assert image_model
+                        image = warmup_image_generator(model=image_model)
                        if image is not None:
                            logger.info(f"warmed up by generating {image.size} image")
                        else:
@@ -254,8 +255,9 @@ def main(
                            runner_id=runner_id, runner_status=current_status
                        )
                    )
-                    assert model and not isinstance(model, DistributedImageModel)
+                    assert inference_model
                    assert tokenizer
+                    assert check_for_cancel_every

                    try:
                        _check_for_debug_prompts(task_params)
@@ -265,7 +267,7 @@ def main(

                        # Generate responses using the actual MLX generation
                        mlx_generator = mlx_generate(
-                            model=model,
+                            model=inference_model,
                            tokenizer=tokenizer,
                            task=task_params,
                            prompt=prompt,
@@ -289,11 +291,11 @@ def main(
                            patch_glm_tokenizer(tokenizer)

                        # GPT-OSS specific parsing to match other model formats.
-                        elif isinstance(model, GptOssModel):
+                        elif isinstance(inference_model, GptOssModel):
                            mlx_generator = parse_gpt_oss(mlx_generator)

                        if tokenizer.has_tool_calling and not isinstance(
-                            model, GptOssModel
+                            inference_model, GptOssModel
                        ):
                            assert tokenizer.tool_call_start
                            assert tokenizer.tool_call_end
@@ -306,7 +308,18 @@ def main(
                            )

                        completion_tokens = 0
+                        tokens_since_last_cancel_check = 0
                        for response in mlx_generator:
+                            tokens_since_last_cancel_check += 1
+                            if tokens_since_last_cancel_check >= check_for_cancel_every:
+                                tokens_since_last_cancel_check = 0
+                                cancelled_tasks.update(cancel_receiver.collect())
+                                want_to_cancel = (task.task_id in cancelled_tasks) or (
+                                    TaskId("CANCEL_CURRENT_TASK") in cancelled_tasks
+                                )
+                                if mx_any(want_to_cancel, group):
+                                    break
+
                            match response:
                                case GenerationResponse():
                                    completion_tokens += 1
@@ -376,7 +389,7 @@ def main(
                case ImageGeneration(
                    task_params=task_params, command_id=command_id
                ) if isinstance(current_status, RunnerReady):
-                    assert isinstance(model, DistributedImageModel)
+                    assert image_model
                    logger.info(f"received image generation request: {str(task)[:500]}")
                    current_status = RunnerRunning()
                    logger.info("runner running")
@@ -387,11 +400,16 @@ def main(
                    )

                    try:
+                        # Generate images using the image generation backend
+                        # Track image_index for final images only
                        image_index = 0
-                        for response in generate_image(model=model, task=task_params):
-                            is_primary_output = _is_primary_output_node(shard_metadata)
-
-                            if is_primary_output:
+                        for response in generate_image(
+                            model=image_model, task=task_params
+                        ):
+                            if (
+                                shard_metadata.device_rank
+                                == shard_metadata.world_size - 1
+                            ):
                                match response:
                                    case PartialImageResponse():
                                        logger.info(
@@ -416,7 +434,7 @@ def main(
                                        image_index += 1
                    # can we make this more explicit?
                    except Exception as e:
-                        if _is_primary_output_node(shard_metadata):
+                        if shard_metadata.device_rank == shard_metadata.world_size - 1:
                            event_sender.send(
                                ChunkGenerated(
                                    command_id=command_id,
@@ -438,7 +456,7 @@ def main(
                case ImageEdits(task_params=task_params, command_id=command_id) if (
                    isinstance(current_status, RunnerReady)
                ):
-                    assert isinstance(model, DistributedImageModel)
+                    assert image_model
                    logger.info(f"received image edits request: {str(task)[:500]}")
                    current_status = RunnerRunning()
                    logger.info("runner running")
@@ -450,8 +468,13 @@ def main(

                    try:
                        image_index = 0
-                        for response in generate_image(model=model, task=task_params):
-                            if _is_primary_output_node(shard_metadata):
+                        for response in generate_image(
+                            model=image_model, task=task_params
+                        ):
+                            if (
+                                shard_metadata.device_rank
+                                == shard_metadata.world_size - 1
+                            ):
                                match response:
                                    case PartialImageResponse():
                                        logger.info(
@@ -475,7 +498,7 @@ def main(
                                        )
                                        image_index += 1
                    except Exception as e:
-                        if _is_primary_output_node(shard_metadata):
+                        if shard_metadata.device_rank == shard_metadata.world_size - 1:
                            event_sender.send(
                                ChunkGenerated(
                                    command_id=command_id,
@@ -514,7 +537,7 @@ def main(
                RunnerStatusUpdated(runner_id=runner_id, runner_status=current_status)
            )
            if isinstance(current_status, RunnerShutdown):
-                del model, tokenizer, group
+                del inference_model, image_model, tokenizer, group
                mx.clear_cache()
                import gc

--- a/src/exo/worker/runner/runner_supervisor.py
+++ b/src/exo/worker/runner/runner_supervisor.py
@@ -8,8 +8,10 @@ import anyio
 from anyio import (
    BrokenResourceError,
    ClosedResourceError,
+    create_task_group,
    to_thread,
 )
+from anyio.abc import TaskGroup
 from loguru import logger

 from exo.shared.types.events import (
@@ -47,9 +49,12 @@ class RunnerSupervisor:
    _ev_recv: MpReceiver[Event]
    _task_sender: MpSender[Task]
    _event_sender: Sender[Event]
+    _cancel_sender: MpSender[TaskId]
+    _tg: TaskGroup = field(default_factory=create_task_group, init=False)
    status: RunnerStatus = field(default_factory=RunnerIdle, init=False)
    pending: dict[TaskId, anyio.Event] = field(default_factory=dict, init=False)
    completed: set[TaskId] = field(default_factory=set, init=False)
+    cancelled: set[TaskId] = field(default_factory=set, init=False)

    @classmethod
    def create(
@@ -60,8 +65,8 @@ class RunnerSupervisor:
        initialize_timeout: float = 400,
    ) -> Self:
        ev_send, ev_recv = mp_channel[Event]()
-        # A task is kind of a runner command
        task_sender, task_recv = mp_channel[Task]()
+        cancel_sender, cancel_recv = mp_channel[TaskId]()

        runner_process = Process(
            target=entrypoint,
@@ -69,6 +74,7 @@ class RunnerSupervisor:
                bound_instance,
                ev_send,
                task_recv,
+                cancel_recv,
                logger,
            ),
            daemon=True,
@@ -83,6 +89,7 @@ class RunnerSupervisor:
            initialize_timeout=initialize_timeout,
            _ev_recv=ev_recv,
            _task_sender=task_sender,
+            _cancel_sender=cancel_sender,
            _event_sender=event_sender,
        )

@@ -90,36 +97,43 @@ class RunnerSupervisor:

     async def run(self):
         self.runner_process.start()
-        await self._forward_events()
+        async with self._tg as tg:
+            tg.start_soon(self._forward_events)

+        with anyio.CancelScope(shield=True), contextlib.suppress(ClosedResourceError):
+            await self._cancel_sender.send_async(TaskId("CANCEL_CURRENT_TASK"))

-    def shutdown(self):
-        logger.info("Runner supervisor shutting down")
-        self._ev_recv.close()
-        self._task_sender.close()
-        self._event_sender.close()
-        self.runner_process.join(1)
-        if not self.runner_process.is_alive():
-            logger.info("Runner process succesfully terminated")
-            return
+            self._ev_recv.close()
+            self._task_sender.close()
+            self._event_sender.close()
+            self._cancel_sender.close()

-        # This is overkill but it's not technically bad, just unnecessary.
-        logger.warning("Runner process didn't shutdown succesfully, terminating")
-        self.runner_process.terminate()
-        self.runner_process.join(5)
-        if not self.runner_process.is_alive():
-            return
+            await to_thread.run_sync(self.runner_process.join, 10)
+            if not self.runner_process.is_alive():
+                return

-        logger.critical("Runner process didn't respond to SIGTERM, killing")
-        self.runner_process.kill()
+            # This is overkill but it's not technically bad, just unnecessary.
+            logger.warning("Runner process didn't shutdown succesfully, terminating")
+            self.runner_process.terminate()
+            await to_thread.run_sync(self.runner_process.join, 5)
+            if not self.runner_process.is_alive():
+                return

-        self.runner_process.join(5)
-        if not self.runner_process.is_alive():
-            return
+            logger.critical("Runner process didn't respond to SIGTERM, killing")
+            self.runner_process.kill()

-        logger.critical(
-            "Runner process didn't respond to SIGKILL. System resources may have leaked"
-        )
+            await to_thread.run_sync(self.runner_process.join, 5)
+            if not self.runner_process.is_alive():
+                return
+
+            logger.critical(
+                "Runner process didn't respond to SIGKILL. System resources may have leaked"
+            )
+
+    async def shutdown(self):
+        with contextlib.suppress(ClosedResourceError):  # run() closes this sender on teardown; repeat calls must not raise
+            await self._cancel_sender.send_async(TaskId("CANCEL_CURRENT_TASK"))
+        self._tg.cancel_scope.cancel()

     async def start_task(self, task: Task):
         if task.task_id in self.pending:
@@ -142,6 +155,13 @@ class RunnerSupervisor:
            return
        await event.wait()

+    async def cancel_task(self, task_id: TaskId):  # forward a cancellation for task_id to the runner process
+        if task_id in self.completed:  # already finished: nothing is recorded or sent
+            logger.info(f"Unable to cancel {task_id} as it has been completed")
+            return
+        self.cancelled.add(task_id)  # remembered so the same cancellation is not issued again
+        await self._cancel_sender.send_async(task_id)  # sending half of the channel whose receiver is passed to the runner entrypoint
+
    async def _forward_events(self):
        with self._ev_recv as events:
            try:
@@ -206,4 +226,4 @@ class RunnerSupervisor:
                runner_status=RunnerFailed(error_message=f"Terminated ({cause})"),
            )
        )
-        self.shutdown()
+        await self.shutdown()
--- a/src/exo/worker/tests/unittests/test_runner/test_event_ordering.py
+++ b/src/exo/worker/tests/unittests/test_runner/test_event_ordering.py
@@ -2,6 +2,7 @@
 from collections.abc import Iterable
 from typing import Callable

+import mlx.core as mx
 import pytest

 import exo.worker.runner.runner as mlx_runner
@@ -19,6 +20,7 @@ from exo.shared.types.tasks import (
    Shutdown,
    StartWarmup,
    Task,
+    TaskId,
    TaskStatus,
    TextGeneration,
 )
@@ -113,6 +115,8 @@ def patch_out_mlx(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setattr(mlx_runner, "load_mlx_items", make_nothin((1, MockTokenizer)))
     monkeypatch.setattr(mlx_runner, "warmup_inference", make_nothin(1))
     monkeypatch.setattr(mlx_runner, "_check_for_debug_prompts", nothin)
+    monkeypatch.setattr(mx.distributed, "all_gather", make_nothin(mx.array([1])))  # runner calls mx.distributed.all_gather
+    monkeypatch.setattr(mlx_runner, "mx_any", make_nothin(False))
     # Mock apply_chat_template since we're using a fake tokenizer (integer 1).
     # Returns a prompt without thinking tag so detect_thinking_prompt_suffix returns None.
     monkeypatch.setattr(mlx_runner, "apply_chat_template", make_nothin("test prompt"))
@@ -163,6 +167,7 @@ def _run(tasks: Iterable[Task]):
    )

    task_sender, task_receiver = mp_channel[Task]()
+    _cancel_sender, cancel_receiver = mp_channel[TaskId]()
    event_sender = EventCollector()

    with task_sender:
@@ -174,7 +179,7 @@ def _run(tasks: Iterable[Task]):
        task_receiver.close = nothin
        task_receiver.join = nothin

-        mlx_runner.main(bound_instance, event_sender, task_receiver)  # type: ignore[arg-type]
+        mlx_runner.main(bound_instance, event_sender, task_receiver, cancel_receiver)  # pyright: ignore[reportArgumentType]

        return event_sender.events

--- a/tests/run_exo_on.sh
+++ b/tests/run_exo_on.sh
@@ -22,7 +22,7 @@ echo "Deploying $commit to $# hosts..."
 hosts=("$@")
 cleanup() {
  for host in "${hosts[@]}"; do
-    ssh -T -o BatchMode=yes "$host@$host" "pkill -f bin/exo" &
+    ssh -T -o BatchMode=yes "$host@$host" "pkill -SIGINT -of exo-env" &
  done
  wait
  jobs -pr | xargs -r kill 2>/dev/null || true
@@ -34,13 +34,21 @@ reset=$'\e[0m'
 i=0
 for host; do
  colour=${colours[i++ % 4]}
-  ssh -T -o BatchMode=yes -o ServerAliveInterval=30 "$host@$host" \
-    "/nix/var/nix/profiles/default/bin/nix run github:exo-explore/exo/$commit#exo" \
-    |& awk -v p="${colour}[${host}]${reset}" '{ print p $0; fflush() }' &
+  {
+    ssh -T -o BatchMode=yes -o ServerAliveInterval=30 "$host@$host" \
+      "/nix/var/nix/profiles/default/bin/nix shell nixpkgs#git -c bash -s -- '$commit'" \
+      2>&1 | awk -v p="${colour}[${host}]${reset}" '{ print p $0; fflush() }' &
+  } <<'EOF'
+        set -euo pipefail
+        cd exo
+        git fetch -q origin
+        git checkout -q "$1"
+        EXO_LIBP2P_NAMESPACE="$1" /nix/var/nix/profiles/default/bin/nix run .#exo
+EOF
 done

 for host; do
  echo "Waiting for $host..."
-  until curl -sf "http://$host:52415/models" &>/dev/null; do sleep 1; done
+  until curl -sf "http://$host:52415/models"; do sleep 1; done
 done
 wait