feat: add MetaInstance declarative layer with reconciliation

Adds a declarative MetaInstance system for managing model instances with automatic placement, retry logic (max 3 attempts), and lifecycle management via a reconciliation loop.
- Process managers for instance health, meta-instance lifecycle, and node timeout detection
- Reconciliation engine driving state transitions and cascading deletes
- Dashboard UI for creating/managing MetaInstances with node selection, sharding config, retry status, and error feedback
- JACCL SideChannel integration for distributed inference coordination
- Comprehensive test suite (25+ edge cases)
Split from original #1519. Independent bug fixes extracted to: #1547 (misc fixes), #1546 (JACCL sidechannel), #1582 (download detection), #1580 (RDMA warning).
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix: change RDMA AVAILABLE to RDMA NOT ENABLED warning (#1580)
2026-02-24 02:07:17 -05:00 · 2026-02-21 13:05:10 -08:00 · 2026-02-20 21:40:07 +00:00 · 2026-02-20 20:27:45 +00:00 · 2026-02-20 18:25:49 +00:00 · 2026-02-20 18:17:56 +00:00
55 changed files with 4352 additions and 794 deletions
--- a/app/EXO/EXO/ContentView.swift
+++ b/app/EXO/EXO/ContentView.swift
@@ -26,6 +26,8 @@ struct ContentView: View {
    @State private var uninstallInProgress = false
    @State private var pendingNamespace: String = ""
    @State private var pendingHFToken: String = ""
+    @State private var pendingEnableImageModels = false
+
    var body: some View {
        VStack(alignment: .leading, spacing: 12) {
            statusSection
@@ -325,6 +327,28 @@ struct ContentView: View {
                            .disabled(pendingHFToken == controller.hfToken)
                        }
                    }
+                    Divider()
+                    HStack {
+                        Toggle(
+                            "Enable Image Models (experimental)", isOn: $pendingEnableImageModels
+                        )
+                        .toggleStyle(.switch)
+                        .font(.caption2)
+                        .onAppear {
+                            pendingEnableImageModels = controller.enableImageModels
+                        }
+
+                        Spacer()
+
+                        Button("Save & Restart") {
+                            controller.enableImageModels = pendingEnableImageModels
+                            if controller.status == .running || controller.status == .starting {
+                                controller.restart()
+                            }
+                        }
+                        .font(.caption2)
+                        .disabled(pendingEnableImageModels == controller.enableImageModels)
+                    }
                    HoverButton(title: "Check for Updates", small: true) {
                        updater.checkForUpdates()
                    }
--- a/app/EXO/EXO/ExoProcessController.swift
+++ b/app/EXO/EXO/ExoProcessController.swift
@@ -4,6 +4,8 @@ import Foundation

 private let customNamespaceKey = "EXOCustomNamespace"
 private let hfTokenKey = "EXOHFToken"
+private let enableImageModelsKey = "EXOEnableImageModels"
+
@MainActor
 final class ExoProcessController: ObservableObject {
    enum Status: Equatable {
@@ -49,6 +51,14 @@ final class ExoProcessController: ObservableObject {
            UserDefaults.standard.set(hfToken, forKey: hfTokenKey)
        }
    }
+    @Published var enableImageModels: Bool = {
+        return UserDefaults.standard.bool(forKey: enableImageModelsKey)
+    }()
+    {
+        didSet {
+            UserDefaults.standard.set(enableImageModels, forKey: enableImageModelsKey)
+        }
+    }

    private var process: Process?
    private var runtimeDirectoryURL: URL?
@@ -236,6 +246,10 @@ final class ExoProcessController: ObservableObject {
        if !hfToken.isEmpty {
            environment["HF_TOKEN"] = hfToken
        }
+        if enableImageModels {
+            environment["EXO_ENABLE_IMAGE_MODELS"] = "true"
+        }
+
        var paths: [String] = []
        if let existing = environment["PATH"], !existing.isEmpty {
            paths = existing.split(separator: ":").map(String.init)
--- a/bench/exo_bench.py
+++ b/bench/exo_bench.py
@@ -338,7 +338,7 @@ def main() -> int:
    )

    logger.info("Planning phase: checking downloads...")
-    run_planning_phase(
+    download_duration_s = run_planning_phase(
        client,
        full_model_id,
        selected[0],
@@ -346,6 +346,10 @@ def main() -> int:
        args.timeout,
        settle_deadline,
    )
+    if download_duration_s is not None:
+        logger.info(f"Download: {download_duration_s:.1f}s (freshly downloaded)")
+    else:
+        logger.info("Download: model already cached")

    all_rows: list[dict[str, Any]] = []

@@ -409,6 +413,11 @@ def main() -> int:
                            "pp_tokens": actual_pp_tokens,
                            "tg": tg,
                            "repeat_index": r,
+                            **(
+                                {"download_duration_s": download_duration_s}
+                                if download_duration_s is not None
+                                else {}
+                            ),
                        }
                    )
                    runs.append(row)
--- a/bench/harness.py
+++ b/bench/harness.py
@@ -289,8 +289,12 @@ def run_planning_phase(
    danger_delete: bool,
    timeout: float,
    settle_deadline: float | None,
-) -> None:
-    """Check disk space and ensure model is downloaded before benchmarking."""
+) -> float | None:
+    """Check disk space and ensure model is downloaded before benchmarking.
+
+    Returns the wall-clock download duration in seconds if a fresh download
+    was needed, or None if the model was already cached on all nodes.
+    """
    # Get model size from /models
    models = client.request_json("GET", "/models") or {}
    model_bytes = 0
@@ -303,7 +307,7 @@ def run_planning_phase(
        logger.warning(
            f"Could not determine size for {full_model_id}, skipping disk check"
        )
-        return
+        return None

    # Get nodes from preview
    inner = unwrap_instance(preview["instance"])
@@ -314,6 +318,8 @@ def run_planning_phase(
    downloads = state.get("downloads", {})
    node_disk = state.get("nodeDisk", {})

+    needs_download = False
+
    for node_id in node_ids:
        node_downloads = downloads.get(node_id, [])

@@ -329,6 +335,8 @@ def run_planning_phase(
        if already_downloaded:
            continue

+        needs_download = True
+
        # Wait for disk info if settle_deadline is set
        disk_info = node_disk.get(node_id, {})
        backoff = _SETTLE_INITIAL_BACKOFF_S
@@ -357,7 +365,7 @@ def run_planning_phase(
                f"have {avail // (1024**3)}GB. Use --danger-delete-downloads to free space."
            )

-        # Delete from smallest to largest
+        # Delete from smallest to largest (skip read-only models from EXO_MODELS_PATH)
        completed = [
            (
                unwrap_instance(p["DownloadCompleted"]["shardMetadata"])["modelCard"][
@@ -367,6 +375,7 @@ def run_planning_phase(
            )
            for p in node_downloads
            if "DownloadCompleted" in p
+            and not p["DownloadCompleted"].get("readOnly", False)
        ]
        for del_model, size in sorted(completed, key=lambda x: x[1]):
            logger.info(f"Deleting {del_model} from {node_id} ({size // (1024**2)}MB)")
@@ -379,6 +388,7 @@ def run_planning_phase(
            raise RuntimeError(f"Could not free enough space on {node_id}")

    # Start downloads (idempotent)
+    download_t0 = time.perf_counter() if needs_download else None
    for node_id in node_ids:
        runner_id = inner["shardAssignments"]["nodeToRunner"][node_id]
        shard = runner_to_shard[runner_id]
@@ -421,7 +431,9 @@ def run_planning_phase(
            if not done:
                all_done = False
        if all_done:
-            return
+            if download_t0 is not None:
+                return time.perf_counter() - download_t0
+            return None
        time.sleep(1)

    raise TimeoutError("Downloads did not complete in time")
--- a/dashboard/src/lib/stores/app.svelte.ts
+++ b/dashboard/src/lib/stores/app.svelte.ts
@@ -168,7 +168,7 @@ export interface ModelDownloadStatus {
 export interface PlacementPreview {
  model_id: string;
  sharding: "Pipeline" | "Tensor";
-  instance_meta: "MlxRing" | "MlxIbv" | "MlxJaccl";
+  instance_meta: "MlxRing" | "MlxJaccl";
  instance: unknown | null;
  memory_delta_by_node: Record<string, number> | null;
  error: string | null;
@@ -219,7 +219,6 @@ interface RawStateResponse {
    string,
    {
      MlxRingInstance?: Instance;
-      MlxIbvInstance?: Instance;
      MlxJacclInstance?: Instance;
    }
  >;
@@ -255,6 +254,20 @@ interface RawStateResponse {
    string,
    { total: { inBytes: number }; available: { inBytes: number } }
  >;
+  // MetaInstances (declarative instance constraints)
+  metaInstances?: Record<string, MetaInstanceData>;
+}
+
+export interface MetaInstanceData {
+  metaInstanceId: string;
+  modelId: string;
+  sharding: string;
+  instanceMeta: string;
+  minNodes: number;
+  nodeIds: string[] | null;
+  placementError: string | null;
+  consecutiveFailures: number;
+  lastFailureError: string | null;
 }

 export interface MessageAttachment {
@@ -554,6 +567,7 @@ class AppStore {
  previewNodeFilter = $state<Set<string>>(new Set());
  lastUpdate = $state<number | null>(null);
  nodeIdentities = $state<Record<string, RawNodeIdentity>>({});
+  metaInstances = $state<Record<string, MetaInstanceData>>({});
  thunderboltBridgeCycles = $state<string[][]>([]);
  nodeThunderbolt = $state<
    Record<
@@ -912,11 +926,7 @@ class AppStore {

    let instanceType: string | null = null;
    if (instanceTag === "MlxRingInstance") instanceType = "MLX Ring";
-    else if (
-      instanceTag === "MlxIbvInstance" ||
-      instanceTag === "MlxJacclInstance"
-    )
-      instanceType = "MLX RDMA";
+    else if (instanceTag === "MlxJacclInstance") instanceType = "MLX RDMA";

    let sharding: string | null = null;
    const inst = instance as {
@@ -1290,6 +1300,8 @@ class AppStore {
      this.nodeThunderbolt = data.nodeThunderbolt ?? {};
      // RDMA ctl status per node
      this.nodeRdmaCtl = data.nodeRdmaCtl ?? {};
+      // MetaInstances
+      this.metaInstances = data.metaInstances ?? {};
      // Thunderbolt bridge cycles
      this.thunderboltBridgeCycles = data.thunderboltBridgeCycles ?? [];
      // Thunderbolt bridge status per node
@@ -3162,6 +3174,7 @@ export const totalTokens = () => appStore.totalTokens;
 export const prefillProgress = () => appStore.prefillProgress;
 export const topologyData = () => appStore.topologyData;
 export const instances = () => appStore.instances;
+export const metaInstances = () => appStore.metaInstances;
 export const runners = () => appStore.runners;
 export const downloads = () => appStore.downloads;
 export const nodeDisk = () => appStore.nodeDisk;
--- a/dashboard/src/routes/+page.svelte
+++ b/dashboard/src/routes/+page.svelte
--- a/dashboard/src/routes/downloads/+page.svelte
+++ b/dashboard/src/routes/downloads/+page.svelte
@@ -29,7 +29,12 @@
        etaMs: number;
        modelDirectory?: string;
      }
-    | { kind: "pending"; modelDirectory?: string }
+    | {
+        kind: "pending";
+        downloaded: number;
+        total: number;
+        modelDirectory?: string;
+      }
    | { kind: "failed"; modelDirectory?: string }
    | { kind: "not_present" };

@@ -255,7 +260,20 @@
          } else if (tag === "DownloadFailed") {
            cell = { kind: "failed", modelDirectory };
          } else {
-            cell = { kind: "pending", modelDirectory };
+            const downloaded = getBytes(
+              payload.downloaded ??
+                payload.downloaded_bytes ??
+                payload.downloadedBytes,
+            );
+            const total = getBytes(
+              payload.total ?? payload.total_bytes ?? payload.totalBytes,
+            );
+            cell = {
+              kind: "pending",
+              downloaded,
+              total,
+              modelDirectory,
+            };
          }

          const existing = row.cells[nodeId];
@@ -265,14 +283,51 @@
        }
      }

+      function rowSortKey(row: ModelRow): number {
+        // in progress (4) -> completed (3) -> paused (2) -> not started (1) -> not present (0)
+        let best = 0;
+        for (const cell of Object.values(row.cells)) {
+          let score = 0;
+          if (cell.kind === "downloading") score = 4;
+          else if (cell.kind === "completed") score = 3;
+          else if (cell.kind === "pending" && cell.downloaded > 0)
+            score = 2; // paused
+          else if (cell.kind === "pending" || cell.kind === "failed") score = 1; // not started
+          if (score > best) best = score;
+        }
+        return best;
+      }
+
+      function totalCompletedBytes(row: ModelRow): number {
+        let total = 0;
+        for (const cell of Object.values(row.cells)) {
+          if (cell.kind === "completed") total += cell.totalBytes;
+        }
+        return total;
+      }
+
      const rows = Array.from(rowMap.values()).sort((a, b) => {
-        const aCompleted = Object.values(a.cells).filter(
-          (c) => c.kind === "completed",
-        ).length;
-        const bCompleted = Object.values(b.cells).filter(
-          (c) => c.kind === "completed",
-        ).length;
-        if (aCompleted !== bCompleted) return bCompleted - aCompleted;
+        const aPriority = rowSortKey(a);
+        const bPriority = rowSortKey(b);
+        if (aPriority !== bPriority) return bPriority - aPriority;
+        // Within completed or paused, sort by biggest size first
+        if (aPriority === 3 && bPriority === 3) {
+          const sizeDiff = totalCompletedBytes(b) - totalCompletedBytes(a);
+          if (sizeDiff !== 0) return sizeDiff;
+        }
+        if (aPriority === 2 && bPriority === 2) {
+          const aSize = Math.max(
+            ...Object.values(a.cells).map((c) =>
+              c.kind === "pending" ? c.total : 0,
+            ),
+          );
+          const bSize = Math.max(
+            ...Object.values(b.cells).map((c) =>
+              c.kind === "pending" ? c.total : 0,
+            ),
+          );
+          if (aSize !== bSize) return bSize - aSize;
+        }
        return a.modelId.localeCompare(b.modelId);
      });

@@ -482,9 +537,34 @@
                    {:else if cell.kind === "pending"}
                      <div
                        class="flex flex-col items-center gap-0.5"
-                        title="Download pending"
+                        title={cell.downloaded > 0
+                          ? `${formatBytes(cell.downloaded)} / ${formatBytes(cell.total)} downloaded`
+                          : "Download pending"}
                      >
-                        <span class="text-exo-light-gray/50 text-sm">...</span>
+                        {#if cell.downloaded > 0 && cell.total > 0}
+                          <span class="text-exo-light-gray/70 text-[10px]"
+                            >{formatBytes(cell.downloaded)} / {formatBytes(
+                              cell.total,
+                            )}</span
+                          >
+                          <div
+                            class="w-full h-1 bg-white/10 rounded-full overflow-hidden"
+                          >
+                            <div
+                              class="h-full bg-exo-light-gray/40 rounded-full"
+                              style="width: {(
+                                (cell.downloaded / cell.total) *
+                                100
+                              ).toFixed(1)}%"
+                            ></div>
+                          </div>
+                          <span class="text-exo-light-gray/40 text-[9px]"
+                            >paused</span
+                          >
+                        {:else}
+                          <span class="text-exo-light-gray/50 text-sm">...</span
+                          >
+                        {/if}
                      </div>
                    {:else if cell.kind === "failed"}
                      <div
--- a/nix/mlx.nix
+++ b/nix/mlx.nix
@@ -41,7 +41,7 @@ let

  mlx = stdenv.mkDerivation rec {
    pname = "mlx";
-    version = let v = "0.30.7.dev20260218+14841977"; in
+    version = let v = "0.30.7.dev20260220+13998a05"; in
      assert v == uvLockMlxVersion || throw "MLX version mismatch: nix/mlx.nix has ${v} but uv.lock has ${uvLockMlxVersion}. Update both the version and hash in nix/mlx.nix.";
      v;
    pyproject = true;
@@ -49,8 +49,8 @@ let
    src = fetchFromGitHub {
      owner = "rltakashige";
      repo = "mlx-jaccl-fix-small-recv";
-      rev = "1484197707f35186ad3bd614357c7c47fdf86ebc";
-      hash = "sha256-FupCMoK/SF/ldfKuvMSAKECcOP8c+ANgkQlPZttDsLk=";
+      rev = "13998a054715edcdc93618fb1496c79c7c25ff7c";
+      hash = "sha256-fAqA3hFwNBx7FcoGnhQsIFpAIRbC2EerACm4Fvne0Cc=";
    };

    patches = [
--- a/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-4bit.toml
+++ b/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-4bit.toml
@@ -3,6 +3,10 @@ n_layers = 48
 hidden_size = 2048
 supports_tensor = true
 tasks = ["TextGeneration"]
+family = "qwen"
+quantization = "4bit"
+base_model = "Qwen3 Coder Next"
+capabilities = ["text", "code"]

 [storage_size]
 in_bytes = 45644286500
--- a/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-5bit.toml
+++ b/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-5bit.toml
@@ -3,6 +3,10 @@ n_layers = 48
 hidden_size = 2048
 supports_tensor = true
 tasks = ["TextGeneration"]
+family = "qwen"
+quantization = "5bit"
+base_model = "Qwen3 Coder Next"
+capabilities = ["text", "code"]

 [storage_size]
 in_bytes = 57657697020
--- a/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-6bit.toml
+++ b/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-6bit.toml
@@ -3,6 +3,10 @@ n_layers = 48
 hidden_size = 2048
 supports_tensor = true
 tasks = ["TextGeneration"]
+family = "qwen"
+quantization = "6bit"
+base_model = "Qwen3 Coder Next"
+capabilities = ["text", "code"]

 [storage_size]
 in_bytes = 68899327465
--- a/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-8bit.toml
+++ b/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-8bit.toml
@@ -3,6 +3,10 @@ n_layers = 48
 hidden_size = 2048
 supports_tensor = true
 tasks = ["TextGeneration"]
+family = "qwen"
+quantization = "8bit"
+base_model = "Qwen3 Coder Next"
+capabilities = ["text", "code"]

 [storage_size]
 in_bytes = 89357758772
--- a/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-bf16.toml
+++ b/resources/inference_model_cards/mlx-community--Qwen3-Coder-Next-bf16.toml
@@ -3,6 +3,10 @@ n_layers = 48
 hidden_size = 2048
 supports_tensor = true
 tasks = ["TextGeneration"]
+family = "qwen"
+quantization = "bf16"
+base_model = "Qwen3 Coder Next"
+capabilities = ["text", "code"]

 [storage_size]
 in_bytes = 157548627945
--- a/src/exo/download/coordinator.py
+++ b/src/exo/download/coordinator.py
@@ -1,7 +1,7 @@
 import asyncio
 import socket
 from dataclasses import dataclass, field
-from typing import Iterator
+from random import random

 import anyio
 from anyio import current_time
@@ -12,20 +12,24 @@ from exo.download.download_utils import (
    RepoDownloadProgress,
    delete_model,
    map_repo_download_progress_to_download_progress_data,
+    resolve_model_in_path,
 )
 from exo.download.shard_downloader import ShardDownloader
-from exo.shared.constants import EXO_MODELS_DIR
-from exo.shared.models.model_cards import ModelId
+from exo.shared.constants import EXO_MODELS_DIR, EXO_MODELS_PATH
+from exo.shared.models.model_cards import ModelId, get_model_cards
 from exo.shared.types.commands import (
    CancelDownload,
    DeleteDownload,
    ForwarderDownloadCommand,
    StartDownload,
 )
-from exo.shared.types.common import NodeId, SessionId
+from exo.shared.types.common import NodeId, SessionId, SystemId
 from exo.shared.types.events import (
    Event,
-    ForwarderEvent,
+    EventId,
+    # TODO(evan): just for acks, should delete this ASAP
+    GlobalForwarderEvent,
+    LocalForwarderEvent,
    NodeDownloadProgress,
 )
 from exo.shared.types.worker.downloads import (
@@ -35,7 +39,7 @@ from exo.shared.types.worker.downloads import (
    DownloadPending,
    DownloadProgress,
 )
-from exo.shared.types.worker.shards import ShardMetadata
+from exo.shared.types.worker.shards import PipelineShardMetadata, ShardMetadata
 from exo.utils.channels import Receiver, Sender, channel


@@ -45,10 +49,16 @@ class DownloadCoordinator:
    session_id: SessionId
    shard_downloader: ShardDownloader
    download_command_receiver: Receiver[ForwarderDownloadCommand]
-    local_event_sender: Sender[ForwarderEvent]
-    event_index_counter: Iterator[int]
+    local_event_sender: Sender[LocalForwarderEvent]
+
+    # Ack tracking: events stay in _out_for_delivery until seen on the global event stream
+    _global_event_receiver: Receiver[GlobalForwarderEvent]
+    _out_for_delivery: dict[EventId, LocalForwarderEvent] = field(default_factory=dict)
+
    offline: bool = False

+    _system_id: SystemId = field(default_factory=SystemId)
+
    # Local state
    download_status: dict[ModelId, DownloadProgress] = field(default_factory=dict)
    active_downloads: dict[ModelId, asyncio.Task[None]] = field(default_factory=dict)
@@ -115,12 +125,18 @@ class DownloadCoordinator:
        )
        if not self.offline:
            self._test_internet_connection()
-        async with self._tg as tg:
-            tg.start_soon(self._command_processor)
-            tg.start_soon(self._forward_events)
-            tg.start_soon(self._emit_existing_download_progress)
-            if not self.offline:
-                tg.start_soon(self._check_internet_connection)
+        try:
+            async with self._tg as tg:
+                tg.start_soon(self._command_processor)
+                tg.start_soon(self._forward_events)
+                tg.start_soon(self._emit_existing_download_progress)
+                tg.start_soon(self._resend_out_for_delivery)
+                tg.start_soon(self._clear_ofd)
+                if not self.offline:
+                    tg.start_soon(self._check_internet_connection)
+        finally:
+            for task in self.active_downloads.values():
+                task.cancel()

    def _test_internet_connection(self) -> None:
        # Try multiple endpoints since some ISPs/networks block specific IPs
@@ -153,6 +169,20 @@ class DownloadCoordinator:
    def shutdown(self) -> None:
        self._tg.cancel_scope.cancel()

+    # directly copied from worker
+    async def _resend_out_for_delivery(self) -> None:
+        # Re-send every event still in _out_for_delivery, at a jittered 1-2 s interval.
+        # TODO: only resend events older than a minimum age; exponential backoff would help too.
+        while True:
+            await anyio.sleep(1 + random())
+            for event in self._out_for_delivery.copy().values():
+                await self.local_event_sender.send(event)
+
+    async def _clear_ofd(self) -> None:
+        with self._global_event_receiver as events:
+            async for event in events:
+                self._out_for_delivery.pop(event.event.event_id, None)
+
    async def _command_processor(self) -> None:
        with self.download_command_receiver as commands:
            async for cmd in commands:
@@ -185,6 +215,25 @@ class DownloadCoordinator:
                )
                return

+        # Check EXO_MODELS_PATH for pre-downloaded models
+        found_path = resolve_model_in_path(model_id)
+        if found_path is not None:
+            logger.info(
+                f"DownloadCoordinator: Model {model_id} found in EXO_MODELS_PATH at {found_path}"
+            )
+            completed = DownloadCompleted(
+                shard_metadata=shard,
+                node_id=self.node_id,
+                total=shard.model_card.storage_size,
+                model_directory=str(found_path),
+                read_only=True,
+            )
+            self.download_status[model_id] = completed
+            await self.event_sender.send(
+                NodeDownloadProgress(download_progress=completed)
+            )
+            return
+
        # Emit pending status
        progress = DownloadPending(
            shard_metadata=shard,
@@ -269,6 +318,15 @@ class DownloadCoordinator:
        self.active_downloads[model_id] = task

    async def _delete_download(self, model_id: ModelId) -> None:
+        # Protect read-only models (from EXO_MODELS_PATH) from deletion
+        if model_id in self.download_status:
+            current = self.download_status[model_id]
+            if isinstance(current, DownloadCompleted) and current.read_only:
+                logger.warning(
+                    f"Refusing to delete read-only model {model_id} (from EXO_MODELS_PATH)"
+                )
+                return
+
        # Cancel if active
        if model_id in self.active_downloads:
            logger.info(f"Cancelling active download for {model_id} before deletion")
@@ -298,19 +356,21 @@ class DownloadCoordinator:
            del self.download_status[model_id]

    async def _forward_events(self) -> None:
+        idx = 0
        with self.event_receiver as events:
            async for event in events:
-                idx = next(self.event_index_counter)
-                fe = ForwarderEvent(
+                fe = LocalForwarderEvent(
                    origin_idx=idx,
-                    origin=self.node_id,
+                    origin=self._system_id,
                    session=self.session_id,
                    event=event,
                )
                logger.debug(
                    f"DownloadCoordinator published event {idx}: {str(event)[:100]}"
                )
+                idx += 1  # bump after logging so the log shows this event's origin_idx
                await self.local_event_sender.send(fe)
+                self._out_for_delivery[event.event_id] = fe

    async def _emit_existing_download_progress(self) -> None:
        try:
@@ -345,6 +405,8 @@ class DownloadCoordinator:
                                model_directory=self._model_dir(
                                    progress.shard.model_card.model_id
                                ),
+                                downloaded=progress.downloaded,
+                                total=progress.total,
                            )
                        else:
                            status = DownloadOngoing(
@@ -364,6 +426,39 @@ class DownloadCoordinator:
                    await self.event_sender.send(
                        NodeDownloadProgress(download_progress=status)
                    )
+                # Scan EXO_MODELS_PATH for pre-downloaded models
+                if EXO_MODELS_PATH is not None:
+                    for card in await get_model_cards():
+                        mid = card.model_id
+                        if mid in self.active_downloads:
+                            continue
+                        if isinstance(
+                            self.download_status.get(mid),
+                            (DownloadCompleted, DownloadOngoing, DownloadFailed),
+                        ):
+                            continue
+                        found = resolve_model_in_path(mid)
+                        if found is not None:
+                            path_shard = PipelineShardMetadata(
+                                model_card=card,
+                                device_rank=0,
+                                world_size=1,
+                                start_layer=0,
+                                end_layer=card.n_layers,
+                                n_layers=card.n_layers,
+                            )
+                            path_completed: DownloadProgress = DownloadCompleted(
+                                node_id=self.node_id,
+                                shard_metadata=path_shard,
+                                total=card.storage_size,
+                                model_directory=str(found),
+                                read_only=True,
+                            )
+                            self.download_status[mid] = path_completed
+                            await self.event_sender.send(
+                                NodeDownloadProgress(download_progress=path_completed)
+                            )
+
                logger.debug(
                    "DownloadCoordinator: Done emitting existing download progress."
                )
--- a/src/exo/download/download_utils.py
+++ b/src/exo/download/download_utils.py
@@ -20,7 +20,6 @@ from huggingface_hub import (
 )
 from loguru import logger
 from pydantic import (
-    DirectoryPath,
    TypeAdapter,
 )

@@ -31,7 +30,7 @@ from exo.download.huggingface_utils import (
    get_hf_endpoint,
    get_hf_token,
 )
-from exo.shared.constants import EXO_MODELS_DIR
+from exo.shared.constants import EXO_MODELS_DIR, EXO_MODELS_PATH
 from exo.shared.models.model_cards import ModelTask
 from exo.shared.types.common import ModelId
 from exo.shared.types.memory import Memory
@@ -111,7 +110,27 @@ def map_repo_download_progress_to_download_progress_data(
    )


-def build_model_path(model_id: ModelId) -> DirectoryPath:
+def resolve_model_in_path(model_id: ModelId) -> Path | None:
+    """Search EXO_MODELS_PATH directories for a pre-existing model.
+
+    Checks each directory for the normalized name (org--model).  A candidate
+    is only returned if ``is_model_directory_complete`` confirms all weight
+    files are present.
+    """
+    if EXO_MODELS_PATH is None:
+        return None
+    normalized = model_id.normalize()
+    for search_dir in EXO_MODELS_PATH:
+        candidate = search_dir / normalized
+        if candidate.is_dir() and is_model_directory_complete(candidate):
+            return candidate
+    return None
+
+
+def build_model_path(model_id: ModelId) -> Path:
+    found = resolve_model_in_path(model_id)
+    if found is not None:
+        return found
    return EXO_MODELS_DIR / model_id.normalize()


@@ -158,6 +177,72 @@ async def seed_models(seed_dir: str | Path):
                    logger.error(traceback.format_exc())


+def _scan_model_directory(
+    model_dir: Path, recursive: bool = False
+) -> list[FileListEntry] | None:
+    """Scan a local model directory and build a file list.
+
+    Requires at least one ``*.safetensors.index.json``.  Every weight file
+    referenced by the index that is missing on disk gets ``size=None``.
+    """
+    index_files = list(model_dir.glob("**/*.safetensors.index.json"))
+    if not index_files:
+        return None
+
+    entries_by_path: dict[str, FileListEntry] = {}
+
+    if recursive:
+        for dirpath, _, filenames in os.walk(model_dir):
+            for filename in filenames:
+                if filename.endswith(".partial"):
+                    continue
+                full_path = Path(dirpath) / filename
+                rel_path = str(full_path.relative_to(model_dir))
+                entries_by_path[rel_path] = FileListEntry(
+                    type="file",
+                    path=rel_path,
+                    size=full_path.stat().st_size,
+                )
+    else:
+        for item in model_dir.iterdir():
+            if item.is_file() and not item.name.endswith(".partial"):
+                entries_by_path[item.name] = FileListEntry(
+                    type="file",
+                    path=item.name,
+                    size=item.stat().st_size,
+                )
+
+    # Add expected weight files from index that haven't been downloaded yet
+    for index_file in index_files:
+        try:
+            index_data = ModelSafetensorsIndex.model_validate_json(
+                index_file.read_text()
+            )
+            relative_dir = index_file.parent.relative_to(model_dir)
+            for filename in set(index_data.weight_map.values()):
+                rel_path = (
+                    str(relative_dir / filename)
+                    if relative_dir != Path(".")
+                    else filename
+                )
+                if rel_path not in entries_by_path:
+                    entries_by_path[rel_path] = FileListEntry(
+                        type="file",
+                        path=rel_path,
+                        size=None,
+                    )
+        except Exception:
+            continue
+
+    return list(entries_by_path.values())
+
+
+def is_model_directory_complete(model_dir: Path) -> bool:
+    """Check if a model directory contains all required weight files."""
+    file_list = _scan_model_directory(model_dir, recursive=True)
+    return file_list is not None and all(f.size is not None for f in file_list)
+
+
 async def _build_file_list_from_local_directory(
    model_id: ModelId,
    recursive: bool = False,
@@ -172,59 +257,7 @@ async def _build_file_list_from_local_directory(
    if not await aios.path.exists(model_dir):
        return None

-    def _scan() -> list[FileListEntry] | None:
-        index_files = list(model_dir.glob("**/*.safetensors.index.json"))
-        if not index_files:
-            return None
-
-        entries_by_path: dict[str, FileListEntry] = {}
-
-        if recursive:
-            for dirpath, _, filenames in os.walk(model_dir):
-                for filename in filenames:
-                    if filename.endswith(".partial"):
-                        continue
-                    full_path = Path(dirpath) / filename
-                    rel_path = str(full_path.relative_to(model_dir))
-                    entries_by_path[rel_path] = FileListEntry(
-                        type="file",
-                        path=rel_path,
-                        size=full_path.stat().st_size,
-                    )
-        else:
-            for item in model_dir.iterdir():
-                if item.is_file() and not item.name.endswith(".partial"):
-                    entries_by_path[item.name] = FileListEntry(
-                        type="file",
-                        path=item.name,
-                        size=item.stat().st_size,
-                    )
-
-        # Add expected weight files from index that haven't been downloaded yet
-        for index_file in index_files:
-            try:
-                index_data = ModelSafetensorsIndex.model_validate_json(
-                    index_file.read_text()
-                )
-                relative_dir = index_file.parent.relative_to(model_dir)
-                for filename in set(index_data.weight_map.values()):
-                    rel_path = (
-                        str(relative_dir / filename)
-                        if relative_dir != Path(".")
-                        else filename
-                    )
-                    if rel_path not in entries_by_path:
-                        entries_by_path[rel_path] = FileListEntry(
-                            type="file",
-                            path=rel_path,
-                            size=None,
-                        )
-            except Exception:
-                continue
-
-        return list(entries_by_path.values())
-
-    file_list = await asyncio.to_thread(_scan)
+    file_list = await asyncio.to_thread(_scan_model_directory, model_dir, recursive)
    if not file_list:
        return None
    return file_list
--- a/src/exo/download/tests/test_coordinator_ack.py
+++ b/src/exo/download/tests/test_coordinator_ack.py
@@ -0,0 +1,98 @@
+from typing import Any
+
+import anyio
+import pytest
+
+from exo.download.coordinator import DownloadCoordinator
+from exo.download.shard_downloader import NoopShardDownloader
+from exo.shared.models.model_cards import ModelCard, ModelTask
+from exo.shared.types.common import ModelId, NodeId, SessionId
+from exo.shared.types.events import (
+    GlobalForwarderEvent,
+    LocalForwarderEvent,
+    NodeDownloadProgress,
+)
+from exo.shared.types.memory import Memory
+from exo.shared.types.worker.downloads import (
+    DownloadPending,
+)
+from exo.shared.types.worker.shards import PipelineShardMetadata
+from exo.utils.channels import channel
+
+# Use the built-in NoopShardDownloader directly - it already implements the required abstract interface.
+# No additional subclass is needed for this test.
+
+
+@pytest.mark.anyio
+async def test_ack_behaviour():
+    # A forwarded event must stay in _out_for_delivery until a global ack for it is received. Channels are typed Any for simplicity.
+    _, command_receiver = channel[Any]()
+    local_sender, _ = channel[Any]()
+    global_sender, global_receiver = channel[Any]()
+
+    # Minimal identifiers: this node is also the session's master
+    node_id = NodeId()
+    session_id = SessionId(master_node_id=node_id, election_clock=0)
+
+    # Dummy single-layer model card and a one-shard pipeline covering it
+    model_id = ModelId("test/model")
+    model_card = ModelCard(
+        model_id=model_id,
+        storage_size=Memory.from_bytes(0),
+        n_layers=1,
+        hidden_size=1,
+        supports_tensor=True,
+        tasks=[ModelTask.TextGeneration],
+    )
+    shard = PipelineShardMetadata(
+        model_card=model_card,
+        device_rank=0,
+        world_size=1,
+        start_layer=0,
+        end_layer=1,
+        n_layers=1,
+    )
+
+    # Coordinator under test, wired to the no-op downloader and the channels created above
+    coord = DownloadCoordinator(
+        node_id=node_id,
+        session_id=session_id,
+        shard_downloader=NoopShardDownloader(),
+        download_command_receiver=command_receiver,
+        local_event_sender=local_sender,
+        _global_event_receiver=global_receiver,
+    )
+
+    async with anyio.create_task_group() as tg:
+        # Start the forwarding and ack-clearing loops
+        tg.start_soon(coord._forward_events)  # pyright: ignore[reportPrivateUsage]
+        tg.start_soon(coord._clear_ofd)  # pyright: ignore[reportPrivateUsage]
+
+        # Send a pending download progress event via the internal event sender
+        pending = DownloadPending(
+            node_id=node_id,
+            shard_metadata=shard,
+            model_directory="/tmp/model",
+        )
+        await coord.event_sender.send(NodeDownloadProgress(download_progress=pending))
+        # Allow the forwarder to process the event (timing-based wait, not a synchronisation point)
+        await anyio.sleep(0.1)
+
+        # There should be exactly one entry awaiting ACK
+        assert len(coord._out_for_delivery) == 1  # pyright: ignore[reportPrivateUsage]
+        # Retrieve the stored LocalForwarderEvent
+        stored_fe: LocalForwarderEvent = next(iter(coord._out_for_delivery.values()))  # pyright: ignore[reportPrivateUsage]
+        # Simulate receiving a global ack that carries the same event
+        ack = GlobalForwarderEvent(
+            origin_idx=0,
+            origin=node_id,
+            session=session_id,
+            event=stored_fe.event,
+        )
+        await global_sender.send(ack)
+        # Give the clear-ofd task a moment to process the ack
+        await anyio.sleep(0.1)
+        # The out-for-delivery map should now be empty
+        assert len(coord._out_for_delivery) == 0  # pyright: ignore[reportPrivateUsage]
+        # Cancel background tasks so the task group can exit
+        tg.cancel_scope.cancel()
--- a/src/exo/main.py
+++ b/src/exo/main.py
@@ -1,11 +1,10 @@
 import argparse
-import itertools
 import multiprocessing as mp
 import os
 import resource
 import signal
 from dataclasses import dataclass, field
-from typing import Iterator, Self
+from typing import Self

 import anyio
 from anyio.abc import TaskGroup
@@ -38,12 +37,11 @@ class Node:
    api: API | None

    node_id: NodeId
-    event_index_counter: Iterator[int]
    offline: bool
    _tg: TaskGroup = field(init=False, default_factory=anyio.create_task_group)

    @classmethod
-    async def create(cls, args: "Args") -> "Self":
+    async def create(cls, args: "Args") -> Self:
        keypair = get_node_id_keypair()
        node_id = NodeId(keypair.to_node_id())
        session_id = SessionId(master_node_id=node_id, election_clock=0)
@@ -57,9 +55,6 @@ class Node:

        logger.info(f"Starting node {node_id}")

-        # Create shared event index counter for Worker and DownloadCoordinator
-        event_index_counter = itertools.count()
-
        # Create DownloadCoordinator (unless --no-downloads)
        if not args.no_downloads:
            download_coordinator = DownloadCoordinator(
@@ -68,8 +63,9 @@ class Node:
                exo_shard_downloader(),
                download_command_receiver=router.receiver(topics.DOWNLOAD_COMMANDS),
                local_event_sender=router.sender(topics.LOCAL_EVENTS),
-                event_index_counter=event_index_counter,
                offline=args.offline,
+                # TODO(evan): remove
+                _global_event_receiver=router.receiver(topics.GLOBAL_EVENTS),
            )
        else:
            download_coordinator = None
@@ -95,7 +91,6 @@ class Node:
                local_event_sender=router.sender(topics.LOCAL_EVENTS),
                command_sender=router.sender(topics.COMMANDS),
                download_command_sender=router.sender(topics.DOWNLOAD_COMMANDS),
-                event_index_counter=event_index_counter,
            )
        else:
            worker = None
@@ -133,7 +128,6 @@ class Node:
            master,
            api,
            node_id,
-            event_index_counter,
            args.offline,
        )

@@ -212,8 +206,6 @@ class Node:
                    )
                if result.is_new_master:
                    await anyio.sleep(0)
-                    # Fresh counter for new session (buffer expects indices from 0)
-                    self.event_index_counter = itertools.count()
                    if self.download_coordinator:
                        self.download_coordinator.shutdown()
                        self.download_coordinator = DownloadCoordinator(
@@ -224,8 +216,11 @@ class Node:
                                topics.DOWNLOAD_COMMANDS
                            ),
                            local_event_sender=self.router.sender(topics.LOCAL_EVENTS),
-                            event_index_counter=self.event_index_counter,
                            offline=self.offline,
+                            # TODO(evan): remove
+                            _global_event_receiver=self.router.receiver(
+                                topics.GLOBAL_EVENTS
+                            ),
                        )
                        self._tg.start_soon(self.download_coordinator.run)
                    if self.worker:
@@ -242,7 +237,6 @@ class Node:
                            download_command_sender=self.router.sender(
                                topics.DOWNLOAD_COMMANDS
                            ),
-                            event_index_counter=self.event_index_counter,
                        )
                        self._tg.start_soon(self.worker.run)
                    if self.api:
--- a/src/exo/master/api.py
+++ b/src/exo/master/api.py
@@ -79,8 +79,11 @@ from exo.shared.types.api import (
    ChatCompletionResponse,
    CreateInstanceParams,
    CreateInstanceResponse,
+    CreateMetaInstanceParams,
+    CreateMetaInstanceResponse,
    DeleteDownloadResponse,
    DeleteInstanceResponse,
+    DeleteMetaInstanceResponse,
    ErrorInfo,
    ErrorResponse,
    FinishReason,
@@ -126,8 +129,10 @@ from exo.shared.types.claude_api import (
 from exo.shared.types.commands import (
    Command,
    CreateInstance,
+    CreateMetaInstance,
    DeleteDownload,
    DeleteInstance,
+    DeleteMetaInstance,
    DownloadCommand,
    ForwarderCommand,
    ForwarderDownloadCommand,
@@ -140,15 +145,23 @@ from exo.shared.types.commands import (
    TaskFinished,
    TextGeneration,
 )
-from exo.shared.types.common import CommandId, Id, NodeId, SessionId
+from exo.shared.types.common import (
+    CommandId,
+    Id,
+    MetaInstanceId,
+    NodeId,
+    SessionId,
+    SystemId,
+)
 from exo.shared.types.events import (
    ChunkGenerated,
    Event,
-    ForwarderEvent,
+    GlobalForwarderEvent,
    IndexedEvent,
    TracesMerged,
 )
 from exo.shared.types.memory import Memory
+from exo.shared.types.meta_instance import MetaInstance
 from exo.shared.types.ollama_api import (
    OllamaChatRequest,
    OllamaChatResponse,
@@ -197,8 +210,7 @@ class API:
        session_id: SessionId,
        *,
        port: int,
-        # Ideally this would be a MasterForwarderEvent but type system says no :(
-        global_event_receiver: Receiver[ForwarderEvent],
+        global_event_receiver: Receiver[GlobalForwarderEvent],
        command_sender: Sender[ForwarderCommand],
        download_command_sender: Sender[ForwarderDownloadCommand],
        # This lets us pause the API if an election is running
@@ -206,6 +218,7 @@ class API:
    ) -> None:
        self.state = State()
        self._event_log = DiskEventLog(_API_EVENT_LOG_DIR)
+        self._system_id = SystemId()
        self.command_sender = command_sender
        self.download_command_sender = download_command_sender
        self.global_event_receiver = global_event_receiver
@@ -257,6 +270,7 @@ class API:
        self._event_log.close()
        self._event_log = DiskEventLog(_API_EVENT_LOG_DIR)
        self.state = State()
+        self._system_id = SystemId()
        self.session_id = new_session_id
        self.event_buffer = OrderedBuffer[Event]()
        self._text_generation_queues = {}
@@ -302,6 +316,9 @@ class API:
        self.app.get("/instance/previews")(self.get_placement_previews)
        self.app.get("/instance/{instance_id}")(self.get_instance)
        self.app.delete("/instance/{instance_id}")(self.delete_instance)
+        self.app.get("/meta_instances")(self.list_meta_instances)
+        self.app.post("/meta_instance")(self.create_meta_instance)
+        self.app.delete("/meta_instance/{meta_instance_id}")(self.delete_meta_instance)
        self.app.get("/models")(self.get_models)
        self.app.get("/v1/models")(self.get_models)
        self.app.post("/models/add")(self.add_custom_model)
@@ -346,12 +363,27 @@ class API:
        self.app.get("/v1/traces/{task_id}/raw")(self.get_trace_raw)

    async def place_instance(self, payload: PlaceInstanceParams):
+        model_card = await ModelCard.load(payload.model_id)
        command = PlaceInstance(
-            model_card=await ModelCard.load(payload.model_id),
+            model_card=model_card,
            sharding=payload.sharding,
            instance_meta=payload.instance_meta,
            min_nodes=payload.min_nodes,
        )
+
+        # Validate placement before sending — fail fast with a clear error
+        # instead of silently dropping the command in the master.
+        try:
+            get_instance_placements(
+                command,
+                topology=self.state.topology,
+                current_instances=self.state.instances,
+                node_memory=self.state.node_memory,
+                node_network=self.state.node_network,
+            )
+        except ValueError as exc:
+            raise HTTPException(status_code=400, detail=str(exc)) from exc
+
        await self._send(command)

        return CreateInstanceResponse(
@@ -563,6 +595,44 @@ class API:
            instance_id=instance_id,
        )

+    def list_meta_instances(self) -> dict[MetaInstanceId, MetaInstance]:
+        return dict(self.state.meta_instances)  # shallow copy of this API's current view
+
+    async def create_meta_instance(
+        self, payload: CreateMetaInstanceParams
+    ) -> CreateMetaInstanceResponse:
+        meta_instance = MetaInstance(
+            model_id=payload.model_id,
+            sharding=payload.sharding,
+            instance_meta=payload.instance_meta,
+            min_nodes=payload.min_nodes,
+            node_ids=payload.node_ids,
+        )
+        command = CreateMetaInstance(meta_instance=meta_instance)
+        await self._send(command)  # the response below is returned without waiting for placement
+        return CreateMetaInstanceResponse(
+            message="Command received.",
+            command_id=command.command_id,
+            meta_instance_id=meta_instance.meta_instance_id,  # not passed above; presumably a MetaInstance default - confirm
+        )
+
+    async def delete_meta_instance(
+        self, meta_instance_id: MetaInstanceId
+    ) -> DeleteMetaInstanceResponse:
+        meta = self.state.meta_instances.get(meta_instance_id)  # lookup in this API's own state
+        if not meta:
+            raise HTTPException(status_code=404, detail="MetaInstance not found")
+
+        # Only the command is sent from here; the master's command processor cascade-deletes the backing instances
+        command = DeleteMetaInstance(meta_instance_id=meta_instance_id)
+        await self._send(command)
+
+        return DeleteMetaInstanceResponse(
+            message="Command received.",
+            command_id=command.command_id,
+            meta_instance_id=meta_instance_id,
+        )
+
    async def _token_chunk_stream(
        self, command_id: CommandId
    ) -> AsyncGenerator[
@@ -589,7 +659,7 @@ class API:
            command = TaskCancelled(cancelled_command_id=command_id)
            with anyio.CancelScope(shield=True):
                await self.command_sender.send(
-                    ForwarderCommand(origin=self.node_id, command=command)
+                    ForwarderCommand(origin=self._system_id, command=command)
                )
            raise
        finally:
@@ -937,7 +1007,7 @@ class API:
            command = TaskCancelled(cancelled_command_id=command_id)
            with anyio.CancelScope(shield=True):
                await self.command_sender.send(
-                    ForwarderCommand(origin=self.node_id, command=command)
+                    ForwarderCommand(origin=self._system_id, command=command)
                )
            raise
        finally:
@@ -1023,7 +1093,7 @@ class API:
            command = TaskCancelled(cancelled_command_id=command_id)
            with anyio.CancelScope(shield=True):
                await self.command_sender.send(
-                    ForwarderCommand(origin=self.node_id, command=command)
+                    ForwarderCommand(origin=self._system_id, command=command)
                )
            raise
        finally:
@@ -1621,6 +1691,8 @@ class API:
    async def _apply_state(self):
        with self.global_event_receiver as events:
            async for f_event in events:
+                if f_event.session != self.session_id:
+                    continue
                if f_event.origin != self.session_id.master_node_id:
                    continue
                self.event_buffer.ingest(f_event.origin_idx, f_event.event)
@@ -1684,12 +1756,12 @@ class API:
        while self.paused:
            await self.paused_ev.wait()
        await self.command_sender.send(
-            ForwarderCommand(origin=self.node_id, command=command)
+            ForwarderCommand(origin=self._system_id, command=command)
        )

    async def _send_download(self, command: DownloadCommand):
        await self.download_command_sender.send(
-            ForwarderDownloadCommand(origin=self.node_id, command=command)
+            ForwarderDownloadCommand(origin=self._system_id, command=command)
        )

    async def start_download(
--- a/src/exo/master/main.py
+++ b/src/exo/master/main.py
@@ -1,4 +1,5 @@
-from datetime import datetime, timedelta, timezone
+from collections.abc import Sequence
+from datetime import datetime, timezone

 import anyio
 from anyio.abc import TaskGroup
@@ -12,11 +13,22 @@ from exo.master.placement import (
    get_transition_events,
    place_instance,
 )
+from exo.master.process_managers import ProcessManager
+from exo.master.process_managers.instance_health import InstanceHealthReconciler
+from exo.master.process_managers.meta_instance import MetaInstanceReconciler
+from exo.master.process_managers.node_timeout import NodeTimeoutReconciler
+from exo.master.reconcile import (
+    find_unsatisfied_meta_instances,
+    try_place_for_meta_instance,
+)
 from exo.shared.apply import apply
 from exo.shared.constants import EXO_EVENT_LOG_DIR, EXO_TRACING_ENABLED
+from exo.shared.models.model_cards import ModelCard
 from exo.shared.types.commands import (
    CreateInstance,
+    CreateMetaInstance,
    DeleteInstance,
+    DeleteMetaInstance,
    ForwarderCommand,
    ForwarderDownloadCommand,
    ImageEdits,
@@ -29,15 +41,18 @@ from exo.shared.types.commands import (
    TestCommand,
    TextGeneration,
 )
-from exo.shared.types.common import CommandId, NodeId, SessionId
+from exo.shared.types.common import CommandId, NodeId, SessionId, SystemId
 from exo.shared.types.events import (
    Event,
-    ForwarderEvent,
+    GlobalForwarderEvent,
    IndexedEvent,
    InputChunkReceived,
    InstanceDeleted,
+    LocalForwarderEvent,
+    MetaInstanceCreated,
+    MetaInstanceDeleted,
+    MetaInstancePlacementFailed,
    NodeGatheredInfo,
-    NodeTimedOut,
    TaskCreated,
    TaskDeleted,
    TaskStatusUpdated,
@@ -60,7 +75,7 @@ from exo.shared.types.tasks import (
    TextGeneration as TextGenerationTask,
 )
 from exo.shared.types.worker.instances import InstanceId
-from exo.utils.channels import Receiver, Sender, channel
+from exo.utils.channels import Receiver, Sender
 from exo.utils.event_buffer import MultiSourceBuffer


@@ -71,8 +86,8 @@ class Master:
        session_id: SessionId,
        *,
        command_receiver: Receiver[ForwarderCommand],
-        local_event_receiver: Receiver[ForwarderEvent],
-        global_event_sender: Sender[ForwarderEvent],
+        local_event_receiver: Receiver[LocalForwarderEvent],
+        global_event_sender: Sender[GlobalForwarderEvent],
        download_command_sender: Sender[ForwarderDownloadCommand],
    ):
        self.state = State()
@@ -84,16 +99,16 @@ class Master:
        self.local_event_receiver = local_event_receiver
        self.global_event_sender = global_event_sender
        self.download_command_sender = download_command_sender
-        send, recv = channel[Event]()
-        self.event_sender: Sender[Event] = send
-        self._loopback_event_receiver: Receiver[Event] = recv
-        self._loopback_event_sender: Sender[ForwarderEvent] = (
-            local_event_receiver.clone_sender()
-        )
-        self._multi_buffer = MultiSourceBuffer[NodeId, Event]()
+        self._system_id = SystemId()
+        self._multi_buffer = MultiSourceBuffer[SystemId, Event]()
        self._event_log = DiskEventLog(EXO_EVENT_LOG_DIR / "master")
        self._pending_traces: dict[TaskId, dict[int, list[TraceEventData]]] = {}
        self._expected_ranks: dict[TaskId, set[int]] = {}
+        self._process_managers: Sequence[ProcessManager] = [
+            InstanceHealthReconciler(),
+            NodeTimeoutReconciler(),
+            MetaInstanceReconciler(),
+        ]

    async def run(self):
        logger.info("Starting Master")
@@ -102,15 +117,12 @@ class Master:
            async with self._tg as tg:
                tg.start_soon(self._event_processor)
                tg.start_soon(self._command_processor)
-                tg.start_soon(self._loopback_processor)
-                tg.start_soon(self._plan)
+                tg.start_soon(self._reconcile)
        finally:
            self._event_log.close()
            self.global_event_sender.close()
            self.local_event_receiver.close()
            self.command_receiver.close()
-            self._loopback_event_sender.close()
-            self._loopback_event_receiver.close()

    async def shutdown(self):
        logger.info("Stopping Master")
@@ -288,10 +300,90 @@ class Master:
                            ):
                                await self.download_command_sender.send(
                                    ForwarderDownloadCommand(
-                                        origin=self.node_id, command=cmd
+                                        origin=self._system_id, command=cmd
                                    )
                                )
                            generated_events.extend(transition_events)
+                        case CreateMetaInstance():
+                            logger.info(
+                                f"Creating MetaInstance for {command.meta_instance.model_id}"
+                                f" (min_nodes={command.meta_instance.min_nodes},"
+                                f" sharding={command.meta_instance.sharding})"
+                            )
+                            # Apply immediately so self.state is fresh across
+                            # the await below and the reconciler won't race.
+                            await self._apply_and_broadcast(
+                                MetaInstanceCreated(meta_instance=command.meta_instance)
+                            )
+                            # Immediate placement attempt for responsiveness
+                            model_card = await ModelCard.load(
+                                command.meta_instance.model_id
+                            )
+                            # Re-check: reconciler may have satisfied it during the await
+                            meta_id = command.meta_instance.meta_instance_id
+                            still_unsatisfied = any(
+                                m.meta_instance_id == meta_id
+                                for m in find_unsatisfied_meta_instances(
+                                    self.state.meta_instances,
+                                    self.state.instances,
+                                    self.state.topology,
+                                )
+                            )
+                            if still_unsatisfied:
+                                result = try_place_for_meta_instance(
+                                    command.meta_instance,
+                                    model_card,
+                                    self.state.topology,
+                                    self.state.instances,
+                                    self.state.node_memory,
+                                    self.state.node_network,
+                                    self.state.tasks,
+                                )
+                                generated_events.extend(result.events)
+                                if result.error is not None:
+                                    generated_events.append(
+                                        MetaInstancePlacementFailed(
+                                            meta_instance_id=meta_id,
+                                            reason=result.error,
+                                        )
+                                    )
+                        case DeleteMetaInstance():
+                            backing_count = sum(
+                                1
+                                for inst in self.state.instances.values()
+                                if inst.meta_instance_id == command.meta_instance_id
+                            )
+                            logger.info(
+                                f"Deleting MetaInstance {command.meta_instance_id}"
+                                f" (cascade-deleting {backing_count} backing instance(s))"
+                            )
+                            generated_events.append(
+                                MetaInstanceDeleted(
+                                    meta_instance_id=command.meta_instance_id
+                                )
+                            )
+                            # Cascade-delete backing instances atomically,
+                            # cancelling any active tasks first.
+                            for iid, inst in self.state.instances.items():
+                                if inst.meta_instance_id == command.meta_instance_id:
+                                    for task in self.state.tasks.values():
+                                        if (
+                                            task.instance_id == iid
+                                            and task.task_status
+                                            in (
+                                                TaskStatus.Pending,
+                                                TaskStatus.Running,
+                                            )
+                                        ):
+                                            generated_events.append(
+                                                TaskStatusUpdated(
+                                                    task_status=TaskStatus.Cancelled,
+                                                    task_id=task.task_id,
+                                                )
+                                            )
+                                    generated_events.append(
+                                        InstanceDeleted(instance_id=iid)
+                                    )
                        case PlaceInstance():
                            placement = place_instance(
                                command,
@@ -354,31 +446,32 @@ class Master:
                            ):
                                await self._send_event(IndexedEvent(idx=i, event=event))
                    for event in generated_events:
-                        await self.event_sender.send(event)
+                        await self._apply_and_broadcast(event)
                except ValueError as e:
                    logger.opt(exception=e).warning("Error in command processor")

-    # These plan loops are the cracks showing in our event sourcing architecture - more things could be commands
-    async def _plan(self) -> None:
+    async def _apply_and_broadcast(self, event: Event) -> None:
+        """Apply event to state, persist to disk, and broadcast to workers.
+
+        State is updated synchronously (before any await), so callers can
+        rely on ``self.state`` reflecting this event immediately after the
+        call.  Python's cooperative scheduling guarantees no interleaving
+        between the state read and write.
+        """
+        logger.debug(f"Master indexing event: {str(event)[:100]}")
+        indexed = IndexedEvent(event=event, idx=len(self._event_log))  # next index = current log length
+        self.state = apply(self.state, indexed)
+        event._master_time_stamp = datetime.now(tz=timezone.utc)  # pyright: ignore[reportPrivateUsage]
+        self._event_log.append(event)
+        await self._send_event(indexed)  # the only await here; state and log are already updated
+
+    async def _reconcile(self) -> None:
        while True:
-            # kill broken instances
-            connected_node_ids = set(self.state.topology.list_nodes())
-            for instance_id, instance in self.state.instances.items():
-                for node_id in instance.shard_assignments.node_to_runner:
-                    if node_id not in connected_node_ids:
-                        await self.event_sender.send(
-                            InstanceDeleted(instance_id=instance_id)
-                        )
-                        break
-
-            # time out dead nodes
-            for node_id, time in self.state.last_seen.items():
-                now = datetime.now(tz=timezone.utc)
-                if now - time > timedelta(seconds=30):
-                    logger.info(f"Manually removing node {node_id} due to inactivity")
-                    await self.event_sender.send(NodeTimedOut(node_id=node_id))
-
-            await anyio.sleep(10)
+            for pm in self._process_managers:
+                events = await pm.reconcile(self.state)
+                for event in events:
+                    await self._apply_and_broadcast(event)
+            await anyio.sleep(1)

    async def _event_processor(self) -> None:
        with self.local_event_receiver as local_events:
@@ -396,38 +489,16 @@ class Master:
                        await self._handle_traces_collected(event)
                        continue

-                    logger.debug(f"Master indexing event: {str(event)[:100]}")
-                    indexed = IndexedEvent(event=event, idx=len(self._event_log))
-                    self.state = apply(self.state, indexed)
-
-                    event._master_time_stamp = datetime.now(tz=timezone.utc)  # pyright: ignore[reportPrivateUsage]
                    if isinstance(event, NodeGatheredInfo):
                        event.when = str(datetime.now(tz=timezone.utc))

-                    self._event_log.append(event)
-                    await self._send_event(indexed)
-
-    async def _loopback_processor(self) -> None:
-        # this would ideally not be necessary.
-        # this is WAY less hacky than how I was working around this before
-        local_index = 0
-        with self._loopback_event_receiver as events:
-            async for event in events:
-                await self._loopback_event_sender.send(
-                    ForwarderEvent(
-                        origin=NodeId(f"master_{self.node_id}"),
-                        origin_idx=local_index,
-                        session=self.session_id,
-                        event=event,
-                    )
-                )
-                local_index += 1
+                    await self._apply_and_broadcast(event)

    # This function is re-entrant, take care!
    async def _send_event(self, event: IndexedEvent):
        # Convenience method since this line is ugly
        await self.global_event_sender.send(
-            ForwarderEvent(
+            GlobalForwarderEvent(
                origin=self.node_id,
                origin_idx=event.idx,
                session=self.session_id,
@@ -453,7 +524,7 @@ class Master:
        for trace_data in self._pending_traces[task_id].values():
            all_trace_data.extend(trace_data)

-        await self.event_sender.send(
+        await self._apply_and_broadcast(
            TracesMerged(task_id=task_id, traces=all_trace_data)
        )

--- a/src/exo/master/process_managers/init.py
+++ b/src/exo/master/process_managers/init.py
@@ -0,0 +1,12 @@
+from collections.abc import Sequence
+from typing import Protocol, runtime_checkable
+
+from exo.shared.types.events import Event
+from exo.shared.types.state import State
+
+
+@runtime_checkable
+class ProcessManager(Protocol):
+    """A reconciliation step that examines state and returns corrective events."""
+
+    async def reconcile(self, state: State) -> Sequence[Event]: ...
--- a/src/exo/master/process_managers/instance_health.py
+++ b/src/exo/master/process_managers/instance_health.py
@@ -0,0 +1,62 @@
+from collections.abc import Sequence
+from typing import final
+
+from loguru import logger
+
+from exo.master.reconcile import instance_connections_healthy, instance_runners_failed
+from exo.shared.types.events import Event, InstanceDeleted, InstanceRetrying
+from exo.shared.types.state import State
+
+MAX_INSTANCE_RETRIES = 3
+
+
+@final
+class InstanceHealthReconciler:
+    """Emit InstanceDeleted for broken or failed instances, or InstanceRetrying while the backing MetaInstance has retries left."""
+
+    async def reconcile(self, state: State) -> Sequence[Event]:
+        events: list[Event] = []
+        for instance_id, instance in state.instances.items():
+            if not instance_connections_healthy(instance, state.topology):
+                events.append(
+                    InstanceDeleted(
+                        instance_id=instance_id,
+                        failure_error="Network connection lost",
+                    )
+                )
+                continue  # unhealthy connections lead straight to deletion; the retry path below is skipped
+
+            is_failed, error_message = instance_runners_failed(
+                instance, state.runners, state.node_identities
+            )
+            if is_failed:
+                # Retry only while a backing MetaInstance exists and is below MAX_INSTANCE_RETRIES
+                mid = instance.meta_instance_id
+                mi = state.meta_instances.get(mid) if mid else None
+                if mid and mi and mi.consecutive_failures < MAX_INSTANCE_RETRIES:
+                    logger.info(
+                        f"Instance {instance_id} failed (attempt"
+                        f" {mi.consecutive_failures + 1}/{MAX_INSTANCE_RETRIES}),"
+                        f" retrying: {error_message}"
+                    )
+                    events.append(
+                        InstanceRetrying(
+                            instance_id=instance_id,
+                            meta_instance_id=mid,
+                            failure_error=error_message or "Runner failed",
+                        )
+                    )
+                else:  # no backing MetaInstance found, or its consecutive_failures reached the limit
+                    if mid and mi:
+                        logger.warning(
+                            f"Instance {instance_id} exceeded retry limit"
+                            f" ({MAX_INSTANCE_RETRIES}), deleting:"
+                            f" {error_message}"
+                        )
+                    events.append(
+                        InstanceDeleted(
+                            instance_id=instance_id,
+                            failure_error=error_message,
+                        )
+                    )
+        return events
--- a/src/exo/master/process_managers/meta_instance.py
+++ b/src/exo/master/process_managers/meta_instance.py
@@ -0,0 +1,92 @@
+from collections.abc import Sequence
+from typing import final
+
+import anyio
+from loguru import logger
+
+from exo.master.reconcile import (
+    find_unsatisfied_meta_instances,
+    try_place_for_meta_instance,
+)
+from exo.shared.models.model_cards import ModelCard
+from exo.shared.types.events import Event, InstanceCreated, MetaInstancePlacementFailed
+from exo.shared.types.state import State
+from exo.shared.types.worker.instances import Instance, InstanceId
+
+MODEL_CARD_LOAD_TIMEOUT_SECONDS = 10
+
+
+@final
+class MetaInstanceReconciler:
+    """Place instances for unsatisfied MetaInstances."""
+
+    async def reconcile(self, state: State) -> Sequence[Event]:
+        all_events: list[Event] = []
+        # Local copy for intermediate tracking — so placement of B
+        # sees A's instance and doesn't double-place on same resources.
+        current_instances: dict[InstanceId, Instance] = dict(state.instances)
+
+        unsatisfied = find_unsatisfied_meta_instances(
+            state.meta_instances,
+            current_instances,
+            state.topology,
+        )
+        for meta_instance in unsatisfied:
+            try:
+                with anyio.fail_after(MODEL_CARD_LOAD_TIMEOUT_SECONDS):
+                    model_card = await ModelCard.load(meta_instance.model_id)
+            except TimeoutError:
+                logger.warning(
+                    f"ModelCard.load timed out for {meta_instance.model_id}, skipping this cycle"
+                )
+                continue
+            except Exception as exc:
+                logger.warning(
+                    f"ModelCard.load failed for {meta_instance.model_id}: {exc}"
+                )
+                error = f"Failed to load model card: {exc}"
+                if meta_instance.placement_error != error:
+                    all_events.append(
+                        MetaInstancePlacementFailed(
+                            meta_instance_id=meta_instance.meta_instance_id,
+                            reason=error,
+                        )
+                    )
+                continue
+
+            result = try_place_for_meta_instance(
+                meta_instance,
+                model_card,
+                state.topology,
+                current_instances,
+                state.node_memory,
+                state.node_network,
+                state.tasks,
+            )
+            # Update local instance map so next placement sees this one
+            for event in result.events:
+                if isinstance(event, InstanceCreated):
+                    logger.info(
+                        f"MetaInstance reconciler placed instance"
+                        f" {event.instance.instance_id} for"
+                        f" {meta_instance.model_id}"
+                    )
+                    current_instances[event.instance.instance_id] = event.instance
+            all_events.extend(result.events)
+
+            # Emit placement failure if error differs from what's already in state
+            if (
+                result.error is not None
+                and meta_instance.placement_error != result.error
+            ):
+                logger.warning(
+                    f"MetaInstance placement failed for"
+                    f" {meta_instance.model_id}: {result.error}"
+                )
+                all_events.append(
+                    MetaInstancePlacementFailed(
+                        meta_instance_id=meta_instance.meta_instance_id,
+                        reason=result.error,
+                    )
+                )
+        return all_events
--- a/src/exo/master/process_managers/node_timeout.py
+++ b/src/exo/master/process_managers/node_timeout.py
@@ -0,0 +1,27 @@
+from collections.abc import Sequence
+from datetime import datetime, timedelta, timezone
+from typing import final
+
+from loguru import logger
+
+from exo.shared.types.events import Event, NodeTimedOut
+from exo.shared.types.state import State
+
+_DEFAULT_TIMEOUT = timedelta(seconds=30)
+
+
+@final
+class NodeTimeoutReconciler:
+    """Emit NodeTimedOut for each node whose last-seen timestamp is older than ``timeout``."""
+
+    def __init__(self, timeout: timedelta = _DEFAULT_TIMEOUT) -> None:
+        self.timeout = timeout
+
+    async def reconcile(self, state: State) -> Sequence[Event]:
+        now = datetime.now(tz=timezone.utc)
+        # Collect the stale nodes first, then log them and turn them into events.
+        last_seen = state.last_seen
+        stale = [nid for nid, seen in last_seen.items() if now - seen > self.timeout]
+        for node_id in stale:
+            logger.info(f"Removing node {node_id} due to inactivity")
+        return [NodeTimedOut(node_id=node_id) for node_id in stale]
--- a/src/exo/master/reconcile.py
+++ b/src/exo/master/reconcile.py
@@ -0,0 +1,244 @@
+from collections.abc import Mapping, Sequence
+from typing import NamedTuple
+
+from loguru import logger
+
+from exo.master.placement import get_transition_events, place_instance
+from exo.shared.models.model_cards import ModelCard
+from exo.shared.topology import Topology
+from exo.shared.types.commands import PlaceInstance
+from exo.shared.types.common import MetaInstanceId, NodeId
+from exo.shared.types.events import Event
+from exo.shared.types.meta_instance import MetaInstance
+from exo.shared.types.profiling import MemoryUsage, NodeIdentity, NodeNetworkInfo
+from exo.shared.types.tasks import Task, TaskId
+from exo.shared.types.topology import RDMAConnection, SocketConnection
+from exo.shared.types.worker.instances import (
+    BaseInstance,
+    Instance,
+    InstanceId,
+    MlxJacclInstance,
+    MlxRingInstance,
+)
+from exo.shared.types.worker.runners import (
+    RunnerFailed,
+    RunnerId,
+    RunnerShutdown,
+    RunnerStatus,
+)
+
+
+class PlacementResult(NamedTuple):
+    """Result of a placement attempt: events to apply and optional error reason."""
+
+    events: Sequence[Event]
+    error: str | None
+
+
+def _get_ring_order(instance: BaseInstance) -> list[NodeId]:
+    """Return the instance's node ids ordered by ascending shard ``device_rank``."""
+    sa = instance.shard_assignments
+
+    def rank_of(node_id: NodeId) -> int:
+        return sa.runner_to_shard[sa.node_to_runner[node_id]].device_rank
+
+    return sorted(sa.node_to_runner, key=rank_of)
+
+
+def _ring_connections_healthy(instance: MlxRingInstance, topology: Topology) -> bool:
+    """Check that the specific IPs used by a ring instance still exist in the topology."""
+    ring = _get_ring_order(instance)
+    n = len(ring)
+    for node in ring:
+        hosts = instance.hosts_by_node[node]
+        for idx in range(n):
+            host = hosts[idx]
+            if host.ip in ("0.0.0.0", "198.51.100.1"):
+                continue  # self or placeholder
+            # Real connection: node → ring[idx]. Check specific IP.
+            connections = topology.get_all_connections_between(node, ring[idx])
+            if not any(
+                isinstance(c, SocketConnection)
+                and c.sink_multiaddr.ip_address == host.ip
+                for c in connections
+            ):
+                return False
+    return True
+
+
+def _jaccl_connections_healthy(instance: MlxJacclInstance, topology: Topology) -> bool:
+    """Check that the specific RDMA interfaces used by a JACCL instance still exist."""
+    ring = _get_ring_order(instance)
+    n = len(ring)
+    for i in range(n):
+        for j in range(n):
+            iface = instance.jaccl_devices[i][j]
+            if iface is None:
+                continue
+            connections = topology.get_all_connections_between(ring[i], ring[j])
+            if not any(
+                isinstance(c, RDMAConnection) and c.source_rdma_iface == iface
+                for c in connections
+            ):
+                return False
+    return True
+
+
+def instance_connections_healthy(instance: Instance, topology: Topology) -> bool:
+    """Check that an instance's nodes and specific connections are still in the topology."""
+    instance_nodes = set(instance.shard_assignments.node_to_runner.keys())
+    if not all(topology.contains_node(n) for n in instance_nodes):
+        return False
+    if len(instance_nodes) <= 1:
+        return True
+    match instance:
+        case MlxRingInstance():
+            return _ring_connections_healthy(instance, topology)
+        case MlxJacclInstance():
+            return _jaccl_connections_healthy(instance, topology)
+
+
+def instance_runners_failed(
+    instance: Instance,
+    runners: Mapping[RunnerId, RunnerStatus],
+    node_identities: Mapping[NodeId, NodeIdentity],
+) -> tuple[bool, str | None]:
+    """Check if an instance's runners have all reached terminal failure states.
+
+    Returns ``(True, error_message)`` when ALL runners are terminal
+    (``RunnerFailed`` or ``RunnerShutdown``) and at least one is ``RunnerFailed``.
+    Messages are joined in ``node_to_runner`` order, so the text is stable.
+
+    Returns ``(False, None)`` when runners are still active, haven't reported
+    yet, or all gracefully shut down (no ``RunnerFailed``).
+    """
+    # Walk the node -> runner mapping itself rather than a set of runner ids:
+    # set iteration order is not guaranteed, which could reshuffle the joined
+    # message, and the mapping already gives the owning node for each runner.
+    node_to_runner = instance.shard_assignments.node_to_runner
+
+    if not node_to_runner:
+        return False, None
+
+    has_any_failed = False
+    error_messages: list[str] = []
+    checked: set[RunnerId] = set()
+    for node_id, runner_id in node_to_runner.items():
+        if runner_id in checked:
+            continue  # evaluate each runner id once, as the set-based loop did
+        checked.add(runner_id)
+        status = runners.get(runner_id)
+        if status is None:
+            # Runner hasn't reported yet — instance is still starting
+            return False, None
+        if isinstance(status, RunnerFailed):
+            has_any_failed = True
+            if status.error_message:
+                name = (
+                    node_identities[node_id].friendly_name
+                    if node_id and node_id in node_identities
+                    else node_id or "unknown"
+                )
+                error_messages.append(f"{name}: {status.error_message}")
+        elif isinstance(status, RunnerShutdown):
+            pass  # Terminal but not a failure indicator on its own
+        else:
+            # Runner is still active (connecting, loading, running, etc.)
+            return False, None
+
+    if has_any_failed:
+        return True, "; ".join(error_messages) if error_messages else "Runner failed"
+
+    # All runners are Shutdown but none Failed — graceful shutdown, not a failure
+    return False, None
+
+
+def instance_satisfies_meta_instance(
+    meta_instance: MetaInstance,
+    instance: Instance,
+) -> bool:
+    """Tell whether ``instance`` meets the constraints declared by ``meta_instance``.
+
+    Only the model id, ``min_nodes`` and required ``node_ids`` are compared;
+    topology health is not considered (see ``instance_connections_healthy``).
+    """
+    assignments = instance.shard_assignments
+    if assignments.model_id != meta_instance.model_id:
+        return False
+
+    placed_on = set(assignments.node_to_runner)
+    if len(placed_on) < meta_instance.min_nodes:
+        return False
+
+    required = meta_instance.node_ids
+    # No pinned nodes means any set of nodes is acceptable.
+    return required is None or set(required) <= placed_on
+
+
+def find_unsatisfied_meta_instances(
+    meta_instances: Mapping[MetaInstanceId, MetaInstance],
+    instances: Mapping[InstanceId, Instance],
+    topology: Topology,
+) -> Sequence[MetaInstance]:
+    """List every meta-instance lacking a backing instance with healthy connections."""
+
+    def has_healthy_backing(meta_id: MetaInstanceId) -> bool:
+        return any(
+            candidate.meta_instance_id == meta_id
+            and instance_connections_healthy(candidate, topology)
+            for candidate in instances.values()
+        )
+
+    # The result keeps the iteration order of ``meta_instances``.
+    return [mi for mid, mi in meta_instances.items() if not has_healthy_backing(mid)]
+
+
+def try_place_for_meta_instance(
+    meta_instance: MetaInstance,
+    model_card: ModelCard,
+    topology: Topology,
+    current_instances: Mapping[InstanceId, Instance],
+    node_memory: Mapping[NodeId, MemoryUsage],
+    node_network: Mapping[NodeId, NodeNetworkInfo],
+    tasks: Mapping[TaskId, Task] | None = None,
+) -> PlacementResult:
+    """Try to place an instance satisfying the meta-instance constraints.
+
+    Returns a :class:`PlacementResult` with events on success, or an error
+    reason on failure.
+    """
+    command = PlaceInstance(
+        model_card=model_card,
+        sharding=meta_instance.sharding,
+        instance_meta=meta_instance.instance_meta,
+        min_nodes=meta_instance.min_nodes,
+    )
+    try:
+        target_instances = place_instance(
+            command,
+            topology,
+            current_instances,
+            node_memory,
+            node_network,
+            required_nodes=(
+                set(meta_instance.node_ids) if meta_instance.node_ids else None
+            ),
+        )
+        # Tag the new instance with meta_instance_id
+        new_instance_ids = set(target_instances.keys()) - set(current_instances.keys())
+        if new_instance_ids:
+            new_id = next(iter(new_instance_ids))
+            target_instances[new_id] = target_instances[new_id].model_copy(
+                update={"meta_instance_id": meta_instance.meta_instance_id}
+            )
+        return PlacementResult(
+            events=list(
+                get_transition_events(current_instances, target_instances, tasks or {})
+            ),
+            error=None,
+        )
+    except ValueError as e:
+        logger.debug(
+            f"MetaInstance placement not possible for {meta_instance.model_id}: {e}"
+        )
+        return PlacementResult(events=[], error=str(e))
--- a/src/exo/master/tests/test_master.py
+++ b/src/exo/master/tests/test_master.py
@@ -15,11 +15,12 @@ from exo.shared.types.commands import (
    PlaceInstance,
    TextGeneration,
 )
-from exo.shared.types.common import ModelId, NodeId, SessionId
+from exo.shared.types.common import ModelId, NodeId, SessionId, SystemId
 from exo.shared.types.events import (
-    ForwarderEvent,
+    GlobalForwarderEvent,
    IndexedEvent,
    InstanceCreated,
+    LocalForwarderEvent,
    NodeGatheredInfo,
    TaskCreated,
 )
@@ -45,9 +46,9 @@ async def test_master():
    node_id = NodeId(keypair.to_node_id())
    session_id = SessionId(master_node_id=node_id, election_clock=0)

-    ge_sender, global_event_receiver = channel[ForwarderEvent]()
+    ge_sender, global_event_receiver = channel[GlobalForwarderEvent]()
    command_sender, co_receiver = channel[ForwarderCommand]()
-    local_event_sender, le_receiver = channel[ForwarderEvent]()
+    local_event_sender, le_receiver = channel[LocalForwarderEvent]()
    fcds, _fcdr = channel[ForwarderDownloadCommand]()

    all_events: list[IndexedEvent] = []
@@ -75,13 +76,12 @@ async def test_master():
    async with anyio.create_task_group() as tg:
        tg.start_soon(master.run)

-        sender_node_id = NodeId(f"{keypair.to_node_id()}_sender")
        # inject a NodeGatheredInfo event
        logger.info("inject a NodeGatheredInfo event")
        await local_event_sender.send(
-            ForwarderEvent(
+            LocalForwarderEvent(
                origin_idx=0,
-                origin=sender_node_id,
+                origin=SystemId("Worker"),
                session=session_id,
                event=(
                    NodeGatheredInfo(
@@ -108,7 +108,7 @@ async def test_master():
        logger.info("inject a CreateInstance Command")
        await command_sender.send(
            ForwarderCommand(
-                origin=node_id,
+                origin=SystemId("API"),
                command=(
                    PlaceInstance(
                        command_id=CommandId(),
@@ -133,7 +133,7 @@ async def test_master():
        logger.info("inject a TextGeneration Command")
        await command_sender.send(
            ForwarderCommand(
-                origin=node_id,
+                origin=SystemId("API"),
                command=(
                    TextGeneration(
                        command_id=CommandId(),
--- a/src/exo/master/tests/test_meta_instance_edge_cases.py
+++ b/src/exo/master/tests/test_meta_instance_edge_cases.py
@@ -0,0 +1,778 @@
+"""Edge-case and regression tests for MetaInstance lifecycle, concurrent operations, and error handling."""
+
+import pytest
+
+from exo.master.process_managers.instance_health import (
+    MAX_INSTANCE_RETRIES,
+    InstanceHealthReconciler,
+)
+from exo.master.process_managers.meta_instance import MetaInstanceReconciler
+from exo.master.reconcile import (
+    find_unsatisfied_meta_instances,
+    instance_connections_healthy,
+    instance_runners_failed,
+    instance_satisfies_meta_instance,
+)
+from exo.shared.apply import apply
+from exo.shared.models.model_cards import ModelCard, ModelId, ModelTask
+from exo.shared.topology import Topology
+from exo.shared.types.common import Host, MetaInstanceId, NodeId
+from exo.shared.types.events import (
+    IndexedEvent,
+    InstanceCreated,
+    InstanceDeleted,
+    InstanceRetrying,
+    MetaInstanceCreated,
+    MetaInstanceDeleted,
+    MetaInstancePlacementFailed,
+    TaskStatusUpdated,
+)
+from exo.shared.types.memory import Memory
+from exo.shared.types.meta_instance import MetaInstance
+from exo.shared.types.multiaddr import Multiaddr
+from exo.shared.types.profiling import NodeIdentity
+from exo.shared.types.state import State
+from exo.shared.types.tasks import LoadModel, TaskId, TaskStatus
+from exo.shared.types.topology import Connection, SocketConnection
+from exo.shared.types.worker.instances import (
+    InstanceId,
+    MlxRingInstance,
+)
+from exo.shared.types.worker.runners import (
+    RunnerFailed,
+    RunnerId,
+    RunnerReady,
+    ShardAssignments,
+)
+from exo.shared.types.worker.shards import PipelineShardMetadata
+
+# --- Helpers (copied from test_reconcile.py for independence) ---
+
+
+def _model_card(model_id: str = "test-org/test-model") -> ModelCard:
+    return ModelCard(
+        model_id=ModelId(model_id),
+        storage_size=Memory.from_kb(1000),
+        n_layers=10,
+        hidden_size=30,
+        supports_tensor=True,
+        tasks=[ModelTask.TextGeneration],
+    )
+
+
+def _topology(*node_ids: str, connect: bool = True) -> Topology:
+    t = Topology()
+    nodes = [NodeId(n) for n in node_ids]
+    for n in nodes:
+        t.add_node(n)
+    if connect and len(nodes) > 1:
+        for i in range(len(nodes)):
+            j = (i + 1) % len(nodes)
+            t.add_connection(
+                Connection(
+                    source=nodes[i],
+                    sink=nodes[j],
+                    edge=SocketConnection(
+                        sink_multiaddr=Multiaddr(
+                            address=f"/ip4/10.0.0.{j + 1}/tcp/50000"
+                        )
+                    ),
+                )
+            )
+            t.add_connection(
+                Connection(
+                    source=nodes[j],
+                    sink=nodes[i],
+                    edge=SocketConnection(
+                        sink_multiaddr=Multiaddr(
+                            address=f"/ip4/10.0.0.{i + 1}/tcp/50000"
+                        )
+                    ),
+                )
+            )
+    return t
+
+
+def _meta_instance(
+    model_id: str = "test-org/test-model",
+    *,
+    min_nodes: int = 1,
+    node_ids: list[NodeId] | None = None,
+    meta_instance_id: MetaInstanceId | None = None,
+    consecutive_failures: int = 0,
+    last_failure_error: str | None = None,
+    placement_error: str | None = None,
+) -> MetaInstance:
+    return MetaInstance(
+        meta_instance_id=meta_instance_id or MetaInstanceId(),
+        model_id=ModelId(model_id),
+        min_nodes=min_nodes,
+        node_ids=node_ids,
+        consecutive_failures=consecutive_failures,
+        last_failure_error=last_failure_error,
+        placement_error=placement_error,
+    )
+
+
+def _instance(
+    model_id: str = "test-org/test-model",
+    node_ids: list[str] | None = None,
+    instance_id: InstanceId | None = None,
+    meta_instance_id: MetaInstanceId | None = None,
+) -> tuple[InstanceId, MlxRingInstance]:
+    iid = instance_id or InstanceId()
+    nodes = node_ids or ["node-a"]
+    n = len(nodes)
+    mc = _model_card(model_id)
+    ephemeral_port = 50000
+    node_to_runner = {NodeId(nd): RunnerId() for nd in nodes}
+    runner_to_shard = {
+        runner_id: PipelineShardMetadata(
+            model_card=mc,
+            device_rank=i,
+            world_size=n,
+            start_layer=0,
+            end_layer=mc.n_layers,
+            n_layers=mc.n_layers,
+        )
+        for i, runner_id in enumerate(node_to_runner.values())
+    }
+    hosts_by_node: dict[NodeId, list[Host]] = {}
+    for r, node_str in enumerate(nodes):
+        hosts: list[Host] = []
+        for idx in range(n):
+            if idx == r:
+                hosts.append(Host(ip="0.0.0.0", port=ephemeral_port))
+            elif n > 1 and idx in ((r - 1) % n, (r + 1) % n):
+                hosts.append(Host(ip=f"10.0.0.{idx + 1}", port=ephemeral_port))
+            else:
+                hosts.append(Host(ip="198.51.100.1", port=0))
+        hosts_by_node[NodeId(node_str)] = hosts
+    return iid, MlxRingInstance(
+        instance_id=iid,
+        shard_assignments=ShardAssignments(
+            model_id=ModelId(model_id),
+            runner_to_shard=runner_to_shard,
+            node_to_runner=node_to_runner,
+        ),
+        hosts_by_node=hosts_by_node,
+        ephemeral_port=ephemeral_port,
+        meta_instance_id=meta_instance_id,
+    )
+
+
+# =============================================================================
+# 1. MetaInstance lifecycle edge cases
+# =============================================================================
+
+
+def test_meta_instance_model_is_frozen():
+    """MetaInstance should be immutable (frozen model): assigning a field must raise."""
+    meta = _meta_instance()
+    try:
+        meta.model_id = ModelId("something-else")
+    except Exception:
+        return  # Expected — frozen model
+    raise AssertionError("Should have raised")  # outside the try: not swallowed
+
+
+def test_meta_instance_created_then_deleted_roundtrip():
+    """Create and delete a MetaInstance through apply — state should be clean."""
+    state = State()
+    meta = _meta_instance()
+    state = apply(
+        state, IndexedEvent(idx=0, event=MetaInstanceCreated(meta_instance=meta))
+    )
+    assert meta.meta_instance_id in state.meta_instances
+    state = apply(
+        state,
+        IndexedEvent(
+            idx=1, event=MetaInstanceDeleted(meta_instance_id=meta.meta_instance_id)
+        ),
+    )
+    assert meta.meta_instance_id not in state.meta_instances
+    assert len(state.meta_instances) == 0
+
+
+def test_delete_nonexistent_meta_instance_is_safe():
+    """Deleting a MetaInstance that doesn't exist should not crash."""
+    state = State()
+    event = MetaInstanceDeleted(meta_instance_id=MetaInstanceId("nonexistent"))
+    new_state = apply(state, IndexedEvent(idx=0, event=event))
+    assert len(new_state.meta_instances) == 0
+
+
+def test_placement_failed_for_nonexistent_meta_instance_is_safe():
+    """MetaInstancePlacementFailed for unknown ID should not crash."""
+    state = State()
+    event = MetaInstancePlacementFailed(
+        meta_instance_id=MetaInstanceId("nonexistent"),
+        reason="test",
+    )
+    new_state = apply(state, IndexedEvent(idx=0, event=event))
+    assert len(new_state.meta_instances) == 0
+
+
+def test_multiple_meta_instances_for_same_model():
+    """Multiple MetaInstances for the same model are tracked independently."""
+    state = State()
+    meta_a = _meta_instance("test-org/model-x")
+    meta_b = _meta_instance("test-org/model-x")
+    state = apply(
+        state, IndexedEvent(idx=0, event=MetaInstanceCreated(meta_instance=meta_a))
+    )
+    state = apply(
+        state, IndexedEvent(idx=1, event=MetaInstanceCreated(meta_instance=meta_b))
+    )
+    assert len(state.meta_instances) == 2
+    assert meta_a.meta_instance_id in state.meta_instances
+    assert meta_b.meta_instance_id in state.meta_instances
+
+
+# =============================================================================
+# 2. Retry logic edge cases
+# =============================================================================
+
+
+def test_retry_counter_resets_on_successful_instance_creation():
+    """When a new instance is created for a meta-instance, failures should reset."""
+    meta = _meta_instance(consecutive_failures=2, last_failure_error="old")
+    _, inst = _instance(node_ids=["node-a"], meta_instance_id=meta.meta_instance_id)
+    state = State(meta_instances={meta.meta_instance_id: meta})
+    state = apply(state, IndexedEvent(idx=0, event=InstanceCreated(instance=inst)))
+    mi = state.meta_instances[meta.meta_instance_id]
+    assert mi.consecutive_failures == 0
+    # last_failure_error is preserved (for UI display)
+    assert mi.last_failure_error == "old"
+
+
+async def test_retry_count_increments_through_full_cycle():
+    """Walk through MAX_INSTANCE_RETRIES worth of retries, then verify delete."""
+    meta = _meta_instance()
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=meta.meta_instance_id)
+    topology = _topology("node-a")
+    state = State(
+        meta_instances={meta.meta_instance_id: meta},
+        instances={iid: inst},
+        topology=topology,
+    )
+
+    runner_ids = list(inst.shard_assignments.node_to_runner.values())
+    reconciler = InstanceHealthReconciler()  # one instance reused for every cycle
+    for i in range(MAX_INSTANCE_RETRIES):
+        # Simulate runners failing
+        state_with_runners = state.model_copy(
+            update={"runners": {runner_ids[0]: RunnerFailed(error_message=f"fail-{i}")}}
+        )
+        events = await reconciler.reconcile(state_with_runners)
+        assert len(events) == 1
+        assert isinstance(events[0], InstanceRetrying), f"iteration {i}"
+        state = apply(state, IndexedEvent(idx=i, event=events[0]))
+
+    # After MAX_INSTANCE_RETRIES retries, failure counter should be at max
+    mi = state.meta_instances[meta.meta_instance_id]
+    assert mi.consecutive_failures == MAX_INSTANCE_RETRIES
+
+    # Next failure should result in deletion
+    state_with_runners = state.model_copy(
+        update={"runners": {runner_ids[0]: RunnerFailed(error_message="final")}}
+    )
+    # The retry budget is spent, so the same reconciler must now delete.
+    events = await reconciler.reconcile(state_with_runners)
+    assert len(events) == 1
+    assert isinstance(events[0], InstanceDeleted)
+
+
+async def test_health_reconciler_respects_exact_limit():
+    """At exactly MAX_INSTANCE_RETRIES, reconciler should delete, not retry."""
+    meta = _meta_instance(consecutive_failures=MAX_INSTANCE_RETRIES)
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=meta.meta_instance_id)
+    runner_ids = list(inst.shard_assignments.node_to_runner.values())
+    state = State(
+        meta_instances={meta.meta_instance_id: meta},
+        instances={iid: inst},
+        runners={runner_ids[0]: RunnerFailed(error_message="OOM")},
+        topology=_topology("node-a"),
+    )
+    reconciler = InstanceHealthReconciler()
+    events = await reconciler.reconcile(state)
+    assert len(events) == 1
+    assert isinstance(events[0], InstanceDeleted)
+
+
+async def test_health_reconciler_at_limit_minus_one_retries():
+    """One failure short of MAX_INSTANCE_RETRIES, a retry event is still emitted."""
+    nearly_exhausted = _meta_instance(consecutive_failures=MAX_INSTANCE_RETRIES - 1)
+    instance_id, instance = _instance(
+        node_ids=["node-a"], meta_instance_id=nearly_exhausted.meta_instance_id
+    )
+    failed_runner = list(instance.shard_assignments.node_to_runner.values())[0]
+    state = State(
+        meta_instances={nearly_exhausted.meta_instance_id: nearly_exhausted},
+        instances={instance_id: instance},
+        runners={failed_runner: RunnerFailed(error_message="OOM")},
+        topology=_topology("node-a"),
+    )
+    emitted = await InstanceHealthReconciler().reconcile(state)
+    # Exactly one event, and it must be a retry rather than a deletion.
+    assert len(emitted) == 1
+    assert isinstance(emitted[0], InstanceRetrying)
+
+
+# =============================================================================
+# 3. Error handling edge cases
+# =============================================================================
+
+
+def test_runners_failed_with_empty_error_message():
+    """An empty error_message still counts as failed and yields the fallback text."""
+    _, instance = _instance(node_ids=["node-a"])
+    assigned_runners = instance.shard_assignments.node_to_runner.values()
+    runner_states = {
+        runner_id: RunnerFailed(error_message="") for runner_id in assigned_runners
+    }
+    failed, message = instance_runners_failed(instance, runner_states, {})
+    assert failed is True
+    # No message text was supplied, so the generic fallback string is expected.
+    assert message == "Runner failed"
+
+
+def test_runners_failed_with_none_error_message():
+    """A None error_message still counts as failed and yields the fallback text."""
+    _, instance = _instance(node_ids=["node-a"])
+    assigned_runners = instance.shard_assignments.node_to_runner.values()
+    runner_states = {
+        runner_id: RunnerFailed(error_message=None) for runner_id in assigned_runners
+    }
+    failed, message = instance_runners_failed(instance, runner_states, {})
+    assert failed is True
+    assert message == "Runner failed"
+
+
+def test_runners_failed_collects_all_error_messages():
+    """Every failed runner's message must show up in the combined error string."""
+    _, instance = _instance(node_ids=["node-a", "node-b", "node-c"])
+    first, second, third = list(instance.shard_assignments.node_to_runner.values())[:3]
+    runner_states = {
+        first: RunnerFailed(error_message="OOM on GPU 0"),
+        second: RunnerFailed(error_message="OOM on GPU 1"),
+        third: RunnerFailed(error_message="OOM on GPU 2"),
+    }
+    failed, message = instance_runners_failed(instance, runner_states, {})
+    assert failed is True
+    assert message is not None
+    # Each individual runner message has to appear in the combined string.
+    for expected in ("OOM on GPU 0", "OOM on GPU 1", "OOM on GPU 2"):
+        assert expected in message
+
+
+def test_runners_failed_includes_friendly_name():
+    """When a NodeIdentity is supplied, its friendly_name appears in the error."""
+    _, instance = _instance(node_ids=["node-a"])
+    known_node = NodeId("node-a")
+    failed_runner = list(instance.shard_assignments.node_to_runner.values())[0]
+    runner_states = {failed_runner: RunnerFailed(error_message="OOM")}
+    node_identities = {known_node: NodeIdentity(friendly_name="My Mac Studio")}
+    failed, message = instance_runners_failed(
+        instance, runner_states, node_identities
+    )
+    assert failed is True
+    assert message is not None
+    assert "My Mac Studio" in message
+
+
+def test_instance_retrying_for_missing_instance_is_safe():
+    """Applying InstanceRetrying for an unknown instance must not raise.
+
+    NOTE: This test pins the current behaviour that a retry event whose
+    instance is absent from state leaves the MetaInstance failure counter
+    untouched, i.e. stale retry events for already-deleted instances are
+    dropped silently. The original author considered this acceptable because
+    the InstanceDeleted handler already increments failures.
+    """
+    tracked = _meta_instance()
+    stale_retry = InstanceRetrying(
+        instance_id=InstanceId("nonexistent"),
+        meta_instance_id=tracked.meta_instance_id,
+        failure_error="crash",
+    )
+    before = State(meta_instances={tracked.meta_instance_id: tracked})
+    after = apply(before, IndexedEvent(idx=0, event=stale_retry))
+    # No crash, and the counter stays at zero because nothing was incremented.
+    assert after.meta_instances[tracked.meta_instance_id].consecutive_failures == 0
+
+
+# =============================================================================
+# 4. Backward compatibility
+# =============================================================================
+
+
+def test_instance_without_meta_instance_id_works():
+    """An instance built without a meta_instance_id still passes the health check."""
+    unbound = _instance(node_ids=["node-a"])[1]
+    assert unbound.meta_instance_id is None
+    single_node_topology = _topology("node-a")
+    assert instance_connections_healthy(unbound, single_node_topology) is True
+
+
+def test_instance_deleted_without_meta_does_not_affect_meta_instances():
+    """Deleting an unbound instance leaves an unrelated MetaInstance's counter alone."""
+    bystander = _meta_instance()
+    # The instance is created without a meta_instance_id, so it is bound to nothing.
+    instance_id, unbound = _instance(node_ids=["node-a"])
+    before = State(
+        meta_instances={bystander.meta_instance_id: bystander},
+        instances={instance_id: unbound},
+    )
+    deletion = InstanceDeleted(instance_id=instance_id, failure_error="crash")
+    after = apply(before, IndexedEvent(idx=0, event=deletion))
+    # Counter is unchanged.
+    assert after.meta_instances[bystander.meta_instance_id].consecutive_failures == 0
+
+
+def test_satisfies_ignores_meta_instance_id_binding():
+    """An unbound instance with matching constraints still satisfies the meta."""
+    wanted = _meta_instance()
+    # Built without a meta_instance_id; only the constraints can match here.
+    unbound = _instance(node_ids=["node-a"])[1]
+    assert instance_satisfies_meta_instance(wanted, unbound) is True
+
+
+def test_find_unsatisfied_uses_binding_not_constraints():
+    """A constraint-matching but unbound instance does not satisfy the meta-instance."""
+    wanted = _meta_instance()
+    # Same default model/node as the meta-instance, but no meta_instance_id is set.
+    instance_id, unbound = _instance(node_ids=["node-a"])
+    metas = {wanted.meta_instance_id: wanted}
+    instances = {instance_id: unbound}
+    unsatisfied = find_unsatisfied_meta_instances(
+        metas, instances, _topology("node-a")
+    )
+    # Reported as unsatisfied because nothing is bound to this meta-instance.
+    assert list(unsatisfied) == [wanted]
+
+
+# =============================================================================
+# 5. Concurrent / multi-instance scenarios
+# =============================================================================
+
+
+async def test_health_reconciler_handles_multiple_failing_instances():
+    """Two instances failing in the same pass each produce their own retry event."""
+    meta_a = _meta_instance()
+    meta_b = _meta_instance()
+    iid_a, inst_a = _instance(
+        node_ids=["node-a"], meta_instance_id=meta_a.meta_instance_id
+    )
+    iid_b, inst_b = _instance(
+        node_ids=["node-b"], meta_instance_id=meta_b.meta_instance_id
+    )
+    runner_a = list(inst_a.shard_assignments.node_to_runner.values())[0]
+    runner_b = list(inst_b.shard_assignments.node_to_runner.values())[0]
+    metas = {meta_a.meta_instance_id: meta_a, meta_b.meta_instance_id: meta_b}
+    failed_runners = {
+        runner_a: RunnerFailed(error_message="OOM"),
+        runner_b: RunnerFailed(error_message="OOM"),
+    }
+    state = State(
+        meta_instances=metas,
+        instances={iid_a: inst_a, iid_b: inst_b},
+        runners=failed_runners,
+        topology=_topology("node-a", "node-b"),
+    )
+    events = await InstanceHealthReconciler().reconcile(state)
+    assert len(events) == 2
+    # Both metas start with a zero failure counter, so each failure is a retry.
+    retried: set[InstanceId] = set()
+    for event in events:
+        assert isinstance(event, InstanceRetrying)
+        retried.add(event.instance_id)
+    assert retried == {iid_a, iid_b}
+
+
+async def test_health_reconciler_mixed_healthy_and_failing():
+    """A ready instance yields no event; only the failed one is retried."""
+    ok_meta = _meta_instance()
+    bad_meta = _meta_instance()
+    ok_id, ok_inst = _instance(
+        node_ids=["node-a"], meta_instance_id=ok_meta.meta_instance_id
+    )
+    bad_id, bad_inst = _instance(
+        node_ids=["node-b"], meta_instance_id=bad_meta.meta_instance_id
+    )
+    ok_runner = list(ok_inst.shard_assignments.node_to_runner.values())[0]
+    bad_runner = list(bad_inst.shard_assignments.node_to_runner.values())[0]
+    metas = {ok_meta.meta_instance_id: ok_meta, bad_meta.meta_instance_id: bad_meta}
+    runner_states = {
+        ok_runner: RunnerReady(),
+        bad_runner: RunnerFailed(error_message="crash"),
+    }
+    state = State(
+        meta_instances=metas,
+        instances={ok_id: ok_inst, bad_id: bad_inst},
+        runners=runner_states,
+        topology=_topology("node-a", "node-b"),
+    )
+    events = await InstanceHealthReconciler().reconcile(state)
+    assert len(events) == 1
+    only_event = events[0]
+    assert isinstance(only_event, InstanceRetrying)
+    assert only_event.instance_id == bad_id
+
+
+async def test_meta_instance_reconciler_empty_state():
+    """Reconciling a default-constructed State yields zero events."""
+    empty_state = State()
+    emitted = await MetaInstanceReconciler().reconcile(empty_state)
+    assert len(emitted) == 0
+
+
+# =============================================================================
+# 6. Placement error tracking
+# =============================================================================
+
+
+def test_placement_failed_sets_error():
+    """Applying MetaInstancePlacementFailed stores its reason as placement_error."""
+    target = _meta_instance()
+    failure = MetaInstancePlacementFailed(
+        meta_instance_id=target.meta_instance_id,
+        reason="Not enough memory",
+    )
+    before = State(meta_instances={target.meta_instance_id: target})
+    after = apply(before, IndexedEvent(idx=0, event=failure))
+    updated = after.meta_instances[target.meta_instance_id]
+    assert updated.placement_error == "Not enough memory"
+
+
+def test_instance_created_clears_placement_error():
+    """Applying InstanceCreated for a bound instance resets placement_error to None."""
+    stuck = _meta_instance(placement_error="Not enough memory")
+    bound = _instance(node_ids=["node-a"], meta_instance_id=stuck.meta_instance_id)[1]
+    before = State(meta_instances={stuck.meta_instance_id: stuck})
+    creation = IndexedEvent(idx=0, event=InstanceCreated(instance=bound))
+    after = apply(before, creation)
+    assert after.meta_instances[stuck.meta_instance_id].placement_error is None
+
+
+def test_placement_error_does_not_increment_failures():
+    """A placement failure records placement_error but leaves the counter at zero."""
+    target = _meta_instance()
+    failure = MetaInstancePlacementFailed(
+        meta_instance_id=target.meta_instance_id,
+        reason="No resources",
+    )
+    before = State(meta_instances={target.meta_instance_id: target})
+    after = apply(before, IndexedEvent(idx=0, event=failure))
+    updated = after.meta_instances[target.meta_instance_id]
+    assert updated.consecutive_failures == 0
+    assert updated.placement_error == "No resources"
+
+
+# =============================================================================
+# 7. State serialization roundtrip
+# =============================================================================
+
+
+def test_state_with_meta_instances_serializes():
+    """A State with a MetaInstance and a bound instance survives a JSON round trip."""
+    original_meta = _meta_instance(consecutive_failures=2, last_failure_error="test")
+    meta_id = original_meta.meta_instance_id
+    instance_id, bound = _instance(node_ids=["node-a"], meta_instance_id=meta_id)
+    source_state = State(
+        meta_instances={meta_id: original_meta},
+        instances={instance_id: bound},
+    )
+    # Dump to JSON and parse it straight back.
+    restored = State.model_validate_json(source_state.model_dump_json())
+    assert meta_id in restored.meta_instances
+    restored_meta = restored.meta_instances[meta_id]
+    assert restored_meta.model_id == original_meta.model_id
+    assert restored_meta.consecutive_failures == 2
+    assert restored_meta.last_failure_error == "test"
+    # The instance and its binding to the meta-instance are preserved as well.
+    assert instance_id in restored.instances
+    assert restored.instances[instance_id].meta_instance_id == meta_id
+
+
+# =============================================================================
+# 8. MetaInstanceReconciler error handling
+# =============================================================================
+
+
+async def test_meta_instance_reconciler_model_load_error_emits_placement_failed(
+    monkeypatch: "pytest.MonkeyPatch",
+):
+    """A raising ModelCard.load is surfaced as one MetaInstancePlacementFailed."""
+    import exo.master.process_managers.meta_instance as mi_mod
+
+    pending = _meta_instance()
+    state = State(
+        meta_instances={pending.meta_instance_id: pending},
+        topology=_topology("node-a"),
+    )
+
+    async def _failing_load(_model_id: ModelId) -> ModelCard:
+        raise RuntimeError("Network error")
+
+    # Replace the module's ModelCard with a stand-in exposing only the failing load.
+    stub_card = type("MC", (), {"load": staticmethod(_failing_load)})
+    monkeypatch.setattr(mi_mod, "ModelCard", stub_card)
+
+    events = await MetaInstanceReconciler().reconcile(state)
+
+    failures = [
+        event for event in events if isinstance(event, MetaInstancePlacementFailed)
+    ]
+    assert len(failures) == 1
+    assert "Failed to load model card" in failures[0].reason
+    assert pending.meta_instance_id == failures[0].meta_instance_id
+
+
+async def test_meta_instance_reconciler_model_load_error_skips_dedup(
+    monkeypatch: "pytest.MonkeyPatch",
+):
+    """A load error identical to the stored placement_error emits nothing new."""
+    import exo.master.process_managers.meta_instance as mi_mod
+
+    # The stored error text is what this test expects the reconciler to produce again.
+    already_failed = _meta_instance(
+        placement_error="Failed to load model card: Network error"
+    )
+    state = State(
+        meta_instances={already_failed.meta_instance_id: already_failed},
+        topology=_topology("node-a"),
+    )
+
+    async def _failing_load(_model_id: ModelId) -> ModelCard:
+        raise RuntimeError("Network error")
+
+    stub_card = type("MC", (), {"load": staticmethod(_failing_load)})
+    monkeypatch.setattr(mi_mod, "ModelCard", stub_card)
+
+    events = await MetaInstanceReconciler().reconcile(state)
+
+    # Same error as before, so the duplicate event is suppressed.
+    assert len(events) == 0
+
+
+async def test_meta_instance_reconciler_continues_after_error(
+    monkeypatch: "pytest.MonkeyPatch",
+):
+    """A load failure for one meta-instance must not stop processing of the next.
+
+    Every ModelCard.load call raises here, so the reconciler has to attempt both
+    meta-instances and report a separate placement failure for each of them.
+    """
+    import exo.master.process_managers.meta_instance as mi_mod
+
+    meta_a = _meta_instance(model_id="org/model-a")
+    meta_b = _meta_instance(model_id="org/model-b")
+    topo = _topology("node-a")
+    state = State(
+        meta_instances={
+            meta_a.meta_instance_id: meta_a,
+            meta_b.meta_instance_id: meta_b,
+        },
+        topology=topo,
+    )
+
+    call_count = 0
+
+    async def _load_always_fails(model_id: ModelId) -> ModelCard:
+        # Count every attempt so short-circuiting after the first error is detectable.
+        nonlocal call_count
+        call_count += 1
+        raise RuntimeError(f"Cannot load {model_id}")
+
+    monkeypatch.setattr(
+        mi_mod, "ModelCard", type("MC", (), {"load": staticmethod(_load_always_fails)})
+    )
+
+    reconciler = MetaInstanceReconciler()
+    events = await reconciler.reconcile(state)
+
+    # Both meta-instances should have been attempted (not short-circuited)
+    assert call_count == 2
+    # Both should have placement failed events
+    placement_failed = [e for e in events if isinstance(e, MetaInstancePlacementFailed)]
+    assert len(placement_failed) == 2
+    # One failure per meta-instance, not two failures for the same one.
+    assert {e.meta_instance_id for e in placement_failed} == {
+        meta_a.meta_instance_id,
+        meta_b.meta_instance_id,
+    }
+
+
+# =============================================================================
+# 9. Cascade delete with task cancellation
+# =============================================================================
+
+
+def test_cascade_delete_cancels_active_tasks():
+    """Applying the cascade-delete event sequence leaves the task Cancelled.
+
+    Regression test: previously, cascade-deleting backing instances via
+    DeleteMetaInstance did not emit TaskStatusUpdated(Cancelled) for active
+    tasks, leaving orphaned task references in state.
+    """
+    meta = _meta_instance()
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=meta.meta_instance_id)
+    task_id = TaskId()
+    running_load = LoadModel(
+        task_id=task_id, instance_id=iid, task_status=TaskStatus.Running
+    )
+
+    # Starting point: one meta-instance, its backing instance, and a running task.
+    state = State(
+        meta_instances={meta.meta_instance_id: meta},
+        instances={iid: inst},
+        tasks={task_id: running_load},
+        topology=_topology("node-a"),
+    )
+
+    # Event sequence mirroring what main.py is described as producing:
+    # meta-instance removal, cancellation of the active task, instance removal.
+    cascade = [
+        MetaInstanceDeleted(meta_instance_id=meta.meta_instance_id),
+        TaskStatusUpdated(task_id=task_id, task_status=TaskStatus.Cancelled),
+        InstanceDeleted(instance_id=iid),
+    ]
+    for position, event in enumerate(cascade):
+        state = apply(state, IndexedEvent(idx=position, event=event))
+
+    # Nothing is left behind and the task ended up cancelled.
+    assert len(state.meta_instances) == 0
+    assert len(state.instances) == 0
+    assert state.tasks[task_id].task_status == TaskStatus.Cancelled
+
+
+def test_cascade_delete_skips_completed_tasks():
+    """Only Pending/Running tasks of the instance qualify for cancellation.
+
+    NOTE(review): this test evaluates the selection predicate locally; it does
+    not invoke the cascade-delete code path itself.
+    """
+    meta = _meta_instance()
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=meta.meta_instance_id)
+
+    running_task_id = TaskId()
+    completed_task_id = TaskId()
+    tasks = {
+        running_task_id: LoadModel(
+            task_id=running_task_id, instance_id=iid, task_status=TaskStatus.Running
+        ),
+        completed_task_id: LoadModel(
+            task_id=completed_task_id, instance_id=iid, task_status=TaskStatus.Complete
+        ),
+    }
+
+    state = State(
+        meta_instances={meta.meta_instance_id: meta},
+        instances={iid: inst},
+        tasks=tasks,
+        topology=_topology("node-a"),
+    )
+
+    # Select the tasks on this instance that are still Pending or Running;
+    # the completed task must not be among them.
+    cancellable_states = (TaskStatus.Pending, TaskStatus.Running)
+    active_tasks = []
+    for candidate in state.tasks.values():
+        if candidate.instance_id == iid and candidate.task_status in cancellable_states:
+            active_tasks.append(candidate)
+    assert len(active_tasks) == 1
+    assert active_tasks[0].task_id == running_task_id
--- a/src/exo/master/tests/test_reconcile.py
+++ b/src/exo/master/tests/test_reconcile.py
@@ -0,0 +1,742 @@
+from exo.master.process_managers.instance_health import InstanceHealthReconciler
+from exo.master.reconcile import (
+    find_unsatisfied_meta_instances,
+    instance_connections_healthy,
+    instance_runners_failed,
+    instance_satisfies_meta_instance,
+)
+from exo.shared.apply import apply
+from exo.shared.models.model_cards import ModelCard, ModelId, ModelTask
+from exo.shared.topology import Topology
+from exo.shared.types.common import Host, MetaInstanceId, NodeId
+from exo.shared.types.events import (
+    IndexedEvent,
+    InstanceCreated,
+    InstanceDeleted,
+    InstanceRetrying,
+    MetaInstanceCreated,
+    MetaInstanceDeleted,
+)
+from exo.shared.types.memory import Memory
+from exo.shared.types.meta_instance import MetaInstance
+from exo.shared.types.multiaddr import Multiaddr
+from exo.shared.types.state import State
+from exo.shared.types.topology import Connection, SocketConnection
+from exo.shared.types.worker.instances import (
+    InstanceId,
+    MlxRingInstance,
+)
+from exo.shared.types.worker.runners import (
+    RunnerFailed,
+    RunnerId,
+    RunnerLoading,
+    RunnerReady,
+    RunnerShutdown,
+    ShardAssignments,
+)
+from exo.shared.types.worker.shards import PipelineShardMetadata
+
+
+def _model_card(model_id: str = "test-org/test-model") -> ModelCard:
+    """Return a small text-generation ModelCard with fixed test dimensions."""
+    card_fields = {
+        "model_id": ModelId(model_id),
+        "storage_size": Memory.from_kb(1000),
+        "n_layers": 10,
+        "hidden_size": 30,
+        "supports_tensor": True,
+        "tasks": [ModelTask.TextGeneration],
+    }
+    return ModelCard(**card_fields)
+
+
+def _topology(*node_ids: str, connect: bool = True) -> Topology:
+    """Create a Topology holding ``node_ids``, optionally wired as a two-way ring.
+
+    The node at position ``i`` is addressed as ``10.0.0.{i+1}``. With ``connect``
+    set and more than one node, every node is linked to its successor (wrapping
+    around at the end) in both directions.
+    """
+    topology = Topology()
+    ring = [NodeId(name) for name in node_ids]
+    for node in ring:
+        topology.add_node(node)
+
+    def _link(src_idx: int, dst_idx: int) -> None:
+        # The edge's multiaddr carries the sink's IP, derived from the sink's index.
+        topology.add_connection(
+            Connection(
+                source=ring[src_idx],
+                sink=ring[dst_idx],
+                edge=SocketConnection(
+                    sink_multiaddr=Multiaddr(
+                        address=f"/ip4/10.0.0.{dst_idx + 1}/tcp/50000"
+                    )
+                ),
+            )
+        )
+
+    if not connect or len(ring) <= 1:
+        return topology
+    for here in range(len(ring)):
+        there = (here + 1) % len(ring)
+        # Forward edge first, then the reverse edge, for each consecutive pair.
+        _link(here, there)
+        _link(there, here)
+    return topology
+
+
+def _meta_instance(
+    model_id: str = "test-org/test-model",
+    *,
+    min_nodes: int = 1,
+    node_ids: list[NodeId] | None = None,
+    meta_instance_id: MetaInstanceId | None = None,
+) -> MetaInstance:
+    """Build a MetaInstance for tests, generating a fresh id when none is given."""
+    resolved_id = meta_instance_id or MetaInstanceId()
+    wanted_model = ModelId(model_id)
+    return MetaInstance(
+        meta_instance_id=resolved_id,
+        model_id=wanted_model,
+        min_nodes=min_nodes,
+        node_ids=node_ids,
+    )
+
+
+def _instance(
+    model_id: str = "test-org/test-model",
+    node_ids: list[str] | None = None,
+    instance_id: InstanceId | None = None,
+    meta_instance_id: MetaInstanceId | None = None,
+) -> tuple[InstanceId, MlxRingInstance]:
+    """Build an MlxRingInstance whose host table lines up with ``_topology()`` IPs.
+
+    Returns the instance id together with the instance. One runner is created
+    per node; ``node_ids`` defaults to a single ``"node-a"``.
+    """
+    iid = instance_id or InstanceId()
+    names = node_ids or ["node-a"]
+    world = len(names)
+    card = _model_card(model_id)
+    port = 50000
+    node_to_runner = {NodeId(name): RunnerId() for name in names}
+
+    # Every runner gets a shard spanning all layers; only the rank differs.
+    runner_to_shard: dict[RunnerId, PipelineShardMetadata] = {}
+    for rank, runner_id in enumerate(node_to_runner.values()):
+        runner_to_shard[runner_id] = PipelineShardMetadata(
+            model_card=card,
+            device_rank=rank,
+            world_size=world,
+            start_layer=0,
+            end_layer=card.n_layers,
+            n_layers=card.n_layers,
+        )
+
+    def _host_for(owner: int, peer: int) -> Host:
+        # Own slot: wildcard address. Ring neighbours: the 10.0.0.{index+1}
+        # convention used by _topology(). Everyone else: a placeholder host.
+        if peer == owner:
+            return Host(ip="0.0.0.0", port=port)
+        if world > 1 and peer in ((owner - 1) % world, (owner + 1) % world):
+            return Host(ip=f"10.0.0.{peer + 1}", port=port)
+        return Host(ip="198.51.100.1", port=0)
+
+    hosts_by_node: dict[NodeId, list[Host]] = {
+        NodeId(name): [_host_for(owner, peer) for peer in range(world)]
+        for owner, name in enumerate(names)
+    }
+    assignments = ShardAssignments(
+        model_id=ModelId(model_id),
+        runner_to_shard=runner_to_shard,
+        node_to_runner=node_to_runner,
+    )
+    return iid, MlxRingInstance(
+        instance_id=iid,
+        shard_assignments=assignments,
+        hosts_by_node=hosts_by_node,
+        ephemeral_port=port,
+        meta_instance_id=meta_instance_id,
+    )
+
+
+# --- instance_satisfies_meta_instance (pure constraint matching) ---
+
+
+def test_satisfies_matching_model():
+    """Default meta-instance and a default-model single-node instance match."""
+    wanted = _meta_instance()
+    candidate = _instance(node_ids=["node-a"])[1]
+    assert instance_satisfies_meta_instance(wanted, candidate) is True
+
+
+def test_not_satisfies_wrong_model():
+    """An instance running a different model does not satisfy the meta-instance."""
+    wanted = _meta_instance("test-org/model-a")
+    candidate = _instance("test-org/model-b")[1]
+    assert instance_satisfies_meta_instance(wanted, candidate) is False
+
+
+def test_not_satisfies_missing_required_node():
+    """An instance lacking a node listed in the meta's node_ids does not match."""
+    wanted = _meta_instance(node_ids=[NodeId("node-c")])
+    candidate = _instance(node_ids=["node-a", "node-b"])[1]
+    assert instance_satisfies_meta_instance(wanted, candidate) is False
+
+
+def test_not_satisfies_fewer_than_min_nodes():
+    """A two-node instance does not satisfy a meta-instance needing three nodes."""
+    wanted = _meta_instance(min_nodes=3)
+    candidate = _instance(node_ids=["node-a", "node-b"])[1]
+    assert instance_satisfies_meta_instance(wanted, candidate) is False
+
+
+def test_satisfies_with_node_ids_specified():
+    """An instance on the required nodes plus an extra one still matches."""
+    required_nodes = [NodeId("node-a"), NodeId("node-b")]
+    wanted = _meta_instance(node_ids=required_nodes, min_nodes=2)
+    candidate = _instance(node_ids=["node-a", "node-b", "node-c"])[1]
+    assert instance_satisfies_meta_instance(wanted, candidate) is True
+
+
+# --- instance_connections_healthy ---
+
+
+def test_healthy_single_node_present():
+    """A single-node instance is healthy when its node is in the topology."""
+    instance = _instance(node_ids=["node-a"])[1]
+    assert instance_connections_healthy(instance, _topology("node-a")) is True
+
+
+def test_unhealthy_single_node_missing():
+    """A single-node instance is unhealthy against an empty topology."""
+    instance = _instance(node_ids=["node-a"])[1]
+    empty_topology = Topology()
+    assert instance_connections_healthy(instance, empty_topology) is False
+
+
+def test_healthy_two_node_ring():
+    """A two-node instance is healthy on a connected two-node topology."""
+    instance = _instance(node_ids=["node-a", "node-b"])[1]
+    connected = _topology("node-a", "node-b")
+    assert instance_connections_healthy(instance, connected) is True
+
+
+def test_unhealthy_two_node_edge_removed():
+    """Both nodes exist but no edges do, so the instance is unhealthy."""
+    instance = _instance(node_ids=["node-a", "node-b"])[1]
+    edgeless = _topology("node-a", "node-b", connect=False)
+    assert instance_connections_healthy(instance, edgeless) is False
+
+
+def test_unhealthy_two_node_ip_changed():
+    """Edges exist, but their IPs differ from the ones the instance was built with."""
+    instance = _instance(node_ids=["node-a", "node-b"])[1]
+    node_a, node_b = NodeId("node-a"), NodeId("node-b")
+    # Hand-built topology whose addresses do not follow the 10.0.0.x convention.
+    topology = Topology()
+    topology.add_node(node_a)
+    topology.add_node(node_b)
+    relocated_edges = [
+        (node_a, node_b, "/ip4/192.168.99.99/tcp/50000"),
+        (node_b, node_a, "/ip4/192.168.99.98/tcp/50000"),
+    ]
+    for source, sink, address in relocated_edges:
+        topology.add_connection(
+            Connection(
+                source=source,
+                sink=sink,
+                edge=SocketConnection(sink_multiaddr=Multiaddr(address=address)),
+            )
+        )
+    assert instance_connections_healthy(instance, topology) is False
+
+
+def test_healthy_three_node_ring():
+    """A three-node instance is healthy on a connected three-node topology."""
+    instance = _instance(node_ids=["node-a", "node-b", "node-c"])[1]
+    connected = _topology("node-a", "node-b", "node-c")
+    assert instance_connections_healthy(instance, connected) is True
+
+
+def test_unhealthy_three_node_one_edge_removed():
+    """A three-node ring missing a single directed edge is unhealthy."""
+    instance = _instance(node_ids=["node-a", "node-b", "node-c"])[1]
+    ring = [NodeId("node-a"), NodeId("node-b"), NodeId("node-c")]
+    topology = Topology()
+    for node in ring:
+        topology.add_node(node)
+    # (source index, sink index) for every directed edge EXCEPT node-a -> node-b.
+    # Each edge's address uses the sink's IP, 10.0.0.{sink index + 1}.
+    present_edges = [(1, 0), (1, 2), (2, 1), (2, 0), (0, 2)]
+    for src, dst in present_edges:
+        topology.add_connection(
+            Connection(
+                source=ring[src],
+                sink=ring[dst],
+                edge=SocketConnection(
+                    sink_multiaddr=Multiaddr(address=f"/ip4/10.0.0.{dst + 1}/tcp/50000")
+                ),
+            )
+        )
+    # The absent edge is node-a -> node-b (sink IP 10.0.0.2).
+    assert instance_connections_healthy(instance, topology) is False
+
+
+def test_unhealthy_node_missing_from_topology():
+    """One of the instance's nodes is absent from the topology entirely."""
+    instance = _instance(node_ids=["node-a", "node-b"])[1]
+    # Only node-a is present; node-b is missing.
+    partial = _topology("node-a")
+    assert instance_connections_healthy(instance, partial) is False
+
+
+def test_healthy_extra_nodes_in_topology():
+    """A topology containing an additional node still reports the instance healthy."""
+    instance = _instance(node_ids=["node-a", "node-b"])[1]
+    larger = _topology("node-a", "node-b", "node-c")
+    assert instance_connections_healthy(instance, larger) is True
+
+
+# --- find_unsatisfied_meta_instances ---
+
+
+def test_unsatisfied_no_meta_instances():
+    """With no meta-instances at all, nothing is reported as unsatisfied."""
+    unsatisfied = find_unsatisfied_meta_instances({}, {}, Topology())
+    assert list(unsatisfied) == []
+
+
+def test_unsatisfied_one_satisfied():
+    """A meta-instance with a bound instance on a present node is not reported."""
+    wanted = _meta_instance()
+    instance_id, bound = _instance(meta_instance_id=wanted.meta_instance_id)
+    metas = {wanted.meta_instance_id: wanted}
+    instances = {instance_id: bound}
+    unsatisfied = find_unsatisfied_meta_instances(
+        metas, instances, _topology("node-a")
+    )
+    assert list(unsatisfied) == []
+
+
+def test_unsatisfied_one_not_satisfied():
+    """An unbound instance of another model leaves the meta-instance unsatisfied."""
+    wanted = _meta_instance("test-org/model-x")
+    instance_id, other = _instance("test-org/model-y")
+    metas = {wanted.meta_instance_id: wanted}
+    instances = {instance_id: other}
+    unsatisfied = find_unsatisfied_meta_instances(
+        metas, instances, _topology("node-a")
+    )
+    assert list(unsatisfied) == [wanted]
+
+
+def test_unsatisfied_mix():
+    """Of two meta-instances, only the one without a bound instance is returned."""
+    served = _meta_instance("test-org/model-a")
+    unserved = _meta_instance("test-org/model-b")
+    instance_id, bound = _instance(
+        "test-org/model-a", meta_instance_id=served.meta_instance_id
+    )
+    metas = {
+        served.meta_instance_id: served,
+        unserved.meta_instance_id: unserved,
+    }
+    unsatisfied = find_unsatisfied_meta_instances(
+        metas, {instance_id: bound}, _topology("node-a")
+    )
+    assert list(unsatisfied) == [unserved]
+
+
+def test_unsatisfied_node_disconnect():
+    """A bound two-node instance stops satisfying once one node leaves the topology."""
+    wanted = _meta_instance()
+    instance_id, bound = _instance(
+        node_ids=["node-a", "node-b"], meta_instance_id=wanted.meta_instance_id
+    )
+    # Only node-a remains; node-b has dropped out.
+    shrunk = _topology("node-a")
+    unsatisfied = find_unsatisfied_meta_instances(
+        {wanted.meta_instance_id: wanted}, {instance_id: bound}, shrunk
+    )
+    assert list(unsatisfied) == [wanted]
+
+
+def test_unsatisfied_edge_break():
+    """A bound instance whose edges vanished leaves its meta-instance unsatisfied."""
+    wanted = _meta_instance()
+    instance_id, bound = _instance(
+        node_ids=["node-a", "node-b"], meta_instance_id=wanted.meta_instance_id
+    )
+    # Both nodes are still present, but no connections were added.
+    edgeless = _topology("node-a", "node-b", connect=False)
+    unsatisfied = find_unsatisfied_meta_instances(
+        {wanted.meta_instance_id: wanted}, {instance_id: bound}, edgeless
+    )
+    assert list(unsatisfied) == [wanted]
+
+
+def test_unsatisfied_idempotent():
+    """Two calls with the same inputs return equal results."""
+    wanted = _meta_instance("test-org/model-x")
+    topology = _topology("node-a")
+    metas = {wanted.meta_instance_id: wanted}
+    no_instances: dict[InstanceId, MlxRingInstance] = {}
+    first_pass = list(find_unsatisfied_meta_instances(metas, no_instances, topology))
+    second_pass = list(find_unsatisfied_meta_instances(metas, no_instances, topology))
+    assert first_pass == second_pass
+
+
+def test_unsatisfied_exclusive_binding():
+    """Binding is by meta_instance_id, not model: a second MetaInstance for the same model stays unsatisfied."""
+    bound_meta = _meta_instance("test-org/model-x")
+    unbound_meta = _meta_instance("test-org/model-x")
+    instance_id, instance = _instance(
+        "test-org/model-x", meta_instance_id=bound_meta.meta_instance_id
+    )
+    topology = _topology("node-a")
+    meta_instances = {
+        bound_meta.meta_instance_id: bound_meta,
+        unbound_meta.meta_instance_id: unbound_meta,
+    }
+    unsatisfied = list(
+        find_unsatisfied_meta_instances(
+            meta_instances, {instance_id: instance}, topology
+        )
+    )
+    assert unsatisfied == [unbound_meta]
+
+
+# --- apply handlers ---
+
+
+def test_apply_meta_instance_created():
+    """Applying MetaInstanceCreated stores the MetaInstance under its own id."""
+    meta = _meta_instance()
+    mid = meta.meta_instance_id
+    created = IndexedEvent(idx=0, event=MetaInstanceCreated(meta_instance=meta))
+    after = apply(State(), created)
+    assert mid in after.meta_instances
+    assert after.meta_instances[mid] == meta
+
+
+def test_apply_meta_instance_deleted():
+    """Applying MetaInstanceDeleted removes the MetaInstance from state."""
+    meta = _meta_instance()
+    mid = meta.meta_instance_id
+    before = State(meta_instances={mid: meta})
+    deleted = IndexedEvent(idx=0, event=MetaInstanceDeleted(meta_instance_id=mid))
+    after = apply(before, deleted)
+    assert mid not in after.meta_instances
+
+
+def test_apply_meta_instance_deleted_clears_failure_info():
+    """Deleting a MetaInstance that carries failure info removes it (and the info) from state."""
+    failure_fields = {"consecutive_failures": 2, "last_failure_error": "OOM"}
+    meta = _meta_instance().model_copy(update=failure_fields)
+    mid = meta.meta_instance_id
+    before = State(meta_instances={mid: meta})
+    deleted = IndexedEvent(idx=0, event=MetaInstanceDeleted(meta_instance_id=mid))
+    after = apply(before, deleted)
+    assert mid not in after.meta_instances
+
+
+# --- instance_runners_failed ---
+
+
+def test_runners_failed_all_failed():
+    """Every runner reporting RunnerFailed marks the instance failed and surfaces the error."""
+    _, inst = _instance(node_ids=["node-a", "node-b"])
+    runner_ids = inst.shard_assignments.node_to_runner.values()
+    runners = {
+        runner_id: RunnerFailed(error_message="OOM") for runner_id in runner_ids
+    }
+    failed, message = instance_runners_failed(inst, runners, {})
+    assert failed is True
+    assert message is not None
+    assert "OOM" in message
+
+
+def test_runners_failed_mixed_failed_shutdown():
+    """A Failed runner alongside a Shutdown runner still counts as a failed instance."""
+    _, inst = _instance(node_ids=["node-a", "node-b"])
+    first_id, second_id = list(inst.shard_assignments.node_to_runner.values())[:2]
+    runners = {
+        first_id: RunnerFailed(error_message="crash"),
+        second_id: RunnerShutdown(),
+    }
+    failed, message = instance_runners_failed(inst, runners, {})
+    assert failed is True
+    assert message is not None
+    assert "crash" in message
+
+
+def test_runners_not_failed_all_shutdown():
+    """Runners that all shut down gracefully do not count as a failure."""
+    _, inst = _instance(node_ids=["node-a"])
+    runner_ids = inst.shard_assignments.node_to_runner.values()
+    runners = {runner_id: RunnerShutdown() for runner_id in runner_ids}
+    failed, _ = instance_runners_failed(inst, runners, {})
+    assert failed is False
+
+
+def test_runners_not_failed_still_active():
+    """An instance is not failed while any of its runners is still active."""
+    _, inst = _instance(node_ids=["node-a", "node-b"])
+    first_id, second_id = list(inst.shard_assignments.node_to_runner.values())[:2]
+    runners = {
+        first_id: RunnerFailed(error_message="OOM"),
+        second_id: RunnerLoading(),
+    }
+    failed, _ = instance_runners_failed(inst, runners, {})
+    assert failed is False
+
+
+def test_runners_not_failed_no_status():
+    """An instance whose runners have not reported any status is not failed."""
+    _, inst = _instance(node_ids=["node-a"])
+    no_runner_statuses = {}
+    failed, _ = instance_runners_failed(inst, no_runner_statuses, {})
+    assert failed is False
+
+
+def test_runners_not_failed_healthy():
+    """Runners that are all in RunnerReady are not a failure."""
+    _, inst = _instance(node_ids=["node-a"])
+    runner_ids = inst.shard_assignments.node_to_runner.values()
+    runners = {runner_id: RunnerReady() for runner_id in runner_ids}
+    failed, _ = instance_runners_failed(inst, runners, {})
+    assert failed is False
+
+
+# --- failure tracking in apply_instance_deleted ---
+
+
+def test_apply_instance_deleted_tracks_failure():
+    """A deletion carrying failure_error bumps the owning MetaInstance's failure count."""
+    meta = _meta_instance()
+    mid = meta.meta_instance_id
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=mid)
+    before = State(meta_instances={mid: meta}, instances={iid: inst})
+    failed_deletion = InstanceDeleted(instance_id=iid, failure_error="Runner OOM")
+    after = apply(before, IndexedEvent(idx=0, event=failed_deletion))
+    tracked = after.meta_instances[mid]
+    assert tracked.consecutive_failures == 1
+    assert tracked.last_failure_error == "Runner OOM"
+
+
+def test_apply_instance_deleted_increments_failure():
+    """A further failed deletion adds one to an existing count and replaces the stored error."""
+    prior_failures = {"consecutive_failures": 2, "last_failure_error": "previous error"}
+    meta = _meta_instance().model_copy(update=prior_failures)
+    mid = meta.meta_instance_id
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=mid)
+    before = State(meta_instances={mid: meta}, instances={iid: inst})
+    failed_deletion = InstanceDeleted(instance_id=iid, failure_error="new error")
+    after = apply(before, IndexedEvent(idx=0, event=failed_deletion))
+    tracked = after.meta_instances[mid]
+    assert tracked.consecutive_failures == 3
+    assert tracked.last_failure_error == "new error"
+
+
+def test_apply_instance_deleted_no_failure_no_tracking():
+    """A deletion without failure_error leaves the MetaInstance's failure count at zero."""
+    meta = _meta_instance()
+    mid = meta.meta_instance_id
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=mid)
+    before = State(meta_instances={mid: meta}, instances={iid: inst})
+    clean_deletion = InstanceDeleted(instance_id=iid)
+    after = apply(before, IndexedEvent(idx=0, event=clean_deletion))
+    assert after.meta_instances[mid].consecutive_failures == 0
+
+
+def test_apply_instance_deleted_orphan_no_tracking():
+    """A failed deletion of an instance with no meta_instance_id creates no MetaInstance entry."""
+    iid, inst = _instance(node_ids=["node-a"])
+    before = State(instances={iid: inst})
+    failed_deletion = InstanceDeleted(instance_id=iid, failure_error="crash")
+    after = apply(before, IndexedEvent(idx=0, event=failed_deletion))
+    assert len(after.meta_instances) == 0
+
+
+# --- InstanceRetrying ---
+
+
+def test_apply_instance_retrying_removes_runners():
+    """InstanceRetrying drops the instance's runner statuses while the instance itself stays."""
+    meta = _meta_instance()
+    mid = meta.meta_instance_id
+    iid, inst = _instance(node_ids=["node-a", "node-b"], meta_instance_id=mid)
+    first_id, second_id = list(inst.shard_assignments.node_to_runner.values())[:2]
+    before = State(
+        meta_instances={mid: meta},
+        instances={iid: inst},
+        runners={
+            first_id: RunnerFailed(error_message="OOM"),
+            second_id: RunnerShutdown(),
+        },
+    )
+    retry = InstanceRetrying(
+        instance_id=iid, meta_instance_id=mid, failure_error="OOM"
+    )
+    after = apply(before, IndexedEvent(idx=0, event=retry))
+    # The instance survives the retry...
+    assert iid in after.instances
+    # ...but both of its runner statuses are gone.
+    assert first_id not in after.runners
+    assert second_id not in after.runners
+
+
+def test_apply_instance_retrying_increments_failure():
+    """InstanceRetrying adds one to consecutive_failures and records the error."""
+    meta = _meta_instance()
+    mid = meta.meta_instance_id
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=mid)
+    before = State(meta_instances={mid: meta}, instances={iid: inst})
+    retry = InstanceRetrying(
+        instance_id=iid, meta_instance_id=mid, failure_error="crash"
+    )
+    after = apply(before, IndexedEvent(idx=0, event=retry))
+    tracked = after.meta_instances[mid]
+    assert tracked.consecutive_failures == 1
+    assert tracked.last_failure_error == "crash"
+
+
+def test_apply_instance_retrying_skips_missing_runners():
+    """InstanceRetrying applies cleanly when state holds no runner statuses at all."""
+    meta = _meta_instance()
+    mid = meta.meta_instance_id
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=mid)
+    # Deliberately no `runners` in this state.
+    before = State(meta_instances={mid: meta}, instances={iid: inst})
+    retry = InstanceRetrying(
+        instance_id=iid, meta_instance_id=mid, failure_error="crash"
+    )
+    # Must not raise.
+    after = apply(before, IndexedEvent(idx=0, event=retry))
+    assert iid in after.instances
+
+
+def test_apply_instance_created_resets_failure_counter():
+    """InstanceCreated zeroes consecutive_failures but keeps last_failure_error."""
+    prior_failures = {"consecutive_failures": 3, "last_failure_error": "old error"}
+    meta = _meta_instance().model_copy(update=prior_failures)
+    mid = meta.meta_instance_id
+    _, inst = _instance(node_ids=["node-a"], meta_instance_id=mid)
+    before = State(meta_instances={mid: meta})
+    created = IndexedEvent(idx=0, event=InstanceCreated(instance=inst))
+    after = apply(before, created)
+    tracked = after.meta_instances[mid]
+    assert tracked.consecutive_failures == 0
+    assert tracked.last_failure_error == "old error"
+    assert tracked.placement_error is None
+
+
+# --- InstanceHealthReconciler retry-vs-delete ---
+
+
+async def test_health_reconciler_retries_when_under_limit():
+    """With consecutive_failures < 3, a failed runner yields a single InstanceRetrying event."""
+    meta = _meta_instance()
+    mid = meta.meta_instance_id
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=mid)
+    failed_runner_id = list(inst.shard_assignments.node_to_runner.values())[0]
+    state = State(
+        meta_instances={mid: meta},
+        instances={iid: inst},
+        runners={failed_runner_id: RunnerFailed(error_message="OOM")},
+        topology=_topology("node-a"),
+    )
+    events = await InstanceHealthReconciler().reconcile(state)
+    assert len(events) == 1
+    (retry_event,) = events
+    assert isinstance(retry_event, InstanceRetrying)
+    assert retry_event.instance_id == iid
+    assert retry_event.meta_instance_id == mid
+
+
+async def test_health_reconciler_deletes_when_limit_reached():
+    """With consecutive_failures >= 3, a failed runner yields InstanceDeleted instead of a retry."""
+    meta = _meta_instance().model_copy(update={"consecutive_failures": 3})
+    mid = meta.meta_instance_id
+    iid, inst = _instance(node_ids=["node-a"], meta_instance_id=mid)
+    failed_runner_id = list(inst.shard_assignments.node_to_runner.values())[0]
+    state = State(
+        meta_instances={mid: meta},
+        instances={iid: inst},
+        runners={failed_runner_id: RunnerFailed(error_message="OOM")},
+        topology=_topology("node-a"),
+    )
+    events = await InstanceHealthReconciler().reconcile(state)
+    assert len(events) == 1
+    assert isinstance(events[0], InstanceDeleted)
+
+
+async def test_health_reconciler_deletes_without_meta_instance():
+    """An instance with no MetaInstance gets InstanceDeleted as soon as its runner fails."""
+    iid, inst = _instance(node_ids=["node-a"])
+    failed_runner_id = list(inst.shard_assignments.node_to_runner.values())[0]
+    state = State(
+        instances={iid: inst},
+        runners={failed_runner_id: RunnerFailed(error_message="crash")},
+        topology=_topology("node-a"),
+    )
+    events = await InstanceHealthReconciler().reconcile(state)
+    assert len(events) == 1
+    assert isinstance(events[0], InstanceDeleted)
+
+
+async def test_health_reconciler_network_failure_always_deletes():
+    """A node missing from the topology yields InstanceDeleted even with zero prior failures."""
+    meta = _meta_instance()
+    mid = meta.meta_instance_id
+    iid, inst = _instance(node_ids=["node-a", "node-b"], meta_instance_id=mid)
+    state = State(
+        meta_instances={mid: meta},
+        instances={iid: inst},
+        # node-b is absent from the topology.
+        topology=_topology("node-a"),
+    )
+    events = await InstanceHealthReconciler().reconcile(state)
+    assert len(events) == 1
+    (deletion,) = events
+    assert isinstance(deletion, InstanceDeleted)
+    assert deletion.failure_error == "Network connection lost"
--- a/src/exo/routing/topics.py
+++ b/src/exo/routing/topics.py
@@ -5,7 +5,8 @@ from exo.routing.connection_message import ConnectionMessage
 from exo.shared.election import ElectionMessage
 from exo.shared.types.commands import ForwarderCommand, ForwarderDownloadCommand
 from exo.shared.types.events import (
-    ForwarderEvent,
+    GlobalForwarderEvent,
+    LocalForwarderEvent,
 )
 from exo.utils.pydantic_ext import CamelCaseModel

@@ -36,8 +37,8 @@ class TypedTopic[T: CamelCaseModel]:
        return self.model_type.model_validate_json(b.decode("utf-8"))


-GLOBAL_EVENTS = TypedTopic("global_events", PublishPolicy.Always, ForwarderEvent)
-LOCAL_EVENTS = TypedTopic("local_events", PublishPolicy.Always, ForwarderEvent)
+GLOBAL_EVENTS = TypedTopic("global_events", PublishPolicy.Always, GlobalForwarderEvent)
+LOCAL_EVENTS = TypedTopic("local_events", PublishPolicy.Always, LocalForwarderEvent)
 COMMANDS = TypedTopic("commands", PublishPolicy.Always, ForwarderCommand)
 ELECTION_MESSAGES = TypedTopic(
    "election_messages", PublishPolicy.Always, ElectionMessage
--- a/src/exo/shared/apply.py
+++ b/src/exo/shared/apply.py
@@ -4,7 +4,7 @@ from datetime import datetime

 from loguru import logger

-from exo.shared.types.common import NodeId
+from exo.shared.types.common import MetaInstanceId, NodeId
 from exo.shared.types.events import (
    ChunkGenerated,
    Event,
@@ -12,6 +12,10 @@ from exo.shared.types.events import (
    InputChunkReceived,
    InstanceCreated,
    InstanceDeleted,
+    InstanceRetrying,
+    MetaInstanceCreated,
+    MetaInstanceDeleted,
+    MetaInstancePlacementFailed,
    NodeDownloadProgress,
    NodeGatheredInfo,
    NodeTimedOut,
@@ -28,6 +32,7 @@ from exo.shared.types.events import (
    TracesCollected,
    TracesMerged,
 )
+from exo.shared.types.meta_instance import MetaInstance
 from exo.shared.types.profiling import (
    NodeIdentity,
    NodeNetworkInfo,
@@ -72,6 +77,14 @@ def event_apply(event: Event, state: State) -> State:
            return apply_instance_created(event, state)
        case InstanceDeleted():
            return apply_instance_deleted(event, state)
+        case InstanceRetrying():
+            return apply_instance_retrying(event, state)
+        case MetaInstanceCreated():
+            return apply_meta_instance_created(event, state)
+        case MetaInstanceDeleted():
+            return apply_meta_instance_deleted(event, state)
+        case MetaInstancePlacementFailed():
+            return apply_meta_instance_placement_failed(event, state)
        case NodeTimedOut():
            return apply_node_timed_out(event, state)
        case NodeDownloadProgress():
@@ -174,20 +187,123 @@ def apply_task_failed(event: TaskFailed, state: State) -> State:
    return state.model_copy(update={"tasks": new_tasks})


+def _update_meta_instance(
+    state: State, mid: MetaInstanceId, **fields: object
+) -> Mapping[MetaInstanceId, MetaInstance]:
+    """Return a new meta_instances mapping in which `mid` has `fields` applied.
+
+    The state is not modified; the entry for `mid` is replaced by a
+    `model_copy` carrying the given field updates. Raises KeyError if `mid`
+    is not in `state.meta_instances`, so callers check membership first.
+    """
+    patched = state.meta_instances[mid].model_copy(update=fields)
+    return {**state.meta_instances, mid: patched}
+
+
 def apply_instance_created(event: InstanceCreated, state: State) -> State:
+    """Add the event's instance to state and clear retry bookkeeping on its MetaInstance.
+
+    When the instance is bound to a MetaInstance present in state, and that
+    MetaInstance has a placement_error or a non-zero consecutive_failures,
+    placement_error is set to None and consecutive_failures to 0.
+    last_failure_error is not touched here.
+    """
     instance = event.instance
     new_instances: Mapping[InstanceId, Instance] = {
         **state.instances,
         instance.instance_id: instance,
     }
-    return state.model_copy(update={"instances": new_instances})
+    update: dict[str, object] = {"instances": new_instances}
+    # Reset failure tracking when a new instance is created for a meta-instance
+    if instance.meta_instance_id and instance.meta_instance_id in state.meta_instances:
+        mi = state.meta_instances[instance.meta_instance_id]
+        # Only rewrite meta_instances when there is actually something to reset.
+        if mi.placement_error is not None or mi.consecutive_failures > 0:
+            update["meta_instances"] = _update_meta_instance(
+                state,
+                instance.meta_instance_id,
+                placement_error=None,
+                consecutive_failures=0,
+            )
+    return state.model_copy(update=update)


 def apply_instance_deleted(event: InstanceDeleted, state: State) -> State:
+    """Remove the instance from state and record a failure on its MetaInstance if one is given.
+
+    The failure is tracked only when the event carries a failure_error, the
+    deleted instance was present in state, and it is bound to a MetaInstance
+    that still exists. In that case consecutive_failures is incremented and
+    last_failure_error is set to the event's error.
+    """
+    deleted_instance = state.instances.get(event.instance_id)
     new_instances: Mapping[InstanceId, Instance] = {
         iid: inst for iid, inst in state.instances.items() if iid != event.instance_id
     }
-    return state.model_copy(update={"instances": new_instances})
+    update: dict[str, object] = {"instances": new_instances}
+
+    # Track failure on the MetaInstance itself
+    if (
+        event.failure_error
+        and deleted_instance
+        and deleted_instance.meta_instance_id
+        and deleted_instance.meta_instance_id in state.meta_instances
+    ):
+        mid = deleted_instance.meta_instance_id
+        # Use the shared helper, like the other handlers that patch a MetaInstance.
+        update["meta_instances"] = _update_meta_instance(
+            state,
+            mid,
+            consecutive_failures=state.meta_instances[mid].consecutive_failures + 1,
+            last_failure_error=event.failure_error,
+        )
+
+    return state.model_copy(update=update)
+
+
+def apply_instance_retrying(event: InstanceRetrying, state: State) -> State:
+    """Handle a retry: drop the instance's runner statuses, keep the instance, count the failure.
+
+    If the instance is no longer in state the event is ignored and state is
+    returned unchanged, so a stale retry touches neither runners nor failure
+    counters. If the referenced MetaInstance is not in state, only the
+    runners are removed.
+    """
+    instance = state.instances.get(event.instance_id)
+    if instance is None:
+        return state
+
+    # Forget every runner status that belongs to this instance.
+    stale_runner_ids = set(instance.shard_assignments.node_to_runner.values())
+    surviving_runners: Mapping[RunnerId, RunnerStatus] = {
+        runner_id: status
+        for runner_id, status in state.runners.items()
+        if runner_id not in stale_runner_ids
+    }
+    changes: dict[str, object] = {"runners": surviving_runners}
+
+    # Count this failure against the owning MetaInstance, if it still exists.
+    mid = event.meta_instance_id
+    if mid in state.meta_instances:
+        failures_so_far = state.meta_instances[mid].consecutive_failures
+        changes["meta_instances"] = _update_meta_instance(
+            state,
+            mid,
+            consecutive_failures=failures_so_far + 1,
+            last_failure_error=event.failure_error,
+        )
+
+    return state.model_copy(update=changes)
+
+
+def apply_meta_instance_created(event: MetaInstanceCreated, state: State) -> State:
+    """Store the event's MetaInstance in state under its id, replacing any existing entry."""
+    created = event.meta_instance
+    meta_instances: Mapping[MetaInstanceId, MetaInstance] = {
+        **state.meta_instances,
+        created.meta_instance_id: created,
+    }
+    return state.model_copy(update={"meta_instances": meta_instances})
+
+
+def apply_meta_instance_deleted(event: MetaInstanceDeleted, state: State) -> State:
+    """Remove the MetaInstance named by the event; other entries are kept as they are."""
+    removed_id = event.meta_instance_id
+    remaining: Mapping[MetaInstanceId, MetaInstance] = {
+        meta_id: meta
+        for meta_id, meta in state.meta_instances.items()
+        if meta_id != removed_id
+    }
+    return state.model_copy(update={"meta_instances": remaining})
+
+
+def apply_meta_instance_placement_failed(
+    event: MetaInstancePlacementFailed, state: State
+) -> State:
+    """Record the placement failure reason on the MetaInstance; ignore unknown ids."""
+    target_id = event.meta_instance_id
+    if target_id in state.meta_instances:
+        patched = _update_meta_instance(
+            state, target_id, placement_error=event.reason
+        )
+        return state.model_copy(update={"meta_instances": patched})
+    return state


 def apply_runner_status_updated(event: RunnerStatusUpdated, state: State) -> State:
--- a/src/exo/shared/constants.py
+++ b/src/exo/shared/constants.py
@@ -33,6 +33,15 @@ EXO_MODELS_DIR = (
    if _EXO_MODELS_DIR_ENV is None
    else Path.home() / _EXO_MODELS_DIR_ENV
 )
+
+# Read-only search path for pre-downloaded models. Directories are separated by
+# os.pathsep (":" on POSIX, ";" on Windows), the same convention as PATH, so a
+# Windows drive letter such as "C:\models" is not split apart. Empty entries
+# are skipped and "~" is expanded. None means the variable is not set at all.
+_EXO_MODELS_PATH_ENV = os.environ.get("EXO_MODELS_PATH", None)
+EXO_MODELS_PATH: tuple[Path, ...] | None = (
+    tuple(Path(p).expanduser() for p in _EXO_MODELS_PATH_ENV.split(os.pathsep) if p)
+    if _EXO_MODELS_PATH_ENV is not None
+    else None
+)
+
 _RESOURCES_DIR_ENV = os.environ.get("EXO_RESOURCES_DIR", None)
 RESOURCES_DIR = (
    find_resources() if _RESOURCES_DIR_ENV is None else Path.home() / _RESOURCES_DIR_ENV
@@ -65,4 +74,8 @@ EXO_EVENT_LOG_DIR = EXO_DATA_HOME / "event_log"
 EXO_IMAGE_CACHE_DIR = EXO_CACHE_HOME / "images"
 EXO_TRACING_CACHE_DIR = EXO_CACHE_HOME / "traces"

+# Opt-in flag read from the EXO_ENABLE_IMAGE_MODELS environment variable.
+# Only the value "true" (case-insensitive) enables it; unset defaults to off.
+EXO_ENABLE_IMAGE_MODELS = (
+    os.getenv("EXO_ENABLE_IMAGE_MODELS", "false").lower() == "true"
+)
+
 EXO_TRACING_ENABLED = os.getenv("EXO_TRACING_ENABLED", "false").lower() == "true"
--- a/src/exo/shared/models/model_cards.py
+++ b/src/exo/shared/models/model_cards.py
@@ -20,6 +20,7 @@ from tomlkit.exceptions import TOMLKitError

 from exo.shared.constants import (
    EXO_CUSTOM_MODEL_CARDS_DIR,
+    EXO_ENABLE_IMAGE_MODELS,
    RESOURCES_DIR,
 )
 from exo.shared.types.common import ModelId
@@ -49,10 +50,16 @@ async def _refresh_card_cache():
                pass


+def _is_image_card(card: "ModelCard") -> bool:
+    """Return True if the card lists a text-to-image or image-to-image task."""
+    image_tasks = (ModelTask.TextToImage, ModelTask.ImageToImage)
+    for task in card.tasks:
+        if task in image_tasks:
+            return True
+    return False
+
+
 async def get_model_cards() -> list["ModelCard"]:
+    """Return the cached model cards, refreshing the cache first if it is empty.
+
+    Image cards (see _is_image_card) are filtered out unless
+    EXO_ENABLE_IMAGE_MODELS is true.
+    """
     if len(_card_cache) == 0:
         await _refresh_card_cache()
-    return list(_card_cache.values())
+    if EXO_ENABLE_IMAGE_MODELS:
+        return list(_card_cache.values())
+    return [c for c in _card_cache.values() if not _is_image_card(c)]


 class ModelTask(str, Enum):
--- a/src/exo/shared/tests/test_election.py
+++ b/src/exo/shared/tests/test_election.py
@@ -4,7 +4,7 @@ from anyio import create_task_group, fail_after, move_on_after
 from exo.routing.connection_message import ConnectionMessage, ConnectionMessageType
 from exo.shared.election import Election, ElectionMessage, ElectionResult
 from exo.shared.types.commands import ForwarderCommand, TestCommand
-from exo.shared.types.common import NodeId, SessionId
+from exo.shared.types.common import NodeId, SessionId, SystemId
 from exo.utils.channels import channel

 # ======= #
@@ -384,7 +384,7 @@ async def test_tie_breaker_prefers_node_with_more_commands_seen() -> None:
            # Pump local commands so our commands_seen is high before the round starts
            for _ in range(50):
                await co_tx.send(
-                    ForwarderCommand(origin=NodeId("SOMEONE"), command=TestCommand())
+                    ForwarderCommand(origin=SystemId("SOMEONE"), command=TestCommand())
                )

            # Trigger a round at clock=1 with a peer of equal seniority but fewer commands
--- a/src/exo/shared/types/api.py
+++ b/src/exo/shared/types/api.py
@@ -6,7 +6,7 @@ from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator

 from exo.shared.models.model_cards import ModelCard, ModelId
-from exo.shared.types.common import CommandId, NodeId
+from exo.shared.types.common import CommandId, MetaInstanceId, NodeId
 from exo.shared.types.memory import Memory
 from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
 from exo.shared.types.worker.shards import Sharding, ShardMetadata
@@ -262,6 +262,26 @@ class DeleteInstanceResponse(BaseModel):
    instance_id: InstanceId


+class CreateMetaInstanceParams(BaseModel):
+    """Request parameters for creating a MetaInstance.
+
+    The fields and defaults match the non-tracking fields of MetaInstance.
+    """
+
+    model_id: ModelId
+    sharding: Sharding = Sharding.Pipeline
+    instance_meta: InstanceMeta = InstanceMeta.MlxRing
+    min_nodes: int = 1
+    # None leaves the node choice unspecified -- presumably "any nodes"; confirm
+    # against the placement code.
+    node_ids: list[NodeId] | None = None
+
+
+class CreateMetaInstanceResponse(BaseModel):
+    """Response to a create-MetaInstance request: a message plus the command and MetaInstance ids."""
+
+    message: str
+    command_id: CommandId
+    meta_instance_id: MetaInstanceId
+
+
+class DeleteMetaInstanceResponse(BaseModel):
+    """Response to a delete-MetaInstance request: a message plus the command and MetaInstance ids."""
+
+    message: str
+    command_id: CommandId
+    meta_instance_id: MetaInstanceId
+
+
 ImageSize = Literal[
    "auto",
    "512x512",
--- a/src/exo/shared/types/commands.py
+++ b/src/exo/shared/types/commands.py
@@ -6,7 +6,8 @@ from exo.shared.types.api import (
    ImageGenerationTaskParams,
 )
 from exo.shared.types.chunks import InputImageChunk
-from exo.shared.types.common import CommandId, NodeId
+from exo.shared.types.common import CommandId, MetaInstanceId, NodeId, SystemId
+from exo.shared.types.meta_instance import MetaInstance
 from exo.shared.types.text_generation import TextGenerationTaskParams
 from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
 from exo.shared.types.worker.shards import Sharding, ShardMetadata
@@ -52,6 +53,14 @@ class TaskCancelled(BaseCommand):
    cancelled_command_id: CommandId


+class CreateMetaInstance(BaseCommand):
+    """Command carrying the MetaInstance to be created."""
+
+    meta_instance: MetaInstance
+
+
+class DeleteMetaInstance(BaseCommand):
+    """Command identifying, by id, the MetaInstance to be deleted."""
+
+    meta_instance_id: MetaInstanceId
+
+
 class TaskFinished(BaseCommand):
    finished_command_id: CommandId

@@ -94,16 +103,18 @@ Command = (
    | CreateInstance
    | DeleteInstance
    | TaskCancelled
+    | CreateMetaInstance
+    | DeleteMetaInstance
    | TaskFinished
    | SendInputChunk
 )


 class ForwarderCommand(CamelCaseModel):
+    # Envelope pairing a Command with the SystemId it originated from.
-    origin: NodeId
+    origin: SystemId
     command: Command


 class ForwarderDownloadCommand(CamelCaseModel):
+    # Envelope pairing a DownloadCommand with the SystemId it originated from.
-    origin: NodeId
+    origin: SystemId
     command: DownloadCommand
--- a/src/exo/shared/types/common.py
+++ b/src/exo/shared/types/common.py
@@ -25,6 +25,10 @@ class NodeId(Id):
    pass


+class SystemId(Id):
+    """Identifier type used as the `origin` of forwarder commands and local forwarder events.
+
+    Kept distinct from NodeId.
+    """
+
+    pass
+
+
 class ModelId(Id):
    def normalize(self) -> str:
        return self.replace("/", "--")
@@ -42,6 +46,10 @@ class CommandId(Id):
    pass


+class MetaInstanceId(Id):
+    """Identifier for a MetaInstance.
+
+    Keys `State.meta_instances`; instances refer to their MetaInstance through
+    `BaseInstance.meta_instance_id`.
+    """
+
+
 class Host(CamelCaseModel):
    ip: str
    port: int
--- a/src/exo/shared/types/events.py
+++ b/src/exo/shared/types/events.py
@@ -5,7 +5,15 @@ from pydantic import Field

 from exo.shared.topology import Connection
 from exo.shared.types.chunks import GenerationChunk, InputImageChunk
-from exo.shared.types.common import CommandId, Id, NodeId, SessionId
+from exo.shared.types.common import (
+    CommandId,
+    Id,
+    MetaInstanceId,
+    NodeId,
+    SessionId,
+    SystemId,
+)
+from exo.shared.types.meta_instance import MetaInstance
 from exo.shared.types.tasks import Task, TaskId, TaskStatus
 from exo.shared.types.worker.downloads import DownloadProgress
 from exo.shared.types.worker.instances import Instance, InstanceId
@@ -66,6 +74,30 @@ class InstanceCreated(BaseEvent):

 class InstanceDeleted(BaseEvent):
     instance_id: InstanceId
+    # When set, apply_instance_deleted counts this deletion as a failure on the
+    # instance's MetaInstance (if any). None means no failure is recorded.
+    failure_error: str | None = None
+
+
+class MetaInstanceCreated(BaseEvent):
+    """A MetaInstance was created; applying it stores the MetaInstance in State.meta_instances."""
+
+    meta_instance: MetaInstance
+
+
+class MetaInstanceDeleted(BaseEvent):
+    """A MetaInstance was deleted; applying it removes that id from State.meta_instances."""
+
+    meta_instance_id: MetaInstanceId
+
+
+@final
+class MetaInstancePlacementFailed(BaseEvent):
+    """Placement failed for a MetaInstance; `reason` is stored as its placement_error."""
+
+    meta_instance_id: MetaInstanceId
+    reason: str
+
+
+@final
+class InstanceRetrying(BaseEvent):
+    """Runners failed but retry count is below the limit — restart runners, keep instance."""
+
+    # Instance whose runner statuses are dropped from state when this is applied.
+    instance_id: InstanceId
+    # MetaInstance whose consecutive_failures is incremented when this is applied.
+    meta_instance_id: MetaInstanceId
+    # Stored as the MetaInstance's last_failure_error.
+    failure_error: str


 class RunnerStatusUpdated(BaseEvent):
@@ -141,6 +173,10 @@ Event = (
    | TaskAcknowledged
    | InstanceCreated
    | InstanceDeleted
+    | InstanceRetrying
+    | MetaInstanceCreated
+    | MetaInstanceDeleted
+    | MetaInstancePlacementFailed
    | RunnerStatusUpdated
    | RunnerDeleted
    | NodeTimedOut
@@ -162,10 +198,19 @@ class IndexedEvent(CamelCaseModel):
    event: Event


-class ForwarderEvent(CamelCaseModel):
+class GlobalForwarderEvent(CamelCaseModel):
     """An event the forwarder will serialize and send over the network"""
 
+    # Payload type of the GLOBAL_EVENTS topic; origin is a NodeId here, whereas
+    # LocalForwarderEvent uses a SystemId.
     origin_idx: int = Field(ge=0)
     origin: NodeId
     session: SessionId
     event: Event
+
+
+class LocalForwarderEvent(CamelCaseModel):
+    """An event the forwarder will serialize and send over the network.
+
+    Payload type of the LOCAL_EVENTS topic. Same shape as GlobalForwarderEvent,
+    except that `origin` is a SystemId rather than a NodeId.
+    """
+
+    origin_idx: int = Field(ge=0)
+    origin: SystemId
+    session: SessionId
+    event: Event
--- a/src/exo/shared/types/meta_instance.py
+++ b/src/exo/shared/types/meta_instance.py
@@ -0,0 +1,25 @@
+from typing import final
+
+from pydantic import Field
+
+from exo.shared.models.model_cards import ModelId
+from exo.shared.types.common import MetaInstanceId, NodeId
+from exo.shared.types.worker.instances import InstanceMeta
+from exo.shared.types.worker.shards import Sharding
+from exo.utils.pydantic_ext import FrozenModel
+
+
+@final
+class MetaInstance(FrozenModel):
+    """Declarative constraint: ensure an instance matching these parameters always exists."""
+
+    # A fresh id is generated when none is supplied.
+    meta_instance_id: MetaInstanceId = Field(default_factory=MetaInstanceId)
+    model_id: ModelId
+    sharding: Sharding = Sharding.Pipeline
+    instance_meta: InstanceMeta = InstanceMeta.MlxRing
+    min_nodes: int = 1
+    # None leaves the node choice unspecified -- presumably "any nodes"; confirm
+    # against the placement code.
+    node_ids: list[NodeId] | None = None
+    # Failure tracking
+    # Set from MetaInstancePlacementFailed.reason; cleared when an instance is
+    # created for this MetaInstance.
+    placement_error: str | None = None
+    # Incremented by InstanceRetrying and by InstanceDeleted events that carry a
+    # failure_error; reset to 0 when an instance is created for this MetaInstance.
+    consecutive_failures: int = 0
+    # Error from the most recent failure; kept when consecutive_failures is reset.
+    last_failure_error: str | None = None
--- a/src/exo/shared/types/state.py
+++ b/src/exo/shared/types/state.py
@@ -6,7 +6,8 @@ from pydantic import ConfigDict, Field, field_serializer, field_validator
 from pydantic.alias_generators import to_camel

 from exo.shared.topology import Topology, TopologySnapshot
-from exo.shared.types.common import NodeId
+from exo.shared.types.common import MetaInstanceId, NodeId
+from exo.shared.types.meta_instance import MetaInstance
 from exo.shared.types.profiling import (
    DiskUsage,
    MemoryUsage,
@@ -41,6 +42,7 @@ class State(CamelCaseModel):
        arbitrary_types_allowed=True,
    )
    instances: Mapping[InstanceId, Instance] = {}
+    meta_instances: Mapping[MetaInstanceId, MetaInstance] = {}
    runners: Mapping[RunnerId, RunnerStatus] = {}
    downloads: Mapping[NodeId, Sequence[DownloadProgress]] = {}
    tasks: Mapping[TaskId, Task] = {}
--- a/src/exo/shared/types/worker/downloads.py
+++ b/src/exo/shared/types/worker/downloads.py
@@ -30,11 +30,13 @@ class BaseDownloadProgress(TaggedModel):


 class DownloadPending(BaseDownloadProgress):
-    pass
+    # Both default to Memory() -- presumably zero bytes; confirm against Memory.
+    downloaded: Memory = Memory()
+    total: Memory = Memory()


 class DownloadCompleted(BaseDownloadProgress):
     total: Memory
+    # NOTE(review): presumably marks models found on the read-only
+    # EXO_MODELS_PATH search path -- confirm where this is set.
+    read_only: bool = False


 class DownloadFailed(BaseDownloadProgress):
--- a/src/exo/shared/types/worker/instances.py
+++ b/src/exo/shared/types/worker/instances.py
@@ -2,7 +2,8 @@ from enum import Enum

 from pydantic import model_validator

-from exo.shared.types.common import Host, Id, NodeId
+from exo.shared.models.model_cards import ModelTask
+from exo.shared.types.common import Host, Id, MetaInstanceId, NodeId
 from exo.shared.types.worker.runners import RunnerId, ShardAssignments, ShardMetadata
 from exo.utils.pydantic_ext import CamelCaseModel, TaggedModel

@@ -19,6 +20,7 @@ class InstanceMeta(str, Enum):
 class BaseInstance(TaggedModel):
    instance_id: InstanceId
    shard_assignments: ShardAssignments
+    meta_instance_id: MetaInstanceId | None = None

    def shard(self, runner_id: RunnerId) -> ShardMetadata | None:
        return self.shard_assignments.runner_to_shard.get(runner_id, None)
@@ -49,6 +51,13 @@ class BoundInstance(CamelCaseModel):
        assert shard is not None
        return shard

+    @property
+    def is_image_model(self) -> bool:
+        """Whether the bound shard's model card lists an image task."""
+        card_tasks = self.bound_shard.model_card.tasks
+        image_tasks = (ModelTask.TextToImage, ModelTask.ImageToImage)
+        return any(kind in card_tasks for kind in image_tasks)
+
    @model_validator(mode="after")
    def validate_shard_exists(self) -> "BoundInstance":
        assert (
--- a/src/exo/worker/main.py
+++ b/src/exo/worker/main.py
@@ -1,13 +1,14 @@
+import contextlib
 from collections import defaultdict
 from datetime import datetime, timezone
 from random import random
-from typing import Iterator

 import anyio
-from anyio import CancelScope, create_task_group, fail_after
+from anyio import CancelScope, ClosedResourceError, create_task_group, fail_after
 from anyio.abc import TaskGroup
 from loguru import logger

+from exo.download.download_utils import resolve_model_in_path
 from exo.shared.apply import apply
 from exo.shared.models.model_cards import ModelId
 from exo.shared.types.api import ImageEditsTaskParams
@@ -17,13 +18,15 @@ from exo.shared.types.commands import (
    RequestEventLog,
    StartDownload,
 )
-from exo.shared.types.common import CommandId, NodeId, SessionId
+from exo.shared.types.common import CommandId, NodeId, SessionId, SystemId
 from exo.shared.types.events import (
    Event,
    EventId,
-    ForwarderEvent,
+    GlobalForwarderEvent,
    IndexedEvent,
    InputChunkReceived,
+    LocalForwarderEvent,
+    NodeDownloadProgress,
    NodeGatheredInfo,
    TaskCreated,
    TaskStatusUpdated,
@@ -42,6 +45,7 @@ from exo.shared.types.tasks import (
    TaskStatus,
 )
 from exo.shared.types.topology import Connection, SocketConnection
+from exo.shared.types.worker.downloads import DownloadCompleted
 from exo.shared.types.worker.runners import RunnerId
 from exo.utils.channels import Receiver, Sender, channel
 from exo.utils.event_buffer import OrderedBuffer
@@ -58,24 +62,22 @@ class Worker:
        node_id: NodeId,
        session_id: SessionId,
        *,
-        global_event_receiver: Receiver[ForwarderEvent],
-        local_event_sender: Sender[ForwarderEvent],
+        global_event_receiver: Receiver[GlobalForwarderEvent],
+        local_event_sender: Sender[LocalForwarderEvent],
        # This is for requesting updates. It doesn't need to be a general command sender right now,
        # but I think it's the correct way to be thinking about commands
        command_sender: Sender[ForwarderCommand],
        download_command_sender: Sender[ForwarderDownloadCommand],
-        event_index_counter: Iterator[int],
    ):
        self.node_id: NodeId = node_id
        self.session_id: SessionId = session_id

        self.global_event_receiver = global_event_receiver
        self.local_event_sender = local_event_sender
-        self.event_index_counter = event_index_counter
        self.command_sender = command_sender
        self.download_command_sender = download_command_sender
        self.event_buffer = OrderedBuffer[Event]()
-        self.out_for_delivery: dict[EventId, ForwarderEvent] = {}
+        self.out_for_delivery: dict[EventId, LocalForwarderEvent] = {}

        self.state: State = State()
        self.runners: dict[RunnerId, RunnerSupervisor] = {}
@@ -86,6 +88,8 @@ class Worker:
        self._nack_base_seconds: float = 0.5
        self._nack_cap_seconds: float = 10.0

+        self._system_id = SystemId()
+
        self.event_sender, self.event_receiver = channel[Event]()

        # Buffer for input image chunks (for image editing)
@@ -116,7 +120,8 @@ class Worker:
            self.command_sender.close()
            self.download_command_sender.close()
            for runner in self.runners.values():
-                runner.shutdown()
+                with contextlib.suppress(ClosedResourceError):
+                    runner.shutdown()

    async def _forward_info(self, recv: Receiver[GatheredInfo]):
        with recv as info_stream:
@@ -132,6 +137,8 @@ class Worker:
    async def _event_applier(self):
        with self.global_event_receiver as events:
            async for f_event in events:
+                if f_event.session != self.session_id:
+                    continue
                if f_event.origin != self.session_id.master_node_id:
                    continue
                self.event_buffer.ingest(f_event.origin_idx, f_event.event)
@@ -210,20 +217,44 @@ class Worker:
                    model_id = shard.model_card.model_id
                    self._download_backoff.record_attempt(model_id)

-                    await self.download_command_sender.send(
-                        ForwarderDownloadCommand(
-                            origin=self.node_id,
-                            command=StartDownload(
-                                target_node_id=self.node_id,
-                                shard_metadata=shard,
-                            ),
+                    found_path = resolve_model_in_path(model_id)
+                    if found_path is not None:
+                        logger.info(
+                            f"Model {model_id} found in EXO_MODELS_PATH at {found_path}"
                        )
-                    )
-                    await self.event_sender.send(
-                        TaskStatusUpdated(
-                            task_id=task.task_id, task_status=TaskStatus.Running
+                        await self.event_sender.send(
+                            NodeDownloadProgress(
+                                download_progress=DownloadCompleted(
+                                    node_id=self.node_id,
+                                    shard_metadata=shard,
+                                    model_directory=str(found_path),
+                                    total=shard.model_card.storage_size,
+                                    read_only=True,
+                                )
+                            )
+                        )
+                        await self.event_sender.send(
+                            TaskStatusUpdated(
+                                task_id=task.task_id,
+                                task_status=TaskStatus.Complete,
+                            )
+                        )
+                    else:
+                        await self.download_command_sender.send(
+                            ForwarderDownloadCommand(
+                                origin=self._system_id,
+                                command=StartDownload(
+                                    target_node_id=self.node_id,
+                                    shard_metadata=shard,
+                                ),
+                            )
+                        )
+                        await self.event_sender.send(
+                            TaskStatusUpdated(
+                                task_id=task.task_id,
+                                task_status=TaskStatus.Running,
+                            )
                        )
-                    )
                case Shutdown(runner_id=runner_id):
                    runner = self.runners.pop(runner_id)
                    try:
@@ -236,7 +267,8 @@ class Worker:
                            )
                        )
                    finally:
-                        runner.shutdown()
+                        with contextlib.suppress(ClosedResourceError):
+                            runner.shutdown()
                case CancelTask(
                    cancelled_task_id=cancelled_task_id, runner_id=runner_id
                ):
@@ -317,7 +349,7 @@ class Worker:
                )
                await self.command_sender.send(
                    ForwarderCommand(
-                        origin=self.node_id,
+                        origin=self._system_id,
                        command=RequestEventLog(since_idx=since_idx),
                    )
                )
@@ -344,15 +376,16 @@ class Worker:
        return runner

    async def _forward_events(self) -> None:
+        idx = 0
        with self.event_receiver as events:
            async for event in events:
-                idx = next(self.event_index_counter)
-                fe = ForwarderEvent(
+                fe = LocalForwarderEvent(
                    origin_idx=idx,
-                    origin=self.node_id,
+                    origin=self._system_id,
                    session=self.session_id,
                    event=event,
                )
+                idx += 1
                logger.debug(f"Worker published event {idx}: {str(event)[:100]}")
                await self.local_event_sender.send(fe)
                self.out_for_delivery[event.event_id] = fe
--- a/src/exo/worker/plan.py
+++ b/src/exo/worker/plan.py
@@ -35,6 +35,7 @@ from exo.shared.types.worker.runners import (
    RunnerLoading,
    RunnerReady,
    RunnerRunning,
+    RunnerShutdown,
    RunnerStatus,
    RunnerWarmingUp,
 )
@@ -56,7 +57,7 @@ def plan(
    return (
        _cancel_tasks(runners, tasks)
        or _kill_runner(runners, all_runners, instances)
-        or _create_runner(node_id, runners, instances)
+        or _create_runner(node_id, runners, instances, all_runners)
        or _model_needs_download(node_id, runners, global_download_status)
        or _init_distributed_backend(runners, all_runners)
        or _load_model(runners, all_runners, global_download_status)
@@ -75,6 +76,12 @@ def _kill_runner(
        if (instance_id := runner.bound_instance.instance.instance_id) not in instances:
            return Shutdown(instance_id=instance_id, runner_id=runner_id)

+        # Master removed our runner from state (retry signal) and process is dead
+        if runner_id not in all_runners and isinstance(
+            runner.status, (RunnerFailed, RunnerShutdown)
+        ):
+            return Shutdown(instance_id=instance_id, runner_id=runner_id)
+
        for (
            global_runner_id
        ) in runner.bound_instance.instance.shard_assignments.node_to_runner.values():
@@ -92,6 +99,7 @@ def _create_runner(
    node_id: NodeId,
    runners: Mapping[RunnerId, RunnerSupervisor],
    instances: Mapping[InstanceId, Instance],
+    all_runners: Mapping[RunnerId, RunnerStatus],
 ) -> CreateRunner | None:
    for instance in instances.values():
        runner_id = instance.shard_assignments.node_to_runner.get(node_id, None)
@@ -101,6 +109,16 @@ def _create_runner(
        if runner_id in runners:
            continue

+        # Don't create while any peer runner is in a terminal state — wait for
+        # the master to emit InstanceRetrying which removes them from state.
+        has_terminal_peer = any(
+            isinstance(all_runners.get(peer_rid), (RunnerFailed, RunnerShutdown))
+            for peer_rid in instance.shard_assignments.node_to_runner.values()
+            if peer_rid != runner_id
+        )
+        if has_terminal_peer:
+            continue
+
        shard = instance.shard(runner_id)
        assert shard is not None

--- a/src/exo/worker/runner/init.py
+++ b/src/exo/worker/runner/init.py
--- a/src/exo/worker/runner/bootstrap.py
+++ b/src/exo/worker/runner/bootstrap.py
@@ -37,9 +37,13 @@ def entrypoint(

    # Import main after setting global logger - this lets us just import logger from this module
    try:
-        from exo.worker.runner.runner import main
+        if bound_instance.is_image_model:
+            from exo.worker.runner.image_models.runner import main
+        else:
+            from exo.worker.runner.llm_inference.runner import main

        main(bound_instance, event_sender, task_receiver, cancel_receiver)
+
    except ClosedResourceError:
        logger.warning("Runner communication closed unexpectedly")
    except Exception as e:
--- a/src/exo/worker/runner/image_models/init.py
+++ b/src/exo/worker/runner/image_models/init.py
--- a/src/exo/worker/runner/image_models/runner.py
+++ b/src/exo/worker/runner/image_models/runner.py
@@ -0,0 +1,453 @@
+import base64
+import resource
+import time
+from typing import TYPE_CHECKING, Literal
+
+import mlx.core as mx
+
+from exo.shared.constants import EXO_MAX_CHUNK_SIZE, EXO_TRACING_ENABLED
+from exo.shared.models.model_cards import ModelTask
+from exo.shared.tracing import clear_trace_buffer, get_trace_buffer
+from exo.shared.types.api import ImageGenerationStats
+from exo.shared.types.chunks import ErrorChunk, ImageChunk
+from exo.shared.types.common import CommandId, ModelId
+from exo.shared.types.events import (
+    ChunkGenerated,
+    Event,
+    RunnerStatusUpdated,
+    TaskAcknowledged,
+    TaskStatusUpdated,
+    TraceEventData,
+    TracesCollected,
+)
+from exo.shared.types.tasks import (
+    ConnectToGroup,
+    ImageEdits,
+    ImageGeneration,
+    LoadModel,
+    Shutdown,
+    StartWarmup,
+    Task,
+    TaskId,
+    TaskStatus,
+)
+from exo.shared.types.worker.instances import BoundInstance
+from exo.shared.types.worker.runner_response import (
+    ImageGenerationResponse,
+    PartialImageResponse,
+)
+from exo.shared.types.worker.runners import (
+    RunnerConnected,
+    RunnerConnecting,
+    RunnerFailed,
+    RunnerIdle,
+    RunnerLoaded,
+    RunnerLoading,
+    RunnerReady,
+    RunnerRunning,
+    RunnerShutdown,
+    RunnerShuttingDown,
+    RunnerStatus,
+    RunnerWarmingUp,
+)
+from exo.shared.types.worker.shards import (
+    CfgShardMetadata,
+    PipelineShardMetadata,
+    ShardMetadata,
+)
+from exo.utils.channels import MpReceiver, MpSender
+from exo.worker.engines.image import (
+    DistributedImageModel,
+    generate_image,
+    initialize_image_model,
+    warmup_image_generator,
+)
+from exo.worker.engines.mlx.utils_mlx import (
+    initialize_mlx,
+)
+from exo.worker.runner.bootstrap import logger
+
+
+def _is_primary_output_node(shard_metadata: ShardMetadata) -> bool:
+    """Report whether this shard is the one that emits image output.
+
+    CFG shards: last pipeline stage within CFG group 0 (positive prompt).
+    Pipeline shards: last device rank. Any other shard type: False.
+    """
+    match shard_metadata:
+        case CfgShardMetadata():
+            last_stage = shard_metadata.pipeline_world_size - 1
+            at_last_stage = shard_metadata.pipeline_rank == last_stage
+            return at_last_stage and shard_metadata.cfg_rank == 0
+        case PipelineShardMetadata():
+            return shard_metadata.device_rank == shard_metadata.world_size - 1
+    return False
+
+
+def _process_image_response(
+    response: ImageGenerationResponse | PartialImageResponse,
+    command_id: CommandId,
+    shard_metadata: ShardMetadata,
+    event_sender: MpSender[Event],
+    image_index: int,
+) -> None:
+    """Base64-encode one image response and forward it to `_send_image_chunk`."""
+    # The `image_index` argument is not read; `response.image_index` is sent.
+    payload = base64.b64encode(response.image_data).decode("utf-8")
+    partial = isinstance(response, PartialImageResponse)
+    final = isinstance(response, ImageGenerationResponse)
+    _send_image_chunk(
+        encoded_data=payload,
+        command_id=command_id,
+        model_id=shard_metadata.model_card.model_id,
+        event_sender=event_sender,
+        image_index=response.image_index,
+        is_partial=partial,
+        partial_index=response.partial_index if partial else None,
+        total_partials=response.total_partials if partial else None,
+        stats=response.stats if final else None,
+        image_format=response.format,
+    )
+
+
+def _send_traces_if_enabled(
+    event_sender: MpSender[Event],
+    task_id: TaskId,
+    rank: int,
+) -> None:
+    """Send buffered traces for `task_id`, then call `clear_trace_buffer()`.
+
+    No-op when EXO_TRACING_ENABLED is falsy; an empty buffer sends no event.
+    """
+    if EXO_TRACING_ENABLED:
+        buffered = get_trace_buffer()
+        if buffered:
+            collected = TracesCollected(
+                task_id=task_id,
+                rank=rank,
+                traces=[
+                    TraceEventData(
+                        name=entry.name,
+                        start_us=entry.start_us,
+                        duration_us=entry.duration_us,
+                        rank=entry.rank,
+                        category=entry.category,
+                    )
+                    for entry in buffered
+                ],
+            )
+            event_sender.send(collected)
+        clear_trace_buffer()
+
+
+def _send_image_chunk(
+    encoded_data: str,
+    command_id: CommandId,
+    model_id: ModelId,
+    event_sender: MpSender[Event],
+    image_index: int,
+    is_partial: bool,
+    partial_index: int | None = None,
+    total_partials: int | None = None,
+    stats: ImageGenerationStats | None = None,
+    image_format: Literal["png", "jpeg", "webp"] | None = None,
+) -> None:
+    """Emit `encoded_data` as a sequence of `ChunkGenerated` image events.
+
+    The text is sliced into pieces of at most EXO_MAX_CHUNK_SIZE characters
+    and every piece becomes one `ImageChunk` carrying its own index and the
+    total piece count. `stats` is attached only to the last piece, and only
+    when `is_partial` is false. Empty `encoded_data` sends nothing.
+    """
+    offsets = range(0, len(encoded_data), EXO_MAX_CHUNK_SIZE)
+    total_chunks = len(offsets)
+    final_position = total_chunks - 1
+    for position, start in enumerate(offsets):
+        # `stats` rides only on the closing chunk of a non-partial image.
+        carries_stats = position == final_position and not is_partial
+        piece = ImageChunk(
+            model=model_id,
+            data=encoded_data[start : start + EXO_MAX_CHUNK_SIZE],
+            chunk_index=position,
+            total_chunks=total_chunks,
+            image_index=image_index,
+            is_partial=is_partial,
+            partial_index=partial_index,
+            total_partials=total_partials,
+            stats=stats if carries_stats else None,
+            format=image_format,
+        )
+        event_sender.send(
+            ChunkGenerated(command_id=command_id, chunk=piece)
+        )
+
+
+def main(
+    bound_instance: BoundInstance,
+    event_sender: MpSender[Event],
+    task_receiver: MpReceiver[Task],
+    cancel_receiver: MpReceiver[TaskId],  # NOTE(review): not read below
+):
+    """Drive the image runner: handle tasks one by one until a Shutdown task."""
+    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+    resource.setrlimit(resource.RLIMIT_NOFILE, (min(max(soft, 2048), hard), hard))
+    instance, runner_id, shard_metadata = (
+        bound_instance.instance,
+        bound_instance.bound_runner_id,
+        bound_instance.bound_shard,
+    )
+    device_rank = shard_metadata.device_rank
+    logger.info("hello from the runner")
+    if getattr(shard_metadata, "immediate_exception", False):
+        raise Exception("Fake exception - runner failed to spin up.")
+    if timeout := getattr(shard_metadata, "should_timeout", 0):
+        time.sleep(timeout)
+    # NOTE(review): the two getattr checks above look like fault-injection hooks — confirm.
+    setup_start_time = time.time()
+    cancelled_tasks = set[TaskId]()  # NOTE(review): nothing below adds to this set
+
+    image_model: DistributedImageModel | None = None
+    group = None
+
+    current_status: RunnerStatus = RunnerIdle()
+    logger.info("runner created")
+    event_sender.send(
+        RunnerStatusUpdated(runner_id=runner_id, runner_status=current_status)
+    )
+    seen = set[TaskId]()
+    with task_receiver as tasks:
+        for task in tasks:
+            if task.task_id in seen:
+                logger.warning("repeat task - potential error")
+            seen.add(task.task_id)
+            cancelled_tasks.discard(TaskId("CANCEL_CURRENT_TASK"))
+            event_sender.send(
+                TaskStatusUpdated(task_id=task.task_id, task_status=TaskStatus.Running)
+            )
+            match task:
+                case ConnectToGroup() if isinstance(
+                    current_status, (RunnerIdle, RunnerFailed)
+                ):
+                    logger.info("runner connecting")
+                    current_status = RunnerConnecting()
+                    event_sender.send(
+                        RunnerStatusUpdated(
+                            runner_id=runner_id, runner_status=current_status
+                        )
+                    )
+                    event_sender.send(TaskAcknowledged(task_id=task.task_id))
+                    group = initialize_mlx(bound_instance)
+                    # The status set below is published by the send at the end of the loop body.
+                    logger.info("runner connected")
+                    current_status = RunnerConnected()
+                # Load when connected with a group, or when idle without one.
+                # A runner should never be told to connect if it doesn't need to.
+                case LoadModel() if (
+                    isinstance(current_status, RunnerConnected) and group is not None
+                ) or (isinstance(current_status, RunnerIdle) and group is None):
+                    current_status = RunnerLoading()
+                    logger.info("runner loading")
+                    event_sender.send(
+                        RunnerStatusUpdated(
+                            runner_id=runner_id, runner_status=current_status
+                        )
+                    )
+                    event_sender.send(TaskAcknowledged(task_id=task.task_id))
+
+                    assert (
+                        ModelTask.TextToImage in shard_metadata.model_card.tasks
+                        or ModelTask.ImageToImage in shard_metadata.model_card.tasks
+                    ), f"Incorrect model task(s): {shard_metadata.model_card.tasks}"
+
+                    image_model = initialize_image_model(bound_instance)
+                    current_status = RunnerLoaded()
+                    logger.info("runner loaded")
+
+                case StartWarmup() if isinstance(current_status, RunnerLoaded):
+                    current_status = RunnerWarmingUp()
+                    logger.info("runner warming up")
+                    event_sender.send(
+                        RunnerStatusUpdated(
+                            runner_id=runner_id, runner_status=current_status
+                        )
+                    )
+                    event_sender.send(TaskAcknowledged(task_id=task.task_id))
+
+                    logger.info(f"warming up inference for instance: {instance}")
+
+                    assert image_model
+                    image = warmup_image_generator(model=image_model)
+                    if image is not None:
+                        logger.info(f"warmed up by generating {image.size} image")
+                    else:
+                        logger.info("warmup completed (non-primary node)")
+
+                    logger.info(
+                        f"runner initialized in {time.time() - setup_start_time} seconds"
+                    )
+
+                    current_status = RunnerReady()
+                    logger.info("runner ready")
+
+                case ImageGeneration(
+                    task_params=task_params, command_id=command_id
+                ) if isinstance(current_status, RunnerReady):
+                    assert image_model
+                    logger.info(f"received image generation request: {str(task)[:500]}")
+                    current_status = RunnerRunning()
+                    logger.info("runner running")
+                    event_sender.send(
+                        RunnerStatusUpdated(
+                            runner_id=runner_id, runner_status=current_status
+                        )
+                    )
+                    event_sender.send(TaskAcknowledged(task_id=task.task_id))
+
+                    try:
+                        image_index = 0
+                        for response in generate_image(
+                            model=image_model, task=task_params
+                        ):
+                            is_primary_output = _is_primary_output_node(shard_metadata)
+
+                            if is_primary_output:
+                                match response:
+                                    case PartialImageResponse():
+                                        logger.info(
+                                            f"sending partial ImageChunk {response.partial_index}/{response.total_partials}"
+                                        )
+                                        _process_image_response(
+                                            response,
+                                            command_id,
+                                            shard_metadata,
+                                            event_sender,
+                                            image_index,
+                                        )
+                                    case ImageGenerationResponse():
+                                        logger.info("sending final ImageChunk")
+                                        _process_image_response(
+                                            response,
+                                            command_id,
+                                            shard_metadata,
+                                            event_sender,
+                                            image_index,
+                                        )
+                                        image_index += 1
+                    # Primary node reports the error as a chunk, then it re-raises (make more explicit?)
+                    except Exception as e:
+                        if _is_primary_output_node(shard_metadata):
+                            event_sender.send(
+                                ChunkGenerated(
+                                    command_id=command_id,
+                                    chunk=ErrorChunk(
+                                        model=shard_metadata.model_card.model_id,
+                                        finish_reason="error",
+                                        error_message=str(e),
+                                    ),
+                                )
+                            )
+                        raise
+                    finally:
+                        _send_traces_if_enabled(event_sender, task.task_id, device_rank)
+
+                    current_status = RunnerReady()
+                    logger.info("runner ready")
+
+                case ImageEdits(task_params=task_params, command_id=command_id) if (
+                    isinstance(current_status, RunnerReady)
+                ):
+                    assert image_model
+                    logger.info(f"received image edits request: {str(task)[:500]}")
+                    current_status = RunnerRunning()
+                    logger.info("runner running")
+                    event_sender.send(
+                        RunnerStatusUpdated(
+                            runner_id=runner_id, runner_status=current_status
+                        )
+                    )
+                    event_sender.send(TaskAcknowledged(task_id=task.task_id))
+
+                    try:
+                        image_index = 0
+                        for response in generate_image(
+                            model=image_model, task=task_params
+                        ):
+                            if _is_primary_output_node(shard_metadata):
+                                match response:
+                                    case PartialImageResponse():
+                                        logger.info(
+                                            f"sending partial ImageChunk {response.partial_index}/{response.total_partials}"
+                                        )
+                                        _process_image_response(
+                                            response,
+                                            command_id,
+                                            shard_metadata,
+                                            event_sender,
+                                            image_index,
+                                        )
+                                    case ImageGenerationResponse():
+                                        logger.info("sending final ImageChunk")
+                                        _process_image_response(
+                                            response,
+                                            command_id,
+                                            shard_metadata,
+                                            event_sender,
+                                            image_index,
+                                        )
+                                        image_index += 1
+                    except Exception as e:
+                        if _is_primary_output_node(shard_metadata):
+                            event_sender.send(
+                                ChunkGenerated(
+                                    command_id=command_id,
+                                    chunk=ErrorChunk(
+                                        model=shard_metadata.model_card.model_id,
+                                        finish_reason="error",
+                                        error_message=str(e),
+                                    ),
+                                )
+                            )
+                        raise
+                    finally:
+                        _send_traces_if_enabled(event_sender, task.task_id, device_rank)
+
+                    current_status = RunnerReady()
+                    logger.info("runner ready")
+
+                case Shutdown():
+                    current_status = RunnerShuttingDown()
+                    logger.info("runner shutting down")
+                    if not TYPE_CHECKING:  # always taken at runtime
+                        del image_model, group
+                        mx.clear_cache()
+                        import gc
+
+                        gc.collect()
+
+                    event_sender.send(
+                        RunnerStatusUpdated(
+                            runner_id=runner_id, runner_status=current_status
+                        )
+                    )
+                    event_sender.send(TaskAcknowledged(task_id=task.task_id))
+
+                    current_status = RunnerShutdown()
+                case _:
+                    raise ValueError(
+                        f"Received {task.__class__.__name__} outside of state machine in {current_status=}"
+                    )
+            was_cancelled = (task.task_id in cancelled_tasks) or (
+                TaskId("CANCEL_CURRENT_TASK") in cancelled_tasks
+            )
+            if not was_cancelled:  # NOTE(review): always taken while the set stays empty
+                event_sender.send(
+                    TaskStatusUpdated(
+                        task_id=task.task_id, task_status=TaskStatus.Complete
+                    )
+                )
+            event_sender.send(
+                RunnerStatusUpdated(runner_id=runner_id, runner_status=current_status)
+            )
+            # Stop consuming tasks once a Shutdown task has been handled.
+            if isinstance(current_status, RunnerShutdown):
+                break
--- a/src/exo/worker/runner/llm_inference/init.py
+++ b/src/exo/worker/runner/llm_inference/init.py
--- a/src/exo/worker/runner/llm_inference/runner.py
+++ b/src/exo/worker/runner/llm_inference/runner.py
@@ -1,10 +1,9 @@
-import base64
 import math
 import resource
 import time
 from collections.abc import Generator
 from functools import cache
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, cast

 import mlx.core as mx
 from mlx_lm.models.deepseek_v32 import Model as DeepseekV32Model
@@ -18,31 +17,22 @@ from openai_harmony import (  # pyright: ignore[reportMissingTypeStubs]
    load_harmony_encoding,
 )

-from exo.shared.constants import EXO_MAX_CHUNK_SIZE, EXO_TRACING_ENABLED
-from exo.shared.models.model_cards import ModelId, ModelTask
-from exo.shared.tracing import clear_trace_buffer, get_trace_buffer
-from exo.shared.types.api import ImageGenerationStats
+from exo.shared.models.model_cards import ModelTask
 from exo.shared.types.chunks import (
    ErrorChunk,
-    ImageChunk,
    PrefillProgressChunk,
    TokenChunk,
    ToolCallChunk,
 )
-from exo.shared.types.common import CommandId
 from exo.shared.types.events import (
    ChunkGenerated,
    Event,
    RunnerStatusUpdated,
    TaskAcknowledged,
    TaskStatusUpdated,
-    TraceEventData,
-    TracesCollected,
 )
 from exo.shared.types.tasks import (
    ConnectToGroup,
-    ImageEdits,
-    ImageGeneration,
    LoadModel,
    Shutdown,
    StartWarmup,
@@ -55,8 +45,6 @@ from exo.shared.types.text_generation import TextGenerationTaskParams
 from exo.shared.types.worker.instances import BoundInstance
 from exo.shared.types.worker.runner_response import (
    GenerationResponse,
-    ImageGenerationResponse,
-    PartialImageResponse,
    ToolCallItem,
    ToolCallResponse,
 )
@@ -74,18 +62,7 @@ from exo.shared.types.worker.runners import (
    RunnerStatus,
    RunnerWarmingUp,
 )
-from exo.shared.types.worker.shards import (
-    CfgShardMetadata,
-    PipelineShardMetadata,
-    ShardMetadata,
-)
 from exo.utils.channels import MpReceiver, MpSender
-from exo.worker.engines.image import (
-    DistributedImageModel,
-    generate_image,
-    initialize_image_model,
-    warmup_image_generator,
-)
 from exo.worker.engines.mlx import Model
 from exo.worker.engines.mlx.cache import KVPrefixCache
 from exo.worker.engines.mlx.generator.generate import (
@@ -106,22 +83,6 @@ from exo.worker.runner.bootstrap import logger
 from .tool_parsers import ToolParser, make_mlx_parser


-def _is_primary_output_node(shard_metadata: ShardMetadata) -> bool:
-    """Check if this node is the primary output node for image generation.
-
-    For CFG models: the last pipeline stage in CFG group 0 (positive prompt).
-    For non-CFG models: the last pipeline stage.
-    """
-    if isinstance(shard_metadata, CfgShardMetadata):
-        is_pipeline_last = (
-            shard_metadata.pipeline_rank == shard_metadata.pipeline_world_size - 1
-        )
-        return is_pipeline_last and shard_metadata.cfg_rank == 0
-    elif isinstance(shard_metadata, PipelineShardMetadata):
-        return shard_metadata.device_rank == shard_metadata.world_size - 1
-    return False
-
-
 def main(
    bound_instance: BoundInstance,
    event_sender: MpSender[Event],
@@ -146,9 +107,7 @@ def main(
    setup_start_time = time.time()
    cancelled_tasks = set[TaskId]()

-    # type checker was unhappy with me - splitting these fixed it
    inference_model: Model | None = None
-    image_model: DistributedImageModel | None = None
    tokenizer = None
    tool_parser: ToolParser | None = None
    group = None
@@ -211,33 +170,25 @@ def main(
                        )
                        time.sleep(0.5)

-                    if ModelTask.TextGeneration in shard_metadata.model_card.tasks:
-                        inference_model, tokenizer = load_mlx_items(
-                            bound_instance, group, on_timeout=on_model_load_timeout
-                        )
-                        logger.info(
-                            f"model has_tool_calling={tokenizer.has_tool_calling} using tokens {tokenizer.tool_call_start}, {tokenizer.tool_call_end}"
-                        )
-                        if tokenizer.has_tool_calling:
-                            assert tokenizer.tool_call_start
-                            assert tokenizer.tool_call_end
-                            assert tokenizer.tool_parser  # pyright: ignore[reportAny]
-                            tool_parser = make_mlx_parser(
-                                tokenizer.tool_call_start,
-                                tokenizer.tool_call_end,
-                                tokenizer.tool_parser,  # pyright: ignore[reportAny]
-                            )
-                        kv_prefix_cache = KVPrefixCache(group)
-
-                    elif (
-                        ModelTask.TextToImage in shard_metadata.model_card.tasks
-                        or ModelTask.ImageToImage in shard_metadata.model_card.tasks
-                    ):
-                        image_model = initialize_image_model(bound_instance)
-                    else:
-                        raise ValueError(
-                            f"Unknown model task(s): {shard_metadata.model_card.tasks}"
+                    assert (
+                        ModelTask.TextGeneration in shard_metadata.model_card.tasks
+                    ), f"Incorrect model task(s): {shard_metadata.model_card.tasks}"
+                    inference_model, tokenizer = load_mlx_items(
+                        bound_instance, group, on_timeout=on_model_load_timeout
+                    )
+                    logger.info(
+                        f"model has_tool_calling={tokenizer.has_tool_calling} using tokens {tokenizer.tool_call_start}, {tokenizer.tool_call_end}"
+                    )
+                    if tokenizer.has_tool_calling:
+                        assert tokenizer.tool_call_start
+                        assert tokenizer.tool_call_end
+                        assert tokenizer.tool_parser  # pyright: ignore[reportAny]
+                        tool_parser = make_mlx_parser(
+                            tokenizer.tool_call_start,
+                            tokenizer.tool_call_end,
+                            tokenizer.tool_parser,  # pyright: ignore[reportAny]
                        )
+                    kv_prefix_cache = KVPrefixCache(group)
                    current_status = RunnerLoaded()
                    logger.info("runner loaded")
                case StartWarmup() if isinstance(current_status, RunnerLoaded):
@@ -251,46 +202,34 @@ def main(
                    event_sender.send(TaskAcknowledged(task_id=task.task_id))

                    logger.info(f"warming up inference for instance: {instance}")
-                    if ModelTask.TextGeneration in shard_metadata.model_card.tasks:
-                        assert inference_model
-                        assert tokenizer
+                    assert inference_model
+                    assert tokenizer

-                        t = time.monotonic()
-                        toks = warmup_inference(
-                            model=inference_model,
-                            tokenizer=tokenizer,
-                            group=group,
+                    t = time.monotonic()
+                    toks = warmup_inference(
+                        model=cast(Model, inference_model),
+                        tokenizer=tokenizer,
+                        group=group,
+                    )
+                    logger.info(f"warmed up by generating {toks} tokens")
+                    check_for_cancel_every = min(
+                        math.ceil(toks / min(time.monotonic() - t, 0.001)), 100
+                    )
+                    if group is not None:
+                        check_for_cancel_every = int(
+                            mx.max(
+                                mx.distributed.all_gather(
+                                    mx.array([check_for_cancel_every]), group=group
+                                )
+                            ).item()
                        )
-                        logger.info(f"warmed up by generating {toks} tokens")
-                        check_for_cancel_every = min(
-                            math.ceil(toks / min(time.monotonic() - t, 0.001)), 100
-                        )
-                        if group is not None:
-                            check_for_cancel_every = int(
-                                mx.max(
-                                    mx.distributed.all_gather(
-                                        mx.array([check_for_cancel_every]), group=group
-                                    )
-                                ).item()
-                            )
-
-                        logger.info(
-                            f"runner checking for cancellation every {check_for_cancel_every} tokens"
-                        )
-                        logger.info(
-                            f"runner initialized in {time.time() - setup_start_time} seconds"
-                        )
-                    elif (
-                        ModelTask.TextToImage in shard_metadata.model_card.tasks
-                        or ModelTask.ImageToImage in shard_metadata.model_card.tasks
-                    ):
-                        assert image_model
-                        image = warmup_image_generator(model=image_model)
-                        if image is not None:
-                            logger.info(f"warmed up by generating {image.size} image")
-                        else:
-                            logger.info("warmup completed (non-primary node)")

+                    logger.info(
+                        f"runner checking for cancellation every {check_for_cancel_every} tokens"
+                    )
+                    logger.info(
+                        f"runner initialized in {time.time() - setup_start_time} seconds"
+                    )
                    current_status = RunnerReady()
                    logger.info("runner ready")
                case TextGeneration(task_params=task_params, command_id=command_id) if (
@@ -345,7 +284,7 @@ def main(

                        # Generate responses using the actual MLX generation
                        mlx_generator = mlx_generate(
-                            model=inference_model,
+                            model=cast(Model, inference_model),
                            tokenizer=tokenizer,
                            task=task_params,
                            prompt=prompt,
@@ -374,7 +313,7 @@ def main(
                            mlx_generator = parse_tool_calls(mlx_generator, tool_parser)

                        completion_tokens = 0
-                        tokens_since_last_cancel_check = 0
+                        tokens_since_last_cancel_check = check_for_cancel_every
                        for response in mlx_generator:
                            tokens_since_last_cancel_check += 1
                            if tokens_since_last_cancel_check >= check_for_cancel_every:
@@ -458,138 +397,12 @@ def main(

                    current_status = RunnerReady()
                    logger.info("runner ready")
-                case ImageGeneration(
-                    task_params=task_params, command_id=command_id
-                ) if isinstance(current_status, RunnerReady):
-                    assert image_model
-                    logger.info(f"received image generation request: {str(task)[:500]}")
-                    current_status = RunnerRunning()
-                    logger.info("runner running")
-                    event_sender.send(
-                        RunnerStatusUpdated(
-                            runner_id=runner_id, runner_status=current_status
-                        )
-                    )
-                    event_sender.send(TaskAcknowledged(task_id=task.task_id))

-                    try:
-                        image_index = 0
-                        for response in generate_image(
-                            model=image_model, task=task_params
-                        ):
-                            is_primary_output = _is_primary_output_node(shard_metadata)
-
-                            if is_primary_output:
-                                match response:
-                                    case PartialImageResponse():
-                                        logger.info(
-                                            f"sending partial ImageChunk {response.partial_index}/{response.total_partials}"
-                                        )
-                                        _process_image_response(
-                                            response,
-                                            command_id,
-                                            shard_metadata,
-                                            event_sender,
-                                            image_index,
-                                        )
-                                    case ImageGenerationResponse():
-                                        logger.info("sending final ImageChunk")
-                                        _process_image_response(
-                                            response,
-                                            command_id,
-                                            shard_metadata,
-                                            event_sender,
-                                            image_index,
-                                        )
-                                        image_index += 1
-                    # can we make this more explicit?
-                    except Exception as e:
-                        if _is_primary_output_node(shard_metadata):
-                            event_sender.send(
-                                ChunkGenerated(
-                                    command_id=command_id,
-                                    chunk=ErrorChunk(
-                                        model=shard_metadata.model_card.model_id,
-                                        finish_reason="error",
-                                        error_message=str(e),
-                                    ),
-                                )
-                            )
-                        raise
-                    finally:
-                        _send_traces_if_enabled(
-                            event_sender, task.task_id, shard_metadata.device_rank
-                        )
-
-                    current_status = RunnerReady()
-                    logger.info("runner ready")
-                case ImageEdits(task_params=task_params, command_id=command_id) if (
-                    isinstance(current_status, RunnerReady)
-                ):
-                    assert image_model
-                    logger.info(f"received image edits request: {str(task)[:500]}")
-                    current_status = RunnerRunning()
-                    logger.info("runner running")
-                    event_sender.send(
-                        RunnerStatusUpdated(
-                            runner_id=runner_id, runner_status=current_status
-                        )
-                    )
-                    event_sender.send(TaskAcknowledged(task_id=task.task_id))
-
-                    try:
-                        image_index = 0
-                        for response in generate_image(
-                            model=image_model, task=task_params
-                        ):
-                            if _is_primary_output_node(shard_metadata):
-                                match response:
-                                    case PartialImageResponse():
-                                        logger.info(
-                                            f"sending partial ImageChunk {response.partial_index}/{response.total_partials}"
-                                        )
-                                        _process_image_response(
-                                            response,
-                                            command_id,
-                                            shard_metadata,
-                                            event_sender,
-                                            image_index,
-                                        )
-                                    case ImageGenerationResponse():
-                                        logger.info("sending final ImageChunk")
-                                        _process_image_response(
-                                            response,
-                                            command_id,
-                                            shard_metadata,
-                                            event_sender,
-                                            image_index,
-                                        )
-                                        image_index += 1
-                    except Exception as e:
-                        if _is_primary_output_node(shard_metadata):
-                            event_sender.send(
-                                ChunkGenerated(
-                                    command_id=command_id,
-                                    chunk=ErrorChunk(
-                                        model=shard_metadata.model_card.model_id,
-                                        finish_reason="error",
-                                        error_message=str(e),
-                                    ),
-                                )
-                            )
-                        raise
-                    finally:
-                        _send_traces_if_enabled(
-                            event_sender, task.task_id, shard_metadata.device_rank
-                        )
-
-                    current_status = RunnerReady()
-                    logger.info("runner ready")
                case Shutdown():
                    current_status = RunnerShuttingDown()
                    logger.info("runner shutting down")
                    if not TYPE_CHECKING:
-                        del inference_model, image_model, tokenizer, group
+                        del inference_model, tokenizer, group
                        mx.clear_cache()
                        import gc

@@ -890,104 +703,6 @@ def parse_thinking_models(
        yield response.model_copy(update={"is_thinking": in_thinking})


-def _send_image_chunk(
-    encoded_data: str,
-    command_id: CommandId,
-    model_id: ModelId,
-    event_sender: MpSender[Event],
-    image_index: int,
-    is_partial: bool,
-    partial_index: int | None = None,
-    total_partials: int | None = None,
-    stats: ImageGenerationStats | None = None,
-    image_format: Literal["png", "jpeg", "webp"] | None = None,
-) -> None:
-    """Send base64-encoded image data as chunks via events."""
-    data_chunks = [
-        encoded_data[i : i + EXO_MAX_CHUNK_SIZE]
-        for i in range(0, len(encoded_data), EXO_MAX_CHUNK_SIZE)
-    ]
-    total_chunks = len(data_chunks)
-    for chunk_index, chunk_data in enumerate(data_chunks):
-        # Only include stats on the last chunk of the final image
-        chunk_stats = (
-            stats if chunk_index == total_chunks - 1 and not is_partial else None
-        )
-        event_sender.send(
-            ChunkGenerated(
-                command_id=command_id,
-                chunk=ImageChunk(
-                    model=model_id,
-                    data=chunk_data,
-                    chunk_index=chunk_index,
-                    total_chunks=total_chunks,
-                    image_index=image_index,
-                    is_partial=is_partial,
-                    partial_index=partial_index,
-                    total_partials=total_partials,
-                    stats=chunk_stats,
-                    format=image_format,
-                ),
-            )
-        )
-
-
-def _send_traces_if_enabled(
-    event_sender: MpSender[Event],
-    task_id: TaskId,
-    rank: int,
-) -> None:
-    if not EXO_TRACING_ENABLED:
-        return
-
-    traces = get_trace_buffer()
-    if traces:
-        trace_data = [
-            TraceEventData(
-                name=t.name,
-                start_us=t.start_us,
-                duration_us=t.duration_us,
-                rank=t.rank,
-                category=t.category,
-            )
-            for t in traces
-        ]
-        event_sender.send(
-            TracesCollected(
-                task_id=task_id,
-                rank=rank,
-                traces=trace_data,
-            )
-        )
-    clear_trace_buffer()
-
-
-def _process_image_response(
-    response: ImageGenerationResponse | PartialImageResponse,
-    command_id: CommandId,
-    shard_metadata: ShardMetadata,
-    event_sender: MpSender[Event],
-    image_index: int,
-) -> None:
-    """Process a single image response and send chunks."""
-    encoded_data = base64.b64encode(response.image_data).decode("utf-8")
-    is_partial = isinstance(response, PartialImageResponse)
-    # Extract stats from final ImageGenerationResponse if available
-    stats = response.stats if isinstance(response, ImageGenerationResponse) else None
-    _send_image_chunk(
-        encoded_data=encoded_data,
-        command_id=command_id,
-        model_id=shard_metadata.model_card.model_id,
-        event_sender=event_sender,
-        image_index=response.image_index,
-        is_partial=is_partial,
-        partial_index=response.partial_index if is_partial else None,
-        total_partials=response.total_partials if is_partial else None,
-        stats=stats,
-        image_format=response.format,
-    )
-
-
 def parse_tool_calls(
    responses: Generator[GenerationResponse], tool_parser: ToolParser
 ) -> Generator[GenerationResponse | ToolCallResponse]:
--- a/src/exo/worker/runner/llm_inference/tool_parsers.py
+++ b/src/exo/worker/runner/llm_inference/tool_parsers.py
--- a/src/exo/worker/runner/runner_supervisor.py
+++ b/src/exo/worker/runner/runner_supervisor.py
@@ -100,10 +100,13 @@ class RunnerSupervisor:
        logger.info("Runner supervisor shutting down")
        self._ev_recv.close()
        self._task_sender.close()
-        with contextlib.suppress(ClosedResourceError):
+        try:
            self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
-        self._cancel_sender.close()
-        self.runner_process.join(5)
+            self._cancel_sender.close()
+        except ClosedResourceError:
+            pass
+        self._event_sender.close()
+        self.runner_process.join(1)
        if not self.runner_process.is_alive():
            logger.info("Runner process succesfully terminated")
            return
@@ -140,6 +143,7 @@ class RunnerSupervisor:
        await event.wait()

    async def cancel_task(self, task_id: TaskId):
+        """Send a cancellation signal to the runner process."""
        if task_id in self.completed:
            logger.info(f"Unable to cancel {task_id} as it has been completed")
            return
--- a/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py
+++ b/src/exo/worker/tests/unittests/test_runner/test_dsml_e2e.py
@@ -19,7 +19,7 @@ from exo.worker.engines.mlx.dsml_encoding import (
    encode_messages,
    parse_dsml_output,
 )
-from exo.worker.runner.runner import parse_deepseek_v32
+from exo.worker.runner.llm_inference.runner import parse_deepseek_v32

 # ── Shared fixtures ──────────────────────────────────────────────

--- a/src/exo/worker/tests/unittests/test_runner/test_event_ordering.py
+++ b/src/exo/worker/tests/unittests/test_runner/test_event_ordering.py
@@ -6,7 +6,7 @@ from typing import Callable
 import mlx.core as mx
 import pytest

-import exo.worker.runner.runner as mlx_runner
+import exo.worker.runner.llm_inference.runner as mlx_runner
 from exo.shared.types.chunks import TokenChunk
 from exo.shared.types.events import (
    ChunkGenerated,
@@ -179,8 +179,11 @@ def _run(tasks: Iterable[Task]):
        # this is some c++ nonsense
        task_receiver.close = nothin
        task_receiver.join = nothin
+        cancel_receiver.close = nothin
+        cancel_receiver.join = nothin
+
        with unittest.mock.patch(
-            "exo.worker.runner.runner.mx.distributed.all_gather",
+            "exo.worker.runner.llm_inference.runner.mx.distributed.all_gather",
            make_nothin(mx.array([1])),
        ):
            mlx_runner.main(
--- a/src/exo/worker/tests/unittests/test_runner/test_parse_gpt_oss.py
+++ b/src/exo/worker/tests/unittests/test_runner/test_parse_gpt_oss.py
@@ -4,7 +4,7 @@ from exo.shared.types.worker.runner_response import (
    GenerationResponse,
    ToolCallResponse,
 )
-from exo.worker.runner.runner import parse_gpt_oss
+from exo.worker.runner.llm_inference.runner import parse_gpt_oss

 # Token IDs from mlx-community/gpt-oss-20b-MXFP4-Q8 tokenizer.
 # These are stable since they come from the model's vocabulary.
--- a/src/exo/worker/tests/unittests/test_runner/test_parse_tool_calls.py
+++ b/src/exo/worker/tests/unittests/test_runner/test_parse_tool_calls.py
@@ -4,8 +4,8 @@ from collections.abc import Generator
 from typing import Any

 from exo.shared.types.worker.runner_response import GenerationResponse, ToolCallResponse
-from exo.worker.runner.runner import parse_tool_calls
-from exo.worker.runner.tool_parsers import make_mlx_parser
+from exo.worker.runner.llm_inference.runner import parse_tool_calls
+from exo.worker.runner.llm_inference.tool_parsers import make_mlx_parser


 def _make_responses(
--- a/uv.lock
+++ b/uv.lock
@@ -378,7 +378,7 @@ dependencies = [
    { name = "loguru", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "mflux", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "mlx", version = "0.30.6", source = { registry = "https://pypi.org/simple" }, extra = ["cpu"], marker = "sys_platform == 'linux'" },
-    { name = "mlx", version = "0.30.7.dev20260218+14841977", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#1484197707f35186ad3bd614357c7c47fdf86ebc" }, marker = "sys_platform == 'darwin'" },
+    { name = "mlx", version = "0.30.7.dev20260220+13998a05", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#13998a054715edcdc93618fb1496c79c7c25ff7c" }, marker = "sys_platform == 'darwin'" },
    { name = "mlx-lm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "msgspec", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "openai-harmony", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -1023,7 +1023,7 @@ dependencies = [
    { name = "huggingface-hub", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "matplotlib", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "mlx", version = "0.30.6", source = { registry = "https://pypi.org/simple" }, extra = ["cuda13"], marker = "sys_platform == 'linux'" },
-    { name = "mlx", version = "0.30.7.dev20260218+14841977", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#1484197707f35186ad3bd614357c7c47fdf86ebc" }, marker = "sys_platform == 'darwin'" },
+    { name = "mlx", version = "0.30.7.dev20260220+13998a05", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#13998a054715edcdc93618fb1496c79c7c25ff7c" }, marker = "sys_platform == 'darwin'" },
    { name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "opencv-python", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "piexif", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -1070,8 +1070,8 @@ cuda13 = [

 [[package]]
 name = "mlx"
-version = "0.30.7.dev20260218+14841977"
-source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#1484197707f35186ad3bd614357c7c47fdf86ebc" }
+version = "0.30.7.dev20260220+13998a05"
+source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#13998a054715edcdc93618fb1496c79c7c25ff7c" }
 resolution-markers = [
    "sys_platform == 'darwin'",
 ]
@@ -1106,7 +1106,7 @@ version = "0.30.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "mlx", version = "0.30.7.dev20260218+14841977", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#1484197707f35186ad3bd614357c7c47fdf86ebc" }, marker = "sys_platform == 'darwin'" },
+    { name = "mlx", version = "0.30.7.dev20260220+13998a05", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#13998a054715edcdc93618fb1496c79c7c25ff7c" }, marker = "sys_platform == 'darwin'" },
    { name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
Author	SHA1	Message	Date
Alex Cheema	928c41b13c	feat: add MetaInstance declarative layer with reconciliation Adds a declarative MetaInstance system for managing model instances with automatic placement, retry logic (max 3 attempts), and lifecycle management via a reconciliation loop. - Process managers for instance health, meta-instance lifecycle, and node timeout detection - Reconciliation engine driving state transitions and cascading deletes - Dashboard UI for creating/managing MetaInstances with node selection, sharding config, retry status, and error feedback - JACCL SideChannel integration for distributed inference coordination - Comprehensive test suite (25+ edge cases) Split from original #1519. Independent bug fixes extracted to: #1547 (misc fixes), #1546 (JACCL sidechannel), #1582 (download detection), #1580 (RDMA warning) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-21 13:05:10 -08:00
Alex Cheema	1780e4ade4	fix: change RDMA AVAILABLE to RDMA NOT ENABLED warning (#1580 ) ## Summary - Changed blue info badge "RDMA AVAILABLE" to yellow warning badge "RDMA NOT ENABLED" — more accurately describes the state - Added hover tooltip with enable instructions to all views (was missing in 2 of 4 instances) - Warning icon instead of info icon, consistent with other cluster warnings (TB cycle, macOS mismatch) ## Screenshots Badge (yellow warning): ![RDMA warning badge](https://raw.githubusercontent.com/exo-explore/exo/3f7bdb482c5011d60f140aa84ab21023032e4a57/rdma-warning.png) Hover tooltip with instructions: ![RDMA warning hover](https://raw.githubusercontent.com/exo-explore/exo/3f7bdb482c5011d60f140aa84ab21023032e4a57/rdma-warning-hover.png) ## Test plan - [x] Dashboard builds successfully - [ ] Verify badge appears when 2+ TB5 nodes have RDMA disabled - [ ] Verify hover tooltip shows in normal layout - [ ] Verify hover tooltip shows in topology-only mode - [ ] Verify dismiss button works - [ ] Verify compact badge in status bar shows yellow warning 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: rltakashige <rl.takashige@gmail.com>	2026-02-20 21:40:07 +00:00
Jake Hillion	ab9273e723	downloads: add read_only flag to DownloadCompleted for EXO_MODELS_PATH Models in EXO_MODELS_PATH are pre-downloaded into read-only directories and must not be deleted. The DownloadCoordinator had no awareness of these paths, so they never appeared as completed downloads in cluster state, and the bench harness could attempt to delete them when freeing disk space. Added a `read_only: bool` field to `DownloadCompleted` (default False). The DownloadCoordinator now checks `resolve_model_in_path` in `_start_download`, proactively scans EXO_MODELS_PATH in `_emit_existing_download_progress` to emit DownloadCompleted events for all pre-downloaded models (overriding DownloadPending from the regular scan), and refuses deletion of read-only models. The bench harness filters out read-only models from deletion candidates. Test plan: - Ran with EXO_MODELS_PATH. Available models now show as downloaded in the UI. There isn't good UI for the fact they can't be deleted, but it should work with exo_bench.	2026-02-20 20:27:45 +00:00
Jake Hillion	71e48c0f62	model-cards: add missing metadata for Qwen3 Coder Next variants (#1576 ) The Qwen3-Coder-Next model card TOML files were missing family, quantization, base_model, and capabilities fields. This caused them not to appear under the Qwen family filter in the dashboard's model picker. Added the missing metadata to all five variants (4bit, 5bit, 6bit, 8bit, bf16), matching the format used by the existing Qwen3-Coder-480B model cards. Test plan: - Eyeballs	2026-02-20 18:25:49 +00:00
Jake Hillion	42da58c297	worker: add EXO_MODELS_PATH for pre-downloaded model directories Users with pre-existing model files (e.g. on shared NFS mounts or from prior downloads) had no way to point exo at those directories without going through the download coordinator. EXO_MODELS_DIR only moves the download target directory, it doesn't support read-only search paths. Added EXO_MODELS_PATH environment variable as a colon-separated list of directories to search for models. When the worker's plan loop encounters a DownloadModel task, it checks these directories first and emits a synthetic DownloadCompleted event if found, bypassing the download coordinator entirely. The runner's build_model_path also checks these directories first so the correct path is used during model loading. This keeps the existing event sourcing state machine unchanged — the DownloadCompleted event propagates naturally through the system, so _load_model and all downstream logic work without modification. Test plan: - `s1@s1s-Mac-Studio ~ % EXO_LIBP2P_NAMESPACE=jake EXO_MODELS_PATH="/Volumes/Definitely Leo's SSD" nix --extra-experimental-features 'nix-command flakes' run github:exo-explore/exo/f2babbc2f742357d97dc177619fec062ef545be4` - Started mlx-community/Qwen3-Coder-Next-4bit - it's present on the disk and it worked. - Renamed one safetensor of mlx-community/Qwen3-Coder-Next-4bit on the disk. It then started the download locally, as expected.	2026-02-20 18:17:56 +00:00
Mustafa Alp Yılmaz	6b5a705959	fix: immediate cancel check after prefill completes (#1575 ) ## Problem When a request is cancelled during prefill, the cancellation is not detected until `check_for_cancel_every` additional tokens have been generated. This is because `tokens_since_last_cancel_check` is initialized to `0`, meaning the first cancel check only happens after generating `check_for_cancel_every` tokens post-prefill. For long prefills (which are the most likely to be cancelled), this adds unnecessary latency before the cancellation is actually honoured. ## Fix Initialize `tokens_since_last_cancel_check` to `check_for_cancel_every` instead of `0`, so the very first token generated after prefill triggers an immediate cancel check. ```diff - tokens_since_last_cancel_check = 0 + tokens_since_last_cancel_check = check_for_cancel_every ``` ## Impact - Cancellations issued during prefill are detected immediately when generation begins - No change in behaviour for non-cancelled requests (the counter resets to `0` after each check as before) - 1 line changed Co-authored-by: rltakashige <rl.takashige@gmail.com>	2026-02-20 18:00:59 +00:00
Alex Cheema	6b54a27019	fix: add downloaded_bytes to DownloadPending event (#1564 ) ## Summary - Add downloaded_bytes field to existing DownloadPending event for accurate resume progress - Minimal change per maintainer directive — no new download states introduced ## Test plan - [x] 42 tests passed, 1 skipped - [x] Verified downloaded_bytes populates correctly for partial downloads 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: rltakashige <rl.takashige@gmail.com>	2026-02-20 17:54:18 +00:00
rltakashige	e01f50a5cd	Update mlx fork (#1565 ) ## Motivation Some fixes upstream. This sort of commit will probably be quite common until GPU locks are resolved.	2026-02-20 17:23:52 +00:00
Evan Quiney	1093080214	cancel active downloads on coordinator shutdown (#1567 ) we were seeing some crashes as lost download tasks were trying to push data toward a deleted coordinator. this cancels download tasks with the coordinator's shutdown on master election	2026-02-20 17:17:43 +00:00
rltakashige	1a2b8b044a	Refactor runner into separate runners (#1570 ) ## Motivation We're going to be refactoring the llm inference code, so we should split the runner up into parts while we can. ## Test Plan ### Manual Testing Works on single node, at least. ### Automated Testing Passes CI. Will be tested by our tests today.	2026-02-20 17:11:01 +00:00
Evan Quiney	dc8d42b4dc	add system ids (#1536 ) addresses some election edge cases where a new worker with an old master would get stuck on the old workers buffer index - we now use new system ids each time we instantiate a node, and each event-producing system has a unique system id for its lifespan (until the master moves).	2026-02-20 15:41:59 +00:00
Jake Hillion	d484b062e8	bench: add download timing to bench output (#1566 ) The bench script downloads models during the planning phase but doesn't record how long the download took, making it difficult to track download performance for a given model over time. Modified `run_planning_phase` to return download metadata: whether a fresh download occurred, the wall-clock duration, and the model size in bytes. These fields are included in every JSON output row alongside the existing per-run metrics, and a summary line is logged to the console. This allows filtering bench results by `download_occurred` and grouping by `model_id` to compute average download times across runs. Test plan: ``` # existing model jake@maverick:/data/users/jake/repos/exo/ > nix run .#exo-bench -- --host s1 --model mlx-community/gpt-oss-120b-MXFP4-Q8 --pp 128 --tg 128 ... 2026-02-20 15:23:49.081 \| INFO \| __main__:main:340 - Planning phase: checking downloads... 2026-02-20 15:23:49.152 \| INFO \| harness:run_planning_phase:402 - Started download on 12D3KooWKx41iikn188ozrxSdoG26g88jFCfie9wEA1eQR8csbPm 2026-02-20 15:23:49.184 \| INFO \| __main__:main:352 - Download: model already cached ... Wrote results JSON: bench/results.json jake@maverick:/data/users/jake/repos/exo/ > cat bench/results.json [ { "elapsed_s": 2.9446684420108795, "output_text_preview": "The user just typed a long series of \"a\". Possibly they are testing. There's no explicit question. Could be they want a response? Might be a test of handling long input. 
We can respond politely, ask i", "stats": { "prompt_tps": 117.7872141515621, "generation_tps": 85.49598231498028, "prompt_tokens": 129, "generation_tokens": 128, "peak_memory_usage": { "inBytes": 68215145744 } }, "model_short_id": "gpt-oss-120b-MXFP4-Q8", "model_id": "mlx-community/gpt-oss-120b-MXFP4-Q8", "placement_sharding": "Pipeline", "placement_instance_meta": "MlxRing", "placement_nodes": 1, "instance_id": "68babc2a-6e94-4c70-aa07-7ec681f7c856", "pp_tokens": 128, "tg": 128, "repeat_index": 0 } ]% # no change to output ``` ``` # missing model jake@maverick:/data/users/jake/repos/exo/ > nix run .#exo-bench -- --host s1 --model mlx-community/Meta-Llama-3.1-8B-Instruct-4bit --pp 128 --tg 128 ... 2026-02-20 15:24:42.553 \| INFO \| __main__:main:340 - Planning phase: checking downloads... 2026-02-20 15:24:42.625 \| INFO \| harness:run_planning_phase:402 - Started download on 12D3KooWKx41iikn188ozrxSdoG26g88jFCfie9wEA1eQR8csbPm 2026-02-20 15:25:37.494 \| INFO \| __main__:main:350 - Download: 54.9s (freshly downloaded) ... Wrote results JSON: bench/results.json jake@maverick:/data/users/jake/repos/exo/ > cat bench/results.json [ { "elapsed_s": 1.500349276990164, "output_text_preview": "It seems like you've entered a large number of 'a's. If you'd like to discuss something or ask a question, I'm here to help. If not, is there anything else I can assist you with? \n\nIf you're intereste", "stats": { "prompt_tps": 395.43264952543666, "generation_tps": 128.03520443181478, "prompt_tokens": 129, "generation_tokens": 128, "peak_memory_usage": { "inBytes": 5116952079 } }, "model_short_id": "Meta-Llama-3.1-8B-Instruct-4bit", "model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit", "placement_sharding": "Pipeline", "placement_instance_meta": "MlxRing", "placement_nodes": 1, "instance_id": "ccd9bd71-d4cc-4b75-a37f-98090544626a", "pp_tokens": 128, "tg": 128, "repeat_index": 0, "download_duration_s": 54.88322358299047 } ]% # one new field ```	2026-02-20 15:33:08 +00:00