Compare commits

...

3 Commits

Author SHA1 Message Date
Alex Cheema
6a07723414 dashboard: show failure reasons for runner and download errors (#1350)
Surface error messages that the backend already captures but the
dashboard was discarding. Runner failures now extract `errorMessage` from
the `RunnerFailed` status, and download failures display `errorMessage`
from `DownloadFailed` payloads on both the main page and the downloads page.
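A minimal sketch of the extraction pattern, assuming tagged statuses arrive
as single-key objects like `{ RunnerFailed: { errorMessage } }` (helper names
here are hypothetical; the real code is in the dashboard diff further down):

```ts
// Sketch only: pull errorMessage out of a RunnerFailed status.
// Assumes a tagged value is a single-key object { kind: payload }.
type Tagged = Record<string, unknown>;

function getTaggedPair(value: Tagged): [string | null, unknown] {
  const kind = Object.keys(value)[0] ?? null;
  return [kind, kind !== null ? value[kind] : null];
}

function runnerErrorMessage(status: Tagged): string | null {
  const [kind, payload] = getTaggedPair(status);
  if (kind !== "RunnerFailed" || !payload || typeof payload !== "object") {
    return null;
  }
  // errorMessage may be absent; fall back to null rather than an empty string.
  return ((payload as Record<string, unknown>).errorMessage as string) ?? null;
}
```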

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 09:59:06 -08:00
Alex Cheema
d6301ed593 dashboard: redesign downloads page as model×node table (#1465)
## Motivation

The current downloads page uses a node-centric card grid layout that is
messy and hard to read — the same model across different nodes appears
in separate cards, and deep nesting wastes space. This makes it
difficult to quickly see which models are on which nodes.

## Changes

Rewrote the downloads page
(`dashboard/src/routes/downloads/+page.svelte`) from a card grid to a
clean table layout (a rough data-shape sketch follows the list below):

- **Rows** = models (unique across all nodes)
- **Columns** = nodes (with disk free shown in header)
- **Cells** show status at a glance:
  - ✅ Green checkmark + size for completed downloads
  - 🟡 Yellow percentage + mini progress bar + speed for active downloads
  - `...` for pending downloads
  - ❌ Red X for failed downloads
  - `--` for models not present on a node
- Delete/download action buttons appear on row hover
- Model name column is sticky on horizontal scroll (for many-node
clusters)
- Models sorted by number of nodes with completed downloads
- Imported shared utilities from `$lib/utils/downloads` instead of
inline re-implementations
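
The resulting table data can be pictured as one row per model with a cell per
node, roughly like the sketch below (types and helper names are hypothetical,
not the actual `+page.svelte` code):

```ts
// Hypothetical shape for the model×node table described above:
// rows are models, columns are nodes, each cell is one of the listed states.
type CellStatus =
  | { kind: "completed"; sizeBytes: number }
  | { kind: "downloading"; percent: number; bytesPerSecond: number }
  | { kind: "pending" }
  | { kind: "failed"; errorMessage: string | null }
  | { kind: "absent" };

interface TableRow {
  modelId: string;
  cells: Record<string, CellStatus>; // keyed by node id (one column per node)
}

// Models with the most completed downloads float to the top, matching the
// "sorted by number of nodes with completed downloads" bullet above.
function sortRows(rows: TableRow[]): TableRow[] {
  const completedCount = (row: TableRow) =>
    Object.values(row.cells).filter((c) => c.kind === "completed").length;
  return [...rows].sort((a, b) => completedCount(b) - completedCount(a));
}
```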

### Backend: model directory in download events

- Added `model_directory` field to `BaseDownloadProgress` so all
download status events include the on-disk path
- Added `_model_dir()` helper to `DownloadCoordinator` to compute the
path from `EXO_MODELS_DIR`
- Dashboard uses this to show file location and enable "open in Finder"
for completed downloads
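
On the dashboard side, reading the new field might look like the sketch below;
it assumes camelCase serialization (so `model_directory` arrives as
`modelDirectory`), and the type and function names are hypothetical:

```ts
// Sketch: surface the on-disk path from a download progress event.
// Assumes camelCase keys on the wire; an empty string means no path reported.
interface DownloadProgressEvent {
  nodeId: string;
  modelDirectory?: string;
}

function displayPath(event: DownloadProgressEvent): string | null {
  const dir = event.modelDirectory?.trim();
  return dir ? dir : null; // hide the path when the backend did not report one
}
```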

### Info modal

- Clicking a model name opens an info modal showing card details
(family, quantization, capabilities, storage size, layer count, tensor
parallelism support)
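
The card details listed above map onto a small record type, sketched here with
hypothetical field names:

```ts
// Hypothetical shape of the details shown in the info modal.
interface ModelCardInfo {
  family: string;
  quantization: string;
  capabilities: string[];
  storageSizeBytes: number;
  layerCount: number;
  supportsTensorParallelism: boolean;
}
```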

### Other fixes

- Fixed model name truncation in the table
- Excluded `tests/start_distributed_test.py` from pytest collection (CLI
script that calls `sys.exit()` at import time)

## Test Plan

- [x] `uv run basedpyright` — 0 errors
- [x] `uv run ruff check` — all passed
- [x] `nix fmt` — clean
- [x] `uv run pytest` — 188 passed, 1 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:31:47 +00:00
Evan Quiney
6d1ca6689b don't time out node identities (#1493)
Currently, nodes that leave and rejoin the cluster can lose their identity. We have no need to delete this data when a node times out, so let's just persist it.
2026-02-17 11:48:28 +00:00
6 changed files with 664 additions and 503 deletions

View File

@@ -1084,7 +1084,7 @@
   return {
     isDownloading: false,
     isFailed: statusInfo.statusText === "FAILED",
-    errorMessage: null,
+    errorMessage: statusInfo.errorMessage,
     progress: null,
     statusText: statusInfo.statusText,
     perNode: [],
@@ -1142,10 +1142,15 @@
 function deriveInstanceStatus(instanceWrapped: unknown): {
   statusText: string;
   statusClass: string;
+  errorMessage: string | null;
 } {
   const [, instance] = getTagged(instanceWrapped);
   if (!instance || typeof instance !== "object") {
-    return { statusText: "PREPARING", statusClass: "inactive" };
+    return {
+      statusText: "PREPARING",
+      statusClass: "inactive",
+      errorMessage: null,
+    };
   }
   const inst = instance as {
@@ -1153,50 +1158,106 @@
   };
   const runnerIds = Object.keys(inst.shardAssignments?.runnerToShard || {});
+  const statusMap: Record<string, string> = {
+    RunnerWaitingForInitialization: "WaitingForInitialization",
+    RunnerInitializingBackend: "InitializingBackend",
+    RunnerWaitingForModel: "WaitingForModel",
+    RunnerLoading: "Loading",
+    RunnerLoaded: "Loaded",
+    RunnerWarmingUp: "WarmingUp",
+    RunnerReady: "Ready",
+    RunnerRunning: "Running",
+    RunnerShutdown: "Shutdown",
+    RunnerFailed: "Failed",
+  };
   const statuses = runnerIds
     .map((rid) => {
       const r = runnersData[rid];
       if (!r) return null;
-      const [kind] = getTagged(r);
-      const statusMap: Record<string, string> = {
-        RunnerWaitingForInitialization: "WaitingForInitialization",
-        RunnerInitializingBackend: "InitializingBackend",
-        RunnerWaitingForModel: "WaitingForModel",
-        RunnerLoading: "Loading",
-        RunnerLoaded: "Loaded",
-        RunnerWarmingUp: "WarmingUp",
-        RunnerReady: "Ready",
-        RunnerRunning: "Running",
-        RunnerShutdown: "Shutdown",
-        RunnerFailed: "Failed",
-      };
-      return kind ? statusMap[kind] || null : null;
+      const [kind, payload] = getTagged(r);
+      if (!kind || !statusMap[kind]) return null;
+      const errorMessage =
+        kind === "RunnerFailed" && payload && typeof payload === "object"
+          ? (((payload as Record<string, unknown>).errorMessage as string) ??
+            null)
+          : null;
+      return { status: statusMap[kind], errorMessage };
     })
-    .filter((s): s is string => s !== null);
+    .filter(
+      (s): s is { status: string; errorMessage: string | null } => s !== null,
+    );
-  const has = (s: string) => statuses.includes(s);
+  const has = (s: string) => statuses.some((e) => e.status === s);
   if (statuses.length === 0)
-    return { statusText: "PREPARING", statusClass: "inactive" };
-  if (has("Failed")) return { statusText: "FAILED", statusClass: "failed" };
+    return {
+      statusText: "PREPARING",
+      statusClass: "inactive",
+      errorMessage: null,
+    };
+  if (has("Failed")) {
+    const failedEntry = statuses.find(
+      (e) => e.status === "Failed" && e.errorMessage,
+    );
+    return {
+      statusText: "FAILED",
+      statusClass: "failed",
+      errorMessage: failedEntry?.errorMessage ?? null,
+    };
+  }
   if (has("Shutdown"))
-    return { statusText: "SHUTDOWN", statusClass: "inactive" };
+    return {
+      statusText: "SHUTDOWN",
+      statusClass: "inactive",
+      errorMessage: null,
+    };
   if (has("Loading"))
-    return { statusText: "LOADING", statusClass: "starting" };
+    return {
+      statusText: "LOADING",
+      statusClass: "starting",
+      errorMessage: null,
+    };
   if (has("WarmingUp"))
-    return { statusText: "WARMING UP", statusClass: "starting" };
+    return {
+      statusText: "WARMING UP",
+      statusClass: "starting",
+      errorMessage: null,
+    };
   if (has("Running"))
-    return { statusText: "RUNNING", statusClass: "running" };
-  if (has("Ready")) return { statusText: "READY", statusClass: "loaded" };
-  if (has("Loaded")) return { statusText: "LOADED", statusClass: "loaded" };
+    return {
+      statusText: "RUNNING",
+      statusClass: "running",
+      errorMessage: null,
+    };
+  if (has("Ready"))
+    return { statusText: "READY", statusClass: "loaded", errorMessage: null };
+  if (has("Loaded"))
+    return {
+      statusText: "LOADED",
+      statusClass: "loaded",
+      errorMessage: null,
+    };
   if (has("WaitingForModel"))
-    return { statusText: "WAITING", statusClass: "starting" };
+    return {
+      statusText: "WAITING",
+      statusClass: "starting",
+      errorMessage: null,
+    };
   if (has("InitializingBackend"))
-    return { statusText: "INITIALIZING", statusClass: "starting" };
+    return {
+      statusText: "INITIALIZING",
+      statusClass: "starting",
+      errorMessage: null,
+    };
   if (has("WaitingForInitialization"))
-    return { statusText: "INITIALIZING", statusClass: "starting" };
+    return {
+      statusText: "INITIALIZING",
+      statusClass: "starting",
+      errorMessage: null,
+    };
-  return { statusText: "RUNNING", statusClass: "active" };
+  return { statusText: "RUNNING", statusClass: "active", errorMessage: null };
 }
 function getBytes(value: unknown): number {

View File

File diff suppressed because it is too large

View File

@@ -132,7 +132,7 @@ markers = [
 env = [
   "EXO_TESTS=1"
 ]
-addopts = "-m 'not slow'"
+addopts = "-m 'not slow' --ignore=tests/start_distributed_test.py"
 filterwarnings = [
   "ignore:builtin type Swig:DeprecationWarning",
 ]

View File

@@ -14,6 +14,7 @@ from exo.download.download_utils import (
     map_repo_download_progress_to_download_progress_data,
 )
 from exo.download.shard_downloader import ShardDownloader
+from exo.shared.constants import EXO_MODELS_DIR
 from exo.shared.models.model_cards import ModelId
 from exo.shared.types.commands import (
     CancelDownload,
@@ -63,6 +64,9 @@ class DownloadCoordinator:
         self.event_sender, self.event_receiver = channel[Event]()
         self.shard_downloader.on_progress(self._download_progress_callback)
+    def _model_dir(self, model_id: ModelId) -> str:
+        return str(EXO_MODELS_DIR / model_id.normalize())
     async def _download_progress_callback(
         self, callback_shard: ShardMetadata, progress: RepoDownloadProgress
     ) -> None:
@@ -74,6 +78,7 @@ class DownloadCoordinator:
                 shard_metadata=callback_shard,
                 node_id=self.node_id,
                 total_bytes=progress.total_bytes,
+                model_directory=self._model_dir(model_id),
             )
             self.download_status[model_id] = completed
             await self.event_sender.send(
@@ -93,6 +98,7 @@ class DownloadCoordinator:
                 download_progress=map_repo_download_progress_to_download_progress_data(
                     progress
                 ),
+                model_directory=self._model_dir(model_id),
             )
             self.download_status[model_id] = ongoing
             await self.event_sender.send(
@@ -170,7 +176,11 @@ class DownloadCoordinator:
             return
         # Emit pending status
-        progress = DownloadPending(shard_metadata=shard, node_id=self.node_id)
+        progress = DownloadPending(
+            shard_metadata=shard,
+            node_id=self.node_id,
+            model_directory=self._model_dir(model_id),
+        )
         self.download_status[model_id] = progress
         await self.event_sender.send(NodeDownloadProgress(download_progress=progress))
@@ -184,6 +194,7 @@ class DownloadCoordinator:
                 shard_metadata=shard,
                 node_id=self.node_id,
                 total_bytes=initial_progress.total_bytes,
+                model_directory=self._model_dir(model_id),
             )
             self.download_status[model_id] = completed
             await self.event_sender.send(
@@ -206,6 +217,7 @@ class DownloadCoordinator:
                 download_progress=map_repo_download_progress_to_download_progress_data(
                     initial_progress
                 ),
+                model_directory=self._model_dir(model_id),
             )
             self.download_status[model_id] = status
             self.event_sender.send_nowait(NodeDownloadProgress(download_progress=status))
@@ -219,6 +231,7 @@ class DownloadCoordinator:
                 shard_metadata=shard,
                 node_id=self.node_id,
                 error_message=str(e),
+                model_directory=self._model_dir(model_id),
             )
             self.download_status[model_id] = failed
             await self.event_sender.send(
@@ -253,6 +266,7 @@ class DownloadCoordinator:
             pending = DownloadPending(
                 shard_metadata=current_status.shard_metadata,
                 node_id=self.node_id,
+                model_directory=self._model_dir(model_id),
             )
             await self.event_sender.send(
                 NodeDownloadProgress(download_progress=pending)
@@ -295,11 +309,18 @@ class DownloadCoordinator:
                     node_id=self.node_id,
                     shard_metadata=progress.shard,
                     total_bytes=progress.total_bytes,
+                    model_directory=self._model_dir(
+                        progress.shard.model_card.model_id
+                    ),
                 )
             elif progress.status in ["in_progress", "not_started"]:
                 if progress.downloaded_bytes_this_session.in_bytes == 0:
                     status = DownloadPending(
-                        node_id=self.node_id, shard_metadata=progress.shard
+                        node_id=self.node_id,
+                        shard_metadata=progress.shard,
+                        model_directory=self._model_dir(
+                            progress.shard.model_card.model_id
+                        ),
                     )
                 else:
                     status = DownloadOngoing(
@@ -308,6 +329,9 @@ class DownloadCoordinator:
                         download_progress=map_repo_download_progress_to_download_progress_data(
                             progress
                         ),
+                        model_directory=self._model_dir(
+                            progress.shard.model_card.model_id
+                        ),
                     )
                 else:
                     continue

View File

@@ -218,11 +218,6 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
         key: value for key, value in state.downloads.items() if key != event.node_id
     }
     # Clean up all granular node mappings
-    node_identities = {
-        key: value
-        for key, value in state.node_identities.items()
-        if key != event.node_id
-    }
     node_memory = {
         key: value for key, value in state.node_memory.items() if key != event.node_id
     }
@@ -263,7 +258,6 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
"downloads": downloads,
"topology": topology,
"last_seen": last_seen,
"node_identities": node_identities,
"node_memory": node_memory,
"node_disk": node_disk,
"node_system": node_system,

View File

@@ -26,6 +26,7 @@ class DownloadProgressData(CamelCaseModel):
 class BaseDownloadProgress(TaggedModel):
     node_id: NodeId
     shard_metadata: ShardMetadata
+    model_directory: str = ""
 class DownloadPending(BaseDownloadProgress):