diff --git a/dashboard/src/routes/+page.svelte b/dashboard/src/routes/+page.svelte index bec828d97..9dfcda49f 100644 --- a/dashboard/src/routes/+page.svelte +++ b/dashboard/src/routes/+page.svelte @@ -1278,7 +1278,7 @@ typeof runnerData === "object" ) { const rd = runnerData as { errorMessage?: string }; - if (rd.errorMessage) failedErrors.push(rd.errorMessage); + if (rd.errorMessage) failedErrors.push(`${getNodeName(nodeId)}: ${rd.errorMessage}`); } } if (status) { diff --git a/src/exo/master/process_managers/instance_health.py b/src/exo/master/process_managers/instance_health.py index c07192c14..c054a0fcd 100644 --- a/src/exo/master/process_managers/instance_health.py +++ b/src/exo/master/process_managers/instance_health.py @@ -23,7 +23,7 @@ class InstanceHealthReconciler: continue is_failed, error_message = instance_runners_failed( - instance, state.runners + instance, state.runners, state.node_identities ) if is_failed: events.append( diff --git a/src/exo/master/reconcile.py b/src/exo/master/reconcile.py index ce0ae078b..e23625934 100644 --- a/src/exo/master/reconcile.py +++ b/src/exo/master/reconcile.py @@ -10,7 +10,7 @@ from exo.shared.types.commands import PlaceInstance from exo.shared.types.common import MetaInstanceId, NodeId from exo.shared.types.events import Event from exo.shared.types.meta_instance import MetaInstance -from exo.shared.types.profiling import MemoryUsage, NodeNetworkInfo +from exo.shared.types.profiling import MemoryUsage, NodeIdentity, NodeNetworkInfo from exo.shared.types.topology import RDMAConnection, SocketConnection from exo.shared.types.worker.instances import ( BaseInstance, @@ -100,6 +100,7 @@ def instance_connections_healthy(instance: Instance, topology: Topology) -> bool def instance_runners_failed( instance: Instance, runners: Mapping[RunnerId, RunnerStatus], + node_identities: Mapping[NodeId, NodeIdentity], ) -> tuple[bool, str | None]: """Check if an instance's runners have all reached terminal failure states. @@ -114,6 +115,12 @@ def instance_runners_failed( if not instance_runner_ids: return False, None + # Build reverse mapping: runner_id -> node_id + runner_to_node: dict[RunnerId, NodeId] = { + runner_id: node_id + for node_id, runner_id in instance.shard_assignments.node_to_runner.items() + } + has_any_failed = False error_messages: list[str] = [] @@ -125,7 +132,9 @@ def instance_runners_failed( if isinstance(status, RunnerFailed): has_any_failed = True if status.error_message: - error_messages.append(status.error_message) + node_id = runner_to_node.get(runner_id) + name = node_identities[node_id].friendly_name if node_id and node_id in node_identities else node_id or "unknown" + error_messages.append(f"{name}: {status.error_message}") elif isinstance(status, RunnerShutdown): pass # Terminal but not a failure indicator on its own else: diff --git a/src/exo/master/tests/test_reconcile.py b/src/exo/master/tests/test_reconcile.py index 61a7540b4..1ee2e357f 100644 --- a/src/exo/master/tests/test_reconcile.py +++ b/src/exo/master/tests/test_reconcile.py @@ -472,7 +472,7 @@ def test_runners_failed_all_failed(): rid: RunnerFailed(error_message="OOM") for rid in inst.shard_assignments.node_to_runner.values() } - is_failed, error = instance_runners_failed(inst, runners) + is_failed, error = instance_runners_failed(inst, runners, {}) assert is_failed is True assert error is not None assert "OOM" in error @@ -486,9 +486,10 @@ def test_runners_failed_mixed_failed_shutdown(): runner_ids[0]: RunnerFailed(error_message="crash"), runner_ids[1]: RunnerShutdown(), } - is_failed, error = instance_runners_failed(inst, runners) + is_failed, error = instance_runners_failed(inst, runners, {}) assert is_failed is True - assert error == "crash" + assert error is not None + assert "crash" in error def test_runners_not_failed_all_shutdown(): @@ -498,7 +499,7 @@ def test_runners_not_failed_all_shutdown(): rid: RunnerShutdown() for rid in inst.shard_assignments.node_to_runner.values() } - is_failed, _ = instance_runners_failed(inst, runners) + is_failed, _ = instance_runners_failed(inst, runners, {}) assert is_failed is False @@ -510,14 +511,14 @@ def test_runners_not_failed_still_active(): runner_ids[0]: RunnerFailed(error_message="OOM"), runner_ids[1]: RunnerLoading(), } - is_failed, _ = instance_runners_failed(inst, runners) + is_failed, _ = instance_runners_failed(inst, runners, {}) assert is_failed is False def test_runners_not_failed_no_status(): """Runner not yet reported = not failed.""" _, inst = _instance(node_ids=["node-a"]) - is_failed, _ = instance_runners_failed(inst, {}) + is_failed, _ = instance_runners_failed(inst, {}, {}) assert is_failed is False @@ -528,7 +529,7 @@ def test_runners_not_failed_healthy(): rid: RunnerReady() for rid in inst.shard_assignments.node_to_runner.values() } - is_failed, _ = instance_runners_failed(inst, runners) + is_failed, _ = instance_runners_failed(inst, runners, {}) assert is_failed is False