Include node friendly names in runner error messages

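Each error in the combined message is now prefixed with the friendly name of
the node that reported it (e.g. "MacBook Pro: OOM; Mac Studio: connection
reset"), so the root-cause node is easy to identify. In the Python health
check, a node with no known identity falls back to its node ID, or "unknown".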

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Cheema
2026-02-11 12:41:10 -08:00
parent a896ecca84
commit 89f0a4a69d
4 changed files with 21 additions and 11 deletions

View File

@@ -1278,7 +1278,7 @@
typeof runnerData === "object"
) {
const rd = runnerData as { errorMessage?: string };
if (rd.errorMessage) failedErrors.push(rd.errorMessage);
if (rd.errorMessage) failedErrors.push(`${getNodeName(nodeId)}: ${rd.errorMessage}`);
}
}
if (status) {

View File

@@ -23,7 +23,7 @@ class InstanceHealthReconciler:
continue
is_failed, error_message = instance_runners_failed(
instance, state.runners
instance, state.runners, state.node_identities
)
if is_failed:
events.append(

View File

@@ -10,7 +10,7 @@ from exo.shared.types.commands import PlaceInstance
from exo.shared.types.common import MetaInstanceId, NodeId
from exo.shared.types.events import Event
from exo.shared.types.meta_instance import MetaInstance
from exo.shared.types.profiling import MemoryUsage, NodeNetworkInfo
from exo.shared.types.profiling import MemoryUsage, NodeIdentity, NodeNetworkInfo
from exo.shared.types.topology import RDMAConnection, SocketConnection
from exo.shared.types.worker.instances import (
BaseInstance,
@@ -100,6 +100,7 @@ def instance_connections_healthy(instance: Instance, topology: Topology) -> bool
def instance_runners_failed(
instance: Instance,
runners: Mapping[RunnerId, RunnerStatus],
node_identities: Mapping[NodeId, NodeIdentity],
) -> tuple[bool, str | None]:
"""Check if an instance's runners have all reached terminal failure states.
@@ -114,6 +115,12 @@ def instance_runners_failed(
if not instance_runner_ids:
return False, None
# Build reverse mapping: runner_id -> node_id
runner_to_node: dict[RunnerId, NodeId] = {
runner_id: node_id
for node_id, runner_id in instance.shard_assignments.node_to_runner.items()
}
has_any_failed = False
error_messages: list[str] = []
@@ -125,7 +132,9 @@ def instance_runners_failed(
if isinstance(status, RunnerFailed):
has_any_failed = True
if status.error_message:
error_messages.append(status.error_message)
node_id = runner_to_node.get(runner_id)
name = node_identities[node_id].friendly_name if node_id and node_id in node_identities else node_id or "unknown"
error_messages.append(f"{name}: {status.error_message}")
elif isinstance(status, RunnerShutdown):
pass # Terminal but not a failure indicator on its own
else:

View File

@@ -472,7 +472,7 @@ def test_runners_failed_all_failed():
rid: RunnerFailed(error_message="OOM")
for rid in inst.shard_assignments.node_to_runner.values()
}
is_failed, error = instance_runners_failed(inst, runners)
is_failed, error = instance_runners_failed(inst, runners, {})
assert is_failed is True
assert error is not None
assert "OOM" in error
@@ -486,9 +486,10 @@ def test_runners_failed_mixed_failed_shutdown():
runner_ids[0]: RunnerFailed(error_message="crash"),
runner_ids[1]: RunnerShutdown(),
}
is_failed, error = instance_runners_failed(inst, runners)
is_failed, error = instance_runners_failed(inst, runners, {})
assert is_failed is True
assert error == "crash"
assert error is not None
assert "crash" in error
def test_runners_not_failed_all_shutdown():
@@ -498,7 +499,7 @@ def test_runners_not_failed_all_shutdown():
rid: RunnerShutdown()
for rid in inst.shard_assignments.node_to_runner.values()
}
is_failed, _ = instance_runners_failed(inst, runners)
is_failed, _ = instance_runners_failed(inst, runners, {})
assert is_failed is False
@@ -510,14 +511,14 @@ def test_runners_not_failed_still_active():
runner_ids[0]: RunnerFailed(error_message="OOM"),
runner_ids[1]: RunnerLoading(),
}
is_failed, _ = instance_runners_failed(inst, runners)
is_failed, _ = instance_runners_failed(inst, runners, {})
assert is_failed is False
def test_runners_not_failed_no_status():
"""Runner not yet reported = not failed."""
_, inst = _instance(node_ids=["node-a"])
is_failed, _ = instance_runners_failed(inst, {})
is_failed, _ = instance_runners_failed(inst, {}, {})
assert is_failed is False
@@ -528,7 +529,7 @@ def test_runners_not_failed_healthy():
rid: RunnerReady()
for rid in inst.shard_assignments.node_to_runner.values()
}
is_failed, _ = instance_runners_failed(inst, runners)
is_failed, _ = instance_runners_failed(inst, runners, {})
assert is_failed is False