Compare commits

...

3 Commits

Author SHA1 Message Date
Alex Cheema
2e6ce5ba98 Exclude start_distributed_test.py from pytest collection
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 10:18:49 -08:00
Alex Cheema
a11a4178c8 Merge remote-tracking branch 'origin/main' into alexcheema/graceful-loading-shutdown
2026-02-13 10:17:59 -08:00
Alex Cheema
9986d34212 fix: forcefully terminate runner on Shutdown timeout
When an instance is deleted while the runner is loading a model,
the runner can't process the Shutdown task because mx.eval() is
blocking. Previously the runner process was left running as an
orphan. Now we call supervisor.shutdown() to kill the process
and free all resources immediately.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 05:56:36 -08:00
2 changed files with 8 additions and 2 deletions

View File

@@ -132,7 +132,7 @@ markers = [
env = [
"EXO_TESTS=1"
]
- addopts = "-m 'not slow'"
+ addopts = "-m 'not slow' --ignore=tests/start_distributed_test.py"
filterwarnings = [
"ignore:builtin type Swig:DeprecationWarning",
]

View File

@@ -224,10 +224,16 @@ class Worker:
)
)
case Shutdown(runner_id=runner_id):
+ supervisor = self.runners.pop(runner_id)
  try:
  with fail_after(3):
- await self.runners.pop(runner_id).start_task(task)
+ await supervisor.start_task(task)
  except TimeoutError:
+ logger.warning(
+ f"Runner {runner_id} did not respond to Shutdown within 3s, "
+ f"forcefully terminating process"
+ )
+ supervisor.shutdown()
await self.event_sender.send(
TaskStatusUpdated(
task_id=task.task_id, task_status=TaskStatus.TimedOut