Compare commits

...

2 Commits

Author SHA1 Message Date
Alex Cheema
e70c7e73d2 Exclude start_distributed_test.py from pytest collection
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 11:12:20 -08:00
Alex Cheema
dff835b154 fix: forcefully terminate runner on Shutdown timeout
When an instance is deleted while the runner is loading a model,
the runner can't process the Shutdown task because mx.eval() is
blocking. Previously the runner process was left running as an
orphan. Now we call supervisor.shutdown() to kill the process
and free all resources immediately.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 11:12:20 -08:00
2 changed files with 9 additions and 4 deletions

View File

@@ -132,7 +132,7 @@ markers = [
env = [
"EXO_TESTS=1"
]
addopts = "-m 'not slow'"
addopts = "-m 'not slow' --ignore=tests/start_distributed_test.py"
filterwarnings = [
"ignore:builtin type Swig:DeprecationWarning",
]

View File

@@ -225,18 +225,23 @@ class Worker:
)
)
case Shutdown(runner_id=runner_id):
runner = self.runners.pop(runner_id)
supervisor = self.runners.pop(runner_id)
try:
with fail_after(3):
await runner.start_task(task)
await supervisor.start_task(task)
except TimeoutError:
logger.warning(
f"Runner {runner_id} did not respond to Shutdown within 3s, "
f"forcefully terminating process"
)
supervisor.shutdown()
await self.event_sender.send(
TaskStatusUpdated(
task_id=task.task_id, task_status=TaskStatus.TimedOut
)
)
finally:
runner.shutdown()
supervisor.shutdown()
case CancelTask(
cancelled_task_id=cancelled_task_id, runner_id=runner_id
):