Compare commits

...

5 Commits

Author SHA1 Message Date
Alex Cheema
92f9cd4820 Merge branch 'main' into alexcheema/no-silent-instance-override 2026-02-23 10:46:18 -08:00
Alex Cheema
c1ce541901 fix: remove silent instance_meta override for single-node placements
All combinations of sharding (Pipeline/Tensor) and instance type
(MlxRing/MlxJaccl) are equivalent and valid for n=1, so there is no
reason to override the user's selection. Removing the override also
fixes the preview type mismatch reported in #1426.

Supersedes #1509.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 10:45:34 -08:00
Alex Cheema
c90a0cec78 fix: suppress closure errors in runnersupervisor and force spawn start method (#1547)
some errors could be thrown during shutdown - we can dismiss these safely

co-authored by me :)
2026-02-23 18:30:41 +00:00
Alex Cheema
e8c1337168 fix: add download/resume buttons to pending downloads (#1581)
## Summary
- Adds a **resume button** (download icon) to paused pending downloads
(those with partial progress)
- Adds a **download button** to not-started pending downloads
- Both buttons call the existing `startDownload()` function which
handles both new downloads and resuming partial ones
- Previously, paused downloads only showed a "paused" label with no
action, and not-started downloads showed "..." with no way to trigger
them

## Test plan
- [ ] Build dashboard (`cd dashboard && npm run build`)
- [ ] Start exo, navigate to Downloads tab
- [ ] Verify paused downloads show a clickable resume (download arrow)
icon below the progress bar
- [ ] Verify not-started pending downloads show a clickable download
icon
- [ ] Click both button types and confirm downloads start/resume

> Note: Screenshot could not be captured because the dashboard requires
the exo backend API to render, and exo has a pre-existing
`Keypair.generate()` startup bug on main.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 18:20:35 +00:00
Alex Cheema
7024ddcf3e fix: detect completed downloads by checking final file exists (#1582)
## Summary

Split from #1547 per review feedback.

When scanning existing download status, `get_downloaded_size()` returns
bytes from either the final file or its `.partial` counterpart — so a
`.partial` file with all bytes downloaded (e.g. process killed before
hash verification and rename) could be falsely reported as complete.

The previous approach (commit 3b54e7df) added a byte-comparison fallback
in the coordinator (`downloaded >= total > 0`), but this suffered from
the same `.partial` conflation issue.

**Fix:** Check whether the final (non-`.partial`) file actually exists
on disk before marking status as `"complete"` in `download_utils.py`.
This is the only reliable signal that hash verification passed and the
rename from `.partial` succeeded. The coordinator-level byte comparison
is removed since the source now reports correctly.

### Changes
- `download_utils.py`: Add `final_file_exists` check — only report
`"complete"` when the renamed, hash-verified file exists with matching
size
- `coordinator.py`: Revert to simple `progress.status == "complete"`
check, removing the byte-comparison fallback

**Note:** The corresponding byte-comparison workaround in #1547 should
also be removed.

## Test plan
- [x] basedpyright — 0 errors
- [x] ruff check — passes
- [x] pytest — 218 passed, 1 skipped (1 pre-existing Rust bindings
failure)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 18:12:07 +00:00
4 changed files with 79 additions and 25 deletions

View File

@@ -469,11 +469,11 @@
<td class="px-4 py-3 text-center align-middle">
{#if cell.kind === "completed"}
<div
class="flex flex-col items-center gap-0.5"
class="flex flex-col items-center gap-1"
title="Completed ({formatBytes(cell.totalBytes)})"
>
<svg
class="w-5 h-5 text-green-400"
class="w-7 h-7 text-green-400"
viewBox="0 0 20 20"
fill="currentColor"
>
@@ -483,18 +483,18 @@
clip-rule="evenodd"
></path>
</svg>
<span class="text-[10px] text-exo-light-gray/70"
<span class="text-xs text-exo-light-gray/70"
>{formatBytes(cell.totalBytes)}</span
>
<button
type="button"
class="text-exo-light-gray/40 hover:text-red-400 transition-colors mt-0.5"
class="text-exo-light-gray/40 hover:text-red-400 transition-colors mt-0.5 cursor-pointer"
onclick={() =>
deleteDownload(col.nodeId, row.modelId)}
title="Delete from this node"
>
<svg
class="w-3.5 h-3.5"
class="w-5 h-5"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
@@ -517,11 +517,11 @@
cell.speed,
)} - ETA {formatEta(cell.etaMs)}"
>
<span class="text-exo-yellow text-xs font-medium"
<span class="text-exo-yellow text-sm font-medium"
>{clampPercent(cell.percentage).toFixed(1)}%</span
>
<div
class="w-14 h-1.5 bg-exo-black/60 rounded-sm overflow-hidden"
class="w-16 h-2 bg-exo-black/60 rounded-sm overflow-hidden"
>
<div
class="h-full bg-gradient-to-r from-exo-yellow to-exo-yellow/70 transition-all duration-300"
@@ -530,25 +530,25 @@
).toFixed(1)}%"
></div>
</div>
<span class="text-[9px] text-exo-light-gray/60"
<span class="text-[10px] text-exo-light-gray/60"
>{formatSpeed(cell.speed)}</span
>
</div>
{:else if cell.kind === "pending"}
<div
class="flex flex-col items-center gap-0.5"
class="flex flex-col items-center gap-1"
title={cell.downloaded > 0
? `${formatBytes(cell.downloaded)} / ${formatBytes(cell.total)} downloaded`
? `${formatBytes(cell.downloaded)} / ${formatBytes(cell.total)} downloaded (paused)`
: "Download pending"}
>
{#if cell.downloaded > 0 && cell.total > 0}
<span class="text-exo-light-gray/70 text-[10px]"
<span class="text-exo-light-gray/70 text-xs"
>{formatBytes(cell.downloaded)} / {formatBytes(
cell.total,
)}</span
>
<div
class="w-full h-1 bg-white/10 rounded-full overflow-hidden"
class="w-full h-1.5 bg-white/10 rounded-full overflow-hidden"
>
<div
class="h-full bg-exo-light-gray/40 rounded-full"
@@ -558,9 +558,55 @@
).toFixed(1)}%"
></div>
</div>
<span class="text-exo-light-gray/40 text-[9px]"
>paused</span
{#if row.shardMetadata}
<button
type="button"
class="text-exo-light-gray/50 hover:text-exo-yellow transition-colors cursor-pointer"
onclick={() =>
startDownload(col.nodeId, row.shardMetadata!)}
title="Resume download on this node"
>
<svg
class="w-5 h-5"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
stroke-width="2"
>
<path
d="M10 3v10m0 0l-3-3m3 3l3-3M3 17h14"
stroke-linecap="round"
stroke-linejoin="round"
></path>
</svg>
</button>
{:else}
<span class="text-exo-light-gray/40 text-[10px]"
>paused</span
>
{/if}
{:else if row.shardMetadata}
<button
type="button"
class="text-exo-light-gray/30 hover:text-exo-yellow transition-colors cursor-pointer"
onclick={() =>
startDownload(col.nodeId, row.shardMetadata!)}
title="Start download on this node"
>
<svg
class="w-6 h-6"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
stroke-width="2"
>
<path
d="M10 3v10m0 0l-3-3m3 3l3-3M3 17h14"
stroke-linecap="round"
stroke-linejoin="round"
></path>
</svg>
</button>
{:else}
<span class="text-exo-light-gray/50 text-sm">...</span
>
@@ -568,11 +614,11 @@
</div>
{:else if cell.kind === "failed"}
<div
class="flex flex-col items-center gap-0.5"
class="flex flex-col items-center gap-1"
title="Download failed"
>
<svg
class="w-5 h-5 text-red-400"
class="w-7 h-7 text-red-400"
viewBox="0 0 20 20"
fill="currentColor"
>
@@ -585,13 +631,13 @@
{#if row.shardMetadata}
<button
type="button"
class="text-exo-light-gray/40 hover:text-exo-yellow transition-colors"
class="text-exo-light-gray/40 hover:text-exo-yellow transition-colors cursor-pointer"
onclick={() =>
startDownload(col.nodeId, row.shardMetadata!)}
title="Retry download on this node"
>
<svg
class="w-3.5 h-3.5"
class="w-5 h-5"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
@@ -617,13 +663,13 @@
{#if row.shardMetadata}
<button
type="button"
class="text-exo-light-gray/30 hover:text-exo-yellow transition-colors mt-0.5 opacity-0 group-hover:opacity-100"
class="text-exo-light-gray/30 hover:text-exo-yellow transition-colors mt-0.5 opacity-0 group-hover:opacity-100 cursor-pointer"
onclick={() =>
startDownload(col.nodeId, row.shardMetadata!)}
title="Download to this node"
>
<svg
class="w-3.5 h-3.5"
class="w-5 h-5"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"

View File

@@ -823,6 +823,7 @@ async def download_shard(
for file in filtered_file_list:
downloaded_bytes = await get_downloaded_size(target_dir / file.path)
final_file_exists = await aios.path.exists(target_dir / file.path)
file_progress[file.path] = RepoFileDownloadProgress(
repo_id=shard.model_card.model_id,
repo_revision=revision,
@@ -832,7 +833,9 @@ async def download_shard(
total=Memory.from_bytes(file.size or 0),
speed=0,
eta=timedelta(0),
status="complete" if downloaded_bytes == file.size else "not_started",
status="complete"
if final_file_exists and downloaded_bytes == file.size
else "not_started",
start_time=time.time(),
)

View File

@@ -252,7 +252,7 @@ def main():
target = min(max(soft, 65535), hard)
resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
mp.set_start_method("spawn")
mp.set_start_method("spawn", force=True)
# TODO: Refactor the current verbosity system
logger_setup(EXO_LOG, args.verbosity)
logger.info("Starting EXO")

View File

@@ -106,13 +106,18 @@ class RunnerSupervisor:
def shutdown(self):
logger.info("Runner supervisor shutting down")
self._tg.cancel_tasks()
self._ev_recv.close()
self._task_sender.close()
if not self._cancel_watch_runner.cancel_called:
self._cancel_watch_runner.cancel()
with contextlib.suppress(ClosedResourceError):
self._ev_recv.close()
with contextlib.suppress(ClosedResourceError):
self._task_sender.close()
with contextlib.suppress(ClosedResourceError):
self._event_sender.close()
with contextlib.suppress(ClosedResourceError):
self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
self._cancel_sender.close()
with contextlib.suppress(ClosedResourceError):
self._cancel_sender.close()
self.runner_process.join(5)
if not self.runner_process.is_alive():
logger.info("Runner process succesfully terminated")