Compare commits

..

1 Commit

Author SHA1 Message Date
Alex Cheema
dc9000290b fix instance type mismatch in /instance/previews endpoint
Derive instance_meta from the actual instance type returned by
place_instance() instead of using the requested instance_meta from
the loop variable. place_instance() overrides single-node placements
to MlxRing, but the preview response was still reporting the original
requested type (e.g., MlxJaccl), causing a mismatch.

Closes #1426

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 18:04:38 +00:00
5 changed files with 42 additions and 83 deletions

View File

@@ -469,11 +469,11 @@
<td class="px-4 py-3 text-center align-middle">
{#if cell.kind === "completed"}
<div
class="flex flex-col items-center gap-1"
class="flex flex-col items-center gap-0.5"
title="Completed ({formatBytes(cell.totalBytes)})"
>
<svg
class="w-7 h-7 text-green-400"
class="w-5 h-5 text-green-400"
viewBox="0 0 20 20"
fill="currentColor"
>
@@ -483,18 +483,18 @@
clip-rule="evenodd"
></path>
</svg>
<span class="text-xs text-exo-light-gray/70"
<span class="text-[10px] text-exo-light-gray/70"
>{formatBytes(cell.totalBytes)}</span
>
<button
type="button"
class="text-exo-light-gray/40 hover:text-red-400 transition-colors mt-0.5 cursor-pointer"
class="text-exo-light-gray/40 hover:text-red-400 transition-colors mt-0.5"
onclick={() =>
deleteDownload(col.nodeId, row.modelId)}
title="Delete from this node"
>
<svg
class="w-5 h-5"
class="w-3.5 h-3.5"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
@@ -517,11 +517,11 @@
cell.speed,
)} - ETA {formatEta(cell.etaMs)}"
>
<span class="text-exo-yellow text-sm font-medium"
<span class="text-exo-yellow text-xs font-medium"
>{clampPercent(cell.percentage).toFixed(1)}%</span
>
<div
class="w-16 h-2 bg-exo-black/60 rounded-sm overflow-hidden"
class="w-14 h-1.5 bg-exo-black/60 rounded-sm overflow-hidden"
>
<div
class="h-full bg-gradient-to-r from-exo-yellow to-exo-yellow/70 transition-all duration-300"
@@ -530,25 +530,25 @@
).toFixed(1)}%"
></div>
</div>
<span class="text-[10px] text-exo-light-gray/60"
<span class="text-[9px] text-exo-light-gray/60"
>{formatSpeed(cell.speed)}</span
>
</div>
{:else if cell.kind === "pending"}
<div
class="flex flex-col items-center gap-1"
class="flex flex-col items-center gap-0.5"
title={cell.downloaded > 0
? `${formatBytes(cell.downloaded)} / ${formatBytes(cell.total)} downloaded (paused)`
? `${formatBytes(cell.downloaded)} / ${formatBytes(cell.total)} downloaded`
: "Download pending"}
>
{#if cell.downloaded > 0 && cell.total > 0}
<span class="text-exo-light-gray/70 text-xs"
<span class="text-exo-light-gray/70 text-[10px]"
>{formatBytes(cell.downloaded)} / {formatBytes(
cell.total,
)}</span
>
<div
class="w-full h-1.5 bg-white/10 rounded-full overflow-hidden"
class="w-full h-1 bg-white/10 rounded-full overflow-hidden"
>
<div
class="h-full bg-exo-light-gray/40 rounded-full"
@@ -558,55 +558,9 @@
).toFixed(1)}%"
></div>
</div>
{#if row.shardMetadata}
<button
type="button"
class="text-exo-light-gray/50 hover:text-exo-yellow transition-colors cursor-pointer"
onclick={() =>
startDownload(col.nodeId, row.shardMetadata!)}
title="Resume download on this node"
>
<svg
class="w-5 h-5"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
stroke-width="2"
>
<path
d="M10 3v10m0 0l-3-3m3 3l3-3M3 17h14"
stroke-linecap="round"
stroke-linejoin="round"
></path>
</svg>
</button>
{:else}
<span class="text-exo-light-gray/40 text-[10px]"
>paused</span
>
{/if}
{:else if row.shardMetadata}
<button
type="button"
class="text-exo-light-gray/30 hover:text-exo-yellow transition-colors cursor-pointer"
onclick={() =>
startDownload(col.nodeId, row.shardMetadata!)}
title="Start download on this node"
<span class="text-exo-light-gray/40 text-[9px]"
>paused</span
>
<svg
class="w-6 h-6"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
stroke-width="2"
>
<path
d="M10 3v10m0 0l-3-3m3 3l3-3M3 17h14"
stroke-linecap="round"
stroke-linejoin="round"
></path>
</svg>
</button>
{:else}
<span class="text-exo-light-gray/50 text-sm">...</span
>
@@ -614,11 +568,11 @@
</div>
{:else if cell.kind === "failed"}
<div
class="flex flex-col items-center gap-1"
class="flex flex-col items-center gap-0.5"
title="Download failed"
>
<svg
class="w-7 h-7 text-red-400"
class="w-5 h-5 text-red-400"
viewBox="0 0 20 20"
fill="currentColor"
>
@@ -631,13 +585,13 @@
{#if row.shardMetadata}
<button
type="button"
class="text-exo-light-gray/40 hover:text-exo-yellow transition-colors cursor-pointer"
class="text-exo-light-gray/40 hover:text-exo-yellow transition-colors"
onclick={() =>
startDownload(col.nodeId, row.shardMetadata!)}
title="Retry download on this node"
>
<svg
class="w-5 h-5"
class="w-3.5 h-3.5"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"
@@ -663,13 +617,13 @@
{#if row.shardMetadata}
<button
type="button"
class="text-exo-light-gray/30 hover:text-exo-yellow transition-colors mt-0.5 opacity-0 group-hover:opacity-100 cursor-pointer"
class="text-exo-light-gray/30 hover:text-exo-yellow transition-colors mt-0.5 opacity-0 group-hover:opacity-100"
onclick={() =>
startDownload(col.nodeId, row.shardMetadata!)}
title="Download to this node"
>
<svg
class="w-5 h-5"
class="w-3.5 h-3.5"
viewBox="0 0 20 20"
fill="none"
stroke="currentColor"

View File

@@ -823,7 +823,6 @@ async def download_shard(
for file in filtered_file_list:
downloaded_bytes = await get_downloaded_size(target_dir / file.path)
final_file_exists = await aios.path.exists(target_dir / file.path)
file_progress[file.path] = RepoFileDownloadProgress(
repo_id=shard.model_card.model_id,
repo_revision=revision,
@@ -833,9 +832,7 @@ async def download_shard(
total=Memory.from_bytes(file.size or 0),
speed=0,
eta=timedelta(0),
status="complete"
if final_file_exists and downloaded_bytes == file.size
else "not_started",
status="complete" if downloaded_bytes == file.size else "not_started",
start_time=time.time(),
)

View File

@@ -252,7 +252,7 @@ def main():
target = min(max(soft, 65535), hard)
resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
mp.set_start_method("spawn", force=True)
mp.set_start_method("spawn")
# TODO: Refactor the current verbosity system
logger_setup(EXO_LOG, args.verbosity)
logger.info("Starting EXO")

View File

@@ -168,7 +168,12 @@ from exo.shared.types.openai_responses import (
)
from exo.shared.types.state import State
from exo.shared.types.worker.downloads import DownloadCompleted
from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
from exo.shared.types.worker.instances import (
Instance,
InstanceId,
InstanceMeta,
MlxJacclInstance,
)
from exo.shared.types.worker.shards import Sharding
from exo.utils.banner import print_startup_banner
from exo.utils.channels import Receiver, Sender, channel
@@ -513,6 +518,14 @@ class API:
shard_assignments = instance.shard_assignments
placement_node_ids = list(shard_assignments.node_to_runner.keys())
# Derive instance_meta from the actual instance type, since
# place_instance() may override it (e.g., single-node → MlxRing)
actual_instance_meta = (
InstanceMeta.MlxJaccl
if isinstance(instance, MlxJacclInstance)
else InstanceMeta.MlxRing
)
memory_delta_by_node: dict[str, int] = {}
if placement_node_ids:
total_bytes = model_card.storage_size.in_bytes
@@ -525,14 +538,14 @@ class API:
if (
model_card.model_id,
sharding,
instance_meta,
actual_instance_meta,
len(placement_node_ids),
) not in seen:
previews.append(
PlacementPreview(
model_id=model_card.model_id,
sharding=sharding,
instance_meta=instance_meta,
instance_meta=actual_instance_meta,
instance=instance,
memory_delta_by_node=memory_delta_by_node or None,
error=None,
@@ -542,7 +555,7 @@ class API:
(
model_card.model_id,
sharding,
instance_meta,
actual_instance_meta,
len(placement_node_ids),
)
)

View File

@@ -106,18 +106,13 @@ class RunnerSupervisor:
def shutdown(self):
logger.info("Runner supervisor shutting down")
self._tg.cancel_tasks()
self._ev_recv.close()
self._task_sender.close()
if not self._cancel_watch_runner.cancel_called:
self._cancel_watch_runner.cancel()
with contextlib.suppress(ClosedResourceError):
self._ev_recv.close()
with contextlib.suppress(ClosedResourceError):
self._task_sender.close()
with contextlib.suppress(ClosedResourceError):
self._event_sender.close()
with contextlib.suppress(ClosedResourceError):
self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
with contextlib.suppress(ClosedResourceError):
self._cancel_sender.close()
self._cancel_sender.close()
self.runner_process.join(5)
if not self.runner_process.is_alive():
logger.info("Runner process succesfully terminated")