mirror of
https://github.com/exo-explore/exo.git
synced 2026-02-24 02:07:17 -05:00
Compare commits
5 Commits
fix-instan
...
alexcheema
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
92f9cd4820 | ||
|
|
c1ce541901 | ||
|
|
c90a0cec78 | ||
|
|
e8c1337168 | ||
|
|
7024ddcf3e |
@@ -469,11 +469,11 @@
|
||||
<td class="px-4 py-3 text-center align-middle">
|
||||
{#if cell.kind === "completed"}
|
||||
<div
|
||||
class="flex flex-col items-center gap-0.5"
|
||||
class="flex flex-col items-center gap-1"
|
||||
title="Completed ({formatBytes(cell.totalBytes)})"
|
||||
>
|
||||
<svg
|
||||
class="w-5 h-5 text-green-400"
|
||||
class="w-7 h-7 text-green-400"
|
||||
viewBox="0 0 20 20"
|
||||
fill="currentColor"
|
||||
>
|
||||
@@ -483,18 +483,18 @@
|
||||
clip-rule="evenodd"
|
||||
></path>
|
||||
</svg>
|
||||
<span class="text-[10px] text-exo-light-gray/70"
|
||||
<span class="text-xs text-exo-light-gray/70"
|
||||
>{formatBytes(cell.totalBytes)}</span
|
||||
>
|
||||
<button
|
||||
type="button"
|
||||
class="text-exo-light-gray/40 hover:text-red-400 transition-colors mt-0.5"
|
||||
class="text-exo-light-gray/40 hover:text-red-400 transition-colors mt-0.5 cursor-pointer"
|
||||
onclick={() =>
|
||||
deleteDownload(col.nodeId, row.modelId)}
|
||||
title="Delete from this node"
|
||||
>
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
class="w-5 h-5"
|
||||
viewBox="0 0 20 20"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
@@ -517,11 +517,11 @@
|
||||
cell.speed,
|
||||
)} - ETA {formatEta(cell.etaMs)}"
|
||||
>
|
||||
<span class="text-exo-yellow text-xs font-medium"
|
||||
<span class="text-exo-yellow text-sm font-medium"
|
||||
>{clampPercent(cell.percentage).toFixed(1)}%</span
|
||||
>
|
||||
<div
|
||||
class="w-14 h-1.5 bg-exo-black/60 rounded-sm overflow-hidden"
|
||||
class="w-16 h-2 bg-exo-black/60 rounded-sm overflow-hidden"
|
||||
>
|
||||
<div
|
||||
class="h-full bg-gradient-to-r from-exo-yellow to-exo-yellow/70 transition-all duration-300"
|
||||
@@ -530,25 +530,25 @@
|
||||
).toFixed(1)}%"
|
||||
></div>
|
||||
</div>
|
||||
<span class="text-[9px] text-exo-light-gray/60"
|
||||
<span class="text-[10px] text-exo-light-gray/60"
|
||||
>{formatSpeed(cell.speed)}</span
|
||||
>
|
||||
</div>
|
||||
{:else if cell.kind === "pending"}
|
||||
<div
|
||||
class="flex flex-col items-center gap-0.5"
|
||||
class="flex flex-col items-center gap-1"
|
||||
title={cell.downloaded > 0
|
||||
? `${formatBytes(cell.downloaded)} / ${formatBytes(cell.total)} downloaded`
|
||||
? `${formatBytes(cell.downloaded)} / ${formatBytes(cell.total)} downloaded (paused)`
|
||||
: "Download pending"}
|
||||
>
|
||||
{#if cell.downloaded > 0 && cell.total > 0}
|
||||
<span class="text-exo-light-gray/70 text-[10px]"
|
||||
<span class="text-exo-light-gray/70 text-xs"
|
||||
>{formatBytes(cell.downloaded)} / {formatBytes(
|
||||
cell.total,
|
||||
)}</span
|
||||
>
|
||||
<div
|
||||
class="w-full h-1 bg-white/10 rounded-full overflow-hidden"
|
||||
class="w-full h-1.5 bg-white/10 rounded-full overflow-hidden"
|
||||
>
|
||||
<div
|
||||
class="h-full bg-exo-light-gray/40 rounded-full"
|
||||
@@ -558,9 +558,55 @@
|
||||
).toFixed(1)}%"
|
||||
></div>
|
||||
</div>
|
||||
<span class="text-exo-light-gray/40 text-[9px]"
|
||||
>paused</span
|
||||
{#if row.shardMetadata}
|
||||
<button
|
||||
type="button"
|
||||
class="text-exo-light-gray/50 hover:text-exo-yellow transition-colors cursor-pointer"
|
||||
onclick={() =>
|
||||
startDownload(col.nodeId, row.shardMetadata!)}
|
||||
title="Resume download on this node"
|
||||
>
|
||||
<svg
|
||||
class="w-5 h-5"
|
||||
viewBox="0 0 20 20"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
>
|
||||
<path
|
||||
d="M10 3v10m0 0l-3-3m3 3l3-3M3 17h14"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
></path>
|
||||
</svg>
|
||||
</button>
|
||||
{:else}
|
||||
<span class="text-exo-light-gray/40 text-[10px]"
|
||||
>paused</span
|
||||
>
|
||||
{/if}
|
||||
{:else if row.shardMetadata}
|
||||
<button
|
||||
type="button"
|
||||
class="text-exo-light-gray/30 hover:text-exo-yellow transition-colors cursor-pointer"
|
||||
onclick={() =>
|
||||
startDownload(col.nodeId, row.shardMetadata!)}
|
||||
title="Start download on this node"
|
||||
>
|
||||
<svg
|
||||
class="w-6 h-6"
|
||||
viewBox="0 0 20 20"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
>
|
||||
<path
|
||||
d="M10 3v10m0 0l-3-3m3 3l3-3M3 17h14"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
></path>
|
||||
</svg>
|
||||
</button>
|
||||
{:else}
|
||||
<span class="text-exo-light-gray/50 text-sm">...</span
|
||||
>
|
||||
@@ -568,11 +614,11 @@
|
||||
</div>
|
||||
{:else if cell.kind === "failed"}
|
||||
<div
|
||||
class="flex flex-col items-center gap-0.5"
|
||||
class="flex flex-col items-center gap-1"
|
||||
title="Download failed"
|
||||
>
|
||||
<svg
|
||||
class="w-5 h-5 text-red-400"
|
||||
class="w-7 h-7 text-red-400"
|
||||
viewBox="0 0 20 20"
|
||||
fill="currentColor"
|
||||
>
|
||||
@@ -585,13 +631,13 @@
|
||||
{#if row.shardMetadata}
|
||||
<button
|
||||
type="button"
|
||||
class="text-exo-light-gray/40 hover:text-exo-yellow transition-colors"
|
||||
class="text-exo-light-gray/40 hover:text-exo-yellow transition-colors cursor-pointer"
|
||||
onclick={() =>
|
||||
startDownload(col.nodeId, row.shardMetadata!)}
|
||||
title="Retry download on this node"
|
||||
>
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
class="w-5 h-5"
|
||||
viewBox="0 0 20 20"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
@@ -617,13 +663,13 @@
|
||||
{#if row.shardMetadata}
|
||||
<button
|
||||
type="button"
|
||||
class="text-exo-light-gray/30 hover:text-exo-yellow transition-colors mt-0.5 opacity-0 group-hover:opacity-100"
|
||||
class="text-exo-light-gray/30 hover:text-exo-yellow transition-colors mt-0.5 opacity-0 group-hover:opacity-100 cursor-pointer"
|
||||
onclick={() =>
|
||||
startDownload(col.nodeId, row.shardMetadata!)}
|
||||
title="Download to this node"
|
||||
>
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
class="w-5 h-5"
|
||||
viewBox="0 0 20 20"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
|
||||
@@ -823,6 +823,7 @@ async def download_shard(
|
||||
|
||||
for file in filtered_file_list:
|
||||
downloaded_bytes = await get_downloaded_size(target_dir / file.path)
|
||||
final_file_exists = await aios.path.exists(target_dir / file.path)
|
||||
file_progress[file.path] = RepoFileDownloadProgress(
|
||||
repo_id=shard.model_card.model_id,
|
||||
repo_revision=revision,
|
||||
@@ -832,7 +833,9 @@ async def download_shard(
|
||||
total=Memory.from_bytes(file.size or 0),
|
||||
speed=0,
|
||||
eta=timedelta(0),
|
||||
status="complete" if downloaded_bytes == file.size else "not_started",
|
||||
status="complete"
|
||||
if final_file_exists and downloaded_bytes == file.size
|
||||
else "not_started",
|
||||
start_time=time.time(),
|
||||
)
|
||||
|
||||
|
||||
@@ -252,7 +252,7 @@ def main():
|
||||
target = min(max(soft, 65535), hard)
|
||||
resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
|
||||
|
||||
mp.set_start_method("spawn")
|
||||
mp.set_start_method("spawn", force=True)
|
||||
# TODO: Refactor the current verbosity system
|
||||
logger_setup(EXO_LOG, args.verbosity)
|
||||
logger.info("Starting EXO")
|
||||
|
||||
@@ -168,12 +168,7 @@ from exo.shared.types.openai_responses import (
|
||||
)
|
||||
from exo.shared.types.state import State
|
||||
from exo.shared.types.worker.downloads import DownloadCompleted
|
||||
from exo.shared.types.worker.instances import (
|
||||
Instance,
|
||||
InstanceId,
|
||||
InstanceMeta,
|
||||
MlxJacclInstance,
|
||||
)
|
||||
from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
|
||||
from exo.shared.types.worker.shards import Sharding
|
||||
from exo.utils.banner import print_startup_banner
|
||||
from exo.utils.channels import Receiver, Sender, channel
|
||||
@@ -518,14 +513,6 @@ class API:
|
||||
shard_assignments = instance.shard_assignments
|
||||
placement_node_ids = list(shard_assignments.node_to_runner.keys())
|
||||
|
||||
# Derive instance_meta from the actual instance type, since
|
||||
# place_instance() may override it (e.g., single-node → MlxRing)
|
||||
actual_instance_meta = (
|
||||
InstanceMeta.MlxJaccl
|
||||
if isinstance(instance, MlxJacclInstance)
|
||||
else InstanceMeta.MlxRing
|
||||
)
|
||||
|
||||
memory_delta_by_node: dict[str, int] = {}
|
||||
if placement_node_ids:
|
||||
total_bytes = model_card.storage_size.in_bytes
|
||||
@@ -538,14 +525,14 @@ class API:
|
||||
if (
|
||||
model_card.model_id,
|
||||
sharding,
|
||||
actual_instance_meta,
|
||||
instance_meta,
|
||||
len(placement_node_ids),
|
||||
) not in seen:
|
||||
previews.append(
|
||||
PlacementPreview(
|
||||
model_id=model_card.model_id,
|
||||
sharding=sharding,
|
||||
instance_meta=actual_instance_meta,
|
||||
instance_meta=instance_meta,
|
||||
instance=instance,
|
||||
memory_delta_by_node=memory_delta_by_node or None,
|
||||
error=None,
|
||||
@@ -555,7 +542,7 @@ class API:
|
||||
(
|
||||
model_card.model_id,
|
||||
sharding,
|
||||
actual_instance_meta,
|
||||
instance_meta,
|
||||
len(placement_node_ids),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -106,13 +106,18 @@ class RunnerSupervisor:
|
||||
def shutdown(self):
|
||||
logger.info("Runner supervisor shutting down")
|
||||
self._tg.cancel_tasks()
|
||||
self._ev_recv.close()
|
||||
self._task_sender.close()
|
||||
if not self._cancel_watch_runner.cancel_called:
|
||||
self._cancel_watch_runner.cancel()
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._ev_recv.close()
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._task_sender.close()
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._event_sender.close()
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
|
||||
self._cancel_sender.close()
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._cancel_sender.close()
|
||||
self.runner_process.join(5)
|
||||
if not self.runner_process.is_alive():
|
||||
logger.info("Runner process succesfully terminated")
|
||||
|
||||
Reference in New Issue
Block a user