mirror of
https://github.com/exo-explore/exo.git
synced 2026-02-23 17:58:36 -05:00
Compare commits
2 Commits
feat/mac-s
...
feat/meta-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
59b0deb4ab | ||
|
|
42e1e7322b |
@@ -20,6 +20,7 @@ from harness import (
|
||||
instance_id_from_instance,
|
||||
nodes_used_in_instance,
|
||||
resolve_model_short_id,
|
||||
run_planning_phase,
|
||||
settle_and_fetch_placements,
|
||||
wait_for_instance_gone,
|
||||
wait_for_instance_ready,
|
||||
@@ -962,6 +963,21 @@ Examples:
|
||||
|
||||
selected.sort(key=_placement_sort_key)
|
||||
preview = selected[0]
|
||||
|
||||
settle_deadline = (
|
||||
time.monotonic() + args.settle_timeout if args.settle_timeout > 0 else None
|
||||
)
|
||||
|
||||
print("Planning phase: checking downloads...", file=log)
|
||||
run_planning_phase(
|
||||
exo,
|
||||
full_model_id,
|
||||
preview,
|
||||
args.danger_delete_downloads,
|
||||
args.timeout,
|
||||
settle_deadline,
|
||||
)
|
||||
|
||||
instance = preview["instance"]
|
||||
instance_id = instance_id_from_instance(instance)
|
||||
sharding = str(preview["sharding"])
|
||||
|
||||
@@ -35,6 +35,7 @@ from harness import (
|
||||
instance_id_from_instance,
|
||||
nodes_used_in_instance,
|
||||
resolve_model_short_id,
|
||||
run_planning_phase,
|
||||
settle_and_fetch_placements,
|
||||
wait_for_instance_gone,
|
||||
wait_for_instance_ready,
|
||||
@@ -332,6 +333,20 @@ def main() -> int:
|
||||
if args.dry_run:
|
||||
return 0
|
||||
|
||||
settle_deadline = (
|
||||
time.monotonic() + args.settle_timeout if args.settle_timeout > 0 else None
|
||||
)
|
||||
|
||||
logger.info("Planning phase: checking downloads...")
|
||||
run_planning_phase(
|
||||
client,
|
||||
full_model_id,
|
||||
selected[0],
|
||||
args.danger_delete_downloads,
|
||||
args.timeout,
|
||||
settle_deadline,
|
||||
)
|
||||
|
||||
all_rows: list[dict[str, Any]] = []
|
||||
|
||||
for preview in selected:
|
||||
|
||||
150
bench/harness.py
150
bench/harness.py
@@ -282,6 +282,151 @@ def settle_and_fetch_placements(
|
||||
return selected
|
||||
|
||||
|
||||
def _download_model_id(entry: dict[str, Any], tag: str) -> Any:
    """Return the model id recorded in the shard metadata under ``entry[tag]``."""
    return unwrap_instance(entry[tag]["shardMetadata"])["modelCard"]["modelId"]


def _node_has_model(node_downloads: list[dict[str, Any]], full_model_id: str) -> bool:
    """Return True if any entry is a ``DownloadCompleted`` for *full_model_id*."""
    return any(
        "DownloadCompleted" in entry
        and _download_model_id(entry, "DownloadCompleted") == full_model_id
        for entry in node_downloads
    )


def run_planning_phase(
    client: ExoClient,
    full_model_id: str,
    preview: dict[str, Any],
    danger_delete: bool,
    timeout: float,
    settle_deadline: float | None,
) -> None:
    """Check disk space and ensure the model is downloaded before benchmarking.

    Steps, in order:

    1. Look up the model size via ``GET /models``.
    2. For every node in the preview's shard assignments that does not already
       have a completed download of the model, compare the node's available
       disk (from ``GET /state``) with the model size, optionally deleting
       completed downloads (smallest first) to make room.
    3. ``POST /download/start`` for every node.
    4. Poll ``GET /state`` once per second until every node reports a completed
       download of the model.

    Args:
        client: Client used for all HTTP requests.
        full_model_id: Model id matched against ``hugging_face_id`` in
            ``/models`` and against ``modelCard.modelId`` in download entries.
        preview: Placement preview; ``preview["instance"]`` must unwrap to a
            mapping with ``shardAssignments.nodeToRunner`` and
            ``shardAssignments.runnerToShard``.
        danger_delete: When True, delete completed downloads on a node that
            lacks space instead of raising.
        timeout: Maximum number of seconds to wait for downloads to complete.
        settle_deadline: ``time.monotonic()`` deadline for waiting on missing
            per-node disk info, or ``None`` to not wait at all.

    Raises:
        RuntimeError: A node lacks disk space (and ``danger_delete`` is off or
            deleting did not free enough), or a download of the model failed.
        TimeoutError: Downloads did not complete within ``timeout`` seconds.
    """
    # Get model size from /models
    models = client.request_json("GET", "/models") or {}
    model_bytes = 0
    for m in models.get("data", []):
        if m.get("hugging_face_id") == full_model_id:
            # ``or 0`` guards against an explicit null size in the response.
            model_bytes = (m.get("storage_size_megabytes") or 0) * 1024 * 1024
            break

    if not model_bytes:
        logger.warning(
            f"Could not determine size for {full_model_id}, skipping disk check"
        )
        # NOTE(review): this returns before downloads are started or awaited,
        # not just before the disk check - confirm that is intended.
        return

    # Get nodes from preview
    inner = unwrap_instance(preview["instance"])
    node_to_runner = inner["shardAssignments"]["nodeToRunner"]
    runner_to_shard = inner["shardAssignments"]["runnerToShard"]
    node_ids = list(node_to_runner)

    # ``or {}`` mirrors the /models call: tolerate an empty/None response.
    state = client.request_json("GET", "/state") or {}
    downloads = state.get("downloads", {})
    node_disk = state.get("nodeDisk", {})

    for node_id in node_ids:
        node_downloads = downloads.get(node_id, [])

        # Check if model already downloaded on this node
        if _node_has_model(node_downloads, full_model_id):
            continue

        # Wait for disk info if settle_deadline is set
        disk_info = node_disk.get(node_id, {})
        backoff = _SETTLE_INITIAL_BACKOFF_S
        while not disk_info and settle_deadline is not None:
            remaining = settle_deadline - time.monotonic()
            if remaining <= 0:
                # Deadline passed; never hand time.sleep() a negative value.
                break
            logger.info(
                f"Waiting for disk info on {node_id} ({remaining:.0f}s remaining)..."
            )
            time.sleep(min(backoff, remaining))
            backoff = min(backoff * _SETTLE_BACKOFF_MULTIPLIER, _SETTLE_MAX_BACKOFF_S)
            state = client.request_json("GET", "/state") or {}
            node_disk = state.get("nodeDisk", {})
            disk_info = node_disk.get(node_id, {})

        if not disk_info:
            logger.warning(f"No disk info for {node_id}, skipping space check")
            continue

        avail = disk_info.get("available", {}).get("inBytes", 0)
        if avail >= model_bytes:
            continue

        if not danger_delete:
            raise RuntimeError(
                f"Insufficient disk on {node_id}: need {model_bytes // (1024**3)}GB, "
                f"have {avail // (1024**3)}GB. Use --danger-delete-downloads to free space."
            )

        # Delete from smallest to largest
        completed = [
            (
                _download_model_id(p, "DownloadCompleted"),
                p["DownloadCompleted"]["totalBytes"]["inBytes"],
            )
            for p in node_downloads
            if "DownloadCompleted" in p
        ]
        for del_model, size in sorted(completed, key=lambda x: x[1]):
            logger.info(f"Deleting {del_model} from {node_id} ({size // (1024**2)}MB)")
            client.request_json("DELETE", f"/download/{node_id}/{del_model}")
            # Freed space is estimated from the recorded download size; the
            # node's disk info is not re-queried after each delete.
            avail += size
            if avail >= model_bytes:
                break

        if avail < model_bytes:
            raise RuntimeError(f"Could not free enough space on {node_id}")

    # Start downloads (idempotent)
    for node_id in node_ids:
        shard = runner_to_shard[node_to_runner[node_id]]
        client.request_json(
            "POST",
            "/download/start",
            body={
                "targetNodeId": node_id,
                "shardMetadata": shard,
            },
        )
        logger.info(f"Started download on {node_id}")

    # Wait for downloads. Use the monotonic clock (as settle_deadline does) so
    # wall-clock adjustments cannot shorten or extend the timeout.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        state = client.request_json("GET", "/state") or {}
        downloads = state.get("downloads", {})
        all_done = True
        for node_id in node_ids:
            node_downloads = downloads.get(node_id, [])
            done = _node_has_model(node_downloads, full_model_id)
            failed = [
                p["DownloadFailed"]["errorMessage"]
                for p in node_downloads
                if "DownloadFailed" in p
                and _download_model_id(p, "DownloadFailed") == full_model_id
            ]
            if failed:
                raise RuntimeError(f"Download failed on {node_id}: {failed[0]}")
            if not done:
                all_done = False
        if all_done:
            return
        time.sleep(1)

    raise TimeoutError("Downloads did not complete in time")
|
||||
|
||||
|
||||
def add_common_instance_args(ap: argparse.ArgumentParser) -> None:
|
||||
ap.add_argument("--host", default=os.environ.get("EXO_HOST", "localhost"))
|
||||
ap.add_argument(
|
||||
@@ -325,3 +470,8 @@ def add_common_instance_args(ap: argparse.ArgumentParser) -> None:
|
||||
default=0,
|
||||
help="Max seconds to wait for the cluster to produce valid placements (0 = try once).",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--danger-delete-downloads",
|
||||
action="store_true",
|
||||
help="Delete existing models from smallest to largest to make room for benchmark model.",
|
||||
)
|
||||
|
||||
@@ -185,11 +185,7 @@
|
||||
|
||||
let instanceType: string | null = null;
|
||||
if (instanceTag === "MlxRingInstance") instanceType = "MLX Ring";
|
||||
else if (
|
||||
instanceTag === "MlxIbvInstance" ||
|
||||
instanceTag === "MlxJacclInstance"
|
||||
)
|
||||
instanceType = "MLX RDMA";
|
||||
else if (instanceTag === "MlxJacclInstance") instanceType = "MLX RDMA";
|
||||
|
||||
let sharding: string | null = null;
|
||||
const inst = instance as {
|
||||
|
||||
256
dashboard/src/lib/components/MetaInstancePanel.svelte
Normal file
256
dashboard/src/lib/components/MetaInstancePanel.svelte
Normal file
@@ -0,0 +1,256 @@
|
||||
<script lang="ts">
|
||||
import type { MetaInstance } from "$lib/stores/app.svelte";
|
||||
|
||||
interface Props {
|
||||
metaInstances: Record<string, MetaInstance>;
|
||||
instances: Record<string, unknown>;
|
||||
onDelete?: (metaInstanceId: string) => void;
|
||||
onHoverNodes?: (nodeIds: Set<string>) => void;
|
||||
onHoverEnd?: () => void;
|
||||
}
|
||||
|
||||
let { metaInstances, instances, onDelete, onHoverNodes, onHoverEnd }: Props =
|
||||
$props();
|
||||
|
||||
/**
 * Unwrap a single-key tagged object such as `{ SomeTag: payload }`.
 * Returns `[tag, payload]` when `obj` is an object with exactly one key,
 * and `[null, null]` for non-objects or any other key count.
 */
function getTagged(obj: unknown): [string | null, unknown] {
  if (obj === null || typeof obj !== "object") return [null, null];
  const record = obj as Record<string, unknown>;
  const tags = Object.keys(record);
  if (tags.length !== 1) return [null, null];
  const [tag] = tags;
  return [tag, record[tag]];
}
|
||||
|
||||
interface LinkedInstance {
|
||||
instanceId: string;
|
||||
modelId: string;
|
||||
nodeIds: string[];
|
||||
}
|
||||
|
||||
/**
 * Find the first entry in `instances` whose unwrapped payload carries the
 * given `metaInstanceId`, and summarise it. Returns `null` when none match.
 * A missing `shardAssignments.modelId` is reported as "Unknown"; a missing
 * `nodeToRunner` yields an empty node list.
 */
function findLinkedInstance(metaInstanceId: string): LinkedInstance | null {
  type LinkablePayload = {
    metaInstanceId?: string;
    shardAssignments?: {
      modelId?: string;
      nodeToRunner?: Record<string, string>;
    };
  };

  for (const [instanceId, wrapped] of Object.entries(instances)) {
    const payload = getTagged(wrapped)[1];
    if (payload === null || typeof payload !== "object") continue;

    const candidate = payload as LinkablePayload;
    if (candidate.metaInstanceId !== metaInstanceId) continue;

    const assignments = candidate.shardAssignments;
    return {
      instanceId,
      modelId: assignments?.modelId || "Unknown",
      nodeIds: Object.keys(assignments?.nodeToRunner ?? {}),
    };
  }
  return null;
}
|
||||
|
||||
type MetaStatus = "active" | "provisioning" | "error" | "retrying";
|
||||
|
||||
/**
 * Derive the display status of a meta-instance.
 *
 * With no recorded placement/failure error: "active" if a linked instance
 * exists, otherwise "provisioning". With an error: "retrying" while
 * `consecutiveFailures` is 1 or 2, otherwise "error".
 * NOTE(review): the limit of 3 presumably mirrors the backend retry cap - confirm.
 */
function getStatus(
  meta: MetaInstance,
  linked: LinkedInstance | null,
): MetaStatus {
  const hasError = Boolean(meta.placementError || meta.lastFailureError);
  if (!hasError) {
    return linked ? "active" : "provisioning";
  }
  const failures = meta.consecutiveFailures;
  const stillRetrying = failures > 0 && failures < 3;
  return stillRetrying ? "retrying" : "error";
}
|
||||
|
||||
/** Return the upper-case label shown next to a meta-instance for `status`. */
function statusLabel(status: MetaStatus): string {
  const labels: Record<MetaStatus, string> = {
    active: "ACTIVE",
    provisioning: "PROVISIONING",
    error: "ERROR",
    retrying: "RETRYING",
  };
  return labels[status];
}
|
||||
|
||||
/** Return the CSS utility classes for the status dot of a given `status`. */
function statusDotClass(status: MetaStatus): string {
  const dotClasses: Record<MetaStatus, string> = {
    active: "bg-green-400 shadow-[0_0_6px_rgba(74,222,128,0.6)]",
    provisioning:
      "bg-yellow-400 animate-pulse shadow-[0_0_6px_rgba(250,204,21,0.6)]",
    error: "bg-red-400 shadow-[0_0_6px_rgba(248,113,113,0.6)]",
    retrying:
      "bg-orange-400 animate-pulse shadow-[0_0_6px_rgba(251,146,60,0.6)]",
  };
  return dotClasses[status];
}
|
||||
|
||||
/** Return the text-colour CSS class used for the status label of `status`. */
function statusTextClass(status: MetaStatus): string {
  const textClasses: Record<MetaStatus, string> = {
    active: "text-green-400",
    provisioning: "text-yellow-400",
    error: "text-red-400",
    retrying: "text-orange-400",
  };
  return textClasses[status];
}
|
||||
|
||||
/** Return the border CSS classes applied to a meta-instance card for `status`. */
function borderClass(status: MetaStatus): string {
  const borderClasses: Record<MetaStatus, string> = {
    active: "border-green-500/30 border-l-green-400",
    provisioning: "border-purple-500/30 border-l-purple-400",
    error: "border-red-500/30 border-l-red-400",
    retrying: "border-orange-500/30 border-l-orange-400",
  };
  return borderClasses[status];
}
|
||||
|
||||
/** Return the border-colour CSS class for the card's corner accents. */
function cornerClass(status: MetaStatus): string {
  const cornerClasses: Record<MetaStatus, string> = {
    active: "border-green-500/50",
    provisioning: "border-purple-500/50",
    error: "border-red-500/50",
    retrying: "border-orange-500/50",
  };
  return cornerClasses[status];
}
|
||||
|
||||
/**
 * Report the node ids to highlight while a card is hovered.
 * Uses the linked instance's nodes when it has any, otherwise the
 * meta-instance's own `nodeIds`; does nothing when neither has entries
 * or when no `onHoverNodes` callback was supplied.
 */
function handleHover(meta: MetaInstance, linked: LinkedInstance | null) {
  if (!onHoverNodes) return;

  if (linked && linked.nodeIds.length > 0) {
    onHoverNodes(new Set(linked.nodeIds));
    return;
  }

  const pinned = meta.nodeIds;
  if (pinned && pinned.length > 0) {
    onHoverNodes(new Set(pinned));
  }
}
|
||||
|
||||
/**
 * Return the text after the last "/" in `modelId`, or `modelId` itself
 * when that trailing segment is empty.
 */
function formatModelId(modelId: string): string {
  const segments = modelId.split("/");
  const lastSegment = segments[segments.length - 1];
  return lastSegment ? lastSegment : modelId;
}
|
||||
</script>
|
||||
|
||||
<!-- Panel Header -->
|
||||
<div class="flex items-center gap-2 mb-4">
|
||||
<div
|
||||
class="w-2 h-2 bg-purple-400 rounded-full shadow-[0_0_8px_rgba(168,85,247,0.6)] animate-pulse"
|
||||
></div>
|
||||
<h3 class="text-xs text-purple-400 font-mono tracking-[0.2em] uppercase">
|
||||
Meta-Instances
|
||||
</h3>
|
||||
<div
|
||||
class="flex-1 h-px bg-gradient-to-r from-purple-400/30 to-transparent"
|
||||
></div>
|
||||
</div>
|
||||
|
||||
<div
|
||||
class="space-y-3 max-h-72 xl:max-h-96 overflow-y-auto overflow-x-hidden py-px"
|
||||
>
|
||||
{#each Object.entries(metaInstances) as [id, meta]}
|
||||
{@const linked = findLinkedInstance(meta.metaInstanceId)}
|
||||
{@const status = getStatus(meta, linked)}
|
||||
{@const corners = cornerClass(status)}
|
||||
<div
|
||||
class="relative group cursor-default"
|
||||
role="group"
|
||||
onmouseenter={() => handleHover(meta, linked)}
|
||||
onmouseleave={() => onHoverEnd?.()}
|
||||
>
|
||||
<!-- Corner accents -->
|
||||
<div
|
||||
class="absolute -top-px -left-px w-2 h-2 border-l border-t {corners}"
|
||||
></div>
|
||||
<div
|
||||
class="absolute -top-px -right-px w-2 h-2 border-r border-t {corners}"
|
||||
></div>
|
||||
<div
|
||||
class="absolute -bottom-px -left-px w-2 h-2 border-l border-b {corners}"
|
||||
></div>
|
||||
<div
|
||||
class="absolute -bottom-px -right-px w-2 h-2 border-r border-b {corners}"
|
||||
></div>
|
||||
|
||||
<div
|
||||
class="bg-exo-dark-gray/60 border border-l-2 {borderClass(status)} p-3"
|
||||
>
|
||||
<div class="flex justify-between items-start mb-2 pl-2">
|
||||
<div class="flex items-center gap-2">
|
||||
<div
|
||||
class="w-1.5 h-1.5 {statusDotClass(status)} rounded-full"
|
||||
></div>
|
||||
<span class="text-exo-light-gray font-mono text-sm tracking-wider">
|
||||
{meta.metaInstanceId.slice(0, 8).toUpperCase()}
|
||||
</span>
|
||||
<span
|
||||
class="{statusTextClass(
|
||||
status,
|
||||
)} text-[10px] font-mono tracking-wider"
|
||||
>
|
||||
{statusLabel(status)}
|
||||
</span>
|
||||
</div>
|
||||
<button
|
||||
onclick={() => onDelete?.(meta.metaInstanceId)}
|
||||
class="text-xs px-2 py-1 font-mono tracking-wider uppercase border border-red-500/30 text-red-400 hover:bg-red-500/20 hover:text-red-400 hover:border-red-500/50 transition-all duration-200 cursor-pointer"
|
||||
>
|
||||
DELETE
|
||||
</button>
|
||||
</div>
|
||||
<div class="pl-2">
|
||||
<div class="text-exo-yellow text-xs font-mono tracking-wide truncate">
|
||||
{formatModelId(meta.modelId)}
|
||||
</div>
|
||||
<div class="text-white/60 text-xs font-mono">
|
||||
{meta.sharding} · {meta.instanceMeta} · min {meta.minNodes}
|
||||
node{meta.minNodes !== 1 ? "s" : ""}
|
||||
</div>
|
||||
{#if meta.nodeIds && meta.nodeIds.length > 0}
|
||||
<div class="text-white/50 text-[10px] font-mono mt-0.5">
|
||||
Pinned: {meta.nodeIds.map((n) => n.slice(0, 8)).join(", ")}
|
||||
</div>
|
||||
{/if}
|
||||
{#if meta.placementError}
|
||||
<div
|
||||
class="text-red-400/80 text-[10px] font-mono mt-1 truncate"
|
||||
title={meta.placementError}
|
||||
>
|
||||
{meta.placementError}
|
||||
</div>
|
||||
{/if}
|
||||
{#if meta.lastFailureError}
|
||||
<div
|
||||
class="text-orange-400/80 text-[10px] font-mono mt-0.5 truncate"
|
||||
title={meta.lastFailureError}
|
||||
>
|
||||
Failure: {meta.lastFailureError}
|
||||
</div>
|
||||
{/if}
|
||||
{#if meta.consecutiveFailures > 0}
|
||||
<div class="text-orange-400/60 text-[10px] font-mono mt-0.5">
|
||||
Retries: {meta.consecutiveFailures}/3
|
||||
</div>
|
||||
{/if}
|
||||
{#if linked}
|
||||
<div class="text-purple-400/60 text-[10px] font-mono mt-1">
|
||||
Instance: {linked.instanceId.slice(0, 8)} · {linked.nodeIds
|
||||
.length} node{linked.nodeIds.length !== 1 ? "s" : ""}
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{/each}
|
||||
</div>
|
||||
@@ -21,7 +21,7 @@
|
||||
} | null;
|
||||
nodes?: Record<string, NodeInfo>;
|
||||
sharding?: "Pipeline" | "Tensor";
|
||||
runtime?: "MlxRing" | "MlxIbv" | "MlxJaccl";
|
||||
runtime?: "MlxRing" | "MlxJaccl";
|
||||
onLaunch?: () => void;
|
||||
tags?: string[];
|
||||
apiPreview?: PlacementPreview | null;
|
||||
@@ -348,7 +348,7 @@
|
||||
// Debug mode state
|
||||
const isDebugMode = $derived(debugMode());
|
||||
const topology = $derived(topologyData());
|
||||
const isRdma = $derived(runtime === "MlxIbv" || runtime === "MlxJaccl");
|
||||
const isRdma = $derived(runtime === "MlxJaccl");
|
||||
|
||||
// Get interface name for an IP from node data
|
||||
function getInterfaceForIp(nodeId: string, ip?: string): string | null {
|
||||
@@ -575,7 +575,7 @@
|
||||
>
|
||||
{runtime === "MlxRing"
|
||||
? "MLX Ring"
|
||||
: runtime === "MlxIbv" || runtime === "MlxJaccl"
|
||||
: runtime === "MlxJaccl"
|
||||
? "MLX RDMA"
|
||||
: runtime}
|
||||
</span>
|
||||
|
||||
@@ -11,4 +11,5 @@ export { default as FamilySidebar } from "./FamilySidebar.svelte";
|
||||
export { default as HuggingFaceResultItem } from "./HuggingFaceResultItem.svelte";
|
||||
export { default as ModelFilterPopover } from "./ModelFilterPopover.svelte";
|
||||
export { default as ModelPickerGroup } from "./ModelPickerGroup.svelte";
|
||||
export { default as MetaInstancePanel } from "./MetaInstancePanel.svelte";
|
||||
export { default as ModelPickerModal } from "./ModelPickerModal.svelte";
|
||||
|
||||
@@ -74,6 +74,18 @@ export interface Instance {
|
||||
};
|
||||
}
|
||||
|
||||
export interface MetaInstance {
|
||||
metaInstanceId: string;
|
||||
modelId: string;
|
||||
sharding: "Pipeline" | "Tensor";
|
||||
instanceMeta: "MlxRing" | "MlxJaccl";
|
||||
minNodes: number;
|
||||
nodeIds: string[] | null;
|
||||
placementError: string | null;
|
||||
consecutiveFailures: number;
|
||||
lastFailureError: string | null;
|
||||
}
|
||||
|
||||
// Granular node state types from the new state structure
|
||||
interface RawNodeIdentity {
|
||||
modelId?: string;
|
||||
@@ -168,7 +180,7 @@ export interface ModelDownloadStatus {
|
||||
export interface PlacementPreview {
|
||||
model_id: string;
|
||||
sharding: "Pipeline" | "Tensor";
|
||||
instance_meta: "MlxRing" | "MlxIbv" | "MlxJaccl";
|
||||
instance_meta: "MlxRing" | "MlxJaccl";
|
||||
instance: unknown | null;
|
||||
memory_delta_by_node: Record<string, number> | null;
|
||||
error: string | null;
|
||||
@@ -219,10 +231,10 @@ interface RawStateResponse {
|
||||
string,
|
||||
{
|
||||
MlxRingInstance?: Instance;
|
||||
MlxIbvInstance?: Instance;
|
||||
MlxJacclInstance?: Instance;
|
||||
}
|
||||
>;
|
||||
metaInstances?: Record<string, MetaInstance>;
|
||||
runners?: Record<string, unknown>;
|
||||
downloads?: Record<string, unknown[]>;
|
||||
// New granular node state fields
|
||||
@@ -533,6 +545,7 @@ class AppStore {
|
||||
// Topology state
|
||||
topologyData = $state<TopologyData | null>(null);
|
||||
instances = $state<Record<string, unknown>>({});
|
||||
metaInstances = $state<Record<string, MetaInstance>>({});
|
||||
runners = $state<Record<string, unknown>>({});
|
||||
downloads = $state<Record<string, unknown[]>>({});
|
||||
nodeDisk = $state<
|
||||
@@ -905,11 +918,7 @@ class AppStore {
|
||||
|
||||
let instanceType: string | null = null;
|
||||
if (instanceTag === "MlxRingInstance") instanceType = "MLX Ring";
|
||||
else if (
|
||||
instanceTag === "MlxIbvInstance" ||
|
||||
instanceTag === "MlxJacclInstance"
|
||||
)
|
||||
instanceType = "MLX RDMA";
|
||||
else if (instanceTag === "MlxJacclInstance") instanceType = "MLX RDMA";
|
||||
|
||||
let sharding: string | null = null;
|
||||
const inst = instance as {
|
||||
@@ -1271,6 +1280,9 @@ class AppStore {
|
||||
if (data.runners) {
|
||||
this.runners = data.runners;
|
||||
}
|
||||
if (data.metaInstances) {
|
||||
this.metaInstances = data.metaInstances;
|
||||
}
|
||||
if (data.downloads) {
|
||||
this.downloads = data.downloads;
|
||||
}
|
||||
@@ -3112,6 +3124,7 @@ export const totalTokens = () => appStore.totalTokens;
|
||||
export const prefillProgress = () => appStore.prefillProgress;
|
||||
export const topologyData = () => appStore.topologyData;
|
||||
export const instances = () => appStore.instances;
|
||||
export const metaInstances = () => appStore.metaInstances;
|
||||
export const runners = () => appStore.runners;
|
||||
export const downloads = () => appStore.downloads;
|
||||
export const nodeDisk = () => appStore.nodeDisk;
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
ChatMessages,
|
||||
ChatSidebar,
|
||||
ModelCard,
|
||||
MetaInstancePanel,
|
||||
ModelPickerModal,
|
||||
} from "$lib/components";
|
||||
import {
|
||||
@@ -24,6 +25,7 @@
|
||||
lastUpdate,
|
||||
clearChat,
|
||||
instances,
|
||||
metaInstances,
|
||||
runners,
|
||||
downloads,
|
||||
placementPreviews,
|
||||
@@ -60,6 +62,7 @@
|
||||
const data = $derived(topologyData());
|
||||
const update = $derived(lastUpdate());
|
||||
const instanceData = $derived(instances());
|
||||
const metaInstancesData = $derived(metaInstances());
|
||||
const runnersData = $derived(runners());
|
||||
const downloadsData = $derived(downloads());
|
||||
const previewsData = $derived(placementPreviews());
|
||||
@@ -224,7 +227,7 @@
|
||||
return model.tasks.includes("ImageToImage");
|
||||
}
|
||||
let selectedSharding = $state<"Pipeline" | "Tensor">("Pipeline");
|
||||
type InstanceMeta = "MlxRing" | "MlxIbv" | "MlxJaccl";
|
||||
type InstanceMeta = "MlxRing" | "MlxJaccl";
|
||||
|
||||
// Launch defaults persistence
|
||||
const LAUNCH_DEFAULTS_KEY = "exo-launch-defaults";
|
||||
@@ -481,7 +484,7 @@
|
||||
const matchesSelectedRuntime = (runtime: InstanceMeta): boolean =>
|
||||
selectedInstanceType === "MlxRing"
|
||||
? runtime === "MlxRing"
|
||||
: runtime === "MlxIbv" || runtime === "MlxJaccl";
|
||||
: runtime === "MlxJaccl";
|
||||
|
||||
// Helper to check if a model can be launched (has valid placement with >= minNodes)
|
||||
function canModelFit(modelId: string): boolean {
|
||||
@@ -1248,6 +1251,44 @@
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Ask the user to confirm, then send `DELETE /meta_instance/{id}`.
 * Failures (non-OK response or a thrown fetch error) are logged to the
 * console; nothing is returned or rethrown.
 */
async function deleteMetaInstance(metaInstanceId: string) {
  if (!confirm(`Delete meta-instance ${metaInstanceId.slice(0, 8)}...?`))
    return;
  try {
    // Encode the id so reserved characters ("/", "?", "#", ...) cannot
    // change which path the request targets.
    const response = await fetch(
      `/meta_instance/${encodeURIComponent(metaInstanceId)}`,
      {
        method: "DELETE",
        headers: { "Content-Type": "application/json" },
      },
    );
    if (!response.ok) {
      console.error("Failed to delete meta-instance:", response.status);
    }
  } catch (error) {
    console.error("Error deleting meta-instance:", error);
  }
}
|
||||
|
||||
/**
 * POST the current launch selection to `/meta_instance`.
 * Does nothing when no model is selected. `node_ids` is sent as `null`
 * when the node filter is empty. Failures are logged to the console only.
 */
async function createMetaInstance() {
  if (!selectedModelId) return;
  try {
    const payload = {
      model_id: selectedModelId,
      sharding: selectedSharding,
      instance_meta: selectedInstanceType,
      min_nodes: selectedMinNodes,
      node_ids: nodeFilter.size > 0 ? Array.from(nodeFilter) : null,
    };
    const response = await fetch("/meta_instance", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
    });
    if (response.ok) return;
    console.error("Failed to create meta-instance:", await response.text());
  } catch (error) {
    console.error("Error creating meta-instance:", error);
  }
}
|
||||
|
||||
// Helper to unwrap tagged unions like { MlxRingInstance: {...} }
|
||||
function getTagged(obj: unknown): [string | null, unknown] {
|
||||
if (!obj || typeof obj !== "object") return [null, null];
|
||||
@@ -1288,11 +1329,7 @@
|
||||
// Instance type from tag
|
||||
let instanceType = "Unknown";
|
||||
if (instanceTag === "MlxRingInstance") instanceType = "MLX Ring";
|
||||
else if (
|
||||
instanceTag === "MlxIbvInstance" ||
|
||||
instanceTag === "MlxJacclInstance"
|
||||
)
|
||||
instanceType = "MLX RDMA";
|
||||
else if (instanceTag === "MlxJacclInstance") instanceType = "MLX RDMA";
|
||||
|
||||
const inst = instance as {
|
||||
shardAssignments?: {
|
||||
@@ -1641,6 +1678,7 @@
|
||||
|
||||
const nodeCount = $derived(data ? Object.keys(data.nodes).length : 0);
|
||||
const instanceCount = $derived(Object.keys(instanceData).length);
|
||||
const metaInstanceCount = $derived(Object.keys(metaInstancesData).length);
|
||||
|
||||
// Helper to get the number of nodes in a placement preview
|
||||
function getPreviewNodeCount(preview: PlacementPreview): number {
|
||||
@@ -2286,6 +2324,19 @@
|
||||
<aside
|
||||
class="w-80 border-l border-exo-yellow/10 bg-exo-dark-gray flex flex-col flex-shrink-0"
|
||||
>
|
||||
<!-- Meta-Instances Panel -->
|
||||
{#if metaInstanceCount > 0}
|
||||
<div class="p-4 flex-shrink-0">
|
||||
<MetaInstancePanel
|
||||
metaInstances={metaInstancesData}
|
||||
instances={instanceData}
|
||||
onDelete={deleteMetaInstance}
|
||||
onHoverNodes={(nodes) => (hoveredPreviewNodes = nodes)}
|
||||
onHoverEnd={() => (hoveredPreviewNodes = new Set())}
|
||||
/>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
<!-- Running Instances Panel (only shown when instances exist) - Scrollable -->
|
||||
{#if instanceCount > 0}
|
||||
<div class="p-4 flex-shrink-0">
|
||||
@@ -2877,21 +2928,21 @@
|
||||
</button>
|
||||
<button
|
||||
onclick={() => {
|
||||
selectedInstanceType = "MlxIbv";
|
||||
selectedInstanceType = "MlxJaccl";
|
||||
saveLaunchDefaults();
|
||||
}}
|
||||
class="flex items-center gap-2 py-2 px-4 text-sm font-mono border rounded transition-all duration-200 cursor-pointer {selectedInstanceType ===
|
||||
'MlxIbv'
|
||||
'MlxJaccl'
|
||||
? 'bg-transparent text-exo-yellow border-exo-yellow'
|
||||
: 'bg-transparent text-white/70 border-exo-medium-gray/50 hover:border-exo-yellow/50'}"
|
||||
>
|
||||
<span
|
||||
class="w-4 h-4 rounded-full border-2 flex items-center justify-center {selectedInstanceType ===
|
||||
'MlxIbv'
|
||||
'MlxJaccl'
|
||||
? 'border-exo-yellow'
|
||||
: 'border-exo-medium-gray'}"
|
||||
>
|
||||
{#if selectedInstanceType === "MlxIbv"}
|
||||
{#if selectedInstanceType === "MlxJaccl"}
|
||||
<span class="w-2 h-2 rounded-full bg-exo-yellow"></span>
|
||||
{/if}
|
||||
</span>
|
||||
@@ -3018,6 +3069,15 @@
|
||||
/>
|
||||
</div>
|
||||
{/each}
|
||||
{#if selectedModelId}
|
||||
<button
|
||||
type="button"
|
||||
onclick={createMetaInstance}
|
||||
class="w-full mt-3 py-2 px-4 text-xs font-mono tracking-wider uppercase border border-purple-500/30 text-purple-400 hover:bg-purple-500/20 hover:border-purple-500/50 transition-all duration-200 cursor-pointer"
|
||||
>
|
||||
CREATE PERSISTENT
|
||||
</button>
|
||||
{/if}
|
||||
</div>
|
||||
{:else if selectedModel}
|
||||
<div class="text-center py-4">
|
||||
@@ -3101,6 +3161,19 @@
|
||||
</div>
|
||||
</button>
|
||||
|
||||
<!-- Meta-Instances Section -->
|
||||
{#if metaInstanceCount > 0}
|
||||
<div class="p-4 flex-shrink-0">
|
||||
<MetaInstancePanel
|
||||
metaInstances={metaInstancesData}
|
||||
instances={instanceData}
|
||||
onDelete={deleteMetaInstance}
|
||||
onHoverNodes={(nodes) => (hoveredPreviewNodes = nodes)}
|
||||
onHoverEnd={() => (hoveredPreviewNodes = new Set())}
|
||||
/>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
<!-- Instances Section (only shown when instances exist) -->
|
||||
{#if instanceCount > 0}
|
||||
<div class="p-4 flex-1">
|
||||
|
||||
Reference in New Issue
Block a user