mirror of
https://github.com/exo-explore/exo.git
synced 2026-02-06 12:11:22 -05:00
Compare commits
8 Commits
alexcheema
...
only-time-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b694552402 | ||
|
|
9f502793c1 | ||
|
|
c8371349d5 | ||
|
|
6b907398a4 | ||
|
|
572e647908 | ||
|
|
e59ebd986d | ||
|
|
5c2f29f3f2 | ||
|
|
ffe6396c91 |
@@ -14,6 +14,7 @@
|
||||
isAdding: boolean;
|
||||
onAdd: () => void;
|
||||
onSelect: () => void;
|
||||
downloadedOnNodes?: string[];
|
||||
};
|
||||
|
||||
let {
|
||||
@@ -22,6 +23,7 @@
|
||||
isAdding,
|
||||
onAdd,
|
||||
onSelect,
|
||||
downloadedOnNodes = [],
|
||||
}: HuggingFaceResultItemProps = $props();
|
||||
|
||||
function formatNumber(num: number): string {
|
||||
@@ -45,6 +47,28 @@
|
||||
<span class="text-sm font-mono text-white truncate" title={model.id}
|
||||
>{modelName}</span
|
||||
>
|
||||
{#if downloadedOnNodes.length > 0}
|
||||
<span
|
||||
class="flex-shrink-0"
|
||||
title={`Downloaded on ${downloadedOnNodes.join(", ")}`}
|
||||
>
|
||||
<svg
|
||||
class="w-4 h-4"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path
|
||||
class="text-white/40"
|
||||
d="M20 20a2 2 0 0 0 2-2V8a2 2 0 0 0-2-2h-7.9a2 2 0 0 1-1.69-.9L9.6 3.9A2 2 0 0 0 7.93 3H4a2 2 0 0 0-2 2v13a2 2 0 0 0 2 2Z"
|
||||
/>
|
||||
<path class="text-green-400" d="m9 13 2 2 4-4" />
|
||||
</svg>
|
||||
</span>
|
||||
{/if}
|
||||
{#if isAdded}
|
||||
<span
|
||||
class="px-1.5 py-0.5 text-[10px] font-mono bg-green-500/20 text-green-400 rounded"
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
interface FilterState {
|
||||
capabilities: string[];
|
||||
sizeRange: { min: number; max: number } | null;
|
||||
downloadedOnly: boolean;
|
||||
}
|
||||
|
||||
type ModelFilterPopoverProps = {
|
||||
@@ -148,6 +149,36 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Downloaded only -->
|
||||
<div>
|
||||
<h4 class="text-xs font-mono text-white/50 mb-2">Availability</h4>
|
||||
<button
|
||||
type="button"
|
||||
class="px-2 py-1 text-xs font-mono rounded transition-colors {filters.downloadedOnly
|
||||
? 'bg-green-500/20 text-green-400 border border-green-500/30'
|
||||
: 'bg-white/5 text-white/60 hover:bg-white/10 border border-transparent'}"
|
||||
onclick={() =>
|
||||
onChange({ ...filters, downloadedOnly: !filters.downloadedOnly })}
|
||||
>
|
||||
<svg
|
||||
class="w-3.5 h-3.5 inline-block"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path
|
||||
class="text-white/40"
|
||||
d="M20 20a2 2 0 0 0 2-2V8a2 2 0 0 0-2-2h-7.9a2 2 0 0 1-1.69-.9L9.6 3.9A2 2 0 0 0 7.93 3H4a2 2 0 0 0-2 2v13a2 2 0 0 0 2 2Z"
|
||||
/>
|
||||
<path class="text-green-400" d="m9 13 2 2 4-4" />
|
||||
</svg>
|
||||
<span class="ml-1">Downloaded</span>
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- Size range -->
|
||||
<div>
|
||||
<h4 class="text-xs font-mono text-white/50 mb-2">Model Size</h4>
|
||||
|
||||
@@ -21,6 +21,12 @@
|
||||
hasMultipleVariants: boolean;
|
||||
}
|
||||
|
||||
type DownloadAvailability = {
|
||||
available: boolean;
|
||||
nodeNames: string[];
|
||||
nodeIds: string[];
|
||||
};
|
||||
|
||||
type ModelPickerGroupProps = {
|
||||
group: ModelGroup;
|
||||
isExpanded: boolean;
|
||||
@@ -31,6 +37,7 @@
|
||||
onSelectModel: (modelId: string) => void;
|
||||
onToggleFavorite: (baseModelId: string) => void;
|
||||
onShowInfo: (group: ModelGroup) => void;
|
||||
downloadStatusMap?: Map<string, DownloadAvailability>;
|
||||
};
|
||||
|
||||
let {
|
||||
@@ -43,8 +50,19 @@
|
||||
onSelectModel,
|
||||
onToggleFavorite,
|
||||
onShowInfo,
|
||||
downloadStatusMap,
|
||||
}: ModelPickerGroupProps = $props();
|
||||
|
||||
// Group-level download status: show if any variant is downloaded
|
||||
const groupDownloadStatus = $derived.by(() => {
|
||||
if (!downloadStatusMap || downloadStatusMap.size === 0) return undefined;
|
||||
// Return the first available entry (prefer "available" ones)
|
||||
for (const avail of downloadStatusMap.values()) {
|
||||
if (avail.available) return avail;
|
||||
}
|
||||
return downloadStatusMap.values().next().value;
|
||||
});
|
||||
|
||||
// Format storage size
|
||||
function formatSize(mb: number | undefined): string {
|
||||
if (!mb) return "";
|
||||
@@ -198,10 +216,42 @@
|
||||
</span>
|
||||
{/if}
|
||||
|
||||
<!-- Variant count -->
|
||||
<!-- Variant count with size range -->
|
||||
{#if group.hasMultipleVariants}
|
||||
{@const sizes = group.variants
|
||||
.map((v) => v.storage_size_megabytes || 0)
|
||||
.filter((s) => s > 0)
|
||||
.sort((a, b) => a - b)}
|
||||
<span class="text-xs font-mono text-white/30 flex-shrink-0">
|
||||
{group.variants.length} variants
|
||||
{group.variants.length} variants{#if sizes.length >= 2}{" "}({formatSize(
|
||||
sizes[0],
|
||||
)}-{formatSize(sizes[sizes.length - 1])}){/if}
|
||||
</span>
|
||||
{/if}
|
||||
|
||||
<!-- Download availability indicator -->
|
||||
{#if groupDownloadStatus && groupDownloadStatus.nodeIds.length > 0}
|
||||
<span
|
||||
class="flex-shrink-0"
|
||||
title={groupDownloadStatus.available
|
||||
? `Ready — downloaded on ${groupDownloadStatus.nodeNames.join(", ")}`
|
||||
: `Downloaded on ${groupDownloadStatus.nodeNames.join(", ")} (may need more nodes)`}
|
||||
>
|
||||
<svg
|
||||
class="w-4 h-4"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path
|
||||
class="text-white/40"
|
||||
d="M20 20a2 2 0 0 0 2-2V8a2 2 0 0 0-2-2h-7.9a2 2 0 0 1-1.69-.9L9.6 3.9A2 2 0 0 0 7.93 3H4a2 2 0 0 0-2 2v13a2 2 0 0 0 2 2Z"
|
||||
/>
|
||||
<path class="text-green-400" d="m9 13 2 2 4-4" />
|
||||
</svg>
|
||||
</span>
|
||||
{/if}
|
||||
|
||||
@@ -305,6 +355,33 @@
|
||||
{formatSize(variant.storage_size_megabytes)}
|
||||
</span>
|
||||
|
||||
<!-- Download indicator for this variant -->
|
||||
{#if downloadStatusMap?.get(variant.id)}
|
||||
{@const variantDl = downloadStatusMap.get(variant.id)}
|
||||
{#if variantDl}
|
||||
<span
|
||||
class="flex-shrink-0"
|
||||
title={`Downloaded on ${variantDl.nodeNames.join(", ")}`}
|
||||
>
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path
|
||||
class="text-white/40"
|
||||
d="M20 20a2 2 0 0 0 2-2V8a2 2 0 0 0-2-2h-7.9a2 2 0 0 1-1.69-.9L9.6 3.9A2 2 0 0 0 7.93 3H4a2 2 0 0 0-2 2v13a2 2 0 0 0 2 2Z"
|
||||
/>
|
||||
<path class="text-green-400" d="m9 13 2 2 4-4" />
|
||||
</svg>
|
||||
</span>
|
||||
{/if}
|
||||
{/if}
|
||||
|
||||
<!-- Check mark if selected -->
|
||||
{#if isSelected}
|
||||
<svg
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
import ModelPickerGroup from "./ModelPickerGroup.svelte";
|
||||
import ModelFilterPopover from "./ModelFilterPopover.svelte";
|
||||
import HuggingFaceResultItem from "./HuggingFaceResultItem.svelte";
|
||||
import { getNodesWithModelDownloaded } from "$lib/utils/downloads";
|
||||
|
||||
interface ModelInfo {
|
||||
id: string;
|
||||
@@ -33,6 +34,7 @@
|
||||
interface FilterState {
|
||||
capabilities: string[];
|
||||
sizeRange: { min: number; max: number } | null;
|
||||
downloadedOnly: boolean;
|
||||
}
|
||||
|
||||
interface HuggingFaceModel {
|
||||
@@ -58,6 +60,15 @@
|
||||
onDeleteModel: (modelId: string) => Promise<void>;
|
||||
totalMemoryGB: number;
|
||||
usedMemoryGB: number;
|
||||
downloadsData?: Record<string, unknown[]>;
|
||||
topologyNodes?: Record<
|
||||
string,
|
||||
{
|
||||
friendly_name?: string;
|
||||
system_info?: { model_id?: string };
|
||||
macmon_info?: { memory?: { ram_total?: number } };
|
||||
}
|
||||
>;
|
||||
};
|
||||
|
||||
let {
|
||||
@@ -74,6 +85,8 @@
|
||||
onDeleteModel,
|
||||
totalMemoryGB,
|
||||
usedMemoryGB,
|
||||
downloadsData,
|
||||
topologyNodes,
|
||||
}: ModelPickerModalProps = $props();
|
||||
|
||||
// Local state
|
||||
@@ -81,9 +94,75 @@
|
||||
let selectedFamily = $state<string | null>(null);
|
||||
let expandedGroups = $state<Set<string>>(new Set());
|
||||
let showFilters = $state(false);
|
||||
let filters = $state<FilterState>({ capabilities: [], sizeRange: null });
|
||||
let filters = $state<FilterState>({
|
||||
capabilities: [],
|
||||
sizeRange: null,
|
||||
downloadedOnly: false,
|
||||
});
|
||||
let infoGroup = $state<ModelGroup | null>(null);
|
||||
|
||||
// Download availability per model group
|
||||
type DownloadAvailability = {
|
||||
available: boolean;
|
||||
nodeNames: string[];
|
||||
nodeIds: string[];
|
||||
};
|
||||
|
||||
function getNodeName(nodeId: string): string {
|
||||
const node = topologyNodes?.[nodeId];
|
||||
return (
|
||||
node?.friendly_name || node?.system_info?.model_id || nodeId.slice(0, 8)
|
||||
);
|
||||
}
|
||||
|
||||
const modelDownloadAvailability = $derived.by(() => {
|
||||
const result = new Map<string, DownloadAvailability>();
|
||||
if (!downloadsData || !topologyNodes) return result;
|
||||
|
||||
for (const model of models) {
|
||||
const nodeIds = getNodesWithModelDownloaded(downloadsData, model.id);
|
||||
if (nodeIds.length === 0) continue;
|
||||
|
||||
// Sum total RAM across nodes that have the model
|
||||
let totalRamBytes = 0;
|
||||
for (const nodeId of nodeIds) {
|
||||
const ramTotal = topologyNodes[nodeId]?.macmon_info?.memory?.ram_total;
|
||||
if (typeof ramTotal === "number") totalRamBytes += ramTotal;
|
||||
}
|
||||
|
||||
const modelSizeBytes = (model.storage_size_megabytes || 0) * 1024 * 1024;
|
||||
result.set(model.id, {
|
||||
available: modelSizeBytes > 0 && totalRamBytes >= modelSizeBytes,
|
||||
nodeNames: nodeIds.map(getNodeName),
|
||||
nodeIds,
|
||||
});
|
||||
}
|
||||
return result;
|
||||
});
|
||||
|
||||
// Aggregate download availability per group (available if ANY variant is available)
|
||||
function getGroupDownloadAvailability(
|
||||
group: ModelGroup,
|
||||
): DownloadAvailability | undefined {
|
||||
for (const variant of group.variants) {
|
||||
const avail = modelDownloadAvailability.get(variant.id);
|
||||
if (avail && avail.nodeIds.length > 0) return avail;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
// Get per-variant download map for a group
|
||||
function getVariantDownloadMap(
|
||||
group: ModelGroup,
|
||||
): Map<string, DownloadAvailability> {
|
||||
const map = new Map<string, DownloadAvailability>();
|
||||
for (const variant of group.variants) {
|
||||
const avail = modelDownloadAvailability.get(variant.id);
|
||||
if (avail && avail.nodeIds.length > 0) map.set(variant.id, avail);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
// HuggingFace Hub state
|
||||
let hfSearchQuery = $state("");
|
||||
let hfSearchResults = $state<HuggingFaceModel[]>([]);
|
||||
@@ -95,15 +174,12 @@
|
||||
let manualModelId = $state("");
|
||||
let addModelError = $state<string | null>(null);
|
||||
|
||||
// Reset state when modal opens
|
||||
// Reset transient state when modal opens, but preserve tab selection
|
||||
$effect(() => {
|
||||
if (isOpen) {
|
||||
searchQuery = "";
|
||||
selectedFamily = null;
|
||||
expandedGroups = new Set();
|
||||
showFilters = false;
|
||||
hfSearchQuery = "";
|
||||
hfSearchResults = [];
|
||||
manualModelId = "";
|
||||
addModelError = null;
|
||||
}
|
||||
@@ -339,6 +415,16 @@
|
||||
});
|
||||
}
|
||||
|
||||
// Filter to downloaded models only
|
||||
if (filters.downloadedOnly) {
|
||||
result = result.filter((g) =>
|
||||
g.variants.some((v) => {
|
||||
const avail = modelDownloadAvailability.get(v.id);
|
||||
return avail && avail.nodeIds.length > 0;
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Sort: models that fit first, then by size (largest first)
|
||||
result.sort((a, b) => {
|
||||
const aFits = a.variants.some((v) => canModelFit(v.id));
|
||||
@@ -385,11 +471,13 @@
|
||||
}
|
||||
|
||||
function clearFilters() {
|
||||
filters = { capabilities: [], sizeRange: null };
|
||||
filters = { capabilities: [], sizeRange: null, downloadedOnly: false };
|
||||
}
|
||||
|
||||
const hasActiveFilters = $derived(
|
||||
filters.capabilities.length > 0 || filters.sizeRange !== null,
|
||||
filters.capabilities.length > 0 ||
|
||||
filters.sizeRange !== null ||
|
||||
filters.downloadedOnly,
|
||||
);
|
||||
</script>
|
||||
|
||||
@@ -576,6 +664,12 @@
|
||||
isAdding={addingModelId === model.id}
|
||||
onAdd={() => handleAddModel(model.id)}
|
||||
onSelect={() => handleSelectHfModel(model.id)}
|
||||
downloadedOnNodes={downloadsData
|
||||
? getNodesWithModelDownloaded(
|
||||
downloadsData,
|
||||
model.id,
|
||||
).map(getNodeName)
|
||||
: []}
|
||||
/>
|
||||
{/each}
|
||||
{/if}
|
||||
@@ -650,6 +744,7 @@
|
||||
onSelectModel={handleSelect}
|
||||
{onToggleFavorite}
|
||||
onShowInfo={(g) => (infoGroup = g)}
|
||||
downloadStatusMap={getVariantDownloadMap(group)}
|
||||
/>
|
||||
{/each}
|
||||
{/if}
|
||||
@@ -667,6 +762,11 @@
|
||||
>{cap}</span
|
||||
>
|
||||
{/each}
|
||||
{#if filters.downloadedOnly}
|
||||
<span class="px-1.5 py-0.5 bg-green-500/20 text-green-400 rounded"
|
||||
>Downloaded</span
|
||||
>
|
||||
{/if}
|
||||
{#if filters.sizeRange}
|
||||
<span class="px-1.5 py-0.5 bg-exo-yellow/20 text-exo-yellow rounded">
|
||||
{filters.sizeRange.min}GB - {filters.sizeRange.max}GB
|
||||
@@ -742,6 +842,40 @@
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
{#if getGroupDownloadAvailability(infoGroup)?.nodeNames?.length}
|
||||
{@const infoDownload = getGroupDownloadAvailability(infoGroup)}
|
||||
{#if infoDownload}
|
||||
<div class="mt-3 pt-3 border-t border-exo-yellow/10">
|
||||
<div class="flex items-center gap-2 mb-1">
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path
|
||||
class="text-white/40"
|
||||
d="M20 20a2 2 0 0 0 2-2V8a2 2 0 0 0-2-2h-7.9a2 2 0 0 1-1.69-.9L9.6 3.9A2 2 0 0 0 7.93 3H4a2 2 0 0 0-2 2v13a2 2 0 0 0 2 2Z"
|
||||
/>
|
||||
<path class="text-green-400" d="m9 13 2 2 4-4" />
|
||||
</svg>
|
||||
<span class="text-white/40">Downloaded on:</span>
|
||||
</div>
|
||||
<div class="flex flex-wrap gap-1 mt-1">
|
||||
{#each infoDownload.nodeNames as nodeName}
|
||||
<span
|
||||
class="px-1.5 py-0.5 bg-green-500/10 text-green-400/80 border border-green-500/20 rounded text-[10px]"
|
||||
>
|
||||
{nodeName}
|
||||
</span>
|
||||
{/each}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
@@ -69,8 +69,6 @@ export interface Instance {
|
||||
runnerToShard?: Record<string, unknown>;
|
||||
nodeToRunner?: Record<string, string>;
|
||||
};
|
||||
draftModel?: string;
|
||||
numDraftTokens?: number;
|
||||
}
|
||||
|
||||
// Granular node state types from the new state structure
|
||||
|
||||
152
dashboard/src/lib/utils/downloads.ts
Normal file
152
dashboard/src/lib/utils/downloads.ts
Normal file
@@ -0,0 +1,152 @@
|
||||
/**
|
||||
* Shared utilities for parsing and querying download state.
|
||||
*
|
||||
* The download state from `/state` is shaped as:
|
||||
* Record<NodeId, Array<TaggedDownloadEntry>>
|
||||
*
|
||||
* Each entry is a tagged union object like:
|
||||
* { "DownloadCompleted": { shard_metadata: { "PipelineShardMetadata": { model_card: { model_id: "..." }, ... } }, ... } }
|
||||
*/
|
||||
|
||||
/** Unwrap one level of tagged-union envelope, returning [tag, payload]. */
|
||||
function unwrapTagged(
|
||||
obj: Record<string, unknown>,
|
||||
): [string, Record<string, unknown>] | null {
|
||||
const keys = Object.keys(obj);
|
||||
if (keys.length !== 1) return null;
|
||||
const tag = keys[0];
|
||||
const payload = obj[tag];
|
||||
if (!payload || typeof payload !== "object") return null;
|
||||
return [tag, payload as Record<string, unknown>];
|
||||
}
|
||||
|
||||
/** Extract the model ID string from a download entry's nested shard_metadata. */
|
||||
export function extractModelIdFromDownload(
|
||||
downloadPayload: Record<string, unknown>,
|
||||
): string | null {
|
||||
const shardMetadata =
|
||||
downloadPayload.shard_metadata ?? downloadPayload.shardMetadata;
|
||||
if (!shardMetadata || typeof shardMetadata !== "object") return null;
|
||||
|
||||
const unwrapped = unwrapTagged(shardMetadata as Record<string, unknown>);
|
||||
if (!unwrapped) return null;
|
||||
const [, shardData] = unwrapped;
|
||||
|
||||
const modelMeta = shardData.model_card ?? shardData.modelCard;
|
||||
if (!modelMeta || typeof modelMeta !== "object") return null;
|
||||
|
||||
const meta = modelMeta as Record<string, unknown>;
|
||||
return (meta.model_id as string) ?? (meta.modelId as string) ?? null;
|
||||
}
|
||||
|
||||
/** Extract the shard_metadata object from a download entry payload. */
|
||||
export function extractShardMetadata(
|
||||
downloadPayload: Record<string, unknown>,
|
||||
): Record<string, unknown> | null {
|
||||
const shardMetadata =
|
||||
downloadPayload.shard_metadata ?? downloadPayload.shardMetadata;
|
||||
if (!shardMetadata || typeof shardMetadata !== "object") return null;
|
||||
return shardMetadata as Record<string, unknown>;
|
||||
}
|
||||
|
||||
/** Get the download tag (DownloadCompleted, DownloadOngoing, etc.) from a wrapped entry. */
|
||||
export function getDownloadTag(
|
||||
entry: unknown,
|
||||
): [string, Record<string, unknown>] | null {
|
||||
if (!entry || typeof entry !== "object") return null;
|
||||
return unwrapTagged(entry as Record<string, unknown>);
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over all download entries for a given node, yielding [tag, payload, modelId].
|
||||
*/
|
||||
function* iterNodeDownloads(
|
||||
nodeDownloads: unknown[],
|
||||
): Generator<[string, Record<string, unknown>, string]> {
|
||||
for (const entry of nodeDownloads) {
|
||||
const tagged = getDownloadTag(entry);
|
||||
if (!tagged) continue;
|
||||
const [tag, payload] = tagged;
|
||||
const modelId = extractModelIdFromDownload(payload);
|
||||
if (!modelId) continue;
|
||||
yield [tag, payload, modelId];
|
||||
}
|
||||
}
|
||||
|
||||
/** Check if a specific model is fully downloaded (DownloadCompleted) on a specific node. */
|
||||
export function isModelDownloadedOnNode(
|
||||
downloadsData: Record<string, unknown[]>,
|
||||
nodeId: string,
|
||||
modelId: string,
|
||||
): boolean {
|
||||
const nodeDownloads = downloadsData[nodeId];
|
||||
if (!Array.isArray(nodeDownloads)) return false;
|
||||
|
||||
for (const [tag, , entryModelId] of iterNodeDownloads(nodeDownloads)) {
|
||||
if (tag === "DownloadCompleted" && entryModelId === modelId) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Get all node IDs where a model is fully downloaded (DownloadCompleted). */
|
||||
export function getNodesWithModelDownloaded(
|
||||
downloadsData: Record<string, unknown[]>,
|
||||
modelId: string,
|
||||
): string[] {
|
||||
const result: string[] = [];
|
||||
for (const nodeId of Object.keys(downloadsData)) {
|
||||
if (isModelDownloadedOnNode(downloadsData, nodeId, modelId)) {
|
||||
result.push(nodeId);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find shard metadata for a model from any download entry across all nodes.
|
||||
* Returns the first match found (completed entries are preferred).
|
||||
*/
|
||||
export function getShardMetadataForModel(
|
||||
downloadsData: Record<string, unknown[]>,
|
||||
modelId: string,
|
||||
): Record<string, unknown> | null {
|
||||
let fallback: Record<string, unknown> | null = null;
|
||||
|
||||
for (const nodeDownloads of Object.values(downloadsData)) {
|
||||
if (!Array.isArray(nodeDownloads)) continue;
|
||||
|
||||
for (const [tag, payload, entryModelId] of iterNodeDownloads(
|
||||
nodeDownloads,
|
||||
)) {
|
||||
if (entryModelId !== modelId) continue;
|
||||
const shard = extractShardMetadata(payload);
|
||||
if (!shard) continue;
|
||||
|
||||
if (tag === "DownloadCompleted") return shard;
|
||||
if (!fallback) fallback = shard;
|
||||
}
|
||||
}
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the download status tag for a specific model on a specific node.
|
||||
* Returns the "best" status: DownloadCompleted > DownloadOngoing > others.
|
||||
*/
|
||||
export function getModelDownloadStatus(
|
||||
downloadsData: Record<string, unknown[]>,
|
||||
nodeId: string,
|
||||
modelId: string,
|
||||
): string | null {
|
||||
const nodeDownloads = downloadsData[nodeId];
|
||||
if (!Array.isArray(nodeDownloads)) return null;
|
||||
|
||||
let best: string | null = null;
|
||||
for (const [tag, , entryModelId] of iterNodeDownloads(nodeDownloads)) {
|
||||
if (entryModelId !== modelId) continue;
|
||||
if (tag === "DownloadCompleted") return tag;
|
||||
if (tag === "DownloadOngoing") best = tag;
|
||||
else if (!best) best = tag;
|
||||
}
|
||||
return best;
|
||||
}
|
||||
@@ -3264,4 +3264,6 @@
|
||||
onDeleteModel={deleteCustomModel}
|
||||
totalMemoryGB={clusterMemory().total / (1024 * 1024 * 1024)}
|
||||
usedMemoryGB={clusterMemory().used / (1024 * 1024 * 1024)}
|
||||
{downloadsData}
|
||||
topologyNodes={data?.nodes}
|
||||
/>
|
||||
|
||||
@@ -118,9 +118,10 @@
|
||||
{
|
||||
metal-toolchain = pkgs.callPackage ./nix/metal-toolchain.nix { };
|
||||
mlx = pkgs.callPackage ./nix/mlx.nix {
|
||||
metal-toolchain = self'.packages.metal-toolchain;
|
||||
inherit (self'.packages) metal-toolchain;
|
||||
inherit uvLockMlxVersion;
|
||||
};
|
||||
default = self'.packages.exo;
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
2
justfile
2
justfile
@@ -20,7 +20,7 @@ sync-clean:
|
||||
|
||||
rust-rebuild:
|
||||
cargo run --bin stub_gen
|
||||
just sync-clean
|
||||
uv sync --reinstall-package exo_pyo3_bindings
|
||||
|
||||
build-dashboard:
|
||||
#!/usr/bin/env bash
|
||||
|
||||
@@ -31,8 +31,6 @@ dependencies = [
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
exo-master = "exo.master.main:main"
|
||||
exo-worker = "exo.worker.main:main"
|
||||
exo = "exo.main:main"
|
||||
|
||||
# dependencies only required for development
|
||||
|
||||
@@ -59,6 +59,16 @@
|
||||
}
|
||||
);
|
||||
|
||||
mkPythonScript = name: path: pkgs.writeShellApplication {
|
||||
inherit name;
|
||||
runtimeInputs = [ exoVenv ];
|
||||
runtimeEnv = {
|
||||
EXO_DASHBOARD_DIR = self'.packages.dashboard;
|
||||
EXO_RESOURCES_DIR = inputs.self + /resources;
|
||||
};
|
||||
text = ''python ${path}'';
|
||||
};
|
||||
|
||||
exoPackage = pkgs.runCommand "exo"
|
||||
{
|
||||
nativeBuildInputs = [ pkgs.makeWrapper ];
|
||||
@@ -66,13 +76,11 @@
|
||||
''
|
||||
mkdir -p $out/bin
|
||||
|
||||
# Create wrapper scripts
|
||||
for script in exo exo-master exo-worker; do
|
||||
makeWrapper ${exoVenv}/bin/$script $out/bin/$script \
|
||||
--set EXO_DASHBOARD_DIR ${self'.packages.dashboard} \
|
||||
--set EXO_RESOURCES_DIR ${inputs.self + "/resources"} \
|
||||
${lib.optionalString pkgs.stdenv.isDarwin "--prefix PATH : ${pkgs.macmon}/bin"}
|
||||
done
|
||||
# Create wrapper script
|
||||
makeWrapper ${exoVenv}/bin/exo $out/bin/exo \
|
||||
--set EXO_DASHBOARD_DIR ${self'.packages.dashboard} \
|
||||
--set EXO_RESOURCES_DIR ${inputs.self + /resources} \
|
||||
${lib.optionalString pkgs.stdenv.hostPlatform.isDarwin "--prefix PATH : ${pkgs.macmon}/bin"}
|
||||
'';
|
||||
in
|
||||
{
|
||||
@@ -81,13 +89,15 @@
|
||||
exo = exoPackage;
|
||||
# Test environment for running pytest outside of Nix sandbox (needs GPU access)
|
||||
exo-test-env = testVenv;
|
||||
exo-bench = mkPythonScript "exo-bench" (inputs.self + /bench/exo_bench.py);
|
||||
exo-distributed-test = mkPythonScript "exo-distributed-test" (inputs.self + /tests/headless_runner.py);
|
||||
};
|
||||
|
||||
checks = {
|
||||
# Ruff linting (works on all platforms)
|
||||
lint = pkgs.runCommand "ruff-lint" { } ''
|
||||
export RUFF_CACHE_DIR="$TMPDIR/ruff-cache"
|
||||
${pkgs.ruff}/bin/ruff check ${inputs.self}/
|
||||
${pkgs.ruff}/bin/ruff check ${inputs.self}
|
||||
touch $out
|
||||
'';
|
||||
};
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
model_id = "mlx-community/Qwen3-Coder-Next-4bit"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
tasks = ["TextGeneration"]
|
||||
|
||||
[storage_size]
|
||||
in_bytes = 45644286500
|
||||
@@ -0,0 +1,8 @@
|
||||
model_id = "mlx-community/Qwen3-Coder-Next-5bit"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
tasks = ["TextGeneration"]
|
||||
|
||||
[storage_size]
|
||||
in_bytes = 57657697020
|
||||
@@ -0,0 +1,8 @@
|
||||
model_id = "mlx-community/Qwen3-Coder-Next-6bit"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
tasks = ["TextGeneration"]
|
||||
|
||||
[storage_size]
|
||||
in_bytes = 68899327465
|
||||
@@ -0,0 +1,8 @@
|
||||
model_id = "mlx-community/Qwen3-Coder-Next-8bit"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
tasks = ["TextGeneration"]
|
||||
|
||||
[storage_size]
|
||||
in_bytes = 89357758772
|
||||
@@ -0,0 +1,8 @@
|
||||
model_id = "mlx-community/Qwen3-Coder-Next-bf16"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
tasks = ["TextGeneration"]
|
||||
|
||||
[storage_size]
|
||||
in_bytes = 157548627945
|
||||
@@ -16,6 +16,7 @@ from exo.download.download_utils import (
|
||||
from exo.download.shard_downloader import ShardDownloader
|
||||
from exo.shared.models.model_cards import ModelId
|
||||
from exo.shared.types.commands import (
|
||||
CancelDownload,
|
||||
DeleteDownload,
|
||||
ForwarderDownloadCommand,
|
||||
StartDownload,
|
||||
@@ -53,11 +54,10 @@ class DownloadCoordinator:
|
||||
# Internal event channel for forwarding (initialized in __post_init__)
|
||||
event_sender: Sender[Event] = field(init=False)
|
||||
event_receiver: Receiver[Event] = field(init=False)
|
||||
_tg: TaskGroup = field(init=False)
|
||||
_tg: TaskGroup = field(init=False, default_factory=anyio.create_task_group)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
self.event_sender, self.event_receiver = channel[Event]()
|
||||
self._tg = anyio.create_task_group()
|
||||
|
||||
async def run(self) -> None:
|
||||
logger.info("Starting DownloadCoordinator")
|
||||
@@ -108,6 +108,13 @@ class DownloadCoordinator:
|
||||
await self._start_download(shard)
|
||||
case DeleteDownload(model_id=model_id):
|
||||
await self._delete_download(model_id)
|
||||
case CancelDownload(model_id=model_id):
|
||||
await self._cancel_download(model_id)
|
||||
|
||||
async def _cancel_download(self, model_id: ModelId) -> None:
|
||||
if model_id in self.active_downloads and model_id in self.download_status:
|
||||
logger.info(f"Cancelling download for {model_id}")
|
||||
self.active_downloads.pop(model_id).cancel()
|
||||
|
||||
async def _start_download(self, shard: ShardMetadata) -> None:
|
||||
model_id = shard.model_card.model_id
|
||||
|
||||
@@ -378,10 +378,14 @@ async def download_file_with_retry(
|
||||
logger.error(traceback.format_exc())
|
||||
await asyncio.sleep(2.0**attempt)
|
||||
except Exception as e:
|
||||
on_connection_lost()
|
||||
if attempt == n_attempts - 1:
|
||||
on_connection_lost()
|
||||
raise e
|
||||
break
|
||||
logger.error(
|
||||
f"Download error on attempt {attempt + 1}/{n_attempts} for {model_id=} {revision=} {path=} {target_dir=}"
|
||||
)
|
||||
logger.error(traceback.format_exc())
|
||||
await asyncio.sleep(2.0**attempt)
|
||||
raise Exception(
|
||||
f"Failed to download file {model_id=} {revision=} {path=} {target_dir=}"
|
||||
)
|
||||
|
||||
@@ -27,7 +27,6 @@ from exo.utils.pydantic_ext import CamelCaseModel
|
||||
from exo.worker.main import Worker
|
||||
|
||||
|
||||
# I marked this as a dataclass as I want trivial constructors.
|
||||
@dataclass
|
||||
class Node:
|
||||
router: Router
|
||||
@@ -106,6 +105,7 @@ class Node:
|
||||
global_event_sender=router.sender(topics.GLOBAL_EVENTS),
|
||||
local_event_receiver=router.receiver(topics.LOCAL_EVENTS),
|
||||
command_receiver=router.receiver(topics.COMMANDS),
|
||||
download_command_sender=router.sender(topics.DOWNLOAD_COMMANDS),
|
||||
)
|
||||
|
||||
er_send, er_recv = channel[ElectionResult]()
|
||||
@@ -136,7 +136,6 @@ class Node:
|
||||
|
||||
async def run(self):
|
||||
async with self._tg as tg:
|
||||
signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
|
||||
tg.start_soon(self.router.run)
|
||||
tg.start_soon(self.election.run)
|
||||
if self.download_coordinator:
|
||||
@@ -148,6 +147,8 @@ class Node:
|
||||
if self.api:
|
||||
tg.start_soon(self.api.run)
|
||||
tg.start_soon(self._elect_loop)
|
||||
signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
|
||||
signal.signal(signal.SIGTERM, lambda _, __: self.shutdown())
|
||||
|
||||
def shutdown(self):
|
||||
# if this is our second call to shutdown, just sys.exit
|
||||
@@ -188,6 +189,9 @@ class Node:
|
||||
global_event_sender=self.router.sender(topics.GLOBAL_EVENTS),
|
||||
local_event_receiver=self.router.receiver(topics.LOCAL_EVENTS),
|
||||
command_receiver=self.router.receiver(topics.COMMANDS),
|
||||
download_command_sender=self.router.sender(
|
||||
topics.DOWNLOAD_COMMANDS
|
||||
),
|
||||
)
|
||||
self._tg.start_soon(self.master.run)
|
||||
elif (
|
||||
|
||||
@@ -1320,29 +1320,40 @@ class API:
|
||||
]
|
||||
|
||||
async def run(self):
|
||||
shutdown_ev = anyio.Event()
|
||||
|
||||
try:
|
||||
async with create_task_group() as tg:
|
||||
self._tg = tg
|
||||
logger.info("Starting API")
|
||||
tg.start_soon(self._apply_state)
|
||||
tg.start_soon(self._pause_on_new_election)
|
||||
tg.start_soon(self._cleanup_expired_images)
|
||||
print_startup_banner(self.port)
|
||||
tg.start_soon(self.run_api, shutdown_ev)
|
||||
try:
|
||||
await anyio.sleep_forever()
|
||||
finally:
|
||||
with anyio.CancelScope(shield=True):
|
||||
shutdown_ev.set()
|
||||
finally:
|
||||
self.command_sender.close()
|
||||
self.global_event_receiver.close()
|
||||
|
||||
async def run_api(self, ev: anyio.Event):
|
||||
cfg = Config()
|
||||
cfg.bind = f"0.0.0.0:{self.port}"
|
||||
cfg.bind = [f"0.0.0.0:{self.port}"]
|
||||
# nb: shared.logging needs updating if any of this changes
|
||||
cfg.accesslog = None
|
||||
cfg.errorlog = "-"
|
||||
cfg.logger_class = InterceptLogger
|
||||
|
||||
async with create_task_group() as tg:
|
||||
self._tg = tg
|
||||
logger.info("Starting API")
|
||||
tg.start_soon(self._apply_state)
|
||||
tg.start_soon(self._pause_on_new_election)
|
||||
tg.start_soon(self._cleanup_expired_images)
|
||||
print_startup_banner(self.port)
|
||||
with anyio.CancelScope(shield=True):
|
||||
await serve(
|
||||
cast(ASGIFramework, self.app),
|
||||
cfg,
|
||||
shutdown_trigger=lambda: anyio.sleep_forever(),
|
||||
shutdown_trigger=ev.wait,
|
||||
)
|
||||
|
||||
self.command_sender.close()
|
||||
self.global_event_receiver.close()
|
||||
|
||||
async def _apply_state(self):
|
||||
with self.global_event_receiver as events:
|
||||
async for f_event in events:
|
||||
|
||||
@@ -6,6 +6,7 @@ from loguru import logger
|
||||
|
||||
from exo.master.placement import (
|
||||
add_instance_to_placements,
|
||||
cancel_unnecessary_downloads,
|
||||
delete_instance,
|
||||
get_transition_events,
|
||||
place_instance,
|
||||
@@ -16,12 +17,12 @@ from exo.shared.types.commands import (
|
||||
CreateInstance,
|
||||
DeleteInstance,
|
||||
ForwarderCommand,
|
||||
ForwarderDownloadCommand,
|
||||
ImageEdits,
|
||||
ImageGeneration,
|
||||
PlaceInstance,
|
||||
RequestEventLog,
|
||||
SendInputChunk,
|
||||
SetInstanceDraftModel,
|
||||
TaskFinished,
|
||||
TestCommand,
|
||||
TextGeneration,
|
||||
@@ -33,7 +34,6 @@ from exo.shared.types.events import (
|
||||
IndexedEvent,
|
||||
InputChunkReceived,
|
||||
InstanceDeleted,
|
||||
InstanceDraftModelUpdated,
|
||||
NodeGatheredInfo,
|
||||
NodeTimedOut,
|
||||
TaskCreated,
|
||||
@@ -68,12 +68,9 @@ class Master:
|
||||
session_id: SessionId,
|
||||
*,
|
||||
command_receiver: Receiver[ForwarderCommand],
|
||||
# Receiving indexed events from the forwarder to be applied to state
|
||||
# Ideally these would be WorkerForwarderEvents but type system says no :(
|
||||
local_event_receiver: Receiver[ForwarderEvent],
|
||||
# Send events to the forwarder to be indexed (usually from command processing)
|
||||
# Ideally these would be MasterForwarderEvents but type system says no :(
|
||||
global_event_sender: Sender[ForwarderEvent],
|
||||
download_command_sender: Sender[ForwarderDownloadCommand],
|
||||
):
|
||||
self.state = State()
|
||||
self._tg: TaskGroup = anyio.create_task_group()
|
||||
@@ -83,6 +80,7 @@ class Master:
|
||||
self.command_receiver = command_receiver
|
||||
self.local_event_receiver = local_event_receiver
|
||||
self.global_event_sender = global_event_sender
|
||||
self.download_command_sender = download_command_sender
|
||||
send, recv = channel[Event]()
|
||||
self.event_sender: Sender[Event] = send
|
||||
self._loopback_event_receiver: Receiver[Event] = recv
|
||||
@@ -98,16 +96,18 @@ class Master:
|
||||
async def run(self):
|
||||
logger.info("Starting Master")
|
||||
|
||||
async with self._tg as tg:
|
||||
tg.start_soon(self._event_processor)
|
||||
tg.start_soon(self._command_processor)
|
||||
tg.start_soon(self._loopback_processor)
|
||||
tg.start_soon(self._plan)
|
||||
self.global_event_sender.close()
|
||||
self.local_event_receiver.close()
|
||||
self.command_receiver.close()
|
||||
self._loopback_event_sender.close()
|
||||
self._loopback_event_receiver.close()
|
||||
try:
|
||||
async with self._tg as tg:
|
||||
tg.start_soon(self._event_processor)
|
||||
tg.start_soon(self._command_processor)
|
||||
tg.start_soon(self._loopback_processor)
|
||||
tg.start_soon(self._plan)
|
||||
finally:
|
||||
self.global_event_sender.close()
|
||||
self.local_event_receiver.close()
|
||||
self.command_receiver.close()
|
||||
self._loopback_event_sender.close()
|
||||
self._loopback_event_receiver.close()
|
||||
|
||||
async def shutdown(self):
|
||||
logger.info("Stopping Master")
|
||||
@@ -280,6 +280,14 @@ class Master:
|
||||
transition_events = get_transition_events(
|
||||
self.state.instances, placement
|
||||
)
|
||||
for cmd in cancel_unnecessary_downloads(
|
||||
placement, self.state.downloads
|
||||
):
|
||||
await self.download_command_sender.send(
|
||||
ForwarderDownloadCommand(
|
||||
origin=self.node_id, command=cmd
|
||||
)
|
||||
)
|
||||
generated_events.extend(transition_events)
|
||||
case PlaceInstance():
|
||||
placement = place_instance(
|
||||
@@ -310,14 +318,6 @@ class Master:
|
||||
chunk=chunk,
|
||||
)
|
||||
)
|
||||
case SetInstanceDraftModel():
|
||||
generated_events.append(
|
||||
InstanceDraftModelUpdated(
|
||||
instance_id=command.instance_id,
|
||||
draft_model=command.draft_model,
|
||||
num_draft_tokens=command.num_draft_tokens,
|
||||
)
|
||||
)
|
||||
case TaskFinished():
|
||||
generated_events.append(
|
||||
TaskDeleted(
|
||||
|
||||
@@ -15,14 +15,20 @@ from exo.master.placement_utils import (
|
||||
from exo.shared.models.model_cards import ModelId
|
||||
from exo.shared.topology import Topology
|
||||
from exo.shared.types.commands import (
|
||||
CancelDownload,
|
||||
CreateInstance,
|
||||
DeleteInstance,
|
||||
DownloadCommand,
|
||||
PlaceInstance,
|
||||
)
|
||||
from exo.shared.types.common import NodeId
|
||||
from exo.shared.types.events import Event, InstanceCreated, InstanceDeleted
|
||||
from exo.shared.types.memory import Memory
|
||||
from exo.shared.types.profiling import MemoryUsage, NodeNetworkInfo
|
||||
from exo.shared.types.worker.downloads import (
|
||||
DownloadOngoing,
|
||||
DownloadProgress,
|
||||
)
|
||||
from exo.shared.types.worker.instances import (
|
||||
Instance,
|
||||
InstanceId,
|
||||
@@ -147,8 +153,6 @@ def place_instance(
|
||||
shard_assignments=shard_assignments,
|
||||
jaccl_devices=mlx_jaccl_devices,
|
||||
jaccl_coordinators=mlx_jaccl_coordinators,
|
||||
draft_model=command.draft_model,
|
||||
num_draft_tokens=command.num_draft_tokens,
|
||||
)
|
||||
case InstanceMeta.MlxRing:
|
||||
ephemeral_port = random_ephemeral_port()
|
||||
@@ -163,8 +167,6 @@ def place_instance(
|
||||
shard_assignments=shard_assignments,
|
||||
hosts_by_node=hosts_by_node,
|
||||
ephemeral_port=ephemeral_port,
|
||||
draft_model=command.draft_model,
|
||||
num_draft_tokens=command.num_draft_tokens,
|
||||
)
|
||||
|
||||
return target_instances
|
||||
@@ -206,3 +208,29 @@ def get_transition_events(
|
||||
)
|
||||
|
||||
return events
|
||||
|
||||
|
||||
def cancel_unnecessary_downloads(
|
||||
instances: Mapping[InstanceId, Instance],
|
||||
download_status: Mapping[NodeId, Sequence[DownloadProgress]],
|
||||
) -> Sequence[DownloadCommand]:
|
||||
commands: list[DownloadCommand] = []
|
||||
currently_downloading = [
|
||||
(k, v.shard_metadata.model_card.model_id)
|
||||
for k, vs in download_status.items()
|
||||
for v in vs
|
||||
if isinstance(v, (DownloadOngoing))
|
||||
]
|
||||
active_models = set(
|
||||
(
|
||||
node_id,
|
||||
instance.shard_assignments.runner_to_shard[runner_id].model_card.model_id,
|
||||
)
|
||||
for instance in instances.values()
|
||||
for node_id, runner_id in instance.shard_assignments.node_to_runner.items()
|
||||
)
|
||||
for pair in currently_downloading:
|
||||
if pair not in active_models:
|
||||
commands.append(CancelDownload(target_node_id=pair[0], model_id=pair[1]))
|
||||
|
||||
return commands
|
||||
|
||||
@@ -11,6 +11,7 @@ from exo.shared.models.model_cards import ModelCard, ModelTask
|
||||
from exo.shared.types.commands import (
|
||||
CommandId,
|
||||
ForwarderCommand,
|
||||
ForwarderDownloadCommand,
|
||||
PlaceInstance,
|
||||
TextGeneration,
|
||||
)
|
||||
@@ -47,6 +48,7 @@ async def test_master():
|
||||
ge_sender, global_event_receiver = channel[ForwarderEvent]()
|
||||
command_sender, co_receiver = channel[ForwarderCommand]()
|
||||
local_event_sender, le_receiver = channel[ForwarderEvent]()
|
||||
fcds, _fcdr = channel[ForwarderDownloadCommand]()
|
||||
|
||||
all_events: list[IndexedEvent] = []
|
||||
|
||||
@@ -67,6 +69,7 @@ async def test_master():
|
||||
global_event_sender=ge_sender,
|
||||
local_event_receiver=le_receiver,
|
||||
command_receiver=co_receiver,
|
||||
download_command_sender=fcds,
|
||||
)
|
||||
logger.info("run the master")
|
||||
async with anyio.create_task_group() as tg:
|
||||
|
||||
@@ -9,6 +9,7 @@ from anyio import (
|
||||
BrokenResourceError,
|
||||
ClosedResourceError,
|
||||
create_task_group,
|
||||
move_on_after,
|
||||
sleep_forever,
|
||||
)
|
||||
from anyio.abc import TaskGroup
|
||||
@@ -146,18 +147,21 @@ class Router:
|
||||
|
||||
async def run(self):
|
||||
logger.debug("Starting Router")
|
||||
async with create_task_group() as tg:
|
||||
self._tg = tg
|
||||
for topic in self.topic_routers:
|
||||
router = self.topic_routers[topic]
|
||||
tg.start_soon(router.run)
|
||||
tg.start_soon(self._networking_recv)
|
||||
tg.start_soon(self._networking_recv_connection_messages)
|
||||
tg.start_soon(self._networking_publish)
|
||||
# Router only shuts down if you cancel it.
|
||||
await sleep_forever()
|
||||
for topic in self.topic_routers:
|
||||
await self._networking_unsubscribe(str(topic))
|
||||
try:
|
||||
async with create_task_group() as tg:
|
||||
self._tg = tg
|
||||
for topic in self.topic_routers:
|
||||
router = self.topic_routers[topic]
|
||||
tg.start_soon(router.run)
|
||||
tg.start_soon(self._networking_recv)
|
||||
tg.start_soon(self._networking_recv_connection_messages)
|
||||
tg.start_soon(self._networking_publish)
|
||||
# Router only shuts down if you cancel it.
|
||||
await sleep_forever()
|
||||
finally:
|
||||
with move_on_after(1, shield=True):
|
||||
for topic in self.topic_routers:
|
||||
await self._networking_unsubscribe(str(topic))
|
||||
|
||||
async def shutdown(self):
|
||||
logger.debug("Shutting down Router")
|
||||
@@ -166,12 +170,12 @@ class Router:
|
||||
self._tg.cancel_scope.cancel()
|
||||
|
||||
async def _networking_subscribe(self, topic: str):
|
||||
logger.info(f"Subscribing to {topic}")
|
||||
await self._net.gossipsub_subscribe(topic)
|
||||
logger.info(f"Subscribed to {topic}")
|
||||
|
||||
async def _networking_unsubscribe(self, topic: str):
|
||||
logger.info(f"Unsubscribing from {topic}")
|
||||
await self._net.gossipsub_unsubscribe(topic)
|
||||
logger.info(f"Unsubscribed from {topic}")
|
||||
|
||||
async def _networking_recv(self):
|
||||
while True:
|
||||
|
||||
@@ -12,7 +12,6 @@ from exo.shared.types.events import (
|
||||
InputChunkReceived,
|
||||
InstanceCreated,
|
||||
InstanceDeleted,
|
||||
InstanceDraftModelUpdated,
|
||||
NodeDownloadProgress,
|
||||
NodeGatheredInfo,
|
||||
NodeTimedOut,
|
||||
@@ -70,8 +69,6 @@ def event_apply(event: Event, state: State) -> State:
|
||||
return apply_instance_created(event, state)
|
||||
case InstanceDeleted():
|
||||
return apply_instance_deleted(event, state)
|
||||
case InstanceDraftModelUpdated():
|
||||
return apply_instance_draft_model_updated(event, state)
|
||||
case NodeTimedOut():
|
||||
return apply_node_timed_out(event, state)
|
||||
case NodeDownloadProgress():
|
||||
@@ -190,25 +187,6 @@ def apply_instance_deleted(event: InstanceDeleted, state: State) -> State:
|
||||
return state.model_copy(update={"instances": new_instances})
|
||||
|
||||
|
||||
def apply_instance_draft_model_updated(
|
||||
event: InstanceDraftModelUpdated, state: State
|
||||
) -> State:
|
||||
if event.instance_id not in state.instances:
|
||||
return state
|
||||
instance = state.instances[event.instance_id]
|
||||
updated_instance = instance.model_copy(
|
||||
update={
|
||||
"draft_model": event.draft_model,
|
||||
"num_draft_tokens": event.num_draft_tokens,
|
||||
}
|
||||
)
|
||||
new_instances: Mapping[InstanceId, Instance] = {
|
||||
**state.instances,
|
||||
event.instance_id: updated_instance,
|
||||
}
|
||||
return state.model_copy(update={"instances": new_instances})
|
||||
|
||||
|
||||
def apply_runner_status_updated(event: RunnerStatusUpdated, state: State) -> State:
|
||||
new_runners: Mapping[RunnerId, RunnerStatus] = {
|
||||
**state.runners,
|
||||
@@ -230,58 +208,19 @@ def apply_runner_deleted(event: RunnerDeleted, state: State) -> State:
|
||||
def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
|
||||
topology = copy.deepcopy(state.topology)
|
||||
topology.remove_node(event.node_id)
|
||||
last_seen = {
|
||||
key: value for key, value in state.last_seen.items() if key != event.node_id
|
||||
}
|
||||
downloads = {
|
||||
key: value for key, value in state.downloads.items() if key != event.node_id
|
||||
}
|
||||
# Clean up all granular node mappings
|
||||
node_identities = {
|
||||
key: value
|
||||
for key, value in state.node_identities.items()
|
||||
if key != event.node_id
|
||||
}
|
||||
node_memory = {
|
||||
key: value for key, value in state.node_memory.items() if key != event.node_id
|
||||
}
|
||||
node_system = {
|
||||
key: value for key, value in state.node_system.items() if key != event.node_id
|
||||
}
|
||||
node_network = {
|
||||
key: value for key, value in state.node_network.items() if key != event.node_id
|
||||
}
|
||||
node_thunderbolt = {
|
||||
key: value
|
||||
for key, value in state.node_thunderbolt.items()
|
||||
if key != event.node_id
|
||||
}
|
||||
node_thunderbolt_bridge = {
|
||||
key: value
|
||||
for key, value in state.node_thunderbolt_bridge.items()
|
||||
if key != event.node_id
|
||||
}
|
||||
# Only recompute cycles if the leaving node had TB bridge enabled
|
||||
leaving_node_status = state.node_thunderbolt_bridge.get(event.node_id)
|
||||
leaving_node_had_tb_enabled = (
|
||||
leaving_node_status is not None and leaving_node_status.enabled
|
||||
)
|
||||
thunderbolt_bridge_cycles = (
|
||||
topology.get_thunderbolt_bridge_cycles(node_thunderbolt_bridge, node_network)
|
||||
topology.get_thunderbolt_bridge_cycles(state.node_thunderbolt_bridge, state.node_network)
|
||||
if leaving_node_had_tb_enabled
|
||||
else [list(cycle) for cycle in state.thunderbolt_bridge_cycles]
|
||||
)
|
||||
return state.model_copy(
|
||||
update={
|
||||
"downloads": downloads,
|
||||
"topology": topology,
|
||||
"last_seen": last_seen,
|
||||
"node_identities": node_identities,
|
||||
"node_memory": node_memory,
|
||||
"node_system": node_system,
|
||||
"node_network": node_network,
|
||||
"node_thunderbolt": node_thunderbolt,
|
||||
"node_thunderbolt_bridge": node_thunderbolt_bridge,
|
||||
"thunderbolt_bridge_cycles": thunderbolt_bridge_cycles,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -86,28 +86,29 @@ class Election:
|
||||
|
||||
async def run(self):
|
||||
logger.info("Starting Election")
|
||||
async with create_task_group() as tg:
|
||||
self._tg = tg
|
||||
tg.start_soon(self._election_receiver)
|
||||
tg.start_soon(self._connection_receiver)
|
||||
tg.start_soon(self._command_counter)
|
||||
try:
|
||||
async with create_task_group() as tg:
|
||||
self._tg = tg
|
||||
tg.start_soon(self._election_receiver)
|
||||
tg.start_soon(self._connection_receiver)
|
||||
tg.start_soon(self._command_counter)
|
||||
|
||||
# And start an election immediately, that instantly resolves
|
||||
candidates: list[ElectionMessage] = []
|
||||
logger.debug("Starting initial campaign")
|
||||
self._candidates = candidates
|
||||
await self._campaign(candidates, campaign_timeout=0.0)
|
||||
logger.debug("Initial campaign finished")
|
||||
|
||||
# Cancel and wait for the last election to end
|
||||
if self._campaign_cancel_scope is not None:
|
||||
logger.debug("Cancelling campaign")
|
||||
self._campaign_cancel_scope.cancel()
|
||||
if self._campaign_done is not None:
|
||||
logger.debug("Waiting for campaign to finish")
|
||||
await self._campaign_done.wait()
|
||||
logger.debug("Campaign cancelled and finished")
|
||||
logger.info("Election finished")
|
||||
# And start an election immediately, that instantly resolves
|
||||
candidates: list[ElectionMessage] = []
|
||||
logger.debug("Starting initial campaign")
|
||||
self._candidates = candidates
|
||||
await self._campaign(candidates, campaign_timeout=0.0)
|
||||
logger.debug("Initial campaign finished")
|
||||
finally:
|
||||
# Cancel and wait for the last election to end
|
||||
if self._campaign_cancel_scope is not None:
|
||||
logger.debug("Cancelling campaign")
|
||||
self._campaign_cancel_scope.cancel()
|
||||
if self._campaign_done is not None:
|
||||
logger.debug("Waiting for campaign to finish")
|
||||
await self._campaign_done.wait()
|
||||
logger.debug("Campaign cancelled and finished")
|
||||
logger.info("Election shutdown")
|
||||
|
||||
async def elect(self, em: ElectionMessage) -> None:
|
||||
logger.debug(f"Electing: {em}")
|
||||
|
||||
@@ -72,15 +72,12 @@ class DeleteDownload(BaseCommand):
|
||||
model_id: ModelId
|
||||
|
||||
|
||||
class SetInstanceDraftModel(BaseCommand):
|
||||
"""Set or update the draft model for an existing instance."""
|
||||
|
||||
instance_id: InstanceId
|
||||
draft_model: ModelId | None # None to disable speculative decoding
|
||||
num_draft_tokens: int = 4
|
||||
class CancelDownload(BaseCommand):
|
||||
target_node_id: NodeId
|
||||
model_id: ModelId
|
||||
|
||||
|
||||
DownloadCommand = StartDownload | DeleteDownload
|
||||
DownloadCommand = StartDownload | DeleteDownload | CancelDownload
|
||||
|
||||
|
||||
Command = (
|
||||
@@ -92,7 +89,6 @@ Command = (
|
||||
| PlaceInstance
|
||||
| CreateInstance
|
||||
| DeleteInstance
|
||||
| SetInstanceDraftModel
|
||||
| TaskFinished
|
||||
| SendInputChunk
|
||||
)
|
||||
|
||||
@@ -5,7 +5,7 @@ from pydantic import Field
|
||||
|
||||
from exo.shared.topology import Connection
|
||||
from exo.shared.types.chunks import GenerationChunk, InputImageChunk
|
||||
from exo.shared.types.common import CommandId, Id, ModelId, NodeId, SessionId
|
||||
from exo.shared.types.common import CommandId, Id, NodeId, SessionId
|
||||
from exo.shared.types.tasks import Task, TaskId, TaskStatus
|
||||
from exo.shared.types.worker.downloads import DownloadProgress
|
||||
from exo.shared.types.worker.instances import Instance, InstanceId
|
||||
@@ -68,14 +68,6 @@ class InstanceDeleted(BaseEvent):
|
||||
instance_id: InstanceId
|
||||
|
||||
|
||||
class InstanceDraftModelUpdated(BaseEvent):
|
||||
"""Draft model updated on an existing instance."""
|
||||
|
||||
instance_id: InstanceId
|
||||
draft_model: ModelId | None
|
||||
num_draft_tokens: int
|
||||
|
||||
|
||||
class RunnerStatusUpdated(BaseEvent):
|
||||
runner_id: RunnerId
|
||||
runner_status: RunnerStatus
|
||||
@@ -149,7 +141,6 @@ Event = (
|
||||
| TaskAcknowledged
|
||||
| InstanceCreated
|
||||
| InstanceDeleted
|
||||
| InstanceDraftModelUpdated
|
||||
| RunnerStatusUpdated
|
||||
| RunnerDeleted
|
||||
| NodeTimedOut
|
||||
|
||||
@@ -40,12 +40,6 @@ class DownloadModel(BaseTask): # emitted by Worker
|
||||
shard_metadata: ShardMetadata
|
||||
|
||||
|
||||
class DownloadDraftModel(BaseTask): # emitted by Worker
|
||||
"""Download a draft model for speculative decoding (rank 0 only)."""
|
||||
|
||||
model_id: str # HuggingFace model ID
|
||||
|
||||
|
||||
class LoadModel(BaseTask): # emitted by Worker
|
||||
pass
|
||||
|
||||
@@ -86,17 +80,9 @@ class Shutdown(BaseTask): # emitted by Worker
|
||||
runner_id: RunnerId
|
||||
|
||||
|
||||
class SetDraftModel(BaseTask): # emitted by Worker
|
||||
"""Load or clear a draft model on an already-running instance."""
|
||||
|
||||
model_id: str | None # HuggingFace model ID, or None to clear
|
||||
num_draft_tokens: int = 4
|
||||
|
||||
|
||||
Task = (
|
||||
CreateRunner
|
||||
| DownloadModel
|
||||
| DownloadDraftModel
|
||||
| ConnectToGroup
|
||||
| LoadModel
|
||||
| StartWarmup
|
||||
@@ -104,5 +90,4 @@ Task = (
|
||||
| ImageGeneration
|
||||
| ImageEdits
|
||||
| Shutdown
|
||||
| SetDraftModel
|
||||
)
|
||||
|
||||
@@ -2,7 +2,7 @@ from enum import Enum
|
||||
|
||||
from pydantic import model_validator
|
||||
|
||||
from exo.shared.types.common import Host, Id, ModelId, NodeId
|
||||
from exo.shared.types.common import Host, Id, NodeId
|
||||
from exo.shared.types.worker.runners import RunnerId, ShardAssignments, ShardMetadata
|
||||
from exo.utils.pydantic_ext import CamelCaseModel, TaggedModel
|
||||
|
||||
@@ -19,8 +19,6 @@ class InstanceMeta(str, Enum):
|
||||
class BaseInstance(TaggedModel):
|
||||
instance_id: InstanceId
|
||||
shard_assignments: ShardAssignments
|
||||
draft_model: ModelId | None = None # For speculative decoding (rank 0 only)
|
||||
num_draft_tokens: int = 4 # Tokens to draft per iteration (when draft_model is set)
|
||||
|
||||
def shard(self, runner_id: RunnerId) -> ShardMetadata | None:
|
||||
return self.shard_assignments.runner_to_shard.get(runner_id, None)
|
||||
|
||||
@@ -194,9 +194,10 @@ class MpReceiver[T]:
|
||||
raise EndOfStream from None
|
||||
return item
|
||||
|
||||
# nb: this function will not cancel particularly well
|
||||
async def receive_async(self) -> T:
|
||||
return await to_thread.run_sync(self.receive, limiter=CapacityLimiter(1))
|
||||
return await to_thread.run_sync(
|
||||
self.receive, limiter=CapacityLimiter(1), abandon_on_cancel=True
|
||||
)
|
||||
|
||||
def close(self) -> None:
|
||||
if not self._state.closed.is_set():
|
||||
|
||||
@@ -226,27 +226,6 @@ def load_mlx_items(
|
||||
return cast(Model, model), tokenizer
|
||||
|
||||
|
||||
def load_draft_model(model_id: ModelId) -> nn.Module:
|
||||
"""Load a draft model for speculative decoding (rank 0 only).
|
||||
|
||||
Draft models are small models (typically 0.5B-2B parameters) used to
|
||||
generate candidate tokens quickly, which are then verified by the main
|
||||
model in a single forward pass.
|
||||
|
||||
Assumes the model has already been downloaded by the worker.
|
||||
|
||||
Args:
|
||||
model_id: HuggingFace model ID for the draft model
|
||||
|
||||
Returns:
|
||||
The loaded draft model
|
||||
"""
|
||||
model_path = build_model_path(model_id)
|
||||
draft_model, _ = load_model(model_path, strict=True)
|
||||
logger.info(f"Loaded draft model from {model_path}")
|
||||
return draft_model
|
||||
|
||||
|
||||
def shard_and_load(
|
||||
shard_metadata: ShardMetadata,
|
||||
group: Group,
|
||||
|
||||
@@ -98,21 +98,23 @@ class Worker:
|
||||
info_send, info_recv = channel[GatheredInfo]()
|
||||
info_gatherer: InfoGatherer = InfoGatherer(info_send)
|
||||
|
||||
async with self._tg as tg:
|
||||
tg.start_soon(info_gatherer.run)
|
||||
tg.start_soon(self._forward_info, info_recv)
|
||||
tg.start_soon(self.plan_step)
|
||||
tg.start_soon(self._resend_out_for_delivery)
|
||||
tg.start_soon(self._event_applier)
|
||||
tg.start_soon(self._forward_events)
|
||||
tg.start_soon(self._poll_connection_updates)
|
||||
|
||||
# Actual shutdown code - waits for all tasks to complete before executing.
|
||||
self.local_event_sender.close()
|
||||
self.command_sender.close()
|
||||
self.download_command_sender.close()
|
||||
for runner in self.runners.values():
|
||||
runner.shutdown()
|
||||
try:
|
||||
async with self._tg as tg:
|
||||
tg.start_soon(info_gatherer.run)
|
||||
tg.start_soon(self._forward_info, info_recv)
|
||||
tg.start_soon(self.plan_step)
|
||||
tg.start_soon(self._resend_out_for_delivery)
|
||||
tg.start_soon(self._event_applier)
|
||||
tg.start_soon(self._forward_events)
|
||||
tg.start_soon(self._poll_connection_updates)
|
||||
finally:
|
||||
# Actual shutdown code - waits for all tasks to complete before executing.
|
||||
logger.info("Stopping Worker")
|
||||
self.local_event_sender.close()
|
||||
self.command_sender.close()
|
||||
self.download_command_sender.close()
|
||||
for runner in self.runners.values():
|
||||
runner.shutdown()
|
||||
|
||||
async def _forward_info(self, recv: Receiver[GatheredInfo]):
|
||||
with recv as info_stream:
|
||||
|
||||
@@ -8,10 +8,8 @@ import anyio
|
||||
from anyio import (
|
||||
BrokenResourceError,
|
||||
ClosedResourceError,
|
||||
create_task_group,
|
||||
to_thread,
|
||||
)
|
||||
from anyio.abc import TaskGroup
|
||||
from loguru import logger
|
||||
|
||||
from exo.shared.types.events import (
|
||||
@@ -49,7 +47,6 @@ class RunnerSupervisor:
|
||||
_ev_recv: MpReceiver[Event]
|
||||
_task_sender: MpSender[Task]
|
||||
_event_sender: Sender[Event]
|
||||
_tg: TaskGroup | None = field(default=None, init=False)
|
||||
status: RunnerStatus = field(default_factory=RunnerIdle, init=False)
|
||||
pending: dict[TaskId, anyio.Event] = field(default_factory=dict, init=False)
|
||||
completed: set[TaskId] = field(default_factory=set, init=False)
|
||||
@@ -93,28 +90,29 @@ class RunnerSupervisor:
|
||||
|
||||
async def run(self):
|
||||
self.runner_process.start()
|
||||
async with create_task_group() as tg:
|
||||
self._tg = tg
|
||||
tg.start_soon(self._forward_events)
|
||||
await self._forward_events()
|
||||
|
||||
def shutdown(self):
|
||||
logger.info("Runner supervisor shutting down")
|
||||
self._ev_recv.close()
|
||||
self._task_sender.close()
|
||||
self._event_sender.close()
|
||||
await to_thread.run_sync(self.runner_process.join, 30)
|
||||
self.runner_process.join(1)
|
||||
if not self.runner_process.is_alive():
|
||||
logger.info("Runner process succesfully terminated")
|
||||
return
|
||||
|
||||
# This is overkill but it's not technically bad, just unnecessary.
|
||||
logger.warning("Runner process didn't shutdown succesfully, terminating")
|
||||
self.runner_process.terminate()
|
||||
await to_thread.run_sync(self.runner_process.join, 5)
|
||||
self.runner_process.join(1)
|
||||
if not self.runner_process.is_alive():
|
||||
return
|
||||
|
||||
logger.critical("Runner process didn't respond to SIGTERM, killing")
|
||||
self.runner_process.kill()
|
||||
|
||||
await to_thread.run_sync(self.runner_process.join, 5)
|
||||
self.runner_process.join(1)
|
||||
if not self.runner_process.is_alive():
|
||||
return
|
||||
|
||||
@@ -122,10 +120,6 @@ class RunnerSupervisor:
|
||||
"Runner process didn't respond to SIGKILL. System resources may have leaked"
|
||||
)
|
||||
|
||||
def shutdown(self):
|
||||
assert self._tg
|
||||
self._tg.cancel_scope.cancel()
|
||||
|
||||
async def start_task(self, task: Task):
|
||||
if task.task_id in self.pending:
|
||||
logger.warning(
|
||||
|
||||
@@ -22,7 +22,7 @@ echo "Deploying $commit to $# hosts..."
|
||||
hosts=("$@")
|
||||
cleanup() {
|
||||
for host in "${hosts[@]}"; do
|
||||
ssh -T -o BatchMode=yes "$host@$host" "pkill -SIGINT -of exo-env" &
|
||||
ssh -T -o BatchMode=yes "$host@$host" "pkill -f bin/exo" &
|
||||
done
|
||||
wait
|
||||
jobs -pr | xargs -r kill 2>/dev/null || true
|
||||
@@ -34,21 +34,13 @@ reset=$'\e[0m'
|
||||
i=0
|
||||
for host; do
|
||||
colour=${colours[i++ % 4]}
|
||||
{
|
||||
ssh -T -o BatchMode=yes -o ServerAliveInterval=30 "$host@$host" \
|
||||
"/nix/var/nix/profiles/default/bin/nix shell nixpkgs#git -c bash -s -- '$commit'" \
|
||||
2>&1 | awk -v p="${colour}[${host}]${reset}" '{ print p $0; fflush() }' &
|
||||
} <<'EOF'
|
||||
set -euo pipefail
|
||||
cd exo
|
||||
git fetch -q origin
|
||||
git checkout -q "$1"
|
||||
EXO_LIBP2P_NAMESPACE="$1" /nix/var/nix/profiles/default/bin/nix run .#exo
|
||||
EOF
|
||||
ssh -T -o BatchMode=yes -o ServerAliveInterval=30 "$host@$host" \
|
||||
"EXO_LIBP2P_NAMESPACE=$commit /nix/var/nix/profiles/default/bin/nix run github:exo-explore/exo/$commit" |&
|
||||
awk -v p="${colour}[${host}]${reset}" '{ print p $0; fflush() }' &
|
||||
done
|
||||
|
||||
for host; do
|
||||
echo "Waiting for $host..."
|
||||
until curl -sf "http://$host:52415/models"; do sleep 1; done
|
||||
until curl -sf "http://$host:52415/models" &>/dev/null; do sleep 1; done
|
||||
done
|
||||
wait
|
||||
|
||||
Reference in New Issue
Block a user