fix InstanceViewModel.swift

wasn't caught when we merged the API changes
2026-02-16 17:11:57 -05:00 · 2026-02-02 17:57:02 +00:00
7 changed files with 1233 additions and 1252 deletions
--- a/app/EXO/EXO.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
+++ b/app/EXO/EXO.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
@@ -6,8 +6,8 @@
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/sparkle-project/Sparkle.git",
      "state" : {
-        "revision" : "afea2cda87819c960114f26e26f369a1a0945b17",
-        "version" : "2.9.0-beta.2"
+        "revision" : "e641adb41915a8409895e2e30666aa64e487b637",
+        "version" : "2.9.0-beta.1"
      }
    }
  ],
--- a/dashboard/src/lib/components/ModelCard.svelte
+++ b/dashboard/src/lib/components/ModelCard.svelte
@@ -1003,5 +1003,16 @@
 </div>

 <style>
-  /* Styles removed - animations were causing GPU overhead */
+  @keyframes pulse-slow {
+    0%,
+    100% {
+      opacity: 0.8;
+    }
+    50% {
+      opacity: 1;
+    }
+  }
+  .animate-pulse-slow {
+    animation: pulse-slow 1.5s ease-in-out infinite;
+  }
 </style>
--- a/dashboard/src/lib/components/TopologyGraph.svelte
+++ b/dashboard/src/lib/components/TopologyGraph.svelte
--- a/tests/headless_runner.py
+++ b/tests/headless_runner.py
@@ -1,20 +1,25 @@
+import multiprocessing as mp
 import socket
-from typing import Literal
+import time
+import typing

 import anyio
 from fastapi import FastAPI
-from fastapi.responses import Response, StreamingResponse
+from fastapi.responses import StreamingResponse
 from hypercorn import Config
 from hypercorn.asyncio import serve  # pyright: ignore[reportUnknownVariableType]
 from loguru import logger
 from pydantic import BaseModel

-from exo.shared.constants import EXO_MODELS_DIR
+from exo.download.impl_shard_downloader import (
+    build_full_shard,
+    exo_shard_downloader,
+)
+from exo.shared.logging import InterceptLogger, logger_setup
 from exo.shared.models.model_cards import MODEL_CARDS, ModelId
-from exo.shared.types.chunks import TokenChunk
 from exo.shared.types.commands import CommandId
 from exo.shared.types.common import Host, NodeId
-from exo.shared.types.events import ChunkGenerated, Event, RunnerStatusUpdated
+from exo.shared.types.events import Event
 from exo.shared.types.tasks import (
    ConnectToGroup,
    LoadModel,
@@ -31,14 +36,9 @@ from exo.shared.types.worker.instances import (
    MlxJacclInstance,
    MlxRingInstance,
 )
-from exo.shared.types.worker.runners import (
-    RunnerFailed,
-    RunnerId,
-    RunnerShutdown,
-    ShardAssignments,
-)
+from exo.shared.types.worker.runners import RunnerId, ShardAssignments
 from exo.shared.types.worker.shards import PipelineShardMetadata, TensorShardMetadata
-from exo.utils.channels import channel, mp_channel
+from exo.utils.channels import MpReceiver, MpSender, channel, mp_channel
 from exo.utils.info_gatherer.info_gatherer import GatheredInfo, InfoGatherer
 from exo.worker.runner.bootstrap import entrypoint

@@ -46,37 +46,36 @@ from exo.worker.runner.bootstrap import entrypoint
 class Tests(BaseModel):
    # list[hostname, ip addr]
    devs: list[list[str]]
-    ibv_devs: list[list[str | None]] | None
-    model_id: ModelId
-    kind: Literal["ring", "jaccl", "both"]
+    model_id: str
+    kind: typing.Literal["init", "warmup", "inference"]


-iid = InstanceId("im testing here")
+mp.set_start_method("spawn", force=True)
+logger_setup(None)


 async def main():
    logger.info("starting cool server majig")
+    await assert_downloads()
    cfg = Config()
-    cfg.bind = "0.0.0.0:52414"
+    cfg.bind = "0.0.0.0:52415"
    # nb: shared.logging needs updating if any of this changes
    cfg.accesslog = "-"
    cfg.errorlog = "-"
-    ev = anyio.Event()
+    cfg.logger_class = InterceptLogger
    app = FastAPI()
-    app.post("/run_test")(run_test)
-    app.post("/kill")(lambda: kill(ev))
-    app.get("/tb_detection")(tb_detection)
-    app.get("/models")(list_models)
+    app.post("/ring")(ring_backend)
+    app.post("/jaccl")(jaccl_backend)
+    app.post("/tb_detection")(tb_detection)
+    shutdown = anyio.Event()
    await serve(
        app,  # type: ignore
        cfg,
-        shutdown_trigger=lambda: ev.wait(),
+        shutdown_trigger=lambda: shutdown.wait(),
    )
-
-
-def kill(ev: anyio.Event):
-    ev.set()
-    return Response(status_code=204)
+    await anyio.sleep_forever()
+    # gracefully shutdown the api
+    shutdown.set()


 async def tb_detection():
@@ -88,19 +87,29 @@ async def tb_detection():
        return recv.collect()


-def list_models():
-    sent = set[str]()
-    for path in EXO_MODELS_DIR.rglob("model-*.safetensors"):
-        if "--" not in path.parent.name:
-            continue
-        name = path.parent.name.replace("--", "/")
-        if name in sent:
-            continue
-        sent.add(name)
-        yield ModelId(path.parent.name.replace("--", "/"))
+async def assert_downloads():
+    sd = exo_shard_downloader()
+    # await sd.ensure_shard(await build_full_shard(MODEL_CARDS["qwen3-0.6b"].model_id))
+    await sd.ensure_shard(
+        await build_full_shard(MODEL_CARDS["llama-3.1-8b-bf16"].model_id)
+    )
+    await sd.ensure_shard(await build_full_shard(MODEL_CARDS["qwen3-30b"].model_id))
+    await sd.ensure_shard(
+        await build_full_shard(MODEL_CARDS["gpt-oss-120b-MXFP4-Q8"].model_id)
+    )
+    await sd.ensure_shard(
+        await build_full_shard(MODEL_CARDS["gpt-oss-20b-4bit"].model_id)
+    )
+    await sd.ensure_shard(
+        await build_full_shard(MODEL_CARDS["glm-4.7-8bit-gs32"].model_id)
+    )
+    await sd.ensure_shard(
+        await build_full_shard(MODEL_CARDS["minimax-m2.1-8bit"].model_id)
+    )


-async def run_test(test: Tests):
+async def ring_backend(test: Tests):
+    iid = InstanceId(str(hash(str(test.devs))))
    weird_hn = socket.gethostname()
    for dev in test.devs:
        if weird_hn.startswith(dev[0]) or dev[0].startswith(weird_hn):
@@ -108,67 +117,31 @@ async def run_test(test: Tests):
            break
    else:
        raise ValueError(f"{weird_hn} not in {test.devs}")
-
-    async def run():
-        logger.info(f"testing {test.model_id}")
-
-        instances: list[Instance] = []
-        if test.kind in ["ring", "both"]:
-            i = ring_instance(test, hn)
-            if i is None:
-                yield "no model found"
-                return
-            instances.append(i)
-        if test.kind in ["rdma", "both"]:
-            i = jaccl_instance(test)
-            if i is None:
-                yield "no model found"
-                return
-            instances.append(i)
-
-        for instance in instances:
-            recv = await execute_test(test, instance, hn)
-
-            str_out = ""
-
-            for item in recv:
-                if isinstance(item, ChunkGenerated):
-                    assert isinstance(item.chunk, TokenChunk)
-                    str_out += item.chunk.text
-
-                if isinstance(item, RunnerStatusUpdated) and isinstance(
-                    item.runner_status, (RunnerFailed, RunnerShutdown)
-                ):
-                    yield str_out + "\n"
-                    yield item.model_dump_json() + "\n"
-
-    return StreamingResponse(run())
+    return await execute_test(test, ring_instance(test, iid, hn), hn)


-def ring_instance(test: Tests, hn: str) -> Instance | None:
-    hbn = [Host(ip="198.51.100.0", port=52417) for _ in test.devs]
+def ring_instance(test: Tests, iid: InstanceId, hn: str) -> Instance:
+    hbn = [Host(ip="i dont care", port=52416) for _ in test.devs]
    world_size = len(test.devs)
    for i in range(world_size):
        if test.devs[i][0] == hn:
            hn = test.devs[i][0]
-        hbn[(i - 1) % world_size] = Host(ip=test.devs[i - 1][1], port=52417)
-        hbn[(i + 1) % world_size] = Host(ip=test.devs[i + 1][1], port=52417)
-        hbn[i] = Host(ip="0.0.0.0", port=52417)
-        break
+            if i - 1 >= 0:
+                hbn[i - 1] = Host(ip=test.devs[i - 1][1], port=52416)
+            if i + 1 < len(test.devs):
+                hbn[i + 1] = Host(ip=test.devs[i + 1][1], port=52416)
+            hbn[i] = Host(ip="0.0.0.0", port=52416)
+            break
    else:
        raise ValueError(f"{hn} not in {test.devs}")

-    card = next(
-        (card for card in MODEL_CARDS.values() if card.model_id == test.model_id), None
-    )
-    if card is None:
-        return None
+    card = MODEL_CARDS[test.model_id]
    instance = MlxRingInstance(
        instance_id=iid,
-        ephemeral_port=52417,
+        ephemeral_port=52416,
        hosts_by_node={NodeId(hn): hbn},
        shard_assignments=ShardAssignments(
-            model_id=test.model_id,
+            model_id=ModelId(test.model_id),
            node_to_runner={NodeId(host[0]): RunnerId(host[0]) for host in test.devs},
            runner_to_shard={
                RunnerId(test.devs[i][0]): PipelineShardMetadata(
@@ -190,79 +163,113 @@ def ring_instance(test: Tests, hn: str) -> Instance | None:
    return instance


-async def execute_test(test: Tests, instance: Instance, hn: str) -> list[Event]:
+async def execute_test(test: Tests, instance: Instance, hn: str):
    world_size = len(test.devs)
-    commands: list[Task] = [
-        (LoadModel(instance_id=iid)),
-        (StartWarmup(instance_id=iid)),
-        (
-            TextGeneration(
-                task_params=TextGenerationTaskParams(
-                    model=test.model_id,
-                    instructions="You are a helpful assistant",
-                    input="What is the capital of France?",
-                ),
-                command_id=CommandId("yo"),
-                instance_id=iid,
-            )
-        ),
-        (Shutdown(runner_id=RunnerId(hn), instance_id=iid)),
-    ]
+    iid = InstanceId(str(hash(str(test.devs))))
+    _handle, recv, send = new_runner(instance, hn)
    if world_size > 1:
-        commands.insert(0, ConnectToGroup(instance_id=iid))
-    bound_instance = BoundInstance(
-        instance=instance, bound_runner_id=RunnerId(hn), bound_node_id=NodeId(hn)
-    )
-    ev_send, _ev_recv = mp_channel[Event]()
-    task_send, task_recv = mp_channel[Task]()
+        send.send(ConnectToGroup(instance_id=iid))
+    send.send(LoadModel(instance_id=iid))

-    for command in commands:
-        task_send.send(command)
+    match test.kind:
+        case "init":
+            pass
+        case "warmup":
+            send.send(StartWarmup(instance_id=iid))
+        case "inference":
+            send.send(StartWarmup(instance_id=iid))
+            send.send(
+                TextGeneration(
+                    task_params=TextGenerationTaskParams(
+                        model=test.model_id,
+                        instructions="You are a helpful assistant",
+                        input="What is the capital of France?",
+                    ),
+                    command_id=CommandId("yo"),
+                    instance_id=iid,
+                )
+            )

-    entrypoint(
-        bound_instance,
-        ev_send,
-        task_recv,
-        logger,
-    )
+    send.send(Shutdown(runner_id=RunnerId(hn), instance_id=iid))

-    # TODO(evan): return ev_recv.collect()
-    return []
+    async def map_recv():
+        with recv:
+            try:
+                async for item in recv:
+                    yield item.model_dump_json() + "\n"
+            except anyio.ClosedResourceError:
+                pass
+
+    ret = StreamingResponse(map_recv())
+    ret._pls_dont_gc = _handle  # type: ignore
+    return ret


-def jaccl_instance(test: Tests) -> MlxJacclInstance | None:
-    card = next(
-        (card for card in MODEL_CARDS.values() if card.model_id == test.model_id), None
-    )
-    if card is None:
-        return None
+async def jaccl_backend(test: Tests):
+    iid = InstanceId(str(hash(str(test.devs))))
+    weird_hn = socket.gethostname()
+    for dev in test.devs:
+        if weird_hn.startswith(dev[0]) or dev[0].startswith(weird_hn):
+            hn = dev[0]
+            break
+    else:
+        raise ValueError(f"{weird_hn} not in {test.devs}")
+    return await execute_test(test, jaccl_instance(test, iid), hn)
+
+
+def jaccl_instance(test: Tests, iid: InstanceId):
+    card = MODEL_CARDS[test.model_id]
    world_size = len(test.devs)
-    assert test.ibv_devs

    return MlxJacclInstance(
        instance_id=iid,
-        jaccl_devices=test.ibv_devs,
+        jaccl_devices=[[None, "rdma_en3"], ["rdma_en3", None]],
        # rank 0 is always coordinator
        jaccl_coordinators={
-            NodeId(host[0]): test.devs[0][1] + ":52417" for host in test.devs
+            NodeId(host[0]): test.devs[0][1] + ":52416" for host in test.devs
        },
        shard_assignments=ShardAssignments(
-            model_id=test.model_id,
+            model_id=ModelId(test.model_id),
            node_to_runner={NodeId(host[0]): RunnerId(host[0]) for host in test.devs},
            runner_to_shard={
-                RunnerId(host[0]): TensorShardMetadata(
+                RunnerId(test.devs[i][0]): TensorShardMetadata(
                    model_card=card,
                    device_rank=i,
                    world_size=world_size,
-                    start_layer=0,
+                    start_layer=card.n_layers,
                    end_layer=card.n_layers,
                    n_layers=card.n_layers,
                )
-                for i, host in enumerate(test.devs)
+                for i in range(world_size)
            },
        ),
    )


+def new_runner(
+    instance: Instance,
+    hn: str,
+) -> tuple[mp.Process, MpReceiver[Event], MpSender[Task]]:
+    bound_instance = BoundInstance(
+        instance=instance, bound_runner_id=RunnerId(hn), bound_node_id=NodeId(hn)
+    )
+    ev_send, ev_recv = mp_channel[Event]()
+    task_send, task_recv = mp_channel[Task]()
+
+    runner_process = mp.Process(
+        target=entrypoint,
+        args=(
+            bound_instance,
+            ev_send,
+            task_recv,
+            logger,
+        ),
+    )
+    runner_process._pls_dont_gc = (ev_send, task_recv)  # type: ignore
+    runner_process.start()
+    time.sleep(0.1)
+    return (runner_process, ev_recv, task_send)
+
+
 if __name__ == "__main__":
    anyio.run(main)
--- a/tests/run_exo_on.sh
+++ b/tests/run_exo_on.sh
@@ -1,54 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-[ $# -lt 1 ] && {
-  echo "Usage: $0 host1 [host2 ...]"
-  exit 1
-}
-
-[ -z "$(git status --porcelain)" ] || {
-  echo "Uncommitted changes"
-  exit 1
-}
-
-commit=$(git rev-parse HEAD)
-git fetch -q origin
-git branch -r --contains "$commit" | grep -qE '^\s*origin/' || {
-  echo "Not pushed to origin"
-  exit 1
-}
-
-echo "Deploying $commit to $# hosts..."
-hosts=("$@")
-cleanup() {
-  for host in "${hosts[@]}"; do
-    ssh -T -o BatchMode=yes "$host@$host" "pkill -SIGINT -of exo-env" &
-  done
-  wait
-  jobs -pr | xargs -r kill 2>/dev/null || true
-}
-trap 'cleanup' EXIT INT TERM
-
-colours=($'\e[31m' $'\e[32m' $'\e[33m' $'\e[34m')
-reset=$'\e[0m'
-i=0
-for host; do
-  colour=${colours[i++ % 4]}
-  {
-    ssh -T -o BatchMode=yes -o ServerAliveInterval=30 "$host@$host" \
-      "/nix/var/nix/profiles/default/bin/nix shell nixpkgs#git -c bash -s -- '$commit'" \
-      2>&1 | awk -v p="${colour}[${host}]${reset}" '{ print p $0; fflush() }' &
-  } <<'EOF'
-        set -euo pipefail
-        cd exo
-        git fetch -q origin
-        git checkout -q "$1"
-        EXO_LIBP2P_NAMESPACE="$1" /nix/var/nix/profiles/default/bin/nix run .#exo
-EOF
-done
-
-for host; do
-  echo "Waiting for $host..."
-  until curl -sf "http://$host:52415/models"; do sleep 1; done
-done
-wait
--- a/tests/start_distributed_test.py
+++ b/tests/start_distributed_test.py
@@ -1,85 +0,0 @@
-#!/usr/bin/env python3
-import itertools
-import json
-import subprocess
-import sys
-from concurrent.futures import ThreadPoolExecutor
-from typing import Any, cast
-from urllib.request import Request, urlopen
-
-if not (args := sys.argv[1:]):
-    sys.exit(
-        f"USAGE: {sys.argv[0]} <kind> [host1] [host2] ...\nkind is optional, and should be jaccl or ring"
-    )
-
-kind = args[0] if args[0] in ("jaccl", "ring") else "both"
-hosts = args[1:] if kind != "both" else args
-ts = subprocess.run(
-    ["tailscale", "status"], check=True, text=True, capture_output=True
-).stdout.splitlines()
-ip = {sl[1]: sl[0] for line in ts if len(sl := line.split()) >= 2}
-ips = [ip[h] for h in hosts]
-devs = [[h, ip[h]] for h in hosts]
-n = len(hosts)
-
-
-def get_tb(a: str) -> list[dict[str, Any]]:
-    with urlopen(f"http://{a}:52414/tb_detection", timeout=5) as r:  # pyright: ignore[reportAny]
-        return json.loads(r.read())  # pyright: ignore[reportAny]
-
-
-def get_models(a: str) -> set[str]:
-    with urlopen(f"http://{a}:52414/models", timeout=5) as r:  # pyright: ignore[reportAny]
-        return set(json.loads(r.read()))  # pyright: ignore[reportAny]
-
-
-def run(h: str, a: str, body: bytes) -> None:
-    with urlopen(
-        Request(
-            f"http://{a}:52414/run_test",
-            data=body,
-            method="POST",
-            headers={"Content-Type": "application/json"},
-        ),
-        timeout=300,
-    ) as r:  # pyright: ignore[reportAny]
-        for line in r.read().decode(errors="replace").splitlines():  # pyright: ignore[reportAny]
-            print(f"\n{h}@{a}: {line}", flush=True)
-
-
-with ThreadPoolExecutor(n) as exctr:
-    if kind in ("jaccl", "both"):
-        payloads = list(exctr.map(get_tb, ips))
-
-        u2e = {
-            ident["domainUuid"]: (i, ident["rdmaInterface"])
-            for i, p in enumerate(payloads)
-            for d in p
-            for ident in cast(
-                list[dict[str, str]],
-                d.get("MacThunderboltIdentifiers", {}).get("idents", []),  # pyright: ignore[reportAny]
-            )
-        }
-        edges = {
-            (u2e[s][0], u2e[t][0]): u2e[t][1]
-            for p in payloads
-            for d in p
-            for c in d.get("MacThunderboltConnections", {}).get("conns", [])  # pyright: ignore[reportAny]
-            if (s := c["sourceUuid"]) in u2e and (t := c["sinkUuid"]) in u2e  # pyright: ignore[reportAny]
-        }
-        ibv_devs = [[edges.get((i, j)) for j in range(n)] for i in range(n)]
-    else:
-        ibv_devs = None
-
-    models = set[str].intersection(*exctr.map(get_models, ips))
-
-    print("\n")
-    print("=" * 70)
-    print(f"Starting test with {models}")
-    print("=" * 70)
-    print("\n")
-    for model in models:
-        body = json.dumps(
-            {"devs": devs, "model_id": model, "ibv_devs": ibv_devs, "kind": kind}
-        ).encode()
-        list(exctr.map(run, hosts, ips, itertools.repeat(body)))
--- a/tests/start_distributed_test.sh
+++ b/tests/start_distributed_test.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+query() {
+  tailscale status | awk -v find="$1" '$2 == find { print $1 }'
+}
+
+if [[ $# -lt 2 ]]; then
+  echo "USAGE: $0 <test kind> [host1] [host2] ..."
+  exit 1
+fi
+
+kind=$1
+shift
+
+test_kinds="ring jaccl"
+
+if ! echo "$test_kinds" | grep -q "$kind"; then
+  printf "%s is not a known test kind.\nCurrent test kinds are %s" "$kind" "$test_kinds"
+  exit 1
+fi
+
+hostnames=("$@")
+weaved=()
+ips=()
+for name in "${hostnames[@]}"; do
+  ip=$(query "$name")
+  ips+=("$ip")
+  weaved+=("$name" "$ip")
+done
+
+devs_raw=$(printf '["%s", "%s"], ' "${weaved[@]}")
+devs="[${devs_raw%, }]"
+
+model_ids=("qwen3-30b" "gpt-oss-120b-MXFP4-Q8" "kimi-k2-thinking")
+
+for model_id in "${model_ids[@]}"; do
+  for i in "${!ips[@]}"; do
+    {
+      req="{
+        \"model_id\": \"${model_id}\",
+        \"devs\": ${devs},
+        \"kind\": \"inference\"
+       }"
+      echo "req $req"
+      curl -sN \
+        -X POST "http://${ips[$i]}:52415/${kind}" \
+        -H "Content-Type: application/json" -d "$req" \
+        2>&1 | sed "s/^/\n${hostnames[$i]}@${ips[$i]}: /" || echo "curl to ${hostnames[$i]} failed" && exit 1
+    } &
+  done
+  wait
+done