yay

Fix bench after recent updates (#1331 )
## Motivation A lot of changes happened without much attention to the state of exo bench. ## Changes Use TaggedModel for BenchChatCompletion so it serialises properly. Don't break after gpt oss tool call to preserve parity with the rest of the codebase. ## Why It Works  ## Test Plan ### Manual Testing <img width="2856" height="678" alt="image" src="https://github.com/user-attachments/assets/2e18cf0d-c0f8-467c-9763-1a6a59c8a327" /> Also tested GPT OSS tool calling in OpenCode
2026-02-01 01:33:03 -05:00 · 2026-01-30 11:53:18 +00:00 · 2026-01-29 19:14:40 +00:00
11 changed files with 349 additions and 212 deletions
--- a/src/exo/shared/types/api.py
+++ b/src/exo/shared/types/api.py
@@ -3,7 +3,7 @@ from collections.abc import Generator
 from typing import Annotated, Any, Literal

 from fastapi import UploadFile
-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, ConfigDict, Field, field_validator
 from pydantic_core import PydanticUseDefault

 from exo.shared.models.model_cards import ModelCard, ModelId
@@ -11,7 +11,7 @@ from exo.shared.types.common import CommandId, NodeId
 from exo.shared.types.memory import Memory
 from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
 from exo.shared.types.worker.shards import Sharding, ShardMetadata
-from exo.utils.pydantic_ext import CamelCaseModel
+from exo.utils.pydantic_ext import CamelCaseModel, TaggedModel

 FinishReason = Literal[
    "stop", "length", "tool_calls", "content_filter", "function_call", "error"
@@ -170,7 +170,9 @@ class BenchChatCompletionResponse(ChatCompletionResponse):
    generation_stats: GenerationStats | None = None


-class ChatCompletionTaskParams(BaseModel):
+class ChatCompletionTaskParams(TaggedModel):
+    model_config = ConfigDict(extra="ignore")
+
    model: str
    frequency_penalty: float | None = None
    messages: list[ChatCompletionMessage]
--- a/src/exo/shared/types/commands.py
+++ b/src/exo/shared/types/commands.py
@@ -2,6 +2,7 @@ from pydantic import Field

 from exo.shared.models.model_cards import ModelCard, ModelId
 from exo.shared.types.api import (
+    BenchChatCompletionTaskParams,
    ChatCompletionTaskParams,
    ImageEditsInternalParams,
    ImageGenerationTaskParams,
@@ -22,7 +23,7 @@ class TestCommand(BaseCommand):


 class ChatCompletion(BaseCommand):
-    request_params: ChatCompletionTaskParams
+    request_params: ChatCompletionTaskParams | BenchChatCompletionTaskParams


 class ImageGeneration(BaseCommand):
--- a/src/exo/shared/types/tasks.py
+++ b/src/exo/shared/types/tasks.py
@@ -3,6 +3,7 @@ from enum import Enum
 from pydantic import Field

 from exo.shared.types.api import (
+    BenchChatCompletionTaskParams,
    ChatCompletionTaskParams,
    ImageEditsInternalParams,
    ImageGenerationTaskParams,
@@ -54,7 +55,7 @@ class StartWarmup(BaseTask):  # emitted by Worker

 class ChatCompletion(BaseTask):  # emitted by Master
    command_id: CommandId
-    task_params: ChatCompletionTaskParams
+    task_params: ChatCompletionTaskParams | BenchChatCompletionTaskParams

    error_type: str | None = Field(default=None)
    error_message: str | None = Field(default=None)
--- a/src/exo/worker/engines/mlx/generator/generate.py
+++ b/src/exo/worker/engines/mlx/generator/generate.py
@@ -39,7 +39,7 @@ def prefill(
    sampler: Callable[[mx.array], mx.array],
    prompt_tokens: mx.array,
    cache: KVCacheType,
-) -> float:
+) -> tuple[float, int]:
    """Prefill the KV cache with prompt tokens.

    This runs the model over the prompt tokens to populate the cache,
@@ -50,7 +50,7 @@ def prefill(
    """
    num_tokens = len(prompt_tokens)
    if num_tokens == 0:
-        return 0.0
+        return 0.0, 0

    logger.debug(f"Prefilling {num_tokens} tokens...")
    start_time = time.perf_counter()
@@ -85,7 +85,7 @@ def prefill(
        f"Prefill complete: {num_tokens} tokens in {elapsed:.2f}s "
        f"({tokens_per_sec:.1f} tok/s)"
    )
-    return tokens_per_sec
+    return tokens_per_sec, num_tokens


 def warmup_inference(
@@ -169,6 +169,8 @@ def mlx_generate(
    mx.reset_peak_memory()
    is_bench: bool = isinstance(task, BenchChatCompletionTaskParams)

+    logger.info(f"{is_bench=}")
+
    # Currently we support chat-completion tasks only.
    logger.debug(f"task_params: {task}")

@@ -204,7 +206,9 @@ def mlx_generate(
    )

    # Prefill cache with all tokens except the last one
-    prefill_tps = prefill(model, tokenizer, sampler, prompt_tokens[:-1], caches)
+    prefill_tps, prefill_tokens = prefill(
+        model, tokenizer, sampler, prompt_tokens[:-1], caches
+    )

    # stream_generate starts from the last token
    last_token = prompt_tokens[-1:]
@@ -233,7 +237,7 @@ def mlx_generate(
            stats = GenerationStats(
                prompt_tps=float(prefill_tps or out.prompt_tps),
                generation_tps=float(out.generation_tps),
-                prompt_tokens=int(out.prompt_tokens),
+                prompt_tokens=int(prefill_tokens + out.prompt_tokens),
                generation_tokens=int(out.generation_tokens),
                peak_memory_usage=Memory.from_gb(out.peak_memory),
            )
--- a/src/exo/worker/runner/runner.py
+++ b/src/exo/worker/runner/runner.py
@@ -538,7 +538,6 @@ def parse_gpt_oss(
                    ]
                )
                tool_arg_parts = []
-                break
            current_tool_name = recipient

        # If inside a tool call, accumulate arguments
--- a/tests/headless_runner.py
+++ b/tests/headless_runner.py
@@ -1,26 +1,21 @@
-import multiprocessing as mp
 import socket
-import time
-import typing
+from typing import Literal

 import anyio
+from loguru import logger
 from fastapi import FastAPI
-from fastapi.responses import StreamingResponse
+from fastapi.responses import StreamingResponse, Response
 from hypercorn import Config
 from hypercorn.asyncio import serve  # pyright: ignore[reportUnknownVariableType]
-from loguru import logger
 from pydantic import BaseModel

-from exo.download.impl_shard_downloader import (
-    build_full_shard,
-    exo_shard_downloader,
-)
-from exo.shared.logging import InterceptLogger, logger_setup
+from exo.shared.constants import EXO_MODELS_DIR
 from exo.shared.models.model_cards import MODEL_CARDS, ModelId
 from exo.shared.types.api import ChatCompletionMessage, ChatCompletionTaskParams
+from exo.shared.types.chunks import TokenChunk
 from exo.shared.types.commands import CommandId
 from exo.shared.types.common import Host, NodeId
-from exo.shared.types.events import Event
+from exo.shared.types.events import ChunkGenerated, Event, RunnerStatusUpdated
 from exo.shared.types.tasks import (
    ChatCompletion,
    ConnectToGroup,
@@ -36,9 +31,14 @@ from exo.shared.types.worker.instances import (
    MlxJacclInstance,
    MlxRingInstance,
 )
-from exo.shared.types.worker.runners import RunnerId, ShardAssignments
+from exo.shared.types.worker.runners import (
+    RunnerFailed,
+    RunnerId,
+    RunnerShutdown,
+    ShardAssignments,
+)
 from exo.shared.types.worker.shards import PipelineShardMetadata, TensorShardMetadata
-from exo.utils.channels import MpReceiver, MpSender, channel, mp_channel
+from exo.utils.channels import channel, mp_channel
 from exo.utils.info_gatherer.info_gatherer import GatheredInfo, InfoGatherer
 from exo.worker.runner.bootstrap import entrypoint

@@ -46,37 +46,36 @@ from exo.worker.runner.bootstrap import entrypoint
 class Tests(BaseModel):
    # list[hostname, ip addr]
    devs: list[list[str]]
-    model_id: str
-    kind: typing.Literal["init", "warmup", "inference"]
+    rdma_devs: list[list[str | None]] | None
+    model_id: ModelId
+    kind: Literal["ring", "rdma", "both"]


-mp.set_start_method("spawn", force=True)
-logger_setup(None)
+iid = InstanceId("im testing here")


 async def main():
    logger.info("starting cool server majig")
-    await assert_downloads()
    cfg = Config()
-    cfg.bind = "0.0.0.0:52415"
+    cfg.bind = "0.0.0.0:52414"
    # nb: shared.logging needs updating if any of this changes
    cfg.accesslog = "-"
    cfg.errorlog = "-"
-    cfg.logger_class = InterceptLogger
+    ev = anyio.Event()
    app = FastAPI()
-    app.post("/ring")(ring_backend)
-    app.post("/jaccl")(jaccl_backend)
-    app.post("/tb_detection")(tb_detection)
-    shutdown = anyio.Event()
+    app.post("/run_test")(run_test)
+    app.post("/kill")(lambda: kill(ev))
+    app.get("/tb_detection")(tb_detection)
+    app.get("/models")(list_models)
    await serve(
        app,  # type: ignore
        cfg,
-        shutdown_trigger=lambda: shutdown.wait(),
+        shutdown_trigger = lambda: ev.wait()
    )
-    await anyio.sleep_forever()
-    # gracefully shutdown the api
-    shutdown.set()

+def kill(ev: anyio.Event):
+    ev.set()
+    return Response(status_code=204)

 async def tb_detection():
    send, recv = channel[GatheredInfo]()
@@ -87,29 +86,19 @@ async def tb_detection():
        return recv.collect()


-async def assert_downloads():
-    sd = exo_shard_downloader()
-    # await sd.ensure_shard(await build_full_shard(MODEL_CARDS["qwen3-0.6b"].model_id))
-    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["llama-3.1-8b-bf16"].model_id)
-    )
-    await sd.ensure_shard(await build_full_shard(MODEL_CARDS["qwen3-30b"].model_id))
-    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["gpt-oss-120b-MXFP4-Q8"].model_id)
-    )
-    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["gpt-oss-20b-4bit"].model_id)
-    )
-    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["glm-4.7-8bit-gs32"].model_id)
-    )
-    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["minimax-m2.1-8bit"].model_id)
-    )
+def list_models():
+    sent = set[str]()
+    for path in EXO_MODELS_DIR.rglob("model-*.safetensors"):
+        if "--" not in path.parent.name:
+            continue
+        name = path.parent.name.replace("--", "/")
+        if name in sent:
+            continue
+        sent.add(name)
+        yield ModelId(path.parent.name.replace("--", "/"))


-async def ring_backend(test: Tests):
-    iid = InstanceId(str(hash(str(test.devs))))
+async def run_test(test: Tests):
    weird_hn = socket.gethostname()
    for dev in test.devs:
        if weird_hn.startswith(dev[0]) or dev[0].startswith(weird_hn):
@@ -117,31 +106,67 @@ async def ring_backend(test: Tests):
            break
    else:
        raise ValueError(f"{weird_hn} not in {test.devs}")
-    return await execute_test(test, ring_instance(test, iid, hn), hn)
+
+    async def run():
+        logger.info(f"testing {test.model_id}")
+
+        instances: list[Instance] = []
+        if test.kind in ["ring", "both"]:
+            i = ring_instance(test, hn)
+            if i is None:
+                yield "no model found"
+                return
+            instances.append(i)
+        if test.kind in ["rdma", "both"]:
+            i = jaccl_instance(test)
+            if i is None:
+                yield "no model found"
+                return
+            instances.append(i)
+
+        for instance in instances:
+            recv = await execute_test(test, instance, hn)
+
+            str_out = ""
+
+            for item in recv:
+                if isinstance(item, ChunkGenerated):
+                    assert isinstance(item.chunk, TokenChunk)
+                    str_out += item.chunk.text
+
+                if isinstance(item, RunnerStatusUpdated) and isinstance(
+                    item.runner_status, (RunnerFailed, RunnerShutdown)
+                ):
+                    yield str_out + "\n"
+                    yield item.model_dump_json() + "\n"
+
+    return StreamingResponse(run())


-def ring_instance(test: Tests, iid: InstanceId, hn: str) -> Instance:
-    hbn = [Host(ip="i dont care", port=52416) for _ in test.devs]
+def ring_instance(test: Tests, hn: str) -> Instance | None:
+    hbn = [Host(ip="198.51.100.0", port=52417) for _ in test.devs]
    world_size = len(test.devs)
    for i in range(world_size):
        if test.devs[i][0] == hn:
            hn = test.devs[i][0]
-            if i - 1 >= 0:
-                hbn[i - 1] = Host(ip=test.devs[i - 1][1], port=52416)
-            if i + 1 < len(test.devs):
-                hbn[i + 1] = Host(ip=test.devs[i + 1][1], port=52416)
-            hbn[i] = Host(ip="0.0.0.0", port=52416)
-            break
+        hbn[(i - 1) % world_size] = Host(ip=test.devs[i - 1][1], port=52417)
+        hbn[(i + 1) % world_size] = Host(ip=test.devs[i + 1][1], port=52417)
+        hbn[i] = Host(ip="0.0.0.0", port=52417)
+        break
    else:
        raise ValueError(f"{hn} not in {test.devs}")

-    card = MODEL_CARDS[test.model_id]
+    card = next(
+        (card for card in MODEL_CARDS.values() if card.model_id == test.model_id), None
+    )
+    if card is None:
+        return None
    instance = MlxRingInstance(
        instance_id=iid,
-        ephemeral_port=52416,
+        ephemeral_port=52417,
        hosts_by_node={NodeId(hn): hbn},
        shard_assignments=ShardAssignments(
-            model_id=ModelId(test.model_id),
+            model_id=test.model_id,
            node_to_runner={NodeId(host[0]): RunnerId(host[0]) for host in test.devs},
            runner_to_shard={
                RunnerId(test.devs[i][0]): PipelineShardMetadata(
@@ -163,119 +188,86 @@ def ring_instance(test: Tests, iid: InstanceId, hn: str) -> Instance:
    return instance


-async def execute_test(test: Tests, instance: Instance, hn: str):
+async def execute_test(test: Tests, instance: Instance, hn: str) -> list[Event]:
    world_size = len(test.devs)
-    iid = InstanceId(str(hash(str(test.devs))))
-    _handle, recv, send = new_runner(instance, hn)
-    if world_size > 1:
-        send.send(ConnectToGroup(instance_id=iid))
-    send.send(LoadModel(instance_id=iid))
-
-    match test.kind:
-        case "init":
-            pass
-        case "warmup":
-            send.send(StartWarmup(instance_id=iid))
-        case "inference":
-            send.send(StartWarmup(instance_id=iid))
-            send.send(
-                ChatCompletion(
-                    task_params=ChatCompletionTaskParams(
-                        model=test.model_id,
-                        messages=[
-                            ChatCompletionMessage(
-                                role="system", content="You are a helpful assistant"
-                            ),
-                            ChatCompletionMessage(
-                                role="user", content="What is the capital of France?"
-                            ),
-                        ],
-                    ),
-                    command_id=CommandId("yo"),
-                    instance_id=iid,
-                )
+    commands: list[Task] = [
+        (LoadModel(instance_id=iid)),
+        (StartWarmup(instance_id=iid)),
+        (
+            ChatCompletion(
+                task_params=ChatCompletionTaskParams(
+                    model=test.model_id,
+                    messages=[
+                        ChatCompletionMessage(
+                            role="system", content="You are a helpful assistant"
+                        ),
+                        ChatCompletionMessage(
+                            role="user", content="What is the capital of France?"
+                        ),
+                    ],
+                    max_tokens=50,
+                ),
+                command_id=CommandId("yo"),
+                instance_id=iid,
            )
+        ),
+        (Shutdown(runner_id=RunnerId(hn), instance_id=iid)),
+    ]
+    if world_size > 1:
+        commands.insert(0, ConnectToGroup(instance_id=iid))
+    bound_instance = BoundInstance(
+        instance=instance, bound_runner_id=RunnerId(hn), bound_node_id=NodeId(hn)
+    )
+    ev_send, _ev_recv = mp_channel[Event]()
+    task_send, task_recv = mp_channel[Task]()

-    send.send(Shutdown(runner_id=RunnerId(hn), instance_id=iid))
+    for command in commands:
+        task_send.send(command)

-    async def map_recv():
-        with recv:
-            try:
-                async for item in recv:
-                    yield item.model_dump_json() + "\n"
-            except anyio.ClosedResourceError:
-                pass
+    entrypoint(
+        bound_instance,
+        ev_send,
+        task_recv,
+        logger,  # type: ignore
+    )

-    ret = StreamingResponse(map_recv())
-    ret._pls_dont_gc = _handle  # type: ignore
-    return ret
+    # TODO(evan): return ev_recv.collect()
+    return []


-async def jaccl_backend(test: Tests):
-    iid = InstanceId(str(hash(str(test.devs))))
-    weird_hn = socket.gethostname()
-    for dev in test.devs:
-        if weird_hn.startswith(dev[0]) or dev[0].startswith(weird_hn):
-            hn = dev[0]
-            break
-    else:
-        raise ValueError(f"{weird_hn} not in {test.devs}")
-    return await execute_test(test, jaccl_instance(test, iid), hn)
-
-
-def jaccl_instance(test: Tests, iid: InstanceId):
-    card = MODEL_CARDS[test.model_id]
+def jaccl_instance(test: Tests) -> MlxJacclInstance | None:
+    card = next(
+        (card for card in MODEL_CARDS.values() if card.model_id == test.model_id), None
+    )
+    if card is None:
+        return None
    world_size = len(test.devs)
+    assert test.rdma_devs

    return MlxJacclInstance(
        instance_id=iid,
-        jaccl_devices=[[None, "rdma_en3"], ["rdma_en3", None]],
+        jaccl_devices=test.rdma_devs,
        # rank 0 is always coordinator
        jaccl_coordinators={
-            NodeId(host[0]): test.devs[0][1] + ":52416" for host in test.devs
+            NodeId(host[0]): test.devs[0][1] + ":52417" for host in test.devs
        },
        shard_assignments=ShardAssignments(
-            model_id=ModelId(test.model_id),
+            model_id=test.model_id,
            node_to_runner={NodeId(host[0]): RunnerId(host[0]) for host in test.devs},
            runner_to_shard={
-                RunnerId(test.devs[i][0]): TensorShardMetadata(
+                RunnerId(host[0]): TensorShardMetadata(
                    model_card=card,
                    device_rank=i,
                    world_size=world_size,
-                    start_layer=card.n_layers,
+                    start_layer=0,
                    end_layer=card.n_layers,
                    n_layers=card.n_layers,
                )
-                for i in range(world_size)
+                for i, host in enumerate(test.devs)
            },
        ),
    )


-def new_runner(
-    instance: Instance,
-    hn: str,
-) -> tuple[mp.Process, MpReceiver[Event], MpSender[Task]]:
-    bound_instance = BoundInstance(
-        instance=instance, bound_runner_id=RunnerId(hn), bound_node_id=NodeId(hn)
-    )
-    ev_send, ev_recv = mp_channel[Event]()
-    task_send, task_recv = mp_channel[Task]()
-
-    runner_process = mp.Process(
-        target=entrypoint,
-        args=(
-            bound_instance,
-            ev_send,
-            task_recv,
-            logger,
-        ),
-    )
-    runner_process._pls_dont_gc = (ev_send, task_recv)  # type: ignore
-    runner_process.start()
-    time.sleep(0.1)
-    return (runner_process, ev_recv, task_send)
-
-
 if __name__ == "__main__":
    anyio.run(main)
--- a/tests/run_distributed_test.sh
+++ b/tests/run_distributed_test.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+[ $# -eq 0 ] && {
+  echo "Usage: $0 host1 [host2 ...]"
+  exit 1
+}
+
+[ -z "$(git status --porcelain)" ] || {
+  echo "Uncommitted changes"
+  exit 1
+}
+commit=$(git rev-parse HEAD)
+git fetch -q origin
+git branch -r --contains "$commit" | grep -qE '^\s*origin/' || {
+  echo "Not pushed to origin"
+  exit 1
+}
+for host; do
+  curl -m 1 -X POST "http://$host:52414/kill" >/dev/null 2>&1 || true &
+done
+wait
+
+echo "Deploying $commit to $# hosts..."
+
+pids=""
+trap 'xargs -r kill 2>/dev/null <<<"$pids" || true' EXIT INT TERM
+colours=($'\e[31m' $'\e[32m' $'\e[33m' $'\e[34m')
+reset=$'\e[0m'
+i=0
+
+for host; do
+  colour=${colours[i++ % 4]}
+  ssh -tt -o BatchMode=yes -o ServerAliveInterval=30 "$host@$host" "/usr/bin/env bash -lc '
+    set -euo pipefail
+    cd exo 
+    git fetch -q origin
+    git checkout -q $commit
+    nix develop -c uv sync
+    .venv/bin/python tests/headless_runner.py
+    '" 2>&1 | sed -u "s/^/${colour}[${host}]${reset}/" &
+  pids+=" $!"
+done
+
+for host; do
+  echo "Waiting for $host..."
+  until curl -sf "http://$host:52414/models"; do sleep 1; done
+done
+
+uv run tests/start_distributed_test.py "$@"
--- a/tests/start_distributed_test.py
+++ b/tests/start_distributed_test.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+import itertools
+import json
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, cast
+from urllib.request import Request, urlopen
+
+if not (args := sys.argv[1:]):
+    sys.exit(
+        f"USAGE: {sys.argv[0]} <kind> [host1] [host2] ...\nkind is optional, and should be rdma or ring"
+    )
+
+kind = args[0] if args[0] in ("rdma", "ring") else "both"
+hosts = args[1:] if kind != "both" else args
+ts = subprocess.run(
+    ["tailscale", "status"], check=True, text=True, capture_output=True
+).stdout.splitlines()
+ip = {sl[1]: sl[0] for line in ts if len(sl := line.split()) >= 2}
+ips = [ip[h] for h in hosts]
+devs = [[h, ip[h]] for h in hosts]
+n = len(hosts)
+
+
+def get_tb(a: str) -> list[dict[str, Any]]:
+    with urlopen(f"http://{a}:52414/tb_detection", timeout=5) as r:  # pyright: ignore[reportAny]
+        return json.loads(r.read())  # pyright: ignore[reportAny]
+
+
+def get_models(a: str) -> set[str]:
+    with urlopen(f"http://{a}:52414/models", timeout=5) as r:  # pyright: ignore[reportAny]
+        return set(json.loads(r.read()))  # pyright: ignore[reportAny]
+
+
+def run(h: str, a: str, body: bytes) -> None:
+    with urlopen(
+        Request(
+            f"http://{a}:52414/run_test",
+            data=body,
+            method="POST",
+            headers={"Content-Type": "application/json"},
+        ),
+        timeout=300,
+    ) as r:  # pyright: ignore[reportAny]
+        for line in r.read().decode(errors="replace").splitlines():  # pyright: ignore[reportAny]
+            print(f"\n{h}@{a}: {line}", flush=True)
+
+
+with ThreadPoolExecutor(n) as exctr:
+    if kind in ("rdma", "both"):
+        payloads = list(exctr.map(get_tb, ips))
+
+        u2e = {
+            ident["domainUuid"]: (i, ident["rdmaInterface"])
+            for i, p in enumerate(payloads)
+            for d in p
+            for ident in cast(
+                list[dict[str, str]],
+                d.get("MacThunderboltIdentifiers", {}).get("idents", []),  # pyright: ignore[reportAny]
+            )
+        }
+        edges = {
+            (u2e[s][0], u2e[t][0]): u2e[t][1]
+            for p in payloads
+            for d in p
+            for c in d.get("MacThunderboltConnections", {}).get("conns", [])  # pyright: ignore[reportAny]
+            if (s := c["sourceUuid"]) in u2e and (t := c["sinkUuid"]) in u2e  # pyright: ignore[reportAny]
+        }
+        rdma_devs = [[edges.get((i, j)) for j in range(n)] for i in range(n)]
+    else:
+        rdma_devs = None
+
+    models = set[str].intersection(*exctr.map(get_models, ips))
+
+    print("\n")
+    print("=" * 70)
+    print(f"Starting test with {models}")
+    print("=" * 70)
+    print("\n")
+    for model in models:
+        body = json.dumps(
+            {"devs": devs, "model_id": model, "rdma_devs": rdma_devs, "kind": kind}
+        ).encode()
+        list(exctr.map(run, hosts, ips, itertools.repeat(body)))
--- a/tests/start_distributed_test.sh
+++ b/tests/start_distributed_test.sh
@@ -1,54 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-query() {
-  tailscale status | awk -v find="$1" '$2 == find { print $1 }'
-}
-
-if [[ $# -lt 2 ]]; then
-  echo "USAGE: $0 <test kind> [host1] [host2] ..."
-  exit 1
-fi
-
-kind=$1
-shift
-
-test_kinds="ring jaccl"
-
-if ! echo "$test_kinds" | grep -q "$kind"; then
-  printf "%s is not a known test kind.\nCurrent test kinds are %s" "$kind" "$test_kinds"
-  exit 1
-fi
-
-hostnames=("$@")
-weaved=()
-ips=()
-for name in "${hostnames[@]}"; do
-  ip=$(query "$name")
-  ips+=("$ip")
-  weaved+=("$name" "$ip")
-done
-
-devs_raw=$(printf '["%s", "%s"], ' "${weaved[@]}")
-devs="[${devs_raw%, }]"
-
-model_ids=("qwen3-30b" "gpt-oss-120b-MXFP4-Q8" "kimi-k2-thinking")
-
-for model_id in "${model_ids[@]}"; do
-  for i in "${!ips[@]}"; do
-    {
-      req="{
-        \"model_id\": \"${model_id}\",
-        \"devs\": ${devs},
-        \"kind\": \"inference\"
-       }"
-      echo "req $req"
-      curl -sN \
-        -X POST "http://${ips[$i]}:52415/${kind}" \
-        -H "Content-Type: application/json" -d "$req" \
-        2>&1 | sed "s/^/\n${hostnames[$i]}@${ips[$i]}: /" || echo "curl to ${hostnames[$i]} failed" && exit 1
-    } &
-  done
-  wait
-done
--- a/tmp/config_examples/opencode.json
+++ b/tmp/config_examples/opencode.json
@@ -0,0 +1,18 @@
+{
+  "$schema": "https://opencode.ai/config.json",
+  "model": "exo/mlx-community/gpt-oss-120b-MXFP4-Q8",
+  "provider": {
+    "exo": {
+      "api": "http://localhost:52415/v1",
+      "models": {
+        "mlx-community/gpt-oss-120b-MXFP4-Q8": {
+          "name": "GPT OSS 120B",
+          "limit": {
+            "context": 32768,
+            "output": 8192
+          }
+        }
+      }
+    }
+  }
+}
--- a/tmp/run_exo_on.sh
+++ b/tmp/run_exo_on.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+[ $# -eq 0 ] && {
+  echo "Usage: $0 host1 [host2 ...]"
+  exit 1
+}
+
+[ -z "$(git status --porcelain)" ] || {
+  echo "Uncommitted changes"
+  exit 1
+}
+commit=$(git rev-parse HEAD)
+git fetch -q origin
+git branch -r --contains "$commit" | grep -qE '^\s*origin/' || {
+  echo "Not pushed to origin"
+  exit 1
+}
+echo "Deploying $commit to $# hosts..."
+
+pids=""
+trap 'xargs -r kill 2>/dev/null <<<"$pids" || true' EXIT INT TERM
+colours=($'\e[31m' $'\e[32m' $'\e[33m' $'\e[34m')
+reset=$'\e[0m'
+i=0
+
+for host; do
+  colour=${colours[i++ % 4]}
+  ssh -t -o BatchMode=yes -o ServerAliveInterval=30 "$host@$host" "/usr/bin/env bash -lc '
+    set -euo pipefail
+    cd exo 
+    git fetch -q origin
+    git checkout -q $commit
+    nix develop -c uv sync
+    uv run exo
+    '" 2>&1 | sed -u "s/^/${colour}[${host}]${reset}/" &
+  pids+=" $!"
+done
+wait