Compare commits


1 Commit

Author: Alex Cheema
SHA1: 19a21e9065
Message: feat(dashboard): add light/dark mode toggle with warm parchment palette
Adds a theme system to the EXO dashboard with a "Mission Control, Dawn
Shift" light mode — warm parchment backgrounds (oklch(0.97 0.015 80))
and deep amber/brass accents (oklch(0.50 0.14 65)) that feel premium
rather than cold.

Changes:
- dashboard/src/lib/stores/theme.svelte.ts: new Svelte 5 rune store,
  persists choice to localStorage under 'exo-theme'
- dashboard/src/app.html: FOUC prevention — html starts as class="dark",
  inline script reads localStorage and switches to class="light" before
  first paint
- dashboard/src/routes/+layout.svelte: calls theme.init() on mount to
  sync rune state with the DOM class
- dashboard/src/lib/components/HeaderNav.svelte: sun/moon toggle button
  in the right nav area
- dashboard/src/app.css: full html.light palette + utility overrides
  (scrollbar, logo filter, graph links, scanlines, etc.)

No new npm dependencies — avoids mode-watcher entirely.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Date: 2026-02-18 12:36:51 -08:00
48 changed files with 1113 additions and 2370 deletions

View File

@@ -200,7 +200,7 @@ class Module(dict):
) -> mx.MX_ARRAY_TREE: # -> dict[Any, Any | dict[Any, Any | dict[Any, Any] | list[Any]] | dict[Any, Any] | list[Any]]:
"""Return the submodules that do not contain other modules."""
def update(self, parameters: dict[str, Any], strict: bool = ...) -> Module:
def update(self, parameters: dict, strict: bool = ...) -> Module:
"""Replace the parameters of this Module with the provided ones in the
dict of dicts and lists.

View File

@@ -7,10 +7,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from mlx.core import MX_ARRAY_TREE
def tree_map(
fn: Callable[..., Any],
tree: Any,
*rest: Any,
is_leaf: Callable[..., bool] | None = ...,
fn: Callable, tree: Any, *rest: Any, is_leaf: Optional[Callable] = ...
) -> Any:
"""Applies ``fn`` to the leaves of the Python tree ``tree`` and
returns a new collection with the results.
@@ -47,11 +44,11 @@ def tree_map(
"""
def tree_map_with_path(
fn: Callable[..., Any],
fn: Callable,
tree: Any,
*rest: Any,
is_leaf: Callable[..., bool] | None = ...,
path: str | None = ...,
is_leaf: Optional[Callable] = ...,
path: Optional[Any] = ...,
) -> Any:
"""Applies ``fn`` to the path and leaves of the Python tree ``tree`` and
returns a new collection with the results.
@@ -83,9 +80,9 @@ def tree_map_with_path(
def tree_flatten(
tree: Any,
prefix: str = ...,
is_leaf: Callable[..., bool] | None = ...,
destination: list[tuple[str, Any]] | dict[str, Any] | None = ...,
) -> list[tuple[str, Any]] | dict[str, Any]:
is_leaf: Optional[Callable] = ...,
destination: Optional[Union[List[Tuple[str, Any]], Dict[str, Any]]] = ...,
) -> Union[List[Tuple[str, Any]], Dict[str, Any]]:
"""Flattens a Python tree to a list of key, value tuples.
The keys are using the dot notation to define trees of arbitrary depth and
@@ -121,7 +118,7 @@ def tree_flatten(
the Python tree.
"""
def tree_unflatten(tree: list[tuple[str, Any]] | dict[str, Any]) -> Any:
def tree_unflatten(tree: Union[List[Tuple[str, Any]], Dict[str, Any]]) -> Any:
"""Recreate a Python tree from its flat representation.
.. code-block:: python

Cargo.lock (generated, 13 changed lines)
View File

@@ -890,7 +890,7 @@ dependencies = [
"delegate",
"env_logger",
"extend",
"futures-lite",
"futures",
"libp2p",
"log",
"networking",
@@ -914,12 +914,6 @@ dependencies = [
"syn 2.0.111",
]
[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "ff"
version = "0.13.1"
@@ -1028,10 +1022,7 @@ version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad"
dependencies = [
"fastrand",
"futures-core",
"futures-io",
"parking",
"pin-project-lite",
]
@@ -2762,7 +2753,7 @@ dependencies = [
"delegate",
"either",
"extend",
"futures-lite",
"futures",
"futures-timer",
"keccak-const",
"libp2p",

View File

@@ -29,13 +29,14 @@ util = { path = "rust/util" }
# Macro dependencies
extend = "1.2"
delegate = "0.13"
pin-project = "1"
# Utility dependencies
keccak-const = "0.2"
# Async dependencies
tokio = "1.46"
futures-lite = "2.6.1"
futures = "0.3"
futures-timer = "3.0"
# Data structures

View File

File diff suppressed because it is too large.

View File

@@ -1,47 +1,29 @@
# type: ignore
#!/usr/bin/env python3
"""Tool-calling eval for exo's OpenAI-compatible API.
Tests whether models correctly:
- Trigger tool calls when appropriate
- Return valid JSON arguments matching function schemas
- Handle multi-turn tool use (call -> result -> final answer)
- Avoid calling tools when unnecessary
Start exo with a model first, then run:
uv run python tool_call_eval.py --model <model-id>
uv run python tool_call_eval.py --model <model-id> --host 10.0.0.5 --port 52415
uv run python tool_call_eval.py --model <model-id> --repeat 3
uv run python tool_call_eval.py --model <model-id> --scenarios weather_simple calculator_multi_turn
"""
# pyright: reportAny=false, reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false
from __future__ import annotations
import argparse
import contextlib
import http.client
import itertools
import json
import os
import sys
import time
from collections.abc import Callable
from pathlib import Path
from statistics import mean
from typing import Any
from urllib.parse import urlencode
from harness import (
ExoClient,
ExoHttpError,
add_common_instance_args,
instance_id_from_instance,
nodes_used_in_instance,
resolve_model_short_id,
settle_and_fetch_placements,
wait_for_instance_gone,
wait_for_instance_ready,
)
from loguru import logger
from transformers import AutoTokenizer
# Backoff constants for cluster settling retry
_SETTLE_INITIAL_BACKOFF_S = 1.0
_SETTLE_MAX_BACKOFF_S = 60.0
_SETTLE_BACKOFF_MULTIPLIER = 2.0
# Monkey-patch for transformers 5.x compatibility
# Kimi's tokenization_kimi.py imports bytes_to_unicode from the old location
# which was moved in transformers 5.0.0rc2
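# Illustrative sketch only (not part of this diff): the monkey-patch elided by the
# hunk jump above presumably re-exports bytes_to_unicode at the old import path used
# by Kimi's tokenization_kimi.py. The target module path below is an assumption; the
# byte-to-unicode table itself is the standard GPT-2 mapping.
def _bytes_to_unicode_sketch() -> dict[int, str]:
    # Map every byte value to a printable unicode character (GPT-2 style).
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("\xa1"), ord("\xac") + 1))
        + list(range(ord("\xae"), ord("\xff") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))

# import transformers.models.gpt2.tokenization_gpt2 as _gpt2_tok  # assumed old location
# if not hasattr(_gpt2_tok, "bytes_to_unicode"):
#     _gpt2_tok.bytes_to_unicode = _bytes_to_unicode_sketch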
@@ -121,6 +103,154 @@ def load_tokenizer_for_bench(model_id: str) -> Any:
return AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
class ExoHttpError(RuntimeError):
def __init__(self, status: int, reason: str, body_preview: str):
super().__init__(f"HTTP {status} {reason}: {body_preview}")
self.status = status
class ExoClient:
def __init__(self, host: str, port: int, timeout_s: float = 7200.0):
self.host = host
self.port = port
self.timeout_s = timeout_s
def request_json(
self,
method: str,
path: str,
params: dict[str, Any] | None = None,
body: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
) -> Any:
if not path.startswith("/"):
path = "/" + path
if params:
path = path + "?" + urlencode(params)
conn = http.client.HTTPConnection(self.host, self.port, timeout=self.timeout_s)
try:
payload: bytes | None = None
hdrs: dict[str, str] = {"Accept": "application/json"}
if body is not None:
payload = json.dumps(body).encode("utf-8")
hdrs["Content-Type"] = "application/json"
if headers:
hdrs.update(headers)
conn.request(method.upper(), path, body=payload, headers=hdrs)
resp = conn.getresponse()
raw = resp.read()
text = raw.decode("utf-8", errors="replace") if raw else ""
if resp.status >= 400:
raise ExoHttpError(resp.status, resp.reason, text[:300])
if not text:
return None
return json.loads(text)
finally:
conn.close()
def post_bench_chat_completions(self, payload: dict[str, Any]) -> dict[str, Any]:
return self.request_json("POST", "/bench/chat/completions", body=payload)
def unwrap_instance(instance: dict[str, Any]) -> dict[str, Any]:
if len(instance) != 1:
raise KeyError(f"Expected 1 key, got keys={list(instance.keys())}")
tag = next(iter(instance))
inner = instance[tag]
if not isinstance(inner, dict):
raise TypeError(f"payload for {tag} must be dict, got {type(inner)}")
return inner
def instance_id_from_instance(instance: dict[str, Any]) -> str:
inner = unwrap_instance(instance)
return str(inner["instanceId"])
def nodes_used_in_instance(instance: dict[str, Any]) -> int:
inner = unwrap_instance(instance)
return len(inner["shardAssignments"]["nodeToRunner"])
def runner_ids_from_instance(instance: dict[str, Any]) -> list[str]:
inner = unwrap_instance(instance)
runner_to_shard = inner["shardAssignments"]["runnerToShard"]
return list(runner_to_shard.keys())
def runner_ready(runner: dict[str, Any]) -> bool:
return "RunnerReady" in runner
def runner_failed(runner: dict[str, Any]) -> bool:
return "RunnerFailed" in runner
def get_runner_failed_message(runner: dict[str, Any]) -> str | None:
if "RunnerFailed" in runner:
return runner["RunnerFailed"].get("errorMessage")
return None
def wait_for_instance_ready(
client: ExoClient, instance_id: str, timeout: float = 24000.0
) -> None:
start_time = time.time()
instance_existed = False
while time.time() - start_time < timeout:
state = client.request_json("GET", "/state")
instances = state.get("instances", {})
if instance_id not in instances:
if instance_existed:
# Instance was deleted after being created - likely due to runner failure
raise RuntimeError(
f"Instance {instance_id} was deleted (runner may have failed)"
)
time.sleep(0.1)
continue
instance_existed = True
instance = instances[instance_id]
runner_ids = runner_ids_from_instance(instance)
runners = state.get("runners", {})
# Check for failed runners first
for rid in runner_ids:
runner = runners.get(rid, {})
if runner_failed(runner):
error_msg = get_runner_failed_message(runner) or "Unknown error"
raise RuntimeError(f"Runner {rid} failed: {error_msg}")
if all(runner_ready(runners.get(rid, {})) for rid in runner_ids):
return
time.sleep(0.1)
raise TimeoutError(f"Instance {instance_id} did not become ready within {timeout=}")
def wait_for_instance_gone(
client: ExoClient, instance_id: str, timeout: float = 3.0
) -> None:
start_time = time.time()
while time.time() - start_time < timeout:
try:
client.request_json("GET", f"/instance/{instance_id}")
time.sleep(0.4)
except ExoHttpError as e:
if e.status == 404:
return
raise TimeoutError(f"Instance {instance_id} did not get deleted within {timeout=}")
def format_peak_memory(b: float) -> str:
for unit in ["B", "KB", "MB", "GB", "TB"]:
if b < 1024.0:
@@ -139,6 +269,184 @@ def parse_int_list(values: list[str]) -> list[int]:
return items
def resolve_model_short_id(client: ExoClient, model_arg: str) -> tuple[str, str]:
models = client.request_json("GET", "/models") or {}
data = models.get("data") or []
for m in data:
if m.get("name").lower() == model_arg.lower():
short_id = str(m["name"])
full_id = str(m.get("hugging_face_id") or m["name"])
return short_id, full_id
for m in data:
if m.get("hugging_face_id") == model_arg:
short_id = str(m["name"])
full_id = str(m["hugging_face_id"])
return short_id, full_id
raise ValueError(f"Model not found in /models: {model_arg}")
def run_planning_phase(
client: ExoClient,
full_model_id: str,
preview: dict[str, Any],
danger_delete: bool,
timeout: float,
settle_deadline: float | None,
) -> None:
"""Check disk space and ensure model is downloaded before benchmarking."""
# Get model size from /models
models = client.request_json("GET", "/models") or {}
model_bytes = 0
for m in models.get("data", []):
if m.get("hugging_face_id") == full_model_id:
model_bytes = m.get("storage_size_megabytes", 0) * 1024 * 1024
break
if not model_bytes:
logger.warning(
f"Could not determine size for {full_model_id}, skipping disk check"
)
return
# Get nodes from preview
inner = unwrap_instance(preview["instance"])
node_ids = list(inner["shardAssignments"]["nodeToRunner"].keys())
runner_to_shard = inner["shardAssignments"]["runnerToShard"]
state = client.request_json("GET", "/state")
downloads = state.get("downloads", {})
node_disk = state.get("nodeDisk", {})
for node_id in node_ids:
node_downloads = downloads.get(node_id, [])
# Check if model already downloaded on this node
already_downloaded = any(
"DownloadCompleted" in p
and unwrap_instance(p["DownloadCompleted"]["shardMetadata"])["modelCard"][
"modelId"
]
== full_model_id
for p in node_downloads
)
if already_downloaded:
continue
# Wait for disk info if settle_deadline is set
disk_info = node_disk.get(node_id, {})
backoff = _SETTLE_INITIAL_BACKOFF_S
while not disk_info and settle_deadline and time.monotonic() < settle_deadline:
remaining = settle_deadline - time.monotonic()
logger.info(
f"Waiting for disk info on {node_id} ({remaining:.0f}s remaining)..."
)
time.sleep(min(backoff, remaining))
backoff = min(backoff * _SETTLE_BACKOFF_MULTIPLIER, _SETTLE_MAX_BACKOFF_S)
state = client.request_json("GET", "/state")
node_disk = state.get("nodeDisk", {})
disk_info = node_disk.get(node_id, {})
if not disk_info:
logger.warning(f"No disk info for {node_id}, skipping space check")
continue
avail = disk_info.get("available", {}).get("inBytes", 0)
if avail >= model_bytes:
continue
if not danger_delete:
raise RuntimeError(
f"Insufficient disk on {node_id}: need {model_bytes // (1024**3)}GB, "
f"have {avail // (1024**3)}GB. Use --danger-delete-downloads to free space."
)
# Delete from smallest to largest
completed = [
(
unwrap_instance(p["DownloadCompleted"]["shardMetadata"])["modelCard"][
"modelId"
],
p["DownloadCompleted"]["totalBytes"]["inBytes"],
)
for p in node_downloads
if "DownloadCompleted" in p
]
for del_model, size in sorted(completed, key=lambda x: x[1]):
logger.info(f"Deleting {del_model} from {node_id} ({size // (1024**2)}MB)")
client.request_json("DELETE", f"/download/{node_id}/{del_model}")
avail += size
if avail >= model_bytes:
break
if avail < model_bytes:
raise RuntimeError(f"Could not free enough space on {node_id}")
# Start downloads (idempotent)
for node_id in node_ids:
runner_id = inner["shardAssignments"]["nodeToRunner"][node_id]
shard = runner_to_shard[runner_id]
client.request_json(
"POST",
"/download/start",
body={
"targetNodeId": node_id,
"shardMetadata": shard,
},
)
logger.info(f"Started download on {node_id}")
# Wait for downloads
start = time.time()
while time.time() - start < timeout:
state = client.request_json("GET", "/state")
downloads = state.get("downloads", {})
all_done = True
for node_id in node_ids:
done = any(
"DownloadCompleted" in p
and unwrap_instance(p["DownloadCompleted"]["shardMetadata"])[
"modelCard"
]["modelId"]
== full_model_id
for p in downloads.get(node_id, [])
)
failed = [
p["DownloadFailed"]["errorMessage"]
for p in downloads.get(node_id, [])
if "DownloadFailed" in p
and unwrap_instance(p["DownloadFailed"]["shardMetadata"])["modelCard"][
"modelId"
]
== full_model_id
]
if failed:
raise RuntimeError(f"Download failed on {node_id}: {failed[0]}")
if not done:
all_done = False
if all_done:
return
time.sleep(1)
raise TimeoutError("Downloads did not complete in time")
def placement_filter(instance_meta: str, wanted: str) -> bool:
s = (instance_meta or "").lower()
if wanted == "both":
return ("ring" in s) or ("jaccl" in s)
return wanted in s
def sharding_filter(sharding: str, wanted: str) -> bool:
s = (sharding or "").lower()
if wanted == "both":
return ("pipeline" in s) or ("tensor" in s)
return wanted in s
def run_one_completion(
client: ExoClient, model_id: str, pp_hint: int, tg: int, prompt_sizer: PromptSizer
) -> tuple[dict[str, Any], int]:
@@ -230,12 +538,76 @@ class PromptSizer:
return content, tok
def fetch_and_filter_placements(
client: ExoClient, full_model_id: str, args: argparse.Namespace
) -> list[dict[str, Any]]:
previews_resp = client.request_json(
"GET", "/instance/previews", params={"model_id": full_model_id}
)
previews = previews_resp.get("previews") or []
selected: list[dict[str, Any]] = []
for p in previews:
if p.get("error") is not None:
continue
if not placement_filter(str(p.get("instance_meta", "")), args.instance_meta):
continue
if not sharding_filter(str(p.get("sharding", "")), args.sharding):
continue
instance = p.get("instance")
if not isinstance(instance, dict):
continue
n = nodes_used_in_instance(instance)
# Skip single-node tensor/jaccl placements: on one node they are redundant with the pipeline-ring placement
if n == 1 and (
(args.sharding == "both" and "tensor" in p.get("sharding", "").lower())
or (
args.instance_meta == "both"
and "jaccl" in p.get("instance_meta", "").lower()
)
):
continue
if (
args.skip_pipeline_jaccl
and (
args.instance_meta == "both"
and "jaccl" in p.get("instance_meta", "").lower()
)
and (
args.sharding == "both" and "pipeline" in p.get("sharding", "").lower()
)
):
continue
if (
args.skip_tensor_ring
and (
args.instance_meta == "both"
and "ring" in p.get("instance_meta", "").lower()
)
and (args.sharding == "both" and "tensor" in p.get("sharding", "").lower())
):
continue
if args.min_nodes <= n <= args.max_nodes:
selected.append(p)
return selected
def main() -> int:
ap = argparse.ArgumentParser(
prog="exo-bench",
description="Benchmark exo model throughput across placement previews.",
)
add_common_instance_args(ap)
ap.add_argument("--host", default=os.environ.get("EXO_HOST", "localhost"))
ap.add_argument(
"--port", type=int, default=int(os.environ.get("EXO_PORT", "52415"))
)
ap.add_argument("--model", required=True, help="Model short id or huggingface id")
ap.add_argument(
"--pp",
nargs="+",
@@ -248,6 +620,34 @@ def main() -> int:
required=True,
help="Generation lengths (ints). Accepts commas.",
)
ap.add_argument(
"--max-nodes",
type=int,
default=4,
help="Only consider placements using <= this many nodes.",
)
ap.add_argument(
"--min-nodes",
type=int,
default=1,
help="Only consider placements using >= this many nodes.",
)
ap.add_argument(
"--instance-meta", choices=["ring", "jaccl", "both"], default="both"
)
ap.add_argument(
"--sharding", choices=["pipeline", "tensor", "both"], default="both"
)
ap.add_argument(
"--skip-pipeline-jaccl",
action="store_true",
help="Skip pipeline+jaccl placements, as it's often pointless.",
)
ap.add_argument(
"--skip-tensor-ring",
action="store_true",
help="Skip tensor+ring placements, as it's so slow.",
)
ap.add_argument(
"--repeat", type=int, default=1, help="Repetitions per (pp,tg) pair."
)
@@ -257,6 +657,9 @@ def main() -> int:
default=0,
help="Warmup runs per placement (uses first pp/tg).",
)
ap.add_argument(
"--timeout", type=float, default=7200.0, help="HTTP timeout (seconds)."
)
ap.add_argument(
"--json-out",
default="bench/results.json",
@@ -271,6 +674,17 @@ def main() -> int:
action="store_true",
help="Force all pp×tg combinations (cartesian product) even when lists have equal length.",
)
ap.add_argument(
"--settle-timeout",
type=float,
default=0,
help="Max seconds to wait for the cluster to produce valid placements (0 = try once).",
)
ap.add_argument(
"--danger-delete-downloads",
action="store_true",
help="Delete existing models from smallest to largest to make room for benchmark model.",
)
args = ap.parse_args()
pp_list = parse_int_list(args.pp)
@@ -305,10 +719,24 @@ def main() -> int:
logger.error("[exo-bench] tokenizer usable but prompt sizing failed")
raise
selected = settle_and_fetch_placements(
client, full_model_id, args, settle_timeout=args.settle_timeout
settle_deadline = (
time.monotonic() + args.settle_timeout if args.settle_timeout > 0 else None
)
selected = fetch_and_filter_placements(client, full_model_id, args)
if not selected and settle_deadline:
backoff = _SETTLE_INITIAL_BACKOFF_S
while not selected and time.monotonic() < settle_deadline:
remaining = settle_deadline - time.monotonic()
logger.warning(
f"No valid placements yet (cluster may still be settling). "
f"Retrying in {backoff:.1f}s ({remaining:.0f}s remaining)..."
)
time.sleep(min(backoff, remaining))
backoff = min(backoff * _SETTLE_BACKOFF_MULTIPLIER, _SETTLE_MAX_BACKOFF_S)
selected = fetch_and_filter_placements(client, full_model_id, args)
if not selected:
logger.error("No valid placements matched your filters.")
return 1
@@ -332,6 +760,16 @@ def main() -> int:
if args.dry_run:
return 0
logger.info("Planning phase: checking downloads...")
run_planning_phase(
client,
full_model_id,
selected[0],
args.danger_delete_downloads,
args.timeout,
settle_deadline,
)
all_rows: list[dict[str, Any]] = []
for preview in selected:

View File

@@ -1,327 +0,0 @@
# type: ignore
from __future__ import annotations
import argparse
import http.client
import json
import os
import time
from typing import Any
from urllib.parse import urlencode
from loguru import logger
_SETTLE_INITIAL_BACKOFF_S = 1.0
_SETTLE_MAX_BACKOFF_S = 60.0
_SETTLE_BACKOFF_MULTIPLIER = 2.0
class ExoHttpError(RuntimeError):
def __init__(self, status: int, reason: str, body_preview: str):
super().__init__(f"HTTP {status} {reason}: {body_preview}")
self.status = status
class ExoClient:
def __init__(self, host: str, port: int, timeout_s: float = 7200.0):
self.host = host
self.port = port
self.timeout_s = timeout_s
def request_json(
self,
method: str,
path: str,
params: dict[str, Any] | None = None,
body: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
) -> Any:
if not path.startswith("/"):
path = "/" + path
if params:
path = path + "?" + urlencode(params)
conn = http.client.HTTPConnection(self.host, self.port, timeout=self.timeout_s)
try:
payload: bytes | None = None
hdrs: dict[str, str] = {"Accept": "application/json"}
if body is not None:
payload = json.dumps(body).encode("utf-8")
hdrs["Content-Type"] = "application/json"
if headers:
hdrs.update(headers)
conn.request(method.upper(), path, body=payload, headers=hdrs)
resp = conn.getresponse()
raw = resp.read()
text = raw.decode("utf-8", errors="replace") if raw else ""
if resp.status >= 400:
raise ExoHttpError(resp.status, resp.reason, text[:300])
if not text:
return None
return json.loads(text)
finally:
conn.close()
def post_bench_chat_completions(self, payload: dict[str, Any]) -> dict[str, Any]:
return self.request_json("POST", "/bench/chat/completions", body=payload)
def unwrap_instance(instance: dict[str, Any]) -> dict[str, Any]:
if len(instance) != 1:
raise KeyError(f"Expected 1 key, got keys={list(instance.keys())}")
tag = next(iter(instance))
inner = instance[tag]
if not isinstance(inner, dict):
raise TypeError(f"payload for {tag} must be dict, got {type(inner)}")
return inner
def instance_id_from_instance(instance: dict[str, Any]) -> str:
inner = unwrap_instance(instance)
return str(inner["instanceId"])
def nodes_used_in_instance(instance: dict[str, Any]) -> int:
inner = unwrap_instance(instance)
return len(inner["shardAssignments"]["nodeToRunner"])
def runner_ids_from_instance(instance: dict[str, Any]) -> list[str]:
inner = unwrap_instance(instance)
runner_to_shard = inner["shardAssignments"]["runnerToShard"]
return list(runner_to_shard.keys())
def runner_ready(runner: dict[str, Any]) -> bool:
return "RunnerReady" in runner
def runner_failed(runner: dict[str, Any]) -> bool:
return "RunnerFailed" in runner
def get_runner_failed_message(runner: dict[str, Any]) -> str | None:
if "RunnerFailed" in runner:
return runner["RunnerFailed"].get("errorMessage")
return None
def wait_for_instance_ready(
client: ExoClient, instance_id: str, timeout: float = 24000.0
) -> None:
start_time = time.time()
instance_existed = False
while time.time() - start_time < timeout:
state = client.request_json("GET", "/state")
instances = state.get("instances", {})
if instance_id not in instances:
if instance_existed:
# Instance was deleted after being created - likely due to runner failure
raise RuntimeError(
f"Instance {instance_id} was deleted (runner may have failed)"
)
time.sleep(0.1)
continue
instance_existed = True
instance = instances[instance_id]
runner_ids = runner_ids_from_instance(instance)
runners = state.get("runners", {})
# Check for failed runners first
for rid in runner_ids:
runner = runners.get(rid, {})
if runner_failed(runner):
error_msg = get_runner_failed_message(runner) or "Unknown error"
raise RuntimeError(f"Runner {rid} failed: {error_msg}")
if all(runner_ready(runners.get(rid, {})) for rid in runner_ids):
return
time.sleep(0.1)
raise TimeoutError(f"Instance {instance_id} did not become ready within {timeout=}")
def wait_for_instance_gone(
client: ExoClient, instance_id: str, timeout: float = 3.0
) -> None:
start_time = time.time()
while time.time() - start_time < timeout:
try:
client.request_json("GET", f"/instance/{instance_id}")
time.sleep(0.4)
except ExoHttpError as e:
if e.status == 404:
return
raise
raise TimeoutError(f"Instance {instance_id} did not get deleted within {timeout=}")
def resolve_model_short_id(client: ExoClient, model_arg: str) -> tuple[str, str]:
models = client.request_json("GET", "/models") or {}
data = models.get("data") or []
for m in data:
if (m.get("name") or "").lower() == model_arg.lower():
short_id = str(m["name"])
full_id = str(m.get("hugging_face_id") or m["name"])
return short_id, full_id
for m in data:
if m.get("hugging_face_id") == model_arg:
short_id = str(m["name"])
full_id = str(m["hugging_face_id"])
return short_id, full_id
raise ValueError(f"Model not found in /models: {model_arg}")
def placement_filter(instance_meta: str, wanted: str) -> bool:
s = (instance_meta or "").lower()
if wanted == "both":
return ("ring" in s) or ("jaccl" in s)
return wanted in s
def sharding_filter(sharding: str, wanted: str) -> bool:
s = (sharding or "").lower()
if wanted == "both":
return ("pipeline" in s) or ("tensor" in s)
return wanted in s
def fetch_and_filter_placements(
client: ExoClient, full_model_id: str, args: argparse.Namespace
) -> list[dict[str, Any]]:
previews_resp = client.request_json(
"GET", "/instance/previews", params={"model_id": full_model_id}
)
previews = previews_resp.get("previews") or []
selected: list[dict[str, Any]] = []
for p in previews:
if p.get("error") is not None:
continue
if not placement_filter(str(p.get("instance_meta", "")), args.instance_meta):
continue
if not sharding_filter(str(p.get("sharding", "")), args.sharding):
continue
instance = p.get("instance")
if not isinstance(instance, dict):
continue
n = nodes_used_in_instance(instance)
# Skip single-node tensor/jaccl placements: on one node they are redundant with the pipeline-ring placement
if n == 1 and (
(args.sharding == "both" and "tensor" in p.get("sharding", "").lower())
or (
args.instance_meta == "both"
and "jaccl" in p.get("instance_meta", "").lower()
)
):
continue
if (
args.skip_pipeline_jaccl
and (
args.instance_meta == "both"
and "jaccl" in p.get("instance_meta", "").lower()
)
and (
args.sharding == "both" and "pipeline" in p.get("sharding", "").lower()
)
):
continue
if (
args.skip_tensor_ring
and (
args.instance_meta == "both"
and "ring" in p.get("instance_meta", "").lower()
)
and (args.sharding == "both" and "tensor" in p.get("sharding", "").lower())
):
continue
if args.min_nodes <= n <= args.max_nodes:
selected.append(p)
return selected
def settle_and_fetch_placements(
client: ExoClient,
full_model_id: str,
args: argparse.Namespace,
settle_timeout: float = 0,
) -> list[dict[str, Any]]:
selected = fetch_and_filter_placements(client, full_model_id, args)
if not selected and settle_timeout > 0:
backoff = _SETTLE_INITIAL_BACKOFF_S
deadline = time.monotonic() + settle_timeout
while not selected and time.monotonic() < deadline:
remaining = deadline - time.monotonic()
logger.warning(
f"No valid placements yet (cluster may still be settling). "
f"Retrying in {backoff:.1f}s ({remaining:.0f}s remaining)..."
)
time.sleep(min(backoff, remaining))
backoff = min(backoff * _SETTLE_BACKOFF_MULTIPLIER, _SETTLE_MAX_BACKOFF_S)
selected = fetch_and_filter_placements(client, full_model_id, args)
return selected
def add_common_instance_args(ap: argparse.ArgumentParser) -> None:
ap.add_argument("--host", default=os.environ.get("EXO_HOST", "localhost"))
ap.add_argument(
"--port", type=int, default=int(os.environ.get("EXO_PORT", "52415"))
)
ap.add_argument("--model", required=True, help="Model short id or huggingface id")
ap.add_argument(
"--max-nodes",
type=int,
default=4,
help="Only consider placements using <= this many nodes.",
)
ap.add_argument(
"--min-nodes",
type=int,
default=1,
help="Only consider placements using >= this many nodes.",
)
ap.add_argument(
"--instance-meta", choices=["ring", "jaccl", "both"], default="both"
)
ap.add_argument(
"--sharding", choices=["pipeline", "tensor", "both"], default="both"
)
ap.add_argument(
"--skip-pipeline-jaccl",
action="store_true",
help="Skip pipeline+jaccl placements, as it's often pointless.",
)
ap.add_argument(
"--skip-tensor-ring",
action="store_true",
help="Skip tensor+ring placements, as it's so slow.",
)
ap.add_argument(
"--timeout", type=float, default=7200.0, help="HTTP timeout (seconds)."
)
ap.add_argument(
"--settle-timeout",
type=float,
default=0,
help="Max seconds to wait for the cluster to produce valid placements (0 = try once).",
)

View File

@@ -4,7 +4,6 @@ version = "0.1.0"
description = "Benchmarking tool for exo distributed inference"
requires-python = ">=3.13"
dependencies = [
"httpx>=0.27.0",
"loguru>=0.7.3",
"transformers>=5.0.0",
"huggingface-hub>=0.33.4",

View File

@@ -1,240 +0,0 @@
# Tool definitions — each becomes an OpenAI function tool.
# All scenarios get all tools unless they specify a `tools` list.
[tools.get_current_weather]
description = "Get the current weather in a given location"
required = ["location"]
[tools.get_current_weather.properties.location]
type = "string"
description = "City and state, e.g. San Francisco, CA"
[tools.get_current_weather.properties.unit]
type = "string"
enum = ["celsius", "fahrenheit"]
description = "Temperature unit"
[tools.calculate]
description = "Evaluate a mathematical expression and return the numeric result"
required = ["expression"]
[tools.calculate.properties.expression]
type = "string"
description = "The math expression to evaluate, e.g. '2 + 3 * 4'"
[tools.search_products]
description = "Search for products in a catalog by query, category, and price"
required = ["query"]
[tools.search_products.properties.query]
type = "string"
description = "Search query string"
[tools.search_products.properties.category]
type = "string"
enum = ["electronics", "clothing", "food", "books"]
description = "Product category to filter by"
[tools.search_products.properties.max_price]
type = "number"
description = "Maximum price in USD"
# -- Should call a tool --
[[scenarios]]
name = "weather_simple"
description = "Basic weather query -> get_current_weather"
expect_tool_call = true
expected_function = "get_current_weather"
required_arg_keys = ["location"]
[[scenarios.messages]]
role = "user"
content = "What's the weather like in Tokyo right now?"
[[scenarios]]
name = "calculator_simple"
description = "Math question -> calculate"
expect_tool_call = true
expected_function = "calculate"
required_arg_keys = ["expression"]
[[scenarios.messages]]
role = "user"
content = "Use the calculator to compute 3847 * 926 + 17293"
[[scenarios]]
name = "search_with_filters"
description = "Product search with category and price filter"
expect_tool_call = true
expected_function = "search_products"
required_arg_keys = ["query"]
[[scenarios.messages]]
role = "user"
content = "Find me electronics under $50"
# -- Multi-turn: tool call then follow-up --
[[scenarios]]
name = "weather_multi_turn"
description = "Weather query -> tool result -> natural language summary"
expect_tool_call = true
expected_function = "get_current_weather"
required_arg_keys = ["location"]
[scenarios.tool_result]
temperature = "18C"
condition = "partly cloudy"
humidity = "65%"
wind = "12 km/h NW"
[[scenarios.messages]]
role = "user"
content = "What's the weather in Paris?"
[[scenarios]]
name = "calculator_multi_turn"
description = "Math query -> tool result -> model reports the answer"
expect_tool_call = true
expected_function = "calculate"
required_arg_keys = ["expression"]
[scenarios.tool_result]
result = 491682
[[scenarios.messages]]
role = "user"
content = "Use the calculator to compute 1847 * 263 + 5921"
[[scenarios]]
name = "search_multi_turn"
description = "Search query -> tool result -> model summarizes products"
expect_tool_call = true
expected_function = "search_products"
required_arg_keys = ["query"]
[[scenarios.tool_result.results]]
name = "Hands-On Machine Learning"
price = 45.99
rating = 4.8
[[scenarios.tool_result.results]]
name = "Deep Learning with Python"
price = 39.99
rating = 4.6
[[scenarios.messages]]
role = "user"
content = "Search for books about machine learning"
# -- Sequential tool calls --
[[scenarios]]
name = "chained_tool_calls_same"
description = "Thinking + weather(Tokyo) -> result -> model must call weather(London)"
expect_tool_call = true
expected_function = "get_current_weather"
required_arg_keys = ["location"]
[[scenarios.messages]]
role = "user"
content = "Compare the weather in Tokyo and London."
[[scenarios.messages]]
role = "assistant"
content = "I'll check both cities. Let me start with Tokyo."
[[scenarios.messages.tool_calls]]
id = "call_1"
name = "get_current_weather"
arguments = { location = "Tokyo" }
[[scenarios.messages]]
role = "tool"
tool_call_id = "call_1"
content = '{"temperature": "25C", "condition": "sunny"}'
[[scenarios]]
name = "chained_tool_calls_different"
description = "Thinking + weather(Berlin) -> result -> model must call calculator"
expect_tool_call = true
expected_function = "calculate"
required_arg_keys = ["expression"]
[[scenarios.messages]]
role = "user"
content = "What's the weather in Berlin, and also use the calculator to compute 4819 * 37 + 291."
[[scenarios.messages]]
role = "assistant"
content = "I'll handle both. Let me check Berlin's weather first."
[[scenarios.messages.tool_calls]]
id = "call_2"
name = "get_current_weather"
arguments = { location = "Berlin" }
[[scenarios.messages]]
role = "tool"
tool_call_id = "call_2"
content = '{"temperature": "12C", "condition": "rainy"}'
[[scenarios]]
name = "chained_tool_calls_three"
description = "Two prior thinking+tool calls -> results -> model must make a third"
expect_tool_call = true
expected_function = "get_current_weather"
required_arg_keys = ["location"]
[[scenarios.messages]]
role = "user"
content = "Compare weather in Tokyo, Paris, and London."
[[scenarios.messages]]
role = "assistant"
content = "I'll check all three cities. Starting with Tokyo."
[[scenarios.messages.tool_calls]]
id = "call_3"
name = "get_current_weather"
arguments = { location = "Tokyo" }
[[scenarios.messages]]
role = "tool"
tool_call_id = "call_3"
content = '{"temperature": "25C", "condition": "sunny"}'
[[scenarios.messages]]
role = "assistant"
content = "Got Tokyo. Now checking Paris."
[[scenarios.messages.tool_calls]]
id = "call_4"
name = "get_current_weather"
arguments = { location = "Paris" }
[[scenarios.messages]]
role = "tool"
tool_call_id = "call_4"
content = '{"temperature": "18C", "condition": "cloudy"}'
# -- Should NOT call a tool --
[[scenarios]]
name = "no_tool_joke"
description = "Joke request should NOT trigger any tool"
expect_tool_call = false
[[scenarios.messages]]
role = "user"
content = "Tell me a funny joke about cats."
[[scenarios]]
name = "no_tool_factual"
description = "Factual question answerable from training data"
expect_tool_call = false
[[scenarios.messages]]
role = "user"
content = "What is the capital of Japan?"

View File

@@ -16,9 +16,10 @@
/* Gotham-inspired accent colors */
--exo-grid: oklch(0.25 0 0);
--exo-scanline: oklch(0.15 0 0);
--exo-glow-yellow: 0 0 20px oklch(0.85 0.18 85 / 0.3);
--exo-glow-yellow-strong: 0 0 40px oklch(0.85 0.18 85 / 0.5);
--exo-glow-yellow: oklch(0.85 0.18 85 / 0.3);
--exo-glow-yellow-strong: oklch(0.85 0.18 85 / 0.5);
--exo-bg-hover: oklch(0.18 0 0);
/* Theme Variables */
--radius: 0.375rem;
--background: var(--exo-black);
@@ -41,6 +42,237 @@
--ring: var(--exo-yellow);
}
/* ============================================================
LIGHT THEME — "Mission Control, Dawn Shift"
Warm parchment + deep amber. Applied when <html> has .light class.
============================================================ */
html.light {
/* EXO brand palette — warm amber shift */
--exo-black: oklch(0.97 0.015 80);
--exo-dark-gray: oklch(0.92 0.012 80);
--exo-medium-gray: oklch(0.83 0.009 78);
--exo-light-gray: oklch(0.50 0.018 75);
--exo-yellow: oklch(0.50 0.14 65);
--exo-yellow-darker: oklch(0.40 0.13 65);
--exo-yellow-glow: oklch(0.60 0.14 65);
--exo-grid: oklch(0.88 0.009 80);
--exo-scanline: oklch(0.93 0.010 80);
--exo-glow-yellow: oklch(0.50 0.14 65 / 0.12);
--exo-glow-yellow-strong: oklch(0.50 0.14 65 / 0.22);
--exo-bg-hover: oklch(0.89 0.010 80);
/* Semantic tokens */
--background: oklch(0.97 0.015 80);
--foreground: oklch(0.13 0.015 75);
--card: oklch(0.92 0.012 80);
--card-foreground: oklch(0.13 0.015 75);
--popover: oklch(0.95 0.012 80);
--popover-foreground: oklch(0.13 0.015 75);
--primary: oklch(0.50 0.14 65);
--primary-foreground: oklch(0.97 0.015 80);
--secondary: oklch(0.88 0.008 80);
--secondary-foreground: oklch(0.15 0.012 75);
--muted: oklch(0.90 0.009 80);
--muted-foreground: oklch(0.50 0.018 75);
--accent: oklch(0.88 0.008 80);
--accent-foreground: oklch(0.15 0.012 75);
--destructive: oklch(0.52 0.22 25);
--border: oklch(0.84 0.007 78);
--input: oklch(0.87 0.008 80);
--ring: oklch(0.50 0.14 65);
}
/* ============================================================
LIGHT MODE UTILITY OVERRIDES
============================================================ */
html.light {
& .text-white,
& .text-white\/90,
& .text-white\/80,
& .text-white\/70 {
color: var(--foreground) !important;
}
& .text-white\/60,
& .text-white\/50 {
color: color-mix(in oklch, var(--foreground) 60%, transparent) !important;
}
& .text-white\/40,
& .text-white\/30 {
color: color-mix(in oklch, var(--foreground) 38%, transparent) !important;
}
& .bg-black\/80,
& .bg-black\/60,
& .bg-black\/50,
& .bg-black\/40 {
background-color: oklch(0.90 0.010 80 / 0.7) !important;
}
& [class*="bg-exo-black/"] {
background-color: oklch(0.90 0.010 80 / 0.6) !important;
}
& [class*="shadow-black"] {
--tw-shadow-color: oklch(0.30 0.010 75 / 0.10) !important;
}
& ::-webkit-scrollbar-track {
background: oklch(0.93 0.010 80) !important;
}
& ::-webkit-scrollbar-thumb {
background: oklch(0.76 0.010 78) !important;
}
& ::-webkit-scrollbar-thumb:hover {
background: oklch(0.50 0.14 65 / 0.6) !important;
}
& .command-panel {
background: linear-gradient(
180deg,
oklch(0.94 0.012 80 / 0.96) 0%,
oklch(0.91 0.010 80 / 0.98) 100%
) !important;
border-color: oklch(0.82 0.008 78) !important;
box-shadow:
inset 0 1px 0 oklch(1 0 0 / 0.6),
0 4px 20px oklch(0.30 0.010 75 / 0.08) !important;
}
& .glow-text {
text-shadow:
0 0 12px oklch(0.50 0.14 65 / 0.20),
0 1px 3px oklch(0.30 0.010 75 / 0.12) !important;
}
& .grid-bg {
background-image:
linear-gradient(oklch(0.75 0.008 78 / 0.25) 1px, transparent 1px),
linear-gradient(90deg, oklch(0.75 0.008 78 / 0.25) 1px, transparent 1px) !important;
}
& .scanlines::before {
background: repeating-linear-gradient(
0deg,
transparent,
transparent 2px,
oklch(0.50 0.010 78 / 0.018) 2px,
oklch(0.50 0.010 78 / 0.018) 4px
) !important;
}
& .crt-screen {
background: radial-gradient(
ellipse at center,
oklch(0.95 0.012 80) 0%,
oklch(0.92 0.010 80) 50%,
oklch(0.89 0.009 80) 100%
) !important;
box-shadow:
inset 0 0 60px oklch(0.30 0.010 75 / 0.04),
0 0 30px oklch(0.50 0.14 65 / 0.04) !important;
}
& .graph-link {
stroke: oklch(0.50 0.018 75 / 0.45) !important;
filter: none !important;
}
& .graph-link-active {
stroke: oklch(0.50 0.14 65 / 0.75) !important;
filter: none !important;
}
& .shooting-stars {
display: none !important;
}
& img[alt="EXO"] {
filter: brightness(0) drop-shadow(0 0 6px oklch(0.30 0.010 75 / 0.10)) !important;
}
& .text-red-400 { color: oklch(0.52 0.22 25) !important; }
& .text-green-400 { color: oklch(0.48 0.17 155) !important; }
& .text-blue-200,
& .text-blue-300,
& .text-blue-400 { color: oklch(0.48 0.17 250) !important; }
& .bg-red-500\/10 { background-color: oklch(0.52 0.22 25 / 0.07) !important; }
& .bg-red-500\/20 { background-color: oklch(0.52 0.22 25 / 0.11) !important; }
& .bg-red-500\/30 { background-color: oklch(0.52 0.22 25 / 0.14) !important; }
& textarea,
& input[type="text"] { color: var(--foreground) !important; }
& textarea::placeholder,
& input::placeholder { color: oklch(0.50 0.012 78 / 0.55) !important; }
& .code-block-wrapper,
& .math-display-wrapper {
background: oklch(0.95 0.010 80) !important;
border-color: oklch(0.83 0.007 78) !important;
}
& .code-block-header,
& .math-display-header {
background: oklch(0.91 0.009 80) !important;
border-color: oklch(0.85 0.007 78) !important;
}
& .inline-code {
background: oklch(0.89 0.009 80) !important;
color: oklch(0.20 0.012 75) !important;
}
& blockquote { background: oklch(0.93 0.010 80) !important; }
& th {
background: oklch(0.90 0.009 80) !important;
border-color: oklch(0.80 0.007 78) !important;
}
& td { border-color: oklch(0.84 0.007 78) !important; }
& hr { border-color: oklch(0.84 0.007 78) !important; }
& .hljs { color: oklch(0.22 0.012 75) !important; }
& .hljs-keyword, & .hljs-selector-tag, & .hljs-literal, & .hljs-section, & .hljs-link {
color: oklch(0.45 0.18 300) !important;
}
& .hljs-string, & .hljs-title, & .hljs-name, & .hljs-type,
& .hljs-attribute, & .hljs-symbol, & .hljs-bullet, & .hljs-addition,
& .hljs-variable, & .hljs-template-tag, & .hljs-template-variable {
color: oklch(0.45 0.14 65) !important;
}
& .hljs-comment, & .hljs-quote, & .hljs-deletion, & .hljs-meta {
color: oklch(0.55 0.010 78) !important;
}
& .hljs-number, & .hljs-regexp, & .hljs-built_in {
color: oklch(0.45 0.15 160) !important;
}
& .hljs-function, & .hljs-class .hljs-title {
color: oklch(0.42 0.17 240) !important;
}
& .katex, & .katex .mord, & .katex .minner, & .katex .mop,
& .katex .mbin, & .katex .mrel, & .katex .mpunct {
color: oklch(0.15 0.012 75) !important;
}
& .katex .frac-line, & .katex .overline-line, & .katex .underline-line,
& .katex .hline, & .katex .rule {
border-color: oklch(0.25 0.012 75) !important;
background: oklch(0.25 0.012 75) !important;
}
& .katex svg { fill: oklch(0.25 0.012 75) !important; stroke: oklch(0.25 0.012 75) !important; }
& .katex svg path { stroke: oklch(0.25 0.012 75) !important; }
& .katex .mopen, & .katex .mclose,
& .katex .delimsizing, & [class^="katex .delim-size"] {
color: oklch(0.35 0.012 75) !important;
}
& .latex-proof { background: oklch(0.96 0.010 80) !important; border-left-color: oklch(0.72 0.010 78) !important; }
& .latex-proof-header { color: oklch(0.22 0.012 75) !important; }
& .latex-proof-content { color: oklch(0.15 0.012 75) !important; }
& .latex-proof-content::after { color: oklch(0.48 0.012 75) !important; }
& .latex-theorem { background: oklch(0.94 0.010 80) !important; border-color: oklch(0.80 0.008 78) !important; }
& .latex-diagram-placeholder {
background: oklch(0.96 0.010 80) !important;
border-color: oklch(0.80 0.008 78) !important;
color: oklch(0.38 0.012 75) !important;
}
}
@theme inline {
--radius-sm: calc(var(--radius) - 2px);
--radius-md: var(--radius);

View File

@@ -1,7 +1,15 @@
<!doctype html>
<html lang="en">
<html lang="en" class="dark">
<head>
<meta charset="utf-8" />
<script>
try {
if (localStorage.getItem('exo-theme') === 'light') {
document.documentElement.classList.remove('dark');
document.documentElement.classList.add('light');
}
} catch (_) {}
</script>
<link rel="icon" href="%sveltekit.assets%/favicon.ico" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>EXO</title>

View File

@@ -1,5 +1,6 @@
<script lang="ts">
import { browser } from "$app/environment";
import { theme } from "$lib/stores/theme.svelte";
export let showHome = true;
export let onHome: (() => void) | null = null;
@@ -79,10 +80,48 @@
/>
</button>
<!-- Right: Home + Downloads -->
<!-- Right: Theme toggle + Home + Downloads -->
<div
class="absolute right-6 top-1/2 -translate-y-1/2 flex items-center gap-4"
>
<button
onclick={() => theme.toggle()}
class="p-2 rounded border border-exo-medium-gray/40 hover:border-exo-yellow/50 transition-colors cursor-pointer"
title={theme.isLight ? "Switch to dark mode" : "Switch to light mode"}
aria-label={theme.isLight
? "Switch to dark mode"
: "Switch to light mode"}
>
{#if theme.isLight}
<svg
class="w-4 h-4 text-exo-light-gray"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M21 12.79A9 9 0 1111.21 3a7 7 0 009.79 9.79z"
/>
</svg>
{:else}
<svg
class="w-4 h-4 text-exo-light-gray"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<circle cx="12" cy="12" r="5" />
<path
stroke-linecap="round"
d="M12 1v2m0 18v2M4.22 4.22l1.42 1.42m12.72 12.72l1.42 1.42M1 12h2m18 0h2M4.22 19.78l1.42-1.42M18.36 5.64l1.42-1.42"
/>
</svg>
{/if}
</button>
{#if showHome}
<button
onclick={handleHome}

View File

@@ -0,0 +1,28 @@
import { browser } from "$app/environment";
let _isLight = $state(false);
export const theme = {
get isLight() {
return _isLight;
},
init() {
if (!browser) return;
_isLight = document.documentElement.classList.contains("light");
},
toggle() {
if (!browser) return;
_isLight = !_isLight;
if (_isLight) {
document.documentElement.classList.remove("dark");
document.documentElement.classList.add("light");
localStorage.setItem("exo-theme", "light");
} else {
document.documentElement.classList.remove("light");
document.documentElement.classList.add("dark");
localStorage.setItem("exo-theme", "dark");
}
},
};

View File

@@ -1,7 +1,13 @@
<script lang="ts">
import "../app.css";
import { onMount } from "svelte";
import { theme } from "$lib/stores/theme.svelte";
let { children } = $props();
onMount(() => {
theme.init();
});
</script>
<svelte:head>

View File

@@ -74,6 +74,7 @@
perSystem =
{ config, self', inputs', pkgs, lib, system, ... }:
let
fenixToolchain = inputs'.fenix.packages.complete;
# Use pinned nixpkgs for swift-format (swift is broken on x86_64-linux in newer nixpkgs)
pkgsSwift = import inputs.nixpkgs-swift { inherit system; };
in

View File

@@ -158,7 +158,6 @@
exo-test-env = testVenv;
} // {
exo-bench = mkBenchScript "exo-bench" (inputs.self + /bench/exo_bench.py);
exo-eval-tool-calls = mkBenchScript "exo-eval-tool-calls" (inputs.self + /bench/eval_tool_calls.py);
exo-get-all-models-on-cluster = mkSimplePythonScript "exo-get-all-models-on-cluster" (inputs.self + /tests/get_all_models_on_cluster.py);
};

View File

@@ -1,12 +0,0 @@
model_id = "mlx-community/MiniMax-M2.5-4bit"
n_layers = 62
hidden_size = 3072
supports_tensor = true
tasks = ["TextGeneration"]
family = "minimax"
quantization = "4bit"
base_model = "MiniMax M2.5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 128666664960

View File

@@ -1,12 +0,0 @@
model_id = "mlx-community/MiniMax-M2.5-6bit"
n_layers = 62
hidden_size = 3072
supports_tensor = true
tasks = ["TextGeneration"]
family = "minimax"
quantization = "6bit"
base_model = "MiniMax M2.5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 185826705408

View File

@@ -1,12 +0,0 @@
model_id = "mlx-community/MiniMax-M2.5-8bit"
n_layers = 62
hidden_size = 3072
supports_tensor = true
tasks = ["TextGeneration"]
family = "minimax"
quantization = "8bit"
base_model = "MiniMax M2.5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 242986745856

rust/clippy.toml (new file, 2 lines)
View File

@@ -0,0 +1,2 @@
# we can manually exclude false-positive lint errors for dual packages (if in dependencies)
#allowed-duplicate-crates = ["hashbrown"]

View File

@@ -27,7 +27,7 @@ networking = { workspace = true }
# interop
pyo3 = { version = "0.27.2", features = [
# "abi3-py313", # tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.13
# "nightly", # enables better-supported GIL integration
"nightly", # enables better-supported GIL integration
"experimental-async", # async support in #[pyfunction] & #[pymethods]
#"experimental-inspect", # inspection of generated binary => easier to automate type-hint generation
#"py-clone", # adding Clone-ing of `Py<T>` without GIL (may cause panics - remove if panics happen)
@@ -45,10 +45,11 @@ pyo3-log = "0.13.2"
# macro dependencies
extend = { workspace = true }
delegate = { workspace = true }
pin-project = { workspace = true }
# async runtime
tokio = { workspace = true, features = ["full", "tracing"] }
futures-lite = { workspace = true }
futures = { workspace = true }
# utility dependencies
util = { workspace = true }
@@ -59,4 +60,3 @@ env_logger = "0.11"
# Networking
libp2p = { workspace = true, features = ["full"] }
pin-project = "1.1.10"

View File

@@ -2,6 +2,7 @@
//!
use pin_project::pin_project;
use pyo3::marker::Ungil;
use pyo3::prelude::*;
use std::{
future::Future,
@@ -25,8 +26,8 @@ where
impl<F> Future for AllowThreads<F>
where
F: Future + Send,
F::Output: Send,
F: Future + Ungil,
F::Output: Ungil,
{
type Output = F::Output;

View File

@@ -4,12 +4,25 @@
//!
//!
mod allow_threading;
mod ident;
mod networking;
// enable Rust-unstable features for convenience
#![feature(trait_alias)]
#![feature(tuple_trait)]
#![feature(unboxed_closures)]
// #![feature(stmt_expr_attributes)]
// #![feature(assert_matches)]
// #![feature(async_fn_in_dyn_trait)]
// #![feature(async_for_loop)]
// #![feature(auto_traits)]
// #![feature(negative_impls)]
extern crate core;
mod allow_threading;
pub(crate) mod networking;
pub(crate) mod pylibp2p;
use crate::ident::ident_submodule;
use crate::networking::networking_submodule;
use crate::pylibp2p::ident::ident_submodule;
use crate::pylibp2p::multiaddr::multiaddr_submodule;
use pyo3::prelude::PyModule;
use pyo3::{Bound, PyResult, pyclass, pymodule};
use pyo3_stub_gen::define_stub_info_gatherer;
@@ -19,6 +32,14 @@ pub(crate) mod r#const {
pub const MPSC_CHANNEL_SIZE: usize = 1024;
}
/// Namespace for all the type/trait aliases used by this crate.
pub(crate) mod alias {
use std::marker::Tuple;
pub trait SendFn<Args: Tuple + Send + 'static, Output> =
Fn<Args, Output = Output> + Send + 'static;
}
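// Illustrative note (not part of this diff): with the alias above,
// `impl SendFn<(u32,), String>` is equivalent to
// `impl Fn(u32) -> String + Send + 'static`, so callback bounds can be
// spelled once here instead of being repeated at every use site.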
/// Namespace for crate-wide extension traits/methods
pub(crate) mod ext {
use crate::allow_threading::AllowThreads;
@@ -159,6 +180,7 @@ fn main_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
// work with maturin, where the types generate correctly, in the right folder, without
// too many importing issues...
ident_submodule(m)?;
multiaddr_submodule(m)?;
networking_submodule(m)?;
// top-level constructs

View File

@@ -8,8 +8,8 @@
use crate::r#const::MPSC_CHANNEL_SIZE;
use crate::ext::{ByteArrayExt as _, FutureExt, PyErrExt as _};
use crate::ext::{ResultExt as _, TokioMpscReceiverExt as _, TokioMpscSenderExt as _};
use crate::ident::{PyKeypair, PyPeerId};
use crate::pyclass;
use crate::pylibp2p::ident::{PyKeypair, PyPeerId};
use libp2p::futures::StreamExt as _;
use libp2p::gossipsub;
use libp2p::gossipsub::{IdentTopic, Message, MessageId, PublishError};

View File

@@ -0,0 +1,8 @@
//! A module for exposing Rust's libp2p datatypes over Pyo3
//!
//! TODO: right now we are coupled to libp2p's identity, but eventually we want to create our own
//! independent identity type of some kind or another. This may require handshaking.
//!
pub mod ident;
pub mod multiaddr;

View File

@@ -0,0 +1,81 @@
use crate::ext::ResultExt as _;
use libp2p::Multiaddr;
use pyo3::prelude::{PyBytesMethods as _, PyModule, PyModuleMethods as _};
use pyo3::types::PyBytes;
use pyo3::{Bound, PyResult, Python, pyclass, pymethods};
use pyo3_stub_gen::derive::{gen_stub_pyclass, gen_stub_pymethods};
use std::str::FromStr as _;
/// Representation of a Multiaddr.
#[gen_stub_pyclass]
#[pyclass(name = "Multiaddr", frozen)]
#[derive(Debug, Clone)]
#[repr(transparent)]
pub struct PyMultiaddr(pub Multiaddr);
#[gen_stub_pymethods]
#[pymethods]
#[allow(clippy::needless_pass_by_value)]
impl PyMultiaddr {
/// Create a new, empty multiaddress.
#[staticmethod]
fn empty() -> Self {
Self(Multiaddr::empty())
}
/// Create a new, empty multiaddress with the given capacity.
#[staticmethod]
fn with_capacity(n: usize) -> Self {
Self(Multiaddr::with_capacity(n))
}
/// Parse a `Multiaddr` value from its byte slice representation.
#[staticmethod]
fn from_bytes(bytes: Bound<'_, PyBytes>) -> PyResult<Self> {
let bytes = Vec::from(bytes.as_bytes());
Ok(Self(Multiaddr::try_from(bytes).pyerr()?))
}
/// Parse a `Multiaddr` value from its string representation.
#[staticmethod]
fn from_string(string: String) -> PyResult<Self> {
Ok(Self(Multiaddr::from_str(&string).pyerr()?))
}
/// Return the length in bytes of this multiaddress.
fn len(&self) -> usize {
self.0.len()
}
/// Returns true if the length of this multiaddress is 0.
fn is_empty(&self) -> bool {
self.0.is_empty()
}
/// Return a copy of this [`Multiaddr`]'s byte representation.
fn to_bytes<'py>(&self, py: Python<'py>) -> Bound<'py, PyBytes> {
let bytes = self.0.to_vec();
PyBytes::new(py, &bytes)
}
/// Convert a Multiaddr to a string.
fn to_string(&self) -> String {
self.0.to_string()
}
#[gen_stub(skip)]
fn __repr__(&self) -> String {
format!("Multiaddr({})", self.0)
}
#[gen_stub(skip)]
fn __str__(&self) -> String {
self.to_string()
}
}
pub fn multiaddr_submodule(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PyMultiaddr>()?;
Ok(())
}
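From Python, the generated binding would presumably be used roughly as below; the
module path `exo_pyo3` is a placeholder (the real package name is not shown in this
diff) and the address string is only an example multiaddr.

from exo_pyo3 import Multiaddr  # hypothetical import path

addr = Multiaddr.from_string("/ip4/127.0.0.1/tcp/4001")
print(addr.to_string(), addr.len(), addr.is_empty())
roundtrip = Multiaddr.from_bytes(addr.to_bytes())  # bytes round-trip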

View File

@@ -22,7 +22,7 @@ delegate = { workspace = true }
# async
tokio = { workspace = true, features = ["full"] }
futures-lite = { workspace = true }
futures = { workspace = true }
futures-timer = { workspace = true }
# utility dependencies

View File

@@ -1,4 +1,4 @@
use futures_lite::StreamExt;
use futures::stream::StreamExt as _;
use libp2p::{gossipsub, identity, swarm::SwarmEvent};
use networking::{discovery, swarm};
use tokio::{io, io::AsyncBufReadExt as _, select};
@@ -38,19 +38,19 @@ async fn main() {
println!("Publish error: {e:?}");
}
}
event = swarm.next() => match event {
event = swarm.select_next_some() => match event {
// on gossipsub incoming
Some(SwarmEvent::Behaviour(swarm::BehaviourEvent::Gossipsub(gossipsub::Event::Message {
SwarmEvent::Behaviour(swarm::BehaviourEvent::Gossipsub(gossipsub::Event::Message {
propagation_source: peer_id,
message_id: id,
message,
}))) => println!(
})) => println!(
"\n\nGot message: '{}' with id: {id} from peer: {peer_id}\n\n",
String::from_utf8_lossy(&message.data),
),
// on discovery
Some(SwarmEvent::Behaviour(swarm::BehaviourEvent::Discovery(e)) )=> match e {
SwarmEvent::Behaviour(swarm::BehaviourEvent::Discovery(e)) => match e {
discovery::Event::ConnectionEstablished {
peer_id, connection_id, remote_ip, remote_tcp_port
} => {
@@ -64,7 +64,7 @@ async fn main() {
}
// ignore outgoing errors: those are normal
e@Some(SwarmEvent::OutgoingConnectionError { .. }) => { log::debug!("Outgoing connection error: {e:?}"); }
e@SwarmEvent::OutgoingConnectionError { .. } => { log::debug!("Outgoing connection error: {e:?}"); }
// otherwise log any other event
e => { log::info!("Other event {e:?}"); }

View File

@@ -0,0 +1,127 @@
// Copyright 2018 Parity Technologies (UK) Ltd.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
use futures::stream::StreamExt;
use libp2p::{
gossipsub, mdns, noise,
swarm::{NetworkBehaviour, SwarmEvent},
tcp, yamux,
};
use std::error::Error;
use std::time::Duration;
use tokio::{io, io::AsyncBufReadExt, select};
use tracing_subscriber::EnvFilter;
// We create a custom network behaviour that combines Gossipsub and Mdns.
#[derive(NetworkBehaviour)]
struct MyBehaviour {
gossipsub: gossipsub::Behaviour,
mdns: mdns::tokio::Behaviour,
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let _ = tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env())
.try_init();
let mut swarm = libp2p::SwarmBuilder::with_new_identity()
.with_tokio()
.with_tcp(
tcp::Config::default(),
noise::Config::new,
yamux::Config::default,
)?
.with_behaviour(|key| {
// Set a custom gossipsub configuration
let gossipsub_config = gossipsub::ConfigBuilder::default()
.heartbeat_interval(Duration::from_secs(10))
.validation_mode(gossipsub::ValidationMode::Strict) // This sets the kind of message validation. The default is Strict (enforce message signing)
.build()
.map_err(io::Error::other)?; // Temporary hack because `build` does not return a proper `std::error::Error`.
// build a gossipsub network behaviour
let gossipsub = gossipsub::Behaviour::new(
gossipsub::MessageAuthenticity::Signed(key.clone()),
gossipsub_config,
)?;
let mdns =
mdns::tokio::Behaviour::new(mdns::Config::default(), key.public().to_peer_id())?;
Ok(MyBehaviour { gossipsub, mdns })
})?
.build();
println!("Running swarm with identity {}", swarm.local_peer_id());
// Create a Gossipsub topic
let topic = gossipsub::IdentTopic::new("test-net");
// subscribes to our topic
swarm.behaviour_mut().gossipsub.subscribe(&topic)?;
// Read full lines from stdin
let mut stdin = io::BufReader::new(io::stdin()).lines();
// Listen on all interfaces and whatever port the OS assigns
swarm.listen_on("/ip4/0.0.0.0/tcp/0".parse()?)?;
println!("Enter messages via STDIN and they will be sent to connected peers using Gossipsub");
// Kick it off
loop {
select! {
Ok(Some(line)) = stdin.next_line() => {
if let Err(e) = swarm
.behaviour_mut().gossipsub
.publish(topic.clone(), line.as_bytes()) {
println!("Publish error: {e:?}");
}
}
event = swarm.select_next_some() => match event {
SwarmEvent::Behaviour(MyBehaviourEvent::Mdns(mdns::Event::Discovered(list))) => {
for (peer_id, multiaddr) in list {
println!("mDNS discovered a new peer: {peer_id} on {multiaddr}");
swarm.behaviour_mut().gossipsub.add_explicit_peer(&peer_id);
}
},
SwarmEvent::Behaviour(MyBehaviourEvent::Mdns(mdns::Event::Expired(list))) => {
for (peer_id, multiaddr) in list {
println!("mDNS discover peer has expired: {peer_id} on {multiaddr}");
swarm.behaviour_mut().gossipsub.remove_explicit_peer(&peer_id);
}
},
SwarmEvent::Behaviour(MyBehaviourEvent::Gossipsub(gossipsub::Event::Message {
propagation_source: peer_id,
message_id: id,
message,
})) => println!(
"Got message: '{}' with id: {id} from peer: {peer_id}",
String::from_utf8_lossy(&message.data),
),
SwarmEvent::NewListenAddr { address, .. } => {
println!("Local node is listening on {address}");
}
e => {
println!("Other swarm event: {:?}", e);
}
}
}
}
}

View File

@@ -1,7 +1,7 @@
use crate::ext::MultiaddrExt;
use delegate::delegate;
use either::Either;
use futures_lite::FutureExt;
use futures::FutureExt;
use futures_timer::Delay;
use libp2p::core::transport::PortUse;
use libp2p::core::{ConnectedPoint, Endpoint};
@@ -362,7 +362,7 @@ impl NetworkBehaviour for Behaviour {
}
// retry connecting to all mDNS peers periodically (fails safely if already connected)
if self.retry_delay.poll(cx).is_ready() {
if self.retry_delay.poll_unpin(cx).is_ready() {
for (p, mas) in self.mdns_discovered.clone() {
for ma in mas {
self.dial(p, ma)

View File

@@ -31,7 +31,7 @@ pub fn create_swarm(keypair: identity::Keypair) -> alias::AnyResult<Swarm> {
mod transport {
use crate::alias;
use crate::swarm::{NETWORK_VERSION, OVERRIDE_VERSION_ENV_VAR};
use futures_lite::{AsyncRead, AsyncWrite};
use futures::{AsyncRead, AsyncWrite};
use keccak_const::Sha3_256;
use libp2p::core::muxing;
use libp2p::core::transport::Boxed;

View File

@@ -1,10 +1,11 @@
{ inputs, ... }:
{
perSystem =
{ inputs', pkgs, lib, ... }:
{ config, self', inputs', pkgs, lib, ... }:
let
# Fenix nightly toolchain with all components
rustToolchain = inputs'.fenix.packages.stable.withComponents [
fenixPkgs = inputs'.fenix.packages;
rustToolchain = fenixPkgs.complete.withComponents [
"cargo"
"rustc"
"clippy"

rust/rust-toolchain.toml Normal file
View File

@@ -0,0 +1,2 @@
[toolchain]
channel = "nightly"

View File

@@ -4,13 +4,10 @@ from collections.abc import Sequence
from mlx_lm.models.cache import (
ArraysCache,
CacheList,
KVCache,
QuantizedKVCache,
RotatingKVCache,
)
# This list contains one cache entry per transformer layer
KVCacheType = Sequence[
KVCache | RotatingKVCache | QuantizedKVCache | ArraysCache | CacheList
]
KVCacheType = Sequence[KVCache | RotatingKVCache | QuantizedKVCache | ArraysCache]
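For orientation, a minimal sketch of how a value matching this KVCacheType alias is typically built: one cache object per transformer layer. The make_plain_kv_cache helper and the num_layers parameter are illustrative, not part of the diff; only KVCache comes from mlx_lm.models.cache.

from mlx_lm.models.cache import KVCache

def make_plain_kv_cache(num_layers: int) -> list[KVCache]:
    # One KVCache per layer; rotating or quantized variants would be
    # substituted per layer where a given model requires them.
    return [KVCache() for _ in range(num_layers)]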

View File

@@ -1,4 +1,3 @@
import contextlib
import multiprocessing as mp
from dataclasses import dataclass, field
from math import inf
@@ -133,8 +132,7 @@ class MpSender[T]:
def close(self) -> None:
if not self._state.closed.is_set():
self._state.closed.set()
with contextlib.suppress(Exception):
self._state.buffer.put_nowait(_MpEndOfStream())
self._state.buffer.put(_MpEndOfStream())
self._state.buffer.close()
# == unique to Mp channels ==
@@ -206,8 +204,6 @@ class MpReceiver[T]:
def close(self) -> None:
if not self._state.closed.is_set():
self._state.closed.set()
with contextlib.suppress(Exception):
self._state.buffer.put_nowait(_MpEndOfStream())
self._state.buffer.close()
# == unique to Mp channels ==
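As a hedged illustration of the simplified close path above: sender and receiver now unconditionally enqueue an end-of-stream sentinel before closing the queue, rather than wrapping the put in contextlib.suppress. Apart from the _MpEndOfStream name and the overall pattern, everything below is assumed.

import multiprocessing as mp

class _MpEndOfStream:
    pass

def close_channel(buffer, closed) -> None:
    # Set the closed flag, push a sentinel so any reader blocked on get()
    # wakes up, then close the queue's feeder thread.
    if not closed.is_set():
        closed.set()
        buffer.put(_MpEndOfStream())
        buffer.close()

if __name__ == "__main__":
    close_channel(mp.Queue(), mp.Event())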

View File

@@ -5,7 +5,6 @@ import mlx.core as mx
import psutil
from mlx_lm.models.cache import (
ArraysCache,
CacheList,
KVCache,
QuantizedKVCache,
RotatingKVCache,
@@ -18,22 +17,10 @@ from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.constants import CACHE_GROUP_SIZE, KV_CACHE_BITS
from exo.worker.runner.bootstrap import logger
# Fraction of device memory above which LRU eviction kicks in.
# Smaller machines need more aggressive eviction.
def _default_memory_threshold() -> float:
total_gb = psutil.virtual_memory().total / (1024**3)
if total_gb >= 128:
return 0.85
if total_gb >= 64:
return 0.80
if total_gb >= 32:
return 0.75
return 0.70
# Fraction of device memory above which LRU eviction kicks in
_DEFAULT_MEMORY_THRESHOLD = 0.9
_MEMORY_THRESHOLD = float(
os.environ.get("EXO_MEMORY_THRESHOLD", _default_memory_threshold())
os.environ.get("EXO_MEMORY_THRESHOLD", _DEFAULT_MEMORY_THRESHOLD)
)
@@ -77,7 +64,7 @@ def has_non_kv_caches(cache: KVCacheType) -> bool:
class KVPrefixCache:
def __init__(self, group: mx.distributed.Group | None):
def __init__(self, group: mx.distributed.Group | None = None):
self.prompts: list[mx.array] = [] # mx array of tokens (ints)
self.caches: list[KVCacheType] = []
self._snapshots: list[list[CacheSnapshot] | None] = []
@@ -169,15 +156,15 @@ class KVPrefixCache:
best_length = 0
is_exact = False
# Find best cache match
# Find best cache
for i, cached_prompt in enumerate(self.prompts):
length = get_prefix_length(prompt_tokens, cached_prompt)
if length >= max_length - 1:
best_index, best_length = i, length
is_exact = True
break
if length > best_length:
best_index, best_length = i, length
if length == max_length:
is_exact = True
best_index, best_length = i, length
break
if best_index is None:
return make_kv_cache(model), prompt_tokens, None
@@ -185,12 +172,11 @@ class KVPrefixCache:
# For exact match: trim to max_length-1 so remaining has the last token
# For partial match: trim to best_length, remaining has suffix to prefill
# This ensures stream_generate always has at least one token to start with
has_ssm = has_non_kv_caches(self.caches[best_index])
target = (max_length - 1) if is_exact and not has_ssm else best_length
target = (max_length - 1) if is_exact else best_length
restore_pos, restore_snap = self._get_snapshot(best_index, target)
# No usable snapshot — need fresh cache
if restore_snap is None and has_ssm:
if restore_snap is None and has_non_kv_caches(self.caches[best_index]):
return make_kv_cache(model), prompt_tokens, None
prompt_cache = deepcopy(self.caches[best_index])
@@ -271,21 +257,10 @@ def encode_prompt(tokenizer: TokenizerWrapper, prompt: str) -> mx.array:
return mx.array(prompt_tokens)
def _entry_length(
c: KVCache | RotatingKVCache | QuantizedKVCache | ArraysCache | CacheList,
) -> int:
# Use .offset attribute which KVCache types have (len() not implemented in older QuantizedKVCache).
if hasattr(c, "offset"):
return c.offset
# For CacheList
if hasattr(c, "size"):
return int(c.size()) # type: ignore
return 0
def cache_length(cache: KVCacheType) -> int:
"""Get the number of tokens in a KV cache."""
return max(_entry_length(c) for c in cache)
# Use .offset attribute which KVCache types have (len() not implemented in older QuantizedKVCache).
return max(getattr(c, "offset", 0) for c in cache)
def get_prefix_length(prompt: mx.array, cached_prompt: mx.array) -> int:

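To make the flat-threshold change above concrete, here is a hedged sketch of the check it implies: LRU eviction kicks in once memory use crosses the configured fraction of total RAM. EXO_MEMORY_THRESHOLD and the 0.9 default come from the diff; over_memory_threshold and the psutil-based measurement are illustrative stand-ins for whatever metric the real code consults.

import os
import psutil

_MEMORY_THRESHOLD = float(os.environ.get("EXO_MEMORY_THRESHOLD", 0.9))

def over_memory_threshold() -> bool:
    # Assumed proxy: fraction of physical RAM currently in use.
    vm = psutil.virtual_memory()
    used_fraction = (vm.total - vm.available) / vm.total
    return used_fraction > _MEMORY_THRESHOLD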
View File

@@ -48,7 +48,7 @@ from exo.worker.runner.bootstrap import logger
generation_stream = mx.new_stream(mx.default_device())
_MIN_PREFIX_HIT_RATIO_TO_UPDATE = 0.5
_MIN_PREFIX_HIT_TO_UPDATE = 1000
def prefill(
@@ -57,7 +57,7 @@ def prefill(
sampler: Callable[[mx.array], mx.array],
prompt_tokens: mx.array,
cache: KVCacheType,
group: mx.distributed.Group | None,
group: mx.distributed.Group | None = None,
) -> tuple[float, int, list[CacheSnapshot]]:
"""Prefill the KV cache with prompt tokens.
@@ -133,7 +133,7 @@ def prefill(
def warmup_inference(
model: Model,
tokenizer: TokenizerWrapper,
group: mx.distributed.Group | None,
group: mx.distributed.Group | None = None,
) -> int:
content = "Prompt to warm up the inference engine. Repeat this."
@@ -255,8 +255,8 @@ def mlx_generate(
tokenizer: TokenizerWrapper,
task: TextGenerationTaskParams,
prompt: str,
kv_prefix_cache: KVPrefixCache | None,
group: mx.distributed.Group | None,
kv_prefix_cache: KVPrefixCache | None = None,
group: mx.distributed.Group | None = None,
) -> Generator[GenerationResponse]:
# Ensure that generation stats only contains peak memory for this generation
mx.reset_peak_memory()
@@ -436,14 +436,9 @@ def mlx_generate(
full_prompt_tokens = mx.concatenate(
[all_prompt_tokens, generated_tokens_array]
)
hit_ratio = (
prefix_hit_length / len(all_prompt_tokens)
if len(all_prompt_tokens) > 0
else 0.0
)
if (
matched_index is not None
and hit_ratio >= _MIN_PREFIX_HIT_RATIO_TO_UPDATE
and prefix_hit_length >= _MIN_PREFIX_HIT_TO_UPDATE
):
kv_prefix_cache.update_kv_cache(
matched_index,

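A small sketch of the update gate after this change, using only the names visible in the diff (_MIN_PREFIX_HIT_TO_UPDATE, matched_index, prefix_hit_length); the helper itself is illustrative. With the ratio-based condition gone, a stored entry is refreshed whenever a prior entry matched and the hit spans at least 1000 tokens.

_MIN_PREFIX_HIT_TO_UPDATE = 1000

def should_update_prefix_entry(
    matched_index: int | None, prefix_hit_length: int
) -> bool:
    # Ratio check removed: only the absolute token-count threshold remains.
    return matched_index is not None and prefix_hit_length >= _MIN_PREFIX_HIT_TO_UPDATE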
View File

@@ -292,8 +292,6 @@ def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:
elif "glm" in model_id_lower:
# For GLM-4.5 and older
return [151336, 151329, 151338]
elif "gpt-oss" in model_id_lower:
return [200002, 200012]
return None
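Illustrative shape of the lookup after this hunk: the gpt-oss branch is gone, so gpt-oss models fall through to None and rely on the tokenizer's own EOS ids. The function name and structure below are a sketch; the GLM token ids are the ones shown in the diff.

def get_eos_override(model_id: str) -> list[int] | None:
    m = model_id.lower()
    if "glm" in m:
        # For GLM-4.5 and older
        return [151336, 151329, 151338]
    return None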

View File

@@ -11,7 +11,6 @@ from mlx_lm.models.gpt_oss import Model as GptOssModel
from mlx_lm.tokenizer_utils import TokenizerWrapper
from openai_harmony import ( # pyright: ignore[reportMissingTypeStubs]
HarmonyEncodingName,
HarmonyError, # pyright: ignore[reportUnknownVariableType]
Role,
StreamableParser,
load_harmony_encoding,
@@ -589,11 +588,7 @@ def parse_gpt_oss(
for response in responses:
assert isinstance(response, GenerationResponse)
try:
stream.process(response.token)
except HarmonyError:
logger.error("Encountered critical Harmony Error, returning early")
return
stream.process(response.token)
delta = stream.last_content_delta
ch = stream.current_channel

View File

@@ -103,7 +103,7 @@ class RunnerSupervisor:
self._event_sender.close()
self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
self._cancel_sender.close()
self.runner_process.join(5)
self.runner_process.join(1)
if not self.runner_process.is_alive():
logger.info("Runner process succesfully terminated")
return
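A hedged sketch of the shutdown sequence around the shortened join(1): close the channels, ask the runner to cancel, then wait briefly and escalate if the process is still alive. Only the one-second grace period comes from the diff; the escalation path and stop_runner helper are assumed.

import multiprocessing as mp

def stop_runner(proc: mp.Process, grace_seconds: float = 1.0) -> None:
    proc.join(grace_seconds)
    if not proc.is_alive():
        return  # clean exit within the grace period
    proc.kill()  # assumed escalation; the real supervisor may differ
    proc.join()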

View File

@@ -123,12 +123,7 @@ def run_gpt_oss_pipeline_device(
generated_text = ""
for response in mlx_generate(
model=model,
tokenizer=tokenizer,
task=task,
prompt=prompt,
kv_prefix_cache=None,
group=group,
model=model, tokenizer=tokenizer, task=task, prompt=prompt
):
generated_text += response.text
if response.finish_reason is not None:
@@ -199,8 +194,6 @@ def run_gpt_oss_tensor_parallel_device(
tokenizer=tokenizer,
task=task,
prompt=prompt,
kv_prefix_cache=None,
group=group,
):
generated_text += response.text
if response.finish_reason is not None:

View File

@@ -88,12 +88,12 @@ class TestKVPrefix:
return tokenizer
def test_starts_empty(self, mock_tokenizer):
cache = KVPrefixCache(None)
cache = KVPrefixCache()
assert len(cache.prompts) == 0
assert len(cache.caches) == 0
def test_clear_empties_cache(self, mock_tokenizer):
cache = KVPrefixCache(None)
cache = KVPrefixCache()
cache.prompts.append(mx.array([1, 2, 3]))
cache.caches.append([KVCache()])
cache.clear()
@@ -101,7 +101,7 @@ class TestKVPrefix:
assert len(cache.caches) == 0
def test_clear_on_empty_cache(self, mock_tokenizer):
cache = KVPrefixCache(None)
cache = KVPrefixCache()
cache.clear()
assert len(cache.prompts) == 0
@@ -142,9 +142,7 @@ class TestKVPrefixCacheWithModel:
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
_, _, snapshots = prefill(
model, tokenizer, make_sampler(0.0), tokens, cache, group=None
)
_, _, snapshots = prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
# Cache should now hold the prompt tokens minus one
assert cache_length(cache) == len(tokens) - 1
@@ -163,11 +161,9 @@ class TestKVPrefixCacheWithModel:
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
_, _, snapshots = prefill(
model, tokenizer, make_sampler(0.0), tokens, cache, group=None
)
_, _, snapshots = prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache = KVPrefixCache(None)
kv_prefix_cache = KVPrefixCache()
kv_prefix_cache.add_kv_cache(tokens, cache, snapshots)
assert len(kv_prefix_cache.prompts) == 1
@@ -180,11 +176,9 @@ class TestKVPrefixCacheWithModel:
)
assert matched_index == 0
# Exact match returns last token(s) — for models with SSM/rotating caches,
# snapshot availability constrains how far back we can trim, so remaining
# may be 1 or 2 tokens depending on the model.
assert len(remaining_tokens) >= 1
assert mx.array_equal(remaining_tokens, tokens[-len(remaining_tokens) :])
# Exact match returns only last token
assert len(remaining_tokens) == 1
assert mx.array_equal(remaining_tokens, tokens[-1:])
def test_add_and_get_prefix_match(self, model_and_tokenizer):
"""get_kv_cache with a longer prompt sharing prefix should return partial match."""
@@ -200,10 +194,10 @@ class TestKVPrefixCacheWithModel:
cache = make_kv_cache(model)
_, _, snapshots = prefill(
model, tokenizer, make_sampler(0.0), short_tokens, cache, group=None
model, tokenizer, make_sampler(0.0), short_tokens, cache
)
kv_prefix_cache = KVPrefixCache(None)
kv_prefix_cache = KVPrefixCache()
kv_prefix_cache.add_kv_cache(short_tokens, cache, snapshots)
# Query with longer prompt that shares the chat template prefix
@@ -244,11 +238,9 @@ class TestKVPrefixCacheWithModel:
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
_, _, snapshots = prefill(
model, tokenizer, make_sampler(0.0), tokens, cache, group=None
)
_, _, snapshots = prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache = KVPrefixCache(None)
kv_prefix_cache = KVPrefixCache()
kv_prefix_cache.add_kv_cache(tokens, cache, snapshots)
stored_length = cache_length(kv_prefix_cache.caches[0])
@@ -284,11 +276,9 @@ class TestKVPrefixCacheWithModel:
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
_, _, snapshots = prefill(
model, tokenizer, make_sampler(0.0), tokens, cache, group=None
)
_, _, snapshots = prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache = KVPrefixCache(None)
kv_prefix_cache = KVPrefixCache()
kv_prefix_cache.add_kv_cache(tokens, cache, snapshots)
stored_length = cache_length(kv_prefix_cache.caches[0])
@@ -311,7 +301,7 @@ class TestKVPrefixCacheWithModel:
"""mlx_generate should save the cache after generation completes."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache(None)
kv_prefix_cache = KVPrefixCache()
task = TextGenerationTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
input=[InputMessage(role="user", content="Hello")],
@@ -328,7 +318,6 @@ class TestKVPrefixCacheWithModel:
task=task,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
group=None,
):
generated_tokens += 1
@@ -342,7 +331,7 @@ class TestKVPrefixCacheWithModel:
"""Second mlx_generate call with same prompt should get a prefix hit from stored cache."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache(None)
kv_prefix_cache = KVPrefixCache()
task = TextGenerationTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
input=[InputMessage(role="user", content="Reuse test")],
@@ -358,7 +347,6 @@ class TestKVPrefixCacheWithModel:
task=task,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
group=None,
):
pass
@@ -380,7 +368,7 @@ class TestKVPrefixCacheWithModel:
"""With a prompt > 1000 tokens, second generation should update the cache entry in-place."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache(None)
kv_prefix_cache = KVPrefixCache()
# Build a long user message (> 1000 tokens) to exceed _MIN_PREFIX_HIT_TO_UPDATE
base_text = "The quick brown fox jumps over the lazy dog. "
@@ -407,7 +395,6 @@ class TestKVPrefixCacheWithModel:
task=task1,
prompt=prompt1,
kv_prefix_cache=kv_prefix_cache,
group=None,
):
pass
first_gen_time = time.perf_counter() - t0
@@ -440,7 +427,6 @@ class TestKVPrefixCacheWithModel:
task=task2,
prompt=prompt2,
kv_prefix_cache=kv_prefix_cache,
group=None,
):
pass
second_gen_time = time.perf_counter() - t0
@@ -461,7 +447,7 @@ class TestKVPrefixCacheWithModel:
"""After mlx_generate saves a cache, a second generation must not corrupt the stored copy."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache(None)
kv_prefix_cache = KVPrefixCache()
task = TextGenerationTaskParams(
model=DEFAULT_GPT_OSS_MODEL_ID,
input=[InputMessage(role="user", content="Immutable test")],
@@ -476,7 +462,6 @@ class TestKVPrefixCacheWithModel:
task=task,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
group=None,
):
pass
@@ -489,7 +474,6 @@ class TestKVPrefixCacheWithModel:
task=task,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
group=None,
):
pass
@@ -500,7 +484,7 @@ class TestKVPrefixCacheWithModel:
"""Under memory pressure, adding a new cache entry evicts the least recently used one."""
model, tokenizer = model_and_tokenizer
kv_prefix_cache = KVPrefixCache(None)
kv_prefix_cache = KVPrefixCache()
# Add three cache entries with different prompts
prompts = ["First entry", "Second entry", "Third entry"]
@@ -513,7 +497,7 @@ class TestKVPrefixCacheWithModel:
prompt = apply_chat_template(tokenizer, task)
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache, group=None)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache.add_kv_cache(tokens, cache)
# Stagger _last_used so LRU order is deterministic
kv_prefix_cache._last_used[i] = float(i)
@@ -538,7 +522,7 @@ class TestKVPrefixCacheWithModel:
prompt = apply_chat_template(tokenizer, task)
tokens = encode_prompt(tokenizer, prompt)
cache = make_kv_cache(model)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache, group=None)
prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
kv_prefix_cache.add_kv_cache(tokens, cache)
# LRU entries should have been evicted (entries 0, 1, 2 in order of _last_used)

View File

@@ -1,297 +0,0 @@
import copy
import gc
import importlib
import json
import shutil
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Any, cast
import mlx.core as mx
import mlx.nn as nn
import pytest
from mlx.utils import tree_flatten, tree_unflatten
from mlx_lm.tokenizer_utils import TokenizerWrapper
from exo.shared.types.common import ModelId
from exo.shared.types.text_generation import InputMessage, TextGenerationTaskParams
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.cache import KVPrefixCache
from exo.worker.engines.mlx.generator.generate import mlx_generate
from exo.worker.engines.mlx.utils_mlx import (
apply_chat_template,
load_tokenizer_for_model_id,
)
HF_CACHE = Path.home() / ".cache" / "huggingface" / "hub"
# ── Config reduction ──────────────────────────────────────────────────────── #
_REDUCE = {
"num_hidden_layers": 4,
"hidden_size": 256,
"num_attention_heads": 4,
"num_key_value_heads": 4,
"intermediate_size": 512,
"moe_intermediate_size": 128,
"num_experts": 4,
"num_experts_per_tok": 2,
"n_routed_experts": 4,
"num_local_experts": 4,
"num_nextn_predict_layers": 0,
"first_k_dense_replace": 0,
"linear_num_key_heads": 2,
"linear_num_value_heads": 2,
"num_attention_groups": 4,
}
def _reduce_dict(cfg: dict[str, Any]) -> dict[str, Any]:
result = dict(cfg)
for key, val in _REDUCE.items():
if key in result:
result[key] = val
return result
def _reduce_config(cfg: dict[str, Any]) -> dict[str, Any]:
result = _reduce_dict(cfg)
n_layers = cast(int, result.get("num_hidden_layers", 4))
if "text_config" in result and isinstance(result["text_config"], dict):
result["text_config"] = _reduce_dict(
cast(dict[str, Any], result["text_config"])
)
tc: dict[str, Any] = result["text_config"]
if "num_nextn_predict_layers" in tc:
tc["num_nextn_predict_layers"] = 0
if "layer_types" in result and isinstance(result["layer_types"], list):
result["layer_types"] = result["layer_types"][:n_layers]
if "attention_other_setting" in result and isinstance(
result["attention_other_setting"], dict
):
aos: dict[str, Any] = dict(
cast(dict[str, Any], result["attention_other_setting"])
)
if "num_attention_heads" in aos:
aos["num_attention_heads"] = result.get("num_attention_heads", 4)
if "num_attention_groups" in aos:
aos["num_attention_groups"] = result.get(
"num_attention_groups", cast(int, aos["num_attention_groups"])
)
result["attention_other_setting"] = aos
if "moe_layers_enum" in result and isinstance(result["moe_layers_enum"], str):
indices = [int(x) for x in result["moe_layers_enum"].split(",") if x.strip()]
valid = [i for i in indices if i < n_layers]
result["moe_layers_enum"] = ",".join(str(i) for i in valid) if valid else ""
return result
# ── Helpers ───────────────────────────────────────────────────────────────── #
def _find_snapshot(hub_name: str) -> Path | None:
model_dir = HF_CACHE / f"models--mlx-community--{hub_name}"
snaps = model_dir / "snapshots"
if not snaps.exists():
return None
children = sorted(snaps.iterdir())
return children[0] if children else None
def _copy_tokenizer(src: Path, dst: Path) -> None:
for f in src.iterdir():
name = f.name
if (
"tokeniz" in name.lower()
or "tiktoken" in name.lower()
or name.startswith("vocab")
or name.endswith(".jinja")
or "tool_declaration" in name
) and f.is_file():
shutil.copy2(f, dst / name)
def _build_model(module_name: str, cfg: dict[str, Any]) -> Model:
mod = importlib.import_module(f"mlx_lm.models.{module_name}")
args = mod.ModelArgs.from_dict(cfg) # pyright: ignore[reportAny]
model: nn.Module = mod.Model(args) # pyright: ignore[reportAny]
flat = cast(list[tuple[str, mx.array]], tree_flatten(model.parameters()))
random_weights = [
(k, mx.random.normal(shape=v.shape, dtype=mx.float16)) for k, v in flat
]
model.update(cast(dict[str, Any], tree_unflatten(random_weights)))
mx.eval(model.parameters())
return cast(Model, model)
def _collect_tokens(
model: Model,
tokenizer: TokenizerWrapper,
task: TextGenerationTaskParams,
prompt: str,
kv_prefix_cache: KVPrefixCache | None,
) -> list[int]:
tokens: list[int] = []
for resp in mlx_generate(
model=model,
tokenizer=tokenizer,
task=task,
prompt=prompt,
kv_prefix_cache=kv_prefix_cache,
group=None,
):
tokens.append(resp.token)
if resp.finish_reason is not None:
break
return tokens
# ── Architecture definitions ──────────────────────────────────────────────── #
@dataclass(frozen=True)
class ArchSpec:
name: str
hub_name: str
module: str
tokenizer_hub: str | None = None # fallback for models without bundled tokenizer
ARCHITECTURES: list[ArchSpec] = [
ArchSpec("llama", "Llama-3.2-1B-Instruct-4bit", "llama"),
ArchSpec("glm_moe_dsa", "GLM-5-MXFP4-Q8", "glm_moe_dsa"),
ArchSpec(
"glm4_moe", "GLM-4.5-Air-8bit", "glm4_moe", tokenizer_hub="GLM-4.7-8bit-gs32"
),
ArchSpec(
"glm4_moe_lite",
"GLM-4.7-Flash-8bit",
"glm4_moe_lite",
tokenizer_hub="GLM-4.7-8bit-gs32",
),
ArchSpec("glm4_moe_47", "GLM-4.7-8bit-gs32", "glm4_moe"),
ArchSpec("qwen3", "Qwen3-4B-Instruct-2507-4bit", "qwen3"),
ArchSpec("qwen3_moe", "Qwen3-30B-A3B-4bit", "qwen3_moe"),
ArchSpec("qwen3_next", "Qwen3-Next-80B-A3B-Thinking-4bit", "qwen3_next"),
ArchSpec("minimax", "MiniMax-M2.1-3bit", "minimax"),
ArchSpec("gpt_oss", "gpt-oss-20b-MXFP4-Q8", "gpt_oss"),
ArchSpec("step3p5", "Step-3.5-Flash-4bit", "step3p5"),
ArchSpec("kimi_k25", "Kimi-K2.5", "kimi_k25"),
]
def _arch_available(spec: ArchSpec) -> bool:
snap = _find_snapshot(spec.hub_name)
if snap is None:
return False
if spec.tokenizer_hub is not None:
return _find_snapshot(spec.tokenizer_hub) is not None
return True
def _make_task() -> TextGenerationTaskParams:
return TextGenerationTaskParams(
model=ModelId("test"),
input=[
InputMessage(
role="user",
content="Use the calculator to compute 1847 * 263 + 5921",
)
],
max_output_tokens=20,
temperature=0.0,
tools=[
{
"type": "function",
"function": {
"name": "calculate",
"description": "Evaluate a mathematical expression",
"parameters": {
"type": "object",
"properties": {"expression": {"type": "string"}},
"required": ["expression"],
},
},
}
],
)
# ── Test class ────────────────────────────────────────────────────────────── #
@pytest.mark.slow
class TestPrefixCacheArchitectures:
"""Verify prefix cache produces identical output to fresh generation for every architecture."""
@pytest.fixture(autouse=True)
def _cleanup(self):
yield
mx.clear_cache()
gc.collect()
@pytest.mark.parametrize(
"spec",
ARCHITECTURES,
ids=[a.name for a in ARCHITECTURES],
)
def test_prefix_cache_exact_hit(self, spec: ArchSpec) -> None:
if not _arch_available(spec):
pytest.skip(f"Model {spec.hub_name} not cached locally")
snapshot = _find_snapshot(spec.hub_name)
assert snapshot is not None
tmpdir = Path(tempfile.mkdtemp(prefix=f"exo_test_{spec.name}_"))
try:
# Build reduced config
with open(snapshot / "config.json") as f:
cfg = cast(dict[str, Any], json.load(f))
reduced = _reduce_config(copy.deepcopy(cfg))
(tmpdir / "config.json").write_text(json.dumps(reduced))
# Copy tokenizer
tok_src = snapshot
if spec.tokenizer_hub is not None:
alt = _find_snapshot(spec.tokenizer_hub)
if alt is not None:
tok_src = alt
_copy_tokenizer(tok_src, tmpdir)
# Load tokenizer and model
model_id = ModelId(f"mlx-community/{spec.hub_name}")
tokenizer = load_tokenizer_for_model_id(model_id, tmpdir)
mx.random.seed(0)
model = _build_model(spec.module, reduced)
task = _make_task()
prompt = apply_chat_template(tokenizer=tokenizer, task_params=task)
# Run 1: fresh
mx.random.seed(42)
fresh = _collect_tokens(model, tokenizer, task, prompt, None)
assert len(fresh) > 0, "Fresh generation produced no tokens"
# Run 2: populate cache
kv = KVPrefixCache(None)
mx.random.seed(42)
populate = _collect_tokens(model, tokenizer, task, prompt, kv)
# Run 3: exact cache hit
mx.random.seed(42)
cached = _collect_tokens(model, tokenizer, task, prompt, kv)
assert fresh == populate, (
f"Fresh vs populate mismatch: {fresh[:5]} vs {populate[:5]}"
)
assert fresh == cached, (
f"Fresh vs cached mismatch: {fresh[:5]} vs {cached[:5]}"
)
finally:
shutil.rmtree(tmpdir, ignore_errors=True)

View File

@@ -343,16 +343,8 @@ async def test_kimi_tokenizer_specifically():
@pytest.mark.asyncio
async def test_glm_tokenizer_specifically():
"""Test GLM tokenizer with its specific EOS tokens."""
def contains(card: ModelCard, x: str):
return x in card.model_id.lower()
glm_model_cards = [
card
for card in await get_model_cards()
if contains(card, "glm")
and not contains(card, "-5")
and not contains(card, "4.7")
card for card in await get_model_cards() if "glm" in card.model_id.lower()
]
if not glm_model_cards:

View File

@@ -1,162 +0,0 @@
from collections.abc import Generator
from exo.shared.types.worker.runner_response import (
GenerationResponse,
ToolCallResponse,
)
from exo.worker.runner.runner import parse_gpt_oss
# Token IDs from mlx-community/gpt-oss-20b-MXFP4-Q8 tokenizer.
# These are stable since they come from the model's vocabulary.
_CHANNEL = 200005 # <|channel|>
_START = 200006 # <|start|>
_MESSAGE = 200008 # <|message|>
_CALL = 200012 # <|call|>
_END = 200007 # <|end|>
_ASSISTANT = 173781 # "assistant"
# fmt: off
# " to=functions.get_current_weather<|channel|>commentary json<|message|>{\"location\": \"Tokyo\"}<|call|>"
FORMAT_A_TOKENS: list[tuple[int, str]] = [
(316, " to"),
(28, "="),
(44580, "functions"),
(775, ".get"),
(23981, "_current"),
(170154, "_weather"),
(_CHANNEL, "<|channel|>"),
(12606, "comment"),
(815, "ary"),
(5701, " json"),
(_MESSAGE, "<|message|>"),
(10848, '{"'),
(7693, "location"),
(1243, '":'),
(392, ' "'),
(173844, "Tokyo"),
(18583, '"}'),
(_CALL, "<|call|>"),
]
# "<|channel|>commentary to=functions.get_current_weather json<|message|>{\"location\": \"Tokyo\"}<|call|>"
FORMAT_B_TOKENS: list[tuple[int, str]] = [
(_CHANNEL, "<|channel|>"),
(12606, "comment"),
(815, "ary"),
(316, " to"),
(28, "="),
(44580, "functions"),
(775, ".get"),
(23981, "_current"),
(170154, "_weather"),
(5701, " json"),
(_MESSAGE, "<|message|>"),
(10848, '{"'),
(7693, "location"),
(1243, '":'),
(392, ' "'),
(173844, "Tokyo"),
(18583, '"}'),
(_CALL, "<|call|>"),
]
# "<|channel|>analysis<|message|>Let me think...<|end|><|start|>assistant<|channel|>commentary to=functions.X ..."
# Full analysis-then-tool-call as the model actually generates it.
THINKING_THEN_TOOL_TOKENS: list[tuple[int, str]] = [
(_CHANNEL, "<|channel|>"),
(35644, "analysis"),
(_MESSAGE, "<|message|>"),
(12845, "Let"),
(668, " me"),
(2411, " think"),
(1078, " about"),
(495, " this"),
(13, "."),
(_END, "<|end|>"),
# Model generates a new message header for the tool call:
(_START, "<|start|>"),
(_ASSISTANT, "assistant"),
*FORMAT_B_TOKENS,
]
# fmt: on
def _make_gen_responses(
tokens: list[tuple[int, str]],
) -> list[GenerationResponse]:
"""Build GenerationResponse list from (token_id, text) pairs."""
responses: list[GenerationResponse] = []
for i, (tid, text) in enumerate(tokens):
is_last = i == len(tokens) - 1
responses.append(
GenerationResponse(
text=text,
token=tid,
finish_reason="stop" if is_last else None,
usage=None,
)
)
return responses
def _collect(
tokens: list[tuple[int, str]],
) -> list[GenerationResponse | ToolCallResponse]:
"""Feed tokens through parse_gpt_oss and collect all yielded responses."""
def _gen() -> Generator[GenerationResponse, None, None]:
yield from _make_gen_responses(tokens)
return list(parse_gpt_oss(_gen()))
def _get_tool_call(
results: list[GenerationResponse | ToolCallResponse],
) -> ToolCallResponse:
"""Extract the single ToolCallResponse from results."""
tool_calls = [r for r in results if isinstance(r, ToolCallResponse)]
assert len(tool_calls) == 1, f"Expected 1 ToolCallResponse, got {len(tool_calls)}"
return tool_calls[0]
class TestParseGptOssRecipientPlacement:
"""Both Harmony recipient placements must produce identical tool calls."""
def test_format_a_yields_tool_call(self):
results = _collect(FORMAT_A_TOKENS)
tc = _get_tool_call(results)
assert tc.tool_calls[0].name == "get_current_weather"
assert '"location"' in tc.tool_calls[0].arguments
assert "Tokyo" in tc.tool_calls[0].arguments
def test_format_b_yields_tool_call(self):
results = _collect(FORMAT_B_TOKENS)
tc = _get_tool_call(results)
assert tc.tool_calls[0].name == "get_current_weather"
assert '"location"' in tc.tool_calls[0].arguments
assert "Tokyo" in tc.tool_calls[0].arguments
def test_both_formats_produce_identical_tool_calls(self):
tc_a = _get_tool_call(_collect(FORMAT_A_TOKENS))
tc_b = _get_tool_call(_collect(FORMAT_B_TOKENS))
assert tc_a.tool_calls[0].name == tc_b.tool_calls[0].name
assert tc_a.tool_calls[0].arguments == tc_b.tool_calls[0].arguments
class TestParseGptOssThinkingThenToolCall:
"""Analysis (thinking) followed by a tool call must yield both."""
def test_thinking_then_tool_call(self):
results = _collect(THINKING_THEN_TOOL_TOKENS)
# Should have thinking tags + content + tool call
text_parts = [r.text for r in results if isinstance(r, GenerationResponse)]
combined = "".join(text_parts)
assert "<think>" in combined
assert "</think>" in combined
assert "Let me think about this." in combined
# And the tool call
tc = _get_tool_call(results)
assert tc.tool_calls[0].name == "get_current_weather"
assert "Tokyo" in tc.tool_calls[0].arguments

View File

@@ -1,55 +0,0 @@
#!/usr/bin/env bash
[ $# -lt 1 ] && {
echo "Usage: $0 host1 [host2 ...]"
exit 1
}
[ -z "$(git status --porcelain)" ] || {
echo "Uncommitted changes"
exit 1
}
commit=$(git rev-parse HEAD)
git fetch -q origin
git branch -r --contains "$commit" | grep -qE '^\s*origin/' || {
echo "Not pushed to origin"
exit 1
}
hosts=("$@")
cleanup() {
for host in "${hosts[@]}"; do
ssh -T -o BatchMode=yes "$host@$host" "pkill -f bin/exo" &
done
sleep 1
jobs -pr | xargs -r kill 2>/dev/null || true
}
trap 'cleanup' EXIT INT TERM
for host; do
ssh -T -o BatchMode=yes -o ServerAliveInterval=30 "$host@$host" \
"EXO_LIBP2P_NAMESPACE=$commit /nix/var/nix/profiles/default/bin/nix build github:exo-explore/exo/$commit" &
done
wait
for host; do
ssh -T -o BatchMode=yes -o ServerAliveInterval=30 "$host@$host" \
"EXO_LIBP2P_NAMESPACE=$commit /nix/var/nix/profiles/default/bin/nix run github:exo-explore/exo/$commit" &>/dev/null &
done
for host; do
echo "Waiting for $host..." 1>&2
until curl -sf "http://$host:52415/models" &>/dev/null; do sleep 1; done
done
echo "Waiting 30s for cluster setup" 1>&2
sleep 30
echo "EXO loaded" 1>&2
eval_runner="${hosts[0]}"
mkdir -p "./bench/$commit"
nix run .#exo-get-all-models-on-cluster -- "$eval_runner" | while IFS= read -r model; do
echo "running eval for $model" 1>&2
ssh -Tn -o BatchMode=yes -o ServerAliveInterval=30 "$eval_runner@$eval_runner" \
"/nix/var/nix/profiles/default/bin/nix run github:exo-explore/exo/$commit#exo-eval-tool-calls -- --model $model --stdout" \
>>"./bench/$commit/${model//\//--}-eval.json"
echo
done

uv.lock generated
View File

@@ -447,7 +447,6 @@ name = "exo-bench"
version = "0.1.0"
source = { editable = "bench" }
dependencies = [
{ name = "httpx", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "huggingface-hub", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "loguru", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -457,7 +456,6 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "httpx", specifier = ">=0.27.0" },
{ name = "huggingface-hub", specifier = ">=0.33.4" },
{ name = "jinja2", specifier = ">=3.1.0" },
{ name = "loguru", specifier = ">=0.7.3" },