Compare commits

..

33 Commits

Author SHA1 Message Date
rltakashige
19bc09550d Add status=downloaded filter for model endpoint (#1539)
## Motivation

https://github.com/exo-explore/exo/issues/1346#issuecomment-3831427905
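
A minimal sketch of how the new filter might be exercised from a client, assuming it lives on the existing `GET /models` route (as used elsewhere in this repo) and is passed as a `status` query parameter; the exact parameter handling is an assumption, not confirmed by this PR body:

```python
# Hypothetical client call; endpoint shape and response format are assumed
# from the /models usage in the bench harness elsewhere in this compare.
import json
import urllib.request

BASE = "http://localhost:52415"  # default exo API port seen elsewhere in this repo

def list_models(status: str | None = None) -> list[dict]:
    """Fetch /models, optionally adding e.g. ?status=downloaded."""
    url = f"{BASE}/models" + (f"?status={status}" if status else "")
    with urllib.request.urlopen(url) as resp:
        return json.loads(resp.read())["data"]

print(len(list_models()), "models total,", len(list_models(status="downloaded")), "downloaded")
```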


## Test Plan

### Manual Testing
**Without filter**
<img width="1708" height="1010" alt="Screenshot 2026-02-18 at 22 26 22"
src="https://github.com/user-attachments/assets/f4bf7142-717d-4042-ac28-d8a55a8e45e7"
/>

**With filter**
<img width="1723" height="1021" alt="Screenshot 2026-02-18 at 22 26 45"
src="https://github.com/user-attachments/assets/40a522d5-c6e6-4148-b21a-02caa1221ebe"
/>
2026-02-18 22:34:11 +00:00
Alex Cheema
7cadca4f27 Try multiple endpoints for internet connectivity check (#1516)
## Summary
- `_test_internet_connection()` previously only tried `1.1.1.1:443`,
which some ISPs/networks block, causing exo to incorrectly report no
internet and fail downloads on startup
- Now tries `1.1.1.1`, `8.8.8.8`, and `1.0.0.1` in sequence, succeeding
if any endpoint responds
- Returns early on first success for minimal latency in the common case
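
A minimal sketch of the fallback sequence described above, assuming a plain TCP reachability probe on port 443 (the function name matches the summary; the rest is illustrative):

```python
# Illustrative sketch: try several well-known endpoints, stop at the first success.
import socket

_ENDPOINTS = ["1.1.1.1", "8.8.8.8", "1.0.0.1"]

def _test_internet_connection(timeout: float = 3.0) -> bool:
    for host in _ENDPOINTS:
        try:
            # Return early on the first endpoint that accepts a TCP connection.
            with socket.create_connection((host, 443), timeout=timeout):
                return True
        except OSError:
            continue  # blocked or unreachable on this network; try the next one
    return False  # internet_connection is only False when all endpoints fail
```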

Fixes #1425

## Test plan
- [ ] Verify downloads work on networks that block `1.1.1.1`
- [ ] Verify existing behavior unchanged on networks where `1.1.1.1`
works
- [ ] Verify `internet_connection` is set to `False` only when all three
endpoints fail

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: rltakashige <rl.takashige@gmail.com>
2026-02-18 22:10:07 +00:00
rltakashige
24e99ce197 Cleanup mistakes (#1537)
Oops
2026-02-18 22:05:26 +00:00
Alex Cheema
315992549b fix: unblock MpReceiver.close() to prevent shutdown hang (#1511)
## Summary

- `MpReceiver.close()` did not unblock threads stuck on `queue.get()` in
`receive_async()`, causing abandoned threads (via
`abandon_on_cancel=True`) to keep the Python process alive indefinitely
after tests pass
- This caused the `aarch64-darwin` CI jobs in PR #1462 to hang for ~6
hours until the GitHub Actions timeout killed them
- Sends an `_MpEndOfStream` sentinel before closing the buffer,
mirroring what `MpSender.close()` already does
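
A minimal sketch of the sentinel pattern described above, using a plain `queue.Queue` stand-in rather than exo's actual buffer types (class names here are illustrative):

```python
# Sketch: close() pushes an end-of-stream sentinel so a blocked get() wakes up.
import queue
import threading

class _EndOfStream:
    """Marker signalling that no more items will arrive."""

class ReceiverSketch:
    def __init__(self) -> None:
        self._buffer: queue.Queue[object] = queue.Queue()

    def receive(self) -> object | None:
        item = self._buffer.get()  # would otherwise block forever after close()
        return None if isinstance(item, _EndOfStream) else item

    def close(self) -> None:
        # Mirror the sender side: push the sentinel before tearing down,
        # so any thread parked in get() is released instead of hanging.
        self._buffer.put(_EndOfStream())

r = ReceiverSketch()
t = threading.Thread(target=lambda: print("received:", r.receive()))
t.start()
r.close()
t.join()  # joins promptly because the sentinel unblocked get()
```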

## Test plan

- [x] `uv run basedpyright` — 0 errors
- [x] `uv run ruff check` — clean
- [x] `nix fmt` — 0 changed
- [x] `uv run pytest` — 188 passed, 1 skipped in 12s (no hang)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: rltakashige <rl.takashige@gmail.com>
Co-authored-by: Ryuichi Leo Takashige <leo@exolabs.net>
2026-02-18 21:59:02 +00:00
Alex Cheema
ce5a65d3b9 Add MiniMax M2.5 model cards (#1514)
## Summary
- Adds model cards for MiniMax M2.5 in three quantizations: 4bit (~129
GB), 6bit (~186 GB), 8bit (~243 GB)
- No code changes needed — `MiniMaxM2ForCausalLM` is already in the
tensor parallel whitelist and `MiniMaxShardingStrategy` is already
implemented in `auto_parallel.py`
- Credit to @vskiwi for confirming MiniMax M2.5 works out of the box
with existing code

Closes #1480

## Test plan
- [x] `basedpyright` passes with 0 errors
- [x] `ruff check` passes
- [x] `pytest` passes (260 passed, 1 skipped)
- [ ] Verify MiniMax M2.5 models appear in model selector on dashboard

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: rltakashige <rl.takashige@gmail.com>
2026-02-18 21:11:13 +00:00
rltakashige
c2f2111b88 Fix tool calling (#1529)
## Motivation

GPT OSS tool calling issues.

## Changes

Fixes those issues and adds a set of evals for tool calling.
Also fixes GLM5 prefix caching, where `CacheList` wasn't being handled
properly.
Extracts much of exo bench's setup functionality into a harness that can
be reused elsewhere, such as in the tool-calling eval.

## Test Plan
### Automated Testing
Let's run the evals for all models
2026-02-18 20:29:18 +00:00
Alex Cheema
6c322ebb72 feat: only show thinking toggle for models that support it (#1497)
## Summary
- Adds `thinking_toggle` capability to 26 model cards that support
toggling thinking mode on/off
- GPT-OSS models (20b, 120b) excluded — they always think and don't
support toggling
- Dashboard UI updated to check for `thinking_toggle` capability before
showing the toggle button
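
A minimal sketch of the capability check, assuming model cards expose their capabilities as a list of strings (the `thinking_toggle` value comes from this PR; the data shapes are stand-ins):

```python
# Sketch: decide whether to render the thinking toggle for a given model card.
from dataclasses import dataclass, field

@dataclass
class ModelCardSketch:
    model_id: str
    capabilities: list[str] = field(default_factory=list)

def supports_thinking_toggle(card: ModelCardSketch) -> bool:
    # GPT-OSS cards simply omit "thinking_toggle", so they fall out here.
    return "thinking_toggle" in card.capabilities

qwen = ModelCardSketch("qwen3-30b", capabilities=["thinking_toggle"])
gpt_oss = ModelCardSketch("gpt-oss-20b")
assert supports_thinking_toggle(qwen) and not supports_thinking_toggle(gpt_oss)
```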

## Test plan
- [x] `uv run basedpyright` — 0 errors
- [x] `uv run ruff check` — all checks passed
- [x] `nix fmt` — 0 files changed
- [x] `uv run pytest` — 188 passed, 0 failed
- [x] Security review passed (no secrets, eval/exec, innerHTML, or dep
changes)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 17:05:00 +00:00
vskiwi
2ebe6216b4 feat: add explicit --offline mode for air-gapped clusters (#1525)
## Motivation

Closes #1510

There is currently no reliable way to run exo on an air-gapped or offline cluster where models are pre-staged on local disks. The two existing mechanisms — `--no-downloads` and `HF_HUB_OFFLINE=1` — each cover only a subset of the problem:

1. **`--no-downloads` blocks model loading**: When passed, `DownloadCoordinator` is not created. No `NodeDownloadProgress` events are ever emitted, so `_model_needs_download()` in `plan.py` perpetually returns `DownloadModel`, short-circuiting `_load_model()` and preventing the model from ever being loaded.

2. **`HF_HUB_OFFLINE=1` doesn't cover exo's aiohttp code**: exo's download pipeline primarily uses raw `aiohttp` for HTTP operations (file list fetching, file downloads, HEAD verification), not the `huggingface_hub` library. These calls will attempt connections and time out on air-gapped networks.

3. **`skip_internet` is not propagated to `download_file_with_retry()`**: Even when `internet_connection = False`, the `_download_file()` function still makes HTTP HEAD calls via `file_meta()` to verify local files and unconditionally attempts downloads for missing files.

## Changes

### `src/exo/main.py`
- Add `--offline` flag to `Args` with env var detection (`EXO_OFFLINE=1`, `HF_HUB_OFFLINE=1`)
- Pass `offline` to `DownloadCoordinator` at creation and re-creation (election loop)

### `src/exo/download/coordinator.py`
- Add `offline: bool = False` field
- In offline mode: set `internet_connection = False` immediately in `__post_init__`, skip `_test_internet_connection()` ping (avoids 3s timeout), skip `_check_internet_connection` periodic loop
- In `_start_download()`: if model is not fully available locally, emit `DownloadFailed` with clear message instead of starting a download task

### `src/exo/download/download_utils.py`
- Add `skip_internet: bool` parameter to `download_file_with_retry()` and `_download_file()`
- When `skip_internet=True` in `_download_file()`: return local file immediately without HTTP HEAD verification; raise `FileNotFoundError` for missing files
- Propagate `skip_internet` from `download_shard()` to `download_file_with_retry()`
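
A minimal sketch of the `skip_internet` short-circuit in `_download_file()` described above, with heavily simplified signatures (the real function takes more arguments and verifies local files via an HTTP HEAD when online):

```python
# Sketch: offline short-circuit inside a download helper.
from pathlib import Path

async def _download_file_sketch(target: Path, url: str, skip_internet: bool = False) -> Path:
    if skip_internet:
        # Offline: trust whatever is on disk and never touch the network.
        if target.exists():
            return target
        raise FileNotFoundError(f"{target} is missing and downloads are disabled (offline)")
    # Online path (omitted here): HEAD to verify size, then download if needed.
    raise NotImplementedError
```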

### `src/exo/download/tests/test_offline_mode.py` (new)
- 8 tests covering `_download_file`, `download_file_with_retry`, and `fetch_file_list_with_cache` in offline mode

## Why It Works

Unlike `--no-downloads` which disables `DownloadCoordinator` entirely, `--offline` keeps the coordinator running in a restricted mode. The existing `_emit_existing_download_progress()` disk scanner still runs every 60 seconds, emitting `DownloadCompleted` events for pre-staged models. These events flow through the event-sourcing pipeline and populate `state.downloads`, which unblocks `_model_needs_download()` in `plan.py` — no changes to the planning logic required.

```
--offline flag
  → DownloadCoordinator (offline mode)
    → Skip 1.1.1.1 ping, internet_connection = False
    → _emit_existing_download_progress scans disk
      → Emits DownloadCompleted for pre-staged models
        → _model_needs_download sees DownloadCompleted
          → _load_model proceeds normally
```

## Test Plan

### Automated Testing
- `ruff check` — passes
- 8 new tests in `test_offline_mode.py` — all pass
- 11 existing download tests in `test_download_verification.py` — all pass (no regressions)

### Manual Testing
1. Pre-stage a model on disk (e.g., `~/.exo/models/mlx-community--Qwen3-0.6B-4bit/`)
2. Start exo with `--offline` (or `EXO_OFFLINE=1`)
3. Place an instance via API or dashboard
4. Verify: model loads into memory and inference works without any network calls

### Environment
- macOS (Apple Silicon), multi-node cluster with Thunderbolt interconnect
- Models pre-staged via rsync / NFS mount
2026-02-18 16:18:09 +00:00
ciaranbor
f54c80b121 Ciaran/image edit api (#1500)
## Motivation

- Image editing previously ignored input image dimensions, always
defaulting to 1024x1024
- Size dropdown was hidden in edit mode, giving users no control over
output dimensions
- Portrait/landscape presets used non-standard aspect ratios (1024x1365
/ 1365x1024)

## Changes

- Added "auto" size option that uses input image dimensions for edits,
defaults to 1024x1024 for generation
- Introduced ImageSize Literal type and normalize_image_size() validator
(replaces raw str size fields)
- Updated portrait/landscape presets to standard 1024x1536 / 1536x1024
- Made size selector visible in edit mode (previously hidden)
- Default size changed from "1024x1024" to "auto"

## Why It Works

- "auto" reads actual input image dimensions via PIL at generation time,
so edits preserve the original aspect ratio
- Pydantic field_validator on both ImageGenerationTaskParams and
ImageEditsTaskParams normalizes None → "auto", keeping the API
backward-compatible
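
A minimal, self-contained sketch of the `ImageSize` Literal and the `None → "auto"` normalization described above (Pydantic v2 style; the model and field names are simplified stand-ins):

```python
# Sketch: Literal-typed size field with a validator that maps None to "auto".
from typing import Literal
from pydantic import BaseModel, field_validator

ImageSize = Literal["auto", "1024x1024", "1024x1536", "1536x1024"]

class ImageEditsParamsSketch(BaseModel):
    size: ImageSize = "auto"

    @field_validator("size", mode="before")
    @classmethod
    def normalize_image_size(cls, v: object) -> object:
        # Older clients may send null or omit the field; treat both as "auto".
        return "auto" if v is None else v

assert ImageEditsParamsSketch(size=None).size == "auto"
assert ImageEditsParamsSketch().size == "auto"
```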

## Test Plan

### Manual Testing

- Verify image edits output at the input image's native resolution when
size is "auto"
- Verify size dropdown appears and works in both generate and edit modes
2026-02-18 16:05:39 +00:00
rltakashige
48b8f86395 Add support for GLM 5 (#1526)
## Motivation

Add GLM 5 support, superseding #1513.

## Changes

<!-- Describe what you changed in detail -->

## Why It Works

<!-- Explain why your approach solves the problem -->

## Test Plan

### Manual Testing
<!-- Hardware: (e.g., MacBook Pro M1 Max 32GB, Mac Mini M2 16GB,
connected via Thunderbolt 4) -->
<!-- What you did: -->
<!-- - -->

### Automated Testing
<!-- Describe changes to automated tests, or how existing tests cover
this change -->
<!-- - -->
2026-02-18 14:04:06 +00:00
Evan
5cbd6377a2 prioritize official model cards over custom model cards
our old model card search path would override official model cards with
custom model cards - our packaged model cards should always be the
default here
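
A minimal sketch of the intended precedence, assuming packaged and custom model cards are plain files keyed by filename (directory layout and loader shape are illustrative, not exo's actual implementation):

```python
# Sketch: load custom cards first, then let packaged (official) cards overwrite
# any duplicate ids, so the official card is always the default.
from pathlib import Path

def load_model_cards(packaged_dir: Path, custom_dir: Path) -> dict[str, Path]:
    cards: dict[str, Path] = {}
    for directory in (custom_dir, packaged_dir):  # packaged last => packaged wins
        if not directory.is_dir():
            continue
        for card_file in sorted(p for p in directory.iterdir() if p.is_file()):
            cards[card_file.stem] = card_file
    return cards
```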
2026-02-18 13:20:05 +00:00
Evan Quiney
8f01523ddb remove dead code (#1496) 2026-02-18 11:43:27 +00:00
Alex Cheema
3addeadea8 Update mlx-lm to 0.30.7 (#1520)
## Summary
- Bumps `mlx-lm` from 0.30.6 to 0.30.7 in `pyproject.toml` and `uv.lock`

## Test plan
- [x] `uv lock` resolves successfully
- [x] `basedpyright` — no new errors (63 pre-existing in unrelated
`test_tool_call_tracker.py`)
- [x] `ruff check` — all checks passed
- [x] `nix fmt` — no formatting changes
- [x] `pytest` — 188 passed, 1 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 11:14:23 +00:00
rltakashige
f2be929211 Leo/address rdma gpu locks 2 (#1515)
Same as #1489. Had to revert and redo thanks to Claude.

---------

Co-authored-by: Jake Hillion <jake@hillion.co.uk>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:00:52 -08:00
rltakashige
83af8c63fa Revert "Use custom fork that resolves GPU locks" (#1502)
Reverts exo-explore/exo#1489

Goddammit Claude...
2026-02-17 18:18:54 +00:00
Evan Quiney
eccc6298d1 Revert "Add MetaInstance declarative layer (#1447)"
This reverts commit a962a28afc.
2026-02-17 18:11:47 +00:00
Evan Quiney
c8997217cf Revert "feat: better onboarding UX for new users (#1479)"
This reverts commit 490d2e46ba.
2026-02-17 18:02:32 +00:00
Alex Cheema
490d2e46ba feat: better onboarding UX for new users (#1479)
## Summary

- **Auto-open dashboard** in browser on first launch (uses
`~/.exo/.dashboard_opened` marker)
- **Welcome overlay** with "Choose a Model" CTA button when no model
instance is running
- **Tutorial progress messages** during model download → loading → ready
lifecycle stages
- **Fix conversation sidebar** text contrast — bumped to white text,
added active state background
- **Simplify technical jargon** — sharding/instance type/min nodes
hidden behind collapsible "Advanced Options" toggle; strategy display
hidden behind debug mode
- **Polished DMG installer** with drag-to-Applications layout, custom
branded background, and AppleScript-configured window positioning

## Test plan

- [ ] Launch exo for the first time (delete `~/.exo/.dashboard_opened`
to simulate) — browser should auto-open
- [ ] Verify welcome overlay appears on topology when no model is loaded
- [ ] Launch a model and verify download/loading/ready messages appear
in instance cards
- [ ] Check conversation sidebar text is readable (white on dark, yellow
when active)
- [ ] Verify "Advanced Options" toggle hides/shows sharding controls
- [ ] Build DMG with `packaging/dmg/create-dmg.sh` and verify
drag-to-Applications layout

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 17:52:49 +00:00
rltakashige
facf2d4d03 Use custom fork that resolves GPU locks (#1489)
## Motivation

There is an issue on Macs where an explicit synchronization is necessary
for memory to be updated from the L1 cache. Without it, a spin wait may
never see the updated timestamp, and GPU locks can occur.

## Changes

Updated in my personal fork.

## Why It Works

https://github.com/ARM-software/acle/releases

## Test Plan

### Manual Testing
Tested manually that no GPU locks occur (even with multiple simultaneous
instances running) and that the performance differential is negligible
(267 vs 269 tps on Llama 3.2 1B at approximately 10k context).


------------------------------------------------------
I have seen one GPU lock, specifically when sending a particularly large
chat completion while the model was loading. However, I have since been
unable to reproduce it, and it may have been something I did wrong. Please
do create an issue and tag me if any GPU locks occur.

---------

Co-authored-by: Jake Hillion <jake@hillion.co.uk>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 17:48:43 +00:00
Alex Cheema
a962a28afc Add MetaInstance declarative layer (#1447)
## Motivation

Users currently manage instances directly, which means if a node
disconnects or connections break, the instance dies and nothing
recreates it. MetaInstance is a declarative primitive: "ensure an
instance matching these parameters always exists." The reconciler
watches for unhealthy or missing backing instances and re-places them
automatically.

## Changes

- **MetaInstance type** (`meta_instance.py`): declarative constraint
with `model_id`, `min_nodes`, optional `node_ids`, and `sharding`
- **Reconciler** (`reconcile.py`): `find_unsatisfied_meta_instances`
checks which MetaInstances lack a healthy backing instance,
`try_place_for_meta_instance` creates one
- **Master loop** (`main.py`): periodically reconciles unsatisfied
MetaInstances; immediate placement on `CreateMetaInstance` command
- **API** (`api.py`): `create_meta_instance` / `delete_meta_instance` /
`GET /meta_instances` endpoints; delete cascades to backing instances
with task cancellation
- **Binding via `meta_instance_id` on Instance** (`instances.py`): no
separate binding event or backing map — the instance carries its parent
MetaInstance ID directly, eliminating race conditions in the reconciler
- **Dashboard**: sidebar shows MetaInstances with their backing instance
status; orphan instances (created directly) still shown separately
- **Tests**: constraint matching, connection health, unsatisfied
detection, exclusive binding, cascade delete with task cancellation

### Recent improvements

- **fix: cancel active tasks on cascade delete** — `DeleteMetaInstance`
now emits `TaskStatusUpdated(Cancelled)` for any Pending/Running tasks
on backing instances before emitting `InstanceDeleted`. Previously,
cascade-deleting backing instances left orphaned task references in
state.
- **Lifecycle logging** — added `logger.info`/`logger.warning` for:
`CreateMetaInstance` (model, min_nodes, sharding), `DeleteMetaInstance`
(with cascade count), reconciler placement success/failure, and retry
decisions with attempt counts in `InstanceHealthReconciler`.
- **GET `/meta_instances` endpoint** — lists all meta-instances without
needing to fetch full state.
- **2 regression tests** — `test_cascade_delete_cancels_active_tasks`
and `test_cascade_delete_skips_completed_tasks` verify the
cascade-delete event sequence.

## Why It Works

Putting `meta_instance_id` on `BaseInstance` makes binding inherent to
instance creation. When the reconciler creates an instance for a
MetaInstance, it tags it via `model_copy`. When the instance is deleted,
the binding disappears with it. This avoids the two bugs that a separate
binding mechanism would introduce:
1. Stale exclusion sets — the reconciler loop can't accidentally bind
two MetaInstances to the same instance
2. Delete ordering race — no window between deleting an instance and its
binding where the reconciler could re-place
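
A minimal sketch of the reconciliation check this enables, where the binding is just the `meta_instance_id` carried on the instance (types are simplified stand-ins for exo's real models):

```python
# Sketch: find MetaInstances that have no healthy backing instance.
from dataclasses import dataclass

@dataclass(frozen=True)
class MetaInstanceSketch:
    id: str
    model_id: str
    min_nodes: int = 1

@dataclass(frozen=True)
class InstanceSketch:
    id: str
    meta_instance_id: str | None
    healthy: bool

def find_unsatisfied(metas: list[MetaInstanceSketch],
                     instances: list[InstanceSketch]) -> list[MetaInstanceSketch]:
    return [
        m for m in metas
        # Satisfied only if some healthy instance carries this meta's id.
        if not any(i.meta_instance_id == m.id and i.healthy for i in instances)
    ]

metas = [MetaInstanceSketch("m1", "llama-3.2-1b")]
instances = [InstanceSketch("i1", meta_instance_id="m1", healthy=False)]
assert find_unsatisfied(metas, instances) == metas  # unhealthy backing → re-place
```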

## Test Plan

### Manual Testing
<!-- Hardware: (e.g., MacBook Pro M1 Max 32GB, Mac Mini M2 16GB,
connected via Thunderbolt 4) -->
- Created MetaInstance via dashboard, verified instance placed
- Verified delete cascades (deleting MetaInstance removes backing
instance)
- Verified orphan instances still work independently

### Automated Testing
- 30 tests in `test_meta_instance_edge_cases.py`: lifecycle, retry
logic, error handling, concurrent operations, cascade delete with task
cancellation
- 24 tests in `test_reconcile.py`: constraint matching, connection
health (single/multi-node, edge removal, IP changes), unsatisfied
detection, exclusive binding, idempotency
- All 261 tests pass
- basedpyright 0 errors, ruff clean, dashboard builds

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 09:48:19 -08:00
Alex Cheema
db79c350c1 Fix graceful process shutdown in macOS app (#1372)
## Motivation

Fixes #1370

When the macOS app stops exo, GPU/system memory isn't released. This
happens because:

1. The macOS app calls `process.terminate()` (SIGTERM) but the Python
process only registers a graceful shutdown handler for SIGINT, not
SIGTERM. SIGTERM's default Python behavior raises `SystemExit` which
bypasses the cleanup cascade (runner subprocess MLX cleanup via
`mx.clear_cache()`, channel closing, etc.).
2. The app doesn't wait for the process to actually finish cleanup — it
immediately nils out the process reference.

## Changes

**`src/exo/main.py`**: Register SIGTERM handler alongside SIGINT so the
graceful shutdown cascade (`Node.shutdown()` → cancel task group →
worker/runner cleanup → `mx.clear_cache()` + `gc.collect()`) runs
regardless of which signal is received.
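
A minimal sketch of wiring both signals to the same shutdown path, assuming an asyncio entry point (handler wiring only; the real cleanup cascade is exo's `Node.shutdown()`):

```python
# Sketch: SIGINT and SIGTERM both trigger the same graceful shutdown.
import asyncio
import signal

async def main() -> None:
    loop = asyncio.get_running_loop()
    stop = asyncio.Event()
    for sig in (signal.SIGINT, signal.SIGTERM):
        # Either signal sets the event; cleanup then runs in the finally block.
        loop.add_signal_handler(sig, stop.set)
    try:
        await stop.wait()
    finally:
        pass  # placeholder for Node.shutdown(): cancel tasks, clear caches, close channels

if __name__ == "__main__":
    asyncio.run(main())
```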

**`app/EXO/EXO/ExoProcessController.swift`**: Replace immediate
`process.terminate()` with escalating shutdown per @Evanev7's
suggestion:
1. Send SIGINT via `process.interrupt()` — triggers the registered
Python handler for graceful cleanup
2. Wait up to 5 seconds for the process to exit
3. If still running, escalate to SIGTERM via `process.terminate()`
4. Wait up to 3 seconds
5. If still running, force kill via SIGKILL

The escalation runs in a detached `Task` so the UI updates immediately
(status → stopped) without blocking.

## Why It Works

The root cause is that SIGTERM wasn't triggering the graceful shutdown
path. By registering a SIGTERM handler in Python and sending SIGINT
first from the macOS app, the process gets a chance to run the full
cleanup cascade: cancelling the task group, shutting down runners (which
call `del model; mx.clear_cache(); gc.collect()`), closing channels, and
flushing logs. The escalation to SIGTERM and SIGKILL ensures the process
always terminates even if graceful shutdown hangs.

## Test Plan

### Manual Testing
<!-- Hardware: Mac Studio M4 Max 128GB -->
- Start exo via macOS app, load a model, run inference
- Stop via the toggle switch, verify memory is released without
requiring a system restart
- Test rapid stop/start (restart) to ensure no race conditions

### Automated Testing
- `uv run basedpyright` — 0 errors
- `uv run ruff check` — passes
- `nix fmt` — no changes

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Evan Quiney <evanev7@gmail.com>
2026-02-17 09:03:54 -08:00
Alex Cheema
d6301ed593 dashboard: redesign downloads page as model×node table (#1465)
## Motivation

The current downloads page uses a node-centric card grid layout that is
messy and hard to read — the same model across different nodes appears
in separate cards, and deep nesting wastes space. This makes it
difficult to quickly see which models are on which nodes.

## Changes

Rewrote the downloads page
(`dashboard/src/routes/downloads/+page.svelte`) from a card grid to a
clean table layout:

- **Rows** = models (unique across all nodes)
- **Columns** = nodes (with disk free shown in header)
- **Cells** show status at a glance:
  - ✅ Green checkmark + size for completed downloads
  - 🟡 Yellow percentage + mini progress bar + speed for active downloads
  - `...` for pending downloads
  - ❌ Red X for failed downloads
  - `--` for models not present on a node
- Delete/download action buttons appear on row hover
- Model name column is sticky on horizontal scroll (for many-node
clusters)
- Models sorted by number of nodes with completed downloads
- Imported shared utilities from `$lib/utils/downloads` instead of
inline re-implementations

### Backend: model directory in download events

- Added `model_directory` field to `BaseDownloadProgress` so all
download status events include the on-disk path
- Added `_model_dir()` helper to `DownloadCoordinator` to compute the
path from `EXO_MODELS_DIR`
- Dashboard uses this to show file location and enable "open in Finder"
for completed downloads
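
A minimal sketch of such a helper, assuming the `~/.exo/models/` default documented elsewhere in this diff and the `owner--repo` directory naming visible in the `--offline` PR's test plan (the scheme actually used by `_model_dir()` is not shown here, so treat this as illustrative):

```python
# Sketch: compute a model's on-disk directory for download progress events.
import os
from pathlib import Path

def _model_dir(model_id: str) -> Path:
    # EXO_MODELS_DIR overrides the default ~/.exo/models location.
    base = Path(os.environ.get("EXO_MODELS_DIR", Path.home() / ".exo" / "models"))
    # e.g. "mlx-community/Qwen3-0.6B-4bit" -> "mlx-community--Qwen3-0.6B-4bit"
    return base / model_id.replace("/", "--")

print(_model_dir("mlx-community/Qwen3-0.6B-4bit"))
```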

### Info modal

- Clicking a model name opens an info modal showing card details
(family, quantization, capabilities, storage size, layer count, tensor
parallelism support)

### Other fixes

- Fixed model name truncation in the table
- Excluded `tests/start_distributed_test.py` from pytest collection (CLI
script that calls `sys.exit()` at import time)

## Test Plan

- [x] `uv run basedpyright` — 0 errors
- [x] `uv run ruff check` — all passed
- [x] `nix fmt` — clean
- [x] `uv run pytest` — 188 passed, 1 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:31:47 +00:00
Evan Quiney
6d1ca6689b don't time out node identities (#1493)
currently nodes leaving and rejoining the cluster can lose their identity. We have no need to delete this data when a node times out, so let's just persist it.
2026-02-17 11:48:28 +00:00
Evan
c01b6fff21 eprint banner
our banner was being printed to stdout but should be printed to stderr
as it's essentially a log message
2026-02-17 11:43:06 +00:00
Jake Hillion
8392e78afe bench: add spec for automatic canary benchmarks (#1483)
Adds all the models that can fit onto a single M3 Ultra for single
machine benchmarks. Fixes the macOS version, GPU spec, and chip type for
maximum reproducibility. Specifies the minimum memory accordingly for
each type of model, using the smallest machine available (the smallest
M3 Ultra is 96GiB).

Test plan:
- Running this with some code that makes machines of this spec available
and stores the results. It works.

This will become part of a larger testing/stability strategy once we've
collected more of the data.
2026-02-17 10:52:05 +00:00
Evan
86735ece78 begins
begins
2026-02-16 19:26:19 +00:00
Evan Quiney
2759e92334 api cancellation (#1276)
closing the http request to the api now
- sends a cancellation from the api
- writes that cancellation in the master
- worker plans off the cancellation
- runner observes that cancellation after every generation step (+1
communication per token)
- cancellation happens synchronously to prevent gpu locks

closes #61
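
A minimal sketch of the per-token cancellation check on the runner side described above (the event here is a stand-in for the cancellation that actually flows api → master → worker → runner):

```python
# Sketch: the runner polls for cancellation after every generation step.
from threading import Event

def generate_sketch(max_tokens: int, cancelled: Event) -> list[int]:
    out: list[int] = []
    for _ in range(max_tokens):
        out.append(0)  # stand-in for one decode step producing a token
        # Checked synchronously once per token so every rank stops at the same
        # step, which is what prevents gpu locks in the distributed case.
        if cancelled.is_set():
            break
    return out
```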

---------

Co-authored-by: Alex Cheema <alexcheema123@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 05:30:07 -08:00
Jake Hillion
131fb141a6 bench: add --danger-delete-downloads flag with planning phase
exo bench previously relied on the worker's plan loop to download
models, which could fail silently or run into disk space issues during
benchmarking. This made it difficult to diagnose download failures.

Added a planning phase that runs before benchmarking to explicitly
handle downloads. It checks available disk space on each node via the
/state endpoint and starts downloads via POST /download/start. When
the --danger-delete-downloads flag is set and there's insufficient
space, it deletes existing models from smallest to largest until
there's room for the benchmark model.
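
A minimal sketch of the smallest-first deletion logic described above (sizes and free space as plain numbers; the real code reads them from `/state` and deletes through the API):

```python
# Sketch: free disk by deleting existing models from smallest to largest.
def plan_deletions(existing: dict[str, int], free_gb: int, needed_gb: int) -> list[str]:
    """Return model ids to delete, smallest first, until the benchmark model fits."""
    to_delete: list[str] = []
    for model_id, size in sorted(existing.items(), key=lambda kv: kv[1]):
        if free_gb >= needed_gb:
            break
        to_delete.append(model_id)
        free_gb += size  # space reclaimed once this model is removed
    if free_gb < needed_gb:
        raise RuntimeError("Insufficient disk even after deleting all downloads")
    return to_delete

print(plan_deletions({"small-model": 1, "large-model": 30}, free_gb=55, needed_gb=65))
# -> ['small-model', 'large-model']
```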

Test plan:
- CI

```
jake@maverick:/data/users/jake/repos/exo/ > nix run .#exo-bench -- --pp 128,2048,4096 --tg 128 --stdout --settle-timeout 10 --host s1 --model mlx-community/gpt-oss-120b-MXFP4-Q8
PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
2026-02-16 12:12:11.807 | INFO     | __main__:main:710 - pp/tg mode: combinations (product) - 3 pairs
Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
2026-02-16 12:12:13.455 | DEBUG    | __main__:main:725 - [exo-bench] loaded tokenizer: mlx-community/gpt-oss-120b-MXFP4-Q8 for prompt sizer
2026-02-16 12:12:13.473 | DEBUG    | __main__:main:761 - exo-bench model: short_id=gpt-oss-120b-MXFP4-Q8 full_id=mlx-community/gpt-oss-120b-MXFP4-Q8
2026-02-16 12:12:13.473 | INFO     | __main__:main:762 - placements: 1
2026-02-16 12:12:13.474 | INFO     | __main__:main:764 -   - Pipeline / MlxRing / nodes=1
2026-02-16 12:12:13.474 | INFO     | __main__:main:771 - Planning phase: checking downloads...
Traceback (most recent call last):
  File "/nix/store/q31kmbcfr5bf97290bvbnhrvpc3fv824-source/bench/exo_bench.py", line 885, in <module>
    raise SystemExit(main())
                     ~~~~^^
  File "/nix/store/q31kmbcfr5bf97290bvbnhrvpc3fv824-source/bench/exo_bench.py", line 772, in main
    run_planning_phase(
    ~~~~~~~~~~~~~~~~~~^
        client,
        ^^^^^^^
    ...<4 lines>...
        settle_deadline,
        ^^^^^^^^^^^^^^^^
    )
    ^
  File "/nix/store/q31kmbcfr5bf97290bvbnhrvpc3fv824-source/bench/exo_bench.py", line 367, in run_planning_phase
    raise RuntimeError(
    ...<2 lines>...
    )
RuntimeError: Insufficient disk on 12D3KooWE2C7dzC9d9YJMEfWK3g8og7JdZj3HHXZ8VmGrXYAEnEj: need 65GB, have 55GB. Use --danger-delete-downloads to free space.
jake@maverick:/data/users/jake/repos/exo/ > nix run .#exo-bench -- --pp 128,2048,4096 --tg 128 --stdout --settle-timeout 10 --host s1 --model mlx-community/gpt-oss-120b-MXFP4-Q8 --danger-delete-downloads
PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
2026-02-16 12:12:19.626 | INFO     | __main__:main:710 - pp/tg mode: combinations (product) - 3 pairs
2026-02-16 12:12:21.262 | DEBUG    | __main__:main:725 - [exo-bench] loaded tokenizer: mlx-community/gpt-oss-120b-MXFP4-Q8 for prompt sizer
2026-02-16 12:12:21.280 | DEBUG    | __main__:main:761 - exo-bench model: short_id=gpt-oss-120b-MXFP4-Q8 full_id=mlx-community/gpt-oss-120b-MXFP4-Q8
2026-02-16 12:12:21.280 | INFO     | __main__:main:762 - placements: 1
2026-02-16 12:12:21.280 | INFO     | __main__:main:764 -   - Pipeline / MlxRing / nodes=1
2026-02-16 12:12:21.280 | INFO     | __main__:main:771 - Planning phase: checking downloads...
2026-02-16 12:12:21.336 | INFO     | __main__:run_planning_phase:386 - Deleting mlx-community/Qwen3-0.6B-4bit from 12D3KooWE2C7dzC9d9YJMEfWK3g8og7JdZj3HHXZ8VmGrXYAEnEj (335MB)
2026-02-16 12:12:21.350 | INFO     | __main__:run_planning_phase:386 - Deleting mlx-community/Llama-3.2-1B-Instruct-4bit from 12D3KooWE2C7dzC9d9YJMEfWK3g8og7JdZj3HHXZ8VmGrXYAEnEj (679MB)
2026-02-16 12:12:21.363 | INFO     | __main__:run_planning_phase:386 - Deleting mlx-community/Llama-3.2-3B-Instruct-4bit from 12D3KooWE2C7dzC9d9YJMEfWK3g8og7JdZj3HHXZ8VmGrXYAEnEj (1740MB)
2026-02-16 12:12:21.373 | INFO     | __main__:run_planning_phase:386 - Deleting mlx-community/Llama-3.2-3B-Instruct-8bit from 12D3KooWE2C7dzC9d9YJMEfWK3g8og7JdZj3HHXZ8VmGrXYAEnEj (3264MB)
2026-02-16 12:12:21.384 | INFO     | __main__:run_planning_phase:386 - Deleting mlx-community/GLM-4.7-Flash-8bit from 12D3KooWE2C7dzC9d9YJMEfWK3g8og7JdZj3HHXZ8VmGrXYAEnEj (30366MB)
2026-02-16 12:12:21.413 | INFO     | __main__:run_planning_phase:407 - Started download on 12D3KooWE2C7dzC9d9YJMEfWK3g8og7JdZj3HHXZ8VmGrXYAEnEj
```

It's not pretty but it works!
2026-02-16 13:06:38 +00:00
Evan Quiney
2d8bfc2e3c fix: PlaceInstanceParams broken field validator
our field validator for PlaceInstance was wrong - we can just rely on default behaviour here anyway!
2026-02-16 03:58:43 -08:00
ciaranbor
042999f728 Ciaran/message deletion (#1409)
## Motivation

When a user deletes a message during an active streamed generation, it
can cause unexpected behavior. The delete confirmation text was also
misleading — it said "all responses after it" only for user messages,
which didn't accurately describe the behavior (all messages after the
deleted one are removed, regardless of role)

## Changes

- Prevent deletion during streaming: Disabled the delete button and
blocked handleDeleteClick when loading is true, with a visual indication
(dimmed button, cursor-not-allowed, tooltip change)
- Clarified delete confirmation text: Replaced role-specific wording
with a simpler, accurate message:
    - Last message: "Delete this message?"
    - Any other message: "Delete this message and all messages after it?"

## Why It Works

Guarding on the loading state at both the click handler and the button's
disabled attribute ensures no deletion can be triggered while a response
is being streamed

## Test Plan

### Manual Testing

- Verify the delete button is visually disabled and non-clickable while
a response is streaming
- Verify the tooltip shows "Cannot delete while generating" during
streaming
- Verify the last message shows "Delete this message?" confirmation
- Verify non-last messages show "Delete this message and all messages
after it?" confirmation
- Verify deletion works normally when not streaming
2026-02-16 11:46:41 +00:00
ciaranbor
b61dc2eb35 Prevent image editing without image input (#1410)
## Motivation

Models that only support image editing (ImageToImage but not
TextToImage) would silently attempt text-to-image generation when a user
submitted a text prompt without an attached image

## Changes

- Added an early return guard in handleSubmit() that prevents submission
when the selected model only supports image editing and no image is
attached (isEditOnlyWithoutImage)
- Fixed the text-to-image generation branch to use the more specific
modelSupportsTextToImage() check instead of the broad isImageModel(),
ensuring only models with TextToImage capability trigger generation from
text alone
- The existing isEditOnlyWithoutImage derived state (which was already
used for UI hints like placeholder text and button disabling) now also
blocks the actual submit path

## Why It Works

The text-to-image fallback now correctly checks
modelSupportsTextToImage() directly, so edit-only models no longer fall
through to the generation path


## Test Plan

### Manual Testing

- Select an edit-only image model (e.g., one with only ImageToImage
capability)
- Verify the send button is disabled and placeholder reads "Attach an
image to edit..." when no image is attached
- Attach an image and verify the form becomes submittable
- Select a text-to-image model and verify text-only prompts still
trigger generation normally
- Ensure pressing `enter` doesn't bypass the check
2026-02-16 11:39:59 +00:00
rltakashige
36a7115b6f Pass usage and generation stats through all adapters correctly (#1461)
## Motivation

Exo is not returning usage stats correctly at the moment.

## Changes

- Correctly pass usage stats instead of generation stats.
- Pass usage stats within tool calls.

## Test Plan

### Manual Testing
Needs manual testing.

### Automated Testing
Passes CI.
2026-02-16 11:20:04 +00:00
Jake Hillion
0b7d88b43b python: add hermetic basedpyright typecheck to nix flake check
The existing CI typecheck job used `uv run basedpyright` which depends
on a non-hermetic uv sync step. This replaces it with a fully hermetic
typecheck as a Nix flake check using the uv2nix virtual environment.

Added a `typecheckVenv` with dev dependencies, a `linuxOverlay` to
ignore native shared library deps (NVIDIA, torch, triton, mlx) that
aren't needed at type-check time, and `passthru` preservation plus
`.pyi` stub copying on the `exo-pyo3-bindings` overlay so basedpyright
can resolve the Rust bindings types. Also guarded the `mlx` Nix build
override to macOS only since it requires Metal. Removed the old
non-hermetic `typecheck` CI job since `nix flake check` now covers it.

The hermetic check ensures type checking uses exactly the locked
dependency versions and catches type errors without requiring a
working uv/pip environment.

Test plan:
- CI (`nix flake check` runs on x86_64-linux, aarch64-linux, aarch64-darwin)
- Verified `nix build ".#checks.x86_64-linux.typecheck"` passes with 0 errors
2026-02-16 11:09:23 +00:00
121 changed files with 4846 additions and 4898 deletions


@@ -8,33 +8,6 @@ on:
- main
jobs:
typecheck:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: false
- uses: cachix/install-nix-action@v31
with:
nix_path: nixpkgs=channel:nixos-unstable
- uses: cachix/cachix-action@v14
name: Configure Cachix
with:
name: exo
authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}"
- name: Load nix develop environment
run: nix run github:nicknovitski/nix-develop/v1
- name: Sync dependencies
run: uv sync --all-packages
- name: Run type checker
run: uv run basedpyright --project pyproject.toml
nix:
name: Build and check (${{ matrix.system }})
runs-on: ${{ matrix.runner }}


@@ -200,7 +200,7 @@ class Module(dict):
) -> mx.MX_ARRAY_TREE: # -> dict[Any, Any | dict[Any, Any | dict[Any, Any] | list[Any]] | dict[Any, Any] | list[Any]]:
"""Return the submodules that do not contain other modules."""
def update(self, parameters: dict, strict: bool = ...) -> Module:
def update(self, parameters: dict[str, Any], strict: bool = ...) -> Module:
"""Replace the parameters of this Module with the provided ones in the
dict of dicts and lists.


@@ -7,7 +7,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from mlx.core import MX_ARRAY_TREE
def tree_map(
fn: Callable, tree: Any, *rest: Any, is_leaf: Optional[Callable] = ...
fn: Callable[..., Any],
tree: Any,
*rest: Any,
is_leaf: Callable[..., bool] | None = ...,
) -> Any:
"""Applies ``fn`` to the leaves of the Python tree ``tree`` and
returns a new collection with the results.
@@ -44,11 +47,11 @@ def tree_map(
"""
def tree_map_with_path(
fn: Callable,
fn: Callable[..., Any],
tree: Any,
*rest: Any,
is_leaf: Optional[Callable] = ...,
path: Optional[Any] = ...,
is_leaf: Callable[..., bool] | None = ...,
path: str | None = ...,
) -> Any:
"""Applies ``fn`` to the path and leaves of the Python tree ``tree`` and
returns a new collection with the results.
@@ -80,9 +83,9 @@ def tree_map_with_path(
def tree_flatten(
tree: Any,
prefix: str = ...,
is_leaf: Optional[Callable] = ...,
destination: Optional[Union[List[Tuple[str, Any]], Dict[str, Any]]] = ...,
) -> Union[List[Tuple[str, Any]], Dict[str, Any]]:
is_leaf: Callable[..., bool] | None = ...,
destination: list[tuple[str, Any]] | dict[str, Any] | None = ...,
) -> list[tuple[str, Any]] | dict[str, Any]:
"""Flattens a Python tree to a list of key, value tuples.
The keys are using the dot notation to define trees of arbitrary depth and
@@ -118,7 +121,7 @@ def tree_flatten(
the Python tree.
"""
def tree_unflatten(tree: Union[List[Tuple[str, Any]], Dict[str, Any]]) -> Any:
def tree_unflatten(tree: list[tuple[str, Any]] | dict[str, Any]) -> Any:
"""Recreate a Python tree from its flat representation.
.. code-block:: python


@@ -276,23 +276,24 @@ class BatchGenerator:
logprobs: mx.array
finish_reason: Optional[str]
unprocessed_prompts: List[Any]
def __init__(
self,
model: nn.Module,
model,
max_tokens: int = ...,
stop_tokens: Optional[set[int]] = ...,
stop_tokens: Optional[set] = ...,
sampler: Optional[Callable[[mx.array], mx.array]] = ...,
completion_batch_size: int = ...,
prefill_batch_size: int = ...,
prefill_step_size: int = ...,
) -> None: ...
def insert(
self, prompts: List[List[int]], max_tokens: Union[List[int], int, None] = ...
) -> List[int]: ...
def stats(self) -> BatchStats: ...
def next(self) -> List[Response]: ...
self, prompts, max_tokens: Union[List[int], int, None] = ...
): # -> list[Any]:
...
def stats(self): # -> BatchStats:
...
def next(self): # -> list[Any]:
...
def batch_generate(
model,


@@ -0,0 +1,46 @@
"""Type stubs for mlx_lm.models.glm_moe_dsa"""
from dataclasses import dataclass
from typing import Any, Dict, Optional
from .base import BaseModelArgs
from .deepseek_v32 import Model as DSV32Model
@dataclass
class ModelArgs(BaseModelArgs):
model_type: str
vocab_size: int
hidden_size: int
index_head_dim: int
index_n_heads: int
index_topk: int
intermediate_size: int
moe_intermediate_size: int
num_hidden_layers: int
num_attention_heads: int
num_key_value_heads: int
n_shared_experts: Optional[int]
n_routed_experts: Optional[int]
routed_scaling_factor: float
kv_lora_rank: int
q_lora_rank: int
qk_rope_head_dim: int
v_head_dim: int
qk_nope_head_dim: int
topk_method: str
scoring_func: str
norm_topk_prob: bool
n_group: int
topk_group: int
num_experts_per_tok: int
moe_layer_freq: int
first_k_dense_replace: int
max_position_embeddings: int
rms_norm_eps: float
rope_parameters: Dict[str, Any]
attention_bias: bool
rope_scaling: Dict[str, Any] | None
rope_theta: float | None
class Model(DSV32Model):
def __init__(self, config: ModelArgs) -> None: ...


@@ -39,11 +39,11 @@ class StreamingDetokenizer:
"""
__slots__ = ...
def reset(self) -> None: ...
def add_token(self, token: int) -> None: ...
def finalize(self) -> None: ...
def reset(self): ...
def add_token(self, token): ...
def finalize(self): ...
@property
def last_segment(self) -> str:
def last_segment(self):
"""Return the last segment of readable text since last time this property was accessed."""
class NaiveStreamingDetokenizer(StreamingDetokenizer):


@@ -116,49 +116,10 @@ From .cursorrules:
- Catch exceptions only where you can handle them meaningfully
- Use `@final` and immutability wherever applicable
## Model Storage
Downloaded models are stored in `~/.exo/models/` (not the standard HuggingFace cache location).
## Creating Model Instances via API
When testing with the API, you must first create a model instance before sending chat completions:
```bash
# 1. Get instance previews for a model
curl "http://localhost:52415/instance/previews?model_id=llama-3.2-1b"
# 2. Create an instance from the first valid preview
INSTANCE=$(curl -s "http://localhost:52415/instance/previews?model_id=llama-3.2-1b" | jq -c '.previews[] | select(.error == null) | .instance' | head -n1)
curl -X POST http://localhost:52415/instance -H 'Content-Type: application/json' -d "{\"instance\": $INSTANCE}"
# 3. Wait for the runner to become ready (check logs for "runner ready")
# 4. Send chat completions using the full model ID
curl -X POST http://localhost:52415/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model": "mlx-community/Llama-3.2-1B-Instruct-4bit", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 50}'
```
## Logs
Exo logs are stored in `~/.exo/exo.log`. This is useful for debugging runner crashes and distributed issues.
## Testing
Tests use pytest-asyncio with `asyncio_mode = "auto"`. Tests are in `tests/` subdirectories alongside the code they test. The `EXO_TESTS=1` env var is set during tests.
### Distributed Testing
When running distributed tests across multiple machines, use `EXO_LIBP2P_NAMESPACE` to isolate your test cluster from other exo instances on the same network:
```bash
# On each machine in the test cluster, use the same unique namespace
EXO_LIBP2P_NAMESPACE=my-test-cluster uv run exo
```
This prevents your test cluster from discovering and interfering with production or other developers' exo clusters.
## Dashboard UI Testing & Screenshots
### Building and Running the Dashboard

Cargo.lock (generated, 123 changed lines)

@@ -141,12 +141,6 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
[[package]]
name = "arrayvec"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "asn1-rs"
version = "0.7.1"
@@ -304,19 +298,6 @@ version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba"
[[package]]
name = "bigdecimal"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "560f42649de9fa436b73517378a147ec21f6c997a546581df4b4b31677828934"
dependencies = [
"autocfg",
"libm",
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "bimap"
version = "0.6.3"
@@ -516,15 +497,6 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f421161cb492475f1661ddc9815a745a1c894592070661180fdec3d4872e9c3"
[[package]]
name = "convert_case"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "core-foundation"
version = "0.9.4"
@@ -746,29 +718,6 @@ dependencies = [
"powerfmt",
]
[[package]]
name = "derive_more"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618"
dependencies = [
"derive_more-impl",
]
[[package]]
name = "derive_more-impl"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b"
dependencies = [
"convert_case",
"proc-macro2",
"quote",
"rustc_version",
"syn 2.0.111",
"unicode-xid",
]
[[package]]
name = "digest"
version = "0.10.7"
@@ -939,22 +888,17 @@ name = "exo_pyo3_bindings"
version = "0.0.1"
dependencies = [
"delegate",
"derive_more",
"env_logger",
"extend",
"futures",
"impl-trait-for-tuples",
"libp2p",
"log",
"networking",
"once_cell",
"pin-project",
"pyo3",
"pyo3-async-runtimes",
"pyo3-log",
"pyo3-stub-gen",
"thiserror 2.0.17",
"thread_local",
"tokio",
"util",
]
@@ -1640,17 +1584,6 @@ dependencies = [
"xmltree",
]
[[package]]
name = "impl-trait-for-tuples"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0eb5a3343abf848c0984fe4604b2b105da9539376e24fc0a3b0007411ae4fd9"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.111",
]
[[package]]
name = "indexmap"
version = "2.12.1"
@@ -1829,12 +1762,6 @@ version = "0.2.178"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091"
[[package]]
name = "libm"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de"
[[package]]
name = "libp2p"
version = "0.56.0"
@@ -2824,16 +2751,13 @@ name = "networking"
version = "0.0.1"
dependencies = [
"delegate",
"derive_more",
"either",
"extend",
"futures",
"futures-timer",
"impl-trait-for-tuples",
"keccak-const",
"libp2p",
"log",
"thiserror 2.0.17",
"tokio",
"tracing-subscriber",
"util",
@@ -2918,17 +2842,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
@@ -3279,28 +3192,14 @@ version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab53c047fcd1a1d2a8820fe84f05d6be69e9526be40cb03b73f86b6b03e6d87d"
dependencies = [
"bigdecimal",
"either",
"hashbrown 0.16.1",
"indexmap",
"indoc",
"inventory",
"libc",
"lock_api",
"memoffset",
"num-bigint",
"num-complex",
"num-rational",
"num-traits",
"once_cell",
"ordered-float",
"parking_lot",
"portable-atomic",
"pyo3-build-config",
"pyo3-ffi",
"pyo3-macros",
"rust_decimal",
"smallvec",
"unindent",
]
@@ -3741,16 +3640,6 @@ dependencies = [
"tokio",
]
[[package]]
name = "rust_decimal"
version = "1.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35affe401787a9bd846712274d97654355d21b2a2c092a3139aabe31e9022282"
dependencies = [
"arrayvec",
"num-traits",
]
[[package]]
name = "rustc-hash"
version = "1.1.0"
@@ -4615,24 +4504,12 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
[[package]]
name = "unicode-segmentation"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
[[package]]
name = "unicode-width"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "unicode_names2"
version = "1.3.0"


@@ -26,49 +26,21 @@ opt-level = 3
networking = { path = "rust/networking" }
util = { path = "rust/util" }
# Proc-macro authoring tools
syn = "2.0"
quote = "1.0"
proc-macro2 = "1.0"
darling = "0.20"
# Macro dependecies
extend = "1.2"
delegate = "0.13"
impl-trait-for-tuples = "0.2"
clap = "4.5"
derive_more = { version = "2.0.1", features = ["display"] }
pin-project = "1"
# Utility dependencies
itertools = "0.14"
thiserror = "2"
internment = "0.8"
recursion = "0.5"
regex = "1.11"
once_cell = "1.21"
thread_local = "1.1"
bon = "3.4"
generativity = "1.1"
anyhow = "1.0"
keccak-const = "0.2"
# Functional generics/lenses frameworks
frunk_core = "0.4"
frunk = "0.4"
frunk_utils = "0.2"
frunk-enum-core = "0.3"
# Async dependencies
tokio = "1.46"
futures = "0.3"
futures-util = "0.3"
futures-timer = "3.0"
# Data structures
either = "1.15"
ordered-float = "5.0"
ahash = "0.8"
# Tracing/logging
log = "0.4"


@@ -5,21 +5,21 @@
[X] Fetching download status of all models on start
[X] Deduplication of tasks in plan_step.
[X] resolve_allow_patterns should just be wildcard now.
[] no mx_barrier in genreate.py mlx_generate at the end.
[X] no mx_barrier in genreate.py mlx_generate at the end.
[] cache assertion not needed in auto_parallel.py PipelineLastLayer.
[] GPTOSS support dropped in auto_parallel.py.
[] sharding changed "all-to-sharded" became _all_to_sharded in auto_parallel.py.
[] same as above with "sharded-to-all" became _sharded_to_all in auto_parallel.py.
[] Dropped support for Ministral3Model, DeepseekV32Model, Glm4MoeModel, Qwen3NextModel, GptOssMode in auto_parallel.py.
[X] GPTOSS support dropped in auto_parallel.py.
[X] sharding changed "all-to-sharded" became _all_to_sharded in auto_parallel.py.
[X] same as above with "sharded-to-all" became _sharded_to_all in auto_parallel.py.
[X] Dropped support for Ministral3Model, DeepseekV32Model, Glm4MoeModel, Qwen3NextModel, GptOssMode in auto_parallel.py.
[] Dropped prefill/decode code in auto_parallel.py and utils_mlx.py.
[X] KV_CACHE_BITS should be None to disable quantized KV cache.
[] Dropped _set_nofile_limit in utils_mlx.py.
[] We have group optional in load_mlx_items in utils_mlx.py.
[] Dropped add_missing_chat_templates for GptOss in load_mlx_items in utils_mlx.py.
[] Dropped model.make_cache in make_kv_cache in utils_mlx.py.
[X] Dropped _set_nofile_limit in utils_mlx.py.
[X] We have group optional in load_mlx_items in utils_mlx.py.
[X] Dropped add_missing_chat_templates for GptOss in load_mlx_items in utils_mlx.py.
[X] Dropped model.make_cache in make_kv_cache in utils_mlx.py.
[X] We put cache limit back in utils_mlx.py.
[] topology.py remove_node removes the connections after checking if node is is in self._node_id_to_rx_id_map. on beta_1 it checks after, so would remove stale connections I guess?
[] Missing Glm 4.7 model cards (this isn't ready yet but should be picked up, probably create an issue... the blocker is transforemrs version doesn't support the tokenizer for Glm 4.7. rc-1 does but we can't upgrade as it breaks other things.)
[X] topology.py remove_node removes the connections after checking if node is is in self._node_id_to_rx_id_map. on beta_1 it checks after, so would remove stale connections I guess?
[X] Missing Glm 4.7 model cards (this isn't ready yet but should be picked up, probably create an issue... the blocker is transforemrs version doesn't support the tokenizer for Glm 4.7. rc-1 does but we can't upgrade as it breaks other things.)
[] try-except in _command_processor only excepts ValueError. This was silently failing leading to un-debuggable errors (we had a KeyError that was happening ). Changed this to catch Exception instead of ValueError. See exo-v2 89ae38405e0052e3c22405daf094b065878aa873 and fb99fea69b5a39017efc90c5dad0072e677455f0.
[X] In placement.py, place_instance no longer looks at model_meta.supports_tensor and check if this tensor parallel number of nodes is supported by the model's tensor dimensions.
[X] In placement.py, place_instanec, we no longer have the special case to exclude DeepSeek v3.1 pipeline parallel (it doesn't work).


@@ -72,16 +72,23 @@ There are two ways to run exo:
### Run from Source (macOS)
If you have [Nix](https://nixos.org/) installed, you can skip most of the steps below and run exo directly (after accepting the Cachix cache):
```bash
nix run .#exo
```
**Prerequisites:**
- [Xcode](https://developer.apple.com/xcode/) (provides the Metal ToolChain required for MLX compilation)
- [brew](https://github.com/Homebrew/brew) (for simple package management on macOS)
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
```
- [uv](https://github.com/astral-sh/uv) (for Python dependency management)
- [macmon](https://github.com/vladkens/macmon) (for hardware monitoring on Apple Silicon)
- [node](https://github.com/nodejs/node) (for building the dashboard)
```bash
brew install uv macmon node
```


@@ -126,11 +126,37 @@ final class ExoProcessController: ObservableObject {
return
}
process.terminationHandler = nil
if process.isRunning {
process.terminate()
}
self.process = nil
status = .stopped
guard process.isRunning else {
self.process = nil
return
}
let proc = process
self.process = nil
Task.detached {
proc.interrupt()
for _ in 0..<50 {
if !proc.isRunning { return }
try? await Task.sleep(nanoseconds: 100_000_000)
}
if proc.isRunning {
proc.terminate()
}
for _ in 0..<30 {
if !proc.isRunning { return }
try? await Task.sleep(nanoseconds: 100_000_000)
}
if proc.isRunning {
kill(proc.processIdentifier, SIGKILL)
}
}
}
func restart() {

bench/bench.toml (new file, 7 lines)

@@ -0,0 +1,7 @@
# Canary benchmark manifest
#
# Lists the suite files to include. Each file defines benchmarks
# with shared constraints, topology, and default args.
include = [
"single-m3-ultra.toml",
]

bench/eval_tool_calls.py (new file, 1046 lines)

File diff suppressed because it is too large.


@@ -1,29 +1,47 @@
# type: ignore
#!/usr/bin/env python3
# pyright: reportAny=false, reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false
"""Tool-calling eval for exo's OpenAI-compatible API.
Tests whether models correctly:
- Trigger tool calls when appropriate
- Return valid JSON arguments matching function schemas
- Handle multi-turn tool use (call -> result -> final answer)
- Avoid calling tools when unnecessary
Start exo with a model first, then run:
uv run python tool_call_eval.py --model <model-id>
uv run python tool_call_eval.py --model <model-id> --host 10.0.0.5 --port 52415
uv run python tool_call_eval.py --model <model-id> --repeat 3
uv run python tool_call_eval.py --model <model-id> --scenarios weather_simple calculator_multi_turn
"""
from __future__ import annotations
import argparse
import contextlib
import http.client
import itertools
import json
import os
import sys
import time
from collections.abc import Callable
from pathlib import Path
from statistics import mean
from typing import Any
from urllib.parse import urlencode
from harness import (
ExoClient,
ExoHttpError,
add_common_instance_args,
instance_id_from_instance,
nodes_used_in_instance,
resolve_model_short_id,
settle_and_fetch_placements,
wait_for_instance_gone,
wait_for_instance_ready,
)
from loguru import logger
from transformers import AutoTokenizer
# Backoff constants for cluster settling retry
_SETTLE_INITIAL_BACKOFF_S = 1.0
_SETTLE_MAX_BACKOFF_S = 60.0
_SETTLE_BACKOFF_MULTIPLIER = 2.0
# Monkey-patch for transformers 5.x compatibility
# Kimi's tokenization_kimi.py imports bytes_to_unicode from the old location
# which was moved in transformers 5.0.0rc2
@@ -103,154 +121,6 @@ def load_tokenizer_for_bench(model_id: str) -> Any:
return AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
class ExoHttpError(RuntimeError):
def __init__(self, status: int, reason: str, body_preview: str):
super().__init__(f"HTTP {status} {reason}: {body_preview}")
self.status = status
class ExoClient:
def __init__(self, host: str, port: int, timeout_s: float = 7200.0):
self.host = host
self.port = port
self.timeout_s = timeout_s
def request_json(
self,
method: str,
path: str,
params: dict[str, Any] | None = None,
body: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
) -> Any:
if not path.startswith("/"):
path = "/" + path
if params:
path = path + "?" + urlencode(params)
conn = http.client.HTTPConnection(self.host, self.port, timeout=self.timeout_s)
try:
payload: bytes | None = None
hdrs: dict[str, str] = {"Accept": "application/json"}
if body is not None:
payload = json.dumps(body).encode("utf-8")
hdrs["Content-Type"] = "application/json"
if headers:
hdrs.update(headers)
conn.request(method.upper(), path, body=payload, headers=hdrs)
resp = conn.getresponse()
raw = resp.read()
text = raw.decode("utf-8", errors="replace") if raw else ""
if resp.status >= 400:
raise ExoHttpError(resp.status, resp.reason, text[:300])
if not text:
return None
return json.loads(text)
finally:
conn.close()
def post_bench_chat_completions(self, payload: dict[str, Any]) -> dict[str, Any]:
return self.request_json("POST", "/bench/chat/completions", body=payload)
def unwrap_instance(instance: dict[str, Any]) -> dict[str, Any]:
if len(instance) != 1:
raise KeyError(f"Expected 1 key, got keys={list(instance.keys())}")
tag = next(iter(instance))
inner = instance[tag]
if not isinstance(inner, dict):
raise TypeError(f"payload for {tag} must be dict, got {type(inner)}")
return inner
def instance_id_from_instance(instance: dict[str, Any]) -> str:
inner = unwrap_instance(instance)
return str(inner["instanceId"])
def nodes_used_in_instance(instance: dict[str, Any]) -> int:
inner = unwrap_instance(instance)
return len(inner["shardAssignments"]["nodeToRunner"])
def runner_ids_from_instance(instance: dict[str, Any]) -> list[str]:
inner = unwrap_instance(instance)
runner_to_shard = inner["shardAssignments"]["runnerToShard"]
return list(runner_to_shard.keys())
def runner_ready(runner: dict[str, Any]) -> bool:
return "RunnerReady" in runner
def runner_failed(runner: dict[str, Any]) -> bool:
return "RunnerFailed" in runner
def get_runner_failed_message(runner: dict[str, Any]) -> str | None:
if "RunnerFailed" in runner:
return runner["RunnerFailed"].get("errorMessage")
return None
def wait_for_instance_ready(
client: ExoClient, instance_id: str, timeout: float = 24000.0
) -> None:
start_time = time.time()
instance_existed = False
while time.time() - start_time < timeout:
state = client.request_json("GET", "/state")
instances = state.get("instances", {})
if instance_id not in instances:
if instance_existed:
# Instance was deleted after being created - likely due to runner failure
raise RuntimeError(
f"Instance {instance_id} was deleted (runner may have failed)"
)
time.sleep(0.1)
continue
instance_existed = True
instance = instances[instance_id]
runner_ids = runner_ids_from_instance(instance)
runners = state.get("runners", {})
# Check for failed runners first
for rid in runner_ids:
runner = runners.get(rid, {})
if runner_failed(runner):
error_msg = get_runner_failed_message(runner) or "Unknown error"
raise RuntimeError(f"Runner {rid} failed: {error_msg}")
if all(runner_ready(runners.get(rid, {})) for rid in runner_ids):
return
time.sleep(0.1)
raise TimeoutError(f"Instance {instance_id} did not become ready within {timeout=}")
def wait_for_instance_gone(
client: ExoClient, instance_id: str, timeout: float = 3.0
) -> None:
start_time = time.time()
while time.time() - start_time < timeout:
try:
client.request_json("GET", f"/instance/{instance_id}")
time.sleep(0.4)
except ExoHttpError as e:
if e.status == 404:
return
raise TimeoutError(f"Instance {instance_id} did not get deleted within {timeout=}")
def format_peak_memory(b: float) -> str:
for unit in ["B", "KB", "MB", "GB", "TB"]:
if b < 1024.0:
@@ -269,39 +139,6 @@ def parse_int_list(values: list[str]) -> list[int]:
return items
def resolve_model_short_id(client: ExoClient, model_arg: str) -> tuple[str, str]:
models = client.request_json("GET", "/models") or {}
data = models.get("data") or []
for m in data:
if m.get("name").lower() == model_arg.lower():
short_id = str(m["name"])
full_id = str(m.get("hugging_face_id") or m["name"])
return short_id, full_id
for m in data:
if m.get("hugging_face_id") == model_arg:
short_id = str(m["name"])
full_id = str(m["hugging_face_id"])
return short_id, full_id
raise ValueError(f"Model not found in /models: {model_arg}")
def placement_filter(instance_meta: str, wanted: str) -> bool:
s = (instance_meta or "").lower()
if wanted == "both":
return ("ring" in s) or ("jaccl" in s)
return wanted in s
def sharding_filter(sharding: str, wanted: str) -> bool:
s = (sharding or "").lower()
if wanted == "both":
return ("pipeline" in s) or ("tensor" in s)
return wanted in s
def run_one_completion(
client: ExoClient, model_id: str, pp_hint: int, tg: int, prompt_sizer: PromptSizer
) -> tuple[dict[str, Any], int]:
@@ -393,76 +230,12 @@ class PromptSizer:
return content, tok
def fetch_and_filter_placements(
client: ExoClient, full_model_id: str, args: argparse.Namespace
) -> list[dict[str, Any]]:
previews_resp = client.request_json(
"GET", "/instance/previews", params={"model_id": full_model_id}
)
previews = previews_resp.get("previews") or []
selected: list[dict[str, Any]] = []
for p in previews:
if p.get("error") is not None:
continue
if not placement_filter(str(p.get("instance_meta", "")), args.instance_meta):
continue
if not sharding_filter(str(p.get("sharding", "")), args.sharding):
continue
instance = p.get("instance")
if not isinstance(instance, dict):
continue
n = nodes_used_in_instance(instance)
# Skip single-node tensor or jaccl placements, as they duplicate the single-node pipeline ring case
if n == 1 and (
(args.sharding == "both" and "tensor" in p.get("sharding", "").lower())
or (
args.instance_meta == "both"
and "jaccl" in p.get("instance_meta", "").lower()
)
):
continue
if (
args.skip_pipeline_jaccl
and (
args.instance_meta == "both"
and "jaccl" in p.get("instance_meta", "").lower()
)
and (
args.sharding == "both" and "pipeline" in p.get("sharding", "").lower()
)
):
continue
if (
args.skip_tensor_ring
and (
args.instance_meta == "both"
and "ring" in p.get("instance_meta", "").lower()
)
and (args.sharding == "both" and "tensor" in p.get("sharding", "").lower())
):
continue
if args.min_nodes <= n <= args.max_nodes:
selected.append(p)
return selected
def main() -> int:
ap = argparse.ArgumentParser(
prog="exo-bench",
description="Benchmark exo model throughput across placement previews.",
)
ap.add_argument("--host", default=os.environ.get("EXO_HOST", "localhost"))
ap.add_argument(
"--port", type=int, default=int(os.environ.get("EXO_PORT", "52415"))
)
ap.add_argument("--model", required=True, help="Model short id or huggingface id")
add_common_instance_args(ap)
ap.add_argument(
"--pp",
nargs="+",
@@ -475,34 +248,6 @@ def main() -> int:
required=True,
help="Generation lengths (ints). Accepts commas.",
)
ap.add_argument(
"--max-nodes",
type=int,
default=4,
help="Only consider placements using <= this many nodes.",
)
ap.add_argument(
"--min-nodes",
type=int,
default=1,
help="Only consider placements using >= this many nodes.",
)
ap.add_argument(
"--instance-meta", choices=["ring", "jaccl", "both"], default="both"
)
ap.add_argument(
"--sharding", choices=["pipeline", "tensor", "both"], default="both"
)
ap.add_argument(
"--skip-pipeline-jaccl",
action="store_true",
help="Skip pipeline+jaccl placements, as it's often pointless.",
)
ap.add_argument(
"--skip-tensor-ring",
action="store_true",
help="Skip tensor+ring placements, as it's so slow.",
)
ap.add_argument(
"--repeat", type=int, default=1, help="Repetitions per (pp,tg) pair."
)
@@ -512,9 +257,6 @@ def main() -> int:
default=0,
help="Warmup runs per placement (uses first pp/tg).",
)
ap.add_argument(
"--timeout", type=float, default=7200.0, help="HTTP timeout (seconds)."
)
ap.add_argument(
"--json-out",
default="bench/results.json",
@@ -529,12 +271,6 @@ def main() -> int:
action="store_true",
help="Force all pp×tg combinations (cartesian product) even when lists have equal length.",
)
ap.add_argument(
"--settle-timeout",
type=float,
default=0,
help="Max seconds to wait for the cluster to produce valid placements (0 = try once).",
)
args = ap.parse_args()
pp_list = parse_int_list(args.pp)
@@ -569,20 +305,9 @@ def main() -> int:
logger.error("[exo-bench] tokenizer usable but prompt sizing failed")
raise
selected = fetch_and_filter_placements(client, full_model_id, args)
if not selected and args.settle_timeout > 0:
backoff = _SETTLE_INITIAL_BACKOFF_S
deadline = time.monotonic() + args.settle_timeout
while not selected and time.monotonic() < deadline:
remaining = deadline - time.monotonic()
logger.warning(
f"No valid placements yet (cluster may still be settling). "
f"Retrying in {backoff:.1f}s ({remaining:.0f}s remaining)..."
)
time.sleep(min(backoff, remaining))
backoff = min(backoff * _SETTLE_BACKOFF_MULTIPLIER, _SETTLE_MAX_BACKOFF_S)
selected = fetch_and_filter_placements(client, full_model_id, args)
selected = settle_and_fetch_placements(
client, full_model_id, args, settle_timeout=args.settle_timeout
)
if not selected:
logger.error("No valid placements matched your filters.")

bench/harness.py Normal file
View File

@@ -0,0 +1,327 @@
# type: ignore
from __future__ import annotations
import argparse
import http.client
import json
import os
import time
from typing import Any
from urllib.parse import urlencode
from loguru import logger
_SETTLE_INITIAL_BACKOFF_S = 1.0
_SETTLE_MAX_BACKOFF_S = 60.0
_SETTLE_BACKOFF_MULTIPLIER = 2.0
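# With these defaults the retry waits grow as 1s, 2s, 4s, 8s, 16s, 32s and then
# hold at 60s; each wait is additionally clamped to the time left in the settle
# budget (see settle_and_fetch_placements below).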
class ExoHttpError(RuntimeError):
def __init__(self, status: int, reason: str, body_preview: str):
super().__init__(f"HTTP {status} {reason}: {body_preview}")
self.status = status
class ExoClient:
def __init__(self, host: str, port: int, timeout_s: float = 7200.0):
self.host = host
self.port = port
self.timeout_s = timeout_s
def request_json(
self,
method: str,
path: str,
params: dict[str, Any] | None = None,
body: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
) -> Any:
if not path.startswith("/"):
path = "/" + path
if params:
path = path + "?" + urlencode(params)
conn = http.client.HTTPConnection(self.host, self.port, timeout=self.timeout_s)
try:
payload: bytes | None = None
hdrs: dict[str, str] = {"Accept": "application/json"}
if body is not None:
payload = json.dumps(body).encode("utf-8")
hdrs["Content-Type"] = "application/json"
if headers:
hdrs.update(headers)
conn.request(method.upper(), path, body=payload, headers=hdrs)
resp = conn.getresponse()
raw = resp.read()
text = raw.decode("utf-8", errors="replace") if raw else ""
if resp.status >= 400:
raise ExoHttpError(resp.status, resp.reason, text[:300])
if not text:
return None
return json.loads(text)
finally:
conn.close()
def post_bench_chat_completions(self, payload: dict[str, Any]) -> dict[str, Any]:
return self.request_json("POST", "/bench/chat/completions", body=payload)
def unwrap_instance(instance: dict[str, Any]) -> dict[str, Any]:
if len(instance) != 1:
raise KeyError(f"Expected 1 key, got keys={list(instance.keys())}")
tag = next(iter(instance))
inner = instance[tag]
if not isinstance(inner, dict):
raise TypeError(f"payload for {tag} must be dict, got {type(inner)}")
return inner
def instance_id_from_instance(instance: dict[str, Any]) -> str:
inner = unwrap_instance(instance)
return str(inner["instanceId"])
def nodes_used_in_instance(instance: dict[str, Any]) -> int:
inner = unwrap_instance(instance)
return len(inner["shardAssignments"]["nodeToRunner"])
def runner_ids_from_instance(instance: dict[str, Any]) -> list[str]:
inner = unwrap_instance(instance)
runner_to_shard = inner["shardAssignments"]["runnerToShard"]
return list(runner_to_shard.keys())
def runner_ready(runner: dict[str, Any]) -> bool:
return "RunnerReady" in runner
def runner_failed(runner: dict[str, Any]) -> bool:
return "RunnerFailed" in runner
def get_runner_failed_message(runner: dict[str, Any]) -> str | None:
if "RunnerFailed" in runner:
return runner["RunnerFailed"].get("errorMessage")
return None
def wait_for_instance_ready(
client: ExoClient, instance_id: str, timeout: float = 24000.0
) -> None:
start_time = time.time()
instance_existed = False
while time.time() - start_time < timeout:
state = client.request_json("GET", "/state")
instances = state.get("instances", {})
if instance_id not in instances:
if instance_existed:
# Instance was deleted after being created - likely due to runner failure
raise RuntimeError(
f"Instance {instance_id} was deleted (runner may have failed)"
)
time.sleep(0.1)
continue
instance_existed = True
instance = instances[instance_id]
runner_ids = runner_ids_from_instance(instance)
runners = state.get("runners", {})
# Check for failed runners first
for rid in runner_ids:
runner = runners.get(rid, {})
if runner_failed(runner):
error_msg = get_runner_failed_message(runner) or "Unknown error"
raise RuntimeError(f"Runner {rid} failed: {error_msg}")
if all(runner_ready(runners.get(rid, {})) for rid in runner_ids):
return
time.sleep(0.1)
raise TimeoutError(f"Instance {instance_id} did not become ready within {timeout=}")
def wait_for_instance_gone(
client: ExoClient, instance_id: str, timeout: float = 3.0
) -> None:
start_time = time.time()
while time.time() - start_time < timeout:
try:
client.request_json("GET", f"/instance/{instance_id}")
time.sleep(0.4)
except ExoHttpError as e:
if e.status == 404:
return
raise
raise TimeoutError(f"Instance {instance_id} did not get deleted within {timeout=}")
def resolve_model_short_id(client: ExoClient, model_arg: str) -> tuple[str, str]:
models = client.request_json("GET", "/models") or {}
data = models.get("data") or []
for m in data:
if (m.get("name") or "").lower() == model_arg.lower():
short_id = str(m["name"])
full_id = str(m.get("hugging_face_id") or m["name"])
return short_id, full_id
for m in data:
if m.get("hugging_face_id") == model_arg:
short_id = str(m["name"])
full_id = str(m["hugging_face_id"])
return short_id, full_id
raise ValueError(f"Model not found in /models: {model_arg}")
def placement_filter(instance_meta: str, wanted: str) -> bool:
s = (instance_meta or "").lower()
if wanted == "both":
return ("ring" in s) or ("jaccl" in s)
return wanted in s
def sharding_filter(sharding: str, wanted: str) -> bool:
s = (sharding or "").lower()
if wanted == "both":
return ("pipeline" in s) or ("tensor" in s)
return wanted in s
def fetch_and_filter_placements(
client: ExoClient, full_model_id: str, args: argparse.Namespace
) -> list[dict[str, Any]]:
previews_resp = client.request_json(
"GET", "/instance/previews", params={"model_id": full_model_id}
)
previews = previews_resp.get("previews") or []
selected: list[dict[str, Any]] = []
for p in previews:
if p.get("error") is not None:
continue
if not placement_filter(str(p.get("instance_meta", "")), args.instance_meta):
continue
if not sharding_filter(str(p.get("sharding", "")), args.sharding):
continue
instance = p.get("instance")
if not isinstance(instance, dict):
continue
n = nodes_used_in_instance(instance)
# Skip single-node tensor or jaccl placements, as they duplicate the single-node pipeline ring case
if n == 1 and (
(args.sharding == "both" and "tensor" in p.get("sharding", "").lower())
or (
args.instance_meta == "both"
and "jaccl" in p.get("instance_meta", "").lower()
)
):
continue
if (
args.skip_pipeline_jaccl
and (
args.instance_meta == "both"
and "jaccl" in p.get("instance_meta", "").lower()
)
and (
args.sharding == "both" and "pipeline" in p.get("sharding", "").lower()
)
):
continue
if (
args.skip_tensor_ring
and (
args.instance_meta == "both"
and "ring" in p.get("instance_meta", "").lower()
)
and (args.sharding == "both" and "tensor" in p.get("sharding", "").lower())
):
continue
if args.min_nodes <= n <= args.max_nodes:
selected.append(p)
return selected
def settle_and_fetch_placements(
client: ExoClient,
full_model_id: str,
args: argparse.Namespace,
settle_timeout: float = 0,
) -> list[dict[str, Any]]:
selected = fetch_and_filter_placements(client, full_model_id, args)
if not selected and settle_timeout > 0:
backoff = _SETTLE_INITIAL_BACKOFF_S
deadline = time.monotonic() + settle_timeout
while not selected and time.monotonic() < deadline:
remaining = deadline - time.monotonic()
logger.warning(
f"No valid placements yet (cluster may still be settling). "
f"Retrying in {backoff:.1f}s ({remaining:.0f}s remaining)..."
)
time.sleep(min(backoff, remaining))
backoff = min(backoff * _SETTLE_BACKOFF_MULTIPLIER, _SETTLE_MAX_BACKOFF_S)
selected = fetch_and_filter_placements(client, full_model_id, args)
return selected
def add_common_instance_args(ap: argparse.ArgumentParser) -> None:
ap.add_argument("--host", default=os.environ.get("EXO_HOST", "localhost"))
ap.add_argument(
"--port", type=int, default=int(os.environ.get("EXO_PORT", "52415"))
)
ap.add_argument("--model", required=True, help="Model short id or huggingface id")
ap.add_argument(
"--max-nodes",
type=int,
default=4,
help="Only consider placements using <= this many nodes.",
)
ap.add_argument(
"--min-nodes",
type=int,
default=1,
help="Only consider placements using >= this many nodes.",
)
ap.add_argument(
"--instance-meta", choices=["ring", "jaccl", "both"], default="both"
)
ap.add_argument(
"--sharding", choices=["pipeline", "tensor", "both"], default="both"
)
ap.add_argument(
"--skip-pipeline-jaccl",
action="store_true",
help="Skip pipeline+jaccl placements, as it's often pointless.",
)
ap.add_argument(
"--skip-tensor-ring",
action="store_true",
help="Skip tensor+ring placements, as it's so slow.",
)
ap.add_argument(
"--timeout", type=float, default=7200.0, help="HTTP timeout (seconds)."
)
ap.add_argument(
"--settle-timeout",
type=float,
default=0,
help="Max seconds to wait for the cluster to produce valid placements (0 = try once).",
)
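The helpers above give external scripts a small, reusable client surface. A minimal driver along the following lines (hypothetical, not part of the benchmark suite) shows how exo_bench.py is expected to wire them together, assuming an exo API node is reachable on the configured host and port:
# Hypothetical usage sketch of bench/harness.py; simplified from exo_bench.py.
import argparse

from harness import (
    ExoClient,
    add_common_instance_args,
    resolve_model_short_id,
    settle_and_fetch_placements,
)

ap = argparse.ArgumentParser(prog="harness-demo")
add_common_instance_args(ap)  # adds --host/--port/--model plus placement filters
args = ap.parse_args()

client = ExoClient(args.host, args.port, timeout_s=args.timeout)
short_id, full_id = resolve_model_short_id(client, args.model)
placements = settle_and_fetch_placements(
    client, full_id, args, settle_timeout=args.settle_timeout
)
print(f"{short_id}: {len(placements)} placement preview(s) matched the filters")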

View File

@@ -4,6 +4,7 @@ version = "0.1.0"
description = "Benchmarking tool for exo distributed inference"
requires-python = ">=3.13"
dependencies = [
"httpx>=0.27.0",
"loguru>=0.7.3",
"transformers>=5.0.0",
"huggingface-hub>=0.33.4",

bench/scenarios.toml Normal file
View File

@@ -0,0 +1,240 @@
# Tool definitions — each becomes an OpenAI function tool.
# All scenarios get all tools unless they specify a `tools` list.
[tools.get_current_weather]
description = "Get the current weather in a given location"
required = ["location"]
[tools.get_current_weather.properties.location]
type = "string"
description = "City and state, e.g. San Francisco, CA"
[tools.get_current_weather.properties.unit]
type = "string"
enum = ["celsius", "fahrenheit"]
description = "Temperature unit"
[tools.calculate]
description = "Evaluate a mathematical expression and return the numeric result"
required = ["expression"]
[tools.calculate.properties.expression]
type = "string"
description = "The math expression to evaluate, e.g. '2 + 3 * 4'"
[tools.search_products]
description = "Search for products in a catalog by query, category, and price"
required = ["query"]
[tools.search_products.properties.query]
type = "string"
description = "Search query string"
[tools.search_products.properties.category]
type = "string"
enum = ["electronics", "clothing", "food", "books"]
description = "Product category to filter by"
[tools.search_products.properties.max_price]
type = "number"
description = "Maximum price in USD"
# -- Should call a tool --
[[scenarios]]
name = "weather_simple"
description = "Basic weather query -> get_current_weather"
expect_tool_call = true
expected_function = "get_current_weather"
required_arg_keys = ["location"]
[[scenarios.messages]]
role = "user"
content = "What's the weather like in Tokyo right now?"
[[scenarios]]
name = "calculator_simple"
description = "Math question -> calculate"
expect_tool_call = true
expected_function = "calculate"
required_arg_keys = ["expression"]
[[scenarios.messages]]
role = "user"
content = "Use the calculator to compute 3847 * 926 + 17293"
[[scenarios]]
name = "search_with_filters"
description = "Product search with category and price filter"
expect_tool_call = true
expected_function = "search_products"
required_arg_keys = ["query"]
[[scenarios.messages]]
role = "user"
content = "Find me electronics under $50"
# -- Multi-turn: tool call then follow-up --
[[scenarios]]
name = "weather_multi_turn"
description = "Weather query -> tool result -> natural language summary"
expect_tool_call = true
expected_function = "get_current_weather"
required_arg_keys = ["location"]
[scenarios.tool_result]
temperature = "18C"
condition = "partly cloudy"
humidity = "65%"
wind = "12 km/h NW"
[[scenarios.messages]]
role = "user"
content = "What's the weather in Paris?"
[[scenarios]]
name = "calculator_multi_turn"
description = "Math query -> tool result -> model reports the answer"
expect_tool_call = true
expected_function = "calculate"
required_arg_keys = ["expression"]
[scenarios.tool_result]
result = 491682
[[scenarios.messages]]
role = "user"
content = "Use the calculator to compute 1847 * 263 + 5921"
[[scenarios]]
name = "search_multi_turn"
description = "Search query -> tool result -> model summarizes products"
expect_tool_call = true
expected_function = "search_products"
required_arg_keys = ["query"]
[[scenarios.tool_result.results]]
name = "Hands-On Machine Learning"
price = 45.99
rating = 4.8
[[scenarios.tool_result.results]]
name = "Deep Learning with Python"
price = 39.99
rating = 4.6
[[scenarios.messages]]
role = "user"
content = "Search for books about machine learning"
# -- Sequential tool calls --
[[scenarios]]
name = "chained_tool_calls_same"
description = "Thinking + weather(Tokyo) -> result -> model must call weather(London)"
expect_tool_call = true
expected_function = "get_current_weather"
required_arg_keys = ["location"]
[[scenarios.messages]]
role = "user"
content = "Compare the weather in Tokyo and London."
[[scenarios.messages]]
role = "assistant"
content = "I'll check both cities. Let me start with Tokyo."
[[scenarios.messages.tool_calls]]
id = "call_1"
name = "get_current_weather"
arguments = { location = "Tokyo" }
[[scenarios.messages]]
role = "tool"
tool_call_id = "call_1"
content = '{"temperature": "25C", "condition": "sunny"}'
[[scenarios]]
name = "chained_tool_calls_different"
description = "Thinking + weather(Berlin) -> result -> model must call calculator"
expect_tool_call = true
expected_function = "calculate"
required_arg_keys = ["expression"]
[[scenarios.messages]]
role = "user"
content = "What's the weather in Berlin, and also use the calculator to compute 4819 * 37 + 291."
[[scenarios.messages]]
role = "assistant"
content = "I'll handle both. Let me check Berlin's weather first."
[[scenarios.messages.tool_calls]]
id = "call_2"
name = "get_current_weather"
arguments = { location = "Berlin" }
[[scenarios.messages]]
role = "tool"
tool_call_id = "call_2"
content = '{"temperature": "12C", "condition": "rainy"}'
[[scenarios]]
name = "chained_tool_calls_three"
description = "Two prior thinking+tool calls -> results -> model must make a third"
expect_tool_call = true
expected_function = "get_current_weather"
required_arg_keys = ["location"]
[[scenarios.messages]]
role = "user"
content = "Compare weather in Tokyo, Paris, and London."
[[scenarios.messages]]
role = "assistant"
content = "I'll check all three cities. Starting with Tokyo."
[[scenarios.messages.tool_calls]]
id = "call_3"
name = "get_current_weather"
arguments = { location = "Tokyo" }
[[scenarios.messages]]
role = "tool"
tool_call_id = "call_3"
content = '{"temperature": "25C", "condition": "sunny"}'
[[scenarios.messages]]
role = "assistant"
content = "Got Tokyo. Now checking Paris."
[[scenarios.messages.tool_calls]]
id = "call_4"
name = "get_current_weather"
arguments = { location = "Paris" }
[[scenarios.messages]]
role = "tool"
tool_call_id = "call_4"
content = '{"temperature": "18C", "condition": "cloudy"}'
# -- Should NOT call a tool --
[[scenarios]]
name = "no_tool_joke"
description = "Joke request should NOT trigger any tool"
expect_tool_call = false
[[scenarios.messages]]
role = "user"
content = "Tell me a funny joke about cats."
[[scenarios]]
name = "no_tool_factual"
description = "Factual question answerable from training data"
expect_tool_call = false
[[scenarios.messages]]
role = "user"
content = "What is the capital of Japan?"

bench/single-m3-ultra.toml Normal file
View File

@@ -0,0 +1,189 @@
# Single-node M3 Ultra benchmarks
#
# Shared constraints applied to ALL benchmarks in this file.
constraints = [
"All(MacOsBuild(=25D125))",
"Hosts(=1)",
"All(Chip(m3_ultra))",
"All(GpuCores(=80))",
]
[topology]
type = "none"
# Default args merged into each benchmark's args (benchmark-level args win).
[defaults]
pp = [512, 2048, 8192, 16384]
tg = 128
[[benchmark]]
model = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/gpt-oss-120b-MXFP4-Q8"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-Flash-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-6bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-30B-A3B-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-0.6B-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-0.6B-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.2-1B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.2-3B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.2-3B-Instruct-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/gpt-oss-20b-MXFP4-Q8"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-30B-A3B-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-Flash-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-Flash-5bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-Flash-6bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.3-70B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-5bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.3-70B-Instruct-8bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/llama-3.3-70b-instruct-fp16"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.5-Air-8bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.5-Air-bf16"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-4bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/MiniMax-M2.1-3bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/MiniMax-M2.1-8bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-bf16"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Step-3.5-Flash-4bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Step-3.5-Flash-6bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Step-3.5-Flash-8Bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/DeepSeek-V3.1-4bit"
extra_constraints = ["All(Memory(>=512GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-6bit"
extra_constraints = ["All(Memory(>=512GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-8bit-gs32"
extra_constraints = ["All(Memory(>=512GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
extra_constraints = ["All(Memory(>=512GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
extra_constraints = ["All(Memory(>=512GiB))"]

View File

@@ -1 +0,0 @@
collect_ignore = ["tests/start_distributed_test.py"]

View File

@@ -103,7 +103,7 @@
const modelSupportsThinking = $derived(() => {
if (!currentModel) return false;
const caps = modelCapabilities[currentModel] || [];
return caps.includes("thinking") && caps.includes("text");
return caps.includes("thinking_toggle") && caps.includes("text");
});
const isEditOnlyWithoutImage = $derived(
@@ -265,6 +265,7 @@
function handleSubmit() {
if ((!message.trim() && uploadedFiles.length === 0) || loading) return;
if (isEditOnlyWithoutImage) return;
const content = message.trim();
const files = [...uploadedFiles];
@@ -289,7 +290,11 @@
if (imageFile.preview) {
editImage(content, imageFile.preview);
}
} else if (isImageModel() && content) {
} else if (
currentModel &&
modelSupportsTextToImage(currentModel) &&
content
) {
// Use image generation for text-to-image models
generateImage(content);
} else {

View File

@@ -225,6 +225,7 @@
}
function handleDeleteClick(messageId: string) {
if (loading) return;
deleteConfirmId = messageId;
}
@@ -255,7 +256,7 @@
</script>
<div class="flex flex-col gap-4 sm:gap-6 {className}">
{#each messageList as message (message.id)}
{#each messageList as message, i (message.id)}
<div
class="group flex {message.role === 'user'
? 'justify-end'
@@ -317,9 +318,11 @@
<!-- Delete confirmation -->
<div class="bg-red-500/10 border border-red-500/30 rounded-lg p-3">
<p class="text-xs text-red-400 mb-3">
Delete this message{message.role === "user"
? " and all responses after it"
: ""}?
{#if i === messageList.length - 1}
Delete this message?
{:else}
Delete this message and all messages after it?
{/if}
</p>
<div class="flex gap-2 justify-end">
<button
@@ -751,8 +754,13 @@
<!-- Delete button -->
<button
onclick={() => handleDeleteClick(message.id)}
class="p-1.5 text-exo-light-gray hover:text-red-400 transition-colors rounded hover:bg-red-500/10 cursor-pointer"
title="Delete message"
disabled={loading}
class="p-1.5 transition-colors rounded {loading
? 'text-exo-light-gray/30 cursor-not-allowed'
: 'text-exo-light-gray hover:text-red-400 hover:bg-red-500/10 cursor-pointer'}"
title={loading
? "Cannot delete while generating"
: "Delete message"}
>
<svg
class="w-3.5 h-3.5"

View File

@@ -59,13 +59,14 @@
}
const sizeOptions: ImageGenerationParams["size"][] = [
"auto",
"512x512",
"768x768",
"1024x1024",
"1024x768",
"768x1024",
"1024x1365",
"1365x1024",
"1024x1536",
"1536x1024",
];
const qualityOptions: ImageGenerationParams["quality"][] = [
@@ -176,92 +177,90 @@
<div class="border-b border-exo-medium-gray/30 px-3 py-2">
<!-- Basic params row -->
<div class="flex items-center gap-3 flex-wrap">
<!-- Size (hidden in edit mode - output size comes from input image) -->
{#if !isEditMode}
<div class="flex items-center gap-1.5">
<span class="text-xs text-exo-light-gray uppercase tracking-wider"
>SIZE:</span
<!-- Size -->
<div class="flex items-center gap-1.5">
<span class="text-xs text-exo-light-gray uppercase tracking-wider"
>SIZE:</span
>
<div class="relative">
<button
bind:this={sizeButtonRef}
type="button"
onclick={() => (isSizeDropdownOpen = !isSizeDropdownOpen)}
class="bg-exo-medium-gray/50 border border-exo-yellow/30 rounded pl-2 pr-6 py-1 text-xs font-mono text-exo-yellow cursor-pointer transition-all duration-200 hover:border-exo-yellow/50 focus:outline-none focus:border-exo-yellow/70 {isSizeDropdownOpen
? 'border-exo-yellow/70'
: ''}"
>
<div class="relative">
<button
bind:this={sizeButtonRef}
type="button"
onclick={() => (isSizeDropdownOpen = !isSizeDropdownOpen)}
class="bg-exo-medium-gray/50 border border-exo-yellow/30 rounded pl-2 pr-6 py-1 text-xs font-mono text-exo-yellow cursor-pointer transition-all duration-200 hover:border-exo-yellow/50 focus:outline-none focus:border-exo-yellow/70 {isSizeDropdownOpen
? 'border-exo-yellow/70'
: ''}"
{params.size.toUpperCase()}
</button>
<div
class="absolute right-1.5 top-1/2 -translate-y-1/2 pointer-events-none transition-transform duration-200 {isSizeDropdownOpen
? 'rotate-180'
: ''}"
>
<svg
class="w-3 h-3 text-exo-yellow/60"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
>
{params.size}
</button>
<div
class="absolute right-1.5 top-1/2 -translate-y-1/2 pointer-events-none transition-transform duration-200 {isSizeDropdownOpen
? 'rotate-180'
: ''}"
>
<svg
class="w-3 h-3 text-exo-yellow/60"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
stroke-width="2"
d="M19 9l-7 7-7-7"
/>
</svg>
<path
stroke-linecap="round"
stroke-linejoin="round"
stroke-width="2"
d="M19 9l-7 7-7-7"
/>
</svg>
</div>
</div>
{#if isSizeDropdownOpen}
<!-- Backdrop to close dropdown -->
<button
type="button"
class="fixed inset-0 z-[9998] cursor-default"
onclick={() => (isSizeDropdownOpen = false)}
aria-label="Close dropdown"
></button>
<!-- Dropdown Panel - fixed positioning to escape overflow:hidden -->
<div
class="fixed bg-exo-dark-gray border border-exo-yellow/30 rounded shadow-lg shadow-black/50 z-[9999] max-h-48 overflow-y-auto overflow-x-hidden min-w-max"
style="bottom: calc(100vh - {sizeDropdownPosition()
.top}px + 4px); left: {sizeDropdownPosition().left}px;"
>
<div class="py-1">
{#each sizeOptions as size}
<button
type="button"
onclick={() => selectSize(size)}
class="w-full px-3 py-1.5 text-left text-xs font-mono tracking-wide transition-colors duration-100 flex items-center gap-2 {params.size ===
size
? 'bg-transparent text-exo-yellow'
: 'text-exo-light-gray hover:text-exo-yellow'}"
>
{#if params.size === size}
<svg
class="w-3 h-3 flex-shrink-0"
fill="currentColor"
viewBox="0 0 20 20"
>
<path
fill-rule="evenodd"
d="M16.707 5.293a1 1 0 010 1.414l-8 8a1 1 0 01-1.414 0l-4-4a1 1 0 011.414-1.414L8 12.586l7.293-7.293a1 1 0 011.414 0z"
clip-rule="evenodd"
/>
</svg>
{:else}
<span class="w-3"></span>
{/if}
<span>{size.toUpperCase()}</span>
</button>
{/each}
</div>
</div>
{#if isSizeDropdownOpen}
<!-- Backdrop to close dropdown -->
<button
type="button"
class="fixed inset-0 z-[9998] cursor-default"
onclick={() => (isSizeDropdownOpen = false)}
aria-label="Close dropdown"
></button>
<!-- Dropdown Panel - fixed positioning to escape overflow:hidden -->
<div
class="fixed bg-exo-dark-gray border border-exo-yellow/30 rounded shadow-lg shadow-black/50 z-[9999] max-h-48 overflow-y-auto min-w-max"
style="bottom: calc(100vh - {sizeDropdownPosition()
.top}px + 4px); left: {sizeDropdownPosition().left}px;"
>
<div class="py-1">
{#each sizeOptions as size}
<button
type="button"
onclick={() => selectSize(size)}
class="w-full px-3 py-1.5 text-left text-xs font-mono tracking-wide transition-colors duration-100 flex items-center gap-2 {params.size ===
size
? 'bg-transparent text-exo-yellow'
: 'text-exo-light-gray hover:text-exo-yellow'}"
>
{#if params.size === size}
<svg
class="w-3 h-3 flex-shrink-0"
fill="currentColor"
viewBox="0 0 20 20"
>
<path
fill-rule="evenodd"
d="M16.707 5.293a1 1 0 010 1.414l-8 8a1 1 0 01-1.414 0l-4-4a1 1 0 011.414-1.414L8 12.586l7.293-7.293a1 1 0 011.414 0z"
clip-rule="evenodd"
/>
</svg>
{:else}
<span class="w-3"></span>
{/if}
<span>{size}</span>
</button>
{/each}
</div>
</div>
{/if}
</div>
{/if}
{/if}
</div>
<!-- Quality -->
<div class="flex items-center gap-1.5">
@@ -311,7 +310,7 @@
<!-- Dropdown Panel - fixed positioning to escape overflow:hidden -->
<div
class="fixed bg-exo-dark-gray border border-exo-yellow/30 rounded shadow-lg shadow-black/50 z-[9999] max-h-48 overflow-y-auto min-w-max"
class="fixed bg-exo-dark-gray border border-exo-yellow/30 rounded shadow-lg shadow-black/50 z-[9999] max-h-48 overflow-y-auto overflow-x-hidden min-w-max"
style="bottom: calc(100vh - {qualityDropdownPosition()
.top}px + 4px); left: {qualityDropdownPosition().left}px;"
>

View File

@@ -306,13 +306,14 @@ const IMAGE_PARAMS_STORAGE_KEY = "exo-image-generation-params";
export interface ImageGenerationParams {
// Basic params
size:
| "auto"
| "512x512"
| "768x768"
| "1024x1024"
| "1024x768"
| "768x1024"
| "1024x1365"
| "1365x1024";
| "1024x1536"
| "1536x1024";
quality: "low" | "medium" | "high";
outputFormat: "png" | "jpeg";
numImages: number;
@@ -336,7 +337,7 @@ export interface EditingImage {
}
const DEFAULT_IMAGE_PARAMS: ImageGenerationParams = {
size: "1024x1024",
size: "auto",
quality: "medium",
outputFormat: "png",
numImages: 1,

View File

File diff suppressed because it is too large

View File

@@ -115,7 +115,7 @@
packages = lib.optionalAttrs pkgs.stdenv.hostPlatform.isDarwin (
let
uvLock = builtins.fromTOML (builtins.readFile ./uv.lock);
mlxPackage = builtins.head (builtins.filter (p: p.name == "mlx") uvLock.package);
mlxPackage = builtins.head (builtins.filter (p: p.name == "mlx" && p.source ? git) uvLock.package);
uvLockMlxVersion = mlxPackage.version;
in
{

View File

@@ -41,16 +41,16 @@ let
mlx = stdenv.mkDerivation rec {
pname = "mlx";
version = let v = "0.30.6"; in
version = let v = "0.30.7.dev20260218+14841977"; in
assert v == uvLockMlxVersion || throw "MLX version mismatch: nix/mlx.nix has ${v} but uv.lock has ${uvLockMlxVersion}. Update both the version and hash in nix/mlx.nix.";
v;
pyproject = true;
src = fetchFromGitHub {
owner = "ml-explore";
repo = "mlx";
tag = "v${version}";
hash = "sha256-avD5EGhwgmPdXLAyQSqTO6AXk/W3ziH+f6AetjK3Sdo=";
owner = "rltakashige";
repo = "mlx-jaccl-fix-small-recv";
rev = "1484197707f35186ad3bd614357c7c47fdf86ebc";
hash = "sha256-FupCMoK/SF/ldfKuvMSAKECcOP8c+ANgkQlPZttDsLk=";
};
patches = [

View File

@@ -17,9 +17,9 @@ dependencies = [
"loguru>=0.7.3",
"exo_pyo3_bindings", # rust bindings
"anyio==4.11.0",
"mlx==0.30.6; sys_platform == 'darwin'",
"mlx; sys_platform == 'darwin'",
"mlx[cpu]==0.30.6; sys_platform == 'linux'",
"mlx-lm==0.30.6",
"mlx-lm==0.30.7",
"tiktoken>=0.12.0", # required for kimi k2 tokenizer
"hypercorn>=0.18.0",
"openai-harmony>=0.0.8",
@@ -64,6 +64,7 @@ members = [
[tool.uv.sources]
exo_pyo3_bindings = { workspace = true }
mlx = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git", branch = "address-rdma-gpu-locks", marker = "sys_platform == 'darwin'" }
#mlx-lm = { git = "https://github.com/davidmcc73/mlx-lm", branch = "stable" }
# Uncomment to use local mlx/mlx-lm development versions:
# mlx = { path = "/Users/Shared/mlx", editable=true }
@@ -132,7 +133,7 @@ markers = [
env = [
"EXO_TESTS=1"
]
addopts = "-m 'not slow'"
addopts = "-m 'not slow' --ignore=tests/start_distributed_test.py"
filterwarnings = [
"ignore:builtin type Swig:DeprecationWarning",
]

View File

@@ -14,7 +14,9 @@
# Override overlay to inject Nix-built components
exoOverlay = final: prev: {
# Replace workspace exo_pyo3_bindings with Nix-built wheel
# Replace workspace exo_pyo3_bindings with Nix-built wheel.
# Preserve passthru so mkVirtualEnv can resolve dependency groups.
# Copy .pyi stub + py.typed marker so basedpyright can find the types.
exo-pyo3-bindings = pkgs.stdenv.mkDerivation {
pname = "exo-pyo3-bindings";
version = "0.1.0";
@@ -22,6 +24,12 @@
# Install from pre-built wheel
nativeBuildInputs = [ final.pyprojectWheelHook ];
dontStrip = true;
passthru = prev.exo-pyo3-bindings.passthru or { };
postInstall = ''
local siteDir=$out/${final.python.sitePackages}/exo_pyo3_bindings
cp ${inputs.self}/rust/exo_pyo3_bindings/exo_pyo3_bindings.pyi $siteDir/
touch $siteDir/py.typed
'';
};
};
@@ -29,17 +37,47 @@
# Overlay to provide build systems and custom packages
buildSystemsOverlay = final: prev: {
# Use our pure Nix-built MLX with Metal support
mlx = self'.packages.mlx;
# mlx-lm is a git dependency that needs setuptools
mlx-lm = prev.mlx-lm.overrideAttrs (old: {
nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [
final.setuptools
];
});
} // lib.optionalAttrs pkgs.stdenv.hostPlatform.isDarwin {
# Use our pure Nix-built MLX with Metal support (macOS only)
mlx = self'.packages.mlx;
};
# Additional overlay for Linux-specific fixes (type checking env).
# Native wheels have shared lib dependencies we don't need at type-check time.
linuxOverlay = final: prev:
let
ignoreMissing = drv: drv.overrideAttrs { autoPatchelfIgnoreMissingDeps = [ "*" ]; };
nvidiaPackages = lib.filterAttrs (name: _: lib.hasPrefix "nvidia-" name) prev;
in
lib.optionalAttrs pkgs.stdenv.hostPlatform.isLinux (
(lib.mapAttrs (_: ignoreMissing) nvidiaPackages) // {
mlx = ignoreMissing prev.mlx;
mlx-cuda-13 = prev.mlx-cuda-13.overrideAttrs (old: {
buildInputs = (old.buildInputs or [ ]) ++ [
final.nvidia-cublas
final.nvidia-cuda-nvrtc
final.nvidia-cudnn-cu13
final.nvidia-nccl-cu13
];
preFixup = ''
addAutoPatchelfSearchPath ${final.nvidia-cublas}
addAutoPatchelfSearchPath ${final.nvidia-cuda-nvrtc}
addAutoPatchelfSearchPath ${final.nvidia-cudnn-cu13}
addAutoPatchelfSearchPath ${final.nvidia-nccl-cu13}
'';
autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" ];
});
torch = ignoreMissing prev.torch;
triton = ignoreMissing prev.triton;
}
);
pythonSet = (pkgs.callPackage inputs.pyproject-nix.build.packages {
inherit python;
}).overrideScope (
@@ -48,16 +86,28 @@
overlay
exoOverlay
buildSystemsOverlay
linuxOverlay
]
);
exoVenv = pythonSet.mkVirtualEnv "exo-env" workspace.deps.default;
# mlx-cpu and mlx-cuda-13 both ship mlx/ site-packages files; keep first.
# mlx-cpu/mlx-cuda-13 and nvidia-cudnn-cu12/cu13 ship overlapping files.
venvCollisionPaths = lib.optionals pkgs.stdenv.hostPlatform.isLinux [
"lib/python3.13/site-packages/mlx*"
"lib/python3.13/site-packages/nvidia*"
];
exoVenv = (pythonSet.mkVirtualEnv "exo-env" workspace.deps.default).overrideAttrs {
venvIgnoreCollisions = venvCollisionPaths;
};
# Virtual environment with dev dependencies for testing
testVenv = pythonSet.mkVirtualEnv "exo-test-env" (
testVenv = (pythonSet.mkVirtualEnv "exo-test-env" (
workspace.deps.default // {
exo = [ "dev" ]; # Include pytest, pytest-asyncio, pytest-env
}
);
)).overrideAttrs {
venvIgnoreCollisions = venvCollisionPaths;
};
mkPythonScript = name: path: pkgs.writeShellApplication {
inherit name;
@@ -108,6 +158,7 @@
exo-test-env = testVenv;
} // {
exo-bench = mkBenchScript "exo-bench" (inputs.self + /bench/exo_bench.py);
exo-eval-tool-calls = mkBenchScript "exo-eval-tool-calls" (inputs.self + /bench/eval_tool_calls.py);
exo-get-all-models-on-cluster = mkSimplePythonScript "exo-get-all-models-on-cluster" (inputs.self + /tests/get_all_models_on_cluster.py);
};
@@ -118,6 +169,21 @@
${pkgs.ruff}/bin/ruff check ${inputs.self}
touch $out
'';
# Hermetic basedpyright type checking
typecheck = pkgs.runCommand "typecheck"
{
nativeBuildInputs = [
testVenv
pkgs.basedpyright
];
}
''
cd ${inputs.self}
export HOME=$TMPDIR
basedpyright --pythonpath ${testVenv}/bin/python
touch $out
'';
};
};
}

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "deepseek"
quantization = "4bit"
base_model = "DeepSeek V3.1"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 405874409472

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "deepseek"
quantization = "8bit"
base_model = "DeepSeek V3.1"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 765577920512

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM 4.5 Air"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 122406567936

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "bf16"
base_model = "GLM 4.5 Air"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 229780750336

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "4bit"
base_model = "GLM 4.7"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 198556925568

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "6bit"
base_model = "GLM 4.7"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 286737579648

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM 4.7"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 396963397248

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "4bit"
base_model = "GLM 4.7 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 19327352832

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "5bit"
base_model = "GLM 4.7 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 22548578304

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "6bit"
base_model = "GLM 4.7 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 26843545600

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM 4.7 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 34359738368

View File

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-8bit-MXFP8"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM-5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 790517400864

View File

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-MXFP4-Q8"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "MXFP4-Q8"
base_model = "GLM-5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 405478939008

View File

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "bf16"
base_model = "GLM-5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 1487822475264

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "kimi"
quantization = ""
base_model = "Kimi K2"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 706522120192

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "kimi"
quantization = ""
base_model = "Kimi K2.5"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 662498705408

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "minimax"
quantization = "3bit"
base_model = "MiniMax M2.1"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 100086644736

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "minimax"
quantization = "8bit"
base_model = "MiniMax M2.1"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 242986745856

View File

@@ -0,0 +1,12 @@
model_id = "mlx-community/MiniMax-M2.5-4bit"
n_layers = 62
hidden_size = 3072
supports_tensor = true
tasks = ["TextGeneration"]
family = "minimax"
quantization = "4bit"
base_model = "MiniMax M2.5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 128666664960

View File

@@ -0,0 +1,12 @@
model_id = "mlx-community/MiniMax-M2.5-6bit"
n_layers = 62
hidden_size = 3072
supports_tensor = true
tasks = ["TextGeneration"]
family = "minimax"
quantization = "6bit"
base_model = "MiniMax M2.5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 185826705408

View File

@@ -0,0 +1,12 @@
model_id = "mlx-community/MiniMax-M2.5-8bit"
n_layers = 62
hidden_size = 3072
supports_tensor = true
tasks = ["TextGeneration"]
family = "minimax"
quantization = "8bit"
base_model = "MiniMax M2.5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 242986745856

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "4bit"
base_model = "Qwen3 0.6B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 342884352

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "8bit"
base_model = "Qwen3 0.6B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 698351616

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "4bit"
base_model = "Qwen3 235B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 141733920768

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "8bit"
base_model = "Qwen3 235B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 268435456000

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "4bit"
base_model = "Qwen3 30B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 17612931072

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "8bit"
base_model = "Qwen3 30B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 33279705088

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "4bit"
base_model = "Qwen3 Next 80B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 47080074240

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "8bit"
base_model = "Qwen3 Next 80B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 88814387200

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "step"
quantization = "4bit"
base_model = "Step 3.5 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 114572190076

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "step"
quantization = "6bit"
base_model = "Step 3.5 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 159039627774

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "step"
quantization = "8bit"
base_model = "Step 3.5 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 209082699847

View File

@@ -25,17 +25,17 @@ workspace = true
networking = { workspace = true }
# interop
pyo3 = { version = "0.27.1", features = [
# "abi3-py311", # tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.11
pyo3 = { version = "0.27.2", features = [
# "abi3-py313", # tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.13
"nightly", # enables better-supported GIL integration
"experimental-async", # async support in #[pyfunction] & #[pymethods]
#"experimental-inspect", # inspection of generated binary => easier to automate type-hint generation
#"py-clone", # adding Clone-ing of `Py<T>` without GIL (may cause panics - remove if panics happen)
"multiple-pymethods", # allows multiple #[pymethods] sections per class
# "multiple-pymethods", # allows multiple #[pymethods] sections per class
# integrations with other libraries
"arc_lock", "bigdecimal", "either", "hashbrown", "indexmap", "num-bigint", "num-complex", "num-rational",
"ordered-float", "rust_decimal", "smallvec",
# "arc_lock", "bigdecimal", "either", "hashbrown", "indexmap", "num-bigint", "num-complex", "num-rational",
# "ordered-float", "rust_decimal", "smallvec",
# "anyhow", "chrono", "chrono-local", "chrono-tz", "eyre", "jiff-02", "lock_api", "parking-lot", "time", "serde",
] }
pyo3-stub-gen = { version = "0.17.2" }
@@ -45,8 +45,6 @@ pyo3-log = "0.13.2"
# macro dependencies
extend = { workspace = true }
delegate = { workspace = true }
impl-trait-for-tuples = { workspace = true }
derive_more = { workspace = true }
pin-project = { workspace = true }
# async runtime
@@ -54,24 +52,11 @@ tokio = { workspace = true, features = ["full", "tracing"] }
futures = { workspace = true }
# utility dependencies
once_cell = "1.21.3"
thread_local = "1.1.9"
util = { workspace = true }
thiserror = { workspace = true }
#internment = { workspace = true }
#recursion = { workspace = true }
#generativity = { workspace = true }
#itertools = { workspace = true }
# Tracing
#tracing = "0.1"
#tracing-subscriber = "0.3"
#console-subscriber = "0.1.5"
#tracing-log = "0.2.0"
log = { workspace = true }
env_logger = "0.11"
# Networking
libp2p = { workspace = true, features = ["full"] }

View File

@@ -6,7 +6,7 @@ use pyo3::marker::Ungil;
use pyo3::prelude::*;
use std::{
future::Future,
pin::{Pin, pin},
pin::Pin,
task::{Context, Poll},
};
@@ -33,8 +33,6 @@ where
fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
let waker = cx.waker();
Python::with_gil(|py| {
py.allow_threads(|| self.project().0.poll(&mut Context::from_waker(waker)))
})
Python::attach(|py| py.detach(|| self.project().0.poll(&mut Context::from_waker(waker))))
}
}

View File

@@ -1,240 +0,0 @@
//! This module exists to hold examples of some pyo3 patterns that may be too complex to
//! re-create from scratch, but too inhomogenous to create an abstraction/wrapper around.
//!
//! Pattern examples include:
//! - Async task handles: with GC-integrated cleanup
//! - Sync/async callbacks from python: with proper event-loop handling
//!
//! Mutability pattern: https://pyo3.rs/v0.26.0/async-await.html#send--static-constraint
//! - Store mutable fields in tokio's `Mutex<T>`
//! - For async code: take `&self` and `.lock().await`
//! - For sync code: take `&mut self` and `.get_mut()`
use crate::ext::{PyResultExt as _, ResultExt as _, TokioRuntimeExt as _};
use futures::FutureExt as _;
use futures::future::BoxFuture;
use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::{PyModule, PyModuleMethods as _};
use pyo3::{
Bound, Py, PyAny, PyErr, PyResult, PyTraverseError, PyVisit, Python, pyclass, pymethods,
};
use std::time::Duration;
use tokio::sync::mpsc;
use tokio::sync::mpsc::error::TryRecvError;
fn needs_tokio_runtime() {
tokio::runtime::Handle::current();
}
type SyncCallback = Box<dyn Fn() + Send + Sync>;
type AsyncCallback = Box<dyn Fn() -> BoxFuture<'static, ()> + Send + Sync>;
enum AsyncTaskMessage {
SyncCallback(SyncCallback),
AsyncCallback(AsyncCallback),
}
async fn async_task(
sender: mpsc::UnboundedSender<()>,
mut receiver: mpsc::UnboundedReceiver<AsyncTaskMessage>,
) {
log::info!("RUST: async task started");
// task state
let mut interval = tokio::time::interval(Duration::from_secs(1));
let mut sync_cbs: Vec<SyncCallback> = vec![];
let mut async_cbs: Vec<AsyncCallback> = vec![];
loop {
tokio::select! {
// handle incoming messages from task-handle
message = receiver.recv() => {
// handle closed channel by exiting
let Some(message) = message else {
log::info!("RUST: channel closed");
break;
};
// dispatch incoming event
match message {
AsyncTaskMessage::SyncCallback(cb) => {
sync_cbs.push(cb);
}
AsyncTaskMessage::AsyncCallback(cb) => {
async_cbs.push(cb);
}
}
}
// handle all other events
_ = interval.tick() => {
log::info!("RUST: async task tick");
// call back all sync callbacks
for cb in &sync_cbs {
cb();
}
// call back all async callbacks
for cb in &async_cbs {
cb().await;
}
// send event on unbounded channel
sender.send(()).expect("handle receiver cannot be closed/dropped");
}
}
}
log::info!("RUST: async task stopped");
}
// #[gen_stub_pyclass]
#[pyclass(name = "AsyncTaskHandle")]
#[derive(Debug)]
struct PyAsyncTaskHandle {
sender: Option<mpsc::UnboundedSender<AsyncTaskMessage>>,
receiver: mpsc::UnboundedReceiver<()>,
}
#[allow(clippy::expect_used)]
impl PyAsyncTaskHandle {
const fn sender(&self) -> &mpsc::UnboundedSender<AsyncTaskMessage> {
self.sender
.as_ref()
.expect("The sender should only be None after de-initialization.")
}
const fn sender_mut(&mut self) -> &mpsc::UnboundedSender<AsyncTaskMessage> {
self.sender
.as_mut()
.expect("The sender should only be None after de-initialization.")
}
const fn new(
sender: mpsc::UnboundedSender<AsyncTaskMessage>,
receiver: mpsc::UnboundedReceiver<()>,
) -> Self {
Self {
sender: Some(sender),
receiver,
}
}
}
// #[gen_stub_pymethods]
#[pymethods]
impl PyAsyncTaskHandle {
#[new]
fn py_new(py: Python<'_>) -> PyResult<Self> {
use pyo3_async_runtimes::tokio::get_runtime;
// create communication channel TOWARDS our task
let (h_sender, t_receiver) = mpsc::unbounded_channel::<AsyncTaskMessage>();
// create communication channel FROM our task
let (t_sender, h_receiver) = mpsc::unbounded_channel::<()>();
// perform necessary setup within tokio context - or it crashes
let () = get_runtime().block_on(async { needs_tokio_runtime() });
// spawn tokio task with this thread's task-locals - without this, async callbacks on the new threads will not work!!
_ = get_runtime().spawn_with_scope(py, async move {
async_task(t_sender, t_receiver).await;
});
Ok(Self::new(h_sender, h_receiver))
}
/// NOTE: exceptions in callbacks are silently ignored until end of execution
fn add_sync_callback(
&self,
// #[gen_stub(override_type(
// type_repr="collections.abc.Callable[[], None]",
// imports=("collections.abc")
// ))]
callback: Py<PyAny>,
) -> PyResult<()> {
// blocking call to async method -> can do non-blocking if needed
self.sender()
.send(AsyncTaskMessage::SyncCallback(Box::new(move || {
_ = Python::with_gil(|py| callback.call0(py).write_unraisable_with(py));
})))
.pyerr()?;
Ok(())
}
/// NOTE: exceptions in callbacks are silently ignored until end of execution
fn add_async_callback(
&self,
// #[gen_stub(override_type(
// type_repr="collections.abc.Callable[[], collections.abc.Awaitable[None]]",
// imports=("collections.abc")
// ))]
callback: Py<PyAny>,
) -> PyResult<()> {
// blocking call to async method -> can do non-blocking if needed
self.sender()
.send(AsyncTaskMessage::AsyncCallback(Box::new(move || {
let c = Python::with_gil(|py| callback.clone_ref(py));
async move {
if let Some(f) = Python::with_gil(|py| {
let coroutine = c.call0(py).write_unraisable_with(py)?;
pyo3_async_runtimes::tokio::into_future(coroutine.into_bound(py))
.write_unraisable_with(py)
}) {
_ = f.await.write_unraisable();
}
}
.boxed()
})))
.pyerr()?;
Ok(())
}
async fn receive_unit(&mut self) -> PyResult<()> {
self.receiver
.recv()
.await
.ok_or(PyErr::new::<PyRuntimeError, _>(
"cannot receive unit on closed channel",
))
}
fn drain_units(&mut self) -> PyResult<i32> {
let mut cnt = 0;
loop {
match self.receiver.try_recv() {
Err(TryRecvError::Disconnected) => {
return Err(PyErr::new::<PyRuntimeError, _>(
"cannot receive unit on closed channel",
));
}
Err(TryRecvError::Empty) => return Ok(cnt),
Ok(()) => {
cnt += 1;
continue;
}
}
}
}
// #[gen_stub(skip)]
const fn __traverse__(&self, _visit: PyVisit<'_>) -> Result<(), PyTraverseError> {
Ok(()) // This is needed purely so `__clear__` can work
}
// #[gen_stub(skip)]
fn __clear__(&mut self) {
// TODO: may or may not need to await a "kill-signal" oneshot channel message,
// to ensure that the networking task is done BEFORE exiting the clear function...
// but this may require GIL?? and it may not be safe to call GIL here??
self.sender = None; // Using Option<T> as a trick to force `sender` channel to be dropped
}
}
pub fn examples_submodule(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PyAsyncTaskHandle>()?;
Ok(())
}
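A hedged usage sketch from the Python side: the `AsyncTaskHandle` API (constructor, `add_sync_callback`, `add_async_callback`, `receive_unit`, `drain_units`) comes from the class above, but the import path is an assumption; adjust it to wherever `examples_submodule` is actually mounted in the built extension.
import asyncio
# Assumption: the extension exposes the examples submodule as `exo_rust.examples`.
from exo_rust.examples import AsyncTaskHandle
async def main() -> None:
    handle = AsyncTaskHandle()
    # Sync callbacks are invoked on every 1-second tick of the Rust task.
    handle.add_sync_callback(lambda: print("tick (sync)"))
    # Async callbacks are awaited on the same tick.
    async def on_tick() -> None:
        print("tick (async)")
    handle.add_async_callback(on_tick)
    # Each tick also pushes one unit onto the handle's channel; await one...
    await handle.receive_unit()
    # ...then drain whatever accumulated since, without blocking.
    print("extra ticks drained:", handle.drain_units())
asyncio.run(main())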

View File

@@ -17,7 +17,6 @@
extern crate core;
mod allow_threading;
mod examples;
pub(crate) mod networking;
pub(crate) mod pylibp2p;
@@ -25,7 +24,6 @@ use crate::networking::networking_submodule;
use crate::pylibp2p::ident::ident_submodule;
use crate::pylibp2p::multiaddr::multiaddr_submodule;
use pyo3::prelude::PyModule;
use pyo3::prelude::*;
use pyo3::{Bound, PyResult, pyclass, pymodule};
use pyo3_stub_gen::define_stub_info_gatherer;
@@ -36,14 +34,10 @@ pub(crate) mod r#const {
/// Namespace for all the type/trait aliases used by this crate.
pub(crate) mod alias {
use std::error::Error;
use std::marker::Tuple;
pub trait SendFn<Args: Tuple + Send + 'static, Output> =
Fn<Args, Output = Output> + Send + 'static;
pub type AnyError = Box<dyn Error + Send + Sync + 'static>;
pub type AnyResult<T> = Result<T, AnyError>;
}
/// Namespace for crate-wide extension traits/methods
@@ -51,7 +45,6 @@ pub(crate) mod ext {
use crate::allow_threading::AllowThreads;
use extend::ext;
use pyo3::exceptions::{PyConnectionError, PyRuntimeError};
use pyo3::marker::Ungil;
use pyo3::types::PyBytes;
use pyo3::{Py, PyErr, PyResult, Python};
use tokio::runtime::Runtime;
@@ -62,7 +55,7 @@ pub(crate) mod ext {
#[ext(pub, name = ByteArrayExt)]
impl [u8] {
fn pybytes(&self) -> Py<PyBytes> {
Python::with_gil(|py| PyBytes::new(py, self).unbind())
Python::attach(|py| PyBytes::new(py, self).unbind())
}
}
@@ -98,7 +91,7 @@ pub(crate) mod ext {
#[ext(pub, name = PyResultExt)]
impl<T> PyResult<T> {
fn write_unraisable(self) -> Option<T> {
Python::with_gil(|py| self.write_unraisable_with(py))
Python::attach(|py| self.write_unraisable_with(py))
}
fn write_unraisable_with(self, py: Python<'_>) -> Option<T> {
@@ -175,24 +168,6 @@ pub(crate) mod ext {
}
}
pub(crate) mod private {
use std::marker::Sized;
/// Sealed traits support
pub trait Sealed {}
impl<T: ?Sized> Sealed for T {}
}
/// A wrapper around [`Py`] that implements [`Clone`] using [`Python::with_gil`].
#[repr(transparent)]
pub(crate) struct ClonePy<T>(pub Py<T>);
impl<T> Clone for ClonePy<T> {
fn clone(&self) -> Self {
Python::with_gil(|py| Self(self.0.clone_ref(py)))
}
}
/// A Python module implemented in Rust. The name of this function must match
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
/// import the module.

View File

@@ -11,9 +11,9 @@ use crate::ext::{ResultExt as _, TokioMpscReceiverExt as _, TokioMpscSenderExt a
use crate::pyclass;
use crate::pylibp2p::ident::{PyKeypair, PyPeerId};
use libp2p::futures::StreamExt as _;
use libp2p::gossipsub;
use libp2p::gossipsub::{IdentTopic, Message, MessageId, PublishError};
use libp2p::swarm::SwarmEvent;
use libp2p::{gossipsub, mdns};
use networking::discovery;
use networking::swarm::create_swarm;
use pyo3::prelude::{PyModule, PyModuleMethods as _};
@@ -25,7 +25,7 @@ use tokio::sync::{Mutex, mpsc, oneshot};
mod exception {
use pyo3::types::PyTuple;
use pyo3::{PyErrArguments, exceptions::PyException, prelude::*};
use pyo3::{exceptions::PyException, prelude::*};
use pyo3_stub_gen::derive::*;
#[gen_stub_pyclass]
@@ -155,7 +155,6 @@ async fn networking_task(
) {
use SwarmEvent::*;
use ToTask::*;
use mdns::Event::*;
use networking::swarm::BehaviourEvent::*;
log::info!("RUST: networking task started");
@@ -485,7 +484,7 @@ impl PyNetworkingHandle {
let (tx, rx) = oneshot::channel();
// send off request to subscribe
let data = Python::with_gil(|py| Vec::from(data.as_bytes(py)));
let data = Python::attach(|py| Vec::from(data.as_bytes(py)));
self.to_task_tx()
.send_py(ToTask::GossipsubPublish {
topic,

View File

@@ -19,8 +19,6 @@ either = { workspace = true }
# macro dependencies
extend = { workspace = true }
delegate = { workspace = true }
impl-trait-for-tuples = { workspace = true }
derive_more = { workspace = true }
# async
tokio = { workspace = true, features = ["full"] }
@@ -29,11 +27,6 @@ futures-timer = { workspace = true }
# utility dependencies
util = { workspace = true }
thiserror = { workspace = true }
#internment = { workspace = true }
#recursion = { workspace = true }
#generativity = { workspace = true }
#itertools = { workspace = true }
tracing-subscriber = { version = "0.3.19", features = ["default", "env-filter"] }
keccak-const = { workspace = true }
@@ -41,4 +34,4 @@ keccak-const = { workspace = true }
log = { workspace = true }
# networking
libp2p = { workspace = true, features = ["full"] }
libp2p = { workspace = true, features = ["full"] }

View File

@@ -24,8 +24,8 @@ use libp2p::{
swarm::{NetworkBehaviour, SwarmEvent},
tcp, yamux,
};
use std::error::Error;
use std::time::Duration;
use std::{error::Error, hash::Hash};
use tokio::{io, io::AsyncBufReadExt, select};
use tracing_subscriber::EnvFilter;

View File

@@ -1,5 +1,4 @@
use crate::ext::MultiaddrExt;
use crate::keep_alive;
use delegate::delegate;
use either::Either;
use futures::FutureExt;

View File

@@ -1,44 +0,0 @@
use delegate::delegate;
use libp2p::swarm::handler::ConnectionEvent;
use libp2p::swarm::{ConnectionHandlerEvent, SubstreamProtocol, dummy, handler};
use std::task::{Context, Poll};
/// An implementation of [`ConnectionHandler`] that doesn't handle any protocols, but it keeps
/// the connection alive.
#[derive(Clone)]
#[repr(transparent)]
pub struct ConnectionHandler(dummy::ConnectionHandler);
impl ConnectionHandler {
pub fn new() -> Self {
ConnectionHandler(dummy::ConnectionHandler)
}
}
impl handler::ConnectionHandler for ConnectionHandler {
// delegate types and implementation mostly to dummy handler
type FromBehaviour = <dummy::ConnectionHandler as handler::ConnectionHandler>::FromBehaviour;
type ToBehaviour = <dummy::ConnectionHandler as handler::ConnectionHandler>::ToBehaviour;
type InboundProtocol =
<dummy::ConnectionHandler as handler::ConnectionHandler>::InboundProtocol;
type OutboundProtocol =
<dummy::ConnectionHandler as handler::ConnectionHandler>::OutboundProtocol;
type InboundOpenInfo =
<dummy::ConnectionHandler as handler::ConnectionHandler>::InboundOpenInfo;
type OutboundOpenInfo =
<dummy::ConnectionHandler as handler::ConnectionHandler>::OutboundOpenInfo;
delegate! {
to self.0 {
fn listen_protocol(&self) -> SubstreamProtocol<Self::InboundProtocol, Self::InboundOpenInfo>;
fn poll(&mut self, cx: &mut Context<'_>) -> Poll<ConnectionHandlerEvent<Self::OutboundProtocol, Self::OutboundOpenInfo, Self::ToBehaviour>>;
fn on_behaviour_event(&mut self, event: Self::FromBehaviour);
fn on_connection_event(&mut self, event: ConnectionEvent<Self::InboundProtocol, Self::OutboundProtocol, Self::InboundOpenInfo, Self::OutboundOpenInfo>);
}
}
// specifically override this to force connection to stay alive
fn connection_keep_alive(&self) -> bool {
true
}
}

View File

@@ -3,19 +3,7 @@
//! this is here as a placeholder documentation
//!
//!
// enable Rust-unstable features for convenience
#![feature(trait_alias)]
// #![feature(stmt_expr_attributes)]
// #![feature(unboxed_closures)]
// #![feature(assert_matches)]
// #![feature(async_fn_in_dyn_trait)]
// #![feature(async_for_loop)]
// #![feature(auto_traits)]
// #![feature(negative_impls)]
pub mod discovery;
pub mod keep_alive;
pub mod swarm;
/// Namespace for all the type/trait aliases used by this crate.
@@ -54,11 +42,3 @@ pub(crate) mod ext {
}
}
}
pub(crate) mod private {
#![allow(dead_code)]
/// Sealed traits support
pub trait Sealed {}
impl<T: ?Sized> Sealed for T {}
}

View File

@@ -14,6 +14,7 @@ from exo.download.download_utils import (
map_repo_download_progress_to_download_progress_data,
)
from exo.download.shard_downloader import ShardDownloader
from exo.shared.constants import EXO_MODELS_DIR
from exo.shared.models.model_cards import ModelId
from exo.shared.types.commands import (
CancelDownload,
@@ -46,6 +47,7 @@ class DownloadCoordinator:
download_command_receiver: Receiver[ForwarderDownloadCommand]
local_event_sender: Sender[ForwarderEvent]
event_index_counter: Iterator[int]
offline: bool = False
# Local state
download_status: dict[ModelId, DownloadProgress] = field(default_factory=dict)
@@ -61,8 +63,13 @@ class DownloadCoordinator:
def __post_init__(self) -> None:
self.event_sender, self.event_receiver = channel[Event]()
if self.offline:
self.shard_downloader.set_internet_connection(False)
self.shard_downloader.on_progress(self._download_progress_callback)
def _model_dir(self, model_id: ModelId) -> str:
return str(EXO_MODELS_DIR / model_id.normalize())
async def _download_progress_callback(
self, callback_shard: ShardMetadata, progress: RepoDownloadProgress
) -> None:
@@ -74,6 +81,7 @@ class DownloadCoordinator:
shard_metadata=callback_shard,
node_id=self.node_id,
total_bytes=progress.total_bytes,
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = completed
await self.event_sender.send(
@@ -93,6 +101,7 @@ class DownloadCoordinator:
download_progress=map_repo_download_progress_to_download_progress_data(
progress
),
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = ongoing
await self.event_sender.send(
@@ -101,23 +110,30 @@ class DownloadCoordinator:
self._last_progress_time[model_id] = current_time()
async def run(self) -> None:
logger.info("Starting DownloadCoordinator")
self._test_internet_connection()
logger.info(
f"Starting DownloadCoordinator{' (offline mode)' if self.offline else ''}"
)
if not self.offline:
self._test_internet_connection()
async with self._tg as tg:
tg.start_soon(self._command_processor)
tg.start_soon(self._forward_events)
tg.start_soon(self._emit_existing_download_progress)
tg.start_soon(self._check_internet_connection)
if not self.offline:
tg.start_soon(self._check_internet_connection)
def _test_internet_connection(self) -> None:
try:
socket.create_connection(("1.1.1.1", 443), timeout=3).close()
self.shard_downloader.set_internet_connection(True)
except OSError:
self.shard_downloader.set_internet_connection(False)
logger.debug(
f"Internet connectivity: {self.shard_downloader.internet_connection}"
)
# Try multiple endpoints since some ISPs/networks block specific IPs
for host in ("1.1.1.1", "8.8.8.8", "1.0.0.1"):
try:
socket.create_connection((host, 443), timeout=3).close()
self.shard_downloader.set_internet_connection(True)
logger.debug(f"Internet connectivity: True (via {host})")
return
except OSError:
continue
self.shard_downloader.set_internet_connection(False)
logger.debug("Internet connectivity: False")
async def _check_internet_connection(self) -> None:
first_connection = True
@@ -170,7 +186,11 @@ class DownloadCoordinator:
return
# Emit pending status
progress = DownloadPending(shard_metadata=shard, node_id=self.node_id)
progress = DownloadPending(
shard_metadata=shard,
node_id=self.node_id,
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = progress
await self.event_sender.send(NodeDownloadProgress(download_progress=progress))
@@ -184,6 +204,7 @@ class DownloadCoordinator:
shard_metadata=shard,
node_id=self.node_id,
total_bytes=initial_progress.total_bytes,
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = completed
await self.event_sender.send(
@@ -191,6 +212,20 @@ class DownloadCoordinator:
)
return
if self.offline:
logger.warning(
f"Offline mode: model {model_id} is not fully available locally, cannot download"
)
failed = DownloadFailed(
shard_metadata=shard,
node_id=self.node_id,
error_message=f"Model files not found locally in offline mode: {model_id}",
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = failed
await self.event_sender.send(NodeDownloadProgress(download_progress=failed))
return
# Start actual download
self._start_download_task(shard, initial_progress)
@@ -206,6 +241,7 @@ class DownloadCoordinator:
download_progress=map_repo_download_progress_to_download_progress_data(
initial_progress
),
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = status
self.event_sender.send_nowait(NodeDownloadProgress(download_progress=status))
@@ -219,6 +255,7 @@ class DownloadCoordinator:
shard_metadata=shard,
node_id=self.node_id,
error_message=str(e),
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = failed
await self.event_sender.send(
@@ -253,6 +290,7 @@ class DownloadCoordinator:
pending = DownloadPending(
shard_metadata=current_status.shard_metadata,
node_id=self.node_id,
model_directory=self._model_dir(model_id),
)
await self.event_sender.send(
NodeDownloadProgress(download_progress=pending)
@@ -295,11 +333,18 @@ class DownloadCoordinator:
node_id=self.node_id,
shard_metadata=progress.shard,
total_bytes=progress.total_bytes,
model_directory=self._model_dir(
progress.shard.model_card.model_id
),
)
elif progress.status in ["in_progress", "not_started"]:
if progress.downloaded_bytes_this_session.in_bytes == 0:
status = DownloadPending(
node_id=self.node_id, shard_metadata=progress.shard
node_id=self.node_id,
shard_metadata=progress.shard,
model_directory=self._model_dir(
progress.shard.model_card.model_id
),
)
else:
status = DownloadOngoing(
@@ -308,6 +353,9 @@ class DownloadCoordinator:
download_progress=map_repo_download_progress_to_download_progress_data(
progress
),
model_directory=self._model_dir(
progress.shard.model_card.model_id
),
)
else:
continue

View File

@@ -448,12 +448,13 @@ async def download_file_with_retry(
target_dir: Path,
on_progress: Callable[[int, int, bool], None] = lambda _, __, ___: None,
on_connection_lost: Callable[[], None] = lambda: None,
skip_internet: bool = False,
) -> Path:
n_attempts = 3
for attempt in range(n_attempts):
try:
return await _download_file(
model_id, revision, path, target_dir, on_progress
model_id, revision, path, target_dir, on_progress, skip_internet
)
except HuggingFaceAuthenticationError:
raise
@@ -487,10 +488,14 @@ async def _download_file(
path: str,
target_dir: Path,
on_progress: Callable[[int, int, bool], None] = lambda _, __, ___: None,
skip_internet: bool = False,
) -> Path:
target_path = target_dir / path
if await aios.path.exists(target_path):
if skip_internet:
return target_path
local_size = (await aios.stat(target_path)).st_size
# Try to verify against remote, but allow offline operation
@@ -510,6 +515,11 @@ async def _download_file(
)
return target_path
if skip_internet:
raise FileNotFoundError(
f"File {path} not found locally and cannot download in offline mode"
)
await aios.makedirs((target_dir / path).parent, exist_ok=True)
length, etag = await file_meta(model_id, revision, path)
remote_hash = etag[:-5] if etag.endswith("-gzip") else etag
@@ -814,6 +824,7 @@ async def download_shard(
file, curr_bytes, total_bytes, is_renamed
),
on_connection_lost=on_connection_lost,
skip_internet=skip_internet,
)
if not skip_download:

View File

@@ -0,0 +1,230 @@
"""Tests for offline/air-gapped mode."""
from collections.abc import AsyncIterator
from pathlib import Path
from unittest.mock import AsyncMock, patch
import aiofiles
import aiofiles.os as aios
import pytest
from exo.download.download_utils import (
_download_file, # pyright: ignore[reportPrivateUsage]
download_file_with_retry,
fetch_file_list_with_cache,
)
from exo.shared.types.common import ModelId
from exo.shared.types.worker.downloads import FileListEntry
@pytest.fixture
def model_id() -> ModelId:
return ModelId("test-org/test-model")
@pytest.fixture
async def temp_models_dir(tmp_path: Path) -> AsyncIterator[Path]:
models_dir = tmp_path / "models"
await aios.makedirs(models_dir, exist_ok=True)
with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
yield models_dir
class TestDownloadFileOffline:
"""Tests for _download_file with skip_internet=True."""
async def test_returns_local_file_without_http_verification(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""When skip_internet=True and file exists locally, return it immediately
without making any HTTP calls (no file_meta verification)."""
target_dir = tmp_path / "downloads"
await aios.makedirs(target_dir, exist_ok=True)
local_file = target_dir / "model.safetensors"
async with aiofiles.open(local_file, "wb") as f:
await f.write(b"model weights data")
with patch(
"exo.download.download_utils.file_meta",
new_callable=AsyncMock,
) as mock_file_meta:
result = await _download_file(
model_id,
"main",
"model.safetensors",
target_dir,
skip_internet=True,
)
assert result == local_file
mock_file_meta.assert_not_called()
async def test_raises_file_not_found_for_missing_file(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""When skip_internet=True and file does NOT exist locally,
raise FileNotFoundError instead of attempting download."""
target_dir = tmp_path / "downloads"
await aios.makedirs(target_dir, exist_ok=True)
with pytest.raises(FileNotFoundError, match="offline mode"):
await _download_file(
model_id,
"main",
"missing_model.safetensors",
target_dir,
skip_internet=True,
)
async def test_returns_local_file_in_subdirectory(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""When skip_internet=True and file exists in a subdirectory,
return it without HTTP calls."""
target_dir = tmp_path / "downloads"
subdir = target_dir / "transformer"
await aios.makedirs(subdir, exist_ok=True)
local_file = subdir / "diffusion_pytorch_model.safetensors"
async with aiofiles.open(local_file, "wb") as f:
await f.write(b"weights")
with patch(
"exo.download.download_utils.file_meta",
new_callable=AsyncMock,
) as mock_file_meta:
result = await _download_file(
model_id,
"main",
"transformer/diffusion_pytorch_model.safetensors",
target_dir,
skip_internet=True,
)
assert result == local_file
mock_file_meta.assert_not_called()
class TestDownloadFileWithRetryOffline:
"""Tests for download_file_with_retry with skip_internet=True."""
async def test_propagates_skip_internet_to_download_file(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""Verify skip_internet is passed through to _download_file."""
target_dir = tmp_path / "downloads"
await aios.makedirs(target_dir, exist_ok=True)
local_file = target_dir / "config.json"
async with aiofiles.open(local_file, "wb") as f:
await f.write(b'{"model_type": "qwen2"}')
with patch(
"exo.download.download_utils.file_meta",
new_callable=AsyncMock,
) as mock_file_meta:
result = await download_file_with_retry(
model_id,
"main",
"config.json",
target_dir,
skip_internet=True,
)
assert result == local_file
mock_file_meta.assert_not_called()
async def test_file_not_found_does_not_retry(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""FileNotFoundError from offline mode should not trigger retries."""
target_dir = tmp_path / "downloads"
await aios.makedirs(target_dir, exist_ok=True)
with pytest.raises(FileNotFoundError):
await download_file_with_retry(
model_id,
"main",
"nonexistent.safetensors",
target_dir,
skip_internet=True,
)
class TestFetchFileListOffline:
"""Tests for fetch_file_list_with_cache with skip_internet=True."""
async def test_uses_cached_file_list(
self, model_id: ModelId, temp_models_dir: Path
) -> None:
"""When skip_internet=True and cache file exists, use it without network."""
from pydantic import TypeAdapter
cache_dir = temp_models_dir / "caches" / model_id.normalize()
await aios.makedirs(cache_dir, exist_ok=True)
cached_list = [
FileListEntry(type="file", path="model.safetensors", size=1000),
FileListEntry(type="file", path="config.json", size=200),
]
cache_file = cache_dir / f"{model_id.normalize()}--main--file_list.json"
async with aiofiles.open(cache_file, "w") as f:
await f.write(
TypeAdapter(list[FileListEntry]).dump_json(cached_list).decode()
)
with patch(
"exo.download.download_utils.fetch_file_list_with_retry",
new_callable=AsyncMock,
) as mock_fetch:
result = await fetch_file_list_with_cache(
model_id, "main", skip_internet=True
)
assert result == cached_list
mock_fetch.assert_not_called()
async def test_falls_back_to_local_directory_scan(
self, model_id: ModelId, temp_models_dir: Path
) -> None:
"""When skip_internet=True and no cache but local files exist,
build file list from local directory."""
import json
model_dir = temp_models_dir / model_id.normalize()
await aios.makedirs(model_dir, exist_ok=True)
async with aiofiles.open(model_dir / "config.json", "w") as f:
await f.write('{"model_type": "qwen2"}')
index_data = {
"metadata": {},
"weight_map": {"model.layers.0.weight": "model.safetensors"},
}
async with aiofiles.open(model_dir / "model.safetensors.index.json", "w") as f:
await f.write(json.dumps(index_data))
async with aiofiles.open(model_dir / "model.safetensors", "wb") as f:
await f.write(b"x" * 500)
with patch(
"exo.download.download_utils.fetch_file_list_with_retry",
new_callable=AsyncMock,
) as mock_fetch:
result = await fetch_file_list_with_cache(
model_id, "main", skip_internet=True
)
mock_fetch.assert_not_called()
paths = {entry.path for entry in result}
assert "config.json" in paths
assert "model.safetensors" in paths
async def test_raises_when_no_cache_and_no_local_files(
self, model_id: ModelId, temp_models_dir: Path
) -> None:
"""When skip_internet=True and neither cache nor local files exist,
raise FileNotFoundError."""
with pytest.raises(FileNotFoundError, match="No internet"):
await fetch_file_list_with_cache(model_id, "main", skip_internet=True)

View File

@@ -39,6 +39,7 @@ class Node:
node_id: NodeId
event_index_counter: Iterator[int]
offline: bool
_tg: TaskGroup = field(init=False, default_factory=anyio.create_task_group)
@classmethod
@@ -68,6 +69,7 @@ class Node:
download_command_receiver=router.receiver(topics.DOWNLOAD_COMMANDS),
local_event_sender=router.sender(topics.LOCAL_EVENTS),
event_index_counter=event_index_counter,
offline=args.offline,
)
else:
download_coordinator = None
@@ -132,10 +134,13 @@ class Node:
api,
node_id,
event_index_counter,
args.offline,
)
async def run(self):
async with self._tg as tg:
signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
signal.signal(signal.SIGTERM, lambda _, __: self.shutdown())
tg.start_soon(self.router.run)
tg.start_soon(self.election.run)
if self.download_coordinator:
@@ -147,8 +152,6 @@ class Node:
if self.api:
tg.start_soon(self.api.run)
tg.start_soon(self._elect_loop)
signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
signal.signal(signal.SIGTERM, lambda _, __: self.shutdown())
def shutdown(self):
# if this is our second call to shutdown, just sys.exit
@@ -222,6 +225,7 @@ class Node:
),
local_event_sender=self.router.sender(topics.LOCAL_EVENTS),
event_index_counter=self.event_index_counter,
offline=self.offline,
)
self._tg.start_soon(self.download_coordinator.run)
if self.worker:
@@ -260,6 +264,9 @@ def main():
logger.info("Starting EXO")
logger.info(f"EXO_LIBP2P_NAMESPACE: {os.getenv('EXO_LIBP2P_NAMESPACE')}")
if args.offline:
logger.info("Running in OFFLINE mode — no internet checks, local models only")
# Set FAST_SYNCH override env var for runner subprocesses
if args.fast_synch is True:
os.environ["EXO_FAST_SYNCH"] = "on"
@@ -282,6 +289,7 @@ class Args(CamelCaseModel):
tb_only: bool = False
no_worker: bool = False
no_downloads: bool = False
offline: bool = False
fast_synch: bool | None = None # None = auto, True = force on, False = force off
@classmethod
@@ -329,6 +337,11 @@ class Args(CamelCaseModel):
action="store_true",
help="Disable the download coordinator (node won't download models)",
)
parser.add_argument(
"--offline",
action="store_true",
help="Run in offline/air-gapped mode: skip internet checks, use only pre-staged local models",
)
fast_synch_group = parser.add_mutually_exclusive_group()
fast_synch_group.add_argument(
"--fast-synch",

View File

@@ -17,6 +17,7 @@ from exo.shared.types.api import (
LogprobsContentItem,
StreamingChoiceResponse,
ToolCall,
Usage,
)
from exo.shared.types.chunks import ErrorChunk, TokenChunk, ToolCallChunk
from exo.shared.types.common import CommandId
@@ -125,6 +126,8 @@ async def generate_chat_stream(
chunk_stream: AsyncGenerator[ErrorChunk | ToolCallChunk | TokenChunk, None],
) -> AsyncGenerator[str, None]:
"""Generate Chat Completions API streaming events from chunks."""
last_usage: Usage | None = None
async for chunk in chunk_stream:
if isinstance(chunk, ErrorChunk):
error_response = ErrorResponse(
@@ -138,6 +141,8 @@ async def generate_chat_stream(
yield "data: [DONE]\n\n"
return
last_usage = chunk.usage or last_usage
if isinstance(chunk, ToolCallChunk):
tool_call_deltas = [
ToolCall(
@@ -161,12 +166,15 @@ async def generate_chat_stream(
finish_reason="tool_calls",
)
],
usage=last_usage,
)
yield f"data: {tool_response.model_dump_json()}\n\n"
yield "data: [DONE]\n\n"
return
chunk_response = chunk_to_response(chunk, command_id)
if chunk.finish_reason is not None:
chunk_response = chunk_response.model_copy(update={"usage": last_usage})
yield f"data: {chunk_response.model_dump_json()}\n\n"
if chunk.finish_reason is not None:
@@ -176,7 +184,9 @@ async def generate_chat_stream(
async def collect_chat_response(
command_id: CommandId,
chunk_stream: AsyncGenerator[ErrorChunk | ToolCallChunk | TokenChunk, None],
) -> ChatCompletionResponse:
) -> AsyncGenerator[str]:
# This is an AsyncGenerator[str] rather than returning a ChatCompletionResponse because
# FastAPI handles the cancellation better but wouldn't auto-serialize for some reason
"""Collect all token chunks and return a single ChatCompletionResponse."""
text_parts: list[str] = []
tool_calls: list[ToolCall] = []
@@ -184,6 +194,7 @@ async def collect_chat_response(
model: str | None = None
finish_reason: FinishReason | None = None
error_message: str | None = None
last_usage: Usage | None = None
async for chunk in chunk_stream:
if isinstance(chunk, ErrorChunk):
@@ -193,6 +204,8 @@ async def collect_chat_response(
if model is None:
model = chunk.model
last_usage = chunk.usage or last_usage
if isinstance(chunk, TokenChunk):
text_parts.append(chunk.text)
if chunk.logprob is not None:
@@ -223,7 +236,7 @@ async def collect_chat_response(
combined_text = "".join(text_parts)
assert model is not None
return ChatCompletionResponse(
yield ChatCompletionResponse(
id=command_id,
created=int(time.time()),
model=model,
@@ -241,4 +254,6 @@ async def collect_chat_response(
finish_reason=finish_reason,
)
],
)
usage=last_usage,
).model_dump_json()
return
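Because `collect_chat_response` now yields a serialized JSON string instead of returning the model directly, a caller that wants the typed object joins the yielded parts and re-parses them, the same pattern as the Claude test helper added further below. A minimal sketch, with the module paths assumed (only the function and model names come from this diff):
# Sketch only: both import paths below are assumptions about where these names live.
from exo.master.adapters.chat import collect_chat_response
from exo.shared.types.api import ChatCompletionResponse
async def collect_typed_chat_response(command_id, chunk_stream) -> ChatCompletionResponse:
    # Join the streamed JSON fragments and validate them back into the response model.
    parts = [part async for part in collect_chat_response(command_id, chunk_stream)]
    return ChatCompletionResponse.model_validate_json("".join(parts))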

View File

@@ -4,7 +4,7 @@ import json
from collections.abc import AsyncGenerator
from typing import Any
from exo.shared.types.api import FinishReason
from exo.shared.types.api import FinishReason, Usage
from exo.shared.types.chunks import ErrorChunk, TokenChunk, ToolCallChunk
from exo.shared.types.claude_api import (
ClaudeContentBlock,
@@ -161,12 +161,14 @@ async def collect_claude_response(
command_id: CommandId,
model: str,
chunk_stream: AsyncGenerator[ErrorChunk | ToolCallChunk | TokenChunk, None],
) -> ClaudeMessagesResponse:
) -> AsyncGenerator[str]:
# This is an AsyncGenerator[str] rather than returning a ClaudeMessagesResponse because
# FastAPI handles the cancellation better but wouldn't auto-serialize for some reason
"""Collect all token chunks and return a single ClaudeMessagesResponse."""
text_parts: list[str] = []
tool_use_blocks: list[ClaudeToolUseBlock] = []
stop_reason: ClaudeStopReason | None = None
last_stats = None
last_usage: Usage | None = None
error_message: str | None = None
async for chunk in chunk_stream:
@@ -174,6 +176,8 @@ async def collect_claude_response(
error_message = chunk.error_message or "Internal server error"
break
last_usage = chunk.usage or last_usage
if isinstance(chunk, ToolCallChunk):
for tool in chunk.tool_calls:
tool_use_blocks.append(
@@ -183,12 +187,10 @@ async def collect_claude_response(
input=json.loads(tool.arguments), # pyright: ignore[reportAny]
)
)
last_stats = chunk.stats or last_stats
stop_reason = "tool_use"
continue
text_parts.append(chunk.text)
last_stats = chunk.stats or last_stats
if chunk.finish_reason is not None:
stop_reason = finish_reason_to_claude_stop_reason(chunk.finish_reason)
@@ -208,11 +210,11 @@ async def collect_claude_response(
if not content:
content.append(ClaudeTextBlock(text=""))
# Use actual usage data from stats if available
input_tokens = last_stats.prompt_tokens if last_stats else 0
output_tokens = last_stats.generation_tokens if last_stats else 0
# Use actual usage data if available
input_tokens = last_usage.prompt_tokens if last_usage else 0
output_tokens = last_usage.completion_tokens if last_usage else 0
return ClaudeMessagesResponse(
yield ClaudeMessagesResponse(
id=f"msg_{command_id}",
model=model,
content=content,
@@ -221,7 +223,8 @@ async def collect_claude_response(
input_tokens=input_tokens,
output_tokens=output_tokens,
),
)
).model_dump_json()
return
async def generate_claude_stream(
@@ -249,7 +252,7 @@ async def generate_claude_stream(
output_tokens = 0
stop_reason: ClaudeStopReason | None = None
last_stats = None
last_usage: Usage | None = None
next_block_index = 1 # text block is 0, tool blocks start at 1
async for chunk in chunk_stream:
@@ -257,8 +260,9 @@ async def generate_claude_stream(
# Close text block and bail
break
last_usage = chunk.usage or last_usage
if isinstance(chunk, ToolCallChunk):
last_stats = chunk.stats or last_stats
stop_reason = "tool_use"
# Emit tool_use content blocks
@@ -290,7 +294,6 @@ async def generate_claude_stream(
continue
output_tokens += 1 # Count each chunk as one token
last_stats = chunk.stats or last_stats
# content_block_delta
delta_event = ClaudeContentBlockDeltaEvent(
@@ -302,9 +305,9 @@ async def generate_claude_stream(
if chunk.finish_reason is not None:
stop_reason = finish_reason_to_claude_stop_reason(chunk.finish_reason)
# Use actual token count from stats if available
if last_stats is not None:
output_tokens = last_stats.generation_tokens
# Use actual token count from usage if available
if last_usage is not None:
output_tokens = last_usage.completion_tokens
# content_block_stop for text block
block_stop = ClaudeContentBlockStopEvent(index=0)

View File

@@ -4,6 +4,7 @@ from collections.abc import AsyncGenerator
from itertools import count
from typing import Any
from exo.shared.types.api import Usage
from exo.shared.types.chunks import ErrorChunk, TokenChunk, ToolCallChunk
from exo.shared.types.common import CommandId
from exo.shared.types.openai_responses import (
@@ -121,13 +122,15 @@ async def collect_responses_response(
command_id: CommandId,
model: str,
chunk_stream: AsyncGenerator[ErrorChunk | ToolCallChunk | TokenChunk, None],
) -> ResponsesResponse:
) -> AsyncGenerator[str]:
# This is an AsyncGenerator[str] rather than returning a ResponsesResponse because
# FastAPI handles the cancellation better but wouldn't auto-serialize for some reason
"""Collect all token chunks and return a single ResponsesResponse."""
response_id = f"resp_{command_id}"
item_id = f"item_{command_id}"
accumulated_text = ""
function_call_items: list[ResponseFunctionCallItem] = []
last_stats = None
last_usage: Usage | None = None
error_message: str | None = None
async for chunk in chunk_stream:
@@ -135,32 +138,32 @@ async def collect_responses_response(
error_message = chunk.error_message or "Internal server error"
break
last_usage = chunk.usage or last_usage
if isinstance(chunk, ToolCallChunk):
for tool in chunk.tool_calls:
function_call_items.append(
ResponseFunctionCallItem(
id=f"fc_{tool.id}",
call_id=f"call_{tool.id}",
id=tool.id,
call_id=tool.id,
name=tool.name,
arguments=tool.arguments,
)
)
last_stats = chunk.stats or last_stats
continue
accumulated_text += chunk.text
last_stats = chunk.stats or last_stats
if error_message is not None:
raise ValueError(error_message)
# Create usage from stats if available
# Create usage from usage data if available
usage = None
if last_stats is not None:
if last_usage is not None:
usage = ResponseUsage(
input_tokens=last_stats.prompt_tokens,
output_tokens=last_stats.generation_tokens,
total_tokens=last_stats.prompt_tokens + last_stats.generation_tokens,
input_tokens=last_usage.prompt_tokens,
output_tokens=last_usage.completion_tokens,
total_tokens=last_usage.total_tokens,
)
output: list[ResponseItem] = [
@@ -172,14 +175,15 @@ async def collect_responses_response(
]
output.extend(function_call_items)
return ResponsesResponse(
yield ResponsesResponse(
id=response_id,
model=model,
status="completed",
output=output,
output_text=accumulated_text,
usage=usage,
)
).model_dump_json()
return
async def generate_responses_stream(
@@ -235,15 +239,16 @@ async def generate_responses_stream(
accumulated_text = ""
function_call_items: list[ResponseFunctionCallItem] = []
last_stats = None
last_usage: Usage | None = None
next_output_index = 1 # message item is at 0
async for chunk in chunk_stream:
if isinstance(chunk, ErrorChunk):
break
last_usage = chunk.usage or last_usage
if isinstance(chunk, ToolCallChunk):
last_stats = chunk.stats or last_stats
for tool in chunk.tool_calls:
fc_id = f"fc_{tool.id}"
call_id = f"call_{tool.id}"
@@ -302,7 +307,6 @@ async def generate_responses_stream(
continue
accumulated_text += chunk.text
last_stats = chunk.stats or last_stats
# response.output_text.delta
delta_event = ResponseTextDeltaEvent(
@@ -346,13 +350,13 @@ async def generate_responses_stream(
)
yield f"event: response.output_item.done\ndata: {item_done.model_dump_json()}\n\n"
# Create usage from stats if available
# Create usage from usage data if available
usage = None
if last_stats is not None:
if last_usage is not None:
usage = ResponseUsage(
input_tokens=last_stats.prompt_tokens,
output_tokens=last_stats.generation_tokens,
total_tokens=last_stats.prompt_tokens + last_stats.generation_tokens,
input_tokens=last_usage.prompt_tokens,
output_tokens=last_usage.completion_tokens,
total_tokens=last_usage.total_tokens,
)
# response.completed

View File

@@ -85,6 +85,7 @@ from exo.shared.types.api import (
ImageGenerationTaskParams,
ImageListItem,
ImageListResponse,
ImageSize,
ModelList,
ModelListModel,
PlaceInstanceParams,
@@ -100,6 +101,7 @@ from exo.shared.types.api import (
TraceRankStats,
TraceResponse,
TraceStatsResponse,
normalize_image_size,
)
from exo.shared.types.chunks import (
ErrorChunk,
@@ -125,6 +127,7 @@ from exo.shared.types.commands import (
PlaceInstance,
SendInputChunk,
StartDownload,
TaskCancelled,
TaskFinished,
TextGeneration,
)
@@ -142,6 +145,7 @@ from exo.shared.types.openai_responses import (
ResponsesResponse,
)
from exo.shared.types.state import State
from exo.shared.types.worker.downloads import DownloadCompleted
from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
from exo.shared.types.worker.shards import Sharding
from exo.utils.banner import print_startup_banner
@@ -540,16 +544,14 @@ class API:
break
except anyio.get_cancelled_exc_class():
# TODO: TaskCancelled
"""
self.command_sender.send_nowait(
ForwarderCommand(origin=self.node_id, command=command)
)
"""
command = TaskCancelled(cancelled_command_id=command_id)
with anyio.CancelScope(shield=True):
await self.command_sender.send(
ForwarderCommand(origin=self.node_id, command=command)
)
raise
finally:
command = TaskFinished(finished_command_id=command_id)
await self._send(command)
await self._send(TaskFinished(finished_command_id=command_id))
if command_id in self._text_generation_queues:
del self._text_generation_queues[command_id]
@@ -644,11 +646,14 @@ class API:
"X-Accel-Buffering": "no",
},
)
return await collect_chat_response(
command.command_id,
self._token_chunk_stream(command.command_id),
)
else:
return StreamingResponse(
collect_chat_response(
command.command_id,
self._token_chunk_stream(command.command_id),
),
media_type="application/json",
)
async def bench_chat_completions(
self, payload: BenchChatCompletionRequest
@@ -664,8 +669,7 @@ class API:
command = TextGeneration(task_params=task_params)
await self._send(command)
response = await self._collect_text_generation_with_stats(command.command_id)
return response
return await self._collect_text_generation_with_stats(command.command_id)
async def _resolve_and_validate_text_model(self, model_id: ModelId) -> ModelId:
"""Validate a text model exists and return the resolved model ID.
@@ -750,9 +754,11 @@ class API:
When stream=True and partial_images > 0, returns a StreamingResponse
with SSE-formatted events for partial and final images.
"""
payload.model = await self._validate_image_model(ModelId(payload.model))
payload = payload.model_copy(
update={"advanced_params": _ensure_seed(payload.advanced_params)}
update={
"model": await self._validate_image_model(ModelId(payload.model)),
"advanced_params": _ensure_seed(payload.advanced_params),
}
)
command = ImageGeneration(
@@ -883,6 +889,11 @@ class API:
del image_metadata[key]
except anyio.get_cancelled_exc_class():
command = TaskCancelled(cancelled_command_id=command_id)
with anyio.CancelScope(shield=True):
await self.command_sender.send(
ForwarderCommand(origin=self.node_id, command=command)
)
raise
finally:
await self._send(TaskFinished(finished_command_id=command_id))
@@ -964,6 +975,11 @@ class API:
return (images, stats if capture_stats else None)
except anyio.get_cancelled_exc_class():
command = TaskCancelled(cancelled_command_id=command_id)
with anyio.CancelScope(shield=True):
await self.command_sender.send(
ForwarderCommand(origin=self.node_id, command=command)
)
raise
finally:
await self._send(TaskFinished(finished_command_id=command_id))
@@ -998,12 +1014,13 @@ class API:
async def bench_image_generations(
self, request: Request, payload: BenchImageGenerationTaskParams
) -> BenchImageGenerationResponse:
payload.model = await self._validate_image_model(ModelId(payload.model))
payload.stream = False
payload.partial_images = 0
payload = payload.model_copy(
update={"advanced_params": _ensure_seed(payload.advanced_params)}
update={
"model": await self._validate_image_model(ModelId(payload.model)),
"stream": False,
"partial_images": 0,
"advanced_params": _ensure_seed(payload.advanced_params),
}
)
command = ImageGeneration(
@@ -1024,7 +1041,7 @@ class API:
prompt: str,
model: ModelId,
n: int,
size: str,
size: ImageSize,
response_format: Literal["url", "b64_json"],
input_fidelity: Literal["low", "high"],
stream: bool,
@@ -1094,7 +1111,7 @@ class API:
prompt: str = Form(...),
model: str = Form(...),
n: int = Form(1),
size: str = Form("1024x1024"),
size: str | None = Form(None),
response_format: Literal["url", "b64_json"] = Form("b64_json"),
input_fidelity: Literal["low", "high"] = Form("low"),
stream: str = Form("false"),
@@ -1120,7 +1137,7 @@ class API:
prompt=prompt,
model=ModelId(model),
n=n,
size=size,
size=normalize_image_size(size),
response_format=response_format,
input_fidelity=input_fidelity,
stream=stream_bool,
@@ -1156,7 +1173,7 @@ class API:
prompt: str = Form(...),
model: str = Form(...),
n: int = Form(1),
size: str = Form("1024x1024"),
size: str | None = Form(None),
response_format: Literal["url", "b64_json"] = Form("b64_json"),
input_fidelity: Literal["low", "high"] = Form("low"),
quality: Literal["high", "medium", "low"] = Form("medium"),
@@ -1176,7 +1193,7 @@ class API:
prompt=prompt,
model=ModelId(model),
n=n,
size=size,
size=normalize_image_size(size),
response_format=response_format,
input_fidelity=input_fidelity,
stream=False,
@@ -1221,12 +1238,15 @@ class API:
"X-Accel-Buffering": "no",
},
)
return await collect_claude_response(
command.command_id,
payload.model,
self._token_chunk_stream(command.command_id),
)
else:
return StreamingResponse(
collect_claude_response(
command.command_id,
payload.model,
self._token_chunk_stream(command.command_id),
),
media_type="application/json",
)
async def openai_responses(
self, payload: ResponsesRequest
@@ -1254,11 +1274,15 @@ class API:
},
)
return await collect_responses_response(
command.command_id,
payload.model,
self._token_chunk_stream(command.command_id),
)
else:
return StreamingResponse(
collect_responses_response(
command.command_id,
payload.model,
self._token_chunk_stream(command.command_id),
),
media_type="application/json",
)
def _calculate_total_available_memory(self) -> Memory:
"""Calculate total available memory across all nodes in bytes."""
@@ -1269,8 +1293,18 @@ class API:
return total_available
async def get_models(self) -> ModelList:
"""Returns list of available models."""
async def get_models(self, status: str | None = Query(default=None)) -> ModelList:
"""Returns list of available models, optionally filtered by being downloaded."""
cards = await get_model_cards()
if status == "downloaded":
downloaded_model_ids: set[str] = set()
for node_downloads in self.state.downloads.values():
for dl in node_downloads:
if isinstance(dl, DownloadCompleted):
downloaded_model_ids.add(dl.shard_metadata.model_card.model_id)
cards = [c for c in cards if c.model_id in downloaded_model_ids]
return ModelList(
data=[
ModelListModel(
@@ -1288,7 +1322,7 @@ class API:
base_model=card.base_model,
capabilities=card.capabilities,
)
for card in await get_model_cards()
for card in cards
]
)
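A hedged client-side sketch of the new filter: the handler above accepts `status=downloaded` and keeps only model cards whose id appears in some node's `DownloadCompleted` entries. The route path and port below are assumptions about the deployment; only the `status` query parameter is taken from the code above.
# Assumptions: the handler is mounted at /models and the API listens on this port.
import requests
API = "http://localhost:52415"  # adjust to your node's API port
resp = requests.get(f"{API}/models", params={"status": "downloaded"}, timeout=5)
resp.raise_for_status()
print(resp.json())  # only models with a completed download on some node should appear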

View File

@@ -24,6 +24,7 @@ from exo.shared.types.commands import (
PlaceInstance,
RequestEventLog,
SendInputChunk,
TaskCancelled,
TaskFinished,
TestCommand,
TextGeneration,
@@ -39,6 +40,7 @@ from exo.shared.types.events import (
NodeTimedOut,
TaskCreated,
TaskDeleted,
TaskStatusUpdated,
TraceEventData,
TracesCollected,
TracesMerged,
@@ -279,7 +281,7 @@ class Master:
case DeleteInstance():
placement = delete_instance(command, self.state.instances)
transition_events = get_transition_events(
self.state.instances, placement
self.state.instances, placement, self.state.tasks
)
for cmd in cancel_unnecessary_downloads(
placement, self.state.downloads
@@ -299,7 +301,7 @@ class Master:
self.state.node_network,
)
transition_events = get_transition_events(
self.state.instances, placement
self.state.instances, placement, self.state.tasks
)
generated_events.extend(transition_events)
case CreateInstance():
@@ -309,7 +311,7 @@ class Master:
self.state.instances,
)
transition_events = get_transition_events(
self.state.instances, placement
self.state.instances, placement, self.state.tasks
)
generated_events.extend(transition_events)
case SendInputChunk(chunk=chunk):
@@ -319,6 +321,18 @@ class Master:
chunk=chunk,
)
)
case TaskCancelled():
if (
task_id := self.command_task_mapping.get(
command.cancelled_command_id
)
) is not None:
generated_events.append(
TaskStatusUpdated(
task_status=TaskStatus.Cancelled,
task_id=task_id,
)
)
case TaskFinished():
generated_events.append(
TaskDeleted(
@@ -327,10 +341,9 @@ class Master:
]
)
)
if command.finished_command_id in self.command_task_mapping:
del self.command_task_mapping[
command.finished_command_id
]
self.command_task_mapping.pop(
command.finished_command_id, None
)
case RequestEventLog():
# We should just be able to send everything, since other buffers will ignore old messages
# rate limit to 1000 at a time

View File

@@ -22,9 +22,15 @@ from exo.shared.types.commands import (
PlaceInstance,
)
from exo.shared.types.common import NodeId
from exo.shared.types.events import Event, InstanceCreated, InstanceDeleted
from exo.shared.types.events import (
Event,
InstanceCreated,
InstanceDeleted,
TaskStatusUpdated,
)
from exo.shared.types.memory import Memory
from exo.shared.types.profiling import MemoryUsage, NodeNetworkInfo
from exo.shared.types.tasks import Task, TaskId, TaskStatus
from exo.shared.types.worker.downloads import (
DownloadOngoing,
DownloadProgress,
@@ -186,6 +192,7 @@ def delete_instance(
def get_transition_events(
current_instances: Mapping[InstanceId, Instance],
target_instances: Mapping[InstanceId, Instance],
tasks: Mapping[TaskId, Task],
) -> Sequence[Event]:
events: list[Event] = []
@@ -201,6 +208,18 @@ def get_transition_events(
# find instances to delete
for instance_id in current_instances:
if instance_id not in target_instances:
for task in tasks.values():
if task.instance_id == instance_id and task.task_status in [
TaskStatus.Pending,
TaskStatus.Running,
]:
events.append(
TaskStatusUpdated(
task_status=TaskStatus.Cancelled,
task_id=task.task_id,
)
)
events.append(
InstanceDeleted(
instance_id=instance_id,

View File

@@ -4,7 +4,11 @@ import json
from collections.abc import AsyncGenerator
from typing import Any, cast
from exo.master.adapters.claude import collect_claude_response, generate_claude_stream
from exo.master.adapters.claude import (
ClaudeMessagesResponse,
collect_claude_response,
generate_claude_stream,
)
from exo.shared.types.api import ToolCallItem
from exo.shared.types.chunks import ErrorChunk, TokenChunk, ToolCallChunk
from exo.shared.types.common import CommandId, ModelId
@@ -17,6 +21,18 @@ async def _chunks_to_stream(
yield chunk
async def _collect_response(
command_id: CommandId,
model: str,
chunk_stream: AsyncGenerator[ErrorChunk | ToolCallChunk | TokenChunk, None],
) -> ClaudeMessagesResponse:
"""Helper to consume the async generator and parse the JSON response."""
parts: list[str] = []
async for part in collect_claude_response(command_id, model, chunk_stream):
parts.append(part)
return ClaudeMessagesResponse.model_validate_json("".join(parts))
MODEL = ModelId("test-model")
COMMAND_ID = CommandId("cmd_test123")
@@ -47,7 +63,7 @@ class TestCollectClaudeResponseToolUse:
],
),
]
response = await collect_claude_response(
response = await _collect_response(
COMMAND_ID, "test-model", _chunks_to_stream(chunks)
)
@@ -77,7 +93,7 @@ class TestCollectClaudeResponseToolUse:
],
),
]
response = await collect_claude_response(
response = await _collect_response(
COMMAND_ID, "test-model", _chunks_to_stream(chunks)
)
@@ -102,7 +118,7 @@ class TestCollectClaudeResponseToolUse:
],
),
]
response = await collect_claude_response(
response = await _collect_response(
COMMAND_ID, "test-model", _chunks_to_stream(chunks)
)
@@ -116,7 +132,7 @@ class TestCollectClaudeResponseToolUse:
async def test_no_content_produces_empty_text_block(self):
chunks: list[ErrorChunk | ToolCallChunk | TokenChunk] = []
response = await collect_claude_response(
response = await _collect_response(
COMMAND_ID, "test-model", _chunks_to_stream(chunks)
)
assert len(response.content) == 1

View File

@@ -239,7 +239,7 @@ def test_get_transition_events_no_change(instance: Instance):
target_instances = {instance_id: instance}
# act
events = get_transition_events(current_instances, target_instances)
events = get_transition_events(current_instances, target_instances, {})
# assert
assert len(events) == 0
@@ -252,7 +252,7 @@ def test_get_transition_events_create_instance(instance: Instance):
target_instances: dict[InstanceId, Instance] = {instance_id: instance}
# act
events = get_transition_events(current_instances, target_instances)
events = get_transition_events(current_instances, target_instances, {})
# assert
assert len(events) == 1
@@ -266,7 +266,7 @@ def test_get_transition_events_delete_instance(instance: Instance):
target_instances: dict[InstanceId, Instance] = {}
# act
events = get_transition_events(current_instances, target_instances)
events = get_transition_events(current_instances, target_instances, {})
# assert
assert len(events) == 1

View File

@@ -184,19 +184,10 @@ def apply_instance_created(event: InstanceCreated, state: State) -> State:
def apply_instance_deleted(event: InstanceDeleted, state: State) -> State:
deleted_instance = state.instances.get(event.instance_id)
new_instances: Mapping[InstanceId, Instance] = {
iid: inst for iid, inst in state.instances.items() if iid != event.instance_id
}
runner_ids_to_remove: set[RunnerId] = set()
if deleted_instance is not None:
runner_ids_to_remove = set(
deleted_instance.shard_assignments.runner_to_shard.keys()
)
new_runners: Mapping[RunnerId, RunnerStatus] = {
rid: rs for rid, rs in state.runners.items() if rid not in runner_ids_to_remove
}
return state.model_copy(update={"instances": new_instances, "runners": new_runners})
return state.model_copy(update={"instances": new_instances})
def apply_runner_status_updated(event: RunnerStatusUpdated, state: State) -> State:
@@ -227,11 +218,6 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
key: value for key, value in state.downloads.items() if key != event.node_id
}
# Clean up all granular node mappings
node_identities = {
key: value
for key, value in state.node_identities.items()
if key != event.node_id
}
node_memory = {
key: value for key, value in state.node_memory.items() if key != event.node_id
}
@@ -272,7 +258,6 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
"downloads": downloads,
"topology": topology,
"last_seen": last_seen,
"node_identities": node_identities,
"node_memory": node_memory,
"node_disk": node_disk,
"node_system": node_system,

View File

@@ -44,7 +44,8 @@ async def _refresh_card_cache():
async for toml_file in path.rglob("*.toml"):
try:
card = await ModelCard.load_from_path(toml_file)
_card_cache[card.model_id] = card
if card.model_id not in _card_cache:
_card_cache[card.model_id] = card
except (ValidationError, TOMLKitError):
pass
@@ -182,6 +183,7 @@ class ConfigData(BaseModel):
def supports_tensor(self) -> bool:
return self.architectures in [
["Glm4MoeLiteForCausalLM"],
["GlmMoeDsaForCausalLM"],
["DeepseekV32ForCausalLM"],
["DeepseekV3ForCausalLM"],
["Qwen3NextForCausalLM"],

View File

@@ -1,142 +0,0 @@
from exo.shared.apply import apply_instance_deleted
from exo.shared.models.model_cards import ModelId
from exo.shared.tests.conftest import get_pipeline_shard_metadata
from exo.shared.types.common import NodeId
from exo.shared.types.events import InstanceDeleted
from exo.shared.types.state import State
from exo.shared.types.worker.instances import InstanceId, MlxRingInstance
from exo.shared.types.worker.runners import (
RunnerId,
RunnerReady,
ShardAssignments,
)
from exo.shared.types.worker.shards import ShardMetadata
from exo.worker.tests.constants import (
INSTANCE_1_ID,
INSTANCE_2_ID,
MODEL_A_ID,
MODEL_B_ID,
NODE_A,
NODE_B,
RUNNER_1_ID,
RUNNER_2_ID,
)
def _make_instance(
instance_id: InstanceId,
model_id: ModelId,
node_to_runner: dict[NodeId, RunnerId],
runner_to_shard: dict[RunnerId, ShardMetadata],
) -> MlxRingInstance:
return MlxRingInstance(
instance_id=instance_id,
shard_assignments=ShardAssignments(
model_id=model_id,
node_to_runner=node_to_runner,
runner_to_shard=runner_to_shard,
),
hosts_by_node={},
ephemeral_port=50000,
)
def test_instance_deleted_removes_runners():
"""Deleting an instance must also remove its runner entries from state."""
shard = get_pipeline_shard_metadata(MODEL_A_ID, device_rank=0)
instance = _make_instance(
INSTANCE_1_ID,
MODEL_A_ID,
{NODE_A: RUNNER_1_ID},
{RUNNER_1_ID: shard},
)
state = State(
instances={INSTANCE_1_ID: instance},
runners={RUNNER_1_ID: RunnerReady()},
)
new_state = apply_instance_deleted(
InstanceDeleted(instance_id=INSTANCE_1_ID), state
)
assert INSTANCE_1_ID not in new_state.instances
assert RUNNER_1_ID not in new_state.runners
def test_instance_deleted_removes_only_its_runners():
"""Deleting one instance must not remove runners belonging to another."""
shard_a = get_pipeline_shard_metadata(MODEL_A_ID, device_rank=0)
shard_b = get_pipeline_shard_metadata(MODEL_B_ID, device_rank=0)
instance_1 = _make_instance(
INSTANCE_1_ID,
MODEL_A_ID,
{NODE_A: RUNNER_1_ID},
{RUNNER_1_ID: shard_a},
)
instance_2 = _make_instance(
INSTANCE_2_ID,
MODEL_B_ID,
{NODE_B: RUNNER_2_ID},
{RUNNER_2_ID: shard_b},
)
state = State(
instances={INSTANCE_1_ID: instance_1, INSTANCE_2_ID: instance_2},
runners={RUNNER_1_ID: RunnerReady(), RUNNER_2_ID: RunnerReady()},
)
new_state = apply_instance_deleted(
InstanceDeleted(instance_id=INSTANCE_1_ID), state
)
assert INSTANCE_1_ID not in new_state.instances
assert RUNNER_1_ID not in new_state.runners
# Instance 2 and its runner must remain
assert INSTANCE_2_ID in new_state.instances
assert RUNNER_2_ID in new_state.runners
def test_instance_deleted_multi_node_removes_all_runners():
"""Deleting a multi-node instance removes all of its runners."""
shard1 = get_pipeline_shard_metadata(MODEL_A_ID, device_rank=0, world_size=2)
shard2 = get_pipeline_shard_metadata(MODEL_A_ID, device_rank=1, world_size=2)
instance = _make_instance(
INSTANCE_1_ID,
MODEL_A_ID,
{NODE_A: RUNNER_1_ID, NODE_B: RUNNER_2_ID},
{RUNNER_1_ID: shard1, RUNNER_2_ID: shard2},
)
state = State(
instances={INSTANCE_1_ID: instance},
runners={RUNNER_1_ID: RunnerReady(), RUNNER_2_ID: RunnerReady()},
)
new_state = apply_instance_deleted(
InstanceDeleted(instance_id=INSTANCE_1_ID), state
)
assert INSTANCE_1_ID not in new_state.instances
assert RUNNER_1_ID not in new_state.runners
assert RUNNER_2_ID not in new_state.runners
assert len(new_state.runners) == 0
def test_instance_deleted_unknown_id_is_noop_for_runners():
"""Deleting a non-existent instance should not affect runners."""
shard = get_pipeline_shard_metadata(MODEL_A_ID, device_rank=0)
instance = _make_instance(
INSTANCE_1_ID,
MODEL_A_ID,
{NODE_A: RUNNER_1_ID},
{RUNNER_1_ID: shard},
)
unknown_id = InstanceId("99999999-9999-4999-8999-999999999999")
state = State(
instances={INSTANCE_1_ID: instance},
runners={RUNNER_1_ID: RunnerReady()},
)
new_state = apply_instance_deleted(InstanceDeleted(instance_id=unknown_id), state)
# Everything should remain untouched
assert INSTANCE_1_ID in new_state.instances
assert RUNNER_1_ID in new_state.runners

View File

@@ -1,10 +1,9 @@
import time
from collections.abc import Generator
from typing import Annotated, Any, Literal
from typing import Annotated, Any, Literal, get_args
from uuid import uuid4
from pydantic import BaseModel, Field, field_validator
from pydantic_core import PydanticUseDefault
from exo.shared.models.model_cards import ModelCard, ModelId
from exo.shared.types.common import CommandId, NodeId
@@ -228,13 +227,6 @@ class PlaceInstanceParams(BaseModel):
instance_meta: InstanceMeta = InstanceMeta.MlxRing
min_nodes: int = 1
@field_validator("sharding", "instance_meta", mode="plain")
@classmethod
def use_default(cls, v: object):
if not v or not isinstance(v, (Sharding, InstanceMeta)):
raise PydanticUseDefault()
return v
class CreateInstanceParams(BaseModel):
instance: Instance
@@ -270,6 +262,27 @@ class DeleteInstanceResponse(BaseModel):
instance_id: InstanceId
ImageSize = Literal[
"auto",
"512x512",
"768x768",
"1024x768",
"768x1024",
"1024x1024",
"1024x1536",
"1536x1024",
]
def normalize_image_size(v: object) -> ImageSize:
"""Shared validator for ImageSize fields: maps None → "auto" and rejects invalid values."""
if v is None:
return "auto"
if v not in get_args(ImageSize):
raise ValueError(f"Invalid size: {v!r}. Must be one of {get_args(ImageSize)}")
return v # pyright: ignore[reportReturnType]
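As a quick illustration of the validator's behaviour (a sketch that assumes `normalize_image_size` above is in scope; the values come from the `ImageSize` literal):
# None falls back to "auto"; anything outside the ImageSize literal is rejected.
assert normalize_image_size(None) == "auto"
assert normalize_image_size("1024x1024") == "1024x1024"
try:
    normalize_image_size("640x480")  # not a member of ImageSize
except ValueError as exc:
    print(exc)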
class AdvancedImageParams(BaseModel):
seed: Annotated[int, Field(ge=0)] | None = None
num_inference_steps: Annotated[int, Field(ge=1, le=100)] | None = None
@@ -289,7 +302,7 @@ class ImageGenerationTaskParams(BaseModel):
partial_images: int | None = 0
quality: Literal["high", "medium", "low"] | None = "medium"
response_format: Literal["url", "b64_json"] | None = "b64_json"
size: str | None = "1024x1024"
size: ImageSize = "auto"
stream: bool | None = False
style: str | None = "vivid"
user: str | None = None
@@ -297,6 +310,11 @@ class ImageGenerationTaskParams(BaseModel):
# Internal flag for benchmark mode - set by API, preserved through serialization
bench: bool = False
@field_validator("size", mode="before")
@classmethod
def normalize_size(cls, v: object) -> ImageSize:
return normalize_image_size(v)
class BenchImageGenerationTaskParams(ImageGenerationTaskParams):
bench: bool = True
@@ -313,13 +331,18 @@ class ImageEditsTaskParams(BaseModel):
quality: Literal["high", "medium", "low"] | None = "medium"
output_format: Literal["png", "jpeg", "webp"] = "png"
response_format: Literal["url", "b64_json"] | None = "b64_json"
size: str | None = "1024x1024"
size: ImageSize = "auto"
image_strength: float | None = 0.7
stream: bool = False
partial_images: int | None = 0
advanced_params: AdvancedImageParams | None = None
bench: bool = False
@field_validator("size", mode="before")
@classmethod
def normalize_size(cls, v: object) -> ImageSize:
return normalize_image_size(v)
def __repr_args__(self) -> Generator[tuple[str, Any], None, None]:
for name, value in super().__repr_args__(): # pyright: ignore[reportAny]
if name == "image_data":
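
The new `size` handling can be exercised in isolation. A minimal sketch, assuming only what the hunk above shows; the `Params` model and the trimmed `ImageSize` literal are stand-ins for illustration, not exo's actual classes:

```python
# Minimal sketch of the size normalization above; ImageSize is trimmed to three
# values for brevity and Params is a hypothetical stand-in for the task models.
from typing import Literal, get_args

from pydantic import BaseModel, field_validator

ImageSize = Literal["auto", "512x512", "1024x1024"]


def normalize_image_size(v: object) -> ImageSize:
    """Map None to "auto" and reject sizes outside the literal."""
    if v is None:
        return "auto"
    if v not in get_args(ImageSize):
        raise ValueError(f"Invalid size: {v!r}. Must be one of {get_args(ImageSize)}")
    return v  # type: ignore[return-value]


class Params(BaseModel):
    size: ImageSize = "auto"

    @field_validator("size", mode="before")
    @classmethod
    def normalize_size(cls, v: object) -> ImageSize:
        return normalize_image_size(v)


assert Params().size == "auto"                       # default
assert Params(size=None).size == "auto"              # None normalized before validation
assert Params(size="1024x1024").size == "1024x1024"  # valid literal passes through
```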


@@ -48,6 +48,10 @@ class DeleteInstance(BaseCommand):
instance_id: InstanceId
class TaskCancelled(BaseCommand):
cancelled_command_id: CommandId
class TaskFinished(BaseCommand):
finished_command_id: CommandId
@@ -89,6 +93,7 @@ Command = (
| PlaceInstance
| CreateInstance
| DeleteInstance
| TaskCancelled
| TaskFinished
| SendInputChunk
)


@@ -4,10 +4,13 @@ from collections.abc import Sequence
from mlx_lm.models.cache import (
ArraysCache,
CacheList,
KVCache,
QuantizedKVCache,
RotatingKVCache,
)
# This list contains one cache entry per transformer layer
KVCacheType = Sequence[KVCache | RotatingKVCache | QuantizedKVCache | ArraysCache]
KVCacheType = Sequence[
KVCache | RotatingKVCache | QuantizedKVCache | ArraysCache | CacheList
]


@@ -24,6 +24,7 @@ class TaskStatus(str, Enum):
Complete = "Complete"
TimedOut = "TimedOut"
Failed = "Failed"
Cancelled = "Cancelled"
class BaseTask(TaggedModel):
@@ -60,6 +61,11 @@ class TextGeneration(BaseTask): # emitted by Master
error_message: str | None = Field(default=None)
class CancelTask(BaseTask):
cancelled_task_id: TaskId
runner_id: RunnerId
class ImageGeneration(BaseTask): # emitted by Master
command_id: CommandId
task_params: ImageGenerationTaskParams
@@ -87,6 +93,7 @@ Task = (
| LoadModel
| StartWarmup
| TextGeneration
| CancelTask
| ImageGeneration
| ImageEdits
| Shutdown


@@ -26,6 +26,7 @@ class DownloadProgressData(CamelCaseModel):
class BaseDownloadProgress(TaggedModel):
node_id: NodeId
shard_metadata: ShardMetadata
model_directory: str = ""
class DownloadPending(BaseDownloadProgress):


@@ -62,6 +62,7 @@ class PartialImageResponse(BaseRunnerResponse):
class ToolCallResponse(BaseRunnerResponse):
tool_calls: list[ToolCallItem]
usage: Usage | None
stats: GenerationStats | None = None
class FinishedResponse(BaseRunnerResponse):


@@ -50,9 +50,7 @@ class RunnerReady(BaseRunnerStatus):
class RunnerRunning(BaseRunnerStatus):
"""Runner is processing requests and can accept more (continuous batching)."""
active_requests: int = 0
pass
class RunnerShuttingDown(BaseRunnerStatus):


@@ -1,5 +1,7 @@
import sys
def print_startup_banner(port: int) -> None:
"""Print a prominent startup banner with API endpoint information."""
dashboard_url = f"http://localhost:{port}"
banner = f"""
╔═══════════════════════════════════════════════════════════════════════╗
@@ -27,4 +29,4 @@ def print_startup_banner(port: int) -> None:
"""
print(banner)
print(banner, file=sys.stderr)


@@ -1,3 +1,4 @@
import contextlib
import multiprocessing as mp
from dataclasses import dataclass, field
from math import inf
@@ -125,12 +126,15 @@ class MpSender[T]:
self._state.buffer.put(item, block=True)
async def send_async(self, item: T) -> None:
await to_thread.run_sync(self.send, item, limiter=CapacityLimiter(1))
await to_thread.run_sync(
self.send, item, limiter=CapacityLimiter(1), abandon_on_cancel=True
)
def close(self) -> None:
if not self._state.closed.is_set():
self._state.closed.set()
self._state.buffer.put(_MpEndOfStream())
with contextlib.suppress(Exception):
self._state.buffer.put_nowait(_MpEndOfStream())
self._state.buffer.close()
# == unique to Mp channels ==
@@ -202,6 +206,8 @@ class MpReceiver[T]:
def close(self) -> None:
if not self._state.closed.is_set():
self._state.closed.set()
with contextlib.suppress(Exception):
self._state.buffer.put_nowait(_MpEndOfStream())
self._state.buffer.close()
# == unique to Mp channels ==
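
The fix hinges on putting a sentinel on the queue before closing it, so a thread parked in `queue.get()` wakes up instead of keeping the process alive. A minimal sketch of that pattern with a plain `multiprocessing.Queue`; `_EndOfStream` is a stand-in for exo's `_MpEndOfStream`, not the real class:

```python
# Minimal sketch of the end-of-stream sentinel pattern used in close() above.
import multiprocessing as mp
import threading


class _EndOfStream:
    pass


def consume(q: "mp.Queue[object]") -> None:
    while True:
        item = q.get()  # blocks until an item or the sentinel arrives
        if isinstance(item, _EndOfStream):
            return      # unblocked cleanly instead of hanging forever


if __name__ == "__main__":
    q: "mp.Queue[object]" = mp.Queue()
    t = threading.Thread(target=consume, args=(q,))
    t.start()
    q.put(_EndOfStream())  # wake the blocked consumer first...
    q.close()              # ...then close the underlying buffer
    t.join()               # joins promptly; no shutdown hang
```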


@@ -14,6 +14,7 @@ from exo.shared.types.api import (
ImageEditsTaskParams,
ImageGenerationStats,
ImageGenerationTaskParams,
ImageSize,
)
from exo.shared.types.memory import Memory
from exo.shared.types.worker.runner_response import (
@@ -23,9 +24,9 @@ from exo.shared.types.worker.runner_response import (
from exo.worker.engines.image.distributed_model import DistributedImageModel
def parse_size(size_str: str | None) -> tuple[int, int]:
def parse_size(size_str: ImageSize) -> tuple[int, int]:
"""Parse size parameter like '1024x1024' to (width, height) tuple."""
if not size_str:
if size_str == "auto":
return (1024, 1024)
try:
@@ -109,6 +110,9 @@ def generate_image(
# Decode base64 image data and save to temp file
image_path = Path(tmpdir) / "input.png"
image_path.write_bytes(base64.b64decode(task.image_data))
if task.size == "auto":
with Image.open(image_path) as img:
width, height = img.size
for image_num in range(num_images):
# Increment seed for each image to ensure unique results
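
A minimal sketch of the resulting size handling, assuming "auto" means 1024x1024 for plain generation and the input image's own dimensions for edits, as the hunks above suggest; the function names here are illustrative, not exo's API:

```python
# Minimal sketch of the "auto" size handling; the edit path assumes "auto"
# means "keep the source image's resolution".
from PIL import Image


def parse_size(size_str: str) -> tuple[int, int]:
    """Parse '1024x768'-style strings; 'auto' falls back to 1024x1024."""
    if size_str == "auto":
        return (1024, 1024)
    width, height = (int(part) for part in size_str.split("x"))
    return (width, height)


def edit_dimensions(size_str: str, input_image_path: str) -> tuple[int, int]:
    """For image edits, 'auto' reuses the source image's own dimensions."""
    if size_str == "auto":
        with Image.open(input_image_path) as img:
            return img.size  # (width, height)
    return parse_size(size_str)
```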


@@ -163,11 +163,14 @@ class PipelineLastLayer(CustomMlxLayer):
output, (self.r + 1) % self.s, group=self.group
)
if cache is not None:
cache.keys = mx.depends(cache.keys, output) # type: ignore[reportUnknownMemberType]
# CacheList (used by MLA models like DeepSeekV32, GLM MoE DSA)
# doesn't have .keys directly; access via first sub-cache.
_cache = cache[0] if hasattr(cache, "caches") else cache # type: ignore
_cache.keys = mx.depends(_cache.keys, output) # type: ignore
if self.is_prefill:
mx.eval(output)
if cache is not None:
mx.eval(cache.keys) # type: ignore
mx.eval(_cache.keys) # type: ignore
if not self.is_prefill:
output = mx.distributed.all_gather(output, group=self.group)[
@@ -307,7 +310,9 @@ def patch_pipeline_model[T](model: T, group: mx.distributed.Group) -> T:
# Add dependency to last cache entry to ensure distributed ops are evaluated
if cache is not None:
cache[-1].state = mx.depends(cache[-1].state, logits) # type: ignore
last = cache[-1] # type: ignore
dep_cache = last[0] if hasattr(last, "caches") else last # type: ignore
dep_cache.keys = mx.depends(dep_cache.keys, logits) # type: ignore
return logits
@@ -333,7 +338,9 @@ def patch_tensor_model[T](model: T) -> T:
# Add dependency to last cache entry to ensure distributed ops are evaluated
if cache is not None and len(cache) > 0: # pyright: ignore[reportAny]
cache[-1].state = mx.depends(cache[-1].state, logits) # pyright: ignore[reportAny,reportUnknownMemberType]
last = cache[-1] # pyright: ignore[reportAny]
dep_cache = last[0] if hasattr(last, "caches") else last # pyright: ignore[reportAny]
dep_cache.keys = mx.depends(dep_cache.keys, logits) # pyright: ignore[reportAny,reportUnknownMemberType]
return logits
@@ -547,10 +554,12 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
on_timeout: TimeoutCallback | None,
) -> nn.Module:
model = cast(DeepseekV3Model, model)
for layer in model.layers:
eval_with_timeout(
layer.parameters(), timeout_seconds / len(model.layers), on_timeout
)
# Shard the self attention
if layer.self_attn.q_lora_rank is None:
layer.self_attn.q_proj = self.all_to_sharded_linear(
@@ -581,12 +590,18 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
# Shard the MoE. Shard in place since the MoE should be responsible
# for aggregating the results.
# Shard the MoE.
else:
self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.gate_proj)
self.sharded_to_all_linear_in_place(layer.mlp.shared_experts.down_proj)
self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.up_proj)
if getattr(layer.mlp, "shared_experts", None) is not None:
self.all_to_sharded_linear_in_place(
layer.mlp.shared_experts.gate_proj
)
self.sharded_to_all_linear_in_place(
layer.mlp.shared_experts.down_proj
)
self.all_to_sharded_linear_in_place(
layer.mlp.shared_experts.up_proj
)
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
@@ -779,8 +794,7 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
layer.self_attn = WrappedMiniMaxAttention(layer.self_attn, self.group) # pyright: ignore[reportAttributeAccessIssue,reportArgumentType]
# Shard the MoE. Shard in place since the MoE should be responsible
# for aggregating the results.
# Shard the MoE.
self.all_to_sharded_linear_in_place(
layer.block_sparse_moe.switch_mlp.gate_proj
)
@@ -893,8 +907,7 @@ class QwenShardingStrategy(TensorParallelShardingStrategy):
layer.self_attn.num_attention_heads //= self.N
layer.self_attn.num_key_value_heads //= self.N
# Shard the MoE. Shard in place since the MoE should be responsible
# for aggregating the results.
# Shard the MoE.
if isinstance(layer.mlp, (Qwen3MoeSparseMoeBlock, Qwen3NextSparseMoeBlock)):
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
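
MLA models wrap each layer's cache in a `CacheList`, which exposes `.caches` rather than `.keys`, so the hunks above index into the first sub-cache before calling `mx.depends`. A minimal sketch of that duck-typed dispatch with dummy classes; `FakeKVCache`/`FakeCacheList` are stand-ins, not mlx_lm types:

```python
# Minimal sketch of the sub-cache dispatch above, with dummy cache classes.
class FakeKVCache:
    def __init__(self) -> None:
        self.keys = "keys-array"    # plain caches expose .keys directly


class FakeCacheList:
    def __init__(self, *caches: FakeKVCache) -> None:
        self.caches = list(caches)  # container exposes .caches, not .keys

    def __getitem__(self, i: int) -> FakeKVCache:
        return self.caches[i]


def cache_with_keys(cache: object) -> FakeKVCache:
    # Same duck-typing test as the diff: a CacheList has .caches, so take its
    # first sub-cache; otherwise the cache itself already has .keys.
    return cache[0] if hasattr(cache, "caches") else cache  # type: ignore


assert cache_with_keys(FakeKVCache()).keys == "keys-array"
assert cache_with_keys(FakeCacheList(FakeKVCache())).keys == "keys-array"
```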


@@ -5,6 +5,7 @@ import mlx.core as mx
import psutil
from mlx_lm.models.cache import (
ArraysCache,
CacheList,
KVCache,
QuantizedKVCache,
RotatingKVCache,
@@ -17,10 +18,22 @@ from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.constants import CACHE_GROUP_SIZE, KV_CACHE_BITS
from exo.worker.runner.bootstrap import logger
# Fraction of device memory above which LRU eviction kicks in
_DEFAULT_MEMORY_THRESHOLD = 0.9
# Fraction of device memory above which LRU eviction kicks in.
# Smaller machines need more aggressive eviction.
def _default_memory_threshold() -> float:
total_gb = psutil.virtual_memory().total / (1024**3)
if total_gb >= 128:
return 0.85
if total_gb >= 64:
return 0.80
if total_gb >= 32:
return 0.75
return 0.70
_MEMORY_THRESHOLD = float(
os.environ.get("EXO_MEMORY_THRESHOLD", _DEFAULT_MEMORY_THRESHOLD)
os.environ.get("EXO_MEMORY_THRESHOLD", _default_memory_threshold())
)
@@ -64,7 +77,7 @@ def has_non_kv_caches(cache: KVCacheType) -> bool:
class KVPrefixCache:
def __init__(self, group: mx.distributed.Group | None = None):
def __init__(self, group: mx.distributed.Group | None):
self.prompts: list[mx.array] = [] # mx array of tokens (ints)
self.caches: list[KVCacheType] = []
self._snapshots: list[list[CacheSnapshot] | None] = []
@@ -156,15 +169,15 @@ class KVPrefixCache:
best_length = 0
is_exact = False
# Find best cache
# Find best cache match
for i, cached_prompt in enumerate(self.prompts):
length = get_prefix_length(prompt_tokens, cached_prompt)
if length >= max_length - 1:
best_index, best_length = i, length
is_exact = True
break
if length > best_length:
best_index, best_length = i, length
if length == max_length:
is_exact = True
best_index, best_length = i, length
break
if best_index is None:
return make_kv_cache(model), prompt_tokens, None
@@ -172,11 +185,12 @@ class KVPrefixCache:
# For exact match: trim to max_length-1 so remaining has the last token
# For partial match: trim to best_length, remaining has suffix to prefill
# This ensures stream_generate always has at least one token to start with
target = (max_length - 1) if is_exact else best_length
has_ssm = has_non_kv_caches(self.caches[best_index])
target = (max_length - 1) if is_exact and not has_ssm else best_length
restore_pos, restore_snap = self._get_snapshot(best_index, target)
# No usable snapshot — need fresh cache
if restore_snap is None and has_non_kv_caches(self.caches[best_index]):
if restore_snap is None and has_ssm:
return make_kv_cache(model), prompt_tokens, None
prompt_cache = deepcopy(self.caches[best_index])
@@ -257,10 +271,21 @@ def encode_prompt(tokenizer: TokenizerWrapper, prompt: str) -> mx.array:
return mx.array(prompt_tokens)
def _entry_length(
c: KVCache | RotatingKVCache | QuantizedKVCache | ArraysCache | CacheList,
) -> int:
# Use .offset attribute which KVCache types have (len() not implemented in older QuantizedKVCache).
if hasattr(c, "offset"):
return c.offset
# For CacheList
if hasattr(c, "size"):
return int(c.size()) # type: ignore
return 0
def cache_length(cache: KVCacheType) -> int:
"""Get the number of tokens in a KV cache."""
# Use .offset attribute which KVCache types have (len() not implemented in older QuantizedKVCache).
return max(getattr(c, "offset", 0) for c in cache)
return max(_entry_length(c) for c in cache)
def get_prefix_length(prompt: mx.array, cached_prompt: mx.array) -> int:
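
The eviction threshold now scales with total RAM while still honoring the `EXO_MEMORY_THRESHOLD` override. A minimal standalone sketch of that selection, mirroring the tiers in the hunk above rather than importing exo's module:

```python
# Minimal sketch of the RAM-tiered eviction threshold with env override;
# the tier values mirror the hunk above.
import os

import psutil


def default_memory_threshold() -> float:
    total_gb = psutil.virtual_memory().total / (1024**3)
    if total_gb >= 128:
        return 0.85
    if total_gb >= 64:
        return 0.80
    if total_gb >= 32:
        return 0.75
    return 0.70  # smaller machines evict earlier


# e.g. a 96 GB machine defaults to 0.80; EXO_MEMORY_THRESHOLD=0.9 overrides it
MEMORY_THRESHOLD = float(
    os.environ.get("EXO_MEMORY_THRESHOLD", default_memory_threshold())
)
```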


@@ -1,317 +0,0 @@
"""Batch generation engine using mlx_lm's BatchGenerator for continuous batching."""
import time
from dataclasses import dataclass, field
from typing import get_args
import mlx.core as mx
from mlx_lm.generate import BatchGenerator
from mlx_lm.sample_utils import make_sampler
from mlx_lm.tokenizer_utils import StreamingDetokenizer, TokenizerWrapper
from exo.shared.types.api import FinishReason, GenerationStats
from exo.shared.types.common import CommandId
from exo.shared.types.memory import Memory
from exo.shared.types.tasks import TaskId
from exo.shared.types.text_generation import TextGenerationTaskParams
from exo.shared.types.worker.runner_response import GenerationResponse
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.constants import MAX_TOKENS
from exo.worker.engines.mlx.generator.distributed_sync import share_object
from exo.worker.engines.mlx.utils_mlx import apply_chat_template
from exo.worker.runner.bootstrap import logger
@dataclass
class PendingInsert:
"""Pre-tokenized request ready for batch insertion."""
command_id: CommandId
task_id: TaskId
tokens: list[int]
max_tokens: int
prompt_tokens: int
temperature: float | None = None
top_p: float | None = None
top_k: int | None = None
@dataclass
class ActiveRequest:
"""Tracks an active request in the batch."""
command_id: CommandId
task_id: TaskId
uid: int # BatchGenerator's internal ID
detokenizer: StreamingDetokenizer
tokens_generated: int = 0
prompt_tokens: int = 0
start_time: float = field(default_factory=time.perf_counter)
@dataclass
class BatchedGenerationResponse:
"""Response from batch engine, tagged with command_id and task_id."""
command_id: CommandId
task_id: TaskId
response: GenerationResponse
class BatchGenerationEngine:
"""Manages continuous batching using mlx_lm's BatchGenerator."""
def __init__(
self,
model: Model,
tokenizer: TokenizerWrapper,
group: mx.distributed.Group | None = None,
max_tokens: int = MAX_TOKENS,
completion_batch_size: int = 32,
prefill_batch_size: int = 8,
prefill_step_size: int = 2048,
):
self.model = model
self.tokenizer = tokenizer
self.max_tokens = max_tokens
self.active_requests: dict[int, ActiveRequest] = {}
self._pending_inserts: list[PendingInsert] = []
self._pending_completions: list[
int
] = [] # UIDs completed but not yet synced/removed
self.group = group
self.rank = group.rank() if group else 0
self.is_distributed = group is not None and group.size() > 1
sampler = make_sampler(temp=0.7, top_p=1.0)
eos_tokens: set[int] = set(tokenizer.eos_token_ids or [])
self.batch_gen: BatchGenerator = BatchGenerator(
model=model,
max_tokens=max_tokens,
stop_tokens=eos_tokens,
sampler=sampler,
completion_batch_size=completion_batch_size,
prefill_batch_size=prefill_batch_size,
prefill_step_size=prefill_step_size,
)
logger.info(
f"BatchGenerationEngine initialized with completion_batch_size={completion_batch_size}, "
f"prefill_batch_size={prefill_batch_size}, distributed={self.is_distributed}"
)
def queue_request(
self,
command_id: CommandId,
task_id: TaskId,
task_params: TextGenerationTaskParams,
) -> str:
"""Queue a pre-tokenized request for insertion. Only rank 0 should call this.
Tokenization happens here (eagerly) so that sync_and_insert_pending()
only does the lightweight batch_gen.insert() call, keeping the decode
thread unblocked for as long as possible.
Returns the prompt string for caller use (e.g. thinking-mode detection).
"""
assert self.rank == 0, "Only rank 0 should queue requests"
prompt_str = apply_chat_template(self.tokenizer, task_params)
tokens: list[int] = self.tokenizer.encode(prompt_str, add_special_tokens=False)
max_tokens = task_params.max_output_tokens or self.max_tokens
self._pending_inserts.append(
PendingInsert(
command_id=command_id,
task_id=task_id,
tokens=tokens,
max_tokens=max_tokens,
prompt_tokens=len(tokens),
temperature=task_params.temperature,
top_p=task_params.top_p,
top_k=task_params.top_k,
)
)
logger.info(
f"Queued request {command_id} for insertion (pending={len(self._pending_inserts)}, prompt_tokens={len(tokens)})"
)
return prompt_str
def sync_and_insert_pending(self) -> list[int]:
"""Sync pre-tokenized pending inserts across ranks and insert them. Returns UIDs.
Tokens are already prepared by queue_request(), so this method only does
the lightweight batch_gen.insert() call plus distributed sync if needed.
"""
inserts_to_process: list[PendingInsert]
if not self.is_distributed:
# Non-distributed: just insert directly from pending
inserts_to_process = list(self._pending_inserts)
else:
# Distributed: broadcast pre-tokenized inserts from rank 0 to all ranks
assert self.group is not None
inserts_to_process = share_object(
self._pending_inserts if self.rank == 0 else None,
self.rank,
self.group,
)
if not inserts_to_process:
self._pending_inserts.clear()
return []
# Update sampler from per-request parameters (last request wins for batch)
last = inserts_to_process[-1]
self.batch_gen.sampler = make_sampler( # pyright: ignore[reportAttributeAccessIssue]
temp=last.temperature if last.temperature is not None else 0.7,
top_p=last.top_p if last.top_p is not None else 1.0,
top_k=last.top_k if last.top_k is not None else 0,
)
# Single batched insert for efficient prefill — tokens already prepared
all_tokens = [p.tokens for p in inserts_to_process]
all_max_tokens = [p.max_tokens for p in inserts_to_process]
uids = self.batch_gen.insert(all_tokens, max_tokens=all_max_tokens)
# Track all inserted requests
for i, uid in enumerate(uids):
p = inserts_to_process[i]
self.active_requests[uid] = ActiveRequest(
command_id=p.command_id,
task_id=p.task_id,
uid=uid,
detokenizer=self.tokenizer.detokenizer,
prompt_tokens=p.prompt_tokens,
)
logger.info(
f"Inserted request {p.command_id} with uid={uid}, prompt_tokens={p.prompt_tokens}, max_tokens={p.max_tokens}"
)
self._pending_inserts.clear()
return uids
def step(self) -> list[BatchedGenerationResponse]:
"""Run one decode step. Tracks completions but does not sync - call sync_completions() at budget boundaries."""
responses = self.batch_gen.next()
if not responses:
return []
results: list[BatchedGenerationResponse] = []
for r in responses:
uid: int = r.uid
req = self.active_requests.get(uid)
if req is None:
logger.warning(f"Received response for unknown uid={uid}")
continue
req.tokens_generated += 1
# Decode the token
token: int = r.token
req.detokenizer.add_token(token)
text: str = req.detokenizer.last_segment
stats: GenerationStats | None = None
finish_reason: FinishReason | None = None
raw_finish_reason: str | None = r.finish_reason
if raw_finish_reason is not None:
# Finalize to get remaining text
req.detokenizer.finalize()
text = req.detokenizer.last_segment
elapsed = time.perf_counter() - req.start_time
generation_tps = req.tokens_generated / elapsed if elapsed > 0 else 0.0
stats = GenerationStats(
prompt_tps=0.0, # Not tracked per-request in batch mode
generation_tps=generation_tps,
prompt_tokens=req.prompt_tokens,
generation_tokens=req.tokens_generated,
peak_memory_usage=Memory.from_gb(mx.get_peak_memory() / 1e9),
)
if raw_finish_reason in get_args(FinishReason):
finish_reason = raw_finish_reason # pyright: ignore[reportAssignmentType]
else:
logger.warning(f"Unknown finish_reason: {raw_finish_reason}")
finish_reason = "stop"
# Track completion but don't remove yet - wait for sync_completions()
self._pending_completions.append(uid)
logger.info(
f"Request {req.command_id} completed: {req.tokens_generated} tokens, {generation_tps:.2f} tps, reason={finish_reason}"
)
results.append(
BatchedGenerationResponse(
command_id=req.command_id,
task_id=req.task_id,
response=GenerationResponse(
text=text,
token=token,
finish_reason=finish_reason,
stats=stats,
usage=None,
),
)
)
# In non-distributed mode, clean up completions immediately
if not self.is_distributed:
self._remove_completed()
return results
def sync_completions(self) -> None:
"""Sync and remove completed requests. Call at time budget boundaries in distributed mode."""
if not self.is_distributed:
# Non-distributed: early return if nothing to do
if not self._pending_completions:
return
self._remove_completed()
return
# Distributed mode: ALWAYS sync to ensure all ranks participate in collective op
# This prevents deadlock if one rank has completions and another doesn't
assert self.group is not None
self._pending_completions = share_object(
self._pending_completions if self.rank == 0 else None,
self.rank,
self.group,
)
self._remove_completed()
def _remove_completed(self) -> None:
"""Remove completed requests from tracking."""
for uid in self._pending_completions:
if uid in self.active_requests:
del self.active_requests[uid]
self._pending_completions.clear()
@property
def has_active_requests(self) -> bool:
return bool(self.active_requests or self.batch_gen.unprocessed_prompts)
@property
def has_pending_inserts(self) -> bool:
return bool(self._pending_inserts)
@property
def active_count(self) -> int:
return len(self.active_requests)
@property
def pending_count(self) -> int:
return len(self.batch_gen.unprocessed_prompts)
@property
def pending_insert_count(self) -> int:
return len(self._pending_inserts)
@property
def has_pending_completions(self) -> bool:
return bool(self._pending_completions)


@@ -1,34 +0,0 @@
"""Distributed sync utilities using mx.distributed.all_sum() to broadcast from rank 0."""
# pyright: reportAny=false
import pickle
from typing import cast
import mlx.core as mx
def share_object[T](obj: T | None, rank: int, group: mx.distributed.Group) -> T:
"""Broadcast object from rank 0 to all ranks. Two-phase: size then data.
Rank 0 must always provide a non-None object. Non-rank-0 callers pass None
(they are receivers only). Use mx_barrier() instead if no data needs to be shared.
"""
if rank == 0:
assert obj is not None, (
"Rank 0 must provide data; use mx_barrier() to sync without data"
)
data = mx.array(list(pickle.dumps(obj)), dtype=mx.uint8)
mx.eval(mx.distributed.all_sum(mx.array([data.size]), group=group))
mx.eval(mx.distributed.all_sum(data, group=group))
return obj
else:
size = int(mx.distributed.all_sum(mx.array([0]), group=group).item())
if size == 0:
raise RuntimeError(
"share_object received size=0 from rank 0 — protocol violation"
)
data = mx.zeros(size, dtype=mx.uint8)
data = mx.distributed.all_sum(data, group=group)
mx.eval(data)
return cast(T, pickle.loads(bytes(cast(list[int], data.tolist()))))

Some files were not shown because too many files have changed in this diff.