Compare commits

...

21 Commits

Author SHA1 Message Date
Alex Cheema
19a21e9065 feat(dashboard): add light/dark mode toggle with warm parchment palette
Adds a theme system to the EXO dashboard with a "Mission Control, Dawn
Shift" light mode — warm parchment backgrounds (oklch(0.97 0.015 80))
and deep amber/brass accents (oklch(0.50 0.14 65)) that feel premium
rather than cold.

Changes:
- dashboard/src/lib/stores/theme.svelte.ts: new Svelte 5 rune store,
  persists choice to localStorage under 'exo-theme'
- dashboard/src/app.html: FOUC prevention — html starts as class="dark",
  inline script reads localStorage and switches to class="light" before
  first paint
- dashboard/src/routes/+layout.svelte: calls theme.init() on mount to
  sync rune state with the DOM class
- dashboard/src/lib/components/HeaderNav.svelte: sun/moon toggle button
  in the right nav area
- dashboard/src/app.css: full html.light palette + utility overrides
  (scrollbar, logo filter, graph links, scanlines, etc.)

No new npm dependencies — avoids mode-watcher entirely.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-18 12:36:51 -08:00
Alex Cheema
6c322ebb72 feat: only show thinking toggle for models that support it (#1497)
## Summary
- Adds `thinking_toggle` capability to 26 model cards that support
toggling thinking mode on/off
- GPT-OSS models (20b, 120b) excluded — they always think and don't
support toggling
- Dashboard UI updated to check for `thinking_toggle` capability before
showing the toggle button
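
A rough sketch of the gating rule, in Python for illustration (the real check is the Svelte `$derived` shown in the diff below):

```python
# Illustrative only: mirrors the dashboard's capability check, which reads
# the `capabilities` list from the packaged model cards.
def shows_thinking_toggle(capabilities: list[str]) -> bool:
    # GPT-OSS cards keep "thinking" but omit "thinking_toggle", since those
    # models always think and can't be toggled off.
    return "text" in capabilities and "thinking_toggle" in capabilities
```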

## Test plan
- [x] `uv run basedpyright` — 0 errors
- [x] `uv run ruff check` — all checks passed
- [x] `nix fmt` — 0 files changed
- [x] `uv run pytest` — 188 passed, 0 failed
- [x] Security review passed (no secrets, eval/exec, innerHTML, or dep
changes)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 17:05:00 +00:00
vskiwi
2ebe6216b4 feat: add explicit --offline mode for air-gapped clusters (#1525)
## Motivation

Closes #1510

There is currently no reliable way to run exo on an air-gapped or offline cluster where models are pre-staged on local disks. The two existing mechanisms — `--no-downloads` and `HF_HUB_OFFLINE=1` — each cover only a subset of the problem:

1. **`--no-downloads` blocks model loading**: When passed, `DownloadCoordinator` is not created. No `NodeDownloadProgress` events are ever emitted, so `_model_needs_download()` in `plan.py` perpetually returns `DownloadModel`, short-circuiting `_load_model()` and preventing the model from ever being loaded.

2. **`HF_HUB_OFFLINE=1` doesn't cover exo's aiohttp code**: exo's download pipeline primarily uses raw `aiohttp` for HTTP operations (file list fetching, file downloads, HEAD verification), not the `huggingface_hub` library. These calls will attempt connections and time out on air-gapped networks.

3. **`skip_internet` is not propagated to `download_file_with_retry()`**: Even when `internet_connection = False`, the `_download_file()` function still makes HTTP HEAD calls via `file_meta()` to verify local files and unconditionally attempts downloads for missing files.

## Changes

### `src/exo/main.py`
- Add `--offline` flag to `Args` with env var detection (`EXO_OFFLINE=1`, `HF_HUB_OFFLINE=1`)
- Pass `offline` to `DownloadCoordinator` at creation and re-creation (election loop)

### `src/exo/download/coordinator.py`
- Add `offline: bool = False` field
- In offline mode: set `internet_connection = False` immediately in `__post_init__`, skip `_test_internet_connection()` ping (avoids 3s timeout), skip `_check_internet_connection` periodic loop
- In `_start_download()`: if model is not fully available locally, emit `DownloadFailed` with clear message instead of starting a download task

### `src/exo/download/download_utils.py`
- Add `skip_internet: bool` parameter to `download_file_with_retry()` and `_download_file()`
- When `skip_internet=True` in `_download_file()`: return local file immediately without HTTP HEAD verification; raise `FileNotFoundError` for missing files
- Propagate `skip_internet` from `download_shard()` to `download_file_with_retry()`
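
A minimal sketch of the new guard, with simplified signatures (the real `_download_file` in `download_utils.py` takes more parameters):

```python
# Sketch only: session, retry, and progress arguments are omitted.
from pathlib import Path

async def _download_file(url: str, target: Path, skip_internet: bool = False) -> Path:
    if skip_internet:
        if target.exists():
            return target  # trust the pre-staged file; no HTTP HEAD verification
        raise FileNotFoundError(f"{target} is missing and exo is offline")
    # Online path: HEAD verification via file_meta(), then download as before.
    ...
```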

### `src/exo/download/tests/test_offline_mode.py` (new)
- 8 tests covering `_download_file`, `download_file_with_retry`, and `fetch_file_list_with_cache` in offline mode

## Why It Works

Unlike `--no-downloads` which disables `DownloadCoordinator` entirely, `--offline` keeps the coordinator running in a restricted mode. The existing `_emit_existing_download_progress()` disk scanner still runs every 60 seconds, emitting `DownloadCompleted` events for pre-staged models. These events flow through the event-sourcing pipeline and populate `state.downloads`, which unblocks `_model_needs_download()` in `plan.py` — no changes to the planning logic required.

```
--offline flag
  → DownloadCoordinator (offline mode)
    → Skip 1.1.1.1 ping, internet_connection = False
    → _emit_existing_download_progress scans disk
      → Emits DownloadCompleted for pre-staged models
        → _model_needs_download sees DownloadCompleted
          → _load_model proceeds normally
```
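
In code terms, the startup behavior sketched below captures the offline branch (names follow the PR text; `schedule` stands in for however the coordinator spawns its background tasks):

```python
# Hedged sketch of DownloadCoordinator startup in offline mode.
async def coordinator_startup(offline: bool) -> bool:
    if offline:
        internet_connection = False  # skip the 1.1.1.1 ping and its 3s timeout
    else:
        internet_connection = await test_internet_connection()  # one-shot ping
        schedule(check_internet_connection_loop())  # periodic re-check
    # Runs in both modes: the 60-second disk scanner that emits
    # DownloadCompleted events for fully pre-staged models.
    schedule(emit_existing_download_progress())
    return internet_connection
```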

## Test Plan

### Automated Testing
- `ruff check` — passes
- 8 new tests in `test_offline_mode.py` — all pass
- 11 existing download tests in `test_download_verification.py` — all pass (no regressions)

### Manual Testing
1. Pre-stage a model on disk (e.g., `~/.exo/models/mlx-community--Qwen3-0.6B-4bit/`)
2. Start exo with `--offline` (or `EXO_OFFLINE=1`)
3. Place an instance via API or dashboard
4. Verify: model loads into memory and inference works without any network calls

### Environment
- macOS (Apple Silicon), multi-node cluster with Thunderbolt interconnect
- Models pre-staged via rsync / NFS mount
2026-02-18 16:18:09 +00:00
ciaranbor
f54c80b121 Ciaran/image edit api (#1500)
## Motivation

- Image editing previously ignored input image dimensions, always
defaulting to 1024x1024
- Size dropdown was hidden in edit mode, giving users no control over
output dimensions
- Portrait/landscape presets used non-standard aspect ratios (1024x1365
/ 1365x1024)

## Changes

- Added "auto" size option that uses input image dimensions for edits,
defaults to 1024x1024 for generation
- Introduced ImageSize Literal type and normalize_image_size() validator
(replaces raw str size fields)
  - Updated portrait/landscape presets to standard 1024x1536 / 1536x1024
  - Made size selector visible in edit mode (previously hidden)
  - Default size changed from "1024x1024" to "auto"

## Why It Works

- "auto" reads actual input image dimensions via PIL at generation time,
so edits preserve the original aspect ratio
- Pydantic field_validator on both ImageGenerationTaskParams and
ImageEditsTaskParams normalizes None → "auto", keeping the API
backward-compatible
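
A hedged sketch of that validator (field names follow the PR text; the real task params classes carry more fields):

```python
from typing import Literal

from pydantic import BaseModel, field_validator

ImageSize = Literal[
    "auto", "512x512", "768x768", "1024x1024",
    "1024x768", "768x1024", "1024x1536", "1536x1024",
]

class ImageEditsTaskParams(BaseModel):
    size: ImageSize = "auto"

    @field_validator("size", mode="before")
    @classmethod
    def normalize_image_size(cls, v: object) -> object:
        # Older clients may send size=None; normalize it to "auto" so the
        # API stays backward-compatible.
        return "auto" if v is None else v
```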

## Test Plan

### Manual Testing

- Verify image edits output at the input image's native resolution when
size is "auto"
- Verify size dropdown appears and works in both generate and edit modes
2026-02-18 16:05:39 +00:00
rltakashige
48b8f86395 Add support for GLM 5 (#1526)
## Motivation

Add GLM 5 support, superseding #1513

## Changes

<!-- Describe what you changed in detail -->

## Why It Works

<!-- Explain why your approach solves the problem -->

## Test Plan

### Manual Testing
<!-- Hardware: (e.g., MacBook Pro M1 Max 32GB, Mac Mini M2 16GB,
connected via Thunderbolt 4) -->
<!-- What you did: -->
<!-- - -->

### Automated Testing
<!-- Describe changes to automated tests, or how existing tests cover
this change -->
<!-- - -->
2026-02-18 14:04:06 +00:00
Evan
5cbd6377a2 prioritize official model cards over custom model cards
Our old model card search path would override official model cards with
custom model cards; our packaged model cards should always be the
default here.
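
The fix amounts to an ordering rule like the following sketch (names illustrative, not exo's actual search-path code):

```python
# Packaged (official) cards win; custom cards only fill the gaps.
def resolve_model_card(model_id: str, packaged: dict, custom: dict):
    if model_id in packaged:
        return packaged[model_id]
    return custom.get(model_id)
```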
2026-02-18 13:20:05 +00:00
Evan Quiney
8f01523ddb remove dead code (#1496) 2026-02-18 11:43:27 +00:00
Alex Cheema
3addeadea8 Update mlx-lm to 0.30.7 (#1520)
## Summary
- Bumps `mlx-lm` from 0.30.6 to 0.30.7 in `pyproject.toml` and `uv.lock`

## Test plan
- [x] `uv lock` resolves successfully
- [x] `basedpyright` — no new errors (63 pre-existing in unrelated
`test_tool_call_tracker.py`)
- [x] `ruff check` — all checks passed
- [x] `nix fmt` — no formatting changes
- [x] `pytest` — 188 passed, 1 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 11:14:23 +00:00
rltakashige
f2be929211 Leo/address rdma gpu locks 2 (#1515)
Same as #1489. Had to revert and redo thanks to Claude.

---------

Co-authored-by: Jake Hillion <jake@hillion.co.uk>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:00:52 -08:00
rltakashige
83af8c63fa Revert "Use custom fork that resolves GPU locks" (#1502)
Reverts exo-explore/exo#1489

Goddammit Claude...
2026-02-17 18:18:54 +00:00
Evan Quiney
eccc6298d1 Revert "Add MetaInstance declarative layer (#1447)"
This reverts commit a962a28afc.
2026-02-17 18:11:47 +00:00
Evan Quiney
c8997217cf Revert "feat: better onboarding UX for new users (#1479)"
This reverts commit 490d2e46ba.
2026-02-17 18:02:32 +00:00
Alex Cheema
490d2e46ba feat: better onboarding UX for new users (#1479)
## Summary

- **Auto-open dashboard** in browser on first launch (uses
`~/.exo/.dashboard_opened` marker)
- **Welcome overlay** with "Choose a Model" CTA button when no model
instance is running
- **Tutorial progress messages** during model download → loading → ready
lifecycle stages
- **Fix conversation sidebar** text contrast — bumped to white text,
added active state background
- **Simplify technical jargon** — sharding/instance type/min nodes
hidden behind collapsible "Advanced Options" toggle; strategy display
hidden behind debug mode
- **Polished DMG installer** with drag-to-Applications layout, custom
branded background, and AppleScript-configured window positioning

## Test plan

- [ ] Launch exo for the first time (delete `~/.exo/.dashboard_opened`
to simulate) — browser should auto-open
- [ ] Verify welcome overlay appears on topology when no model is loaded
- [ ] Launch a model and verify download/loading/ready messages appear
in instance cards
- [ ] Check conversation sidebar text is readable (white on dark, yellow
when active)
- [ ] Verify "Advanced Options" toggle hides/shows sharding controls
- [ ] Build DMG with `packaging/dmg/create-dmg.sh` and verify
drag-to-Applications layout

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 17:52:49 +00:00
rltakashige
facf2d4d03 Use custom fork that resolves GPU locks (#1489)
## Motivation

There is an issue on Macs that means that an explicit synchronization is
necessary for memory to be updated from L1 cache. This means that GPU
locks can occur when a spin wait does not see the updated timestamp.

## Changes

Updated in my own personal fork.

## Why It Works

https://github.com/ARM-software/acle/releases

## Test Plan

### Manual Testing
Tested manually that no GPU locks occur (even with multiple simultaneous
instances running) and that the performance differential is negligible
(267 vs 269 tps on Llama 3.2 1B at approximately 10k context).


------------------------------------------------------
I have seen a GPU lock, specifically when sending a particularly large
chat completion while the model was loading. However, I have since been
unable to reproduce it, and this may be something I did wrong. Please do
create an issue and tag me if any GPU locks do occur.

---------

Co-authored-by: Jake Hillion <jake@hillion.co.uk>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 17:48:43 +00:00
Alex Cheema
a962a28afc Add MetaInstance declarative layer (#1447)
## Motivation

Users currently manage instances directly, which means if a node
disconnects or connections break, the instance dies and nothing
recreates it. MetaInstance is a declarative primitive: "ensure an
instance matching these parameters always exists." The reconciler
watches for unhealthy or missing backing instances and re-places them
automatically.

## Changes

- **MetaInstance type** (`meta_instance.py`): declarative constraint
with `model_id`, `min_nodes`, optional `node_ids`, and `sharding`
- **Reconciler** (`reconcile.py`): `find_unsatisfied_meta_instances`
checks which MetaInstances lack a healthy backing instance,
`try_place_for_meta_instance` creates one
- **Master loop** (`main.py`): periodically reconciles unsatisfied
MetaInstances; immediate placement on `CreateMetaInstance` command
- **API** (`api.py`): `create_meta_instance` / `delete_meta_instance` /
`GET /meta_instances` endpoints; delete cascades to backing instances
with task cancellation
- **Binding via `meta_instance_id` on Instance** (`instances.py`): no
separate binding event or backing map — the instance carries its parent
MetaInstance ID directly, eliminating race conditions in the reconciler
- **Dashboard**: sidebar shows MetaInstances with their backing instance
status; orphan instances (created directly) still shown separately
- **Tests**: constraint matching, connection health, unsatisfied
detection, exclusive binding, cascade delete with task cancellation

### Recent improvements

- **fix: cancel active tasks on cascade delete** — `DeleteMetaInstance`
now emits `TaskStatusUpdated(Cancelled)` for any Pending/Running tasks
on backing instances before emitting `InstanceDeleted`. Previously,
cascade-deleting backing instances left orphaned task references in
state.
- **Lifecycle logging** — added `logger.info`/`logger.warning` for:
`CreateMetaInstance` (model, min_nodes, sharding), `DeleteMetaInstance`
(with cascade count), reconciler placement success/failure, and retry
decisions with attempt counts in `InstanceHealthReconciler`.
- **GET `/meta_instances` endpoint** — lists all meta-instances without
needing to fetch full state.
- **2 regression tests** — `test_cascade_delete_cancels_active_tasks`
and `test_cascade_delete_skips_completed_tasks` verify the
cascade-delete event sequence.

## Why It Works

Putting `meta_instance_id` on `BaseInstance` makes binding inherent to
instance creation. When the reconciler creates an instance for a
MetaInstance, it tags it via `model_copy`. When the instance is deleted,
the binding disappears with it. This avoids the two bugs that a separate
binding mechanism would introduce:
1. Stale exclusion sets — the reconciler loop can't accidentally bind
two MetaInstances to the same instance
2. Delete ordering race — no window between deleting an instance and its
binding where the reconciler could re-place
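
A hedged sketch of the binding-by-field idea (the real exo types carry far more state):

```python
from pydantic import BaseModel

class BaseInstance(BaseModel):
    instance_id: str
    model_id: str
    meta_instance_id: str | None = None  # parent MetaInstance, if any

def tag_for_meta_instance(instance: BaseInstance, meta_id: str) -> BaseInstance:
    # The reconciler tags a freshly placed instance via model_copy, so the
    # binding lives on the instance itself and disappears when it's deleted.
    return instance.model_copy(update={"meta_instance_id": meta_id})
```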

## Test Plan

### Manual Testing
<!-- Hardware: (e.g., MacBook Pro M1 Max 32GB, Mac Mini M2 16GB,
connected via Thunderbolt 4) -->
- Created MetaInstance via dashboard, verified instance placed
- Verified delete cascades (deleting MetaInstance removes backing
instance)
- Verified orphan instances still work independently

### Automated Testing
- 30 tests in `test_meta_instance_edge_cases.py`: lifecycle, retry
logic, error handling, concurrent operations, cascade delete with task
cancellation
- 24 tests in `test_reconcile.py`: constraint matching, connection
health (single/multi-node, edge removal, IP changes), unsatisfied
detection, exclusive binding, idempotency
- All 261 tests pass
- basedpyright 0 errors, ruff clean, dashboard builds

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 09:48:19 -08:00
Alex Cheema
db79c350c1 Fix graceful process shutdown in macOS app (#1372)
## Motivation

Fixes #1370

When the macOS app stops exo, GPU/system memory isn't released. This
happens because:

1. The macOS app calls `process.terminate()` (SIGTERM) but the Python
process only registers a graceful shutdown handler for SIGINT, not
SIGTERM. With no handler installed, the default SIGTERM disposition
terminates the process immediately, which bypasses the cleanup cascade
(runner subprocess MLX cleanup via `mx.clear_cache()`, channel closing, etc.).
2. The app doesn't wait for the process to actually finish cleanup — it
immediately nils out the process reference.

## Changes

**`src/exo/main.py`**: Register SIGTERM handler alongside SIGINT so the
graceful shutdown cascade (`Node.shutdown()` → cancel task group →
worker/runner cleanup → `mx.clear_cache()` + `gc.collect()`) runs
regardless of which signal is received.
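
A minimal sketch of the Python side, assuming an asyncio event loop (exo's actual handler runs the full `Node.shutdown()` cascade rather than just setting an event):

```python
import asyncio
import signal

async def main() -> None:
    loop = asyncio.get_running_loop()
    stop = asyncio.Event()
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, stop.set)  # same graceful path for both
    await stop.wait()
    # ...then Node.shutdown(): cancel task group, clean up runners,
    # mx.clear_cache() + gc.collect()
```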

**`app/EXO/EXO/ExoProcessController.swift`**: Replace immediate
`process.terminate()` with escalating shutdown per @Evanev7's
suggestion:
1. Send SIGINT via `process.interrupt()` — triggers the registered
Python handler for graceful cleanup
2. Wait up to 5 seconds for the process to exit
3. If still running, escalate to SIGTERM via `process.terminate()`
4. Wait up to 3 seconds
5. If still running, force kill via SIGKILL

The escalation runs in a detached `Task` so the UI updates immediately
(status → stopped) without blocking.
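
The same escalation, expressed against Python's `subprocess` API for illustration (the shipped implementation is the Swift code in this diff):

```python
import signal
import subprocess

def stop_gracefully(proc: subprocess.Popen) -> None:
    proc.send_signal(signal.SIGINT)  # 1. graceful: triggers the Python handler
    try:
        proc.wait(timeout=5)         # 2. allow up to 5s for cleanup
        return
    except subprocess.TimeoutExpired:
        proc.terminate()             # 3. escalate to SIGTERM
    try:
        proc.wait(timeout=3)         # 4. allow up to 3s more
    except subprocess.TimeoutExpired:
        proc.kill()                  # 5. last resort: SIGKILL
```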

## Why It Works

The root cause is that SIGTERM wasn't triggering the graceful shutdown
path. By registering a SIGTERM handler in Python and sending SIGINT
first from the macOS app, the process gets a chance to run the full
cleanup cascade: cancelling the task group, shutting down runners (which
call `del model; mx.clear_cache(); gc.collect()`), closing channels, and
flushing logs. The escalation to SIGTERM and SIGKILL ensures the process
always terminates even if graceful shutdown hangs.

## Test Plan

### Manual Testing
<!-- Hardware: Mac Studio M4 Max 128GB -->
- Start exo via macOS app, load a model, run inference
- Stop via the toggle switch, verify memory is released without
requiring a system restart
- Test rapid stop/start (restart) to ensure no race conditions

### Automated Testing
- `uv run basedpyright` — 0 errors
- `uv run ruff check` — passes
- `nix fmt` — no changes

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Evan Quiney <evanev7@gmail.com>
2026-02-17 09:03:54 -08:00
Alex Cheema
d6301ed593 dashboard: redesign downloads page as model×node table (#1465)
## Motivation

The current downloads page uses a node-centric card grid layout that is
messy and hard to read — the same model across different nodes appears
in separate cards, and deep nesting wastes space. This makes it
difficult to quickly see which models are on which nodes.

## Changes

Rewrote the downloads page
(`dashboard/src/routes/downloads/+page.svelte`) from a card grid to a
clean table layout:

- **Rows** = models (unique across all nodes)
- **Columns** = nodes (with disk free shown in header)
- **Cells** show status at a glance:
  - ✅ Green checkmark + size for completed downloads
  - 🟡 Yellow percentage + mini progress bar + speed for active downloads
  - `...` for pending downloads
  - ❌ Red X for failed downloads
  - `--` for models not present on a node
- Delete/download action buttons appear on row hover
- Model name column is sticky on horizontal scroll (for many-node
clusters)
- Models sorted by number of nodes with completed downloads
- Imported shared utilities from `$lib/utils/downloads` instead of
inline re-implementations

### Backend: model directory in download events

- Added `model_directory` field to `BaseDownloadProgress` so all
download status events include the on-disk path
- Added `_model_dir()` helper to `DownloadCoordinator` to compute the
path from `EXO_MODELS_DIR`
- Dashboard uses this to show file location and enable "open in Finder"
for completed downloads
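
A hedged sketch of that helper; the directory naming follows the pre-staged path format mentioned elsewhere in this comparison (`~/.exo/models/mlx-community--Qwen3-0.6B-4bit`):

```python
from pathlib import Path

def _model_dir(models_dir: Path, model_id: str) -> str:
    # e.g. ("~/.exo/models", "mlx-community/Qwen3-0.6B-4bit")
    #   -> "~/.exo/models/mlx-community--Qwen3-0.6B-4bit"
    return str(models_dir / model_id.replace("/", "--"))
```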

### Info modal

- Clicking a model name opens an info modal showing card details
(family, quantization, capabilities, storage size, layer count, tensor
parallelism support)

### Other fixes

- Fixed model name truncation in the table
- Excluded `tests/start_distributed_test.py` from pytest collection (CLI
script that calls `sys.exit()` at import time)

## Test Plan

- [x] `uv run basedpyright` — 0 errors
- [x] `uv run ruff check` — all passed
- [x] `nix fmt` — clean
- [x] `uv run pytest` — 188 passed, 1 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 14:31:47 +00:00
Evan Quiney
6d1ca6689b don't time out node identities (#1493)
Currently, nodes leaving and rejoining the cluster can lose their identity. We have no need to delete this data when a node times out, so let's just persist it.
2026-02-17 11:48:28 +00:00
Evan
c01b6fff21 eprint banner
Our banner was being printed to stdout but should be printed to stderr,
as it's essentially a log message.
2026-02-17 11:43:06 +00:00
Jake Hillion
8392e78afe bench: add spec for automatic canary benchmarks (#1483)
Adds all the models that can fit onto a single M3 Ultra for single
machine benchmarks. Pins the macOS version, GPU spec, and chip type for
maximum reproducibility. Specifies the minimum memory accordingly for
each type of model, using the smallest machine available (the smallest
M3 Ultra is 96GiB).
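
For a sense of how such a manifest might be consumed, a sketch using stdlib `tomllib` (the actual bench runner is not part of this diff):

```python
import tomllib  # Python 3.11+

with open("bench/single-m3-ultra.toml", "rb") as f:
    suite = tomllib.load(f)

defaults = suite.get("defaults", {})  # e.g. the pp sweep and tg token count
benchmarks = [
    {**defaults, **bench}  # benchmark-level args win over the defaults
    for bench in suite["benchmark"]
]
```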

Test plan:
- Running this with some code that makes machines of this spec available
and stores the results. It works.

This will become part of a larger testing/stability strategy once we've
collected more of the data.
2026-02-17 10:52:05 +00:00
Evan
86735ece78 begins
begins
2026-02-16 19:26:19 +00:00
79 changed files with 1939 additions and 1461 deletions

View File

@@ -0,0 +1,46 @@
"""Type stubs for mlx_lm.models.glm_moe_dsa"""
from dataclasses import dataclass
from typing import Any, Dict, Optional
from .base import BaseModelArgs
from .deepseek_v32 import Model as DSV32Model
@dataclass
class ModelArgs(BaseModelArgs):
model_type: str
vocab_size: int
hidden_size: int
index_head_dim: int
index_n_heads: int
index_topk: int
intermediate_size: int
moe_intermediate_size: int
num_hidden_layers: int
num_attention_heads: int
num_key_value_heads: int
n_shared_experts: Optional[int]
n_routed_experts: Optional[int]
routed_scaling_factor: float
kv_lora_rank: int
q_lora_rank: int
qk_rope_head_dim: int
v_head_dim: int
qk_nope_head_dim: int
topk_method: str
scoring_func: str
norm_topk_prob: bool
n_group: int
topk_group: int
num_experts_per_tok: int
moe_layer_freq: int
first_k_dense_replace: int
max_position_embeddings: int
rms_norm_eps: float
rope_parameters: Dict[str, Any]
attention_bias: bool
rope_scaling: Dict[str, Any] | None
rope_theta: float | None
class Model(DSV32Model):
def __init__(self, config: ModelArgs) -> None: ...

Cargo.lock (generated, 123 lines changed)
View File

@@ -141,12 +141,6 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
[[package]]
name = "arrayvec"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "asn1-rs"
version = "0.7.1"
@@ -304,19 +298,6 @@ version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba"
[[package]]
name = "bigdecimal"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "560f42649de9fa436b73517378a147ec21f6c997a546581df4b4b31677828934"
dependencies = [
"autocfg",
"libm",
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "bimap"
version = "0.6.3"
@@ -516,15 +497,6 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f421161cb492475f1661ddc9815a745a1c894592070661180fdec3d4872e9c3"
[[package]]
name = "convert_case"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "core-foundation"
version = "0.9.4"
@@ -746,29 +718,6 @@ dependencies = [
"powerfmt",
]
[[package]]
name = "derive_more"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618"
dependencies = [
"derive_more-impl",
]
[[package]]
name = "derive_more-impl"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b"
dependencies = [
"convert_case",
"proc-macro2",
"quote",
"rustc_version",
"syn 2.0.111",
"unicode-xid",
]
[[package]]
name = "digest"
version = "0.10.7"
@@ -939,22 +888,17 @@ name = "exo_pyo3_bindings"
version = "0.0.1"
dependencies = [
"delegate",
"derive_more",
"env_logger",
"extend",
"futures",
"impl-trait-for-tuples",
"libp2p",
"log",
"networking",
"once_cell",
"pin-project",
"pyo3",
"pyo3-async-runtimes",
"pyo3-log",
"pyo3-stub-gen",
"thiserror 2.0.17",
"thread_local",
"tokio",
"util",
]
@@ -1640,17 +1584,6 @@ dependencies = [
"xmltree",
]
[[package]]
name = "impl-trait-for-tuples"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0eb5a3343abf848c0984fe4604b2b105da9539376e24fc0a3b0007411ae4fd9"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.111",
]
[[package]]
name = "indexmap"
version = "2.12.1"
@@ -1829,12 +1762,6 @@ version = "0.2.178"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091"
[[package]]
name = "libm"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de"
[[package]]
name = "libp2p"
version = "0.56.0"
@@ -2824,16 +2751,13 @@ name = "networking"
version = "0.0.1"
dependencies = [
"delegate",
"derive_more",
"either",
"extend",
"futures",
"futures-timer",
"impl-trait-for-tuples",
"keccak-const",
"libp2p",
"log",
"thiserror 2.0.17",
"tokio",
"tracing-subscriber",
"util",
@@ -2918,17 +2842,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
@@ -3279,28 +3192,14 @@ version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab53c047fcd1a1d2a8820fe84f05d6be69e9526be40cb03b73f86b6b03e6d87d"
dependencies = [
"bigdecimal",
"either",
"hashbrown 0.16.1",
"indexmap",
"indoc",
"inventory",
"libc",
"lock_api",
"memoffset",
"num-bigint",
"num-complex",
"num-rational",
"num-traits",
"once_cell",
"ordered-float",
"parking_lot",
"portable-atomic",
"pyo3-build-config",
"pyo3-ffi",
"pyo3-macros",
"rust_decimal",
"smallvec",
"unindent",
]
@@ -3741,16 +3640,6 @@ dependencies = [
"tokio",
]
[[package]]
name = "rust_decimal"
version = "1.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35affe401787a9bd846712274d97654355d21b2a2c092a3139aabe31e9022282"
dependencies = [
"arrayvec",
"num-traits",
]
[[package]]
name = "rustc-hash"
version = "1.1.0"
@@ -4615,24 +4504,12 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
[[package]]
name = "unicode-segmentation"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
[[package]]
name = "unicode-width"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "unicode_names2"
version = "1.3.0"

View File

@@ -26,49 +26,21 @@ opt-level = 3
networking = { path = "rust/networking" }
util = { path = "rust/util" }
# Proc-macro authoring tools
syn = "2.0"
quote = "1.0"
proc-macro2 = "1.0"
darling = "0.20"
# Macro dependecies
extend = "1.2"
delegate = "0.13"
impl-trait-for-tuples = "0.2"
clap = "4.5"
derive_more = { version = "2.0.1", features = ["display"] }
pin-project = "1"
# Utility dependencies
itertools = "0.14"
thiserror = "2"
internment = "0.8"
recursion = "0.5"
regex = "1.11"
once_cell = "1.21"
thread_local = "1.1"
bon = "3.4"
generativity = "1.1"
anyhow = "1.0"
keccak-const = "0.2"
# Functional generics/lenses frameworks
frunk_core = "0.4"
frunk = "0.4"
frunk_utils = "0.2"
frunk-enum-core = "0.3"
# Async dependencies
tokio = "1.46"
futures = "0.3"
futures-util = "0.3"
futures-timer = "3.0"
# Data structures
either = "1.15"
ordered-float = "5.0"
ahash = "0.8"
# Tracing/logging
log = "0.4"

View File

@@ -72,16 +72,23 @@ There are two ways to run exo:
### Run from Source (macOS)
If you have [Nix](https://nixos.org/) installed, you can skip most of the steps below and run exo directly (after accepting the Cachix cache):
```bash
nix run .#exo
```
**Prerequisites:**
- [Xcode](https://developer.apple.com/xcode/) (provides the Metal ToolChain required for MLX compilation)
- [brew](https://github.com/Homebrew/brew) (for simple package management on macOS)
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
```
- [uv](https://github.com/astral-sh/uv) (for Python dependency management)
- [macmon](https://github.com/vladkens/macmon) (for hardware monitoring on Apple Silicon)
- [node](https://github.com/nodejs/node) (for building the dashboard)
```bash
brew install uv macmon node
```

View File

@@ -126,11 +126,37 @@ final class ExoProcessController: ObservableObject {
        return
    }
    process.terminationHandler = nil
    if process.isRunning {
        process.terminate()
    }
    self.process = nil
    status = .stopped
    guard process.isRunning else {
        self.process = nil
        return
    }
    let proc = process
    self.process = nil
    Task.detached {
        proc.interrupt()
        for _ in 0..<50 {
            if !proc.isRunning { return }
            try? await Task.sleep(nanoseconds: 100_000_000)
        }
        if proc.isRunning {
            proc.terminate()
        }
        for _ in 0..<30 {
            if !proc.isRunning { return }
            try? await Task.sleep(nanoseconds: 100_000_000)
        }
        if proc.isRunning {
            kill(proc.processIdentifier, SIGKILL)
        }
    }
}

func restart() {

bench/bench.toml (new file, 7 lines)
View File

@@ -0,0 +1,7 @@
# Canary benchmark manifest
#
# Lists the suite files to include. Each file defines benchmarks
# with shared constraints, topology, and default args.
include = [
"single-m3-ultra.toml",
]

bench/single-m3-ultra.toml (new file, 189 lines)
View File

@@ -0,0 +1,189 @@
# Single-node M3 Ultra benchmarks
#
# Shared constraints applied to ALL benchmarks in this file.
constraints = [
"All(MacOsBuild(=25D125))",
"Hosts(=1)",
"All(Chip(m3_ultra))",
"All(GpuCores(=80))",
]
[topology]
type = "none"
# Default args merged into each benchmark's args (benchmark-level args win).
[defaults]
pp = [512, 2048, 8192, 16384]
tg = 128
[[benchmark]]
model = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/gpt-oss-120b-MXFP4-Q8"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-Flash-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-6bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-30B-A3B-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-0.6B-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-0.6B-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.2-1B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.2-3B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.2-3B-Instruct-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/gpt-oss-20b-MXFP4-Q8"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-30B-A3B-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-Flash-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-Flash-5bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-Flash-6bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.3-70B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-5bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
extra_constraints = ["All(Memory(>=96GiB))"]
[[benchmark]]
model = "mlx-community/Llama-3.3-70B-Instruct-8bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/llama-3.3-70b-instruct-fp16"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.5-Air-8bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.5-Air-bf16"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-4bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/MiniMax-M2.1-3bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/MiniMax-M2.1-8bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-Next-bf16"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Step-3.5-Flash-4bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Step-3.5-Flash-6bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/Step-3.5-Flash-8Bit"
extra_constraints = ["All(Memory(>=256GiB))"]
[[benchmark]]
model = "mlx-community/DeepSeek-V3.1-4bit"
extra_constraints = ["All(Memory(>=512GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-6bit"
extra_constraints = ["All(Memory(>=512GiB))"]
[[benchmark]]
model = "mlx-community/GLM-4.7-8bit-gs32"
extra_constraints = ["All(Memory(>=512GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
extra_constraints = ["All(Memory(>=512GiB))"]
[[benchmark]]
model = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
extra_constraints = ["All(Memory(>=512GiB))"]

View File

@@ -16,9 +16,10 @@
/* Gotham-inspired accent colors */
--exo-grid: oklch(0.25 0 0);
--exo-scanline: oklch(0.15 0 0);
--exo-glow-yellow: 0 0 20px oklch(0.85 0.18 85 / 0.3);
--exo-glow-yellow-strong: 0 0 40px oklch(0.85 0.18 85 / 0.5);
--exo-glow-yellow: oklch(0.85 0.18 85 / 0.3);
--exo-glow-yellow-strong: oklch(0.85 0.18 85 / 0.5);
--exo-bg-hover: oklch(0.18 0 0);
/* Theme Variables */
--radius: 0.375rem;
--background: var(--exo-black);
@@ -41,6 +42,237 @@
--ring: var(--exo-yellow);
}
/* ============================================================
LIGHT THEME — "Mission Control, Dawn Shift"
Warm parchment + deep amber. Applied when <html> has .light class.
============================================================ */
html.light {
/* EXO brand palette — warm amber shift */
--exo-black: oklch(0.97 0.015 80);
--exo-dark-gray: oklch(0.92 0.012 80);
--exo-medium-gray: oklch(0.83 0.009 78);
--exo-light-gray: oklch(0.50 0.018 75);
--exo-yellow: oklch(0.50 0.14 65);
--exo-yellow-darker: oklch(0.40 0.13 65);
--exo-yellow-glow: oklch(0.60 0.14 65);
--exo-grid: oklch(0.88 0.009 80);
--exo-scanline: oklch(0.93 0.010 80);
--exo-glow-yellow: oklch(0.50 0.14 65 / 0.12);
--exo-glow-yellow-strong: oklch(0.50 0.14 65 / 0.22);
--exo-bg-hover: oklch(0.89 0.010 80);
/* Semantic tokens */
--background: oklch(0.97 0.015 80);
--foreground: oklch(0.13 0.015 75);
--card: oklch(0.92 0.012 80);
--card-foreground: oklch(0.13 0.015 75);
--popover: oklch(0.95 0.012 80);
--popover-foreground: oklch(0.13 0.015 75);
--primary: oklch(0.50 0.14 65);
--primary-foreground: oklch(0.97 0.015 80);
--secondary: oklch(0.88 0.008 80);
--secondary-foreground: oklch(0.15 0.012 75);
--muted: oklch(0.90 0.009 80);
--muted-foreground: oklch(0.50 0.018 75);
--accent: oklch(0.88 0.008 80);
--accent-foreground: oklch(0.15 0.012 75);
--destructive: oklch(0.52 0.22 25);
--border: oklch(0.84 0.007 78);
--input: oklch(0.87 0.008 80);
--ring: oklch(0.50 0.14 65);
}
/* ============================================================
LIGHT MODE UTILITY OVERRIDES
============================================================ */
html.light {
& .text-white,
& .text-white\/90,
& .text-white\/80,
& .text-white\/70 {
color: var(--foreground) !important;
}
& .text-white\/60,
& .text-white\/50 {
color: color-mix(in oklch, var(--foreground) 60%, transparent) !important;
}
& .text-white\/40,
& .text-white\/30 {
color: color-mix(in oklch, var(--foreground) 38%, transparent) !important;
}
& .bg-black\/80,
& .bg-black\/60,
& .bg-black\/50,
& .bg-black\/40 {
background-color: oklch(0.90 0.010 80 / 0.7) !important;
}
& [class*="bg-exo-black/"] {
background-color: oklch(0.90 0.010 80 / 0.6) !important;
}
& [class*="shadow-black"] {
--tw-shadow-color: oklch(0.30 0.010 75 / 0.10) !important;
}
& ::-webkit-scrollbar-track {
background: oklch(0.93 0.010 80) !important;
}
& ::-webkit-scrollbar-thumb {
background: oklch(0.76 0.010 78) !important;
}
& ::-webkit-scrollbar-thumb:hover {
background: oklch(0.50 0.14 65 / 0.6) !important;
}
& .command-panel {
background: linear-gradient(
180deg,
oklch(0.94 0.012 80 / 0.96) 0%,
oklch(0.91 0.010 80 / 0.98) 100%
) !important;
border-color: oklch(0.82 0.008 78) !important;
box-shadow:
inset 0 1px 0 oklch(1 0 0 / 0.6),
0 4px 20px oklch(0.30 0.010 75 / 0.08) !important;
}
& .glow-text {
text-shadow:
0 0 12px oklch(0.50 0.14 65 / 0.20),
0 1px 3px oklch(0.30 0.010 75 / 0.12) !important;
}
& .grid-bg {
background-image:
linear-gradient(oklch(0.75 0.008 78 / 0.25) 1px, transparent 1px),
linear-gradient(90deg, oklch(0.75 0.008 78 / 0.25) 1px, transparent 1px) !important;
}
& .scanlines::before {
background: repeating-linear-gradient(
0deg,
transparent,
transparent 2px,
oklch(0.50 0.010 78 / 0.018) 2px,
oklch(0.50 0.010 78 / 0.018) 4px
) !important;
}
& .crt-screen {
background: radial-gradient(
ellipse at center,
oklch(0.95 0.012 80) 0%,
oklch(0.92 0.010 80) 50%,
oklch(0.89 0.009 80) 100%
) !important;
box-shadow:
inset 0 0 60px oklch(0.30 0.010 75 / 0.04),
0 0 30px oklch(0.50 0.14 65 / 0.04) !important;
}
& .graph-link {
stroke: oklch(0.50 0.018 75 / 0.45) !important;
filter: none !important;
}
& .graph-link-active {
stroke: oklch(0.50 0.14 65 / 0.75) !important;
filter: none !important;
}
& .shooting-stars {
display: none !important;
}
& img[alt="EXO"] {
filter: brightness(0) drop-shadow(0 0 6px oklch(0.30 0.010 75 / 0.10)) !important;
}
& .text-red-400 { color: oklch(0.52 0.22 25) !important; }
& .text-green-400 { color: oklch(0.48 0.17 155) !important; }
& .text-blue-200,
& .text-blue-300,
& .text-blue-400 { color: oklch(0.48 0.17 250) !important; }
& .bg-red-500\/10 { background-color: oklch(0.52 0.22 25 / 0.07) !important; }
& .bg-red-500\/20 { background-color: oklch(0.52 0.22 25 / 0.11) !important; }
& .bg-red-500\/30 { background-color: oklch(0.52 0.22 25 / 0.14) !important; }
& textarea,
& input[type="text"] { color: var(--foreground) !important; }
& textarea::placeholder,
& input::placeholder { color: oklch(0.50 0.012 78 / 0.55) !important; }
& .code-block-wrapper,
& .math-display-wrapper {
background: oklch(0.95 0.010 80) !important;
border-color: oklch(0.83 0.007 78) !important;
}
& .code-block-header,
& .math-display-header {
background: oklch(0.91 0.009 80) !important;
border-color: oklch(0.85 0.007 78) !important;
}
& .inline-code {
background: oklch(0.89 0.009 80) !important;
color: oklch(0.20 0.012 75) !important;
}
& blockquote { background: oklch(0.93 0.010 80) !important; }
& th {
background: oklch(0.90 0.009 80) !important;
border-color: oklch(0.80 0.007 78) !important;
}
& td { border-color: oklch(0.84 0.007 78) !important; }
& hr { border-color: oklch(0.84 0.007 78) !important; }
& .hljs { color: oklch(0.22 0.012 75) !important; }
& .hljs-keyword, & .hljs-selector-tag, & .hljs-literal, & .hljs-section, & .hljs-link {
color: oklch(0.45 0.18 300) !important;
}
& .hljs-string, & .hljs-title, & .hljs-name, & .hljs-type,
& .hljs-attribute, & .hljs-symbol, & .hljs-bullet, & .hljs-addition,
& .hljs-variable, & .hljs-template-tag, & .hljs-template-variable {
color: oklch(0.45 0.14 65) !important;
}
& .hljs-comment, & .hljs-quote, & .hljs-deletion, & .hljs-meta {
color: oklch(0.55 0.010 78) !important;
}
& .hljs-number, & .hljs-regexp, & .hljs-built_in {
color: oklch(0.45 0.15 160) !important;
}
& .hljs-function, & .hljs-class .hljs-title {
color: oklch(0.42 0.17 240) !important;
}
& .katex, & .katex .mord, & .katex .minner, & .katex .mop,
& .katex .mbin, & .katex .mrel, & .katex .mpunct {
color: oklch(0.15 0.012 75) !important;
}
& .katex .frac-line, & .katex .overline-line, & .katex .underline-line,
& .katex .hline, & .katex .rule {
border-color: oklch(0.25 0.012 75) !important;
background: oklch(0.25 0.012 75) !important;
}
& .katex svg { fill: oklch(0.25 0.012 75) !important; stroke: oklch(0.25 0.012 75) !important; }
& .katex svg path { stroke: oklch(0.25 0.012 75) !important; }
& .katex .mopen, & .katex .mclose,
& .katex .delimsizing, & [class^="katex .delim-size"] {
color: oklch(0.35 0.012 75) !important;
}
& .latex-proof { background: oklch(0.96 0.010 80) !important; border-left-color: oklch(0.72 0.010 78) !important; }
& .latex-proof-header { color: oklch(0.22 0.012 75) !important; }
& .latex-proof-content { color: oklch(0.15 0.012 75) !important; }
& .latex-proof-content::after { color: oklch(0.48 0.012 75) !important; }
& .latex-theorem { background: oklch(0.94 0.010 80) !important; border-color: oklch(0.80 0.008 78) !important; }
& .latex-diagram-placeholder {
background: oklch(0.96 0.010 80) !important;
border-color: oklch(0.80 0.008 78) !important;
color: oklch(0.38 0.012 75) !important;
}
}
@theme inline {
--radius-sm: calc(var(--radius) - 2px);
--radius-md: var(--radius);

View File

@@ -1,7 +1,15 @@
<!doctype html>
<html lang="en">
<html lang="en" class="dark">
<head>
<meta charset="utf-8" />
<script>
  try {
    if (localStorage.getItem('exo-theme') === 'light') {
      document.documentElement.classList.remove('dark');
      document.documentElement.classList.add('light');
    }
  } catch (_) {}
</script>
<link rel="icon" href="%sveltekit.assets%/favicon.ico" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>EXO</title>

View File

@@ -103,7 +103,7 @@
const modelSupportsThinking = $derived(() => {
  if (!currentModel) return false;
  const caps = modelCapabilities[currentModel] || [];
  return caps.includes("thinking") && caps.includes("text");
  return caps.includes("thinking_toggle") && caps.includes("text");
});
const isEditOnlyWithoutImage = $derived(

View File

@@ -1,5 +1,6 @@
<script lang="ts">
import { browser } from "$app/environment";
import { theme } from "$lib/stores/theme.svelte";
export let showHome = true;
export let onHome: (() => void) | null = null;
@@ -79,10 +80,48 @@
/>
</button>
<!-- Right: Home + Downloads -->
<!-- Right: Theme toggle + Home + Downloads -->
<div
class="absolute right-6 top-1/2 -translate-y-1/2 flex items-center gap-4"
>
<button
onclick={() => theme.toggle()}
class="p-2 rounded border border-exo-medium-gray/40 hover:border-exo-yellow/50 transition-colors cursor-pointer"
title={theme.isLight ? "Switch to dark mode" : "Switch to light mode"}
aria-label={theme.isLight
? "Switch to dark mode"
: "Switch to light mode"}
>
{#if theme.isLight}
<svg
class="w-4 h-4 text-exo-light-gray"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M21 12.79A9 9 0 1111.21 3a7 7 0 009.79 9.79z"
/>
</svg>
{:else}
<svg
class="w-4 h-4 text-exo-light-gray"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<circle cx="12" cy="12" r="5" />
<path
stroke-linecap="round"
d="M12 1v2m0 18v2M4.22 4.22l1.42 1.42m12.72 12.72l1.42 1.42M1 12h2m18 0h2M4.22 19.78l1.42-1.42M18.36 5.64l1.42-1.42"
/>
</svg>
{/if}
</button>
{#if showHome}
<button
onclick={handleHome}

View File

@@ -59,13 +59,14 @@
}
const sizeOptions: ImageGenerationParams["size"][] = [
"auto",
"512x512",
"768x768",
"1024x1024",
"1024x768",
"768x1024",
"1024x1365",
"1365x1024",
"1024x1536",
"1536x1024",
];
const qualityOptions: ImageGenerationParams["quality"][] = [
@@ -176,92 +177,90 @@
<div class="border-b border-exo-medium-gray/30 px-3 py-2">
<!-- Basic params row -->
<div class="flex items-center gap-3 flex-wrap">
<!-- Size (hidden in edit mode - output size comes from input image) -->
{#if !isEditMode}
<div class="flex items-center gap-1.5">
<span class="text-xs text-exo-light-gray uppercase tracking-wider"
>SIZE:</span
<!-- Size -->
<div class="flex items-center gap-1.5">
<span class="text-xs text-exo-light-gray uppercase tracking-wider"
>SIZE:</span
>
<div class="relative">
<button
bind:this={sizeButtonRef}
type="button"
onclick={() => (isSizeDropdownOpen = !isSizeDropdownOpen)}
class="bg-exo-medium-gray/50 border border-exo-yellow/30 rounded pl-2 pr-6 py-1 text-xs font-mono text-exo-yellow cursor-pointer transition-all duration-200 hover:border-exo-yellow/50 focus:outline-none focus:border-exo-yellow/70 {isSizeDropdownOpen
? 'border-exo-yellow/70'
: ''}"
>
<div class="relative">
<button
bind:this={sizeButtonRef}
type="button"
onclick={() => (isSizeDropdownOpen = !isSizeDropdownOpen)}
class="bg-exo-medium-gray/50 border border-exo-yellow/30 rounded pl-2 pr-6 py-1 text-xs font-mono text-exo-yellow cursor-pointer transition-all duration-200 hover:border-exo-yellow/50 focus:outline-none focus:border-exo-yellow/70 {isSizeDropdownOpen
? 'border-exo-yellow/70'
: ''}"
{params.size.toUpperCase()}
</button>
<div
class="absolute right-1.5 top-1/2 -translate-y-1/2 pointer-events-none transition-transform duration-200 {isSizeDropdownOpen
? 'rotate-180'
: ''}"
>
<svg
class="w-3 h-3 text-exo-yellow/60"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
>
{params.size}
</button>
<div
class="absolute right-1.5 top-1/2 -translate-y-1/2 pointer-events-none transition-transform duration-200 {isSizeDropdownOpen
? 'rotate-180'
: ''}"
>
<svg
class="w-3 h-3 text-exo-yellow/60"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
stroke-width="2"
d="M19 9l-7 7-7-7"
/>
</svg>
<path
stroke-linecap="round"
stroke-linejoin="round"
stroke-width="2"
d="M19 9l-7 7-7-7"
/>
</svg>
</div>
</div>
{#if isSizeDropdownOpen}
<!-- Backdrop to close dropdown -->
<button
type="button"
class="fixed inset-0 z-[9998] cursor-default"
onclick={() => (isSizeDropdownOpen = false)}
aria-label="Close dropdown"
></button>
<!-- Dropdown Panel - fixed positioning to escape overflow:hidden -->
<div
class="fixed bg-exo-dark-gray border border-exo-yellow/30 rounded shadow-lg shadow-black/50 z-[9999] max-h-48 overflow-y-auto overflow-x-hidden min-w-max"
style="bottom: calc(100vh - {sizeDropdownPosition()
.top}px + 4px); left: {sizeDropdownPosition().left}px;"
>
<div class="py-1">
{#each sizeOptions as size}
<button
type="button"
onclick={() => selectSize(size)}
class="w-full px-3 py-1.5 text-left text-xs font-mono tracking-wide transition-colors duration-100 flex items-center gap-2 {params.size ===
size
? 'bg-transparent text-exo-yellow'
: 'text-exo-light-gray hover:text-exo-yellow'}"
>
{#if params.size === size}
<svg
class="w-3 h-3 flex-shrink-0"
fill="currentColor"
viewBox="0 0 20 20"
>
<path
fill-rule="evenodd"
d="M16.707 5.293a1 1 0 010 1.414l-8 8a1 1 0 01-1.414 0l-4-4a1 1 0 011.414-1.414L8 12.586l7.293-7.293a1 1 0 011.414 0z"
clip-rule="evenodd"
/>
</svg>
{:else}
<span class="w-3"></span>
{/if}
<span>{size.toUpperCase()}</span>
</button>
{/each}
</div>
</div>
{#if isSizeDropdownOpen}
<!-- Backdrop to close dropdown -->
<button
type="button"
class="fixed inset-0 z-[9998] cursor-default"
onclick={() => (isSizeDropdownOpen = false)}
aria-label="Close dropdown"
></button>
<!-- Dropdown Panel - fixed positioning to escape overflow:hidden -->
<div
class="fixed bg-exo-dark-gray border border-exo-yellow/30 rounded shadow-lg shadow-black/50 z-[9999] max-h-48 overflow-y-auto min-w-max"
style="bottom: calc(100vh - {sizeDropdownPosition()
.top}px + 4px); left: {sizeDropdownPosition().left}px;"
>
<div class="py-1">
{#each sizeOptions as size}
<button
type="button"
onclick={() => selectSize(size)}
class="w-full px-3 py-1.5 text-left text-xs font-mono tracking-wide transition-colors duration-100 flex items-center gap-2 {params.size ===
size
? 'bg-transparent text-exo-yellow'
: 'text-exo-light-gray hover:text-exo-yellow'}"
>
{#if params.size === size}
<svg
class="w-3 h-3 flex-shrink-0"
fill="currentColor"
viewBox="0 0 20 20"
>
<path
fill-rule="evenodd"
d="M16.707 5.293a1 1 0 010 1.414l-8 8a1 1 0 01-1.414 0l-4-4a1 1 0 011.414-1.414L8 12.586l7.293-7.293a1 1 0 011.414 0z"
clip-rule="evenodd"
/>
</svg>
{:else}
<span class="w-3"></span>
{/if}
<span>{size}</span>
</button>
{/each}
</div>
</div>
{/if}
</div>
{/if}
{/if}
</div>
<!-- Quality -->
<div class="flex items-center gap-1.5">
@@ -311,7 +310,7 @@
<!-- Dropdown Panel - fixed positioning to escape overflow:hidden -->
<div
class="fixed bg-exo-dark-gray border border-exo-yellow/30 rounded shadow-lg shadow-black/50 z-[9999] max-h-48 overflow-y-auto min-w-max"
class="fixed bg-exo-dark-gray border border-exo-yellow/30 rounded shadow-lg shadow-black/50 z-[9999] max-h-48 overflow-y-auto overflow-x-hidden min-w-max"
style="bottom: calc(100vh - {qualityDropdownPosition()
.top}px + 4px); left: {qualityDropdownPosition().left}px;"
>

View File

@@ -306,13 +306,14 @@ const IMAGE_PARAMS_STORAGE_KEY = "exo-image-generation-params";
export interface ImageGenerationParams {
// Basic params
size:
| "auto"
| "512x512"
| "768x768"
| "1024x1024"
| "1024x768"
| "768x1024"
| "1024x1365"
| "1365x1024";
| "1024x1536"
| "1536x1024";
quality: "low" | "medium" | "high";
outputFormat: "png" | "jpeg";
numImages: number;
@@ -336,7 +337,7 @@ export interface EditingImage {
}
const DEFAULT_IMAGE_PARAMS: ImageGenerationParams = {
size: "1024x1024",
size: "auto",
quality: "medium",
outputFormat: "png",
numImages: 1,

View File

@@ -0,0 +1,28 @@
import { browser } from "$app/environment";

let _isLight = $state(false);

export const theme = {
  get isLight() {
    return _isLight;
  },
  init() {
    if (!browser) return;
    _isLight = document.documentElement.classList.contains("light");
  },
  toggle() {
    if (!browser) return;
    _isLight = !_isLight;
    if (_isLight) {
      document.documentElement.classList.remove("dark");
      document.documentElement.classList.add("light");
      localStorage.setItem("exo-theme", "light");
    } else {
      document.documentElement.classList.remove("light");
      document.documentElement.classList.add("dark");
      localStorage.setItem("exo-theme", "dark");
    }
  },
};

View File

@@ -1,7 +1,13 @@
<script lang="ts">
  import "../app.css";
  import { onMount } from "svelte";
  import { theme } from "$lib/stores/theme.svelte";

  let { children } = $props();

  onMount(() => {
    theme.init();
  });
</script>
<svelte:head>

View File

File diff suppressed because it is too large

View File

@@ -115,7 +115,7 @@
packages = lib.optionalAttrs pkgs.stdenv.hostPlatform.isDarwin (
let
uvLock = builtins.fromTOML (builtins.readFile ./uv.lock);
mlxPackage = builtins.head (builtins.filter (p: p.name == "mlx") uvLock.package);
mlxPackage = builtins.head (builtins.filter (p: p.name == "mlx" && p.source ? git) uvLock.package);
uvLockMlxVersion = mlxPackage.version;
in
{

View File

@@ -41,16 +41,16 @@ let
mlx = stdenv.mkDerivation rec {
pname = "mlx";
version = let v = "0.30.6"; in
version = let v = "0.30.7.dev20260218+14841977"; in
assert v == uvLockMlxVersion || throw "MLX version mismatch: nix/mlx.nix has ${v} but uv.lock has ${uvLockMlxVersion}. Update both the version and hash in nix/mlx.nix.";
v;
pyproject = true;
src = fetchFromGitHub {
owner = "ml-explore";
repo = "mlx";
tag = "v${version}";
hash = "sha256-avD5EGhwgmPdXLAyQSqTO6AXk/W3ziH+f6AetjK3Sdo=";
owner = "rltakashige";
repo = "mlx-jaccl-fix-small-recv";
rev = "1484197707f35186ad3bd614357c7c47fdf86ebc";
hash = "sha256-FupCMoK/SF/ldfKuvMSAKECcOP8c+ANgkQlPZttDsLk=";
};
patches = [

View File

@@ -17,9 +17,9 @@ dependencies = [
"loguru>=0.7.3",
"exo_pyo3_bindings", # rust bindings
"anyio==4.11.0",
"mlx==0.30.6; sys_platform == 'darwin'",
"mlx; sys_platform == 'darwin'",
"mlx[cpu]==0.30.6; sys_platform == 'linux'",
"mlx-lm==0.30.6",
"mlx-lm==0.30.7",
"tiktoken>=0.12.0", # required for kimi k2 tokenizer
"hypercorn>=0.18.0",
"openai-harmony>=0.0.8",
@@ -64,6 +64,7 @@ members = [
[tool.uv.sources]
exo_pyo3_bindings = { workspace = true }
mlx = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git", branch = "address-rdma-gpu-locks", marker = "sys_platform == 'darwin'" }
#mlx-lm = { git = "https://github.com/davidmcc73/mlx-lm", branch = "stable" }
# Uncomment to use local mlx/mlx-lm development versions:
# mlx = { path = "/Users/Shared/mlx", editable=true }
@@ -132,7 +133,7 @@ markers = [
env = [
"EXO_TESTS=1"
]
addopts = "-m 'not slow'"
addopts = "-m 'not slow' --ignore=tests/start_distributed_test.py"
filterwarnings = [
"ignore:builtin type Swig:DeprecationWarning",
]

View File

@@ -58,6 +58,21 @@
lib.optionalAttrs pkgs.stdenv.hostPlatform.isLinux (
(lib.mapAttrs (_: ignoreMissing) nvidiaPackages) // {
mlx = ignoreMissing prev.mlx;
mlx-cuda-13 = prev.mlx-cuda-13.overrideAttrs (old: {
buildInputs = (old.buildInputs or [ ]) ++ [
final.nvidia-cublas
final.nvidia-cuda-nvrtc
final.nvidia-cudnn-cu13
final.nvidia-nccl-cu13
];
preFixup = ''
addAutoPatchelfSearchPath ${final.nvidia-cublas}
addAutoPatchelfSearchPath ${final.nvidia-cuda-nvrtc}
addAutoPatchelfSearchPath ${final.nvidia-cudnn-cu13}
addAutoPatchelfSearchPath ${final.nvidia-nccl-cu13}
'';
autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" ];
});
torch = ignoreMissing prev.torch;
triton = ignoreMissing prev.triton;
}
@@ -74,14 +89,25 @@
linuxOverlay
]
);
exoVenv = pythonSet.mkVirtualEnv "exo-env" workspace.deps.default;
# mlx-cpu and mlx-cuda-13 both ship mlx/ site-packages files; keep first.
# mlx-cpu/mlx-cuda-13 and nvidia-cudnn-cu12/cu13 ship overlapping files.
venvCollisionPaths = lib.optionals pkgs.stdenv.hostPlatform.isLinux [
"lib/python3.13/site-packages/mlx*"
"lib/python3.13/site-packages/nvidia*"
];
exoVenv = (pythonSet.mkVirtualEnv "exo-env" workspace.deps.default).overrideAttrs {
venvIgnoreCollisions = venvCollisionPaths;
};
# Virtual environment with dev dependencies for testing
testVenv = pythonSet.mkVirtualEnv "exo-test-env" (
testVenv = (pythonSet.mkVirtualEnv "exo-test-env" (
workspace.deps.default // {
exo = [ "dev" ]; # Include pytest, pytest-asyncio, pytest-env
}
);
)).overrideAttrs {
venvIgnoreCollisions = venvCollisionPaths;
};
mkPythonScript = name: path: pkgs.writeShellApplication {
inherit name;

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "deepseek"
quantization = "4bit"
base_model = "DeepSeek V3.1"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 405874409472

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "deepseek"
quantization = "8bit"
base_model = "DeepSeek V3.1"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 765577920512

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM 4.5 Air"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 122406567936

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "bf16"
base_model = "GLM 4.5 Air"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 229780750336

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "4bit"
base_model = "GLM 4.7"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 198556925568

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "6bit"
base_model = "GLM 4.7"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 286737579648

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM 4.7"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 396963397248

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "4bit"
base_model = "GLM 4.7 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 19327352832

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "5bit"
base_model = "GLM 4.7 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 22548578304

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "6bit"
base_model = "GLM 4.7 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 26843545600

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM 4.7 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 34359738368

View File

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-8bit-MXFP8"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "8bit"
base_model = "GLM-5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 790517400864

View File

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5-MXFP4-Q8"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "MXFP4-Q8"
base_model = "GLM-5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 405478939008

View File

@@ -0,0 +1,12 @@
model_id = "mlx-community/GLM-5"
n_layers = 78
hidden_size = 6144
supports_tensor = true
tasks = ["TextGeneration"]
family = "glm"
quantization = "bf16"
base_model = "GLM-5"
capabilities = ["text", "thinking"]
[storage_size]
in_bytes = 1487822475264

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "kimi"
quantization = ""
base_model = "Kimi K2"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 706522120192

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "kimi"
quantization = ""
base_model = "Kimi K2.5"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 662498705408

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "minimax"
quantization = "3bit"
base_model = "MiniMax M2.1"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 100086644736

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "minimax"
quantization = "8bit"
base_model = "MiniMax M2.1"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 242986745856

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "4bit"
base_model = "Qwen3 0.6B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 342884352

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "8bit"
base_model = "Qwen3 0.6B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 698351616

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "4bit"
base_model = "Qwen3 235B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 141733920768

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "8bit"
base_model = "Qwen3 235B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 268435456000

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "4bit"
base_model = "Qwen3 30B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 17612931072

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "8bit"
base_model = "Qwen3 30B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 33279705088

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "4bit"
base_model = "Qwen3 Next 80B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 47080074240

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "qwen"
quantization = "8bit"
base_model = "Qwen3 Next 80B"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 88814387200

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "step"
quantization = "4bit"
base_model = "Step 3.5 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 114572190076

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "step"
quantization = "6bit"
base_model = "Step 3.5 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 159039627774

View File

@@ -6,7 +6,7 @@ tasks = ["TextGeneration"]
family = "step"
quantization = "8bit"
base_model = "Step 3.5 Flash"
capabilities = ["text", "thinking"]
capabilities = ["text", "thinking", "thinking_toggle"]
[storage_size]
in_bytes = 209082699847

View File

@@ -25,17 +25,17 @@ workspace = true
networking = { workspace = true }
# interop
pyo3 = { version = "0.27.1", features = [
# "abi3-py311", # tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.11
pyo3 = { version = "0.27.2", features = [
# "abi3-py313", # tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.13
"nightly", # enables better-supported GIL integration
"experimental-async", # async support in #[pyfunction] & #[pymethods]
#"experimental-inspect", # inspection of generated binary => easier to automate type-hint generation
#"py-clone", # adding Clone-ing of `Py<T>` without GIL (may cause panics - remove if panics happen)
"multiple-pymethods", # allows multiple #[pymethods] sections per class
# "multiple-pymethods", # allows multiple #[pymethods] sections per class
# integrations with other libraries
"arc_lock", "bigdecimal", "either", "hashbrown", "indexmap", "num-bigint", "num-complex", "num-rational",
"ordered-float", "rust_decimal", "smallvec",
# "arc_lock", "bigdecimal", "either", "hashbrown", "indexmap", "num-bigint", "num-complex", "num-rational",
# "ordered-float", "rust_decimal", "smallvec",
# "anyhow", "chrono", "chrono-local", "chrono-tz", "eyre", "jiff-02", "lock_api", "parking-lot", "time", "serde",
] }
pyo3-stub-gen = { version = "0.17.2" }
@@ -45,8 +45,6 @@ pyo3-log = "0.13.2"
# macro dependencies
extend = { workspace = true }
delegate = { workspace = true }
impl-trait-for-tuples = { workspace = true }
derive_more = { workspace = true }
pin-project = { workspace = true }
# async runtime
@@ -54,24 +52,11 @@ tokio = { workspace = true, features = ["full", "tracing"] }
futures = { workspace = true }
# utility dependencies
once_cell = "1.21.3"
thread_local = "1.1.9"
util = { workspace = true }
thiserror = { workspace = true }
#internment = { workspace = true }
#recursion = { workspace = true }
#generativity = { workspace = true }
#itertools = { workspace = true }
# Tracing
#tracing = "0.1"
#tracing-subscriber = "0.3"
#console-subscriber = "0.1.5"
#tracing-log = "0.2.0"
log = { workspace = true }
env_logger = "0.11"
# Networking
libp2p = { workspace = true, features = ["full"] }

View File

@@ -6,7 +6,7 @@ use pyo3::marker::Ungil;
use pyo3::prelude::*;
use std::{
future::Future,
pin::{Pin, pin},
pin::Pin,
task::{Context, Poll},
};
@@ -33,8 +33,6 @@ where
fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
let waker = cx.waker();
Python::with_gil(|py| {
py.allow_threads(|| self.project().0.poll(&mut Context::from_waker(waker)))
})
Python::attach(|py| py.detach(|| self.project().0.poll(&mut Context::from_waker(waker))))
}
}

View File

@@ -1,240 +0,0 @@
//! This module exists to hold examples of some pyo3 patterns that may be too complex to
//! re-create from scratch, but too inhomogeneous to create an abstraction/wrapper around.
//!
//! Pattern examples include:
//! - Async task handles: with GC-integrated cleanup
//! - Sync/async callbacks from python: with proper event-loop handling
//!
//! Mutability pattern: https://pyo3.rs/v0.26.0/async-await.html#send--static-constraint
//! - Store mutable fields in tokio's `Mutex<T>`
//! - For async code: take `&self` and `.lock().await`
//! - For sync code: take `&mut self` and `.get_mut()`
use crate::ext::{PyResultExt as _, ResultExt as _, TokioRuntimeExt as _};
use futures::FutureExt as _;
use futures::future::BoxFuture;
use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::{PyModule, PyModuleMethods as _};
use pyo3::{
Bound, Py, PyAny, PyErr, PyResult, PyTraverseError, PyVisit, Python, pyclass, pymethods,
};
use std::time::Duration;
use tokio::sync::mpsc;
use tokio::sync::mpsc::error::TryRecvError;
fn needs_tokio_runtime() {
tokio::runtime::Handle::current();
}
type SyncCallback = Box<dyn Fn() + Send + Sync>;
type AsyncCallback = Box<dyn Fn() -> BoxFuture<'static, ()> + Send + Sync>;
enum AsyncTaskMessage {
SyncCallback(SyncCallback),
AsyncCallback(AsyncCallback),
}
async fn async_task(
sender: mpsc::UnboundedSender<()>,
mut receiver: mpsc::UnboundedReceiver<AsyncTaskMessage>,
) {
log::info!("RUST: async task started");
// task state
let mut interval = tokio::time::interval(Duration::from_secs(1));
let mut sync_cbs: Vec<SyncCallback> = vec![];
let mut async_cbs: Vec<AsyncCallback> = vec![];
loop {
tokio::select! {
// handle incoming messages from task-handle
message = receiver.recv() => {
// handle closed channel by exiting
let Some(message) = message else {
log::info!("RUST: channel closed");
break;
};
// dispatch incoming event
match message {
AsyncTaskMessage::SyncCallback(cb) => {
sync_cbs.push(cb);
}
AsyncTaskMessage::AsyncCallback(cb) => {
async_cbs.push(cb);
}
}
}
// handle all other events
_ = interval.tick() => {
log::info!("RUST: async task tick");
// call back all sync callbacks
for cb in &sync_cbs {
cb();
}
// call back all async callbacks
for cb in &async_cbs {
cb().await;
}
// send event on unbounded channel
sender.send(()).expect("handle receiver cannot be closed/dropped");
}
}
}
log::info!("RUST: async task stopped");
}
// #[gen_stub_pyclass]
#[pyclass(name = "AsyncTaskHandle")]
#[derive(Debug)]
struct PyAsyncTaskHandle {
sender: Option<mpsc::UnboundedSender<AsyncTaskMessage>>,
receiver: mpsc::UnboundedReceiver<()>,
}
#[allow(clippy::expect_used)]
impl PyAsyncTaskHandle {
const fn sender(&self) -> &mpsc::UnboundedSender<AsyncTaskMessage> {
self.sender
.as_ref()
.expect("The sender should only be None after de-initialization.")
}
const fn sender_mut(&mut self) -> &mpsc::UnboundedSender<AsyncTaskMessage> {
self.sender
.as_mut()
.expect("The sender should only be None after de-initialization.")
}
const fn new(
sender: mpsc::UnboundedSender<AsyncTaskMessage>,
receiver: mpsc::UnboundedReceiver<()>,
) -> Self {
Self {
sender: Some(sender),
receiver,
}
}
}
// #[gen_stub_pymethods]
#[pymethods]
impl PyAsyncTaskHandle {
#[new]
fn py_new(py: Python<'_>) -> PyResult<Self> {
use pyo3_async_runtimes::tokio::get_runtime;
// create communication channel TOWARDS our task
let (h_sender, t_receiver) = mpsc::unbounded_channel::<AsyncTaskMessage>();
// create communication channel FROM our task
let (t_sender, h_receiver) = mpsc::unbounded_channel::<()>();
// perform necessary setup within tokio context - or it crashes
let () = get_runtime().block_on(async { needs_tokio_runtime() });
// spawn tokio task with this thread's task-locals - without this, async callbacks on the new threads will not work!!
_ = get_runtime().spawn_with_scope(py, async move {
async_task(t_sender, t_receiver).await;
});
Ok(Self::new(h_sender, h_receiver))
}
/// NOTE: exceptions in callbacks are silently ignored until end of execution
fn add_sync_callback(
&self,
// #[gen_stub(override_type(
// type_repr="collections.abc.Callable[[], None]",
// imports=("collections.abc")
// ))]
callback: Py<PyAny>,
) -> PyResult<()> {
// blocking call to async method -> can do non-blocking if needed
self.sender()
.send(AsyncTaskMessage::SyncCallback(Box::new(move || {
_ = Python::with_gil(|py| callback.call0(py).write_unraisable_with(py));
})))
.pyerr()?;
Ok(())
}
/// NOTE: exceptions in callbacks are silently ignored until end of execution
fn add_async_callback(
&self,
// #[gen_stub(override_type(
// type_repr="collections.abc.Callable[[], collections.abc.Awaitable[None]]",
// imports=("collections.abc")
// ))]
callback: Py<PyAny>,
) -> PyResult<()> {
// blocking call to async method -> can do non-blocking if needed
self.sender()
.send(AsyncTaskMessage::AsyncCallback(Box::new(move || {
let c = Python::with_gil(|py| callback.clone_ref(py));
async move {
if let Some(f) = Python::with_gil(|py| {
let coroutine = c.call0(py).write_unraisable_with(py)?;
pyo3_async_runtimes::tokio::into_future(coroutine.into_bound(py))
.write_unraisable_with(py)
}) {
_ = f.await.write_unraisable();
}
}
.boxed()
})))
.pyerr()?;
Ok(())
}
async fn receive_unit(&mut self) -> PyResult<()> {
self.receiver
.recv()
.await
.ok_or(PyErr::new::<PyRuntimeError, _>(
"cannot receive unit on closed channel",
))
}
fn drain_units(&mut self) -> PyResult<i32> {
let mut cnt = 0;
loop {
match self.receiver.try_recv() {
Err(TryRecvError::Disconnected) => {
return Err(PyErr::new::<PyRuntimeError, _>(
"cannot receive unit on closed channel",
));
}
Err(TryRecvError::Empty) => return Ok(cnt),
Ok(()) => {
cnt += 1;
continue;
}
}
}
}
// #[gen_stub(skip)]
const fn __traverse__(&self, _visit: PyVisit<'_>) -> Result<(), PyTraverseError> {
Ok(()) // This is needed purely so `__clear__` can work
}
// #[gen_stub(skip)]
fn __clear__(&mut self) {
// TODO: may or may not need to await a "kill-signal" oneshot channel message,
// to ensure that the networking task is done BEFORE exiting the clear function...
// but this may require GIL?? and it may not be safe to call GIL here??
self.sender = None; // Using Option<T> as a trick to force `sender` channel to be dropped
}
}
pub fn examples_submodule(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PyAsyncTaskHandle>()?;
Ok(())
}

View File

@@ -17,7 +17,6 @@
extern crate core;
mod allow_threading;
mod examples;
pub(crate) mod networking;
pub(crate) mod pylibp2p;
@@ -25,7 +24,6 @@ use crate::networking::networking_submodule;
use crate::pylibp2p::ident::ident_submodule;
use crate::pylibp2p::multiaddr::multiaddr_submodule;
use pyo3::prelude::PyModule;
use pyo3::prelude::*;
use pyo3::{Bound, PyResult, pyclass, pymodule};
use pyo3_stub_gen::define_stub_info_gatherer;
@@ -36,14 +34,10 @@ pub(crate) mod r#const {
/// Namespace for all the type/trait aliases used by this crate.
pub(crate) mod alias {
use std::error::Error;
use std::marker::Tuple;
pub trait SendFn<Args: Tuple + Send + 'static, Output> =
Fn<Args, Output = Output> + Send + 'static;
pub type AnyError = Box<dyn Error + Send + Sync + 'static>;
pub type AnyResult<T> = Result<T, AnyError>;
}
/// Namespace for crate-wide extension traits/methods
@@ -51,7 +45,6 @@ pub(crate) mod ext {
use crate::allow_threading::AllowThreads;
use extend::ext;
use pyo3::exceptions::{PyConnectionError, PyRuntimeError};
use pyo3::marker::Ungil;
use pyo3::types::PyBytes;
use pyo3::{Py, PyErr, PyResult, Python};
use tokio::runtime::Runtime;
@@ -62,7 +55,7 @@ pub(crate) mod ext {
#[ext(pub, name = ByteArrayExt)]
impl [u8] {
fn pybytes(&self) -> Py<PyBytes> {
Python::with_gil(|py| PyBytes::new(py, self).unbind())
Python::attach(|py| PyBytes::new(py, self).unbind())
}
}
@@ -98,7 +91,7 @@ pub(crate) mod ext {
#[ext(pub, name = PyResultExt)]
impl<T> PyResult<T> {
fn write_unraisable(self) -> Option<T> {
Python::with_gil(|py| self.write_unraisable_with(py))
Python::attach(|py| self.write_unraisable_with(py))
}
fn write_unraisable_with(self, py: Python<'_>) -> Option<T> {
@@ -175,24 +168,6 @@ pub(crate) mod ext {
}
}
pub(crate) mod private {
use std::marker::Sized;
/// Sealed traits support
pub trait Sealed {}
impl<T: ?Sized> Sealed for T {}
}
/// A wrapper around [`Py`] that implements [`Clone`] using [`Python::with_gil`].
#[repr(transparent)]
pub(crate) struct ClonePy<T>(pub Py<T>);
impl<T> Clone for ClonePy<T> {
fn clone(&self) -> Self {
Python::with_gil(|py| Self(self.0.clone_ref(py)))
}
}
/// A Python module implemented in Rust. The name of this function must match
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
/// import the module.

View File

@@ -11,9 +11,9 @@ use crate::ext::{ResultExt as _, TokioMpscReceiverExt as _, TokioMpscSenderExt a
use crate::pyclass;
use crate::pylibp2p::ident::{PyKeypair, PyPeerId};
use libp2p::futures::StreamExt as _;
use libp2p::gossipsub;
use libp2p::gossipsub::{IdentTopic, Message, MessageId, PublishError};
use libp2p::swarm::SwarmEvent;
use libp2p::{gossipsub, mdns};
use networking::discovery;
use networking::swarm::create_swarm;
use pyo3::prelude::{PyModule, PyModuleMethods as _};
@@ -25,7 +25,7 @@ use tokio::sync::{Mutex, mpsc, oneshot};
mod exception {
use pyo3::types::PyTuple;
use pyo3::{PyErrArguments, exceptions::PyException, prelude::*};
use pyo3::{exceptions::PyException, prelude::*};
use pyo3_stub_gen::derive::*;
#[gen_stub_pyclass]
@@ -155,7 +155,6 @@ async fn networking_task(
) {
use SwarmEvent::*;
use ToTask::*;
use mdns::Event::*;
use networking::swarm::BehaviourEvent::*;
log::info!("RUST: networking task started");
@@ -485,7 +484,7 @@ impl PyNetworkingHandle {
let (tx, rx) = oneshot::channel();
// send off request to subscribe
let data = Python::with_gil(|py| Vec::from(data.as_bytes(py)));
let data = Python::attach(|py| Vec::from(data.as_bytes(py)));
self.to_task_tx()
.send_py(ToTask::GossipsubPublish {
topic,

View File

@@ -19,8 +19,6 @@ either = { workspace = true }
# macro dependencies
extend = { workspace = true }
delegate = { workspace = true }
impl-trait-for-tuples = { workspace = true }
derive_more = { workspace = true }
# async
tokio = { workspace = true, features = ["full"] }
@@ -29,11 +27,6 @@ futures-timer = { workspace = true }
# utility dependencies
util = { workspace = true }
thiserror = { workspace = true }
#internment = { workspace = true }
#recursion = { workspace = true }
#generativity = { workspace = true }
#itertools = { workspace = true }
tracing-subscriber = { version = "0.3.19", features = ["default", "env-filter"] }
keccak-const = { workspace = true }
@@ -41,4 +34,4 @@ keccak-const = { workspace = true }
log = { workspace = true }
# networking
libp2p = { workspace = true, features = ["full"] }
libp2p = { workspace = true, features = ["full"] }

View File

@@ -24,8 +24,8 @@ use libp2p::{
swarm::{NetworkBehaviour, SwarmEvent},
tcp, yamux,
};
use std::error::Error;
use std::time::Duration;
use std::{error::Error, hash::Hash};
use tokio::{io, io::AsyncBufReadExt, select};
use tracing_subscriber::EnvFilter;

View File

@@ -1,5 +1,4 @@
use crate::ext::MultiaddrExt;
use crate::keep_alive;
use delegate::delegate;
use either::Either;
use futures::FutureExt;

View File

@@ -1,44 +0,0 @@
use delegate::delegate;
use libp2p::swarm::handler::ConnectionEvent;
use libp2p::swarm::{ConnectionHandlerEvent, SubstreamProtocol, dummy, handler};
use std::task::{Context, Poll};
/// An implementation of [`ConnectionHandler`] that doesn't handle any protocols, but it keeps
/// the connection alive.
#[derive(Clone)]
#[repr(transparent)]
pub struct ConnectionHandler(dummy::ConnectionHandler);
impl ConnectionHandler {
pub fn new() -> Self {
ConnectionHandler(dummy::ConnectionHandler)
}
}
impl handler::ConnectionHandler for ConnectionHandler {
// delegate types and implementation mostly to dummy handler
type FromBehaviour = <dummy::ConnectionHandler as handler::ConnectionHandler>::FromBehaviour;
type ToBehaviour = <dummy::ConnectionHandler as handler::ConnectionHandler>::ToBehaviour;
type InboundProtocol =
<dummy::ConnectionHandler as handler::ConnectionHandler>::InboundProtocol;
type OutboundProtocol =
<dummy::ConnectionHandler as handler::ConnectionHandler>::OutboundProtocol;
type InboundOpenInfo =
<dummy::ConnectionHandler as handler::ConnectionHandler>::InboundOpenInfo;
type OutboundOpenInfo =
<dummy::ConnectionHandler as handler::ConnectionHandler>::OutboundOpenInfo;
delegate! {
to self.0 {
fn listen_protocol(&self) -> SubstreamProtocol<Self::InboundProtocol, Self::InboundOpenInfo>;
fn poll(&mut self, cx: &mut Context<'_>) -> Poll<ConnectionHandlerEvent<Self::OutboundProtocol, Self::OutboundOpenInfo, Self::ToBehaviour>>;
fn on_behaviour_event(&mut self, event: Self::FromBehaviour);
fn on_connection_event(&mut self, event: ConnectionEvent<Self::InboundProtocol, Self::OutboundProtocol, Self::InboundOpenInfo, Self::OutboundOpenInfo>);
}
}
// specifically override this to force connection to stay alive
fn connection_keep_alive(&self) -> bool {
true
}
}

View File

@@ -3,19 +3,7 @@
//! this is here as placeholder documentation
//!
//!
// enable Rust-unstable features for convenience
#![feature(trait_alias)]
// #![feature(stmt_expr_attributes)]
// #![feature(unboxed_closures)]
// #![feature(assert_matches)]
// #![feature(async_fn_in_dyn_trait)]
// #![feature(async_for_loop)]
// #![feature(auto_traits)]
// #![feature(negative_impls)]
pub mod discovery;
pub mod keep_alive;
pub mod swarm;
/// Namespace for all the type/trait aliases used by this crate.
@@ -54,11 +42,3 @@ pub(crate) mod ext {
}
}
}
pub(crate) mod private {
#![allow(dead_code)]
/// Sealed traits support
pub trait Sealed {}
impl<T: ?Sized> Sealed for T {}
}

View File

@@ -14,6 +14,7 @@ from exo.download.download_utils import (
map_repo_download_progress_to_download_progress_data,
)
from exo.download.shard_downloader import ShardDownloader
from exo.shared.constants import EXO_MODELS_DIR
from exo.shared.models.model_cards import ModelId
from exo.shared.types.commands import (
CancelDownload,
@@ -46,6 +47,7 @@ class DownloadCoordinator:
download_command_receiver: Receiver[ForwarderDownloadCommand]
local_event_sender: Sender[ForwarderEvent]
event_index_counter: Iterator[int]
offline: bool = False
# Local state
download_status: dict[ModelId, DownloadProgress] = field(default_factory=dict)
@@ -61,8 +63,13 @@ class DownloadCoordinator:
def __post_init__(self) -> None:
self.event_sender, self.event_receiver = channel[Event]()
if self.offline:
self.shard_downloader.set_internet_connection(False)
self.shard_downloader.on_progress(self._download_progress_callback)
def _model_dir(self, model_id: ModelId) -> str:
return str(EXO_MODELS_DIR / model_id.normalize())
async def _download_progress_callback(
self, callback_shard: ShardMetadata, progress: RepoDownloadProgress
) -> None:
@@ -74,6 +81,7 @@ class DownloadCoordinator:
shard_metadata=callback_shard,
node_id=self.node_id,
total_bytes=progress.total_bytes,
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = completed
await self.event_sender.send(
@@ -93,6 +101,7 @@ class DownloadCoordinator:
download_progress=map_repo_download_progress_to_download_progress_data(
progress
),
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = ongoing
await self.event_sender.send(
@@ -101,13 +110,17 @@ class DownloadCoordinator:
self._last_progress_time[model_id] = current_time()
async def run(self) -> None:
logger.info("Starting DownloadCoordinator")
self._test_internet_connection()
logger.info(
f"Starting DownloadCoordinator{' (offline mode)' if self.offline else ''}"
)
if not self.offline:
self._test_internet_connection()
async with self._tg as tg:
tg.start_soon(self._command_processor)
tg.start_soon(self._forward_events)
tg.start_soon(self._emit_existing_download_progress)
tg.start_soon(self._check_internet_connection)
if not self.offline:
tg.start_soon(self._check_internet_connection)
def _test_internet_connection(self) -> None:
try:
@@ -170,7 +183,11 @@ class DownloadCoordinator:
return
# Emit pending status
progress = DownloadPending(shard_metadata=shard, node_id=self.node_id)
progress = DownloadPending(
shard_metadata=shard,
node_id=self.node_id,
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = progress
await self.event_sender.send(NodeDownloadProgress(download_progress=progress))
@@ -184,6 +201,7 @@ class DownloadCoordinator:
shard_metadata=shard,
node_id=self.node_id,
total_bytes=initial_progress.total_bytes,
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = completed
await self.event_sender.send(
@@ -191,6 +209,20 @@ class DownloadCoordinator:
)
return
if self.offline:
logger.warning(
f"Offline mode: model {model_id} is not fully available locally, cannot download"
)
failed = DownloadFailed(
shard_metadata=shard,
node_id=self.node_id,
error_message=f"Model files not found locally in offline mode: {model_id}",
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = failed
await self.event_sender.send(NodeDownloadProgress(download_progress=failed))
return
# Start actual download
self._start_download_task(shard, initial_progress)
@@ -206,6 +238,7 @@ class DownloadCoordinator:
download_progress=map_repo_download_progress_to_download_progress_data(
initial_progress
),
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = status
self.event_sender.send_nowait(NodeDownloadProgress(download_progress=status))
@@ -219,6 +252,7 @@ class DownloadCoordinator:
shard_metadata=shard,
node_id=self.node_id,
error_message=str(e),
model_directory=self._model_dir(model_id),
)
self.download_status[model_id] = failed
await self.event_sender.send(
@@ -253,6 +287,7 @@ class DownloadCoordinator:
pending = DownloadPending(
shard_metadata=current_status.shard_metadata,
node_id=self.node_id,
model_directory=self._model_dir(model_id),
)
await self.event_sender.send(
NodeDownloadProgress(download_progress=pending)
@@ -295,11 +330,18 @@ class DownloadCoordinator:
node_id=self.node_id,
shard_metadata=progress.shard,
total_bytes=progress.total_bytes,
model_directory=self._model_dir(
progress.shard.model_card.model_id
),
)
elif progress.status in ["in_progress", "not_started"]:
if progress.downloaded_bytes_this_session.in_bytes == 0:
status = DownloadPending(
node_id=self.node_id, shard_metadata=progress.shard
node_id=self.node_id,
shard_metadata=progress.shard,
model_directory=self._model_dir(
progress.shard.model_card.model_id
),
)
else:
status = DownloadOngoing(
@@ -308,6 +350,9 @@ class DownloadCoordinator:
download_progress=map_repo_download_progress_to_download_progress_data(
progress
),
model_directory=self._model_dir(
progress.shard.model_card.model_id
),
)
else:
continue
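
Read together, the coordinator hunks above gate offline mode in three places: `__post_init__` marks the shard downloader as having no internet connection, `run()` skips both the one-shot connectivity probe and the periodic `_check_internet_connection` loop, and a model that is not fully pre-staged short-circuits to a `DownloadFailed` event instead of starting a download task. A condensed, runnable sketch of that last decision (strings stand in for the real progress events):

```python
# Condensed sketch of the download decision added above; strings stand in
# for the DownloadComplete/DownloadFailed/DownloadOngoing events.
def download_decision(offline: bool, model_fully_on_disk: bool) -> str:
    if model_fully_on_disk:
        return "DownloadComplete"   # emitted without touching the network
    if offline:
        return "DownloadFailed"     # fail fast: no download attempt offline
    return "DownloadOngoing"        # normal path: start the download task

assert download_decision(offline=True, model_fully_on_disk=False) == "DownloadFailed"
```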

View File

@@ -448,12 +448,13 @@ async def download_file_with_retry(
target_dir: Path,
on_progress: Callable[[int, int, bool], None] = lambda _, __, ___: None,
on_connection_lost: Callable[[], None] = lambda: None,
skip_internet: bool = False,
) -> Path:
n_attempts = 3
for attempt in range(n_attempts):
try:
return await _download_file(
model_id, revision, path, target_dir, on_progress
model_id, revision, path, target_dir, on_progress, skip_internet
)
except HuggingFaceAuthenticationError:
raise
@@ -487,10 +488,14 @@ async def _download_file(
path: str,
target_dir: Path,
on_progress: Callable[[int, int, bool], None] = lambda _, __, ___: None,
skip_internet: bool = False,
) -> Path:
target_path = target_dir / path
if await aios.path.exists(target_path):
if skip_internet:
return target_path
local_size = (await aios.stat(target_path)).st_size
# Try to verify against remote, but allow offline operation
@@ -510,6 +515,11 @@ async def _download_file(
)
return target_path
if skip_internet:
raise FileNotFoundError(
f"File {path} not found locally and cannot download in offline mode"
)
await aios.makedirs((target_dir / path).parent, exist_ok=True)
length, etag = await file_meta(model_id, revision, path)
remote_hash = etag[:-5] if etag.endswith("-gzip") else etag
@@ -814,6 +824,7 @@ async def download_shard(
file, curr_bytes, total_bytes, is_renamed
),
on_connection_lost=on_connection_lost,
skip_internet=skip_internet,
)
if not skip_download:

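With `skip_internet=True`, `_download_file` now short-circuits in both directions: a locally present file is returned without the `file_meta` HEAD verification, and a missing file raises immediately instead of attempting a download. A standalone sketch of that decision, simplified to synchronous code (the real function is async):

```python
# Standalone sketch of the offline fast paths in _download_file above,
# simplified to synchronous code.
from pathlib import Path

def resolve_offline(target_dir: Path, path: str) -> Path:
    target_path = target_dir / path
    if target_path.exists():
        # skip_internet=True: trust the pre-staged file, skip HEAD verification
        return target_path
    raise FileNotFoundError(
        f"File {path} not found locally and cannot download in offline mode"
    )
```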
View File

@@ -0,0 +1,230 @@
"""Tests for offline/air-gapped mode."""
from collections.abc import AsyncIterator
from pathlib import Path
from unittest.mock import AsyncMock, patch
import aiofiles
import aiofiles.os as aios
import pytest
from exo.download.download_utils import (
_download_file, # pyright: ignore[reportPrivateUsage]
download_file_with_retry,
fetch_file_list_with_cache,
)
from exo.shared.types.common import ModelId
from exo.shared.types.worker.downloads import FileListEntry
@pytest.fixture
def model_id() -> ModelId:
return ModelId("test-org/test-model")
@pytest.fixture
async def temp_models_dir(tmp_path: Path) -> AsyncIterator[Path]:
models_dir = tmp_path / "models"
await aios.makedirs(models_dir, exist_ok=True)
with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
yield models_dir
class TestDownloadFileOffline:
"""Tests for _download_file with skip_internet=True."""
async def test_returns_local_file_without_http_verification(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""When skip_internet=True and file exists locally, return it immediately
without making any HTTP calls (no file_meta verification)."""
target_dir = tmp_path / "downloads"
await aios.makedirs(target_dir, exist_ok=True)
local_file = target_dir / "model.safetensors"
async with aiofiles.open(local_file, "wb") as f:
await f.write(b"model weights data")
with patch(
"exo.download.download_utils.file_meta",
new_callable=AsyncMock,
) as mock_file_meta:
result = await _download_file(
model_id,
"main",
"model.safetensors",
target_dir,
skip_internet=True,
)
assert result == local_file
mock_file_meta.assert_not_called()
async def test_raises_file_not_found_for_missing_file(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""When skip_internet=True and file does NOT exist locally,
raise FileNotFoundError instead of attempting download."""
target_dir = tmp_path / "downloads"
await aios.makedirs(target_dir, exist_ok=True)
with pytest.raises(FileNotFoundError, match="offline mode"):
await _download_file(
model_id,
"main",
"missing_model.safetensors",
target_dir,
skip_internet=True,
)
async def test_returns_local_file_in_subdirectory(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""When skip_internet=True and file exists in a subdirectory,
return it without HTTP calls."""
target_dir = tmp_path / "downloads"
subdir = target_dir / "transformer"
await aios.makedirs(subdir, exist_ok=True)
local_file = subdir / "diffusion_pytorch_model.safetensors"
async with aiofiles.open(local_file, "wb") as f:
await f.write(b"weights")
with patch(
"exo.download.download_utils.file_meta",
new_callable=AsyncMock,
) as mock_file_meta:
result = await _download_file(
model_id,
"main",
"transformer/diffusion_pytorch_model.safetensors",
target_dir,
skip_internet=True,
)
assert result == local_file
mock_file_meta.assert_not_called()
class TestDownloadFileWithRetryOffline:
"""Tests for download_file_with_retry with skip_internet=True."""
async def test_propagates_skip_internet_to_download_file(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""Verify skip_internet is passed through to _download_file."""
target_dir = tmp_path / "downloads"
await aios.makedirs(target_dir, exist_ok=True)
local_file = target_dir / "config.json"
async with aiofiles.open(local_file, "wb") as f:
await f.write(b'{"model_type": "qwen2"}')
with patch(
"exo.download.download_utils.file_meta",
new_callable=AsyncMock,
) as mock_file_meta:
result = await download_file_with_retry(
model_id,
"main",
"config.json",
target_dir,
skip_internet=True,
)
assert result == local_file
mock_file_meta.assert_not_called()
async def test_file_not_found_does_not_retry(
self, model_id: ModelId, tmp_path: Path
) -> None:
"""FileNotFoundError from offline mode should not trigger retries."""
target_dir = tmp_path / "downloads"
await aios.makedirs(target_dir, exist_ok=True)
with pytest.raises(FileNotFoundError):
await download_file_with_retry(
model_id,
"main",
"nonexistent.safetensors",
target_dir,
skip_internet=True,
)
class TestFetchFileListOffline:
"""Tests for fetch_file_list_with_cache with skip_internet=True."""
async def test_uses_cached_file_list(
self, model_id: ModelId, temp_models_dir: Path
) -> None:
"""When skip_internet=True and cache file exists, use it without network."""
from pydantic import TypeAdapter
cache_dir = temp_models_dir / "caches" / model_id.normalize()
await aios.makedirs(cache_dir, exist_ok=True)
cached_list = [
FileListEntry(type="file", path="model.safetensors", size=1000),
FileListEntry(type="file", path="config.json", size=200),
]
cache_file = cache_dir / f"{model_id.normalize()}--main--file_list.json"
async with aiofiles.open(cache_file, "w") as f:
await f.write(
TypeAdapter(list[FileListEntry]).dump_json(cached_list).decode()
)
with patch(
"exo.download.download_utils.fetch_file_list_with_retry",
new_callable=AsyncMock,
) as mock_fetch:
result = await fetch_file_list_with_cache(
model_id, "main", skip_internet=True
)
assert result == cached_list
mock_fetch.assert_not_called()
async def test_falls_back_to_local_directory_scan(
self, model_id: ModelId, temp_models_dir: Path
) -> None:
"""When skip_internet=True and no cache but local files exist,
build file list from local directory."""
import json
model_dir = temp_models_dir / model_id.normalize()
await aios.makedirs(model_dir, exist_ok=True)
async with aiofiles.open(model_dir / "config.json", "w") as f:
await f.write('{"model_type": "qwen2"}')
index_data = {
"metadata": {},
"weight_map": {"model.layers.0.weight": "model.safetensors"},
}
async with aiofiles.open(model_dir / "model.safetensors.index.json", "w") as f:
await f.write(json.dumps(index_data))
async with aiofiles.open(model_dir / "model.safetensors", "wb") as f:
await f.write(b"x" * 500)
with patch(
"exo.download.download_utils.fetch_file_list_with_retry",
new_callable=AsyncMock,
) as mock_fetch:
result = await fetch_file_list_with_cache(
model_id, "main", skip_internet=True
)
mock_fetch.assert_not_called()
paths = {entry.path for entry in result}
assert "config.json" in paths
assert "model.safetensors" in paths
async def test_raises_when_no_cache_and_no_local_files(
self, model_id: ModelId, temp_models_dir: Path
) -> None:
"""When skip_internet=True and neither cache nor local files exist,
raise FileNotFoundError."""
with pytest.raises(FileNotFoundError, match="No internet"):
await fetch_file_list_with_cache(model_id, "main", skip_internet=True)
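
The fixtures above also imply the on-disk layout an air-gapped deployment must pre-stage. This is an inference from the test code, not a documented contract; the default location of `EXO_MODELS_DIR` and the exact output of `ModelId.normalize()` are assumptions:

```python
# Layout inferred from the fixtures above. The root directory and the
# normalized model id used here are hypothetical stand-ins.
from pathlib import Path

exo_models_dir = Path("/var/exo/models")      # hypothetical EXO_MODELS_DIR
norm = "test-org--test-model"                 # stand-in for ModelId.normalize()

model_dir = exo_models_dir / norm             # config.json, *.safetensors, index json
file_list_cache = exo_models_dir / "caches" / norm / f"{norm}--main--file_list.json"
print(model_dir, file_list_cache)
```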

View File

@@ -39,6 +39,7 @@ class Node:
node_id: NodeId
event_index_counter: Iterator[int]
offline: bool
_tg: TaskGroup = field(init=False, default_factory=anyio.create_task_group)
@classmethod
@@ -68,6 +69,7 @@ class Node:
download_command_receiver=router.receiver(topics.DOWNLOAD_COMMANDS),
local_event_sender=router.sender(topics.LOCAL_EVENTS),
event_index_counter=event_index_counter,
offline=args.offline,
)
else:
download_coordinator = None
@@ -132,10 +134,13 @@ class Node:
api,
node_id,
event_index_counter,
args.offline,
)
async def run(self):
async with self._tg as tg:
signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
signal.signal(signal.SIGTERM, lambda _, __: self.shutdown())
tg.start_soon(self.router.run)
tg.start_soon(self.election.run)
if self.download_coordinator:
@@ -147,8 +152,6 @@ class Node:
if self.api:
tg.start_soon(self.api.run)
tg.start_soon(self._elect_loop)
signal.signal(signal.SIGINT, lambda _, __: self.shutdown())
signal.signal(signal.SIGTERM, lambda _, __: self.shutdown())
def shutdown(self):
# if this is our second call to shutdown, just sys.exit
@@ -222,6 +225,7 @@ class Node:
),
local_event_sender=self.router.sender(topics.LOCAL_EVENTS),
event_index_counter=self.event_index_counter,
offline=self.offline,
)
self._tg.start_soon(self.download_coordinator.run)
if self.worker:
@@ -260,6 +264,9 @@ def main():
logger.info("Starting EXO")
logger.info(f"EXO_LIBP2P_NAMESPACE: {os.getenv('EXO_LIBP2P_NAMESPACE')}")
if args.offline:
logger.info("Running in OFFLINE mode — no internet checks, local models only")
# Set FAST_SYNCH override env var for runner subprocesses
if args.fast_synch is True:
os.environ["EXO_FAST_SYNCH"] = "on"
@@ -282,6 +289,7 @@ class Args(CamelCaseModel):
tb_only: bool = False
no_worker: bool = False
no_downloads: bool = False
offline: bool = False
fast_synch: bool | None = None # None = auto, True = force on, False = force off
@classmethod
@@ -329,6 +337,11 @@ class Args(CamelCaseModel):
action="store_true",
help="Disable the download coordinator (node won't download models)",
)
parser.add_argument(
"--offline",
action="store_true",
help="Run in offline/air-gapped mode: skip internet checks, use only pre-staged local models",
)
fast_synch_group = parser.add_mutually_exclusive_group()
fast_synch_group.add_argument(
"--fast-synch",

View File

@@ -144,8 +144,8 @@ async def collect_responses_response(
for tool in chunk.tool_calls:
function_call_items.append(
ResponseFunctionCallItem(
id=f"fc_{tool.id}",
call_id=f"call_{tool.id}",
id=tool.id,
call_id=tool.id,
name=tool.name,
arguments=tool.arguments,
)

View File

@@ -85,6 +85,7 @@ from exo.shared.types.api import (
ImageGenerationTaskParams,
ImageListItem,
ImageListResponse,
ImageSize,
ModelList,
ModelListModel,
PlaceInstanceParams,
@@ -100,6 +101,7 @@ from exo.shared.types.api import (
TraceRankStats,
TraceResponse,
TraceStatsResponse,
normalize_image_size,
)
from exo.shared.types.chunks import (
ErrorChunk,
@@ -751,9 +753,11 @@ class API:
When stream=True and partial_images > 0, returns a StreamingResponse
with SSE-formatted events for partial and final images.
"""
payload.model = await self._validate_image_model(ModelId(payload.model))
payload = payload.model_copy(
update={"advanced_params": _ensure_seed(payload.advanced_params)}
update={
"model": await self._validate_image_model(ModelId(payload.model)),
"advanced_params": _ensure_seed(payload.advanced_params),
}
)
command = ImageGeneration(
@@ -1009,12 +1013,13 @@ class API:
async def bench_image_generations(
self, request: Request, payload: BenchImageGenerationTaskParams
) -> BenchImageGenerationResponse:
payload.model = await self._validate_image_model(ModelId(payload.model))
payload.stream = False
payload.partial_images = 0
payload = payload.model_copy(
update={"advanced_params": _ensure_seed(payload.advanced_params)}
update={
"model": await self._validate_image_model(ModelId(payload.model)),
"stream": False,
"partial_images": 0,
"advanced_params": _ensure_seed(payload.advanced_params),
}
)
command = ImageGeneration(
@@ -1035,7 +1040,7 @@ class API:
prompt: str,
model: ModelId,
n: int,
size: str,
size: ImageSize,
response_format: Literal["url", "b64_json"],
input_fidelity: Literal["low", "high"],
stream: bool,
@@ -1105,7 +1110,7 @@ class API:
prompt: str = Form(...),
model: str = Form(...),
n: int = Form(1),
size: str = Form("1024x1024"),
size: str | None = Form(None),
response_format: Literal["url", "b64_json"] = Form("b64_json"),
input_fidelity: Literal["low", "high"] = Form("low"),
stream: str = Form("false"),
@@ -1131,7 +1136,7 @@ class API:
prompt=prompt,
model=ModelId(model),
n=n,
size=size,
size=normalize_image_size(size),
response_format=response_format,
input_fidelity=input_fidelity,
stream=stream_bool,
@@ -1167,7 +1172,7 @@ class API:
prompt: str = Form(...),
model: str = Form(...),
n: int = Form(1),
size: str = Form("1024x1024"),
size: str | None = Form(None),
response_format: Literal["url", "b64_json"] = Form("b64_json"),
input_fidelity: Literal["low", "high"] = Form("low"),
quality: Literal["high", "medium", "low"] = Form("medium"),
@@ -1187,7 +1192,7 @@ class API:
prompt=prompt,
model=ModelId(model),
n=n,
size=size,
size=normalize_image_size(size),
response_format=response_format,
input_fidelity=input_fidelity,
stream=False,

View File

@@ -218,11 +218,6 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
key: value for key, value in state.downloads.items() if key != event.node_id
}
# Clean up all granular node mappings
node_identities = {
key: value
for key, value in state.node_identities.items()
if key != event.node_id
}
node_memory = {
key: value for key, value in state.node_memory.items() if key != event.node_id
}
@@ -263,7 +258,6 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
"downloads": downloads,
"topology": topology,
"last_seen": last_seen,
"node_identities": node_identities,
"node_memory": node_memory,
"node_disk": node_disk,
"node_system": node_system,

View File

@@ -44,7 +44,8 @@ async def _refresh_card_cache():
async for toml_file in path.rglob("*.toml"):
try:
card = await ModelCard.load_from_path(toml_file)
_card_cache[card.model_id] = card
if card.model_id not in _card_cache:
_card_cache[card.model_id] = card
except (ValidationError, TOMLKitError):
pass
@@ -182,6 +183,7 @@ class ConfigData(BaseModel):
def supports_tensor(self) -> bool:
return self.architectures in [
["Glm4MoeLiteForCausalLM"],
["GlmMoeDsaForCausalLM"],
["DeepseekV32ForCausalLM"],
["DeepseekV3ForCausalLM"],
["Qwen3NextForCausalLM"],

View File

@@ -1,9 +1,9 @@
import time
from collections.abc import Generator
from typing import Annotated, Any, Literal
from typing import Annotated, Any, Literal, get_args
from uuid import uuid4
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, field_validator
from exo.shared.models.model_cards import ModelCard, ModelId
from exo.shared.types.common import CommandId, NodeId
@@ -262,6 +262,27 @@ class DeleteInstanceResponse(BaseModel):
instance_id: InstanceId
ImageSize = Literal[
"auto",
"512x512",
"768x768",
"1024x768",
"768x1024",
"1024x1024",
"1024x1536",
"1536x1024",
]
def normalize_image_size(v: object) -> ImageSize:
"""Shared validator for ImageSize fields: maps None → "auto" and rejects invalid values."""
if v is None:
return "auto"
if v not in get_args(ImageSize):
raise ValueError(f"Invalid size: {v!r}. Must be one of {get_args(ImageSize)}")
return v # pyright: ignore[reportReturnType]
class AdvancedImageParams(BaseModel):
seed: Annotated[int, Field(ge=0)] | None = None
num_inference_steps: Annotated[int, Field(ge=1, le=100)] | None = None
@@ -281,7 +302,7 @@ class ImageGenerationTaskParams(BaseModel):
partial_images: int | None = 0
quality: Literal["high", "medium", "low"] | None = "medium"
response_format: Literal["url", "b64_json"] | None = "b64_json"
size: str | None = "1024x1024"
size: ImageSize = "auto"
stream: bool | None = False
style: str | None = "vivid"
user: str | None = None
@@ -289,6 +310,11 @@ class ImageGenerationTaskParams(BaseModel):
# Internal flag for benchmark mode - set by API, preserved through serialization
bench: bool = False
@field_validator("size", mode="before")
@classmethod
def normalize_size(cls, v: object) -> ImageSize:
return normalize_image_size(v)
class BenchImageGenerationTaskParams(ImageGenerationTaskParams):
bench: bool = True
@@ -305,13 +331,18 @@ class ImageEditsTaskParams(BaseModel):
quality: Literal["high", "medium", "low"] | None = "medium"
output_format: Literal["png", "jpeg", "webp"] = "png"
response_format: Literal["url", "b64_json"] | None = "b64_json"
size: str | None = "1024x1024"
size: ImageSize = "auto"
image_strength: float | None = 0.7
stream: bool = False
partial_images: int | None = 0
advanced_params: AdvancedImageParams | None = None
bench: bool = False
@field_validator("size", mode="before")
@classmethod
def normalize_size(cls, v: object) -> ImageSize:
return normalize_image_size(v)
def __repr_args__(self) -> Generator[tuple[str, Any], None, None]:
for name, value in super().__repr_args__(): # pyright: ignore[reportAny]
if name == "image_data":

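`normalize_image_size` backs both `field_validator`s above as well as the form endpoints patched earlier in this diff. A short demonstration of its contract (import path taken from the `exo.shared.types.api` import block shown earlier):

```python
# Demonstrates the contract of normalize_image_size defined above.
from exo.shared.types.api import normalize_image_size

assert normalize_image_size(None) == "auto"              # None maps to "auto"
assert normalize_image_size("1024x1024") == "1024x1024"  # valid literal passes through
try:
    normalize_image_size("640x480")                      # not in the ImageSize Literal
except ValueError as e:
    print(e)
```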
View File

@@ -26,6 +26,7 @@ class DownloadProgressData(CamelCaseModel):
class BaseDownloadProgress(TaggedModel):
node_id: NodeId
shard_metadata: ShardMetadata
model_directory: str = ""
class DownloadPending(BaseDownloadProgress):

View File

@@ -1,5 +1,7 @@
import sys
def print_startup_banner(port: int) -> None:
"""Print a prominent startup banner with API endpoint information."""
dashboard_url = f"http://localhost:{port}"
banner = f"""
╔═══════════════════════════════════════════════════════════════════════╗
@@ -27,4 +29,4 @@ def print_startup_banner(port: int) -> None:
"""
print(banner)
print(banner, file=sys.stderr)

View File

@@ -14,6 +14,7 @@ from exo.shared.types.api import (
ImageEditsTaskParams,
ImageGenerationStats,
ImageGenerationTaskParams,
ImageSize,
)
from exo.shared.types.memory import Memory
from exo.shared.types.worker.runner_response import (
@@ -23,9 +24,9 @@ from exo.shared.types.worker.runner_response import (
from exo.worker.engines.image.distributed_model import DistributedImageModel
def parse_size(size_str: str | None) -> tuple[int, int]:
def parse_size(size_str: ImageSize) -> tuple[int, int]:
"""Parse size parameter like '1024x1024' to (width, height) tuple."""
if not size_str:
if size_str == "auto":
return (1024, 1024)
try:
@@ -109,6 +110,9 @@ def generate_image(
# Decode base64 image data and save to temp file
image_path = Path(tmpdir) / "input.png"
image_path.write_bytes(base64.b64decode(task.image_data))
if task.size == "auto":
with Image.open(image_path) as img:
width, height = img.size
for image_num in range(num_images):
# Increment seed for each image to ensure unique results

View File

@@ -163,11 +163,14 @@ class PipelineLastLayer(CustomMlxLayer):
output, (self.r + 1) % self.s, group=self.group
)
if cache is not None:
cache.keys = mx.depends(cache.keys, output) # type: ignore[reportUnknownMemberType]
# CacheList (used by MLA models like DeepSeekV32, GLM MoE DSA)
# doesn't have .keys directly; access via first sub-cache.
_cache = cache[0] if hasattr(cache, "caches") else cache # type: ignore
_cache.keys = mx.depends(_cache.keys, output) # type: ignore
if self.is_prefill:
mx.eval(output)
if cache is not None:
mx.eval(cache.keys) # type: ignore
mx.eval(_cache.keys) # type: ignore
if not self.is_prefill:
output = mx.distributed.all_gather(output, group=self.group)[
@@ -307,7 +310,9 @@ def patch_pipeline_model[T](model: T, group: mx.distributed.Group) -> T:
# Add dependency to last cache entry to ensure distributed ops are evaluated
if cache is not None:
cache[-1].state = mx.depends(cache[-1].state, logits) # type: ignore
last = cache[-1] # type: ignore
dep_cache = last[0] if hasattr(last, "caches") else last # type: ignore
dep_cache.keys = mx.depends(dep_cache.keys, logits) # type: ignore
return logits
@@ -333,7 +338,9 @@ def patch_tensor_model[T](model: T) -> T:
# Add dependency to last cache entry to ensure distributed ops are evaluated
if cache is not None and len(cache) > 0: # pyright: ignore[reportAny]
cache[-1].state = mx.depends(cache[-1].state, logits) # pyright: ignore[reportAny,reportUnknownMemberType]
last = cache[-1] # pyright: ignore[reportAny]
dep_cache = last[0] if hasattr(last, "caches") else last # pyright: ignore[reportAny]
dep_cache.keys = mx.depends(dep_cache.keys, logits) # pyright: ignore[reportAny,reportUnknownMemberType]
return logits
@@ -547,10 +554,12 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
on_timeout: TimeoutCallback | None,
) -> nn.Module:
model = cast(DeepseekV3Model, model)
for layer in model.layers:
eval_with_timeout(
layer.parameters(), timeout_seconds / len(model.layers), on_timeout
)
# Shard the self attention
if layer.self_attn.q_lora_rank is None:
layer.self_attn.q_proj = self.all_to_sharded_linear(
@@ -581,12 +590,18 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
# Shard the MoE. Shard in place since the MoE should be responsible
# for aggregating the results.
# Shard the MoE.
else:
self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.gate_proj)
self.sharded_to_all_linear_in_place(layer.mlp.shared_experts.down_proj)
self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.up_proj)
if getattr(layer.mlp, "shared_experts", None) is not None:
self.all_to_sharded_linear_in_place(
layer.mlp.shared_experts.gate_proj
)
self.sharded_to_all_linear_in_place(
layer.mlp.shared_experts.down_proj
)
self.all_to_sharded_linear_in_place(
layer.mlp.shared_experts.up_proj
)
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
@@ -779,8 +794,7 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
layer.self_attn = WrappedMiniMaxAttention(layer.self_attn, self.group) # pyright: ignore[reportAttributeAccessIssue,reportArgumentType]
# Shard the MoE. Shard in place since the MoE should be responsible
# for aggregating the results.
# Shard the MoE.
self.all_to_sharded_linear_in_place(
layer.block_sparse_moe.switch_mlp.gate_proj
)
@@ -893,8 +907,7 @@ class QwenShardingStrategy(TensorParallelShardingStrategy):
layer.self_attn.num_attention_heads //= self.N
layer.self_attn.num_key_value_heads //= self.N
# Shard the MoE. Shard in place since the MoE should be responsible
# for aggregating the results.
# Shard the MoE.
if isinstance(layer.mlp, (Qwen3MoeSparseMoeBlock, Qwen3NextSparseMoeBlock)):
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)

View File

@@ -57,6 +57,7 @@ def prefill(
sampler: Callable[[mx.array], mx.array],
prompt_tokens: mx.array,
cache: KVCacheType,
group: mx.distributed.Group | None = None,
) -> tuple[float, int, list[CacheSnapshot]]:
"""Prefill the KV cache with prompt tokens.
@@ -86,6 +87,9 @@ def prefill(
set_pipeline_prefill(model, is_prefill=True)
mx_barrier(group)
logger.info("Starting prefill")
# Use max_tokens=1 because max_tokens=0 does not work.
# We just throw away the generated token - we only care about filling the cache
for _ in stream_generate(
@@ -305,16 +309,9 @@ def mlx_generate(
)
max_stop_len = max((len(s) for s in stop_sequences), default=0)
mx_barrier(group)
logger.info("Ready to prefill")
# Prefill cache with all tokens except the last one
prefill_tps, prefill_tokens, ssm_snapshots_list = prefill(
model,
tokenizer,
sampler,
prompt_tokens[:-1],
caches,
model, tokenizer, sampler, prompt_tokens[:-1], caches, group
)
cache_snapshots: list[CacheSnapshot] | None = ssm_snapshots_list or None
@@ -331,6 +328,7 @@ def mlx_generate(
think_start = tokenizer.think_start
think_end = tokenizer.think_end
logger.info("Starting decode")
mx_barrier(group)
for completion_tokens, out in enumerate(

View File

@@ -285,10 +285,12 @@ def get_eos_token_ids_for_model(model_id: ModelId) -> list[int] | None:
model_id_lower = model_id.lower()
if "kimi-k2" in model_id_lower:
return [163586]
elif "glm-4.7-flash" in model_id_lower:
elif "glm-5" in model_id_lower or "glm-4.7" in model_id_lower:
# For GLM-5 and GLM-4.7
# 154820: <|endoftext|>, 154827: <|user|>, 154829: <|observation|>
return [154820, 154827, 154829]
elif "glm" in model_id_lower:
# For GLM-4.5 and older
return [151336, 151329, 151338]
return None
@@ -353,7 +355,13 @@ def load_tokenizer_for_model_id(
return list(hf_tokenizer.model.encode(text, allowed_special="all")) # pyright: ignore[reportUnknownMemberType,reportUnknownArgumentType]
hf_tokenizer.encode = _patched_encode
return TokenizerWrapper(hf_tokenizer, eos_token_ids=eos_token_ids)
return TokenizerWrapper(
hf_tokenizer,
eos_token_ids=eos_token_ids,
tool_call_start="<|tool_calls_section_begin|>",
tool_call_end="<|tool_calls_section_end|>",
tool_parser=_parse_kimi_tool_calls,
)
tokenizer = load_tokenizer(
model_path,
@@ -585,3 +593,41 @@ def mx_barrier(group: Group | None):
mx.array(1.0), group=group, stream=mx.default_stream(mx.Device(mx.cpu))
)
)
def _parse_kimi_tool_calls(text: str):
import regex as re
# kimi has a fixed function naming scheme, with a json formatted arg
# functions.multiply:0<|tool_call_argument_begin|>{"a": 2, "b": 3}
_func_name_regex = re.compile(
r"^\s*((?:functions\.)?(.+?):\d+)\s*<\|tool_call_argument_begin\|>", re.DOTALL
)
_func_arg_regex = re.compile(r"<\|tool_call_argument_begin\|>\s*(.*)\s*", re.DOTALL)
_tool_call_split_regex = re.compile(
r"<\|tool_call_begin\|>(.*?)<\|tool_call_end\|>", re.DOTALL
)
def _parse_single_tool(text: str) -> dict[str, Any]:
func_name_match = _func_name_regex.search(text)
if func_name_match is None:
raise ValueError("No tool call found.")
tool_call_id = func_name_match.group(1) # e.g. "functions.get_weather:0"
func_name = func_name_match.group(2) # e.g. "get_weather"
func_args_match = _func_arg_regex.search(text)
if func_args_match is None:
raise ValueError("No tool call arguments found.")
func_args = func_args_match.group(1)
try:
arg_dct = json.loads(func_args) # pyright: ignore[reportAny]
except Exception:
arg_dct = None
return dict(id=tool_call_id, name=func_name, arguments=arg_dct)
tool_matches = _tool_call_split_regex.findall(text)
if tool_matches:
return [_parse_single_tool(match) for match in tool_matches] # pyright: ignore[reportAny]
else:
return [_parse_single_tool(text)]
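
A worked example of `_parse_kimi_tool_calls` above, following the fixed `functions.<name>:<index>` scheme noted in its comment. The import path is an assumption based on the `exo.worker.engines.mlx.utils_mlx` imports seen elsewhere in this diff:

```python
# Worked example for _parse_kimi_tool_calls above; the module path is assumed.
from exo.worker.engines.mlx.utils_mlx import _parse_kimi_tool_calls

sample = (
    '<|tool_call_begin|>functions.multiply:0'
    '<|tool_call_argument_begin|>{"a": 2, "b": 3}<|tool_call_end|>'
)
print(_parse_kimi_tool_calls(sample))
# -> [{'id': 'functions.multiply:0', 'name': 'multiply', 'arguments': {'a': 2, 'b': 3}}]
```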

View File

@@ -1,11 +1,10 @@
import base64
import json
import math
import resource
import time
from collections.abc import Generator
from functools import cache
from typing import Any, Callable, Literal
from typing import Literal
import mlx.core as mx
from mlx_lm.models.gpt_oss import Model as GptOssModel
@@ -16,7 +15,6 @@ from openai_harmony import ( # pyright: ignore[reportMissingTypeStubs]
StreamableParser,
load_harmony_encoding,
)
from pydantic import ValidationError
from exo.shared.constants import EXO_MAX_CHUNK_SIZE, EXO_TRACING_ENABLED
from exo.shared.models.model_cards import ModelId, ModelTask
@@ -93,6 +91,8 @@ from exo.worker.engines.mlx.utils_mlx import (
)
from exo.worker.runner.bootstrap import logger
from .tool_parsers import ToolParser, make_mlx_parser
def _is_primary_output_node(shard_metadata: ShardMetadata) -> bool:
"""Check if this node is the primary output node for image generation.
@@ -138,6 +138,7 @@ def main(
inference_model: Model | None = None
image_model: DistributedImageModel | None = None
tokenizer = None
tool_parser: ToolParser | None = None
group = None
kv_prefix_cache: KVPrefixCache | None = None
check_for_cancel_every: int | None = None
@@ -203,8 +204,17 @@ def main(
bound_instance, group, on_timeout=on_model_load_timeout
)
logger.info(
f"model has_tool_calling={tokenizer.has_tool_calling}"
f"model has_tool_calling={tokenizer.has_tool_calling} using tokens {tokenizer.tool_call_start}, {tokenizer.tool_call_end}"
)
if tokenizer.has_tool_calling:
assert tokenizer.tool_call_start
assert tokenizer.tool_call_end
assert tokenizer.tool_parser # pyright: ignore[reportAny]
tool_parser = make_mlx_parser(
tokenizer.tool_call_start,
tokenizer.tool_call_end,
tokenizer.tool_parser, # pyright: ignore[reportAny]
)
kv_prefix_cache = KVPrefixCache(group)
elif (
@@ -310,31 +320,11 @@ def main(
mlx_generator, tokenizer
)
# Kimi-K2 has tool call sections - we don't care about them
if "kimi" in shard_metadata.model_card.model_id.lower():
mlx_generator = filter_kimi_tokens(mlx_generator)
patch_kimi_tokenizer(tokenizer)
# GLM models need patched parser (upstream has bug with None regex match)
elif "glm" in shard_metadata.model_card.model_id.lower():
patch_glm_tokenizer(tokenizer)
# GPT-OSS specific parsing to match other model formats.
elif isinstance(inference_model, GptOssModel):
if isinstance(inference_model, GptOssModel):
mlx_generator = parse_gpt_oss(mlx_generator)
if tokenizer.has_tool_calling and not isinstance(
inference_model, GptOssModel
):
assert tokenizer.tool_call_start
assert tokenizer.tool_call_end
assert tokenizer.tool_parser # pyright: ignore[reportAny]
mlx_generator = parse_tool_calls(
mlx_generator,
tokenizer.tool_call_start,
tokenizer.tool_call_end,
tokenizer.tool_parser, # pyright: ignore[reportAny]
)
elif tool_parser:
mlx_generator = parse_tool_calls(mlx_generator, tool_parser)
completion_tokens = 0
tokens_since_last_cancel_check = 0
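
The block above composes the token stream by wrapping one generator in another (parse_gpt_oss, then parse_tool_calls). A standalone sketch of that pipeline pattern, with invented stage names:

# Each stage wraps the previous generator and transforms or filters items,
# so stages can be added or skipped conditionally, as in main() above.
from collections.abc import Generator

def source() -> Generator[str]:
    yield from ["a", "<skip>", "b"]

def drop_markers(gen: Generator[str]) -> Generator[str]:
    for item in gen:
        if item != "<skip>":  # cf. parse_gpt_oss dropping channel tokens
            yield item

def upper(gen: Generator[str]) -> Generator[str]:
    for item in gen:
        yield item.upper()

gen = source()
gen = drop_markers(gen)
gen = upper(gen)
print(list(gen))  # ['A', 'B']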
@@ -587,21 +577,8 @@ def get_gpt_oss_encoding():
return encoding
def filter_kimi_tokens(
responses: Generator[GenerationResponse | ToolCallResponse],
) -> Generator[GenerationResponse]:
for resp in responses:
assert isinstance(resp, GenerationResponse)
if (
resp.text == "<|tool_calls_section_begin|>"
or resp.text == "<|tool_calls_section_end|>"
):
continue
yield resp
def parse_gpt_oss(
responses: Generator[GenerationResponse | ToolCallResponse],
responses: Generator[GenerationResponse],
) -> Generator[GenerationResponse | ToolCallResponse]:
encoding = get_gpt_oss_encoding()
stream = StreamableParser(encoding, role=Role.ASSISTANT)
@@ -658,9 +635,9 @@ def parse_gpt_oss(
def parse_thinking_models(
responses: Generator[GenerationResponse | ToolCallResponse],
responses: Generator[GenerationResponse],
tokenizer: TokenizerWrapper,
) -> Generator[GenerationResponse | ToolCallResponse]:
) -> Generator[GenerationResponse]:
"""
For models that inject thinking tags in the prompt (like GLM-4.7),
prepend the thinking tag to the output stream so the frontend
@@ -781,221 +758,55 @@ def _process_image_response(
def parse_tool_calls(
responses: Generator[GenerationResponse | ToolCallResponse],
tool_call_start: str,
tool_call_end: str,
tool_parser: Callable[[str], dict[str, Any] | list[dict[str, Any]]],
responses: Generator[GenerationResponse], tool_parser: ToolParser
) -> Generator[GenerationResponse | ToolCallResponse]:
in_tool_call = False
tool_call_text_parts: list[str] = []
for response in responses:
assert isinstance(response, GenerationResponse)
# assumption: the tool call start is one token
if response.text == tool_call_start:
if response.text.startswith(tool_parser.start_parsing):
in_tool_call = True
continue
# assumption: the tool call end is one token
if in_tool_call and response.text == tool_call_end:
try:
# tool_parser returns an arbitrarily nested python dictionary
# we actually don't want the python dictionary, we just want to
# parse the top level { function: ..., arguments: ... } structure
# as we're just gonna hand it back to the api anyway
parsed = tool_parser("".join(tool_call_text_parts).strip())
logger.info(f"parsed {tool_call_text_parts=} into {parsed=}")
if isinstance(parsed, list):
tools = [_validate_single_tool(tool) for tool in parsed]
else:
tools = [_validate_single_tool(parsed)]
yield ToolCallResponse(
tool_calls=tools, usage=response.usage, stats=response.stats
)
except (
json.JSONDecodeError,
ValidationError,
ValueError,
AttributeError,
) as e:
# ValueError: our parsers raise this for malformed tool calls
# AttributeError: upstream parsers (e.g. glm47) may raise this when regex doesn't match
logger.opt(exception=e).warning("tool call parsing failed")
# assumption: talking about tool calls, not making a tool call
response.text = (
tool_call_start + "".join(tool_call_text_parts) + tool_call_end
)
yield response
in_tool_call = False
tool_call_text_parts = []
continue
if in_tool_call:
tool_call_text_parts.append(response.text)
if response.text.endswith(tool_parser.end_parsing):
# parse the actual tool calls from the tool call text
parsed = tool_parser.parse_tool_calls(
"".join(tool_call_text_parts).strip()
)
logger.info(f"parsed {tool_call_text_parts=} into {parsed=}")
if parsed is not None:
yield ToolCallResponse(
tool_calls=parsed, usage=response.usage, stats=response.stats
)
else:
logger.warning(
f"tool call parsing failed for text {''.join(tool_call_text_parts)}"
)
response.text = "".join(tool_call_text_parts)
yield response
in_tool_call = False
tool_call_text_parts = []
continue
if response.finish_reason is not None:
logger.info(
"toll call parsing interrupted, yield partial tool call as text"
"tool call parsing interrupted, yield partial tool call as text"
)
yield GenerationResponse(
text=tool_call_start + "".join(tool_call_text_parts),
token=0,
finish_reason=response.finish_reason,
usage=response.usage,
stats=response.stats,
response = response.model_copy(
update={
"text": "".join(tool_call_text_parts),
"token": 0,
}
)
yield response
continue
# fallthrough
yield response
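
The loop above is easier to see in miniature. A simplified, self-contained sketch of the same accumulation logic, using plain strings in place of exo's GenerationResponse/ToolCallResponse stream (all names here are illustrative):

from dataclasses import dataclass
from typing import Callable

@dataclass
class MiniParser:  # stand-in for ToolParser
    start_parsing: str
    end_parsing: str
    parse: Callable[[str], str | None]

def collect(chunks: list[str], p: MiniParser):
    buf: list[str] = []
    in_call = False
    for text in chunks:
        if text.startswith(p.start_parsing):  # start marker: begin buffering
            in_call = True
            continue
        if in_call:
            buf.append(text)
            if text.endswith(p.end_parsing):  # end marker: parse the buffer
                yield p.parse("".join(buf).strip())
                in_call, buf = False, []
            continue
        yield text  # ordinary text passes through untouched

mini = MiniParser("<tool_call>", "</tool_call>",
                  lambda s: s.removesuffix("</tool_call>"))
print(list(collect(["hi ", "<tool_call>", '{"a": 1}', "</tool_call>"], mini)))
# -> ['hi ', '{"a": 1}']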
def patch_kimi_tokenizer(tokenizer: TokenizerWrapper):
"""
Version of to-be-upstreamed kimi-k2 tool parser
"""
import ast
import json
from typing import Any
import regex as re
# kimi has a fixed function naming scheme, with a json formatted arg
# functions.multiply:0 <|tool_call_argument_begin|> {"a": 2, "b": 3}
# Also needs to handle tools like call_0<|tool_call_argument_begin|>{"filePath": "..."}
_func_name_regex = re.compile(
r"^\s*(.+)[:](\d+)\s*<\|tool_call_argument_begin\|>", re.DOTALL
)
_func_arg_regex = re.compile(r"<\|tool_call_argument_begin\|>\s*(.*)\s*", re.DOTALL)
# kimi has a tool_calls_section - we're leaving this up to the caller to handle
tool_call_start = "<|tool_call_begin|>"
tool_call_end = "<|tool_call_end|>"
def _deserialize(value: str) -> Any: # pyright: ignore[reportAny]
try:
return json.loads(value) # pyright: ignore[reportAny]
except Exception:
pass
try:
return ast.literal_eval(value) # pyright: ignore[reportAny]
except Exception:
pass
return value
def parse_tool_call(text: str, tools: Any | None = None):
func_name_match = _func_name_regex.search(text)
if func_name_match is None:
raise ValueError(f"Could not parse function name from tool call: {text!r}")
original_func_name = func_name_match.group(1)
tool_id = func_name_match.group(2)
# strip off the `functions.` prefix, if it exists.
func_name = original_func_name[original_func_name.find(".") + 1 :]
func_args_match = _func_arg_regex.search(text)
if func_args_match is None:
raise ValueError(f"Could not parse function args from tool call: {text!r}")
func_args = func_args_match.group(1)
# the args should be valid json - no need to check against our tools to deserialize
arg_dct = _deserialize(func_args) # pyright: ignore[reportAny]
return dict(
id=f"{original_func_name}:{tool_id}",
name=func_name,
arguments=arg_dct, # pyright: ignore[reportAny]
)
tokenizer._tool_call_start = tool_call_start
tokenizer._tool_call_end = tool_call_end
tokenizer._tool_parser = parse_tool_call
def patch_glm_tokenizer(tokenizer: TokenizerWrapper):
"""
Fixed version of mlx_lm's glm47 tool parser that handles regex match failures.
"""
import ast
import json
from typing import Any
import regex as re
_func_name_regex = re.compile(r"^(.*?)<arg_key>", re.DOTALL)
_func_arg_regex = re.compile(
r"<arg_key>(.*?)</arg_key>(?:\n|\s)*<arg_value>(.*?)(?:</arg_value>|(?=<arg_key>)|$)",
re.DOTALL,
)
tool_call_start = "<tool_call>"
tool_call_end = "</tool_call>"
def _is_string_type(
tool_name: str,
arg_name: str,
tools: list[Any] | None,
) -> bool:
if tools is None:
return False
for tool in tools: # pyright: ignore[reportAny]
func = tool["function"] # pyright: ignore[reportAny]
if func["name"] == tool_name:
params = func["parameters"] # pyright: ignore[reportAny]
if params is None:
return False
props = params.get("properties", {}) # pyright: ignore[reportAny]
arg_props = props.get(arg_name, {}) # pyright: ignore[reportAny]
arg_type = arg_props.get("type", None) # pyright: ignore[reportAny]
return arg_type == "string" # pyright: ignore[reportAny]
return False
def _deserialize(value: str) -> Any: # pyright: ignore[reportAny]
try:
return json.loads(value) # pyright: ignore[reportAny]
except Exception:
pass
try:
return ast.literal_eval(value) # pyright: ignore[reportAny]
except Exception:
pass
return value
def parse_tool_call(text: str, tools: list[Any] | None = None):
func_name_match = _func_name_regex.search(text)
if func_name_match is None:
raise ValueError(f"Could not parse function name from tool call: {text!r}")
func_name = func_name_match.group(1)
pairs = _func_arg_regex.findall(text)
arg_dct: dict[str, Any] = {}
for key, value in pairs: # pyright: ignore[reportAny]
arg_key = key.strip() # pyright: ignore[reportAny]
arg_val = value.strip() # pyright: ignore[reportAny]
if not _is_string_type(func_name, arg_key, tools): # pyright: ignore[reportAny]
arg_val = _deserialize(arg_val) # pyright: ignore[reportAny]
arg_dct[arg_key] = arg_val
return dict(name=func_name, arguments=arg_dct)
tokenizer._tool_call_start = tool_call_start
tokenizer._tool_call_end = tool_call_end
tokenizer._tool_parser = parse_tool_call
def _validate_single_tool(obj: dict[str, Any]) -> ToolCallItem:
if (
((name := obj.get("name")) is not None)
and ((args := obj.get("arguments")) is not None)
and isinstance(name, str)
):
raw_id: object = obj.get("id")
extra = {"id": str(raw_id)} if raw_id is not None else {}
return ToolCallItem(
**extra,
name=name,
arguments=json.dumps(args),
)
else:
raise ValidationError
EXO_RUNNER_MUST_FAIL = "EXO RUNNER MUST FAIL"
EXO_RUNNER_MUST_OOM = "EXO RUNNER MUST OOM"
EXO_RUNNER_MUST_TIMEOUT = "EXO RUNNER MUST TIMEOUT"

View File

@@ -191,7 +191,7 @@ class RunnerSupervisor:
logger.info("Checking runner's status")
if self.runner_process.is_alive():
logger.info("Runner was found to be alive, attempting to join process")
await to_thread.run_sync(self.runner_process.join, 1)
await to_thread.run_sync(self.runner_process.join, 5)
rc = self.runner_process.exitcode
logger.info(f"RunnerSupervisor exited with exit code {rc}")
if rc == 0:
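
For context, the changed line follows anyio's run-blocking-call-in-a-thread pattern; bumping the join timeout from 1 to 5 seconds gives the runner process longer to exit before the supervisor reads its exit code. A minimal sketch of the pattern, with a placeholder worker body:

import multiprocessing as mp
import anyio
from anyio import to_thread

def work() -> None:
    pass  # placeholder for the runner body

async def supervise() -> None:
    proc = mp.Process(target=work)
    proc.start()
    # Process.join(5) blocks, so run it in a worker thread to keep the
    # event loop responsive; exitcode stays None if the process is alive.
    await to_thread.run_sync(proc.join, 5)
    print("exitcode:", proc.exitcode)

if __name__ == "__main__":
    anyio.run(supervise)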

View File

@@ -0,0 +1,72 @@
import json
from dataclasses import dataclass
from typing import Any, Callable
from exo.shared.types.api import ToolCallItem
@dataclass
class ToolParser:
start_parsing: str
end_parsing: str
parse_tool_calls: Callable[[str], list[ToolCallItem] | None]
def make_mlx_parser(
tool_call_start: str,
tool_call_end: str,
tool_parser: Callable[[str], dict[str, Any] | list[dict[str, Any]]],
) -> ToolParser:
def parse_tool_calls(text: str) -> list[ToolCallItem] | None:
try:
text = text.removeprefix(tool_call_start)
text = text.removesuffix(tool_call_end)
parsed = tool_parser(text)
if isinstance(parsed, list):
return [ToolCallItem.model_validate(_flatten(p)) for p in parsed]
else:
return [ToolCallItem.model_validate(_flatten(parsed))]
except Exception:
return None
return ToolParser(
start_parsing=tool_call_start,
end_parsing=tool_call_end,
parse_tool_calls=parse_tool_calls,
)
# TODO / example code:
def _parse_json_calls(text: str) -> list[ToolCallItem] | None:
try:
text = text.removeprefix("<tool_call>")
text = text.removesuffix("</tool_call>")
top_level = {
k: json.dumps(v) if isinstance(v, (dict, list)) else v
for k, v in json.loads(text).items() # pyright: ignore[reportAny]
}
return [ToolCallItem.model_validate(top_level)]
except Exception:
return None
def _flatten(p: dict[str, Any]) -> dict[str, str]:
return {
k: json.dumps(v) if isinstance(v, (dict, list)) else str(v) # pyright: ignore[reportAny]
for k, v in p.items() # pyright: ignore[reportAny]
}
json_tool_parser = ToolParser(
start_parsing="<tool_call>",
end_parsing="</tool_call>",
parse_tool_calls=_parse_json_calls,
)
def infer_tool_parser(chat_template: str) -> ToolParser | None:
"""Attempt to auto-infer a tool parser from the chat template."""
if "<tool_call>" in chat_template and "tool_call.name" in chat_template:
return json_tool_parser
return None
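
Illustrative usage of the module above. The raw parser below is hypothetical, and the ToolCallItem field names (name, arguments, id) are inferred from _flatten and _parse_json_calls rather than shown in this diff:

from exo.worker.runner.tool_parsers import make_mlx_parser

def raw_parser(text: str):  # hypothetical model-specific parser
    return {"name": "get_weather", "arguments": {"city": "SF"}}

parser = make_mlx_parser("<tool_call>", "</tool_call>", raw_parser)
items = parser.parse_tool_calls('<tool_call>{"city": "SF"}</tool_call>')
# _flatten json-dumps the nested arguments dict, so items validates to
# [ToolCallItem(name="get_weather", arguments='{"city": "SF"}')];
# any exception inside the raw parser yields None instead.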

View File

@@ -5,12 +5,13 @@ from typing import Any
from exo.shared.types.worker.runner_response import GenerationResponse, ToolCallResponse
from exo.worker.runner.runner import parse_tool_calls
from exo.worker.runner.tool_parsers import make_mlx_parser
def _make_responses(
texts: list[str],
finish_on_last: bool = True,
) -> Generator[GenerationResponse | ToolCallResponse]:
) -> Generator[GenerationResponse]:
"""Create a sequence of GenerationResponses from text strings."""
for i, text in enumerate(texts):
is_last = i == len(texts) - 1
@@ -22,10 +23,13 @@ def _make_responses(
)
def _dummy_parser(text: str) -> dict[str, Any]:
def _dummier_parser(text: str) -> dict[str, Any]:
return {"name": "test_fn", "arguments": {"arg": text}}
_dummy_parser = make_mlx_parser("<tool_call>", "</tool_call>", _dummier_parser)
class TestParseToolCalls:
"""Tests for parse_tool_calls generator."""
@@ -35,8 +39,6 @@ class TestParseToolCalls:
results = list(
parse_tool_calls(
_make_responses(texts, finish_on_last=False),
"<tool_call>",
"</tool_call>",
_dummy_parser,
)
)
@@ -50,8 +52,6 @@ class TestParseToolCalls:
results = list(
parse_tool_calls(
_make_responses(texts),
"<tool_call>",
"</tool_call>",
_dummy_parser,
)
)
@@ -76,9 +76,7 @@ class TestParseToolCalls:
results = list(
parse_tool_calls(
_make_responses(texts, finish_on_last=False),
"<tool_call>",
"</tool_call>",
_failing_parser,
make_mlx_parser("<tool_call>", "</tool_call>", _failing_parser),
)
)

uv.lock (generated)
View File

@@ -377,8 +377,8 @@ dependencies = [
{ name = "hypercorn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "loguru", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "mflux", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "mlx", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "mlx", extra = ["cpu"], marker = "sys_platform == 'linux'" },
{ name = "mlx", version = "0.30.6", source = { registry = "https://pypi.org/simple" }, extra = ["cpu"], marker = "sys_platform == 'linux'" },
{ name = "mlx", version = "0.30.7.dev20260218+14841977", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#1484197707f35186ad3bd614357c7c47fdf86ebc" }, marker = "sys_platform == 'darwin'" },
{ name = "mlx-lm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "msgspec", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "openai-harmony", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -416,9 +416,9 @@ requires-dist = [
{ name = "hypercorn", specifier = ">=0.18.0" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "mflux", specifier = "==0.15.5" },
{ name = "mlx", marker = "sys_platform == 'darwin'", specifier = "==0.30.6" },
{ name = "mlx", marker = "sys_platform == 'darwin'", git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks" },
{ name = "mlx", extras = ["cpu"], marker = "sys_platform == 'linux'", specifier = "==0.30.6" },
{ name = "mlx-lm", specifier = "==0.30.6" },
{ name = "mlx-lm", specifier = "==0.30.7" },
{ name = "msgspec", specifier = ">=0.19.0" },
{ name = "openai-harmony", specifier = ">=0.0.8" },
{ name = "pillow", specifier = ">=11.0,<12.0" },
@@ -1020,8 +1020,8 @@ dependencies = [
{ name = "fonttools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "huggingface-hub", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "matplotlib", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "mlx", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "mlx", extra = ["cuda13"], marker = "sys_platform == 'linux'" },
{ name = "mlx", version = "0.30.6", source = { registry = "https://pypi.org/simple" }, extra = ["cuda13"], marker = "sys_platform == 'linux'" },
{ name = "mlx", version = "0.30.7.dev20260218+14841977", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#1484197707f35186ad3bd614357c7c47fdf86ebc" }, marker = "sys_platform == 'darwin'" },
{ name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "opencv-python", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "piexif", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -1048,18 +1048,12 @@ wheels = [
name = "mlx"
version = "0.30.6"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "mlx-metal", marker = "sys_platform == 'darwin'" },
resolution-markers = [
"sys_platform == 'linux'",
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/ae/5b/e460e144a34d5529e010056cccf50b538d56ed001473bc6b246018fd58cb/mlx-0.30.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ed86f8bffc174c2f259ca589ea25464c96cf69d1bb457074a2bf2ef53737e54f", size = 573515, upload-time = "2026-02-06T03:45:23.405Z" },
{ url = "https://files.pythonhosted.org/packages/60/25/69833fefb9a3fef30b56792b1bcd022496c4fea83e45411d289b77ef7546/mlx-0.30.6-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:c52294958269e20f300639a17c1900ca8fc737d859ddda737f9811e94bd040e5", size = 573516, upload-time = "2026-02-06T03:45:24.618Z" },
{ url = "https://files.pythonhosted.org/packages/9c/6a/7e7fbeebc5cb51b6a5eba96b263a6298707bcbdc059f4b0b73e088bc3dea/mlx-0.30.6-cp313-cp313-macosx_26_0_arm64.whl", hash = "sha256:b5b6636f7c49a4d86d8ec82643b972f45a144a7a9f3a967b27b2e6e22cf71e6a", size = 573592, upload-time = "2026-02-06T03:45:25.928Z" },
{ url = "https://files.pythonhosted.org/packages/93/06/280f6f2ba80520a7109730425eda0d966658793aa0d02d8be8d351f75253/mlx-0.30.6-cp313-cp313-manylinux_2_35_aarch64.whl", hash = "sha256:67e6c9e30a9faeacc209917ef5523177cf9b086914b6b5d83ff886e4294b727d", size = 622011, upload-time = "2026-02-06T03:45:28.165Z" },
{ url = "https://files.pythonhosted.org/packages/fe/35/f872afbee9c079cc69924d9e9c46f5663adb7da58cba3511db082dd307c1/mlx-0.30.6-cp313-cp313-manylinux_2_35_x86_64.whl", hash = "sha256:47db8b16fcb6f6c5a47c0bdb24ed377b41237017ac93aa6cb6aa206c9bdf82e4", size = 663650, upload-time = "2026-02-06T03:45:30.315Z" },
{ url = "https://files.pythonhosted.org/packages/60/23/361dc7a5797634e4d7e9bdd6564c6b28f9b1246672632def2f91bf066b18/mlx-0.30.6-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:78804a89dcff4a838f7c2da72392fe87a523e95122a3c840e53df019122aad45", size = 575028, upload-time = "2026-02-06T03:45:31.549Z" },
{ url = "https://files.pythonhosted.org/packages/a8/69/1854484d414171586814dfbe8def95f75c4ea2c7341ba13ba8ee675f7c62/mlx-0.30.6-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:ec13584ab069665cc7ad34a05494d9291cd623aef6ae96be48875fc87cfc25d6", size = 575026, upload-time = "2026-02-06T03:45:33.072Z" },
{ url = "https://files.pythonhosted.org/packages/6b/b8/3adbc441924209a7e4c568308b2a0b54bd09aee6a68db5bae85304791e54/mlx-0.30.6-cp314-cp314-macosx_26_0_arm64.whl", hash = "sha256:b2c5e8a090a753ef99a1380a4d059c983083f36198864f6df9faaf1223d083df", size = 575041, upload-time = "2026-02-06T03:45:34.814Z" },
{ url = "https://files.pythonhosted.org/packages/3f/54/9d9e06804fb2088202a2cdf60458e00b221f71420bea285720b60f9e82b5/mlx-0.30.6-cp314-cp314-manylinux_2_35_aarch64.whl", hash = "sha256:9ceddede4af0de31d1f6b3099f70e5469d60cd7c546975dedbdbeab3519cab3f", size = 624002, upload-time = "2026-02-06T03:45:36Z" },
{ url = "https://files.pythonhosted.org/packages/42/92/3140a15a50cb1f9267a6552171e1dfa577861de53e093124bc43707f2a0e/mlx-0.30.6-cp314-cp314-manylinux_2_35_x86_64.whl", hash = "sha256:4a6ffd2d16728cf95f63a1b555d7c2eaeea686a0e6b73228bd265411cb5d77a4", size = 663569, upload-time = "2026-02-06T03:45:37.242Z" },
]
@@ -1072,6 +1066,14 @@ cuda13 = [
{ name = "mlx-cuda-13", marker = "sys_platform == 'linux'" },
]
[[package]]
name = "mlx"
version = "0.30.7.dev20260218+14841977"
source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#1484197707f35186ad3bd614357c7c47fdf86ebc" }
resolution-markers = [
"sys_platform == 'darwin'",
]
[[package]]
name = "mlx-cpu"
version = "0.30.6"
@@ -1098,30 +1100,20 @@ wheels = [
[[package]]
name = "mlx-lm"
version = "0.30.6"
version = "0.30.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "mlx", marker = "sys_platform == 'darwin'" },
{ name = "mlx", version = "0.30.7.dev20260218+14841977", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#1484197707f35186ad3bd614357c7c47fdf86ebc" }, marker = "sys_platform == 'darwin'" },
{ name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "sentencepiece", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "transformers", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/76/cb/815deddc8699b1f694d7e1f9cbed52934c03a8b49432c8add72932bb2f0b/mlx_lm-0.30.6.tar.gz", hash = "sha256:807e042d7040268f1b19190b7eaefd8b2efbff5590a65460974ad4225b91dda1", size = 271733, upload-time = "2026-02-04T21:27:45.741Z" }
sdist = { url = "https://files.pythonhosted.org/packages/66/0d/56542e2ae13ec6f542d3977d7cff89a205d4f6c5122e0ce23f33265f61c9/mlx_lm-0.30.7.tar.gz", hash = "sha256:e5f31ac58d9f2381f28e1ba639ff903e64f7cff1bdc245c0bc97f72264be329c", size = 275764, upload-time = "2026-02-12T18:41:11.86Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/20/5f/01d281f1fa8a1521d5936659beb4f5ab1f32b463d059263cf9d4cef969d9/mlx_lm-0.30.6-py3-none-any.whl", hash = "sha256:a7405bd581eacc4bf8209d7a6b7f23629585a0d7c6740c2a97e51fee35b3b0e1", size = 379451, upload-time = "2026-02-04T21:27:43.222Z" },
]
[[package]]
name = "mlx-metal"
version = "0.30.6"
source = { registry = "https://pypi.org/simple" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f3/85/44406b521f920248fad621334d4dc15e77660a494edf890e7cbee33bf38d/mlx_metal-0.30.6-py3-none-macosx_14_0_arm64.whl", hash = "sha256:ea6d0c973def9a5b4f652cc77036237db3f88c9d0af63701d76b5fddde99b820", size = 38437818, upload-time = "2026-02-06T03:44:56.19Z" },
{ url = "https://files.pythonhosted.org/packages/d0/cb/10a516995f7d0c154b0d7e633c54b51e96977a86a355105b6474cfcbe0d0/mlx_metal-0.30.6-py3-none-macosx_15_0_arm64.whl", hash = "sha256:0f8cb94634d07e06a372d6ad9a090f38a18bab1ff19a140aede60eacf707bb94", size = 38433701, upload-time = "2026-02-06T03:44:59.678Z" },
{ url = "https://files.pythonhosted.org/packages/4c/7d/70cb272f7373c334709f210ed8420511fc9d64d05a7a646c0b3b94c29c04/mlx_metal-0.30.6-py3-none-macosx_26_0_arm64.whl", hash = "sha256:d761ae26304f2c4b454eeea7f612a56919d9e5e57dbb1dc0788f8e34aa6f41c2", size = 47718448, upload-time = "2026-02-06T03:45:03.133Z" },
{ url = "https://files.pythonhosted.org/packages/1e/17/a41c798a3d9cbdc47f39c6db5bba4c2cd199203ead26bf911cb03b644070/mlx_lm-0.30.7-py3-none-any.whl", hash = "sha256:17442a4bf01c4c2d3bca1e647712fe44f19890c3f1eadc8589d389e57b44b9bf", size = 386591, upload-time = "2026-02-12T18:41:10.236Z" },
]
[[package]]