mirror of https://github.com/exo-explore/exo.git
synced 2026-01-31 01:01:11 -05:00

Compare commits: gitignore-...ciaran/ima (7 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 409fa80600 |  |
|  | 5a94c21daa |  |
|  | 56ec049321 |  |
|  | b477f88ace |  |
|  | 4ea6e32f7b |  |
|  | 49c5345e93 |  |
|  | ea593075d7 |  |
12
.github/actions/typecheck/action.yml
vendored
Normal file
@@ -0,0 +1,12 @@
name: Type Check

description: "Run type checker"

runs:
  using: "composite"
  steps:
    - name: Run type checker
      run: |
        nix --extra-experimental-features nix-command --extra-experimental-features flakes develop -c just sync
        nix --extra-experimental-features nix-command --extra-experimental-features flakes develop -c just check
      shell: bash
139
.github/workflows/pipeline.yml
vendored
@@ -26,14 +26,73 @@ jobs:
          name: exo
          authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}"

      - name: Load nix develop environment
        run: nix run github:nicknovitski/nix-develop/v1
      - name: Configure git user
        run: |
          git config --local user.email "github-actions@users.noreply.github.com"
          git config --local user.name "github-actions bot"
        shell: bash

      - name: Sync dependencies
        run: uv sync --all-packages
      - name: Pull LFS files
        run: |
          echo "Pulling Git LFS files..."
          git lfs pull
        shell: bash

      - name: Run type checker
        run: uv run basedpyright --project pyproject.toml
      - name: Setup Nix Environment
        run: |
          echo "Checking for nix installation..."

          # Check if nix binary exists directly
          if [ -f /nix/var/nix/profiles/default/bin/nix ]; then
            echo "Found nix binary at /nix/var/nix/profiles/default/bin/nix"
            export PATH="/nix/var/nix/profiles/default/bin:$PATH"
            echo "PATH=$PATH" >> $GITHUB_ENV
            nix --version
          elif [ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]; then
            echo "Found nix profile script, sourcing..."
            source /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh
            nix --version
          elif command -v nix >/dev/null 2>&1; then
            echo "Nix already in PATH"
            nix --version
          else
            echo "Nix not found. Debugging info:"
            echo "Contents of /nix/var/nix/profiles/default/:"
            ls -la /nix/var/nix/profiles/default/ 2>/dev/null || echo "Directory not found"
            echo "Contents of /nix/var/nix/profiles/default/bin/:"
            ls -la /nix/var/nix/profiles/default/bin/ 2>/dev/null || echo "Directory not found"
            exit 1
          fi
        shell: bash

      - name: Configure basedpyright include for local MLX
        run: |
          RUNNER_LABELS='${{ toJSON(runner.labels) }}'
          if echo "$RUNNER_LABELS" | grep -q "local_mlx"; then
            if [ -d "/Users/Shared/mlx" ]; then
              echo "Updating [tool.basedpyright].include to use /Users/Shared/mlx"
              awk '
                BEGIN { in=0 }
                /^\[tool\.basedpyright\]/ { in=1; print; next }
                in && /^\[/ { in=0 } # next section
                in && /^[ \t]*include[ \t]*=/ {
                  print "include = [\"/Users/Shared/mlx\"]"
                  next
                }
                { print }
              ' pyproject.toml > pyproject.toml.tmp && mv pyproject.toml.tmp pyproject.toml

              echo "New [tool.basedpyright] section:"
              sed -n '/^\[tool\.basedpyright\]/,/^\[/p' pyproject.toml | sed '$d' || true
            else
              echo "local_mlx tag present but /Users/Shared/mlx not found; leaving pyproject unchanged."
            fi
          else
            echo "Runner does not have 'local_mlx' tag; leaving pyproject unchanged."
          fi
        shell: bash

      - uses: ./.github/actions/typecheck

  nix:
    name: Build and check (${{ matrix.system }})
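The "Configure basedpyright include for local MLX" step above rewrites the `include` entry of the `[tool.basedpyright]` table in pyproject.toml with an inline awk program. Purely as a reference, a minimal Python sketch of the same rewrite follows; the helper name and the assumption that `include = ...` sits on a single line inside that table are mine, not part of the workflow.

# Hypothetical helper, not part of the repository: replace the include key
# inside the [tool.basedpyright] section of a TOML file, mirroring the awk
# step in the workflow above.
from pathlib import Path

def set_basedpyright_include(path: str, include_dir: str) -> None:
    out = []
    in_section = False
    for line in Path(path).read_text().splitlines(keepends=True):
        stripped = line.strip()
        if stripped == "[tool.basedpyright]":
            in_section = True
        elif in_section and stripped.startswith("["):
            in_section = False  # reached the next TOML table
        if in_section and stripped.startswith("include"):
            out.append(f'include = ["{include_dir}"]\n')  # overwrite the old value
        else:
            out.append(line)
    Path(path).write_text("".join(out))

if __name__ == "__main__":
    set_basedpyright_include("pyproject.toml", "/Users/Shared/mlx")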
@@ -64,63 +123,6 @@ jobs:
          name: exo
          authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}"

      - name: Build Metal packages (macOS only)
        if: runner.os == 'macOS'
        run: |
          # Try to build metal-toolchain first (may succeed via cachix cache hit)
          if nix build .#metal-toolchain 2>/dev/null; then
            echo "metal-toolchain built successfully (likely cache hit)"
          else
            echo "metal-toolchain build failed, extracting from Xcode..."

            NAR_HASH="sha256-ayR5mXN4sZAddwKEG2OszGRF93k9ZFc7H0yi2xbylQw="
            NAR_NAME="metal-toolchain-17C48.nar"

            # Use RUNNER_TEMP to avoid /tmp symlink issues on macOS
            WORK_DIR="${RUNNER_TEMP}/metal-work"
            mkdir -p "$WORK_DIR"

            # Download the Metal toolchain component
            xcodebuild -downloadComponent MetalToolchain

            # Find and mount the DMG
            DMG_PATH=$(find /System/Library/AssetsV2/com_apple_MobileAsset_MetalToolchain -name '*.dmg' 2>/dev/null | head -1)
            if [ -z "$DMG_PATH" ]; then
              echo "Error: Could not find Metal toolchain DMG"
              exit 1
            fi

            echo "Found DMG at: $DMG_PATH"
            hdiutil attach "$DMG_PATH" -mountpoint "${WORK_DIR}/metal-dmg"

            # Copy the toolchain
            cp -R "${WORK_DIR}/metal-dmg/Metal.xctoolchain" "${WORK_DIR}/metal-export"
            hdiutil detach "${WORK_DIR}/metal-dmg"

            # Create NAR and add to store
            nix nar pack "${WORK_DIR}/metal-export" > "${WORK_DIR}/${NAR_NAME}"
            STORE_PATH=$(nix store add --mode flat "${WORK_DIR}/${NAR_NAME}")
            echo "Added NAR to store: $STORE_PATH"

            # Verify the hash matches
            ACTUAL_HASH=$(nix hash file "${WORK_DIR}/${NAR_NAME}")
            if [ "$ACTUAL_HASH" != "$NAR_HASH" ]; then
              echo "Warning: NAR hash mismatch!"
              echo "Expected: $NAR_HASH"
              echo "Actual: $ACTUAL_HASH"
              echo "The metal-toolchain.nix may need updating"
            fi

            # Clean up
            rm -rf "$WORK_DIR"

            # Retry the build now that NAR is in store
            nix build .#metal-toolchain
          fi

          # Build mlx (depends on metal-toolchain)
          nix build .#mlx

      - name: Build all Nix outputs
        run: |
          nix flake show --json | jq -r '
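In the removed Metal step above, the output of `nix hash file` is compared against the pinned NAR_HASH, which is an SRI-style sha256 string. As a small illustration of that format (assuming the default SRI output of `nix hash file`), the same digest can be recomputed in Python; the file name below is simply the NAR_NAME used in the script.

# Illustration: compute an SRI-style sha256 ("sha256-" + base64 of the raw
# digest) for a file, the format used by the pinned NAR_HASH above.
import base64
import hashlib
from pathlib import Path

def sri_sha256(path: str) -> str:
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256-" + base64.b64encode(digest).decode()

if __name__ == "__main__":
    print(sri_sha256("metal-toolchain-17C48.nar"))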
@@ -132,14 +134,3 @@ jobs:

      - name: Run nix flake check
        run: nix flake check

      - name: Run pytest (macOS only)
        if: runner.os == 'macOS'
        run: |
          # Build the test environment (requires relaxed sandbox for uv2nix on macOS)
          TEST_ENV=$(nix build '.#exo-test-env' --option sandbox relaxed --print-out-paths)

          # Run pytest outside sandbox (needs GPU access for MLX)
          export HOME="$RUNNER_TEMP"
          export EXO_TESTS=1
          $TEST_ENV/bin/python -m pytest src -m "not slow" --import-mode=importlib
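The pytest step exports EXO_TESTS=1 before running the suite outside the Nix sandbox. The diff does not show how the variable is consumed; purely as an illustration, a conftest.py could gate hardware-dependent tests on it like this (the marker and test names are hypothetical, not taken from the repository).

# Hypothetical conftest.py-style sketch: skip GPU-dependent tests unless
# EXO_TESTS=1 is set, matching the environment variable exported above.
import os

import pytest

requires_exo_env = pytest.mark.skipif(
    os.environ.get("EXO_TESTS") != "1",
    reason="set EXO_TESTS=1 to run tests that need local GPU access",
)

@requires_exo_env
def test_mlx_generation_smoke():
    # Placeholder body; a real test would exercise the MLX generation path.
    assert True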
3
.gitignore
vendored
@@ -28,6 +28,3 @@ target/
dashboard/build/
dashboard/node_modules/
dashboard/.svelte-kit/

# host config snapshots
hosts_*.json
@@ -5,18 +5,18 @@
[X] Fetching download status of all models on start
[X] Deduplication of tasks in plan_step.
[X] resolve_allow_patterns should just be wildcard now.
[] no mx_barrier in generate.py mlx_generate at the end.
[X] no mx_barrier in generate.py mlx_generate at the end.
[] cache assertion not needed in auto_parallel.py PipelineLastLayer.
[] GPTOSS support dropped in auto_parallel.py.
[] sharding changed: "all-to-sharded" became _all_to_sharded in auto_parallel.py.
[] same as above: "sharded-to-all" became _sharded_to_all in auto_parallel.py.
[] Dropped support for Ministral3Model, DeepseekV32Model, Glm4MoeModel, Qwen3NextModel, GptOssMode in auto_parallel.py.
[X] GPTOSS support dropped in auto_parallel.py.
[X] sharding changed: "all-to-sharded" became _all_to_sharded in auto_parallel.py.
[X] same as above: "sharded-to-all" became _sharded_to_all in auto_parallel.py.
[X] Dropped support for Ministral3Model, DeepseekV32Model, Glm4MoeModel, Qwen3NextModel, GptOssMode in auto_parallel.py.
[] Dropped prefill/decode code in auto_parallel.py and utils_mlx.py.
[X] KV_CACHE_BITS should be None to disable quantized KV cache.
[] Dropped _set_nofile_limit in utils_mlx.py.
[] We have group optional in load_mlx_items in utils_mlx.py.
[X] Dropped _set_nofile_limit in utils_mlx.py.
[X] We have group optional in load_mlx_items in utils_mlx.py.
[] Dropped add_missing_chat_templates for GptOss in load_mlx_items in utils_mlx.py.
[] Dropped model.make_cache in make_kv_cache in utils_mlx.py.
[X] Dropped model.make_cache in make_kv_cache in utils_mlx.py.
[X] We put the cache limit back in utils_mlx.py.
[] topology.py remove_node removes the connections after checking if the node id is in self._node_id_to_rx_id_map; on beta_1 it checks after, so it would remove stale connections, I guess?
[] Missing Glm 4.7 model cards (this isn't ready yet but should be picked up; probably create an issue. The blocker is that the transformers version doesn't support the tokenizer for Glm 4.7; rc-1 does, but we can't upgrade as it breaks other things.)
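Several of the items above describe optional behaviour in utils_mlx.py, for example KV_CACHE_BITS being None to disable the quantized KV cache. A purely illustrative Python sketch of that convention follows; the dictionary-based cache entries are stand-ins, not exo's actual cache types.

# Illustration only: treat a None bit width as "no quantized KV cache".
from typing import Optional

KV_CACHE_BITS: Optional[int] = None  # e.g. 4 or 8 to quantize, None to disable

def make_kv_cache(num_layers: int, kv_cache_bits: Optional[int] = KV_CACHE_BITS) -> list[dict]:
    if kv_cache_bits is None:
        # Plain (unquantized) cache, one entry per transformer layer.
        return [{"kind": "standard"} for _ in range(num_layers)]
    # Quantized cache with the requested bit width.
    return [{"kind": "quantized", "bits": kv_cache_bits} for _ in range(num_layers)]

print(make_kv_cache(2))                   # two standard entries
print(make_kv_cache(2, kv_cache_bits=8))  # two 8-bit quantized entries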
16
README.md
@@ -5,7 +5,7 @@
<img alt="exo logo" src="/docs/imgs/exo-logo-transparent.png" width="50%" height="50%">
</picture>

exo: Run frontier AI locally. Maintained by [exo labs](https://x.com/exolabs).
exo: Run your own AI cluster at home with everyday devices. Maintained by [exo labs](https://x.com/exolabs).

<p align="center">
<a href="https://discord.gg/TJ4P57arEm" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/Discord-Join%20Server-5865F2?logo=discord&logoColor=white" alt="Discord"></a>
@@ -107,10 +107,6 @@ uv run exo

This starts the exo dashboard and API at http://localhost:52415/


*Please view the section on RDMA to enable this feature on macOS >= 26.2!*


### Run from Source (Linux)

**Prerequisites:**
@@ -234,7 +230,7 @@ This removes:

RDMA is a new capability added to macOS 26.2. It works on any Mac with Thunderbolt 5 (M4 Pro Mac Mini, M4 Max Mac Studio, M4 Max MacBook Pro, M3 Ultra Mac Studio).

Please refer to the caveats for immediate troubleshooting.
Note that on a Mac Studio, you cannot use the Thunderbolt 5 port next to the Ethernet port.

To enable RDMA on macOS, follow these steps:

@@ -251,14 +247,6 @@ To enable RDMA on macOS, follow these steps:

After that, RDMA will be enabled in macOS and exo will take care of the rest.

**Important Caveats**

1. Devices that wish to be part of an RDMA cluster must be connected to all other devices in the cluster.
2. The cables must support TB5.
3. On a Mac Studio, you cannot use the Thunderbolt 5 port next to the Ethernet port.
4. If running from source, please use the script found at `tmp/set_rdma_network_config.sh`, which will disable Thunderbolt Bridge and set DHCP on each RDMA port.
5. RDMA ports may be unable to discover each other on different versions of macOS. Please ensure that OS versions match exactly (even beta version numbers) on all devices.

---

### Using the API
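The README section above points the dashboard and API at http://localhost:52415/. Assuming the ChatGPT-compatible /v1/chat/completions endpoint that the "Using the API" section refers to, a minimal request from Python might look like the sketch below; the model name is a placeholder and should be replaced with a model available on your cluster.

# Minimal sketch of calling the local exo API; endpoint and model name are
# assumptions for illustration, not taken verbatim from this diff.
import json
import urllib.request

payload = {
    "model": "llama-3.2-3b",
    "messages": [{"role": "user", "content": "Say hello from exo."}],
}
req = urllib.request.Request(
    "http://localhost:52415/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])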
@@ -342,8 +342,6 @@
        SDKROOT = macosx;
        SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
        SWIFT_OPTIMIZATION_LEVEL = "-Onone";
        SWIFT_TREAT_WARNINGS_AS_ERRORS = YES;
        GCC_TREAT_WARNINGS_AS_ERRORS = YES;
      };
      name = Debug;
    };
@@ -399,8 +397,6 @@
        MTL_FAST_MATH = YES;
        SDKROOT = macosx;
        SWIFT_COMPILATION_MODE = wholemodule;
        SWIFT_TREAT_WARNINGS_AS_ERRORS = YES;
        GCC_TREAT_WARNINGS_AS_ERRORS = YES;
      };
      name = Release;
    };
@@ -225,7 +225,7 @@ private final class ExoUpdaterDelegate: NSObject, SPUUpdaterDelegate {
    }
  }

  nonisolated private func showNotification(title: String, body: String) {
  private func showNotification(title: String, body: String) {
    let center = UNUserNotificationCenter.current()
    let content = UNMutableNotificationContent()
    content.title = title
@@ -18,9 +18,6 @@ enum NetworkSetupHelper {
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Wait for macOS to finish network setup after boot
|
||||
sleep 20
|
||||
|
||||
PREFS="/Library/Preferences/SystemConfiguration/preferences.plist"
|
||||
|
||||
# Remove bridge0 interface
|
||||
@@ -34,35 +31,6 @@ enum NetworkSetupHelper {
|
||||
# Remove Thunderbolt Bridge from VirtualNetworkInterfaces in preferences.plist
|
||||
/usr/libexec/PlistBuddy -c "Delete :VirtualNetworkInterfaces:Bridge:bridge0" "$PREFS" 2>/dev/null || true
|
||||
|
||||
networksetup -listlocations | grep -q exo || {
|
||||
networksetup -createlocation exo
|
||||
}
|
||||
|
||||
networksetup -switchtolocation exo
|
||||
networksetup -listallhardwareports \\
|
||||
| awk -F': ' '/Hardware Port: / {print $2}' \\
|
||||
| while IFS=":" read -r name; do
|
||||
case "$name" in
|
||||
"Ethernet Adapter"*)
|
||||
;;
|
||||
"Thunderbolt Bridge")
|
||||
;;
|
||||
"Thunderbolt "*)
|
||||
networksetup -listallnetworkservices \\
|
||||
| grep -q "EXO $name" \\
|
||||
|| networksetup -createnetworkservice "EXO $name" "$name" 2>/dev/null \\
|
||||
|| continue
|
||||
networksetup -setdhcp "EXO $name"
|
||||
;;
|
||||
*)
|
||||
networksetup -listallnetworkservices \\
|
||||
| grep -q "$name" \\
|
||||
|| networksetup -createnetworkservice "$name" "$name" 2>/dev/null \\
|
||||
|| continue
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
networksetup -listnetworkservices | grep -q "Thunderbolt Bridge" && {
|
||||
networksetup -setnetworkserviceenabled "Thunderbolt Bridge" off
|
||||
} || true
|
||||
@@ -83,7 +51,7 @@ enum NetworkSetupHelper {
|
||||
let alert = NSAlert()
|
||||
alert.messageText = "EXO Network Configuration"
|
||||
alert.informativeText =
|
||||
"EXO needs to install a system service to configure local networking. This will disable Thunderbolt Bridge (preventing packet storms) and install a Network Location.\n\nYou will be prompted for your password."
|
||||
"EXO needs to install a system service to automatically disable Thunderbolt Bridge on startup. This prevents network loops when connecting multiple Macs via Thunderbolt.\n\nYou will be prompted for your administrator password."
|
||||
alert.alertStyle = .informational
|
||||
alert.addButton(withTitle: "Install")
|
||||
alert.addButton(withTitle: "Not Now")
|
||||
@@ -244,11 +212,11 @@ enum NetworkSetupHelper {
|
||||
rm -f "$LOG_OUT" "$LOG_ERR"
|
||||
|
||||
# Switch back to Automatic network location
|
||||
networksetup -switchtolocation Automatic >/dev/null 2>&1 || true
|
||||
networksetup -switchtolocation Automatic 2>/dev/null || true
|
||||
|
||||
# Delete the exo network location if it exists
|
||||
networksetup -listlocations 2>/dev/null | grep -q '^exo$' && {
|
||||
networksetup -deletelocation exo >/dev/null 2>&1 || true
|
||||
networksetup -listlocations | grep -q '^exo$' && {
|
||||
networksetup -deletelocation exo 2>/dev/null || true
|
||||
} || true
|
||||
|
||||
# Re-enable any Thunderbolt Bridge service if it exists
|
||||
@@ -258,12 +226,12 @@ enum NetworkSetupHelper {
|
||||
tb_devices=$(networksetup -listallhardwareports 2>/dev/null | awk '
|
||||
/^Hardware Port:/ { port = tolower(substr($0, 16)) }
|
||||
/^Device:/ { if (port ~ /thunderbolt/) print substr($0, 9) }
|
||||
') || true
|
||||
')
|
||||
[ -z "$tb_devices" ] && return 0
|
||||
|
||||
# For each bridge device, check if it contains Thunderbolt interfaces
|
||||
for bridge in bridge0 bridge1 bridge2; do
|
||||
members=$(ifconfig "$bridge" 2>/dev/null | awk '/member:/ {print $2}') || true
|
||||
members=$(ifconfig "$bridge" 2>/dev/null | awk '/member:/ {print $2}')
|
||||
[ -z "$members" ] && continue
|
||||
|
||||
for tb_dev in $tb_devices; do
|
||||
@@ -272,7 +240,7 @@ enum NetworkSetupHelper {
|
||||
service_name=$(networksetup -listnetworkserviceorder 2>/dev/null | awk -v dev="$bridge" '
|
||||
/^\\([0-9*]/ { gsub(/^\\([0-9*]+\\) /, ""); svc = $0 }
|
||||
/Device:/ && $0 ~ dev { print svc; exit }
|
||||
') || true
|
||||
')
|
||||
if [ -n "$service_name" ]; then
|
||||
networksetup -setnetworkserviceenabled "$service_name" on 2>/dev/null || true
|
||||
return 0
|
||||
@@ -280,9 +248,8 @@ enum NetworkSetupHelper {
|
||||
fi
|
||||
done
|
||||
done
|
||||
return 0
|
||||
}
|
||||
find_and_enable_thunderbolt_bridge || true
|
||||
find_and_enable_thunderbolt_bridge
|
||||
|
||||
echo "EXO network components removed successfully"
|
||||
"""
|
||||
|
||||
@@ -127,24 +127,21 @@ final class ThunderboltBridgeService: ObservableObject {
|
||||
|
||||
// 2. Request specific network configuration rights
|
||||
let rightName = "system.services.systemconfiguration.network"
|
||||
status = rightName.withCString { nameCString in
|
||||
var item = AuthorizationItem(
|
||||
name: nameCString,
|
||||
valueLength: 0,
|
||||
value: nil,
|
||||
flags: 0
|
||||
)
|
||||
return withUnsafeMutablePointer(to: &item) { itemPointer in
|
||||
var rights = AuthorizationRights(count: 1, items: itemPointer)
|
||||
return AuthorizationCopyRights(
|
||||
authRef,
|
||||
&rights,
|
||||
nil,
|
||||
[.extendRights, .interactionAllowed],
|
||||
nil
|
||||
)
|
||||
}
|
||||
}
|
||||
var item = AuthorizationItem(
|
||||
name: rightName,
|
||||
valueLength: 0,
|
||||
value: nil,
|
||||
flags: 0
|
||||
)
|
||||
var rights = AuthorizationRights(count: 1, items: &item)
|
||||
|
||||
status = AuthorizationCopyRights(
|
||||
authRef,
|
||||
&rights,
|
||||
nil,
|
||||
[.extendRights, .interactionAllowed],
|
||||
nil
|
||||
)
|
||||
guard status == errAuthorizationSuccess else {
|
||||
if status == errAuthorizationCanceled {
|
||||
throw ThunderboltBridgeError.authorizationCanceled
|
||||
|
||||
@@ -29,21 +29,21 @@ YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
echo_info() {
|
||||
echo -e "${GREEN}[INFO]${NC} $1"
|
||||
echo -e "${GREEN}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
echo_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
echo_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1"
|
||||
echo -e "${RED}[ERROR]${NC} $1"
|
||||
}
|
||||
|
||||
# Check if running as root
|
||||
if [[ $EUID -ne 0 ]]; then
|
||||
echo_error "This script must be run as root (use sudo)"
|
||||
exit 1
|
||||
echo_error "This script must be run as root (use sudo)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
@@ -55,64 +55,64 @@ echo ""
|
||||
# Unload the LaunchDaemon if running
|
||||
echo_info "Stopping network setup daemon..."
|
||||
if launchctl list | grep -q "$LABEL"; then
|
||||
launchctl bootout system/"$LABEL" 2>/dev/null || true
|
||||
echo_info "Daemon stopped"
|
||||
launchctl bootout system/"$LABEL" 2>/dev/null || true
|
||||
echo_info "Daemon stopped"
|
||||
else
|
||||
echo_warn "Daemon was not running"
|
||||
echo_warn "Daemon was not running"
|
||||
fi
|
||||
|
||||
# Remove LaunchDaemon plist
|
||||
if [[ -f $PLIST_DEST ]]; then
|
||||
rm -f "$PLIST_DEST"
|
||||
echo_info "Removed LaunchDaemon plist"
|
||||
if [[ -f "$PLIST_DEST" ]]; then
|
||||
rm -f "$PLIST_DEST"
|
||||
echo_info "Removed LaunchDaemon plist"
|
||||
else
|
||||
echo_warn "LaunchDaemon plist not found (already removed?)"
|
||||
echo_warn "LaunchDaemon plist not found (already removed?)"
|
||||
fi
|
||||
|
||||
# Remove the script and parent directory
|
||||
if [[ -f $SCRIPT_DEST ]]; then
|
||||
rm -f "$SCRIPT_DEST"
|
||||
echo_info "Removed network setup script"
|
||||
if [[ -f "$SCRIPT_DEST" ]]; then
|
||||
rm -f "$SCRIPT_DEST"
|
||||
echo_info "Removed network setup script"
|
||||
else
|
||||
echo_warn "Network setup script not found (already removed?)"
|
||||
echo_warn "Network setup script not found (already removed?)"
|
||||
fi
|
||||
|
||||
# Remove EXO directory if empty
|
||||
if [[ -d "/Library/Application Support/EXO" ]]; then
|
||||
rmdir "/Library/Application Support/EXO" 2>/dev/null &&
|
||||
echo_info "Removed EXO support directory" ||
|
||||
echo_warn "EXO support directory not empty, leaving in place"
|
||||
rmdir "/Library/Application Support/EXO" 2>/dev/null && \
|
||||
echo_info "Removed EXO support directory" || \
|
||||
echo_warn "EXO support directory not empty, leaving in place"
|
||||
fi
|
||||
|
||||
# Remove log files
|
||||
if [[ -f $LOG_OUT ]] || [[ -f $LOG_ERR ]]; then
|
||||
rm -f "$LOG_OUT" "$LOG_ERR"
|
||||
echo_info "Removed log files"
|
||||
if [[ -f "$LOG_OUT" ]] || [[ -f "$LOG_ERR" ]]; then
|
||||
rm -f "$LOG_OUT" "$LOG_ERR"
|
||||
echo_info "Removed log files"
|
||||
else
|
||||
echo_warn "Log files not found (already removed?)"
|
||||
echo_warn "Log files not found (already removed?)"
|
||||
fi
|
||||
|
||||
# Switch back to Automatic network location
|
||||
echo_info "Restoring network configuration..."
|
||||
if networksetup -listlocations | grep -q "^Automatic$"; then
|
||||
networksetup -switchtolocation Automatic 2>/dev/null || true
|
||||
echo_info "Switched to Automatic network location"
|
||||
networksetup -switchtolocation Automatic 2>/dev/null || true
|
||||
echo_info "Switched to Automatic network location"
|
||||
else
|
||||
echo_warn "Automatic network location not found"
|
||||
echo_warn "Automatic network location not found"
|
||||
fi
|
||||
|
||||
# Delete the exo network location if it exists
|
||||
if networksetup -listlocations | grep -q "^exo$"; then
|
||||
networksetup -deletelocation exo 2>/dev/null || true
|
||||
echo_info "Deleted 'exo' network location"
|
||||
networksetup -deletelocation exo 2>/dev/null || true
|
||||
echo_info "Deleted 'exo' network location"
|
||||
else
|
||||
echo_warn "'exo' network location not found (already removed?)"
|
||||
echo_warn "'exo' network location not found (already removed?)"
|
||||
fi
|
||||
|
||||
# Re-enable Thunderbolt Bridge if it exists
|
||||
if networksetup -listnetworkservices 2>/dev/null | grep -q "Thunderbolt Bridge"; then
|
||||
networksetup -setnetworkserviceenabled "Thunderbolt Bridge" on 2>/dev/null || true
|
||||
echo_info "Re-enabled Thunderbolt Bridge"
|
||||
networksetup -setnetworkserviceenabled "Thunderbolt Bridge" on 2>/dev/null || true
|
||||
echo_info "Re-enabled Thunderbolt Bridge"
|
||||
fi
|
||||
|
||||
# Note about launch at login registration
|
||||
@@ -124,14 +124,14 @@ echo_warn " System Settings → General → Login Items → Remove EXO"
|
||||
# Check if EXO.app exists in common locations
|
||||
APP_FOUND=false
|
||||
for app_path in "/Applications/EXO.app" "$HOME/Applications/EXO.app"; do
|
||||
if [[ -d $app_path ]]; then
|
||||
if [[ $APP_FOUND == false ]]; then
|
||||
echo ""
|
||||
APP_FOUND=true
|
||||
if [[ -d "$app_path" ]]; then
|
||||
if [[ "$APP_FOUND" == false ]]; then
|
||||
echo ""
|
||||
APP_FOUND=true
|
||||
fi
|
||||
echo_warn "EXO.app found at: $app_path"
|
||||
echo_warn "You may want to move it to Trash manually."
|
||||
fi
|
||||
echo_warn "EXO.app found at: $app_path"
|
||||
echo_warn "You may want to move it to Trash manually."
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
@@ -151,3 +151,4 @@ echo ""
|
||||
echo "Manual step required:"
|
||||
echo " Remove EXO from Login Items in System Settings → General → Login Items"
|
||||
echo ""
|
||||
|
||||
|
||||
10
dashboard/package-lock.json
generated
@@ -865,6 +865,7 @@
|
||||
"integrity": "sha512-oH8tXw7EZnie8FdOWYrF7Yn4IKrqTFHhXvl8YxXxbKwTMcD/5NNCryUSEXRk2ZR4ojnub0P8rNrsVGHXWqIDtA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@standard-schema/spec": "^1.0.0",
|
||||
"@sveltejs/acorn-typescript": "^1.0.5",
|
||||
@@ -904,6 +905,7 @@
|
||||
"integrity": "sha512-Y1Cs7hhTc+a5E9Va/xwKlAJoariQyHY+5zBgCZg4PFWNYQ1nMN9sjK1zhw1gK69DuqVP++sht/1GZg1aRwmAXQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@sveltejs/vite-plugin-svelte-inspector": "^4.0.1",
|
||||
"debug": "^4.4.1",
|
||||
@@ -1520,6 +1522,7 @@
|
||||
"integrity": "sha512-LCCV0HdSZZZb34qifBsyWlUmok6W7ouER+oQIGBScS8EsZsQbrtFTUrDX4hOl+CS6p7cnNC4td+qrSVGSCTUfQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"undici-types": "~6.21.0"
|
||||
}
|
||||
@@ -1529,6 +1532,7 @@
|
||||
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
|
||||
"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"acorn": "bin/acorn"
|
||||
},
|
||||
@@ -1941,6 +1945,7 @@
|
||||
"integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==",
|
||||
"dev": true,
|
||||
"license": "ISC",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
@@ -2648,6 +2653,7 @@
|
||||
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
@@ -2690,6 +2696,7 @@
|
||||
"integrity": "sha512-UOnG6LftzbdaHZcKoPFtOcCKztrQ57WkHDeRD9t/PTQtmT0NHSeWWepj6pS0z/N7+08BHFDQVUrfmfMRcZwbMg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"prettier": "bin/prettier.cjs"
|
||||
},
|
||||
@@ -2862,6 +2869,7 @@
|
||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.45.3.tgz",
|
||||
"integrity": "sha512-ngKXNhNvwPzF43QqEhDOue7TQTrG09em1sd4HBxVF0Wr2gopAmdEWan+rgbdgK4fhBtSOTJO8bYU4chUG7VXZQ==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@jridgewell/remapping": "^2.3.4",
|
||||
"@jridgewell/sourcemap-codec": "^1.5.0",
|
||||
@@ -3006,6 +3014,7 @@
|
||||
"integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
@@ -3027,6 +3036,7 @@
|
||||
"integrity": "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"esbuild": "^0.25.0",
|
||||
"fdir": "^6.4.4",
|
||||
|
||||
@@ -3,28 +3,12 @@
  perSystem =
    { pkgs, lib, ... }:
    let
      # Filter source to ONLY include package.json and package-lock.json
      # This ensures prettier-svelte only rebuilds when lockfiles change
      dashboardLockfileSrc = lib.cleanSourceWith {
        src = inputs.self;
        filter =
          path: type:
          let
            baseName = builtins.baseNameOf path;
            isDashboardDir = baseName == "dashboard" && type == "directory";
            isPackageFile =
              (lib.hasInfix "/dashboard/" path || lib.hasSuffix "/dashboard" (builtins.dirOf path))
              && (baseName == "package.json" || baseName == "package-lock.json");
          in
          isDashboardDir || isPackageFile;
      };

      # Stub source with lockfiles and minimal files for build to succeed
      # This allows prettier-svelte to avoid rebuilding when dashboard source changes
      dashboardStubSrc = pkgs.runCommand "dashboard-stub-src" { } ''
        mkdir -p $out
        cp ${dashboardLockfileSrc}/dashboard/package.json $out/
        cp ${dashboardLockfileSrc}/dashboard/package-lock.json $out/
        cp ${inputs.self}/dashboard/package.json $out/
        cp ${inputs.self}/dashboard/package-lock.json $out/
        # Minimal files so vite build succeeds (produces empty output)
        echo '<!DOCTYPE html><html><head></head><body></body></html>' > $out/index.html
        mkdir -p $out/src
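The removed dashboardLockfileSrc filter above keeps only the dashboard directory itself plus its package.json and package-lock.json, so the derivation rebuilds only when the lockfiles change. A rough Python rendering of that predicate, for readers less familiar with Nix (the function name and return convention are mine):

# Rough, illustrative equivalent of the Nix source filter above.
import os

def keep(path: str, is_dir: bool) -> bool:
    base = os.path.basename(path)
    is_dashboard_dir = base == "dashboard" and is_dir
    in_dashboard = "/dashboard/" in path or os.path.dirname(path).endswith("/dashboard")
    is_package_file = in_dashboard and base in ("package.json", "package-lock.json")
    return is_dashboard_dir or is_package_file

print(keep("repo/dashboard", True))                # True
print(keep("repo/dashboard/package.json", False))  # True
print(keep("repo/dashboard/src/app.ts", False))    # False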
@@ -12,6 +12,7 @@
|
||||
ttftMs,
|
||||
tps,
|
||||
totalTokens,
|
||||
cancelRequest,
|
||||
} from "$lib/stores/app.svelte";
|
||||
import ChatAttachments from "./ChatAttachments.svelte";
|
||||
import ImageParamsPanel from "./ImageParamsPanel.svelte";
|
||||
@@ -605,37 +606,15 @@
|
||||
style="min-height: 28px; max-height: 150px;"
|
||||
></textarea>
|
||||
|
||||
<button
|
||||
type="submit"
|
||||
disabled={!canSend || loading || isEditOnlyWithoutImage}
|
||||
class="px-2.5 sm:px-4 py-1.5 sm:py-2 rounded text-xs sm:text-xs tracking-[0.1em] sm:tracking-[0.15em] uppercase font-medium transition-all duration-200 whitespace-nowrap
|
||||
{!canSend || loading || isEditOnlyWithoutImage
|
||||
? 'bg-exo-medium-gray/50 text-exo-light-gray cursor-not-allowed'
|
||||
: 'bg-exo-yellow text-exo-black hover:bg-exo-yellow-darker hover:shadow-[0_0_20px_rgba(255,215,0,0.3)]'}"
|
||||
aria-label={shouldShowEditMode
|
||||
? "Edit image"
|
||||
: isImageModel()
|
||||
? "Generate image"
|
||||
: "Send message"}
|
||||
>
|
||||
{#if loading}
|
||||
{#if loading}
|
||||
<button
|
||||
type="button"
|
||||
onclick={() => cancelRequest()}
|
||||
class="px-2.5 sm:px-4 py-1.5 sm:py-2 rounded text-xs sm:text-xs tracking-[0.1em] sm:tracking-[0.15em] uppercase font-medium transition-all duration-200 whitespace-nowrap bg-exo-medium-gray/50 text-exo-light-gray border border-exo-medium-gray/50 hover:border-red-500/50 hover:text-red-400 cursor-pointer"
|
||||
>
|
||||
<span class="inline-flex items-center gap-1 sm:gap-2">
|
||||
<span
|
||||
class="w-2.5 h-2.5 sm:w-3 sm:h-3 border-2 border-current border-t-transparent rounded-full animate-spin"
|
||||
></span>
|
||||
<span class="hidden sm:inline"
|
||||
>{shouldShowEditMode
|
||||
? "EDITING"
|
||||
: isImageModel()
|
||||
? "GENERATING"
|
||||
: "PROCESSING"}</span
|
||||
>
|
||||
<span class="sm:hidden">...</span>
|
||||
</span>
|
||||
{:else if shouldShowEditMode}
|
||||
<span class="inline-flex items-center gap-1.5">
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
class="w-3 h-3"
|
||||
fill="none"
|
||||
viewBox="0 0 24 24"
|
||||
stroke="currentColor"
|
||||
@@ -644,47 +623,81 @@
|
||||
<path
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
d="M11 5H6a2 2 0 00-2 2v11a2 2 0 002 2h11a2 2 0 002-2v-5m-1.414-9.414a2 2 0 112.828 2.828L11.828 15H9v-2.828l8.586-8.586z"
|
||||
d="M6 18L18 6M6 6l12 12"
|
||||
/>
|
||||
</svg>
|
||||
<span>EDIT</span>
|
||||
<span class="hidden sm:inline">CANCEL</span>
|
||||
<span class="sm:hidden">X</span>
|
||||
</span>
|
||||
{:else if isEditOnlyWithoutImage}
|
||||
<span class="inline-flex items-center gap-1.5">
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
fill="none"
|
||||
viewBox="0 0 24 24"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
>
|
||||
<path
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
d="M11 5H6a2 2 0 00-2 2v11a2 2 0 002 2h11a2 2 0 002-2v-5m-1.414-9.414a2 2 0 112.828 2.828L11.828 15H9v-2.828l8.586-8.586z"
|
||||
/>
|
||||
</svg>
|
||||
<span>EDIT</span>
|
||||
</span>
|
||||
{:else if isImageModel()}
|
||||
<span class="inline-flex items-center gap-1.5">
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
fill="none"
|
||||
viewBox="0 0 24 24"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
>
|
||||
<rect x="3" y="3" width="18" height="18" rx="2" ry="2" />
|
||||
<circle cx="8.5" cy="8.5" r="1.5" />
|
||||
<polyline points="21 15 16 10 5 21" />
|
||||
</svg>
|
||||
<span>GENERATE</span>
|
||||
</span>
|
||||
{:else}
|
||||
SEND
|
||||
{/if}
|
||||
</button>
|
||||
</button>
|
||||
{:else}
|
||||
<button
|
||||
type="submit"
|
||||
disabled={!canSend || isEditOnlyWithoutImage}
|
||||
class="px-2.5 sm:px-4 py-1.5 sm:py-2 rounded text-xs sm:text-xs tracking-[0.1em] sm:tracking-[0.15em] uppercase font-medium transition-all duration-200 whitespace-nowrap
|
||||
{!canSend || isEditOnlyWithoutImage
|
||||
? 'bg-exo-medium-gray/50 text-exo-light-gray cursor-not-allowed'
|
||||
: 'bg-exo-yellow text-exo-black hover:bg-exo-yellow-darker hover:shadow-[0_0_20px_rgba(255,215,0,0.3)]'}"
|
||||
aria-label={shouldShowEditMode
|
||||
? "Edit image"
|
||||
: isImageModel()
|
||||
? "Generate image"
|
||||
: "Send message"}
|
||||
>
|
||||
{#if shouldShowEditMode}
|
||||
<span class="inline-flex items-center gap-1.5">
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
fill="none"
|
||||
viewBox="0 0 24 24"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
>
|
||||
<path
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
d="M11 5H6a2 2 0 00-2 2v11a2 2 0 002 2h11a2 2 0 002-2v-5m-1.414-9.414a2 2 0 112.828 2.828L11.828 15H9v-2.828l8.586-8.586z"
|
||||
/>
|
||||
</svg>
|
||||
<span>EDIT</span>
|
||||
</span>
|
||||
{:else if isEditOnlyWithoutImage}
|
||||
<span class="inline-flex items-center gap-1.5">
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
fill="none"
|
||||
viewBox="0 0 24 24"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
>
|
||||
<path
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
d="M11 5H6a2 2 0 00-2 2v11a2 2 0 002 2h11a2 2 0 002-2v-5m-1.414-9.414a2 2 0 112.828 2.828L11.828 15H9v-2.828l8.586-8.586z"
|
||||
/>
|
||||
</svg>
|
||||
<span>EDIT</span>
|
||||
</span>
|
||||
{:else if isImageModel()}
|
||||
<span class="inline-flex items-center gap-1.5">
|
||||
<svg
|
||||
class="w-3.5 h-3.5"
|
||||
fill="none"
|
||||
viewBox="0 0 24 24"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
>
|
||||
<rect x="3" y="3" width="18" height="18" rx="2" ry="2" />
|
||||
<circle cx="8.5" cy="8.5" r="1.5" />
|
||||
<polyline points="21 15 16 10 5 21" />
|
||||
</svg>
|
||||
<span>GENERATE</span>
|
||||
</span>
|
||||
{:else}
|
||||
SEND
|
||||
{/if}
|
||||
</button>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
<!-- Bottom accent line -->
|
||||
|
||||
@@ -173,11 +173,6 @@ export interface PlacementPreviewResponse {
|
||||
previews: PlacementPreview[];
|
||||
}
|
||||
|
||||
interface ImageApiResponse {
|
||||
created: number;
|
||||
data: Array<{ b64_json?: string; url?: string }>;
|
||||
}
|
||||
|
||||
interface RawStateResponse {
|
||||
topology?: RawTopology;
|
||||
instances?: Record<
|
||||
@@ -469,6 +464,7 @@ class AppStore {
|
||||
private previewsInterval: ReturnType<typeof setInterval> | null = null;
|
||||
private lastConversationPersistTs = 0;
|
||||
private previousNodeIds: Set<string> = new Set();
|
||||
private activeAbortController: AbortController | null = null;
|
||||
|
||||
constructor() {
|
||||
if (browser) {
|
||||
@@ -1751,6 +1747,9 @@ class AppStore {
|
||||
const targetConversationId = this.activeConversationId;
|
||||
if (!targetConversationId) return;
|
||||
|
||||
this.activeAbortController = new AbortController();
|
||||
const signal = this.activeAbortController.signal;
|
||||
|
||||
this.isLoading = true;
|
||||
this.currentResponse = "";
|
||||
this.ttftMs = null;
|
||||
@@ -1885,6 +1884,7 @@ class AppStore {
|
||||
temperature: 0.7,
|
||||
stream: true,
|
||||
}),
|
||||
signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
@@ -1980,6 +1980,9 @@ class AppStore {
|
||||
this.persistConversation(targetConversationId);
|
||||
}
|
||||
} catch (error) {
|
||||
if (signal.aborted) {
|
||||
return;
|
||||
}
|
||||
console.error("Error sending message:", error);
|
||||
this.handleStreamingError(
|
||||
error,
|
||||
@@ -1988,6 +1991,7 @@ class AppStore {
|
||||
"Failed to get response",
|
||||
);
|
||||
} finally {
|
||||
this.activeAbortController = null;
|
||||
this.isLoading = false;
|
||||
this.currentResponse = "";
|
||||
this.saveConversationsToStorage();
|
||||
@@ -2008,6 +2012,9 @@ class AppStore {
|
||||
const targetConversationId = this.activeConversationId;
|
||||
if (!targetConversationId) return;
|
||||
|
||||
this.activeAbortController = new AbortController();
|
||||
const signal = this.activeAbortController.signal;
|
||||
|
||||
this.isLoading = true;
|
||||
this.currentResponse = "";
|
||||
|
||||
@@ -2093,6 +2100,7 @@ class AppStore {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify(requestBody),
|
||||
signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
@@ -2100,138 +2108,121 @@ class AppStore {
|
||||
throw new Error(`API error: ${response.status} - ${errorText}`);
|
||||
}
|
||||
|
||||
// Streaming requires both stream=true AND partialImages > 0
|
||||
const isStreaming = params.stream && params.partialImages > 0;
|
||||
const reader = response.body?.getReader();
|
||||
if (!reader) {
|
||||
throw new Error("No response body");
|
||||
}
|
||||
|
||||
if (!isStreaming) {
|
||||
// Non-streaming: parse JSON response directly
|
||||
const jsonResponse = (await response.json()) as ImageApiResponse;
|
||||
const format = params.outputFormat || "png";
|
||||
const mimeType = `image/${format}`;
|
||||
interface ImageGenerationChunk {
|
||||
data?: { b64_json?: string };
|
||||
format?: string;
|
||||
type?: "partial" | "final";
|
||||
image_index?: number;
|
||||
partial_index?: number;
|
||||
total_partials?: number;
|
||||
}
|
||||
|
||||
const attachments: MessageAttachment[] = jsonResponse.data
|
||||
.filter((img) => img.b64_json)
|
||||
.map((img, index) => ({
|
||||
type: "generated-image" as const,
|
||||
name: `generated-image-${index + 1}.${format}`,
|
||||
preview: `data:${mimeType};base64,${img.b64_json}`,
|
||||
mimeType,
|
||||
}));
|
||||
const numImages = params.numImages;
|
||||
|
||||
await this.parseSSEStream<ImageGenerationChunk>(
|
||||
reader,
|
||||
targetConversationId,
|
||||
(parsed) => {
|
||||
const imageData = parsed.data?.b64_json;
|
||||
|
||||
if (imageData) {
|
||||
const format = parsed.format || "png";
|
||||
const mimeType = `image/${format}`;
|
||||
const imageIndex = parsed.image_index ?? 0;
|
||||
|
||||
if (parsed.type === "partial") {
|
||||
// Update with partial image and progress
|
||||
const partialNum = (parsed.partial_index ?? 0) + 1;
|
||||
const totalPartials = parsed.total_partials ?? 3;
|
||||
const progressText =
|
||||
numImages > 1
|
||||
? `Generating image ${imageIndex + 1}/${numImages}... ${partialNum}/${totalPartials}`
|
||||
: `Generating... ${partialNum}/${totalPartials}`;
|
||||
|
||||
const partialAttachment: MessageAttachment = {
|
||||
type: "generated-image",
|
||||
name: `generated-image.${format}`,
|
||||
preview: `data:${mimeType};base64,${imageData}`,
|
||||
mimeType,
|
||||
};
|
||||
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
msg.content = progressText;
|
||||
if (imageIndex === 0) {
|
||||
// First image - safe to replace attachments with partial preview
|
||||
msg.attachments = [partialAttachment];
|
||||
} else {
|
||||
// Subsequent images - keep existing finals, show partial at current position
|
||||
const existingAttachments = msg.attachments || [];
|
||||
// Keep only the completed final images (up to current imageIndex)
|
||||
const finals = existingAttachments.slice(0, imageIndex);
|
||||
msg.attachments = [...finals, partialAttachment];
|
||||
}
|
||||
},
|
||||
);
|
||||
} else if (parsed.type === "final") {
|
||||
// Final image - replace partial at this position
|
||||
const newAttachment: MessageAttachment = {
|
||||
type: "generated-image",
|
||||
name: `generated-image-${imageIndex + 1}.${format}`,
|
||||
preview: `data:${mimeType};base64,${imageData}`,
|
||||
mimeType,
|
||||
};
|
||||
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
if (imageIndex === 0) {
|
||||
// First final image - replace any partial preview
|
||||
msg.attachments = [newAttachment];
|
||||
} else {
|
||||
// Subsequent images - keep previous finals, replace partial at current position
|
||||
const existingAttachments = msg.attachments || [];
|
||||
// Slice keeps indices 0 to imageIndex-1 (the previous final images)
|
||||
const previousFinals = existingAttachments.slice(
|
||||
0,
|
||||
imageIndex,
|
||||
);
|
||||
msg.attachments = [...previousFinals, newAttachment];
|
||||
}
|
||||
|
||||
// Update progress message for multiple images
|
||||
if (numImages > 1 && imageIndex < numImages - 1) {
|
||||
msg.content = `Generating image ${imageIndex + 2}/${numImages}...`;
|
||||
} else {
|
||||
msg.content = "";
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
this.syncActiveMessagesIfNeeded(targetConversationId);
|
||||
}
|
||||
},
|
||||
);
|
||||
} catch (error) {
|
||||
if (signal.aborted) {
|
||||
// Clean up the "Generating image..." message on cancellation
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
msg.content = "";
|
||||
msg.attachments = attachments;
|
||||
msg.content = "Cancelled";
|
||||
msg.attachments = [];
|
||||
},
|
||||
);
|
||||
this.syncActiveMessagesIfNeeded(targetConversationId);
|
||||
} else {
|
||||
// Streaming mode: use SSE parser
|
||||
const reader = response.body?.getReader();
|
||||
if (!reader) {
|
||||
throw new Error("No response body");
|
||||
}
|
||||
|
||||
interface ImageGenerationChunk {
|
||||
data?: { b64_json?: string };
|
||||
format?: string;
|
||||
type?: "partial" | "final";
|
||||
image_index?: number;
|
||||
partial_index?: number;
|
||||
total_partials?: number;
|
||||
}
|
||||
|
||||
const numImages = params.numImages;
|
||||
|
||||
await this.parseSSEStream<ImageGenerationChunk>(
|
||||
reader,
|
||||
targetConversationId,
|
||||
(parsed) => {
|
||||
const imageData = parsed.data?.b64_json;
|
||||
|
||||
if (imageData) {
|
||||
const format = parsed.format || "png";
|
||||
const mimeType = `image/${format}`;
|
||||
const imageIndex = parsed.image_index ?? 0;
|
||||
|
||||
if (parsed.type === "partial") {
|
||||
// Update with partial image and progress
|
||||
const partialNum = (parsed.partial_index ?? 0) + 1;
|
||||
const totalPartials = parsed.total_partials ?? 3;
|
||||
const progressText =
|
||||
numImages > 1
|
||||
? `Generating image ${imageIndex + 1}/${numImages}... ${partialNum}/${totalPartials}`
|
||||
: `Generating... ${partialNum}/${totalPartials}`;
|
||||
|
||||
const partialAttachment: MessageAttachment = {
|
||||
type: "generated-image",
|
||||
name: `generated-image.${format}`,
|
||||
preview: `data:${mimeType};base64,${imageData}`,
|
||||
mimeType,
|
||||
};
|
||||
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
msg.content = progressText;
|
||||
if (imageIndex === 0) {
|
||||
// First image - safe to replace attachments with partial preview
|
||||
msg.attachments = [partialAttachment];
|
||||
} else {
|
||||
// Subsequent images - keep existing finals, show partial at current position
|
||||
const existingAttachments = msg.attachments || [];
|
||||
// Keep only the completed final images (up to current imageIndex)
|
||||
const finals = existingAttachments.slice(0, imageIndex);
|
||||
msg.attachments = [...finals, partialAttachment];
|
||||
}
|
||||
},
|
||||
);
|
||||
} else if (parsed.type === "final") {
|
||||
// Final image - replace partial at this position
|
||||
const newAttachment: MessageAttachment = {
|
||||
type: "generated-image",
|
||||
name: `generated-image-${imageIndex + 1}.${format}`,
|
||||
preview: `data:${mimeType};base64,${imageData}`,
|
||||
mimeType,
|
||||
};
|
||||
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
if (imageIndex === 0) {
|
||||
// First final image - replace any partial preview
|
||||
msg.attachments = [newAttachment];
|
||||
} else {
|
||||
// Subsequent images - keep previous finals, replace partial at current position
|
||||
const existingAttachments = msg.attachments || [];
|
||||
// Slice keeps indices 0 to imageIndex-1 (the previous final images)
|
||||
const previousFinals = existingAttachments.slice(
|
||||
0,
|
||||
imageIndex,
|
||||
);
|
||||
msg.attachments = [...previousFinals, newAttachment];
|
||||
}
|
||||
|
||||
// Update progress message for multiple images
|
||||
if (numImages > 1 && imageIndex < numImages - 1) {
|
||||
msg.content = `Generating image ${imageIndex + 2}/${numImages}...`;
|
||||
} else {
|
||||
msg.content = "";
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
this.syncActiveMessagesIfNeeded(targetConversationId);
|
||||
}
|
||||
},
|
||||
);
|
||||
return;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error generating image:", error);
|
||||
this.handleStreamingError(
|
||||
error,
|
||||
@@ -2240,6 +2231,7 @@ class AppStore {
|
||||
"Failed to generate image",
|
||||
);
|
||||
} finally {
|
||||
this.activeAbortController = null;
|
||||
this.isLoading = false;
|
||||
this.saveConversationsToStorage();
|
||||
}
|
||||
@@ -2263,6 +2255,9 @@ class AppStore {
|
||||
const targetConversationId = this.activeConversationId;
|
||||
if (!targetConversationId) return;
|
||||
|
||||
this.activeAbortController = new AbortController();
|
||||
const signal = this.activeAbortController.signal;
|
||||
|
||||
this.isLoading = true;
|
||||
this.currentResponse = "";
|
||||
|
||||
@@ -2371,6 +2366,7 @@ class AppStore {
|
||||
const apiResponse = await fetch("/v1/images/edits", {
|
||||
method: "POST",
|
||||
body: formData,
|
||||
signal,
|
||||
});
|
||||
|
||||
if (!apiResponse.ok) {
|
||||
@@ -2378,99 +2374,83 @@ class AppStore {
|
||||
throw new Error(`API error: ${apiResponse.status} - ${errorText}`);
|
||||
}
|
||||
|
||||
// Streaming requires both stream=true AND partialImages > 0
|
||||
const isStreaming = params.stream && params.partialImages > 0;
|
||||
const reader = apiResponse.body?.getReader();
|
||||
if (!reader) {
|
||||
throw new Error("No response body");
|
||||
}
|
||||
|
||||
if (!isStreaming) {
|
||||
// Non-streaming: parse JSON response directly
|
||||
const jsonResponse = (await apiResponse.json()) as ImageApiResponse;
|
||||
const format = params.outputFormat || "png";
|
||||
const mimeType = `image/${format}`;
|
||||
const attachments: MessageAttachment[] = jsonResponse.data
|
||||
.filter((img) => img.b64_json)
|
||||
.map((img) => ({
|
||||
type: "generated-image" as const,
|
||||
name: `edited-image.${format}`,
|
||||
preview: `data:${mimeType};base64,${img.b64_json}`,
|
||||
mimeType,
|
||||
}));
|
||||
interface ImageEditChunk {
|
||||
data?: { b64_json?: string };
|
||||
format?: string;
|
||||
type?: "partial" | "final";
|
||||
partial_index?: number;
|
||||
total_partials?: number;
|
||||
}
|
||||
|
||||
await this.parseSSEStream<ImageEditChunk>(
|
||||
reader,
|
||||
targetConversationId,
|
||||
(parsed) => {
|
||||
const imageData = parsed.data?.b64_json;
|
||||
|
||||
if (imageData) {
|
||||
const format = parsed.format || "png";
|
||||
const mimeType = `image/${format}`;
|
||||
if (parsed.type === "partial") {
|
||||
// Update with partial image and progress
|
||||
const partialNum = (parsed.partial_index ?? 0) + 1;
|
||||
const totalPartials = parsed.total_partials ?? 3;
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
msg.content = `Editing... ${partialNum}/${totalPartials}`;
|
||||
msg.attachments = [
|
||||
{
|
||||
type: "generated-image",
|
||||
name: `edited-image.${format}`,
|
||||
preview: `data:${mimeType};base64,${imageData}`,
|
||||
mimeType,
|
||||
},
|
||||
];
|
||||
},
|
||||
);
|
||||
} else if (parsed.type === "final") {
|
||||
// Final image
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
msg.content = "";
|
||||
msg.attachments = [
|
||||
{
|
||||
type: "generated-image",
|
||||
name: `edited-image.${format}`,
|
||||
preview: `data:${mimeType};base64,${imageData}`,
|
||||
mimeType,
|
||||
},
|
||||
];
|
||||
},
|
||||
);
|
||||
}
|
||||
this.syncActiveMessagesIfNeeded(targetConversationId);
|
||||
}
|
||||
},
|
||||
);
|
||||
} catch (error) {
|
||||
if (signal.aborted) {
|
||||
// Clean up the "Editing image..." message on cancellation
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
msg.content = "";
|
||||
msg.attachments = attachments;
|
||||
msg.content = "cancelled";
|
||||
msg.attachments = [];
|
||||
},
|
||||
);
|
||||
this.syncActiveMessagesIfNeeded(targetConversationId);
|
||||
} else {
|
||||
// Streaming mode: use SSE parser
|
||||
const reader = apiResponse.body?.getReader();
|
||||
if (!reader) {
|
||||
throw new Error("No response body");
|
||||
}
|
||||
|
||||
interface ImageEditChunk {
|
||||
data?: { b64_json?: string };
|
||||
format?: string;
|
||||
type?: "partial" | "final";
|
||||
partial_index?: number;
|
||||
total_partials?: number;
|
||||
}
|
||||
|
||||
await this.parseSSEStream<ImageEditChunk>(
|
||||
reader,
|
||||
targetConversationId,
|
||||
(parsed) => {
|
||||
const imageData = parsed.data?.b64_json;
|
||||
|
||||
if (imageData) {
|
||||
const format = parsed.format || "png";
|
||||
const mimeType = `image/${format}`;
|
||||
if (parsed.type === "partial") {
|
||||
// Update with partial image and progress
|
||||
const partialNum = (parsed.partial_index ?? 0) + 1;
|
||||
const totalPartials = parsed.total_partials ?? 3;
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
msg.content = `Editing... ${partialNum}/${totalPartials}`;
|
||||
msg.attachments = [
|
||||
{
|
||||
type: "generated-image",
|
||||
name: `edited-image.${format}`,
|
||||
preview: `data:${mimeType};base64,${imageData}`,
|
||||
mimeType,
|
||||
},
|
||||
];
|
||||
},
|
||||
);
|
||||
} else if (parsed.type === "final") {
|
||||
// Final image
|
||||
this.updateConversationMessage(
|
||||
targetConversationId,
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
msg.content = "";
|
||||
msg.attachments = [
|
||||
{
|
||||
type: "generated-image",
|
||||
name: `edited-image.${format}`,
|
||||
preview: `data:${mimeType};base64,${imageData}`,
|
||||
mimeType,
|
||||
},
|
||||
];
|
||||
},
|
||||
);
|
||||
}
|
||||
this.syncActiveMessagesIfNeeded(targetConversationId);
|
||||
}
|
||||
},
|
||||
);
|
||||
return;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error editing image:", error);
|
||||
this.handleStreamingError(
|
||||
error,
|
||||
@@ -2479,11 +2459,24 @@ class AppStore {
|
||||
"Failed to edit image",
|
||||
);
|
||||
} finally {
|
||||
this.activeAbortController = null;
|
||||
this.isLoading = false;
|
||||
this.saveConversationsToStorage();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel an in-flight request by aborting the active fetch
|
||||
*/
|
||||
cancelRequest(): void {
|
||||
if (this.activeAbortController) {
|
||||
this.activeAbortController.abort();
|
||||
this.activeAbortController = null;
|
||||
}
|
||||
this.isLoading = false;
|
||||
this.currentResponse = "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear current chat and go back to welcome state
|
||||
*/
|
||||
@@ -2620,6 +2613,7 @@ export const editMessage = (messageId: string, newContent: string) =>
|
||||
export const editAndRegenerate = (messageId: string, newContent: string) =>
|
||||
appStore.editAndRegenerate(messageId, newContent);
|
||||
export const regenerateLastResponse = () => appStore.regenerateLastResponse();
|
||||
export const cancelRequest = () => appStore.cancelRequest();
|
||||
|
||||
// Conversation actions
|
||||
export const conversations = () => appStore.conversations;
|
||||
|
||||
65
flake.lock
generated
@@ -21,9 +21,7 @@
|
||||
"nixpkgs"
|
||||
],
|
||||
"purescript-overlay": "purescript-overlay",
|
||||
"pyproject-nix": [
|
||||
"pyproject-nix"
|
||||
]
|
||||
"pyproject-nix": "pyproject-nix"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1765953015,
|
||||
@@ -151,44 +149,19 @@
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"pyproject-build-systems": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
],
|
||||
"pyproject-nix": [
|
||||
"pyproject-nix"
|
||||
],
|
||||
"uv2nix": [
|
||||
"uv2nix"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1763662255,
|
||||
"narHash": "sha256-4bocaOyLa3AfiS8KrWjZQYu+IAta05u3gYZzZ6zXbT0=",
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "build-system-pkgs",
|
||||
"rev": "042904167604c681a090c07eb6967b4dd4dae88c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "build-system-pkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"pyproject-nix": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"dream2nix",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1764134915,
|
||||
"narHash": "sha256-xaKvtPx6YAnA3HQVp5LwyYG1MaN4LLehpQI8xEdBvBY=",
|
||||
"lastModified": 1763017646,
|
||||
"narHash": "sha256-Z+R2lveIp6Skn1VPH3taQIuMhABg1IizJd8oVdmdHsQ=",
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "pyproject.nix",
|
||||
"rev": "2c8df1383b32e5443c921f61224b198a2282a657",
|
||||
"rev": "47bd6f296502842643078d66128f7b5e5370790c",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -205,10 +178,7 @@
|
||||
"flake-parts": "flake-parts",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"nixpkgs-swift": "nixpkgs-swift",
|
||||
"pyproject-build-systems": "pyproject-build-systems",
|
||||
"pyproject-nix": "pyproject-nix",
|
||||
"treefmt-nix": "treefmt-nix",
|
||||
"uv2nix": "uv2nix"
|
||||
"treefmt-nix": "treefmt-nix"
|
||||
}
|
||||
},
|
||||
"rust-analyzer-src": {
|
||||
@@ -269,29 +239,6 @@
|
||||
"repo": "treefmt-nix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"uv2nix": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
],
|
||||
"pyproject-nix": [
|
||||
"pyproject-nix"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1767701098,
|
||||
"narHash": "sha256-CJhKZnWb3gumR9oTRjFvCg/6lYTGbZRU7xtvcyWIRwU=",
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "uv2nix",
|
||||
"rev": "9d357f0d2ce6f5f35ec7959d7e704452352eb4da",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "uv2nix",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
||||
46
flake.nix
@@ -24,26 +24,6 @@
    dream2nix = {
      url = "github:nix-community/dream2nix";
      inputs.nixpkgs.follows = "nixpkgs";
      inputs.pyproject-nix.follows = "pyproject-nix";
    };

    # Python packaging with uv2nix
    pyproject-nix = {
      url = "github:pyproject-nix/pyproject.nix";
      inputs.nixpkgs.follows = "nixpkgs";
    };

    uv2nix = {
      url = "github:pyproject-nix/uv2nix";
      inputs.pyproject-nix.follows = "pyproject-nix";
      inputs.nixpkgs.follows = "nixpkgs";
    };

    pyproject-build-systems = {
      url = "github:pyproject-nix/build-system-pkgs";
      inputs.pyproject-nix.follows = "pyproject-nix";
      inputs.uv2nix.follows = "uv2nix";
      inputs.nixpkgs.follows = "nixpkgs";
    };

    # Pinned nixpkgs for swift-format (swift is broken on x86_64-linux in newer nixpkgs)
@@ -68,7 +48,6 @@
        inputs.treefmt-nix.flakeModule
        ./dashboard/parts.nix
        ./rust/parts.nix
        ./python/parts.nix
      ];

      perSystem =
@@ -79,11 +58,6 @@
          pkgsSwift = import inputs.nixpkgs-swift { inherit system; };
        in
        {
          # Allow unfree for metal-toolchain (needed for Darwin Metal packages)
          _module.args.pkgs = import inputs.nixpkgs {
            inherit system;
            config.allowUnfreePredicate = pkg: (pkg.pname or "") == "metal-toolchain";
          };
          treefmt = {
            projectRootFile = "flake.nix";
            programs = {
@@ -105,24 +79,14 @@
                enable = true;
                package = pkgsSwift.swiftPackages.swift-format;
              };
              shfmt.enable = true;
            };
          };

          packages = lib.optionalAttrs pkgs.stdenv.hostPlatform.isDarwin (
            let
              uvLock = builtins.fromTOML (builtins.readFile ./uv.lock);
              mlxPackage = builtins.head (builtins.filter (p: p.name == "mlx") uvLock.package);
              uvLockMlxVersion = mlxPackage.version;
            in
            {
              metal-toolchain = pkgs.callPackage ./nix/metal-toolchain.nix { };
              mlx = pkgs.callPackage ./nix/mlx.nix {
                metal-toolchain = self'.packages.metal-toolchain;
                inherit uvLockMlxVersion;
              };
            }
          );
          checks.lint = pkgs.runCommand "lint-check" { } ''
            export RUFF_CACHE_DIR="$TMPDIR/ruff-cache"
            ${pkgs.ruff}/bin/ruff check ${inputs.self}/
            touch $out
          '';

          devShells.default = with pkgs; pkgs.mkShell {
            inputsFrom = [ self'.checks.cargo-build ];
2  justfile
@@ -1,7 +1,7 @@
export NIX_CONFIG := "extra-experimental-features = nix-command flakes"

fmt:
    treefmt || nix fmt
    nix fmt

lint:
    uv run ruff check --fix

@@ -1,79 +0,0 @@
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 0ed30932..d8528132 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -177,11 +177,7 @@ if(MLX_BUILD_METAL)
|
||||
add_compile_definitions(MLX_METAL_DEBUG)
|
||||
endif()
|
||||
|
||||
- # Throw an error if xcrun not found
|
||||
- execute_process(
|
||||
- COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
|
||||
- OUTPUT_VARIABLE MACOS_SDK_VERSION
|
||||
- OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)
|
||||
+ set(MACOS_SDK_VERSION @sdkVersion@)
|
||||
|
||||
if(${MACOS_SDK_VERSION} LESS 14.0)
|
||||
message(
|
||||
@@ -199,11 +195,8 @@ if(MLX_BUILD_METAL)
|
||||
endif()
|
||||
set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
|
||||
endif()
|
||||
- execute_process(
|
||||
- COMMAND
|
||||
- zsh "-c"
|
||||
- "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
|
||||
- OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
|
||||
+ set(
|
||||
+ MLX_METAL_VERSION @metalVersion@)
|
||||
FetchContent_Declare(metal_cpp URL ${METAL_CPP_URL})
|
||||
FetchContent_MakeAvailable(metal_cpp)
|
||||
target_include_directories(
|
||||
diff --git a/cmake/extension.cmake b/cmake/extension.cmake
|
||||
index 13db804a..5b385132 100644
|
||||
--- a/cmake/extension.cmake
|
||||
+++ b/cmake/extension.cmake
|
||||
@@ -36,7 +36,7 @@ macro(mlx_build_metallib)
|
||||
add_custom_command(
|
||||
OUTPUT ${MTLLIB_BUILD_TARGET}
|
||||
COMMAND
|
||||
- xcrun -sdk macosx metal
|
||||
+ metal -fmodules-cache-path=${CMAKE_BINARY_DIR}/metal-cache
|
||||
"$<LIST:TRANSFORM,${MTLLIB_INCLUDE_DIRS},PREPEND,-I>"
|
||||
${MTLLIB_COMPILE_OPTIONS} ${MTLLIB_SOURCES} -o ${MTLLIB_BUILD_TARGET}
|
||||
DEPENDS ${MTLLIB_DEPS} ${MTLLIB_SOURCES}
|
||||
diff --git a/mlx/backend/metal/kernels/CMakeLists.txt b/mlx/backend/metal/kernels/CMakeLists.txt
|
||||
index 262b0495..5c7446ad 100644
|
||||
--- a/mlx/backend/metal/kernels/CMakeLists.txt
|
||||
+++ b/mlx/backend/metal/kernels/CMakeLists.txt
|
||||
@@ -29,7 +29,7 @@ function(build_kernel_base TARGET SRCFILE DEPS)
|
||||
"-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
|
||||
endif()
|
||||
add_custom_command(
|
||||
- COMMAND xcrun -sdk macosx metal ${METAL_FLAGS} -c ${SRCFILE}
|
||||
+ COMMAND metal -fmodules-cache-path=${CMAKE_BINARY_DIR}/metal-cache ${METAL_FLAGS} -c ${SRCFILE}
|
||||
-I${PROJECT_SOURCE_DIR} -o ${TARGET}.air
|
||||
DEPENDS ${SRCFILE} ${DEPS} ${BASE_HEADERS}
|
||||
OUTPUT ${TARGET}.air
|
||||
@@ -170,7 +170,7 @@ endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${MLX_METAL_PATH}/mlx.metallib
|
||||
- COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o
|
||||
+ COMMAND metallib ${KERNEL_AIR} -o
|
||||
${MLX_METAL_PATH}/mlx.metallib
|
||||
DEPENDS ${KERNEL_AIR}
|
||||
COMMENT "Building mlx.metallib"
|
||||
diff --git a/mlx/backend/metal/make_compiled_preamble.sh b/mlx/backend/metal/make_compiled_preamble.sh
|
||||
index bb55ed3a..94ea7dd7 100644
|
||||
--- a/mlx/backend/metal/make_compiled_preamble.sh
|
||||
+++ b/mlx/backend/metal/make_compiled_preamble.sh
|
||||
@@ -31,7 +31,7 @@ OUTPUT_FILE=${OUTPUT_DIR}/${SRC_NAME}.cpp
|
||||
mkdir -p "$OUTPUT_DIR"
|
||||
|
||||
# Use the metal compiler to get a list of headers (with depth)
|
||||
-CCC="xcrun -sdk macosx metal -x metal"
|
||||
+CCC="metal -x metal -fmodules-cache-path=${OUTPUT_DIR}/metal-cache"
|
||||
HDRS=$( $CCC -I"$SRC_DIR" -I"$JIT_INCLUDES" -DMLX_METAL_JIT -E -P -CC -C -H "$INPUT_FILE" $CFLAGS -w 2>&1 1>/dev/null )
|
||||
|
||||
# Remove any included system frameworks (for MetalPerformancePrimitive headers)
|
||||
@@ -1,56 +0,0 @@
|
||||
{ lib, stdenvNoCC, requireFile, nix }:
|
||||
|
||||
let
|
||||
narFile = requireFile {
|
||||
name = "metal-toolchain-17C48.nar";
|
||||
message = ''
|
||||
The Metal Toolchain NAR must be available.
|
||||
|
||||
If you have cachix configured for exo.cachix.org, this should be automatic.
|
||||
|
||||
Otherwise:
|
||||
1. Install Xcode 26+ from the App Store
|
||||
2. Run: xcodebuild -downloadComponent MetalToolchain
|
||||
3. Export the toolchain:
|
||||
hdiutil attach "$(find /System/Library/AssetsV2/com_apple_MobileAsset_MetalToolchain -name '*.dmg' | head -1)" -mountpoint /tmp/metal-dmg
|
||||
cp -R /tmp/metal-dmg/Metal.xctoolchain /tmp/metal-export
|
||||
hdiutil detach /tmp/metal-dmg
|
||||
4. Create NAR and add to store:
|
||||
nix nar pack /tmp/metal-export > /tmp/metal-toolchain-17C48.nar
|
||||
nix store add --mode flat /tmp/metal-toolchain-17C48.nar
|
||||
'';
|
||||
hash = "sha256-ayR5mXN4sZAddwKEG2OszGRF93k9ZFc7H0yi2xbylQw=";
|
||||
};
|
||||
in
|
||||
stdenvNoCC.mkDerivation {
|
||||
pname = "metal-toolchain";
|
||||
version = "17C48";
|
||||
|
||||
dontUnpack = true;
|
||||
dontBuild = true;
|
||||
dontFixup = true;
|
||||
|
||||
nativeBuildInputs = [ nix ];
|
||||
|
||||
installPhase = ''
|
||||
runHook preInstall
|
||||
|
||||
nix-store --restore $out < ${narFile}
|
||||
|
||||
# Create bin directory with symlinks for PATH
|
||||
mkdir -p $out/bin
|
||||
ln -s $out/usr/bin/metal $out/bin/metal
|
||||
ln -s $out/usr/bin/metallib $out/bin/metallib
|
||||
|
||||
runHook postInstall
|
||||
'';
|
||||
|
||||
# Metal language version for CMake (from: echo __METAL_VERSION__ | metal -E -x metal -P -)
|
||||
passthru.metalVersion = "400";
|
||||
|
||||
meta = {
|
||||
description = "Apple Metal compiler toolchain";
|
||||
platforms = [ "aarch64-darwin" ];
|
||||
license = lib.licenses.unfree;
|
||||
};
|
||||
}
|
||||
158  nix/mlx.nix
@@ -1,158 +0,0 @@
|
||||
{ stdenv
|
||||
, lib
|
||||
, fetchFromGitHub
|
||||
, replaceVars
|
||||
, fetchzip
|
||||
, cmake
|
||||
, nlohmann_json
|
||||
, apple-sdk_26
|
||||
, metal-toolchain
|
||||
, runCommand
|
||||
, fmt
|
||||
, python313Packages
|
||||
, uvLockMlxVersion
|
||||
}:
|
||||
|
||||
assert stdenv.isDarwin;
|
||||
|
||||
let
|
||||
python = python313Packages.python;
|
||||
|
||||
# Static dependencies included directly during compilation
|
||||
gguf-tools = fetchFromGitHub {
|
||||
owner = "antirez";
|
||||
repo = "gguf-tools";
|
||||
rev = "8fa6eb65236618e28fd7710a0fba565f7faa1848";
|
||||
hash = "sha256-15FvyPOFqTOr5vdWQoPnZz+mYH919++EtghjozDlnSA=";
|
||||
};
|
||||
|
||||
metal_cpp = fetchzip {
|
||||
url = "https://developer.apple.com/metal/cpp/files/metal-cpp_26.zip";
|
||||
hash = "sha256-7n2eI2lw/S+Us6l7YPAATKwcIbRRpaQ8VmES7S8ZjY8=";
|
||||
};
|
||||
|
||||
nanobind = fetchFromGitHub {
|
||||
owner = "wjakob";
|
||||
repo = "nanobind";
|
||||
rev = "v2.10.2";
|
||||
hash = "sha256-io44YhN+VpfHFWyvvLWSanRgbzA0whK8WlDNRi3hahU=";
|
||||
fetchSubmodules = true;
|
||||
};
|
||||
|
||||
mlx = stdenv.mkDerivation rec {
|
||||
pname = "mlx";
|
||||
version = let v = "0.30.4"; in
|
||||
assert v == uvLockMlxVersion || throw "MLX version mismatch: nix/mlx.nix has ${v} but uv.lock has ${uvLockMlxVersion}. Update both the version and hash in nix/mlx.nix.";
|
||||
v;
|
||||
pyproject = true;
|
||||
|
||||
src = fetchFromGitHub {
|
||||
owner = "ml-explore";
|
||||
repo = "mlx";
|
||||
tag = "v${version}";
|
||||
hash = "sha256-OJk6jPlbaSlsUdk3ADz3tWcRzTWXRof3/q8Soe1AO6w=";
|
||||
};
|
||||
|
||||
patches = [
|
||||
(replaceVars ./darwin-build-fixes.patch {
|
||||
sdkVersion = apple-sdk_26.version;
|
||||
metalVersion = metal-toolchain.metalVersion;
|
||||
})
|
||||
];
|
||||
|
||||
postPatch = ''
|
||||
substituteInPlace mlx/backend/cpu/jit_compiler.cpp \
|
||||
--replace-fail "g++" "$CXX"
|
||||
'';
|
||||
|
||||
dontUseCmakeConfigure = true;
|
||||
|
||||
enableParallelBuilding = true;
|
||||
|
||||
# Allows multiple cores to be used in Python builds.
|
||||
postUnpack = ''
|
||||
export MAKEFLAGS+="''${enableParallelBuilding:+-j$NIX_BUILD_CORES}"
|
||||
'';
|
||||
|
||||
# Updates the wrong fetcher rev attribute
|
||||
passthru.skipBulkUpdate = true;
|
||||
|
||||
env = {
|
||||
DEV_RELEASE = 1;
|
||||
CMAKE_ARGS = toString [
|
||||
(lib.cmakeBool "USE_SYSTEM_FMT" true)
|
||||
(lib.cmakeOptionType "filepath" "FETCHCONTENT_SOURCE_DIR_GGUFLIB" "${gguf-tools}")
|
||||
(lib.cmakeOptionType "filepath" "FETCHCONTENT_SOURCE_DIR_JSON" "${nlohmann_json.src}")
|
||||
(lib.cmakeOptionType "filepath" "FETCHCONTENT_SOURCE_DIR_NANOBIND" "${nanobind}")
|
||||
(lib.cmakeBool "FETCHCONTENT_FULLY_DISCONNECTED" true)
|
||||
(lib.cmakeBool "MLX_BUILD_METAL" true)
|
||||
(lib.cmakeOptionType "filepath" "FETCHCONTENT_SOURCE_DIR_METAL_CPP" "${metal_cpp}")
|
||||
(lib.cmakeOptionType "string" "CMAKE_OSX_DEPLOYMENT_TARGET" "${apple-sdk_26.version}")
|
||||
(lib.cmakeOptionType "filepath" "CMAKE_OSX_SYSROOT" "${apple-sdk_26.passthru.sdkroot}")
|
||||
];
|
||||
SDKROOT = apple-sdk_26.passthru.sdkroot;
|
||||
MACOSX_DEPLOYMENT_TARGET = apple-sdk_26.version;
|
||||
};
|
||||
|
||||
build-system = [
|
||||
python313Packages.setuptools
|
||||
];
|
||||
|
||||
nativeBuildInputs = [
|
||||
cmake
|
||||
metal-toolchain
|
||||
python313Packages.pypaBuildHook
|
||||
python313Packages.pypaInstallHook
|
||||
python313Packages.setuptools
|
||||
python313Packages.typing-extensions
|
||||
python313Packages.wheel
|
||||
python313Packages.cmake
|
||||
python313Packages.ninja
|
||||
];
|
||||
|
||||
buildInputs = [
|
||||
fmt
|
||||
gguf-tools
|
||||
python313Packages.nanobind
|
||||
python313Packages.pybind11
|
||||
apple-sdk_26
|
||||
];
|
||||
|
||||
# Tests require Metal GPU access which isn't available in the Nix sandbox.
|
||||
# To run tests, build with: nix build --option sandbox false .#mlx.passthru.tests.mlxTest
|
||||
doCheck = false;
|
||||
|
||||
pythonImportsCheck = [ "mlx" ];
|
||||
|
||||
passthru.tests = {
|
||||
# Runs example scripts to verify MLX works. Requires --option sandbox false
|
||||
# since Metal GPU access is needed.
|
||||
mlxTest =
|
||||
runCommand "run-mlx-examples"
|
||||
{
|
||||
buildInputs = [ mlx ];
|
||||
nativeBuildInputs = [ python ];
|
||||
}
|
||||
''
|
||||
cp ${src}/examples/python/logistic_regression.py .
|
||||
${python.interpreter} logistic_regression.py
|
||||
rm logistic_regression.py
|
||||
|
||||
cp ${src}/examples/python/linear_regression.py .
|
||||
${python.interpreter} linear_regression.py
|
||||
rm linear_regression.py
|
||||
|
||||
touch $out
|
||||
'';
|
||||
};
|
||||
|
||||
meta = {
|
||||
homepage = "https://github.com/ml-explore/mlx";
|
||||
description = "Array framework for Apple silicon";
|
||||
changelog = "https://github.com/ml-explore/mlx/releases/tag/${src.tag}";
|
||||
license = lib.licenses.mit;
|
||||
platforms = [ "aarch64-darwin" ];
|
||||
};
|
||||
};
|
||||
in
|
||||
mlx
|
||||
@@ -17,9 +17,9 @@ dependencies = [
    "loguru>=0.7.3",
    "exo_pyo3_bindings", # rust bindings
    "anyio==4.11.0",
    "mlx==0.30.4; sys_platform == 'darwin'",
    "mlx[cpu]==0.30.4; sys_platform == 'linux'",
    "mlx-lm",
    "mlx==0.30.3; sys_platform == 'darwin'",
    "mlx[cpu]==0.30.3; sys_platform == 'linux'",
    "mlx-lm @ git+https://github.com/AlexCheema/mlx-lm.git@fix-transformers-5.0.0rc2",
    "tiktoken>=0.12.0", # required for kimi k2 tokenizer
    "hypercorn>=0.18.0",
    "openai-harmony>=0.0.8",
@@ -63,7 +63,6 @@ members = [

[tool.uv.sources]
exo_pyo3_bindings = { workspace = true }
mlx-lm = { git = "https://github.com/ml-explore/mlx-lm", branch = "main" }
# Uncomment to use local mlx/mlx-lm development versions:
# mlx = { path = "/Users/Shared/mlx", editable=true }
# mlx-lm = { path = "/Users/Shared/mlx-lm", editable=true }

@@ -1,93 +0,0 @@
|
||||
{ inputs, ... }:
|
||||
{
|
||||
perSystem =
|
||||
{ config, self', pkgs, lib, system, ... }:
|
||||
let
|
||||
# Load workspace from uv.lock
|
||||
workspace = inputs.uv2nix.lib.workspace.loadWorkspace {
|
||||
workspaceRoot = inputs.self;
|
||||
};
|
||||
|
||||
# Create overlay from workspace
|
||||
# Use wheels from PyPI for most packages; we override mlx with our pure Nix Metal build
|
||||
overlay = workspace.mkPyprojectOverlay { sourcePreference = "wheel"; };
|
||||
|
||||
# Override overlay to inject Nix-built components
|
||||
exoOverlay = final: prev: {
|
||||
# Replace workspace exo_pyo3_bindings with Nix-built wheel
|
||||
exo-pyo3-bindings = pkgs.stdenv.mkDerivation {
|
||||
pname = "exo-pyo3-bindings";
|
||||
version = "0.1.0";
|
||||
src = self'.packages.exo_pyo3_bindings;
|
||||
# Install from pre-built wheel
|
||||
nativeBuildInputs = [ final.pyprojectWheelHook ];
|
||||
dontStrip = true;
|
||||
};
|
||||
};
|
||||
|
||||
python = pkgs.python313;
|
||||
|
||||
# Overlay to provide build systems and custom packages
|
||||
buildSystemsOverlay = final: prev: {
|
||||
# Use our pure Nix-built MLX with Metal support
|
||||
mlx = self'.packages.mlx;
|
||||
|
||||
# mlx-lm is a git dependency that needs setuptools
|
||||
mlx-lm = prev.mlx-lm.overrideAttrs (old: {
|
||||
nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [
|
||||
final.setuptools
|
||||
];
|
||||
});
|
||||
};
|
||||
|
||||
pythonSet = (pkgs.callPackage inputs.pyproject-nix.build.packages {
|
||||
inherit python;
|
||||
}).overrideScope (
|
||||
lib.composeManyExtensions [
|
||||
inputs.pyproject-build-systems.overlays.default
|
||||
overlay
|
||||
exoOverlay
|
||||
buildSystemsOverlay
|
||||
]
|
||||
);
|
||||
exoVenv = pythonSet.mkVirtualEnv "exo-env" workspace.deps.default;
|
||||
|
||||
# Virtual environment with dev dependencies for testing
|
||||
testVenv = pythonSet.mkVirtualEnv "exo-test-env" (
|
||||
workspace.deps.default // {
|
||||
exo = [ "dev" ]; # Include pytest, pytest-asyncio, pytest-env
|
||||
}
|
||||
);
|
||||
|
||||
exoPackage = pkgs.runCommand "exo"
|
||||
{
|
||||
nativeBuildInputs = [ pkgs.makeWrapper ];
|
||||
}
|
||||
''
|
||||
mkdir -p $out/bin
|
||||
|
||||
# Create wrapper scripts
|
||||
for script in exo exo-master exo-worker; do
|
||||
makeWrapper ${exoVenv}/bin/$script $out/bin/$script \
|
||||
--set DASHBOARD_DIR ${self'.packages.dashboard}
|
||||
done
|
||||
'';
|
||||
in
|
||||
{
|
||||
# Python package only available on macOS (requires MLX/Metal)
|
||||
packages = lib.optionalAttrs pkgs.stdenv.hostPlatform.isDarwin {
|
||||
exo = exoPackage;
|
||||
# Test environment for running pytest outside of Nix sandbox (needs GPU access)
|
||||
exo-test-env = testVenv;
|
||||
};
|
||||
|
||||
checks = {
|
||||
# Ruff linting (works on all platforms)
|
||||
lint = pkgs.runCommand "ruff-lint" { } ''
|
||||
export RUFF_CACHE_DIR="$TMPDIR/ruff-cache"
|
||||
${pkgs.ruff}/bin/ruff check ${inputs.self}/
|
||||
touch $out
|
||||
'';
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -121,20 +121,11 @@ async def ensure_models_dir() -> Path:
|
||||
|
||||
|
||||
async def delete_model(model_id: ModelId) -> bool:
|
||||
models_dir = await ensure_models_dir()
|
||||
model_dir = models_dir / model_id.normalize()
|
||||
cache_dir = models_dir / "caches" / model_id.normalize()
|
||||
|
||||
deleted = False
|
||||
if await aios.path.exists(model_dir):
|
||||
await asyncio.to_thread(shutil.rmtree, model_dir, ignore_errors=False)
|
||||
deleted = True
|
||||
|
||||
# Also clear cache
|
||||
if await aios.path.exists(cache_dir):
|
||||
await asyncio.to_thread(shutil.rmtree, cache_dir, ignore_errors=False)
|
||||
|
||||
return deleted
|
||||
model_dir = await ensure_models_dir() / model_id.normalize()
|
||||
if not await aios.path.exists(model_dir):
|
||||
return False
|
||||
await asyncio.to_thread(shutil.rmtree, model_dir, ignore_errors=False)
|
||||
return True
|
||||
|
||||
|
||||
async def seed_models(seed_dir: str | Path):
|
||||
@@ -160,28 +151,16 @@ async def fetch_file_list_with_cache(
|
||||
target_dir = (await ensure_models_dir()) / "caches" / model_id.normalize()
|
||||
await aios.makedirs(target_dir, exist_ok=True)
|
||||
cache_file = target_dir / f"{model_id.normalize()}--{revision}--file_list.json"
|
||||
|
||||
# Always try fresh first
|
||||
try:
|
||||
file_list = await fetch_file_list_with_retry(
|
||||
model_id, revision, recursive=recursive
|
||||
)
|
||||
# Update cache with fresh data
|
||||
async with aiofiles.open(cache_file, "w") as f:
|
||||
await f.write(
|
||||
TypeAdapter(list[FileListEntry]).dump_json(file_list).decode()
|
||||
)
|
||||
return file_list
|
||||
except Exception as e:
|
||||
# Fetch failed - try cache fallback
|
||||
if await aios.path.exists(cache_file):
|
||||
logger.warning(
|
||||
f"Failed to fetch file list for {model_id}, using cached data: {e}"
|
||||
)
|
||||
async with aiofiles.open(cache_file, "r") as f:
|
||||
return TypeAdapter(list[FileListEntry]).validate_json(await f.read())
|
||||
# No cache available, propagate the error
|
||||
raise
|
||||
if await aios.path.exists(cache_file):
|
||||
async with aiofiles.open(cache_file, "r") as f:
|
||||
return TypeAdapter(list[FileListEntry]).validate_json(await f.read())
|
||||
file_list = await fetch_file_list_with_retry(
|
||||
model_id, revision, recursive=recursive
|
||||
)
|
||||
await aios.makedirs(cache_file.parent, exist_ok=True)
|
||||
async with aiofiles.open(cache_file, "w") as f:
|
||||
await f.write(TypeAdapter(list[FileListEntry]).dump_json(file_list).decode())
|
||||
return file_list
|
||||
|
||||
|
||||
async def fetch_file_list_with_retry(
|
||||
@@ -353,28 +332,8 @@ async def _download_file(
|
||||
target_dir: Path,
|
||||
on_progress: Callable[[int, int, bool], None] = lambda _, __, ___: None,
|
||||
) -> Path:
|
||||
target_path = target_dir / path
|
||||
|
||||
if await aios.path.exists(target_path):
|
||||
local_size = (await aios.stat(target_path)).st_size
|
||||
|
||||
# Try to verify against remote, but allow offline operation
|
||||
try:
|
||||
remote_size, _ = await file_meta(model_id, revision, path)
|
||||
if local_size != remote_size:
|
||||
logger.info(
|
||||
f"File {path} size mismatch (local={local_size}, remote={remote_size}), re-downloading"
|
||||
)
|
||||
await aios.remove(target_path)
|
||||
else:
|
||||
return target_path
|
||||
except Exception as e:
|
||||
# Offline or network error - trust local file
|
||||
logger.debug(
|
||||
f"Could not verify {path} against remote (offline?): {e}, using local file"
|
||||
)
|
||||
return target_path
|
||||
|
||||
if await aios.path.exists(target_dir / path):
|
||||
return target_dir / path
|
||||
await aios.makedirs((target_dir / path).parent, exist_ok=True)
|
||||
length, etag = await file_meta(model_id, revision, path)
|
||||
remote_hash = etag[:-5] if etag.endswith("-gzip") else etag
|
||||
@@ -583,26 +542,17 @@ async def download_shard(
|
||||
async def on_progress_wrapper(
|
||||
file: FileListEntry, curr_bytes: int, total_bytes: int, is_renamed: bool
|
||||
) -> None:
|
||||
previous_progress = file_progress.get(file.path)
|
||||
|
||||
# Detect re-download: curr_bytes < previous downloaded means file was deleted and restarted
|
||||
is_redownload = (
|
||||
previous_progress is not None
|
||||
and curr_bytes < previous_progress.downloaded.in_bytes
|
||||
start_time = (
|
||||
file_progress[file.path].start_time
|
||||
if file.path in file_progress
|
||||
else time.time()
|
||||
)
|
||||
downloaded_this_session = (
|
||||
file_progress[file.path].downloaded_this_session.in_bytes
|
||||
+ (curr_bytes - file_progress[file.path].downloaded.in_bytes)
|
||||
if file.path in file_progress
|
||||
else curr_bytes
|
||||
)
|
||||
|
||||
if is_redownload or previous_progress is None:
|
||||
# Fresh download or re-download: reset tracking
|
||||
start_time = time.time()
|
||||
downloaded_this_session = curr_bytes
|
||||
else:
|
||||
# Continuing download: accumulate
|
||||
start_time = previous_progress.start_time
|
||||
downloaded_this_session = (
|
||||
previous_progress.downloaded_this_session.in_bytes
|
||||
+ (curr_bytes - previous_progress.downloaded.in_bytes)
|
||||
)
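        # Numeric illustration (annotation, not part of the diff; same scenario as the
        # tests further down): if 500_000 bytes were already downloaded and the callback
        # now reports curr_bytes=600_000, this is a resume, so downloaded_this_session
        # accumulates to 600_000 and start_time is preserved; if it instead reports
        # curr_bytes=100_000 (< 500_000), the file was deleted and re-downloaded, so
        # tracking resets to 100_000 with a fresh start_time.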
|
||||
|
||||
speed = (
|
||||
downloaded_this_session / (time.time() - start_time)
|
||||
if time.time() - start_time > 0
|
||||
|
||||
@@ -21,7 +21,7 @@ def exo_shard_downloader(max_parallel_downloads: int = 8) -> ShardDownloader:
|
||||
|
||||
|
||||
async def build_base_shard(model_id: ModelId) -> ShardMetadata:
|
||||
model_card = await ModelCard.load(model_id)
|
||||
model_card = await ModelCard.from_hf(model_id)
|
||||
return PipelineShardMetadata(
|
||||
model_card=model_card,
|
||||
device_rank=0,
|
||||
@@ -166,8 +166,9 @@ class ResumableShardDownloader(ShardDownloader):
|
||||
for task in asyncio.as_completed(tasks):
|
||||
try:
|
||||
yield await task
|
||||
# TODO: except Exception
|
||||
except Exception as e:
|
||||
logger.warning(f"Error downloading shard: {type(e).__name__}")
|
||||
logger.error("Error downloading shard:", e)
|
||||
|
||||
async def get_shard_download_status_for_shard(
|
||||
self, shard: ShardMetadata
|
||||
|
||||
@@ -1,451 +0,0 @@
|
||||
"""Tests for download verification and cache behavior."""
|
||||
|
||||
import time
|
||||
from collections.abc import AsyncIterator
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import aiofiles
|
||||
import aiofiles.os as aios
|
||||
import pytest
|
||||
from pydantic import TypeAdapter
|
||||
|
||||
from exo.download.download_utils import (
|
||||
delete_model,
|
||||
fetch_file_list_with_cache,
|
||||
)
|
||||
from exo.shared.types.common import ModelId
|
||||
from exo.shared.types.memory import Memory
|
||||
from exo.shared.types.worker.downloads import FileListEntry, RepoFileDownloadProgress
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def model_id() -> ModelId:
|
||||
return ModelId("test-org/test-model")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
async def temp_models_dir(tmp_path: Path) -> AsyncIterator[Path]:
|
||||
"""Set up a temporary models directory for testing."""
|
||||
models_dir = tmp_path / "models"
|
||||
await aios.makedirs(models_dir, exist_ok=True)
|
||||
with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
|
||||
yield models_dir
|
||||
|
||||
|
||||
class TestFileVerification:
|
||||
"""Tests for file size verification in _download_file."""
|
||||
|
||||
async def test_redownload_when_file_size_changes_upstream(
|
||||
self, model_id: ModelId, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test that files with mismatched sizes are re-downloaded."""
|
||||
# Import inside test to allow patching
|
||||
from exo.download.download_utils import (
|
||||
_download_file, # pyright: ignore[reportPrivateUsage]
|
||||
)
|
||||
|
||||
target_dir = tmp_path / "downloads"
|
||||
await aios.makedirs(target_dir, exist_ok=True)
|
||||
|
||||
# Create a local file with wrong size
|
||||
local_file = target_dir / "test.safetensors"
|
||||
async with aiofiles.open(local_file, "wb") as f:
|
||||
await f.write(b"local content") # 13 bytes
|
||||
|
||||
remote_size = 1000 # Different from local
|
||||
remote_hash = "abc123"
|
||||
|
||||
with (
|
||||
patch(
|
||||
"exo.download.download_utils.file_meta",
|
||||
new_callable=AsyncMock,
|
||||
return_value=(remote_size, remote_hash),
|
||||
) as mock_file_meta,
|
||||
patch(
|
||||
"exo.download.download_utils.create_http_session"
|
||||
) as mock_session_factory,
|
||||
):
|
||||
# Set up mock HTTP response for re-download
|
||||
mock_response = MagicMock()
|
||||
mock_response.status = 200
|
||||
mock_response.content.read = AsyncMock( # pyright: ignore[reportAny]
|
||||
side_effect=[b"x" * remote_size, b""]
|
||||
)
|
||||
|
||||
mock_session = MagicMock()
|
||||
mock_session.get.return_value.__aenter__ = AsyncMock( # pyright: ignore[reportAny]
|
||||
return_value=mock_response
|
||||
)
|
||||
mock_session.get.return_value.__aexit__ = AsyncMock( # pyright: ignore[reportAny]
|
||||
return_value=None
|
||||
)
|
||||
mock_session_factory.return_value.__aenter__ = AsyncMock( # pyright: ignore[reportAny]
|
||||
return_value=mock_session
|
||||
)
|
||||
mock_session_factory.return_value.__aexit__ = AsyncMock( # pyright: ignore[reportAny]
|
||||
return_value=None
|
||||
)
|
||||
|
||||
# Mock calc_hash to return the expected hash
|
||||
with patch(
|
||||
"exo.download.download_utils.calc_hash",
|
||||
new_callable=AsyncMock,
|
||||
return_value=remote_hash,
|
||||
):
|
||||
await _download_file(model_id, "main", "test.safetensors", target_dir)
|
||||
|
||||
# file_meta should be called twice: once for verification, once for download
|
||||
assert mock_file_meta.call_count == 2
|
||||
|
||||
async def test_skip_download_when_file_size_matches(
|
||||
self, model_id: ModelId, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test that files with matching sizes are not re-downloaded."""
|
||||
from exo.download.download_utils import (
|
||||
_download_file, # pyright: ignore[reportPrivateUsage]
|
||||
)
|
||||
|
||||
target_dir = tmp_path / "downloads"
|
||||
await aios.makedirs(target_dir, exist_ok=True)
|
||||
|
||||
# Create a local file
|
||||
local_file = target_dir / "test.safetensors"
|
||||
local_content = b"local content"
|
||||
async with aiofiles.open(local_file, "wb") as f:
|
||||
await f.write(local_content)
|
||||
|
||||
remote_size = len(local_content) # Same as local
|
||||
remote_hash = "abc123"
|
||||
|
||||
with (
|
||||
patch(
|
||||
"exo.download.download_utils.file_meta",
|
||||
new_callable=AsyncMock,
|
||||
return_value=(remote_size, remote_hash),
|
||||
) as mock_file_meta,
|
||||
patch(
|
||||
"exo.download.download_utils.create_http_session"
|
||||
) as mock_session_factory,
|
||||
):
|
||||
result = await _download_file(
|
||||
model_id, "main", "test.safetensors", target_dir
|
||||
)
|
||||
|
||||
# Should return immediately without downloading
|
||||
assert result == local_file
|
||||
mock_file_meta.assert_called_once()
|
||||
mock_session_factory.assert_not_called()
|
||||
|
||||
async def test_offline_fallback_uses_local_file(
|
||||
self, model_id: ModelId, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test that local files are used when network is unavailable."""
|
||||
from exo.download.download_utils import (
|
||||
_download_file, # pyright: ignore[reportPrivateUsage]
|
||||
)
|
||||
|
||||
target_dir = tmp_path / "downloads"
|
||||
await aios.makedirs(target_dir, exist_ok=True)
|
||||
|
||||
# Create a local file
|
||||
local_file = target_dir / "test.safetensors"
|
||||
async with aiofiles.open(local_file, "wb") as f:
|
||||
await f.write(b"local content")
|
||||
|
||||
with (
|
||||
patch(
|
||||
"exo.download.download_utils.file_meta",
|
||||
new_callable=AsyncMock,
|
||||
side_effect=Exception("Network error"),
|
||||
),
|
||||
patch(
|
||||
"exo.download.download_utils.create_http_session"
|
||||
) as mock_session_factory,
|
||||
):
|
||||
result = await _download_file(
|
||||
model_id, "main", "test.safetensors", target_dir
|
||||
)
|
||||
|
||||
# Should return local file without attempting download
|
||||
assert result == local_file
|
||||
mock_session_factory.assert_not_called()
|
||||
|
||||
|
||||
class TestFileListCache:
|
||||
"""Tests for file list caching behavior."""
|
||||
|
||||
async def test_fetch_fresh_and_update_cache(
|
||||
self, model_id: ModelId, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test that fresh data is fetched and cache is updated."""
|
||||
models_dir = tmp_path / "models"
|
||||
|
||||
file_list = [
|
||||
FileListEntry(type="file", path="model.safetensors", size=1000),
|
||||
FileListEntry(type="file", path="config.json", size=100),
|
||||
]
|
||||
|
||||
with (
|
||||
patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir),
|
||||
patch(
|
||||
"exo.download.download_utils.fetch_file_list_with_retry",
|
||||
new_callable=AsyncMock,
|
||||
return_value=file_list,
|
||||
) as mock_fetch,
|
||||
):
|
||||
result = await fetch_file_list_with_cache(model_id, "main")
|
||||
|
||||
assert result == file_list
|
||||
mock_fetch.assert_called_once()
|
||||
|
||||
# Verify cache was written
|
||||
cache_file = (
|
||||
models_dir
|
||||
/ "caches"
|
||||
/ model_id.normalize()
|
||||
/ f"{model_id.normalize()}--main--file_list.json"
|
||||
)
|
||||
assert await aios.path.exists(cache_file)
|
||||
|
||||
async with aiofiles.open(cache_file, "r") as f:
|
||||
cached_data = TypeAdapter(list[FileListEntry]).validate_json(
|
||||
await f.read()
|
||||
)
|
||||
assert cached_data == file_list
|
||||
|
||||
async def test_fallback_to_cache_when_fetch_fails(
|
||||
self, model_id: ModelId, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test that cached data is used when fetch fails."""
|
||||
models_dir = tmp_path / "models"
|
||||
cache_dir = models_dir / "caches" / model_id.normalize()
|
||||
await aios.makedirs(cache_dir, exist_ok=True)
|
||||
|
||||
# Create cache file
|
||||
cached_file_list = [
|
||||
FileListEntry(type="file", path="model.safetensors", size=1000),
|
||||
]
|
||||
cache_file = cache_dir / f"{model_id.normalize()}--main--file_list.json"
|
||||
async with aiofiles.open(cache_file, "w") as f:
|
||||
await f.write(
|
||||
TypeAdapter(list[FileListEntry]).dump_json(cached_file_list).decode()
|
||||
)
|
||||
|
||||
with (
|
||||
patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir),
|
||||
patch(
|
||||
"exo.download.download_utils.fetch_file_list_with_retry",
|
||||
new_callable=AsyncMock,
|
||||
side_effect=Exception("Network error"),
|
||||
),
|
||||
):
|
||||
result = await fetch_file_list_with_cache(model_id, "main")
|
||||
|
||||
assert result == cached_file_list
|
||||
|
||||
async def test_error_propagates_when_no_cache(
|
||||
self, model_id: ModelId, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test that errors propagate when fetch fails and no cache exists."""
|
||||
models_dir = tmp_path / "models"
|
||||
|
||||
with (
|
||||
patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir),
|
||||
patch(
|
||||
"exo.download.download_utils.fetch_file_list_with_retry",
|
||||
new_callable=AsyncMock,
|
||||
side_effect=Exception("Network error"),
|
||||
),
|
||||
pytest.raises(Exception, match="Network error"),
|
||||
):
|
||||
await fetch_file_list_with_cache(model_id, "main")
|
||||
|
||||
|
||||
class TestModelDeletion:
|
||||
"""Tests for model deletion including cache cleanup."""
|
||||
|
||||
async def test_delete_model_clears_cache(
|
||||
self, model_id: ModelId, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test that deleting a model also deletes its cache."""
|
||||
models_dir = tmp_path / "models"
|
||||
model_dir = models_dir / model_id.normalize()
|
||||
cache_dir = models_dir / "caches" / model_id.normalize()
|
||||
|
||||
# Create model and cache directories
|
||||
await aios.makedirs(model_dir, exist_ok=True)
|
||||
await aios.makedirs(cache_dir, exist_ok=True)
|
||||
|
||||
# Add some files
|
||||
async with aiofiles.open(model_dir / "model.safetensors", "w") as f:
|
||||
await f.write("model data")
|
||||
async with aiofiles.open(cache_dir / "file_list.json", "w") as f:
|
||||
await f.write("[]")
|
||||
|
||||
with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
|
||||
result = await delete_model(model_id)
|
||||
|
||||
assert result is True
|
||||
assert not await aios.path.exists(model_dir)
|
||||
assert not await aios.path.exists(cache_dir)
|
||||
|
||||
async def test_delete_model_only_cache_exists(
|
||||
self, model_id: ModelId, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test deleting when only cache exists (model already deleted)."""
|
||||
models_dir = tmp_path / "models"
|
||||
cache_dir = models_dir / "caches" / model_id.normalize()
|
||||
|
||||
# Only create cache directory
|
||||
await aios.makedirs(cache_dir, exist_ok=True)
|
||||
async with aiofiles.open(cache_dir / "file_list.json", "w") as f:
|
||||
await f.write("[]")
|
||||
|
||||
with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
|
||||
result = await delete_model(model_id)
|
||||
|
||||
# Returns False because model dir didn't exist
|
||||
assert result is False
|
||||
# But cache should still be cleaned up
|
||||
assert not await aios.path.exists(cache_dir)
|
||||
|
||||
async def test_delete_nonexistent_model(
|
||||
self, model_id: ModelId, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test deleting a model that doesn't exist."""
|
||||
models_dir = tmp_path / "models"
|
||||
await aios.makedirs(models_dir, exist_ok=True)
|
||||
|
||||
with patch("exo.download.download_utils.EXO_MODELS_DIR", models_dir):
|
||||
result = await delete_model(model_id)
|
||||
|
||||
assert result is False
|
||||
|
||||
|
||||
class TestProgressResetOnRedownload:
|
||||
"""Tests for progress tracking when files are re-downloaded."""
|
||||
|
||||
async def test_progress_resets_correctly_on_redownload(
|
||||
self, model_id: ModelId
|
||||
) -> None:
|
||||
"""Test that progress tracking resets when a file is re-downloaded.
|
||||
|
||||
When a file is deleted and re-downloaded (due to size mismatch),
|
||||
the progress tracking should reset rather than calculating negative
|
||||
downloaded_this_session values.
|
||||
"""
|
||||
# Simulate file_progress dict as it exists in download_shard
|
||||
file_progress: dict[str, RepoFileDownloadProgress] = {}
|
||||
|
||||
# Initialize with old file progress (simulating existing large file)
|
||||
old_file_size = 1_500_000_000 # 1.5 GB
|
||||
file_progress["model.safetensors"] = RepoFileDownloadProgress(
|
||||
repo_id=model_id,
|
||||
repo_revision="main",
|
||||
file_path="model.safetensors",
|
||||
downloaded=Memory.from_bytes(old_file_size),
|
||||
downloaded_this_session=Memory.from_bytes(0),
|
||||
total=Memory.from_bytes(old_file_size),
|
||||
speed=0,
|
||||
eta=timedelta(0),
|
||||
status="not_started",
|
||||
start_time=time.time() - 10, # Started 10 seconds ago
|
||||
)
|
||||
|
||||
# Simulate the logic from on_progress_wrapper after re-download starts
|
||||
# This is the exact logic from the fixed on_progress_wrapper
|
||||
curr_bytes = 100_000 # 100 KB - new download just started
|
||||
previous_progress = file_progress.get("model.safetensors")
|
||||
|
||||
# Detect re-download: curr_bytes < previous downloaded
|
||||
is_redownload = (
|
||||
previous_progress is not None
|
||||
and curr_bytes < previous_progress.downloaded.in_bytes
|
||||
)
|
||||
|
||||
if is_redownload or previous_progress is None:
|
||||
# Fresh download or re-download: reset tracking
|
||||
start_time = time.time()
|
||||
downloaded_this_session = curr_bytes
|
||||
else:
|
||||
# Continuing download: accumulate
|
||||
start_time = previous_progress.start_time
|
||||
downloaded_this_session = (
|
||||
previous_progress.downloaded_this_session.in_bytes
|
||||
+ (curr_bytes - previous_progress.downloaded.in_bytes)
|
||||
)
|
||||
|
||||
# Key assertions
|
||||
assert is_redownload is True, "Should detect re-download scenario"
|
||||
assert downloaded_this_session == curr_bytes, (
|
||||
"downloaded_this_session should equal curr_bytes on re-download"
|
||||
)
|
||||
assert downloaded_this_session > 0, (
|
||||
"downloaded_this_session should be positive, not negative"
|
||||
)
|
||||
|
||||
# Calculate speed (should be positive)
|
||||
elapsed = time.time() - start_time
|
||||
speed = downloaded_this_session / elapsed if elapsed > 0 else 0
|
||||
assert speed >= 0, "Speed should be non-negative"
|
||||
|
||||
async def test_progress_accumulates_on_continuing_download(
|
||||
self, model_id: ModelId
|
||||
) -> None:
|
||||
"""Test that progress accumulates correctly for continuing downloads.
|
||||
|
||||
When a download continues from where it left off (resume),
|
||||
the progress should accumulate correctly.
|
||||
"""
|
||||
file_progress: dict[str, RepoFileDownloadProgress] = {}
|
||||
|
||||
# Initialize with partial download progress
|
||||
initial_downloaded = 500_000 # 500 KB already downloaded
|
||||
start_time = time.time() - 5 # Started 5 seconds ago
|
||||
file_progress["model.safetensors"] = RepoFileDownloadProgress(
|
||||
repo_id=model_id,
|
||||
repo_revision="main",
|
||||
file_path="model.safetensors",
|
||||
downloaded=Memory.from_bytes(initial_downloaded),
|
||||
downloaded_this_session=Memory.from_bytes(initial_downloaded),
|
||||
total=Memory.from_bytes(1_000_000),
|
||||
speed=100_000,
|
||||
eta=timedelta(seconds=5),
|
||||
status="in_progress",
|
||||
start_time=start_time,
|
||||
)
|
||||
|
||||
# Progress callback with more bytes downloaded
|
||||
curr_bytes = 600_000 # 600 KB - continuing download
|
||||
previous_progress = file_progress.get("model.safetensors")
|
||||
|
||||
# This is NOT a re-download (curr_bytes > previous downloaded)
|
||||
is_redownload = (
|
||||
previous_progress is not None
|
||||
and curr_bytes < previous_progress.downloaded.in_bytes
|
||||
)
|
||||
|
||||
if is_redownload or previous_progress is None:
|
||||
downloaded_this_session = curr_bytes
|
||||
used_start_time = time.time()
|
||||
else:
|
||||
used_start_time = previous_progress.start_time
|
||||
downloaded_this_session = (
|
||||
previous_progress.downloaded_this_session.in_bytes
|
||||
+ (curr_bytes - previous_progress.downloaded.in_bytes)
|
||||
)
|
||||
|
||||
# Key assertions
|
||||
assert is_redownload is False, (
|
||||
"Should NOT detect re-download for continuing download"
|
||||
)
|
||||
assert used_start_time == start_time, "Should preserve original start_time"
|
||||
expected_session = initial_downloaded + (curr_bytes - initial_downloaded)
|
||||
assert downloaded_this_session == expected_session, (
|
||||
f"Should accumulate: {downloaded_this_session} == {expected_session}"
|
||||
)
|
||||
assert downloaded_this_session == 600_000, (
|
||||
"downloaded_this_session should equal total downloaded so far"
|
||||
)
|
||||
@@ -65,9 +65,7 @@ from exo.shared.types.api import (
|
||||
StartDownloadParams,
|
||||
StartDownloadResponse,
|
||||
StreamingChoiceResponse,
|
||||
StreamOptions,
|
||||
ToolCall,
|
||||
Usage,
|
||||
)
|
||||
from exo.shared.types.chunks import (
|
||||
ErrorChunk,
|
||||
@@ -90,6 +88,7 @@ from exo.shared.types.commands import (
|
||||
PlaceInstance,
|
||||
SendInputChunk,
|
||||
StartDownload,
|
||||
TaskCancelled,
|
||||
TaskFinished,
|
||||
)
|
||||
from exo.shared.types.common import CommandId, Id, NodeId, SessionId
|
||||
@@ -115,9 +114,7 @@ def _format_to_content_type(image_format: Literal["png", "jpeg", "webp"] | None)
|
||||
|
||||
|
||||
def chunk_to_response(
|
||||
chunk: TokenChunk | ToolCallChunk,
|
||||
command_id: CommandId,
|
||||
usage: Usage | None,
|
||||
chunk: TokenChunk | ToolCallChunk, command_id: CommandId
|
||||
) -> ChatCompletionResponse:
|
||||
return ChatCompletionResponse(
|
||||
id=command_id,
|
||||
@@ -142,10 +139,21 @@ def chunk_to_response(
|
||||
finish_reason=chunk.finish_reason,
|
||||
)
|
||||
],
|
||||
usage=usage,
|
||||
)
|
||||


async def resolve_model_card(model_id: ModelId) -> ModelCard:
    if model_id in MODEL_CARDS:
        model_card = MODEL_CARDS[model_id]
        return model_card

    for card in MODEL_CARDS.values():
        if card.model_id == ModelId(model_id):
            return card

    return await ModelCard.from_hf(model_id)
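
# Illustrative usage (annotation, not part of the diff): resolution order is a short
# alias key in MODEL_CARDS, then a full model_id match, then a Hugging Face lookup.
# Assuming the cards defined elsewhere in this change:
#   await resolve_model_card(ModelId("llama-3.1-8b"))                                   # alias key hit
#   await resolve_model_card(ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"))  # model_id match
#   await resolve_model_card(ModelId("some-org/unlisted-model"))                        # hypothetical id, falls back to ModelCard.from_hf
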
class API:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -267,7 +275,7 @@ class API:
|
||||
|
||||
async def place_instance(self, payload: PlaceInstanceParams):
|
||||
command = PlaceInstance(
|
||||
model_card=await ModelCard.load(payload.model_id),
|
||||
model_card=await resolve_model_card(payload.model_id),
|
||||
sharding=payload.sharding,
|
||||
instance_meta=payload.instance_meta,
|
||||
min_nodes=payload.min_nodes,
|
||||
@@ -284,7 +292,7 @@ class API:
|
||||
self, payload: CreateInstanceParams
|
||||
) -> CreateInstanceResponse:
|
||||
instance = payload.instance
|
||||
model_card = await ModelCard.load(instance.shard_assignments.model_id)
|
||||
model_card = await resolve_model_card(instance.shard_assignments.model_id)
|
||||
required_memory = model_card.storage_size
|
||||
available_memory = self._calculate_total_available_memory()
|
||||
|
||||
@@ -312,7 +320,7 @@ class API:
|
||||
instance_meta: InstanceMeta = InstanceMeta.MlxRing,
|
||||
min_nodes: int = 1,
|
||||
) -> Instance:
|
||||
model_card = await ModelCard.load(model_id)
|
||||
model_card = await resolve_model_card(model_id)
|
||||
|
||||
try:
|
||||
placements = get_instance_placements(
|
||||
@@ -501,24 +509,21 @@ class API:
|
||||
break
|
||||
|
||||
except anyio.get_cancelled_exc_class():
|
||||
# TODO: TaskCancelled
|
||||
"""
|
||||
self.command_sender.send_nowait(
|
||||
ForwarderCommand(origin=self.node_id, command=command)
|
||||
)
|
||||
"""
|
||||
command = TaskCancelled(cancelled_command_id=command_id)
|
||||
with anyio.CancelScope(shield=True):
|
||||
await self.command_sender.send(
|
||||
ForwarderCommand(origin=self.node_id, command=command)
|
||||
)
|
||||
raise
|
||||
finally:
|
||||
command = TaskFinished(finished_command_id=command_id)
|
||||
await self._send(command)
|
||||
await self._send(TaskFinished(finished_command_id=command_id))
|
||||
if command_id in self._chat_completion_queues:
|
||||
del self._chat_completion_queues[command_id]
|
||||
|
||||
async def _generate_chat_stream(
|
||||
self, command_id: CommandId, stream_options: StreamOptions | None = None
|
||||
self, command_id: CommandId
|
||||
) -> AsyncGenerator[str, None]:
|
||||
"""Generate chat completion stream as JSON strings."""
|
||||
include_usage = stream_options.include_usage if stream_options else False
|
||||
|
||||
async for chunk in self._chat_chunk_stream(command_id):
|
||||
assert not isinstance(chunk, ImageChunk)
|
||||
@@ -534,10 +539,8 @@ class API:
|
||||
yield "data: [DONE]\n\n"
|
||||
return
|
||||
|
||||
usage = chunk.usage if include_usage else None
|
||||
|
||||
chunk_response: ChatCompletionResponse = chunk_to_response(
|
||||
chunk, command_id, usage=usage
|
||||
chunk, command_id
|
||||
)
|
||||
logger.debug(f"chunk_response: {chunk_response}")
|
||||
|
||||
@@ -553,9 +556,8 @@ class API:
|
||||
|
||||
text_parts: list[str] = []
|
||||
tool_calls: list[ToolCall] = []
|
||||
model: ModelId | None = None
|
||||
model: str | None = None
|
||||
finish_reason: FinishReason | None = None
|
||||
usage: Usage | None = None
|
||||
|
||||
async for chunk in self._chat_chunk_stream(command_id):
|
||||
if isinstance(chunk, ErrorChunk):
|
||||
@@ -580,9 +582,6 @@ class API:
|
||||
for i, tool in enumerate(chunk.tool_calls)
|
||||
)
|
||||
|
||||
if chunk.usage is not None:
|
||||
usage = chunk.usage
|
||||
|
||||
if chunk.finish_reason is not None:
|
||||
finish_reason = chunk.finish_reason
|
||||
|
||||
@@ -604,7 +603,6 @@ class API:
|
||||
finish_reason=finish_reason,
|
||||
)
|
||||
],
|
||||
usage=usage,
|
||||
)
|
||||
|
||||
async def _collect_chat_completion_with_stats(
|
||||
@@ -612,7 +610,7 @@ class API:
|
||||
) -> BenchChatCompletionResponse:
|
||||
text_parts: list[str] = []
|
||||
tool_calls: list[ToolCall] = []
|
||||
model: ModelId | None = None
|
||||
model: str | None = None
|
||||
finish_reason: FinishReason | None = None
|
||||
|
||||
stats: GenerationStats | None = None
|
||||
@@ -665,7 +663,7 @@ class API:
|
||||
)
|
||||
return resp
|
||||
|
||||
async def _trigger_notify_user_to_download_model(self, model_id: ModelId) -> None:
|
||||
async def _trigger_notify_user_to_download_model(self, model_id: str) -> None:
|
||||
logger.warning(
|
||||
"TODO: we should send a notification to the user to download the model"
|
||||
)
|
||||
@@ -674,7 +672,7 @@ class API:
|
||||
self, payload: ChatCompletionTaskParams
|
||||
) -> ChatCompletionResponse | StreamingResponse:
|
||||
"""Handle chat completions, supporting both streaming and non-streaming responses."""
|
||||
model_card = await ModelCard.load(ModelId(payload.model))
|
||||
model_card = await resolve_model_card(ModelId(payload.model))
|
||||
payload.model = model_card.model_id
|
||||
|
||||
if not any(
|
||||
@@ -692,7 +690,7 @@ class API:
|
||||
await self._send(command)
|
||||
if payload.stream:
|
||||
return StreamingResponse(
|
||||
self._generate_chat_stream(command.command_id, payload.stream_options),
|
||||
self._generate_chat_stream(command.command_id),
|
||||
media_type="text/event-stream",
|
||||
)
|
||||
|
||||
@@ -701,7 +699,7 @@ class API:
|
||||
async def bench_chat_completions(
|
||||
self, payload: BenchChatCompletionTaskParams
|
||||
) -> BenchChatCompletionResponse:
|
||||
model_card = await ModelCard.load(ModelId(payload.model))
|
||||
model_card = await resolve_model_card(ModelId(payload.model))
|
||||
payload.model = model_card.model_id
|
||||
|
||||
if not any(
|
||||
@@ -721,12 +719,12 @@ class API:
|
||||
response = await self._collect_chat_completion_with_stats(command.command_id)
|
||||
return response
|
||||
|
||||
async def _validate_image_model(self, model: ModelId) -> ModelId:
|
||||
async def _validate_image_model(self, model: str) -> ModelId:
|
||||
"""Validate model exists and return resolved model ID.
|
||||
|
||||
Raises HTTPException 404 if no instance is found for the model.
|
||||
"""
|
||||
model_card = await ModelCard.load(model)
|
||||
model_card = await resolve_model_card(ModelId(model))
|
||||
resolved_model = model_card.model_id
|
||||
if not any(
|
||||
instance.shard_assignments.model_id == resolved_model
|
||||
@@ -772,7 +770,7 @@ class API:
|
||||
When stream=True and partial_images > 0, returns a StreamingResponse
|
||||
with SSE-formatted events for partial and final images.
|
||||
"""
|
||||
payload.model = await self._validate_image_model(ModelId(payload.model))
|
||||
payload.model = await self._validate_image_model(payload.model)
|
||||
|
||||
command = ImageGeneration(
|
||||
request_params=payload,
|
||||
@@ -902,6 +900,11 @@ class API:
|
||||
del image_metadata[key]
|
||||
|
||||
except anyio.get_cancelled_exc_class():
|
||||
command = TaskCancelled(cancelled_command_id=command_id)
|
||||
with anyio.CancelScope(shield=True):
|
||||
await self.command_sender.send(
|
||||
ForwarderCommand(origin=self.node_id, command=command)
|
||||
)
|
||||
raise
|
||||
finally:
|
||||
await self._send(TaskFinished(finished_command_id=command_id))
|
||||
@@ -983,6 +986,11 @@ class API:
|
||||
|
||||
return (images, stats if capture_stats else None)
|
||||
except anyio.get_cancelled_exc_class():
|
||||
command = TaskCancelled(cancelled_command_id=command_id)
|
||||
with anyio.CancelScope(shield=True):
|
||||
await self.command_sender.send(
|
||||
ForwarderCommand(origin=self.node_id, command=command)
|
||||
)
|
||||
raise
|
||||
finally:
|
||||
await self._send(TaskFinished(finished_command_id=command_id))
|
||||
@@ -1017,7 +1025,7 @@ class API:
|
||||
async def bench_image_generations(
|
||||
self, request: Request, payload: BenchImageGenerationTaskParams
|
||||
) -> BenchImageGenerationResponse:
|
||||
payload.model = await self._validate_image_model(ModelId(payload.model))
|
||||
payload.model = await self._validate_image_model(payload.model)
|
||||
|
||||
payload.stream = False
|
||||
payload.partial_images = 0
|
||||
@@ -1038,7 +1046,7 @@ class API:
|
||||
self,
|
||||
image: UploadFile,
|
||||
prompt: str,
|
||||
model: ModelId,
|
||||
model: str,
|
||||
n: int,
|
||||
size: str,
|
||||
response_format: Literal["url", "b64_json"],
|
||||
@@ -1133,7 +1141,7 @@ class API:
|
||||
command = await self._send_image_edits_command(
|
||||
image=image,
|
||||
prompt=prompt,
|
||||
model=ModelId(model),
|
||||
model=model,
|
||||
n=n,
|
||||
size=size,
|
||||
response_format=response_format,
|
||||
@@ -1189,7 +1197,7 @@ class API:
|
||||
command = await self._send_image_edits_command(
|
||||
image=image,
|
||||
prompt=prompt,
|
||||
model=ModelId(model),
|
||||
model=model,
|
||||
n=n,
|
||||
size=size,
|
||||
response_format=response_format,
|
||||
|
||||
@@ -21,6 +21,7 @@ from exo.shared.types.commands import (
|
||||
PlaceInstance,
|
||||
RequestEventLog,
|
||||
SendInputChunk,
|
||||
TaskCancelled,
|
||||
TaskFinished,
|
||||
TestCommand,
|
||||
)
|
||||
@@ -35,6 +36,7 @@ from exo.shared.types.events import (
|
||||
NodeTimedOut,
|
||||
TaskCreated,
|
||||
TaskDeleted,
|
||||
TaskStatusUpdated,
|
||||
)
|
||||
from exo.shared.types.state import State
|
||||
from exo.shared.types.tasks import (
|
||||
@@ -246,7 +248,7 @@ class Master:
|
||||
case DeleteInstance():
|
||||
placement = delete_instance(command, self.state.instances)
|
||||
transition_events = get_transition_events(
|
||||
self.state.instances, placement
|
||||
self.state.instances, placement, self.state.tasks
|
||||
)
|
||||
generated_events.extend(transition_events)
|
||||
case PlaceInstance():
|
||||
@@ -258,7 +260,7 @@ class Master:
|
||||
self.state.node_network,
|
||||
)
|
||||
transition_events = get_transition_events(
|
||||
self.state.instances, placement
|
||||
self.state.instances, placement, self.state.tasks
|
||||
)
|
||||
generated_events.extend(transition_events)
|
||||
case CreateInstance():
|
||||
@@ -268,7 +270,7 @@ class Master:
|
||||
self.state.instances,
|
||||
)
|
||||
transition_events = get_transition_events(
|
||||
self.state.instances, placement
|
||||
self.state.instances, placement, self.state.tasks
|
||||
)
|
||||
generated_events.extend(transition_events)
|
||||
case SendInputChunk(chunk=chunk):
|
||||
@@ -278,6 +280,18 @@ class Master:
|
||||
chunk=chunk,
|
||||
)
|
||||
)
|
||||
case TaskCancelled():
|
||||
if (
|
||||
task_id := self.command_task_mapping.get(
|
||||
command.cancelled_command_id
|
||||
)
|
||||
) is not None:
|
||||
generated_events.append(
|
||||
TaskStatusUpdated(
|
||||
task_status=TaskStatus.Cancelled,
|
||||
task_id=task_id,
|
||||
)
|
||||
)
|
||||
case TaskFinished():
|
||||
generated_events.append(
|
||||
TaskDeleted(
|
||||
@@ -286,10 +300,9 @@ class Master:
|
||||
]
|
||||
)
|
||||
)
|
||||
if command.finished_command_id in self.command_task_mapping:
|
||||
del self.command_task_mapping[
|
||||
command.finished_command_id
|
||||
]
|
||||
self.command_task_mapping.pop(
|
||||
command.finished_command_id, None
|
||||
)
|
||||
case RequestEventLog():
|
||||
# We should just be able to send everything, since other buffers will ignore old messages
|
||||
for i in range(command.since_idx, len(self._event_log)):
|
||||
|
||||
@@ -20,9 +20,15 @@ from exo.shared.types.commands import (
|
||||
PlaceInstance,
|
||||
)
|
||||
from exo.shared.types.common import NodeId
|
||||
from exo.shared.types.events import Event, InstanceCreated, InstanceDeleted
|
||||
from exo.shared.types.events import (
|
||||
Event,
|
||||
InstanceCreated,
|
||||
InstanceDeleted,
|
||||
TaskStatusUpdated,
|
||||
)
|
||||
from exo.shared.types.memory import Memory
|
||||
from exo.shared.types.profiling import MemoryUsage, NodeNetworkInfo
|
||||
from exo.shared.types.tasks import Task, TaskId, TaskStatus
|
||||
from exo.shared.types.worker.instances import (
|
||||
Instance,
|
||||
InstanceId,
|
||||
@@ -180,6 +186,7 @@ def delete_instance(
|
||||
def get_transition_events(
|
||||
current_instances: Mapping[InstanceId, Instance],
|
||||
target_instances: Mapping[InstanceId, Instance],
|
||||
tasks: Mapping[TaskId, Task],
|
||||
) -> Sequence[Event]:
|
||||
events: list[Event] = []
|
||||
|
||||
@@ -195,6 +202,18 @@ def get_transition_events(
|
||||
# find instances to delete
|
||||
for instance_id in current_instances:
|
||||
if instance_id not in target_instances:
|
||||
for task in tasks.values():
|
||||
if task.instance_id == instance_id and task.task_status in [
|
||||
TaskStatus.Pending,
|
||||
TaskStatus.Running,
|
||||
]:
|
||||
events.append(
|
||||
TaskStatusUpdated(
|
||||
task_status=TaskStatus.Cancelled,
|
||||
task_id=task.task_id,
|
||||
)
|
||||
)
|
||||
|
||||
events.append(
|
||||
InstanceDeleted(
|
||||
instance_id=instance_id,
|
||||
|
||||
@@ -216,8 +216,6 @@ def get_node_id_keypair(
    Obtains the :class:`Keypair` associated with this node-ID.
    Obtain the :class:`PeerId` from it.
    """
    # TODO(evan): bring back node id persistence once we figure out how to deal with duplicates
    return Keypair.generate_ed25519()


def lock_path(path: str | bytes | PathLike[str] | PathLike[bytes]) -> Path:
    return Path(str(path) + ".lock")
|
||||
@@ -1,5 +1,5 @@
|
||||
from enum import Enum
|
||||
from typing import Annotated, Any
|
||||
from typing import Annotated
|
||||
|
||||
import aiofiles
|
||||
import aiofiles.os as aios
|
||||
@@ -7,14 +7,7 @@ import tomlkit
|
||||
from anyio import Path, open_file
|
||||
from huggingface_hub import model_info
|
||||
from loguru import logger
|
||||
from pydantic import (
|
||||
AliasChoices,
|
||||
BaseModel,
|
||||
Field,
|
||||
PositiveInt,
|
||||
field_validator,
|
||||
model_validator,
|
||||
)
|
||||
from pydantic import BaseModel, Field, PositiveInt, field_validator
|
||||
|
||||
from exo.shared.constants import EXO_ENABLE_IMAGE_MODELS
|
||||
from exo.shared.types.common import ModelId
|
||||
@@ -128,14 +121,6 @@ MODEL_CARDS: dict[str, ModelCard] = {
|
||||
supports_tensor=True,
|
||||
tasks=[ModelTask.TextGeneration],
|
||||
),
|
||||
"kimi-k2.5": ModelCard(
|
||||
model_id=ModelId("mlx-community/Kimi-K2.5"),
|
||||
storage_size=Memory.from_gb(617),
|
||||
n_layers=61,
|
||||
hidden_size=7168,
|
||||
supports_tensor=True,
|
||||
tasks=[ModelTask.TextGeneration],
|
||||
),
|
||||
# llama-3.1
|
||||
"llama-3.1-8b": ModelCard(
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"),
|
||||
@@ -428,9 +413,9 @@ MODEL_CARDS: dict[str, ModelCard] = {
|
||||
),
|
||||
}
|
||||
|
||||
_IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
_IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
"flux1-schnell": ModelCard(
|
||||
model_id=ModelId("exolabs/FLUX.1-schnell"),
|
||||
model_id=ModelId("black-forest-labs/FLUX.1-schnell"),
|
||||
storage_size=Memory.from_bytes(23782357120 + 9524621312),
|
||||
n_layers=57,
|
||||
hidden_size=1,
|
||||
@@ -443,7 +428,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
storage_size=Memory.from_kb(0),
|
||||
n_layers=12,
|
||||
can_shard=False,
|
||||
safetensors_index_filename=None,
|
||||
safetensors_index_filename=None, # Single file
|
||||
),
|
||||
ComponentInfo(
|
||||
component_name="text_encoder_2",
|
||||
@@ -457,7 +442,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
component_name="transformer",
|
||||
component_path="transformer/",
|
||||
storage_size=Memory.from_bytes(23782357120),
|
||||
n_layers=57,
|
||||
n_layers=57, # 19 transformer_blocks + 38 single_transformer_blocks
|
||||
can_shard=True,
|
||||
safetensors_index_filename="diffusion_pytorch_model.safetensors.index.json",
|
||||
),
|
||||
@@ -472,7 +457,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
],
|
||||
),
|
||||
"flux1-dev": ModelCard(
|
||||
model_id=ModelId("exolabs/FLUX.1-dev"),
|
||||
model_id=ModelId("black-forest-labs/FLUX.1-dev"),
|
||||
storage_size=Memory.from_bytes(23782357120 + 9524621312),
|
||||
n_layers=57,
|
||||
hidden_size=1,
|
||||
@@ -485,7 +470,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
storage_size=Memory.from_kb(0),
|
||||
n_layers=12,
|
||||
can_shard=False,
|
||||
safetensors_index_filename=None,
|
||||
safetensors_index_filename=None, # Single file
|
||||
),
|
||||
ComponentInfo(
|
||||
component_name="text_encoder_2",
|
||||
@@ -499,7 +484,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
component_name="transformer",
|
||||
component_path="transformer/",
|
||||
storage_size=Memory.from_bytes(23802816640),
|
||||
n_layers=57,
|
||||
n_layers=57, # 19 transformer_blocks + 38 single_transformer_blocks
|
||||
can_shard=True,
|
||||
safetensors_index_filename="diffusion_pytorch_model.safetensors.index.json",
|
||||
),
|
||||
@@ -514,7 +499,7 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
],
|
||||
),
|
||||
"flux1-krea-dev": ModelCard(
|
||||
model_id=ModelId("exolabs/FLUX.1-Krea-dev"),
|
||||
model_id=ModelId("black-forest-labs/FLUX.1-Krea-dev"),
|
||||
storage_size=Memory.from_bytes(23802816640 + 9524621312), # Same as dev
|
||||
n_layers=57,
|
||||
hidden_size=1,
|
||||
@@ -556,9 +541,9 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
],
|
||||
),
|
||||
"qwen-image": ModelCard(
|
||||
model_id=ModelId("exolabs/Qwen-Image"),
|
||||
model_id=ModelId("Qwen/Qwen-Image"),
|
||||
storage_size=Memory.from_bytes(16584333312 + 40860802176),
|
||||
n_layers=60,
|
||||
n_layers=60, # Qwen has 60 transformer blocks (all joint-style)
|
||||
hidden_size=1,
|
||||
supports_tensor=False,
|
||||
tasks=[ModelTask.TextToImage],
|
||||
@@ -566,10 +551,10 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
ComponentInfo(
|
||||
component_name="text_encoder",
|
||||
component_path="text_encoder/",
|
||||
storage_size=Memory.from_bytes(16584333312),
|
||||
storage_size=Memory.from_kb(16584333312),
|
||||
n_layers=12,
|
||||
can_shard=False,
|
||||
safetensors_index_filename=None,
|
||||
safetensors_index_filename=None, # Single file
|
||||
),
|
||||
ComponentInfo(
|
||||
component_name="transformer",
|
||||
@@ -590,9 +575,9 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
],
|
||||
),
|
||||
"qwen-image-edit-2509": ModelCard(
|
||||
model_id=ModelId("exolabs/Qwen-Image-Edit-2509"),
|
||||
model_id=ModelId("Qwen/Qwen-Image-Edit-2509"),
|
||||
storage_size=Memory.from_bytes(16584333312 + 40860802176),
|
||||
n_layers=60,
|
||||
n_layers=60, # Qwen has 60 transformer blocks (all joint-style)
|
||||
hidden_size=1,
|
||||
supports_tensor=False,
|
||||
tasks=[ModelTask.ImageToImage],
|
||||
@@ -600,10 +585,10 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
ComponentInfo(
|
||||
component_name="text_encoder",
|
||||
component_path="text_encoder/",
|
||||
storage_size=Memory.from_bytes(16584333312),
|
||||
storage_size=Memory.from_kb(16584333312),
|
||||
n_layers=12,
|
||||
can_shard=False,
|
||||
safetensors_index_filename=None,
|
||||
safetensors_index_filename=None, # Single file
|
||||
),
|
||||
ComponentInfo(
|
||||
component_name="transformer",
|
||||
@@ -625,92 +610,6 @@ _IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _generate_image_model_quant_variants(
    base_name: str,
    base_card: ModelCard,
) -> dict[str, ModelCard]:
    """Create quantized variants of an image model card.

    Only the transformer component is quantized; text encoders stay at bf16.
    Sizes are calculated exactly from the base card's component sizes.
    """
    if base_card.components is None:
        raise ValueError(f"Image model {base_name} must have components defined")

    # quantizations = [8, 6, 5, 4, 3]
    quantizations = [8, 4]

    num_transformer_bytes = next(
        c.storage_size.in_bytes
        for c in base_card.components
        if c.component_name == "transformer"
    )

    transformer_bytes = Memory.from_bytes(num_transformer_bytes)

    remaining_bytes = Memory.from_bytes(
        sum(
            c.storage_size.in_bytes
            for c in base_card.components
            if c.component_name != "transformer"
        )
    )

    def with_transformer_size(new_size: Memory) -> list[ComponentInfo]:
        assert base_card.components is not None
        return [
            ComponentInfo(
                component_name=c.component_name,
                component_path=c.component_path,
                storage_size=new_size
                if c.component_name == "transformer"
                else c.storage_size,
                n_layers=c.n_layers,
                can_shard=c.can_shard,
                safetensors_index_filename=c.safetensors_index_filename,
            )
            for c in base_card.components
        ]

    variants = {
        base_name: ModelCard(
            model_id=base_card.model_id,
            storage_size=transformer_bytes + remaining_bytes,
            n_layers=base_card.n_layers,
            hidden_size=base_card.hidden_size,
            supports_tensor=base_card.supports_tensor,
            tasks=base_card.tasks,
            components=with_transformer_size(transformer_bytes),
        )
    }

    for quant in quantizations:
        quant_transformer_bytes = Memory.from_bytes(
            (num_transformer_bytes * quant) // 16
        )
        total_bytes = remaining_bytes + quant_transformer_bytes

        model_id = ModelId(base_card.model_id + f"-{quant}bit")

        variants[f"{base_name}-{quant}bit"] = ModelCard(
            model_id=model_id,
            storage_size=total_bytes,
            n_layers=base_card.n_layers,
            hidden_size=base_card.hidden_size,
            supports_tensor=base_card.supports_tensor,
            tasks=base_card.tasks,
            components=with_transformer_size(quant_transformer_bytes),
        )

    return variants


_image_model_cards: dict[str, ModelCard] = {}
for _base_name, _base_card in _IMAGE_BASE_MODEL_CARDS.items():
    _image_model_cards |= _generate_image_model_quant_variants(_base_name, _base_card)
_IMAGE_MODEL_CARDS = _image_model_cards

if EXO_ENABLE_IMAGE_MODELS:
    MODEL_CARDS.update(_IMAGE_MODEL_CARDS)
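
To make the size arithmetic above concrete, here is a small standalone sketch (not part of the diff) using the flux1-schnell numbers from this file: the bf16 transformer is scaled by quant/16 while the text encoders keep their full size.

transformer_bytes = 23782357120  # flux1-schnell transformer component at bf16
encoder_bytes = 9524621312       # text encoders, never quantized

for quant in (8, 4):
    quant_transformer = (transformer_bytes * quant) // 16
    total = encoder_bytes + quant_transformer
    print(f"{quant}bit: transformer={quant_transformer:,} total={total:,}")
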
@@ -718,18 +617,15 @@ if EXO_ENABLE_IMAGE_MODELS:
|
||||
class ConfigData(BaseModel):
|
||||
model_config = {"extra": "ignore"} # Allow unknown fields
|
||||
|
||||
architectures: list[str] | None = None
|
||||
# Common field names for number of layers across different architectures
|
||||
num_hidden_layers: Annotated[int, Field(ge=0)] | None = None
|
||||
num_layers: Annotated[int, Field(ge=0)] | None = None
|
||||
n_layer: Annotated[int, Field(ge=0)] | None = None
|
||||
n_layers: Annotated[int, Field(ge=0)] | None = None # Sometimes used
|
||||
num_decoder_layers: Annotated[int, Field(ge=0)] | None = None # Transformer models
|
||||
decoder_layers: Annotated[int, Field(ge=0)] | None = None # Some architectures
|
||||
hidden_size: Annotated[int, Field(ge=0)] | None = None
|
||||
layer_count: int = Field(
|
||||
validation_alias=AliasChoices(
|
||||
"num_hidden_layers",
|
||||
"num_layers",
|
||||
"n_layer",
|
||||
"n_layers",
|
||||
"num_decoder_layers",
|
||||
"decoder_layers",
|
||||
)
|
||||
)
|
||||
architectures: list[str] | None = None
|
||||
|
||||
@property
|
||||
def supports_tensor(self) -> bool:
|
||||
@@ -744,27 +640,25 @@ class ConfigData(BaseModel):
|
||||
["GptOssForCausalLM"],
|
||||
]
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def defer_to_text_config(cls, data: dict[str, Any]):
|
||||
text_config = data.get("text_config")
|
||||
if text_config is None:
|
||||
return data
|
||||
@property
|
||||
def layer_count(self) -> int:
|
||||
# Check common field names for layer count
|
||||
layer_fields = [
|
||||
self.num_hidden_layers,
|
||||
self.num_layers,
|
||||
self.n_layer,
|
||||
self.n_layers,
|
||||
self.num_decoder_layers,
|
||||
self.decoder_layers,
|
||||
]
|
||||
|
||||
for field in [
|
||||
"architectures",
|
||||
"hidden_size",
|
||||
"num_hidden_layers",
|
||||
"num_layers",
|
||||
"n_layer",
|
||||
"n_layers",
|
||||
"num_decoder_layers",
|
||||
"decoder_layers",
|
||||
]:
|
||||
if (val := text_config.get(field)) is not None: # pyright: ignore[reportAny]
|
||||
data[field] = val
|
||||
for layer_count in layer_fields:
|
||||
if layer_count is not None:
|
||||
return layer_count
|
||||
|
||||
return data
|
||||
raise ValueError(
|
||||
f"No layer count found in config.json: {self.model_dump_json()}"
|
||||
)
|
||||
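
A minimal sketch of how the AliasChoices-based layer_count field above consumes a raw config.json dict, assuming pydantic v2 (the class name below is hypothetical): whichever aliased key is present fills the single field, and a missing count fails validation.

from pydantic import AliasChoices, BaseModel, Field

class ConfigSketch(BaseModel):  # reduced stand-in for ConfigData
    model_config = {"extra": "ignore"}
    layer_count: int = Field(
        validation_alias=AliasChoices(
            "num_hidden_layers", "num_layers", "n_layer",
            "n_layers", "num_decoder_layers", "decoder_layers",
        )
    )

assert ConfigSketch.model_validate({"num_hidden_layers": 32}).layer_count == 32
assert ConfigSketch.model_validate({"n_layer": 24, "vocab_size": 1}).layer_count == 24
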
|
||||
|
||||
async def get_config_data(model_id: ModelId) -> ConfigData:
|
||||
|
||||
@@ -8,7 +8,7 @@ from multiprocessing.synchronize import Event as EventT
|
||||
from multiprocessing.synchronize import Semaphore as SemaphoreT
|
||||
|
||||
from loguru import logger
|
||||
from pytest import LogCaptureFixture, mark
|
||||
from pytest import LogCaptureFixture
|
||||
|
||||
from exo.routing.router import get_node_id_keypair
|
||||
from exo.shared.constants import EXO_NODE_ID_KEYPAIR
|
||||
@@ -74,7 +74,6 @@ def _delete_if_exists(p: str | bytes | os.PathLike[str] | os.PathLike[bytes]):
|
||||
os.remove(p)
|
||||
|
||||
|
||||
@mark.skip(reason="this functionality is currently disabled but may return in future")
|
||||
def test_node_id_fetching(caplog: LogCaptureFixture):
|
||||
reps = 10
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ from exo.shared.types.common import CommandId, NodeId
|
||||
from exo.shared.types.memory import Memory
|
||||
from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
|
||||
from exo.shared.types.worker.shards import Sharding, ShardMetadata
|
||||
from exo.utils.pydantic_ext import CamelCaseModel, ConfigDict, TaggedModel
|
||||
from exo.utils.pydantic_ext import CamelCaseModel
|
||||
|
||||
FinishReason = Literal[
|
||||
"stop", "length", "tool_calls", "content_filter", "function_call", "error"
|
||||
@@ -116,8 +116,8 @@ class Usage(BaseModel):
|
||||
prompt_tokens: int
|
||||
completion_tokens: int
|
||||
total_tokens: int
|
||||
prompt_tokens_details: PromptTokensDetails
|
||||
completion_tokens_details: CompletionTokensDetails
|
||||
prompt_tokens_details: PromptTokensDetails | None = None
|
||||
completion_tokens_details: CompletionTokensDetails | None = None
|
||||
|
||||
|
||||
class StreamingChoiceResponse(BaseModel):
|
||||
@@ -170,13 +170,7 @@ class BenchChatCompletionResponse(ChatCompletionResponse):
|
||||
generation_stats: GenerationStats | None = None
|
||||
|
||||
|
||||
class StreamOptions(BaseModel):
|
||||
include_usage: bool = False
|
||||
|
||||
|
||||
class ChatCompletionTaskParams(TaggedModel):
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
|
||||
class ChatCompletionTaskParams(BaseModel):
|
||||
model: str
|
||||
frequency_penalty: float | None = None
|
||||
messages: list[ChatCompletionMessage]
|
||||
@@ -190,7 +184,6 @@ class ChatCompletionTaskParams(TaggedModel):
|
||||
seed: int | None = None
|
||||
stop: str | list[str] | None = None
|
||||
stream: bool = False
|
||||
stream_options: StreamOptions | None = None
|
||||
temperature: float | None = None
|
||||
top_p: float | None = None
|
||||
tools: list[dict[str, Any]] | None = None
|
||||
|
||||
@@ -2,7 +2,7 @@ from collections.abc import Generator
|
||||
from typing import Any, Literal
|
||||
|
||||
from exo.shared.models.model_cards import ModelId
|
||||
from exo.shared.types.api import GenerationStats, ImageGenerationStats, Usage
|
||||
from exo.shared.types.api import GenerationStats, ImageGenerationStats
|
||||
from exo.utils.pydantic_ext import TaggedModel
|
||||
|
||||
from .api import FinishReason
|
||||
@@ -17,7 +17,6 @@ class BaseChunk(TaggedModel):
|
||||
class TokenChunk(BaseChunk):
|
||||
text: str
|
||||
token_id: int
|
||||
usage: Usage | None
|
||||
finish_reason: Literal["stop", "length", "content_filter"] | None = None
|
||||
stats: GenerationStats | None = None
|
||||
|
||||
@@ -29,7 +28,6 @@ class ErrorChunk(BaseChunk):
|
||||
|
||||
class ToolCallChunk(BaseChunk):
|
||||
tool_calls: list[ToolCallItem]
|
||||
usage: Usage | None
|
||||
finish_reason: Literal["tool_calls"] = "tool_calls"
|
||||
stats: GenerationStats | None = None
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@ from pydantic import Field
|
||||
|
||||
from exo.shared.models.model_cards import ModelCard, ModelId
|
||||
from exo.shared.types.api import (
|
||||
BenchChatCompletionTaskParams,
|
||||
ChatCompletionTaskParams,
|
||||
ImageEditsInternalParams,
|
||||
ImageGenerationTaskParams,
|
||||
@@ -23,7 +22,7 @@ class TestCommand(BaseCommand):
|
||||
|
||||
|
||||
class ChatCompletion(BaseCommand):
|
||||
request_params: ChatCompletionTaskParams | BenchChatCompletionTaskParams
|
||||
request_params: ChatCompletionTaskParams
|
||||
|
||||
|
||||
class ImageGeneration(BaseCommand):
|
||||
@@ -49,6 +48,10 @@ class DeleteInstance(BaseCommand):
|
||||
instance_id: InstanceId
|
||||
|
||||
|
||||
class TaskCancelled(BaseCommand):
|
||||
cancelled_command_id: CommandId
|
||||
|
||||
|
||||
class TaskFinished(BaseCommand):
|
||||
finished_command_id: CommandId
|
||||
|
||||
@@ -85,6 +88,7 @@ Command = (
|
||||
| PlaceInstance
|
||||
| CreateInstance
|
||||
| DeleteInstance
|
||||
| TaskCancelled
|
||||
| TaskFinished
|
||||
| SendInputChunk
|
||||
)
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
"""Shared types for MLX-related functionality."""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from mlx_lm.models.cache import (
|
||||
KVCache,
|
||||
QuantizedKVCache,
|
||||
RotatingKVCache,
|
||||
)
|
||||
|
||||
# This list contains one cache entry per transformer layer
|
||||
KVCacheType = Sequence[KVCache | RotatingKVCache | QuantizedKVCache]
|
||||
@@ -3,7 +3,6 @@ from enum import Enum
|
||||
from pydantic import Field
|
||||
|
||||
from exo.shared.types.api import (
|
||||
BenchChatCompletionTaskParams,
|
||||
ChatCompletionTaskParams,
|
||||
ImageEditsInternalParams,
|
||||
ImageGenerationTaskParams,
|
||||
@@ -25,6 +24,7 @@ class TaskStatus(str, Enum):
|
||||
Complete = "Complete"
|
||||
TimedOut = "TimedOut"
|
||||
Failed = "Failed"
|
||||
Cancelled = "Cancelled"
|
||||
|
||||
|
||||
class BaseTask(TaggedModel):
|
||||
@@ -55,12 +55,16 @@ class StartWarmup(BaseTask): # emitted by Worker
|
||||
|
||||
class ChatCompletion(BaseTask): # emitted by Master
|
||||
command_id: CommandId
|
||||
task_params: ChatCompletionTaskParams | BenchChatCompletionTaskParams
|
||||
task_params: ChatCompletionTaskParams
|
||||
|
||||
error_type: str | None = Field(default=None)
|
||||
error_message: str | None = Field(default=None)
|
||||
|
||||
|
||||
class CancelTask(BaseTask):
|
||||
cancelled_task_id: TaskId
|
||||
|
||||
|
||||
class ImageGeneration(BaseTask): # emitted by Master
|
||||
command_id: CommandId
|
||||
task_params: ImageGenerationTaskParams
|
||||
@@ -88,6 +92,7 @@ Task = (
|
||||
| LoadModel
|
||||
| StartWarmup
|
||||
| ChatCompletion
|
||||
| CancelTask
|
||||
| ImageGeneration
|
||||
| ImageEdits
|
||||
| Shutdown
|
||||
|
||||
@@ -6,7 +6,6 @@ from exo.shared.types.api import (
|
||||
GenerationStats,
|
||||
ImageGenerationStats,
|
||||
ToolCallItem,
|
||||
Usage,
|
||||
)
|
||||
from exo.utils.pydantic_ext import TaggedModel
|
||||
|
||||
@@ -25,7 +24,6 @@ class GenerationResponse(BaseRunnerResponse):
|
||||
# logprobs: list[float] | None = None # too big. we can change to be top-k
|
||||
finish_reason: FinishReason | None = None
|
||||
stats: GenerationStats | None = None
|
||||
usage: Usage | None
|
||||
|
||||
|
||||
class ImageGenerationResponse(BaseRunnerResponse):
|
||||
@@ -59,7 +57,6 @@ class PartialImageResponse(BaseRunnerResponse):
|
||||
|
||||
class ToolCallResponse(BaseRunnerResponse):
|
||||
tool_calls: list[ToolCallItem]
|
||||
usage: Usage | None
|
||||
|
||||
|
||||
class FinishedResponse(BaseRunnerResponse):
|
||||
|
||||
@@ -349,8 +349,13 @@ class InfoGatherer:
|
||||
async def _monitor_misc(self):
|
||||
if self.misc_poll_interval is None:
|
||||
return
|
||||
prev = await MiscData.gather()
|
||||
await self.info_sender.send(prev)
|
||||
while True:
|
||||
await self.info_sender.send(await MiscData.gather())
|
||||
curr = await MiscData.gather()
|
||||
if prev != curr:
|
||||
prev = curr
|
||||
await self.info_sender.send(curr)
|
||||
await anyio.sleep(self.misc_poll_interval)
|
||||
|
||||
async def _monitor_system_profiler_thunderbolt_data(self):
|
||||
@@ -360,12 +365,15 @@ class InfoGatherer:
|
||||
if iface_map is None:
|
||||
return
|
||||
|
||||
old_idents = []
|
||||
while True:
|
||||
data = await ThunderboltConnectivity.gather()
|
||||
assert data is not None
|
||||
|
||||
idents = [it for i in data if (it := i.ident(iface_map)) is not None]
|
||||
await self.info_sender.send(MacThunderboltIdentifiers(idents=idents))
|
||||
if idents != old_idents:
|
||||
await self.info_sender.send(MacThunderboltIdentifiers(idents=idents))
|
||||
old_idents = idents
|
||||
|
||||
conns = [it for i in data if (it := i.conn()) is not None]
|
||||
await self.info_sender.send(MacThunderboltConnections(conns=conns))
|
||||
@@ -390,17 +398,22 @@ class InfoGatherer:
|
||||
async def _watch_system_info(self):
|
||||
if self.interface_watcher_interval is None:
|
||||
return
|
||||
old_nics = []
|
||||
while True:
|
||||
nics = await get_network_interfaces()
|
||||
await self.info_sender.send(NodeNetworkInterfaces(ifaces=nics))
|
||||
if nics != old_nics:
|
||||
old_nics = nics
|
||||
await self.info_sender.send(NodeNetworkInterfaces(ifaces=nics))
|
||||
await anyio.sleep(self.interface_watcher_interval)
|
||||
|
||||
async def _monitor_thunderbolt_bridge_status(self):
|
||||
if self.thunderbolt_bridge_poll_interval is None:
|
||||
return
|
||||
prev: ThunderboltBridgeInfo | None = None
|
||||
while True:
|
||||
curr = await ThunderboltBridgeInfo.gather()
|
||||
if curr is not None:
|
||||
if curr is not None and prev != curr:
|
||||
prev = curr
|
||||
await self.info_sender.send(curr)
|
||||
await anyio.sleep(self.thunderbolt_bridge_poll_interval)
|
||||
|
||||
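
The monitor loops above now share the same shape; a condensed sketch of that poll-and-send-on-change pattern (hypothetical helper, not part of the diff):

import anyio

async def poll_and_send(gather, send, interval: float) -> None:
    prev = await gather()
    await send(prev)          # always publish the initial reading
    while True:
        curr = await gather()
        if curr != prev:      # only publish when something actually changed
            prev = curr
            await send(curr)
        await anyio.sleep(interval)
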
|
||||
@@ -1,4 +1,4 @@
|
||||
from collections.abc import Generator
|
||||
from collections.abc import Callable, Generator
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
@@ -109,6 +109,7 @@ class DistributedImageModel:
|
||||
image_path: Path | None = None,
|
||||
partial_images: int = 0,
|
||||
advanced_params: AdvancedImageParams | None = None,
|
||||
cancel_checker: Callable[[], bool] | None = None,
|
||||
) -> Generator[Image.Image | tuple[Image.Image, int, int], None, None]:
|
||||
if (
|
||||
advanced_params is not None
|
||||
@@ -153,6 +154,7 @@ class DistributedImageModel:
|
||||
guidance_override=guidance_override,
|
||||
negative_prompt=negative_prompt,
|
||||
num_sync_steps=num_sync_steps,
|
||||
cancel_checker=cancel_checker,
|
||||
):
|
||||
if isinstance(result, tuple):
|
||||
# Partial image: (GeneratedImage, partial_index, total_partials)
|
||||
|
||||
@@ -3,6 +3,7 @@ import io
|
||||
import random
|
||||
import tempfile
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Generator, Literal
|
||||
|
||||
@@ -68,12 +69,18 @@ def warmup_image_generator(model: DistributedImageModel) -> Image.Image | None:
|
||||
def generate_image(
|
||||
model: DistributedImageModel,
|
||||
task: ImageGenerationTaskParams | ImageEditsInternalParams,
|
||||
cancel_checker: Callable[[], bool] | None = None,
|
||||
) -> Generator[ImageGenerationResponse | PartialImageResponse, None, None]:
|
||||
"""Generate image(s), optionally yielding partial results.
|
||||
|
||||
When partial_images > 0 or stream=True, yields PartialImageResponse for
|
||||
intermediate images, then ImageGenerationResponse for the final image.
|
||||
|
||||
Args:
|
||||
model: The distributed image model to use for generation.
|
||||
task: The task parameters for image generation or editing.
|
||||
cancel_checker: Optional callback to check if generation should be cancelled.
|
||||
|
||||
Yields:
|
||||
PartialImageResponse for intermediate images (if partial_images > 0, first image only)
|
||||
ImageGenerationResponse for final complete images
|
||||
@@ -98,8 +105,8 @@ def generate_image(
|
||||
|
||||
partial_images = (
|
||||
task.partial_images
|
||||
if task.partial_images is not None and task.stream is not None and task.stream
|
||||
else 0
|
||||
if task.partial_images is not None
|
||||
else (3 if task.stream else 0)
|
||||
)
|
||||
|
||||
image_path: Path | None = None
|
||||
@@ -123,6 +130,7 @@ def generate_image(
|
||||
image_path=image_path,
|
||||
partial_images=partial_images,
|
||||
advanced_params=advanced_params,
|
||||
cancel_checker=cancel_checker,
|
||||
):
|
||||
if isinstance(result, tuple):
|
||||
# Partial image: (Image, partial_index, total_partials)
|
||||
|
||||
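
The partial_images expression above reduces to the following rule (a sketch of the assumed semantics, not part of the diff): an explicit value wins; otherwise streaming requests get three intermediate images and non-streaming requests get none.

def resolve_partial_images(partial_images: int | None, stream: bool) -> int:
    if partial_images is not None:
        return partial_images
    return 3 if stream else 0

assert resolve_partial_images(None, stream=True) == 3
assert resolve_partial_images(None, stream=False) == 0
assert resolve_partial_images(1, stream=False) == 1
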
@@ -1,3 +1,4 @@
|
||||
from collections.abc import Callable
|
||||
from math import ceil
|
||||
from typing import Any, Optional
|
||||
|
||||
@@ -94,6 +95,8 @@ class DiffusionRunner:
|
||||
self.total_layers = config.total_blocks
|
||||
|
||||
self._guidance_override: float | None = None
|
||||
self._cancel_checker: Callable[[], bool] | None = None
|
||||
self._cancelling = False
|
||||
|
||||
self._compute_assigned_blocks()
|
||||
|
||||
@@ -148,6 +151,54 @@ class DiffusionRunner:
|
||||
return self._guidance_override
|
||||
return self.config.guidance_scale
|
||||
|
||||
    def _check_cancellation(self) -> bool:
        if self._cancelling:
            return True
        if (
            self.is_first_stage
            and self._cancel_checker is not None
            and self._cancel_checker()
        ):
            self._cancelling = True
        return self._cancelling

    def _is_sentinel(self, tensor: mx.array) -> bool:
        return bool(mx.all(mx.isnan(tensor)).item())

    def _make_sentinel_like(self, tensor: mx.array) -> mx.array:
        return mx.full(tensor.shape, float("nan"), dtype=tensor.dtype)

    def _recv(
        self,
        shape: tuple[int, ...],
        dtype: mx.Dtype,
        src: int,
    ) -> mx.array:
        """Receive data and check for cancellation sentinel."""
        data = mx.distributed.recv(shape, dtype, src, group=self.group)
        mx.eval(data)
        if self._is_sentinel(data):
            self._cancelling = True
        return data

    def _recv_like(self, template: mx.array, src: int) -> mx.array:
        """Receive data matching template and check for cancellation sentinel."""
        data = mx.distributed.recv_like(template, src=src, group=self.group)
        mx.eval(data)
        if self._is_sentinel(data):
            self._cancelling = True
        return data

    def _send(self, data: mx.array, dst: int) -> mx.array:
        """Send data, or sentinel if cancelling."""
        if self._cancelling:
            data = self._make_sentinel_like(data)

        result = mx.distributed.send(data, dst, group=self.group)
        mx.async_eval(result)
        return result
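
The cancellation signal above rides on the data path itself: a rank that wants to cancel sends an all-NaN tensor in place of its payload, and downstream ranks detect it with the same shape and dtype check. An illustrative, non-distributed sketch (assuming mlx.core as mx):

import mlx.core as mx

payload = mx.zeros((2, 4), dtype=mx.bfloat16)
sentinel = mx.full(payload.shape, float("nan"), dtype=payload.dtype)

assert bool(mx.all(mx.isnan(sentinel)).item())      # recognised as a cancel sentinel
assert not bool(mx.all(mx.isnan(payload)).item())   # ordinary activations pass through
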
def _ensure_wrappers(
|
||||
self,
|
||||
text_seq_len: int,
|
||||
@@ -244,6 +295,7 @@ class DiffusionRunner:
|
||||
guidance_override: float | None = None,
|
||||
negative_prompt: str | None = None,
|
||||
num_sync_steps: int = 1,
|
||||
cancel_checker: Callable[[], bool] | None = None,
|
||||
):
|
||||
"""Primary entry point for image generation.
|
||||
|
||||
@@ -255,17 +307,21 @@ class DiffusionRunner:
|
||||
5. Decode to image
|
||||
|
||||
Args:
|
||||
settings: Generation config (steps, height, width)
|
||||
runtime_config: Runtime configuration (steps, height, width)
|
||||
prompt: Text prompt
|
||||
seed: Random seed
|
||||
partial_images: Number of intermediate images to yield (0 for none)
|
||||
guidance_override: Optional override for guidance scale (CFG)
|
||||
negative_prompt: Optional negative prompt for CFG
|
||||
num_sync_steps: Number of synchronous pipeline steps
|
||||
cancel_checker: Optional callback to check for cancellation
|
||||
|
||||
Yields:
|
||||
Partial images as (GeneratedImage, partial_index, total_partials) tuples
|
||||
Final GeneratedImage
|
||||
"""
|
||||
self._guidance_override = guidance_override
|
||||
self._cancel_checker = cancel_checker
|
||||
latents = self.adapter.create_latents(seed, runtime_config)
|
||||
prompt_data = self.adapter.encode_prompt(prompt, negative_prompt)
|
||||
|
||||
@@ -307,7 +363,7 @@ class DiffusionRunner:
|
||||
except StopIteration as e:
|
||||
latents = e.value # pyright: ignore[reportAny]
|
||||
|
||||
if self.is_last_stage:
|
||||
if self.is_last_stage and not self._cancelling:
|
||||
yield self.adapter.decode_latents(latents, runtime_config, seed, prompt) # pyright: ignore[reportAny]
|
||||
|
||||
def _run_diffusion_loop(
|
||||
@@ -323,6 +379,7 @@ class DiffusionRunner:
|
||||
if capture_steps is None:
|
||||
capture_steps = set()
|
||||
|
||||
self._cancelling = False
|
||||
self._reset_all_caches()
|
||||
|
||||
time_steps = tqdm(range(runtime_config.num_inference_steps))
|
||||
@@ -345,6 +402,9 @@ class DiffusionRunner:
|
||||
num_sync_steps=num_sync_steps,
|
||||
)
|
||||
|
||||
if self._cancelling:
|
||||
break
|
||||
|
||||
ctx.in_loop( # pyright: ignore[reportAny]
|
||||
t=t,
|
||||
latents=latents,
|
||||
@@ -357,7 +417,7 @@ class DiffusionRunner:
|
||||
yield (latents, t)
|
||||
|
||||
except KeyboardInterrupt: # noqa: PERF203
|
||||
ctx.interruption(t=t, latents=latents) # pyright: ignore[reportAny]
|
||||
ctx.interruption(t=t, latents=latents, time_steps=time_steps) # pyright: ignore[reportAny]
|
||||
raise StopImageGenerationException(
|
||||
f"Stopping image generation at step {t + 1}/{len(time_steps)}"
|
||||
) from None
|
||||
@@ -567,6 +627,8 @@ class DiffusionRunner:
|
||||
for wrapper in self.joint_block_wrappers:
|
||||
wrapper.set_encoder_mask(encoder_hidden_states_mask)
|
||||
|
||||
self._check_cancellation()
|
||||
|
||||
encoder_hidden_states: mx.array | None = None
|
||||
if self.is_first_stage:
|
||||
hidden_states, encoder_hidden_states = self.adapter.compute_embeddings(
|
||||
@@ -586,19 +648,12 @@ class DiffusionRunner:
|
||||
|
||||
if self.has_joint_blocks:
|
||||
if not self.is_first_stage:
|
||||
hidden_states = mx.distributed.recv(
|
||||
(batch_size, num_img_tokens, hidden_dim),
|
||||
dtype,
|
||||
self.prev_rank,
|
||||
group=self.group,
|
||||
hidden_states = self._recv(
|
||||
(batch_size, num_img_tokens, hidden_dim), dtype, self.prev_rank
|
||||
)
|
||||
encoder_hidden_states = mx.distributed.recv(
|
||||
(batch_size, text_seq_len, hidden_dim),
|
||||
dtype,
|
||||
self.prev_rank,
|
||||
group=self.group,
|
||||
encoder_hidden_states = self._recv(
|
||||
(batch_size, text_seq_len, hidden_dim), dtype, self.prev_rank
|
||||
)
|
||||
mx.eval(hidden_states, encoder_hidden_states)
|
||||
|
||||
assert self.joint_block_wrappers is not None
|
||||
assert encoder_hidden_states is not None
|
||||
@@ -620,30 +675,20 @@ class DiffusionRunner:
|
||||
if self.has_single_blocks or self.is_last_stage:
|
||||
hidden_states = concatenated
|
||||
else:
|
||||
concatenated = mx.distributed.send(
|
||||
concatenated, self.next_rank, group=self.group
|
||||
)
|
||||
mx.async_eval(concatenated)
|
||||
concatenated = self._send(concatenated, self.next_rank)
|
||||
|
||||
elif self.has_joint_blocks and not self.is_last_stage:
|
||||
assert encoder_hidden_states is not None
|
||||
hidden_states = mx.distributed.send(
|
||||
hidden_states, self.next_rank, group=self.group
|
||||
)
|
||||
encoder_hidden_states = mx.distributed.send(
|
||||
encoder_hidden_states, self.next_rank, group=self.group
|
||||
)
|
||||
mx.async_eval(hidden_states, encoder_hidden_states)
|
||||
hidden_states = self._send(hidden_states, self.next_rank)
|
||||
encoder_hidden_states = self._send(encoder_hidden_states, self.next_rank)
|
||||
|
||||
if self.has_single_blocks:
|
||||
if not self.owns_concat_stage and not self.is_first_stage:
|
||||
hidden_states = mx.distributed.recv(
|
||||
hidden_states = self._recv(
|
||||
(batch_size, text_seq_len + num_img_tokens, hidden_dim),
|
||||
dtype,
|
||||
self.prev_rank,
|
||||
group=self.group,
|
||||
)
|
||||
mx.eval(hidden_states)
|
||||
|
||||
assert self.single_block_wrappers is not None
|
||||
for wrapper in self.single_block_wrappers:
|
||||
@@ -655,10 +700,7 @@ class DiffusionRunner:
|
||||
)
|
||||
|
||||
if not self.is_last_stage:
|
||||
hidden_states = mx.distributed.send(
|
||||
hidden_states, self.next_rank, group=self.group
|
||||
)
|
||||
mx.async_eval(hidden_states)
|
||||
hidden_states = self._send(hidden_states, self.next_rank)
|
||||
|
||||
hidden_states = hidden_states[:, text_seq_len:, ...]
|
||||
|
||||
@@ -742,14 +784,13 @@ class DiffusionRunner:
|
||||
)
|
||||
|
||||
if not self.is_first_stage:
|
||||
hidden_states = mx.distributed.send(hidden_states, 0, group=self.group)
|
||||
mx.async_eval(hidden_states)
|
||||
hidden_states = self._send(hidden_states, 0)
|
||||
|
||||
elif self.is_first_stage:
|
||||
hidden_states = mx.distributed.recv_like(
|
||||
prev_latents, src=self.world_size - 1, group=self.group
|
||||
)
|
||||
mx.eval(hidden_states)
|
||||
hidden_states = self._recv_like(prev_latents, src=self.world_size - 1)
|
||||
|
||||
if self._cancelling:
|
||||
return prev_latents
|
||||
|
||||
else:
|
||||
hidden_states = prev_latents
|
||||
@@ -809,10 +850,9 @@ class DiffusionRunner:
|
||||
and not self.is_last_stage
|
||||
and not is_first_async_step
|
||||
):
|
||||
patch = mx.distributed.recv_like(
|
||||
patch, src=self.prev_rank, group=self.group
|
||||
)
|
||||
mx.eval(patch)
|
||||
patch = self._recv_like(patch, src=self.prev_rank)
|
||||
|
||||
self._check_cancellation()
|
||||
|
||||
step_patch = mx.concatenate([patch, patch], axis=0) if needs_cfg else patch
|
||||
|
||||
@@ -843,10 +883,19 @@ class DiffusionRunner:
|
||||
)
|
||||
|
||||
if not self.is_first_stage and t != config.num_inference_steps - 1:
|
||||
patch_latents[patch_idx] = mx.distributed.send(
|
||||
patch_latents[patch_idx], self.next_rank, group=self.group
|
||||
patch_latents[patch_idx] = self._send(
|
||||
patch_latents[patch_idx], self.next_rank
|
||||
)
|
||||
mx.async_eval(patch_latents[patch_idx])
|
||||
|
||||
# Drain final rank patch sends if cancelling
|
||||
if (
|
||||
self._cancelling
|
||||
and self.is_first_stage
|
||||
and not self.is_last_stage
|
||||
and t != config.num_inference_steps - 1
|
||||
):
|
||||
for patch_idx in range(len(patch_latents)):
|
||||
_ = self._recv_like(patch_latents[patch_idx], src=self.prev_rank)
|
||||
|
||||
return mx.concatenate(patch_latents, axis=1)
|
||||
|
||||
@@ -885,22 +934,16 @@ class DiffusionRunner:
|
||||
if self.has_joint_blocks:
|
||||
if not self.is_first_stage:
|
||||
patch_len = patch.shape[1]
|
||||
patch = mx.distributed.recv(
|
||||
(batch_size, patch_len, hidden_dim),
|
||||
patch.dtype,
|
||||
self.prev_rank,
|
||||
group=self.group,
|
||||
patch = self._recv(
|
||||
(batch_size, patch_len, hidden_dim), patch.dtype, self.prev_rank
|
||||
)
|
||||
mx.eval(patch)
|
||||
|
||||
if patch_idx == 0:
|
||||
encoder_hidden_states = mx.distributed.recv(
|
||||
encoder_hidden_states = self._recv(
|
||||
(batch_size, text_seq_len, hidden_dim),
|
||||
patch.dtype,
|
||||
self.prev_rank,
|
||||
group=self.group,
|
||||
)
|
||||
mx.eval(encoder_hidden_states)
|
||||
|
||||
if self.is_first_stage:
|
||||
patch, encoder_hidden_states = self.adapter.compute_embeddings(
|
||||
@@ -925,32 +968,25 @@ class DiffusionRunner:
|
||||
if self.has_single_blocks or self.is_last_stage:
|
||||
patch = patch_concat
|
||||
else:
|
||||
patch_concat = mx.distributed.send(
|
||||
patch_concat, self.next_rank, group=self.group
|
||||
)
|
||||
mx.async_eval(patch_concat)
|
||||
patch_concat = self._send(patch_concat, self.next_rank)
|
||||
|
||||
elif self.has_joint_blocks and not self.is_last_stage:
|
||||
patch = mx.distributed.send(patch, self.next_rank, group=self.group)
|
||||
mx.async_eval(patch)
|
||||
patch = self._send(patch, self.next_rank)
|
||||
|
||||
if patch_idx == 0:
|
||||
assert encoder_hidden_states is not None
|
||||
encoder_hidden_states = mx.distributed.send(
|
||||
encoder_hidden_states, self.next_rank, group=self.group
|
||||
encoder_hidden_states = self._send(
|
||||
encoder_hidden_states, self.next_rank
|
||||
)
|
||||
mx.async_eval(encoder_hidden_states)
|
||||
|
||||
if self.has_single_blocks:
|
||||
if not self.owns_concat_stage and not self.is_first_stage:
|
||||
patch_len = patch.shape[1]
|
||||
patch = mx.distributed.recv(
|
||||
patch = self._recv(
|
||||
(batch_size, text_seq_len + patch_len, hidden_dim),
|
||||
patch.dtype,
|
||||
self.prev_rank,
|
||||
group=self.group,
|
||||
)
|
||||
mx.eval(patch)
|
||||
|
||||
assert self.single_block_wrappers is not None
|
||||
for wrapper in self.single_block_wrappers:
|
||||
@@ -962,8 +998,7 @@ class DiffusionRunner:
|
||||
)
|
||||
|
||||
if not self.is_last_stage:
|
||||
patch = mx.distributed.send(patch, self.next_rank, group=self.group)
|
||||
mx.async_eval(patch)
|
||||
patch = self._send(patch, self.next_rank)
|
||||
|
||||
noise: mx.array | None = None
|
||||
if self.is_last_stage:
|
||||
|
||||
@@ -19,11 +19,8 @@ from mlx_lm.models.deepseek_v32 import DeepseekV32MLP
|
||||
from mlx_lm.models.deepseek_v32 import Model as DeepseekV32Model
|
||||
from mlx_lm.models.glm4_moe import Model as Glm4MoeModel
|
||||
from mlx_lm.models.glm4_moe import MoE
|
||||
from mlx_lm.models.glm4_moe_lite import Glm4MoeLiteDecoderLayer, Glm4MoeLiteMLP
|
||||
from mlx_lm.models.glm4_moe_lite import Model as GLM4MoeLiteModel
|
||||
from mlx_lm.models.gpt_oss import GptOssMoeModel
|
||||
from mlx_lm.models.gpt_oss import Model as GptOssModel
|
||||
from mlx_lm.models.kimi_k25 import Model as KimiK25Model
|
||||
from mlx_lm.models.llama import Model as LlamaModel
|
||||
from mlx_lm.models.minimax import Model as MiniMaxModel
|
||||
from mlx_lm.models.ministral3 import Model as Ministral3Model
|
||||
@@ -148,10 +145,6 @@ class PipelineLastLayer(CustomMlxLayer):
|
||||
if cache is not None:
|
||||
cache.keys = mx.depends(cache.keys, output) # type: ignore[reportUnknownMemberType]
|
||||
|
||||
output = mx.distributed.all_gather(output, group=self.group)[
|
||||
-output.shape[0] :
|
||||
] # type :ignore
|
||||
|
||||
return output
|
||||
|
||||
|
||||
@@ -201,9 +194,6 @@ def pipeline_auto_parallel(
|
||||
device_rank, world_size = model_shard_meta.device_rank, model_shard_meta.world_size
|
||||
|
||||
layers = layers[start_layer:end_layer]
|
||||
for layer in layers:
|
||||
mx.eval(layer) # type: ignore
|
||||
|
||||
layers[0] = PipelineFirstLayer(layers[0], device_rank, group=group)
|
||||
layers[-1] = PipelineLastLayer(
|
||||
layers[-1],
|
||||
@@ -262,6 +252,10 @@ def patch_pipeline_model[T](model: T, group: mx.distributed.Group) -> T:
|
||||
if cache is not None:
|
||||
cache[-1].state = mx.depends(cache[-1].state, logits) # type: ignore
|
||||
|
||||
logits = mx.distributed.all_gather(logits, group=group)[
|
||||
-logits.shape[0] :
|
||||
] # type :ignore
|
||||
|
||||
return logits
|
||||
|
||||
cls.__call__ = patched_call
|
||||
@@ -340,7 +334,15 @@ def tensor_auto_parallel(
|
||||
group=group,
|
||||
)
|
||||
|
||||
if hasattr(model, "shard") and not isinstance(model, GptOssModel):
|
||||
try:
|
||||
model.shard(group) # type: ignore
|
||||
return patch_tensor_model(model)
|
||||
except (AttributeError, TypeError, NameError):
|
||||
pass
|
||||
|
||||
if isinstance(model, (LlamaModel, Ministral3Model)):
|
||||
logger.warning("shouldn't be hit - upstream sharding exists")
|
||||
tensor_parallel_sharding_strategy = LlamaShardingStrategy(
|
||||
group,
|
||||
all_to_sharded_linear,
|
||||
@@ -348,7 +350,8 @@ def tensor_auto_parallel(
|
||||
all_to_sharded_linear_in_place,
|
||||
sharded_to_all_linear_in_place,
|
||||
)
|
||||
elif isinstance(model, (DeepseekV3Model, DeepseekV32Model, KimiK25Model)):
|
||||
elif isinstance(model, (DeepseekV3Model, DeepseekV32Model)):
|
||||
logger.warning("shouldn't be hit - upstream sharding exists")
|
||||
tensor_parallel_sharding_strategy = DeepSeekShardingStrategy(
|
||||
group,
|
||||
all_to_sharded_linear,
|
||||
@@ -364,14 +367,6 @@ def tensor_auto_parallel(
|
||||
all_to_sharded_linear_in_place,
|
||||
sharded_to_all_linear_in_place,
|
||||
)
|
||||
elif isinstance(model, GLM4MoeLiteModel):
|
||||
tensor_parallel_sharding_strategy = GLM4MoeLiteShardingStrategy(
|
||||
group,
|
||||
all_to_sharded_linear,
|
||||
sharded_to_all_linear,
|
||||
all_to_sharded_linear_in_place,
|
||||
sharded_to_all_linear_in_place,
|
||||
)
|
||||
elif isinstance(model, (Qwen3MoeModel, Glm4MoeModel, Qwen3NextModel)):
|
||||
tensor_parallel_sharding_strategy = QwenShardingStrategy(
|
||||
group,
|
||||
@@ -446,7 +441,7 @@ class LlamaShardingStrategy(TensorParallelShardingStrategy):
|
||||
layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
|
||||
layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
|
||||
layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
|
||||
mx.eval(layer)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
@@ -457,7 +452,7 @@ def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
|
||||
|
||||
# Update DeepSeek V3 specific parameters when layers are shrunk
|
||||
if isinstance(
|
||||
model, (DeepseekV3Model, DeepseekV32Model, Glm4MoeModel, KimiK25Model)
|
||||
model, (DeepseekV3Model, DeepseekV32Model, Glm4MoeModel)
|
||||
) and hasattr(inner_model_instance, "num_layers"):
|
||||
logger.info(
|
||||
f"Setting num_layers to {len(layers)} for model {model.model.__class__.__name__}"
|
||||
@@ -521,8 +516,6 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
|
||||
layer.mlp = ShardedDeepseekV3MoE(layer.mlp) # type: ignore
|
||||
layer.mlp.sharding_group = self.group
|
||||
|
||||
mx.eval(layer)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
@@ -540,84 +533,6 @@ class ShardedDeepseekV3MoE(CustomMlxLayer):
|
||||
return y
|
||||
|
||||
|
||||
class GLM4MoeLiteShardingStrategy(TensorParallelShardingStrategy):
|
||||
def shard_model(
|
||||
self,
|
||||
model: nn.Module,
|
||||
timeout_seconds: float,
|
||||
on_timeout: TimeoutCallback | None,
|
||||
) -> nn.Module:
|
||||
model = cast(GLM4MoeLiteModel, model)
|
||||
for layer in model.layers: # type: ignore
|
||||
layer = cast(Glm4MoeLiteDecoderLayer, layer)
|
||||
eval_with_timeout(
|
||||
layer.parameters(),
|
||||
timeout_seconds / len(model.layers), # type: ignore
|
||||
on_timeout,
|
||||
)
|
||||
if layer.self_attn.q_lora_rank is None: # type: ignore
|
||||
layer.self_attn.q_proj = self.all_to_sharded_linear(
|
||||
layer.self_attn.q_proj
|
||||
)
|
||||
else:
|
||||
layer.self_attn.q_b_proj = self.all_to_sharded_linear(
|
||||
layer.self_attn.q_b_proj
|
||||
)
|
||||
|
||||
layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
|
||||
layer.self_attn.num_heads //= self.N
|
||||
|
||||
# Logic from upstream mlx
|
||||
num_heads = layer.self_attn.num_heads
|
||||
sh = self.group.rank() * num_heads
|
||||
eh = sh + num_heads
|
||||
|
||||
def shard_heads(w: mx.array, sh: int = sh, eh: int = eh) -> mx.array:
|
||||
return w[sh:eh]
|
||||
|
||||
layer.self_attn.embed_q.apply(shard_heads)
|
||||
layer.self_attn.unembed_out.apply(shard_heads)
|
||||
|
||||
if isinstance(layer.mlp, Glm4MoeLiteMLP):
|
||||
layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
|
||||
layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
|
||||
layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
|
||||
|
||||
else:
|
||||
if getattr(layer.mlp, "shared_experts", None) is not None:
|
||||
self.all_to_sharded_linear_in_place(
|
||||
layer.mlp.shared_experts.gate_proj
|
||||
)
|
||||
self.sharded_to_all_linear_in_place(
|
||||
layer.mlp.shared_experts.down_proj
|
||||
)
|
||||
self.all_to_sharded_linear_in_place(
|
||||
layer.mlp.shared_experts.up_proj
|
||||
)
|
||||
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
|
||||
self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
|
||||
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
|
||||
layer.mlp = ShardedGLM4MoeLiteMoE(layer.mlp) # type: ignore
|
||||
layer.mlp.sharding_group = self.group # type: ignore
|
||||
mx.eval(layer)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
class ShardedGLM4MoeLiteMoE(CustomMlxLayer):
|
||||
def __init__(self, layer: _LayerCallable):
|
||||
super().__init__(layer)
|
||||
self.sharding_group: mx.distributed.Group | None = None
|
||||
|
||||
def __call__(self, x: mx.array) -> mx.array:
|
||||
if self.sharding_group is not None:
|
||||
x = sum_gradients(self.sharding_group)(x)
|
||||
y = self.original_layer.__call__(x)
|
||||
if self.sharding_group is not None:
|
||||
y = mx.distributed.all_sum(y, group=self.sharding_group)
|
||||
return y
|
||||
|
||||
|
||||
class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
|
||||
def shard_model(
|
||||
self,
|
||||
@@ -626,7 +541,6 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
|
||||
on_timeout: TimeoutCallback | None,
|
||||
) -> nn.Module:
|
||||
model = cast(MiniMaxModel, model)
|
||||
rank = self.group.rank()
|
||||
for layer in model.layers:
|
||||
eval_with_timeout(
|
||||
layer.parameters(), timeout_seconds / len(model.layers), on_timeout
|
||||
@@ -636,16 +550,6 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
|
||||
layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
|
||||
layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
|
||||
layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
|
||||
|
||||
# Shard qk_norm weights if present (must match sharded head count)
|
||||
if getattr(layer.self_attn, "use_qk_norm", False):
|
||||
layer.self_attn.q_norm.weight = layer.self_attn.q_norm.weight.split( # type: ignore
|
||||
self.N, axis=-1
|
||||
)[rank]
|
||||
layer.self_attn.k_norm.weight = layer.self_attn.k_norm.weight.split( # type: ignore
|
||||
self.N, axis=-1
|
||||
)[rank]
|
||||
|
||||
layer.self_attn.num_attention_heads //= self.N
|
||||
layer.self_attn.num_key_value_heads //= self.N
|
||||
|
||||
@@ -662,7 +566,7 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
|
||||
)
|
||||
layer.block_sparse_moe = ShardedQwenMoE(layer.block_sparse_moe) # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
|
||||
layer.block_sparse_moe.sharding_group = self.group # pyright: ignore[reportAttributeAccessIssue]
|
||||
mx.eval(layer)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
@@ -703,7 +607,6 @@ class QwenShardingStrategy(TensorParallelShardingStrategy):
|
||||
layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
|
||||
layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
|
||||
|
||||
mx.eval(layer)
|
||||
return model
|
||||
|
||||
|
||||
@@ -758,7 +661,7 @@ class GptOssShardingStrategy(TensorParallelShardingStrategy):
|
||||
|
||||
layer.mlp = ShardedGptOssMoE(layer.mlp) # type: ignore
|
||||
layer.mlp.sharding_group = self.group # pyright: ignore[reportAttributeAccessIssue]
|
||||
mx.eval(layer)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
|
||||
@@ -1,234 +1,104 @@
|
||||
import os
|
||||
# type: ignore
|
||||
# TODO: Fix this file, including types!
|
||||
from copy import deepcopy
|
||||
from typing import Any, cast
|
||||
from typing import Callable
|
||||
|
||||
import mlx.core as mx
|
||||
import psutil
|
||||
from mlx_lm.models.cache import (
|
||||
KVCache,
|
||||
QuantizedKVCache,
|
||||
RotatingKVCache,
|
||||
trim_prompt_cache,
|
||||
)
|
||||
from mlx_lm.models.gpt_oss import Model as GptOssModel
|
||||
from mlx_lm import stream_generate
|
||||
from mlx_lm.models.cache import _BaseCache, trim_prompt_cache
|
||||
from mlx_lm.tokenizer_utils import TokenizerWrapper
|
||||
|
||||
from exo.shared.types.memory import Memory
|
||||
from exo.shared.types.mlx import KVCacheType
|
||||
from exo.worker.engines.mlx import Model
|
||||
from exo.worker.engines.mlx.constants import CACHE_GROUP_SIZE, KV_CACHE_BITS
|
||||
from exo.worker.runner.bootstrap import logger
|
||||
|
||||
# Fraction of device memory above which LRU eviction kicks in
|
||||
_DEFAULT_MEMORY_THRESHOLD = 0.9
|
||||
_MEMORY_THRESHOLD = float(
|
||||
os.environ.get("EXO_MEMORY_THRESHOLD", _DEFAULT_MEMORY_THRESHOLD)
|
||||
)
|
||||
from exo.worker.engines.mlx.constants import KEEP_KV_SIZE, KV_BITS, KV_GROUP_SIZE
|
||||
from exo.worker.engines.mlx.utils_mlx import make_kv_cache
|
||||
|
||||
|
||||
class KVPrefixCache:
|
||||
def __init__(
|
||||
self, tokenizer: TokenizerWrapper, group: mx.distributed.Group | None = None
|
||||
):
|
||||
def __init__(self):
|
||||
# Only one prefix cache per runner.
|
||||
self.prompts: list[mx.array] = [] # mx array of tokens (ints)
|
||||
self.caches: list[KVCacheType] = []
|
||||
self._last_used: list[int] = [] # monotonic counter of last access per entry
|
||||
self._access_counter: int = 0
|
||||
self._tokenizer: TokenizerWrapper = tokenizer
|
||||
self._group = group
|
||||
self.caches: list[list[_BaseCache]] = []
|
||||
|
||||
def clear(self):
|
||||
"""Clear all cached prompts and caches."""
|
||||
self.prompts.clear()
|
||||
self.caches.clear()
|
||||
self._last_used.clear()
|
||||
|
||||
def add_kv_cache(self, prompt: str, cache: KVCacheType):
|
||||
"""Add a new cache entry. Evicts LRU entries if memory is high."""
|
||||
self._evict_if_needed()
|
||||
tokenized_prompt = encode_prompt(self._tokenizer, prompt)
|
||||
def add_kv_cache(
|
||||
self, tokenizer: TokenizerWrapper, prompt: str, cache: list[_BaseCache]
|
||||
):
|
||||
tokenized_prompt = self.encode_prompt(tokenizer, prompt)
|
||||
self.prompts.append(tokenized_prompt)
|
||||
self.caches.append(deepcopy(cache))
|
||||
self._access_counter += 1
|
||||
self._last_used.append(self._access_counter)
|
||||
logger.info(f"KV cache added: {len(tokenized_prompt)} tokens")
|
||||
|
||||
def update_kv_cache(
|
||||
self,
|
||||
index: int,
|
||||
prompt: str,
|
||||
cache: KVCacheType,
|
||||
):
|
||||
"""Update an existing cache entry in-place."""
|
||||
tokenized_prompt = encode_prompt(self._tokenizer, prompt)
|
||||
self.prompts[index] = tokenized_prompt
|
||||
self.caches[index] = deepcopy(cache)
|
||||
self._access_counter += 1
|
||||
self._last_used[index] = self._access_counter
|
||||
logger.info(f"KV cache updated (index {index}): {len(tokenized_prompt)} tokens")
|
||||
|
||||
def get_kv_cache(
|
||||
self,
|
||||
model: Model,
|
||||
tokenizer: TokenizerWrapper,
|
||||
sampler: Callable[[mx.array], mx.array],
|
||||
prompt: str,
|
||||
) -> tuple[KVCacheType, mx.array, int | None]:
|
||||
"""Get KV cache for prompt, returning remaining tokens to prefill.
|
||||
|
||||
Returns:
|
||||
Tuple of (cache, remaining_tokens, matched_index) where:
|
||||
- cache: KV cache to use for generation
|
||||
- remaining_tokens: tokens that still need prefilling
|
||||
- matched_index: index of the matched entry (None if no match)
|
||||
"""
|
||||
tokenized_prompt = encode_prompt(self._tokenizer, prompt)
|
||||
) -> list[_BaseCache]:
|
||||
tokenized_prompt = self.encode_prompt(tokenizer, prompt)
|
||||
max_length = len(tokenized_prompt)
|
||||
|
||||
best_snapshot_index, best_snapshot_length = None, 0
|
||||
|
||||
for i, cached_prompt in enumerate(self.prompts):
|
||||
length = get_prefix_length(tokenized_prompt, cached_prompt)
|
||||
length = _get_prefix_length(tokenized_prompt, cached_prompt)
|
||||
|
||||
if length == max_length:
|
||||
# Exact match - cached prompt starts with our entire prompt
|
||||
# Trim cache to prompt length - 1, return last token for stream_generate
|
||||
prompt_cache = deepcopy(self.caches[i])
|
||||
cached_length = cache_length(self.caches[i])
|
||||
tokens_to_trim = cached_length - (max_length - 1)
|
||||
if tokens_to_trim > 0:
|
||||
trim_prompt_cache(cast(list[Any], prompt_cache), tokens_to_trim)
|
||||
self._access_counter += 1
|
||||
self._last_used[i] = self._access_counter
|
||||
logger.info(f"KV cache exact match: {max_length} tokens (instant)")
|
||||
return prompt_cache, tokenized_prompt[-1:], i
|
||||
return self.caches[i]
|
||||
|
||||
if length > best_snapshot_length:
|
||||
best_snapshot_index, best_snapshot_length = i, length
|
||||
|
||||
if best_snapshot_index is not None:
|
||||
new_tokens = max_length - best_snapshot_length
|
||||
logger.info(
|
||||
f"KV cache prefix match: {best_snapshot_length}/{max_length} tokens "
|
||||
f"(reusing {best_snapshot_length}, need to prefill {new_tokens})"
|
||||
)
|
||||
|
||||
prompt_cache = deepcopy(self.caches[best_snapshot_index])
|
||||
|
||||
# Trim removes tokens from the end, so we trim (cached_length - prefix_length) to keep the prefix
|
||||
cached_length = cache_length(self.caches[best_snapshot_index])
|
||||
tokens_to_trim = cached_length - best_snapshot_length
|
||||
if tokens_to_trim > 0:
|
||||
trim_prompt_cache(cast(list[Any], prompt_cache), tokens_to_trim)
|
||||
|
||||
self._access_counter += 1
|
||||
self._last_used[best_snapshot_index] = self._access_counter
|
||||
remaining_tokens = tokenized_prompt[best_snapshot_length:]
|
||||
return prompt_cache, remaining_tokens, best_snapshot_index
|
||||
trim_prompt_cache(prompt_cache, max_length - best_snapshot_length)
|
||||
tokenized_prompt = tokenized_prompt[best_snapshot_index:]
|
||||
|
||||
else:
|
||||
prompt_cache = make_kv_cache(model)
|
||||
if len(self.prompts) == 0:
|
||||
logger.info(f"KV cache empty, need to prefill {max_length} tokens")
|
||||
else:
|
||||
logger.info(
|
||||
f"KV cache no prefix match, need to prefill {max_length} tokens"
|
||||
)
|
||||
|
||||
return prompt_cache, tokenized_prompt, None
|
||||
|
||||
def _evict_if_needed(self):
|
||||
"""Evict least recently used entries while memory usage is high."""
|
||||
if len(self.caches) == 0:
|
||||
return
|
||||
|
||||
# Evict LRU entries until below threshold or only one entry left
|
||||
while (
|
||||
len(self.caches) > 1
|
||||
and self.get_memory_used_percentage() > _MEMORY_THRESHOLD
|
||||
):
|
||||
lru_index = self._last_used.index(min(self._last_used))
|
||||
evicted_tokens = len(self.prompts[lru_index])
|
||||
self.prompts.pop(lru_index)
|
||||
self.caches.pop(lru_index)
|
||||
self._last_used.pop(lru_index)
|
||||
logger.info(
|
||||
f"KV cache evicted LRU entry ({evicted_tokens} tokens) due to memory usage"
|
||||
prompt_cache = make_kv_cache(
|
||||
model,
|
||||
# max_kv_size=MAX_KV_SIZE,
|
||||
# keep=KEEP_KV_SIZE
|
||||
)
|
||||
|
||||
def get_memory_used_percentage(self) -> float:
|
||||
local_pressure: float = get_memory_used_percentage()
|
||||
prefill(model, tokenizer, sampler, tokenized_prompt, prompt_cache)
|
||||
|
||||
if self._group is None:
|
||||
return local_pressure
|
||||
return prompt_cache
|
||||
|
||||
all_pressure = mx.distributed.all_gather(
|
||||
mx.array([local_pressure], dtype=mx.float32),
|
||||
group=self._group,
|
||||
def encode_prompt(self, tokenizer: TokenizerWrapper, prompt: str) -> mx.array:
|
||||
add_special_tokens = tokenizer.bos_token is None or not prompt.startswith(
|
||||
tokenizer.bos_token
|
||||
)
|
||||
# .item() evals.
|
||||
max_pressure = float(mx.max(all_pressure).item())
|
||||
return max_pressure
|
||||
tokenized_prompt = tokenizer.encode(
|
||||
prompt, add_special_tokens=add_special_tokens
|
||||
)
|
||||
return mx.array(tokenized_prompt)
|
||||
|
||||
|
||||
def encode_prompt(tokenizer: TokenizerWrapper, prompt: str) -> mx.array:
|
||||
"""Encode a prompt string to token array.
|
||||
|
||||
For chat-templated prompts (which have their own structure markers like
|
||||
<|im_user|>, <|im_middle|>, etc.), we should NOT add BOS/EOS tokens as
|
||||
that would corrupt the prompt structure.
|
||||
"""
|
||||
# Chat templates define their own structure - don't add BOS/EOS
|
||||
tokenized_prompt = tokenizer.encode(prompt, add_special_tokens=False)
|
||||
return mx.array(tokenized_prompt)
|
||||
|
||||
|
||||
def cache_length(cache: KVCacheType) -> int:
|
||||
"""Get the number of tokens in a KV cache."""
|
||||
# Use .offset attribute which all cache types have (len() not implemented in older QuantizedKVCache)
|
||||
return max(c.offset for c in cache) # type: ignore
|
||||
|
||||
|
||||
def get_prefix_length(prompt: mx.array, cached_prompt: mx.array) -> int:
|
||||
"""Find the length of the common prefix between two token arrays."""
|
||||
n = min(int(prompt.shape[0]), int(cached_prompt.shape[0]))
|
||||
def _get_prefix_length(prompt: mx.array, cached_prompt: mx.array) -> int:
|
||||
n = min(int(prompt.shape[0]), int(cached_prompt.shape[0]), KEEP_KV_SIZE)
|
||||
if n == 0:
|
||||
return 0
|
||||
|
||||
equal = mx.equal(prompt[:n], cached_prompt[:n]).astype(mx.int32)
|
||||
equal = (prompt[:n] == cached_prompt[:n]).astype(mx.int32)
|
||||
prefix_mask = mx.cumprod(equal) # stays 1 until first mismatch, then 0 forever
|
||||
return int(mx.sum(prefix_mask).item())
|
||||
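
The cumprod trick in _get_prefix_length above can be seen in isolation (sketch assuming mlx.core as mx): the equality mask stays 1 until the first mismatch, so its cumulative product sums to the common prefix length.

import mlx.core as mx

prompt = mx.array([1, 2, 3, 4, 5])
cached = mx.array([1, 2, 3, 9, 9])
n = min(int(prompt.shape[0]), int(cached.shape[0]))
equal = (prompt[:n] == cached[:n]).astype(mx.int32)   # [1, 1, 1, 0, 0]
prefix_len = int(mx.sum(mx.cumprod(equal)).item())    # 3
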
|
||||
|
||||
def get_available_memory() -> Memory:
|
||||
mem: int = psutil.virtual_memory().available
|
||||
return Memory.from_bytes(mem)
|
||||
|
||||
|
||||
def get_memory_used_percentage() -> float:
|
||||
mem = psutil.virtual_memory()
|
||||
# percent is 0-100
|
||||
return float(mem.percent / 100)
|
||||
|
||||
|
||||
def make_kv_cache(
|
||||
model: Model, max_kv_size: int | None = None, keep: int = 0
|
||||
) -> KVCacheType:
|
||||
assert hasattr(model, "layers")
|
||||
|
||||
# TODO: Do this for all models
|
||||
if hasattr(model, "make_cache") and isinstance(model, GptOssModel):
|
||||
logger.info("Using MLX LM's make cache")
|
||||
return model.make_cache() # type: ignore
|
||||
|
||||
if max_kv_size is None:
|
||||
if KV_CACHE_BITS is None:
|
||||
logger.info("Using default KV cache")
|
||||
return [KVCache() for _ in model.layers]
|
||||
else:
|
||||
logger.info("Using quantized KV cache")
|
||||
return [
|
||||
QuantizedKVCache(group_size=CACHE_GROUP_SIZE, bits=KV_CACHE_BITS)
|
||||
for _ in model.layers
|
||||
]
|
||||
else:
|
||||
logger.info(f"Using rotating KV cache with {max_kv_size=} with {keep=}")
|
||||
return [RotatingKVCache(max_size=max_kv_size, keep=keep) for _ in model.layers]
|
||||
def prefill(
    model: Model,
    tokenizer: TokenizerWrapper,
    sampler: Callable[[mx.array], mx.array],
    prompt: mx.array,
    cache: list[_BaseCache],
) -> None:
    for _ in stream_generate(
        model=model,
        tokenizer=tokenizer,
        prompt=prompt,
        max_tokens=0,
        sampler=sampler,
        prompt_cache=cache,
        prefill_step_size=2048,
        kv_group_size=KV_GROUP_SIZE,
        kv_bits=KV_BITS,
    ):
        pass
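
A hypothetical usage sketch based on the signatures shown in this diff (model, tokenizer, sampler and prompt are assumed to exist; not verified against the rest of the codebase): the prefix cache prefills a prompt once, and later prompts that share a prefix reuse the stored entry instead of prefilling from scratch.

prefix_cache = KVPrefixCache()

# First request: nothing cached yet, so get_kv_cache prefills the full prompt.
cache = prefix_cache.get_kv_cache(model, tokenizer, sampler, prompt)
prefix_cache.add_kv_cache(tokenizer, prompt, cache)

# A later request sharing the same prefix reuses the stored cache entry.
cache2 = prefix_cache.get_kv_cache(model, tokenizer, sampler, prompt + " continued")
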
@@ -4,7 +4,7 @@
|
||||
KV_GROUP_SIZE: int | None = 32
|
||||
KV_BITS: int | None = None
|
||||
ATTENTION_KV_BITS: int | None = 4
|
||||
MAX_TOKENS: int = 32168
|
||||
MAX_TOKENS: int = 8192
|
||||
MAX_KV_SIZE: int | None = 3200
|
||||
KEEP_KV_SIZE: int | None = 1600
|
||||
QUANTIZE_MODEL_MODE: str | None = "affine"
|
||||
|
||||
@@ -1,94 +1,47 @@
|
||||
import time
|
||||
from typing import Any, Callable, Generator, cast, get_args
|
||||
|
||||
import mlx.core as mx
|
||||
from mlx_lm.generate import stream_generate
|
||||
from mlx_lm.models.cache import trim_prompt_cache
|
||||
from mlx_lm.models.cache import KVCache
|
||||
from mlx_lm.sample_utils import make_sampler
|
||||
from mlx_lm.tokenizer_utils import TokenizerWrapper
|
||||
|
||||
# from exo.engines.mlx.cache import KVPrefixCache
|
||||
from exo.shared.types.api import (
|
||||
BenchChatCompletionTaskParams,
|
||||
ChatCompletionMessage,
|
||||
CompletionTokensDetails,
|
||||
FinishReason,
|
||||
GenerationStats,
|
||||
PromptTokensDetails,
|
||||
Usage,
|
||||
)
|
||||
from exo.shared.types.memory import Memory
|
||||
from exo.shared.types.mlx import KVCacheType
|
||||
from exo.shared.types.tasks import ChatCompletionTaskParams
|
||||
from exo.shared.types.worker.runner_response import (
|
||||
GenerationResponse,
|
||||
)
|
||||
from exo.worker.engines.mlx import Model
|
||||
from exo.worker.engines.mlx.cache import KVPrefixCache, encode_prompt, make_kv_cache
|
||||
from exo.worker.engines.mlx.constants import KV_BITS, KV_GROUP_SIZE, MAX_TOKENS
|
||||
from exo.worker.engines.mlx.utils_mlx import (
|
||||
apply_chat_template,
|
||||
mx_barrier,
|
||||
make_kv_cache,
|
||||
)
|
||||
from exo.worker.runner.bootstrap import logger
|
||||
|
||||
generation_stream = mx.new_stream(mx.default_device())
|
||||
|
||||
_MIN_PREFIX_HIT_TO_UPDATE = 1000
|
||||
|
||||
|
||||
def prefill(
|
||||
model: Model,
|
||||
tokenizer: TokenizerWrapper,
|
||||
sampler: Callable[[mx.array], mx.array],
|
||||
prompt_tokens: mx.array,
|
||||
cache: KVCacheType,
|
||||
) -> tuple[float, int]:
|
||||
"""Prefill the KV cache with prompt tokens.
|
||||
|
||||
This runs the model over the prompt tokens to populate the cache,
|
||||
then trims off the extra generated token.
|
||||
|
||||
Returns:
|
||||
tokens_per_sec
|
||||
"""
|
||||
num_tokens = len(prompt_tokens)
|
||||
if num_tokens == 0:
|
||||
return 0.0, 0
|
||||
|
||||
logger.debug(f"Prefilling {num_tokens} tokens...")
|
||||
start_time = time.perf_counter()
|
||||
|
||||
def progress_callback(processed: int, total: int) -> None:
|
||||
elapsed = time.time() - start_time
|
||||
tok_per_sec = processed / elapsed if elapsed > 0 else 0
|
||||
logger.debug(
|
||||
f"Prefill progress: {processed}/{total} tokens ({tok_per_sec:.1f} tok/s)"
|
||||
)
|
||||
|
||||
# Use max_tokens=1 because max_tokens=0 does not work.
|
||||
# We just throw away the generated token - we only care about filling the cache
|
||||
for _ in stream_generate(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
prompt=prompt_tokens,
|
||||
max_tokens=1,
|
||||
sampler=sampler,
|
||||
prompt_cache=cache,
|
||||
prefill_step_size=2048,
|
||||
kv_group_size=KV_GROUP_SIZE,
|
||||
kv_bits=KV_BITS,
|
||||
prompt_progress_callback=progress_callback,
|
||||
):
|
||||
break # Stop after first iteration - cache is now filled
|
||||
trim_prompt_cache(cast(list[Any], cache), 1)
|
||||
|
||||
elapsed = time.perf_counter() - start_time
|
||||
tokens_per_sec = num_tokens / elapsed if elapsed > 0 else 0.0
|
||||
logger.debug(
|
||||
f"Prefill complete: {num_tokens} tokens in {elapsed:.2f}s "
|
||||
f"({tokens_per_sec:.1f} tok/s)"
|
||||
)
|
||||
return tokens_per_sec, num_tokens
|
||||
def maybe_quantize_kv_cache(
|
||||
prompt_cache: list[KVCache | Any],
|
||||
quantized_kv_start: int,
|
||||
kv_group_size: int,
|
||||
kv_bits: int | None,
|
||||
) -> None:
|
||||
if kv_bits is None:
|
||||
return
|
||||
for e, c in enumerate(prompt_cache):
|
||||
if (
|
||||
hasattr(c, "to_quantized") and c.offset >= quantized_kv_start # type: ignore
|
||||
):
|
||||
prompt_cache[e] = c.to_quantized(group_size=kv_group_size, bits=kv_bits)


def warmup_inference(
@@ -136,10 +89,6 @@ def warmup_inference(

    logger.info("Generated ALL warmup tokens")

    # TODO: Do we want an mx_barrier?
    # At least this version is actively incorrect, as it should use mx_barrier(group)
    mx_barrier()

    return tokens_generated


@@ -166,36 +115,18 @@ def mlx_generate(
    tokenizer: TokenizerWrapper,
    task: ChatCompletionTaskParams,
    prompt: str,
    kv_prefix_cache: KVPrefixCache | None = None,
) -> Generator[GenerationResponse]:
    # Ensure that generation stats contain only the peak memory for this generation
    mx.reset_peak_memory()
    is_bench: bool = isinstance(task, BenchChatCompletionTaskParams)

    logger.info(f"{is_bench=}")

    # Currently we support chat-completion tasks only.
    logger.debug(f"task_params: {task}")

    if task.seed is not None:
        mx.random.seed(task.seed)

    # Do not use the prefix cache when benchmarking.
    if is_bench:
        kv_prefix_cache = None

    # Use the prefix cache if available, otherwise create a fresh cache
    prefix_hit_length = 0
    matched_index: int | None = None
    if kv_prefix_cache is None:
        caches = make_kv_cache(model=model)
        prompt_tokens = encode_prompt(tokenizer, prompt)
    else:
        caches, prompt_tokens, matched_index = kv_prefix_cache.get_kv_cache(
            model, prompt
        )
        all_prompt_tokens = encode_prompt(tokenizer, prompt)
        prefix_hit_length = len(all_prompt_tokens) - len(prompt_tokens)
        caches = make_kv_cache(model=model)

    logits_processors: list[Callable[[mx.array, mx.array], mx.array]] = []
    if is_bench:
@@ -208,54 +139,28 @@ def mlx_generate(
        top_p=task.top_p if task.top_p is not None else 1.0,
    )

    # Prefill cache with all tokens except the last one
    prefill_tps, prefill_tokens = prefill(
        model, tokenizer, sampler, prompt_tokens[:-1], caches
    )

    # stream_generate starts from the last token
    last_token = prompt_tokens[-1:]

    max_tokens = task.max_tokens or MAX_TOKENS
    generated_text_parts: list[str] = []
    generation_start_time = time.perf_counter()
    usage: Usage | None = None
    in_thinking = False
    reasoning_tokens = 0
    think_start = tokenizer.think_start
    think_end = tokenizer.think_end
    for completion_tokens, out in enumerate(
        stream_generate(
            model=model,
            tokenizer=tokenizer,
            prompt=last_token,
            max_tokens=max_tokens,
            sampler=sampler,
            logits_processors=logits_processors,
            prompt_cache=caches,
            # TODO: Dynamically change prefill step size to be the maximum possible without timing out.
            prefill_step_size=2048,
            kv_group_size=KV_GROUP_SIZE,
            kv_bits=KV_BITS,
        ),
        start=1,
    for out in stream_generate(
        model=model,
        tokenizer=tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        sampler=sampler,
        logits_processors=logits_processors,
        prompt_cache=caches,
        # TODO: Dynamically change prefill step size to be the maximum possible without timing out.
        prefill_step_size=2048,
        kv_group_size=KV_GROUP_SIZE,
        kv_bits=KV_BITS,
    ):
        generated_text_parts.append(out.text)
        logger.info(out.text)

        if think_start is not None and out.text == think_start:
            in_thinking = True
        elif think_end is not None and out.text == think_end:
            in_thinking = False
        if in_thinking:
            reasoning_tokens += 1

        stats: GenerationStats | None = None
        if out.finish_reason is not None:
            stats = GenerationStats(
                prompt_tps=float(prefill_tps or out.prompt_tps),
                prompt_tps=float(out.prompt_tps),
                generation_tps=float(out.generation_tps),
                prompt_tokens=int(prefill_tokens + out.prompt_tokens),
                prompt_tokens=int(out.prompt_tokens),
                generation_tokens=int(out.generation_tokens),
                peak_memory_usage=Memory.from_gb(out.peak_memory),
            )
@@ -267,47 +172,12 @@ def mlx_generate(
                    f"Model generated unexpected finish_reason: {out.finish_reason}"
                )

            usage = Usage(
                prompt_tokens=int(out.prompt_tokens),
                completion_tokens=completion_tokens,
                total_tokens=int(out.prompt_tokens) + completion_tokens,
                prompt_tokens_details=PromptTokensDetails(
                    cached_tokens=prefix_hit_length
                ),
                completion_tokens_details=CompletionTokensDetails(
                    reasoning_tokens=reasoning_tokens
                ),
            )

        yield GenerationResponse(
            text=out.text,
            token=out.token,
            finish_reason=cast(FinishReason | None, out.finish_reason),
            stats=stats,
            usage=usage,
        )

        if out.finish_reason is not None:
            # Log generation stats
            generation_elapsed = time.perf_counter() - generation_start_time
            generated_tokens = len(generated_text_parts)
            generation_tps = (
                generated_tokens / generation_elapsed if generation_elapsed > 0 else 0.0
            )
            logger.debug(
                f"Generation complete: prefill {prefill_tokens} tokens @ "
                f"{prefill_tps:.1f} tok/s, generated {generated_tokens} tokens @ "
                f"{generation_tps:.1f} tok/s"
            )
            if kv_prefix_cache is not None:
                full_prompt = prompt + "".join(generated_text_parts)
                if (
                    matched_index is not None
                    and prefix_hit_length >= _MIN_PREFIX_HIT_TO_UPDATE
                ):
                    kv_prefix_cache.update_kv_cache(matched_index, full_prompt, caches)
                else:
                    kv_prefix_cache.add_kv_cache(full_prompt, caches)
            break

    # TODO: Do we want an mx_barrier?

@@ -18,12 +18,15 @@ try:
except ImportError:
    pass  # transformers < 5.0 or bytes_to_unicode not available

from mlx_lm.models.cache import KVCache
from mlx_lm.models.cache import KVCache, QuantizedKVCache, RotatingKVCache
from mlx_lm.models.deepseek_v3 import DeepseekV3Model
from mlx_lm.models.gpt_oss import Model as GptOssModel
from mlx_lm.tokenizer_utils import TokenizerWrapper

from exo.shared.models.model_cards import ModelId
from exo.worker.engines.mlx.constants import (
    CACHE_GROUP_SIZE,
    KV_CACHE_BITS,
    TRUST_REMOTE_CODE,
)

@@ -67,8 +70,6 @@ Group = mx.distributed.Group
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, 4096))


# TODO: Test this
# ALSO https://github.com/exo-explore/exo/pull/233#discussion_r2549683673
def get_weights_size(model_shard_meta: ShardMetadata) -> Memory:
    return Memory.from_float_kb(
        (model_shard_meta.end_layer - model_shard_meta.start_layer)
@@ -86,30 +87,6 @@ class ModelLoadingTimeoutError(Exception):
    pass


def mx_barrier(group: Group | None = None):
    mx.eval(
        mx.distributed.all_sum(
            mx.array(1.0),
            stream=mx.default_stream(mx.Device(mx.cpu)),
            group=group,
        )
    )


def broadcast_from_zero(value: int, group: Group | None = None):
    if group is None:
        return value

    if group.rank() == 0:
        a = mx.array([value], dtype=mx.int32)
    else:
        a = mx.array([0], dtype=mx.int32)

    m = mx.distributed.all_sum(a, stream=mx.Device(mx.DeviceType.cpu), group=group)
    mx.eval(m)
    return int(m.item())
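
These two helpers are the standard MLX all_sum coordination trick: a barrier is an all_sum of a dummy scalar, and a rank-0 broadcast is an all_sum in which every other rank contributes zero. A minimal sketch of how they compose (the seed value and the surrounding setup are illustrative):

# Illustrative: rank 0 picks a seed, every rank adopts it, then all
# ranks synchronize before continuing.
group = mx.distributed.init()          # assumes a configured backend
seed = broadcast_from_zero(42, group)  # non-zero ranks contribute zero
mx.random.seed(seed)
mx_barrier(group)                      # nobody proceeds until all ranks arrive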


class HostList(RootModel[list[str]]):
    @classmethod
    def from_hosts(cls, hosts: list[Host]) -> "HostList":
@@ -165,11 +142,12 @@ def mlx_distributed_init(

    jaccl_coordinator = jaccl_coordinators[bound_instance.bound_node_id]

    # TODO: update once upstream fixes
    logger.info(
        f"rank {rank} MLX_IBV_DEVICES: {coordination_file} with devices: {jaccl_devices_json}"
        f"rank {rank} MLX_JACCL_DEVICES: {coordination_file} with devices: {jaccl_devices_json}"
    )
    logger.info(f"rank {rank} MLX_JACCL_COORDINATOR: {jaccl_coordinator}")
    os.environ["MLX_IBV_DEVICES"] = coordination_file
    os.environ["MLX_JACCL_DEVICES"] = coordination_file
    os.environ["MLX_RANK"] = str(rank)
    os.environ["MLX_JACCL_COORDINATOR"] = jaccl_coordinator
    group = mx.distributed.init(backend="jaccl", strict=True)
@@ -258,10 +236,10 @@ def shard_and_load(

    logger.info(f"Group size: {group.size()}, group rank: {group.rank()}")

    # Estimate timeout based on model size (5x default for large queued workloads)
    base_timeout = float(os.environ.get("EXO_MODEL_LOAD_TIMEOUT", "300"))
    # Estimate timeout based on model size
    base_timeout = float(os.environ.get("EXO_MODEL_LOAD_TIMEOUT", "60"))
    model_size_gb = get_weights_size(shard_metadata).in_bytes / (1024**3)
    timeout_seconds = base_timeout + model_size_gb
    timeout_seconds = base_timeout + model_size_gb / 5
    logger.info(
        f"Evaluating model parameters with timeout of {timeout_seconds:.0f}s "
        f"(model size: {model_size_gb:.1f}GB)"
@@ -338,35 +316,8 @@ def load_tokenizer_for_model_id(

    # Kimi uses a custom TikTokenTokenizer that transformers 5.x can't load via AutoTokenizer
    if "kimi-k2" in model_id_lower:
        import importlib.util
        import types

        sys.path.insert(0, str(model_path))

        # Load tool_declaration_ts first (tokenization_kimi imports it with a relative import)
        tool_decl_path = model_path / "tool_declaration_ts.py"
        if tool_decl_path.exists():
            spec = importlib.util.spec_from_file_location(
                "tool_declaration_ts", tool_decl_path
            )
            if spec and spec.loader:
                tool_decl_module = importlib.util.module_from_spec(spec)
                sys.modules["tool_declaration_ts"] = tool_decl_module
                spec.loader.exec_module(tool_decl_module)

        # Load tokenization_kimi with patched source (convert relative to absolute import)
        tok_path = model_path / "tokenization_kimi.py"
        source = tok_path.read_text()
        source = source.replace("from .tool_declaration_ts", "from tool_declaration_ts")
        spec = importlib.util.spec_from_file_location("tokenization_kimi", tok_path)
        if spec:
            tok_module = types.ModuleType("tokenization_kimi")
            tok_module.__file__ = str(tok_path)
            sys.modules["tokenization_kimi"] = tok_module
            exec(compile(source, tok_path, "exec"), tok_module.__dict__)  # noqa: S102
            TikTokenTokenizer = tok_module.TikTokenTokenizer  # type: ignore[attr-defined]  # noqa: N806
        else:
            from tokenization_kimi import TikTokenTokenizer  # type: ignore[import-not-found]  # noqa: I001
        from tokenization_kimi import TikTokenTokenizer  # type: ignore[import-not-found]  # noqa: I001

        hf_tokenizer: Any = TikTokenTokenizer.from_pretrained(model_path)  # pyright: ignore[reportUnknownVariableType,reportUnknownMemberType]

@@ -428,11 +379,7 @@ def apply_chat_template(
            continue

        message.content = "\n".join(c.text for c in message.content).strip()
        if (
            message.content is None
            and message.thinking is None
            and message.tool_calls is None
        ):
        if message.content is None and message.thinking is None:
            continue

        # Null values are not valid when applying templates in tokenizer
@@ -489,6 +436,31 @@ class NullKVCache(KVCache):
        raise NotImplementedError("We should not be setting a NullKVCache.")


def make_kv_cache(
    model: Model, max_kv_size: int | None = None, keep: int = 0
) -> list[KVCache | RotatingKVCache | QuantizedKVCache]:
    assert hasattr(model, "layers")

    # TODO: Do this for all models
    if hasattr(model, "make_cache") and isinstance(model, GptOssModel):
        logger.info("Using MLX LM's make cache")
        return model.make_cache()  # type: ignore

    if max_kv_size is None:
        if KV_CACHE_BITS is None:
            logger.info("Using default KV cache")
            return [KVCache() for _ in model.layers]
        else:
            logger.info("Using quantized KV cache")
            return [
                QuantizedKVCache(group_size=CACHE_GROUP_SIZE, bits=KV_CACHE_BITS)
                for _ in model.layers
            ]
    else:
        logger.info(f"Using rotating KV cache with {max_kv_size=} with {keep=}")
        return [RotatingKVCache(max_size=max_kv_size, keep=keep) for _ in model.layers]
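
A quick sketch of how callers pick among the three cache flavors above (the argument values are illustrative):

# Default: one unbounded KVCache per layer (quantized instead when
# KV_CACHE_BITS is set in constants).
caches = make_kv_cache(model=model)

# Bounded memory: rotating caches that always keep the first 4 tokens
# (e.g. a BOS anchor) and roll over once 4096 entries are filled.
caches = make_kv_cache(model=model, max_kv_size=4096, keep=4)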


def mlx_force_oom(size: int = 40000) -> None:
    """
    Force an Out-Of-Memory (OOM) error in MLX by performing large tensor operations.
@@ -538,3 +510,23 @@ def mlx_cleanup(
    import gc

    gc.collect()


def mx_any(bool_: bool, group: Group | None) -> bool:
    if group is None:
        return bool_
    num_true = mx.distributed.all_sum(
        mx.array(bool_), group=group, stream=mx.default_stream(mx.Device(mx.cpu))
    )
    mx.eval(num_true)
    return num_true.item() > 0


def mx_barrier(group: Group | None):
    if group is None:
        return
    mx.eval(
        mx.distributed.all_sum(
            mx.array(1.0), group=group, stream=mx.default_stream(mx.Device(mx.cpu))
        )
    )
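
mx_any is the building block for cooperative cancellation in the runner: every rank reduces its local flag, so if any rank wants to stop, all ranks see it on the same step and break together - otherwise the next collective op would deadlock. A minimal sketch (generate_tokens and check_local_cancel_signal are hypothetical stand-ins):

# Hypothetical loop: all ranks must agree to break on the same step.
for token in generate_tokens():
    if mx_any(check_local_cancel_signal(), group):
        break  # every rank breaks here together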

@@ -33,6 +33,7 @@ from exo.shared.types.events import (
from exo.shared.types.multiaddr import Multiaddr
from exo.shared.types.state import State
from exo.shared.types.tasks import (
    CancelTask,
    CreateRunner,
    DownloadModel,
    ImageEdits,
@@ -115,8 +116,9 @@ class Worker:
        self.local_event_sender.close()
        self.command_sender.close()
        self.download_command_sender.close()
        for runner in self.runners.values():
            runner.shutdown()
        async with create_task_group() as tg:
            for runner in self.runners.values():
                tg.start_soon(runner.shutdown)

    async def _forward_info(self, recv: Receiver[GatheredInfo]):
        with recv as info_stream:
@@ -220,15 +222,22 @@ class Worker:
                        )
                    )
                case Shutdown(runner_id=runner_id):
                    runner = self.runners.pop(runner_id)
                    try:
                        with fail_after(3):
                            await self.runners.pop(runner_id).start_task(task)
                            await runner.start_task(task)
                    except TimeoutError:
                        await self.event_sender.send(
                            TaskStatusUpdated(
                                task_id=task.task_id, task_status=TaskStatus.TimedOut
                            )
                        )
                    finally:
                        await runner.shutdown()
                case CancelTask(cancelled_task_id=cancelled_task_id):
                    await self.runners[self._task_to_runner_id(task)].cancel_task(
                        cancelled_task_id
                    )
                case ImageEdits() if task.task_params.total_input_chunks > 0:
                    # Assemble image from chunks and inject into task
                    cmd_id = task.command_id
@@ -351,8 +360,6 @@ class Worker:
        for event in self.out_for_delivery.copy().values():
            await self.local_event_sender.send(event)

    ## Op Executors

    def _create_supervisor(self, task: CreateRunner) -> RunnerSupervisor:
        """Creates and stores a new AssignedRunner with initial downloading status."""
        runner = RunnerSupervisor.create(

@@ -4,6 +4,7 @@ from collections.abc import Mapping, Sequence

from exo.shared.types.common import CommandId, NodeId
from exo.shared.types.tasks import (
    CancelTask,
    ChatCompletion,
    ConnectToGroup,
    CreateRunner,
@@ -59,7 +60,8 @@ def plan(
        or _init_distributed_backend(runners, all_runners)
        or _load_model(runners, all_runners, global_download_status)
        or _ready_to_warmup(runners, all_runners)
        or _pending_tasks(runners, tasks, all_runners, input_chunk_buffer)
        or _cancel_tasks(runners, tasks)
        or _pending_tasks(runners, tasks, all_runners, input_chunk_buffer or {})
    )


@@ -270,7 +272,7 @@ def _pending_tasks(
    runners: Mapping[RunnerId, RunnerSupervisor],
    tasks: Mapping[TaskId, Task],
    all_runners: Mapping[RunnerId, RunnerStatus],
    input_chunk_buffer: Mapping[CommandId, dict[int, str]] | None = None,
    input_chunk_buffer: Mapping[CommandId, dict[int, str]],
) -> Task | None:
    for task in tasks.values():
        # for now, just forward chat completions
@@ -284,7 +286,7 @@ def _pending_tasks(
        if isinstance(task, ImageEdits) and task.task_params.total_input_chunks > 0:
            cmd_id = task.command_id
            expected = task.task_params.total_input_chunks
            received = len((input_chunk_buffer or {}).get(cmd_id, {}))
            received = len(input_chunk_buffer.get(cmd_id, {}))
            if received < expected:
                continue  # Wait for all chunks to arrive

@@ -292,16 +294,31 @@ def _pending_tasks(
            if task.instance_id != runner.bound_instance.instance.instance_id:
                continue

            # There is a design point here: this looks like a state race because the
            # task status doesn't get updated to completed fast enough. Realistically
            # the task status should be set to completed by the LAST runner, so this
            # is a true race; the actual solution is somewhat deeper than this bypass - TODO!
            # the task status _should_ be set to completed by the LAST runner
            # it is currently set by the first
            # this is definitely a hack
            if task.task_id in runner.completed:
                continue

            # TODO: Check ordering aligns with MLX distributed's expectations.

            if isinstance(runner.status, RunnerReady) and all(
                isinstance(all_runners[global_runner_id], (RunnerReady, RunnerRunning))
                for global_runner_id in runner.bound_instance.instance.shard_assignments.runner_to_shard
            ):
                return task


def _cancel_tasks(
    runners: Mapping[RunnerId, RunnerSupervisor],
    tasks: Mapping[TaskId, Task],
) -> Task | None:
    for task in tasks.values():
        if task.task_status != TaskStatus.Cancelled:
            continue
        for runner in runners.values():
            if task.instance_id != runner.bound_instance.instance.instance_id:
                continue
            if task.task_id in runner.cancelled:
                continue
            return CancelTask(
                instance_id=task.instance_id, cancelled_task_id=task.task_id
            )

@@ -3,7 +3,7 @@ import os
import loguru

from exo.shared.types.events import Event, RunnerStatusUpdated
from exo.shared.types.tasks import Task
from exo.shared.types.tasks import Task, TaskId
from exo.shared.types.worker.instances import BoundInstance, MlxJacclInstance
from exo.shared.types.worker.runners import RunnerFailed
from exo.utils.channels import ClosedResourceError, MpReceiver, MpSender
@@ -15,6 +15,7 @@ def entrypoint(
    bound_instance: BoundInstance,
    event_sender: MpSender[Event],
    task_receiver: MpReceiver[Task],
    cancel_receiver: MpReceiver[TaskId],
    _logger: "loguru.Logger",
) -> None:
    fast_synch_override = os.environ.get("EXO_FAST_SYNCH")
@@ -38,7 +39,7 @@ def entrypoint(
    try:
        from exo.worker.runner.runner import main

        main(bound_instance, event_sender, task_receiver)
        main(bound_instance, event_sender, task_receiver, cancel_receiver)
    except ClosedResourceError:
        logger.warning("Runner communication closed unexpectedly")
    except Exception as e:

@@ -71,7 +71,6 @@ from exo.worker.engines.image import (
    warmup_image_generator,
)
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.cache import KVPrefixCache
from exo.worker.engines.mlx.generator.generate import mlx_generate, warmup_inference
from exo.worker.engines.mlx.utils_mlx import (
    apply_chat_template,
@@ -79,6 +78,7 @@ from exo.worker.engines.mlx.utils_mlx import (
    initialize_mlx,
    load_mlx_items,
    mlx_force_oom,
    mx_any,
)
from exo.worker.runner.bootstrap import logger

@@ -87,6 +87,7 @@ def main(
    bound_instance: BoundInstance,
    event_sender: MpSender[Event],
    task_receiver: MpReceiver[Task],
    cancel_receiver: MpReceiver[TaskId],
):
    instance, runner_id, shard_metadata = (
        bound_instance.instance,
@@ -101,23 +102,22 @@ def main(
        time.sleep(timeout)

    setup_start_time = time.time()
    cancelled_tasks = set[TaskId]()

    model: Model | DistributedImageModel | None = None
    # type checker was unhappy with me - splitting these fixed it
    inference_model: Model | None = None
    image_model: DistributedImageModel | None = None
    tokenizer = None
    group = None
    kv_prefix_cache: KVPrefixCache | None = None

    current_status: RunnerStatus = RunnerIdle()
    logger.info("runner created")
    event_sender.send(
        RunnerStatusUpdated(runner_id=runner_id, runner_status=current_status)
    )
    seen = set[TaskId]()
    with task_receiver as tasks:
        for task in tasks:
            if task.task_id in seen:
                logger.warning("repeat task - potential error")
            seen.add(task.task_id)
            cancelled_tasks.discard(TaskId("CANCEL_CURRENT_TASK"))
            event_sender.send(
                TaskStatusUpdated(task_id=task.task_id, task_status=TaskStatus.Running)
            )
@@ -162,28 +162,25 @@ def main(
                    time.sleep(0.5)

                    if ModelTask.TextGeneration in shard_metadata.model_card.tasks:
                        model, tokenizer = load_mlx_items(
                        inference_model, tokenizer = load_mlx_items(
                            bound_instance, group, on_timeout=on_model_load_timeout
                        )
                        logger.info(
                            f"model has_tool_calling={tokenizer.has_tool_calling}"
                        )
                        kv_prefix_cache = KVPrefixCache(tokenizer, group)

                    elif (
                        ModelTask.TextToImage in shard_metadata.model_card.tasks
                        or ModelTask.ImageToImage in shard_metadata.model_card.tasks
                    ):
                        model = initialize_image_model(bound_instance)
                        image_model = initialize_image_model(bound_instance)
                    else:
                        raise ValueError(
                            f"Unknown model task(s): {shard_metadata.model_card.tasks}"
                        )

                    current_status = RunnerLoaded()
                    logger.info("runner loaded")
                case StartWarmup() if isinstance(current_status, RunnerLoaded):
                    assert model

                    current_status = RunnerWarmingUp()
                    logger.info("runner warming up")
                    event_sender.send(
@@ -194,11 +191,11 @@ def main(

                    logger.info(f"warming up inference for instance: {instance}")
                    if ModelTask.TextGeneration in shard_metadata.model_card.tasks:
                        assert not isinstance(model, DistributedImageModel)
                        assert inference_model
                        assert tokenizer

                        toks = warmup_inference(
                            model=model,
                            model=inference_model,
                            tokenizer=tokenizer,
                            # kv_prefix_cache=kv_prefix_cache,  # supply for warmup-time prefix caching
                        )
@@ -210,8 +207,8 @@ def main(
                        ModelTask.TextToImage in shard_metadata.model_card.tasks
                        or ModelTask.ImageToImage in shard_metadata.model_card.tasks
                    ):
                        assert isinstance(model, DistributedImageModel)
                        image = warmup_image_generator(model=model)
                        assert image_model
                        image = warmup_image_generator(model=image_model)
                        if image is not None:
                            logger.info(f"warmed up by generating {image.size} image")
                        else:
@@ -230,7 +227,7 @@ def main(
                            runner_id=runner_id, runner_status=current_status
                        )
                    )
                    assert model and not isinstance(model, DistributedImageModel)
                    assert inference_model
                    assert tokenizer
                    assert task_params.messages[0].content is not None

@@ -242,11 +239,10 @@ def main(

                    # Generate responses using the actual MLX generation
                    mlx_generator = mlx_generate(
                        model=model,
                        model=inference_model,
                        tokenizer=tokenizer,
                        task=task_params,
                        prompt=prompt,
                        kv_prefix_cache=kv_prefix_cache,
                    )

                    # For other thinking models (GLM, etc.), check if we need to
@@ -266,11 +262,11 @@ def main(
                        patch_glm_tokenizer(tokenizer)

                    # GPT-OSS specific parsing to match other model formats.
                    elif isinstance(model, GptOssModel):
                    elif isinstance(inference_model, GptOssModel):
                        mlx_generator = parse_gpt_oss(mlx_generator)

                    if tokenizer.has_tool_calling and not isinstance(
                        model, GptOssModel
                        inference_model, GptOssModel
                    ):
                        assert tokenizer.tool_call_start
                        assert tokenizer.tool_call_end
@@ -282,11 +278,21 @@ def main(
                        tokenizer.tool_parser,  # pyright: ignore[reportAny]
                    )

                    completion_tokens = 0
                    cancel_every = 5
                    tokens_since_last_cancel_check = 0
                    for response in mlx_generator:
                        tokens_since_last_cancel_check += 1
                        if tokens_since_last_cancel_check >= cancel_every:
                            tokens_since_last_cancel_check = 0
                            cancelled_tasks.update(cancel_receiver.collect())
                        want_to_cancel = (task.task_id in cancelled_tasks) or (
                            TaskId("CANCEL_CURRENT_TASK") in cancelled_tasks
                        )
                        if mx_any(want_to_cancel, group):
                            break

                        match response:
                            case GenerationResponse():
                                completion_tokens += 1
                                if (
                                    device_rank == 0
                                    and response.finish_reason == "error"
@@ -314,7 +320,6 @@ def main(
                                            model=shard_metadata.model_card.model_id,
                                            text=response.text,
                                            token_id=response.token,
                                            usage=response.usage,
                                            finish_reason=response.finish_reason,
                                            stats=response.stats,
                                        ),
@@ -328,7 +333,6 @@ def main(
                                        chunk=ToolCallChunk(
                                            tool_calls=response.tool_calls,
                                            model=shard_metadata.model_card.model_id,
                                            usage=response.usage,
                                        ),
                                    )
                                )
@@ -350,72 +354,16 @@ def main(

                    current_status = RunnerReady()
                    logger.info("runner ready")
                case ImageGeneration(
                    task_params=task_params, command_id=command_id
                ) if isinstance(current_status, RunnerReady):
                    assert isinstance(model, DistributedImageModel)
                    logger.info(f"received image generation request: {str(task)[:500]}")
                    current_status = RunnerRunning()
                    logger.info("runner running")
                    event_sender.send(
                        RunnerStatusUpdated(
                            runner_id=runner_id, runner_status=current_status
                        )
                    )

                    try:
                        # Generate images using the image generation backend
                        # Track image_index for final images only
                        image_index = 0
                        for response in generate_image(model=model, task=task_params):
                            if (
                                shard_metadata.device_rank
                                == shard_metadata.world_size - 1
                            ):
                                match response:
                                    case PartialImageResponse():
                                        logger.info(
                                            f"sending partial ImageChunk {response.partial_index}/{response.total_partials}"
                                        )
                                        _process_image_response(
                                            response,
                                            command_id,
                                            shard_metadata,
                                            event_sender,
                                            image_index,
                                        )
                                    case ImageGenerationResponse():
                                        logger.info("sending final ImageChunk")
                                        _process_image_response(
                                            response,
                                            command_id,
                                            shard_metadata,
                                            event_sender,
                                            image_index,
                                        )
                                        image_index += 1
                    # can we make this more explicit?
                    except Exception as e:
                        if shard_metadata.device_rank == shard_metadata.world_size - 1:
                            event_sender.send(
                                ChunkGenerated(
                                    command_id=command_id,
                                    chunk=ErrorChunk(
                                        model=shard_metadata.model_card.model_id,
                                        finish_reason="error",
                                        error_message=str(e),
                                    ),
                                )
                            )
                        raise

                    current_status = RunnerReady()
                    logger.info("runner ready")
                case ImageEdits(task_params=task_params, command_id=command_id) if (
                    isinstance(current_status, RunnerReady)
                case ImageGeneration() | ImageEdits() if isinstance(
                    current_status, RunnerReady
                ):
                    assert isinstance(model, DistributedImageModel)
                    logger.info(f"received image edits request: {str(task)[:500]}")
                    assert image_model
                    task_name = (
                        "image generation"
                        if isinstance(task, ImageGeneration)
                        else "image edits"
                    )
                    logger.info(f"received {task_name} request: {str(task)[:500]}")
                    current_status = RunnerRunning()
                    logger.info("runner running")
                    event_sender.send(
@@ -425,39 +373,19 @@ def main(
                    )

                    try:
                        image_index = 0
                        for response in generate_image(model=model, task=task_params):
                            if (
                                shard_metadata.device_rank
                                == shard_metadata.world_size - 1
                            ):
                                match response:
                                    case PartialImageResponse():
                                        logger.info(
                                            f"sending partial ImageChunk {response.partial_index}/{response.total_partials}"
                                        )
                                        _process_image_response(
                                            response,
                                            command_id,
                                            shard_metadata,
                                            event_sender,
                                            image_index,
                                        )
                                    case ImageGenerationResponse():
                                        logger.info("sending final ImageChunk")
                                        _process_image_response(
                                            response,
                                            command_id,
                                            shard_metadata,
                                            event_sender,
                                            image_index,
                                        )
                                        image_index += 1
                        _run_image_task(
                            task=task,
                            image_model=image_model,
                            shard_metadata=shard_metadata,
                            event_sender=event_sender,
                            cancel_receiver=cancel_receiver,
                            cancelled_tasks=cancelled_tasks,
                        )
                    except Exception as e:
                        if shard_metadata.device_rank == shard_metadata.world_size - 1:
                            event_sender.send(
                                ChunkGenerated(
                                    command_id=command_id,
                                    command_id=task.command_id,
                                    chunk=ErrorChunk(
                                        model=shard_metadata.model_card.model_id,
                                        finish_reason="error",
@@ -489,7 +417,7 @@ def main(
        RunnerStatusUpdated(runner_id=runner_id, runner_status=current_status)
    )
    if isinstance(current_status, RunnerShutdown):
        del model, tokenizer, group
        del inference_model, image_model, tokenizer, group
        mx.clear_cache()
        import gc

@@ -544,10 +472,10 @@ def parse_gpt_oss(
                        name=current_tool_name,
                        arguments="".join(tool_arg_parts).strip(),
                    )
                ],
                usage=response.usage,
                ]
            )
            tool_arg_parts = []
            break
        current_tool_name = recipient

        # If inside a tool call, accumulate arguments
@@ -598,6 +526,54 @@ def parse_thinking_models(
    yield response


def _run_image_task(
    task: ImageGeneration | ImageEdits,
    image_model: DistributedImageModel,
    shard_metadata: ShardMetadata,
    event_sender: MpSender[Event],
    cancel_receiver: MpReceiver[TaskId],
    cancelled_tasks: set[TaskId],
) -> None:
    task_id = task.task_id
    command_id = task.command_id

    def check_cancelled(task_id: TaskId = task_id) -> bool:
        cancelled_tasks.update(cancel_receiver.collect())
        return (task_id in cancelled_tasks) or (
            TaskId("CANCEL_CURRENT_TASK") in cancelled_tasks
        )

    image_index = 0
    for response in generate_image(
        model=image_model,
        task=task.task_params,
        cancel_checker=check_cancelled,
    ):
        if shard_metadata.device_rank == shard_metadata.world_size - 1:
            match response:
                case PartialImageResponse():
                    logger.info(
                        f"sending partial ImageChunk {response.partial_index}/{response.total_partials}"
                    )
                    _process_image_response(
                        response,
                        command_id,
                        shard_metadata,
                        event_sender,
                        image_index,
                    )
                case ImageGenerationResponse():
                    logger.info("sending final ImageChunk")
                    _process_image_response(
                        response,
                        command_id,
                        shard_metadata,
                        event_sender,
                        image_index,
                    )
                    image_index += 1


def _send_image_chunk(
    encoded_data: str,
    command_id: CommandId,
@@ -693,7 +669,7 @@ def parse_tool_calls(
            tools = [_validate_single_tool(tool) for tool in parsed]
        else:
            tools = [_validate_single_tool(parsed)]
        yield ToolCallResponse(tool_calls=tools, usage=response.usage)
        yield ToolCallResponse(tool_calls=tools)

    except (
        json.JSONDecodeError,

@@ -49,10 +49,12 @@ class RunnerSupervisor:
    _ev_recv: MpReceiver[Event]
    _task_sender: MpSender[Task]
    _event_sender: Sender[Event]
    _tg: TaskGroup | None = field(default=None, init=False)
    _cancel_sender: MpSender[TaskId]
    _tg: TaskGroup = field(default_factory=create_task_group, init=False)
    status: RunnerStatus = field(default_factory=RunnerIdle, init=False)
    pending: dict[TaskId, anyio.Event] = field(default_factory=dict, init=False)
    completed: set[TaskId] = field(default_factory=set, init=False)
    cancelled: set[TaskId] = field(default_factory=set, init=False)

    @classmethod
    def create(
@@ -63,8 +65,8 @@ class RunnerSupervisor:
        initialize_timeout: float = 400,
    ) -> Self:
        ev_send, ev_recv = mp_channel[Event]()
        # A task is kind of a runner command
        task_sender, task_recv = mp_channel[Task]()
        cancel_sender, cancel_recv = mp_channel[TaskId]()

        runner_process = Process(
            target=entrypoint,
@@ -72,6 +74,7 @@ class RunnerSupervisor:
                bound_instance,
                ev_send,
                task_recv,
                cancel_recv,
                logger,
            ),
            daemon=True,
@@ -86,6 +89,7 @@ class RunnerSupervisor:
            initialize_timeout=initialize_timeout,
            _ev_recv=ev_recv,
            _task_sender=task_sender,
            _cancel_sender=cancel_sender,
            _event_sender=event_sender,
        )

@@ -93,47 +97,46 @@ class RunnerSupervisor:

    async def run(self):
        self.runner_process.start()
        async with create_task_group() as tg:
            self._tg = tg
        async with self._tg as tg:
            tg.start_soon(self._forward_events)

        self._ev_recv.close()
        self._task_sender.close()
        self._event_sender.close()
        await to_thread.run_sync(self.runner_process.join, 30)
        if not self.runner_process.is_alive():
            return
        with anyio.CancelScope(shield=True), contextlib.suppress(ClosedResourceError):
            await self._cancel_sender.send_async(TaskId("CANCEL_CURRENT_TASK"))

        # This is overkill but it's not technically bad, just unnecessary.
        logger.warning("Runner process didn't shut down successfully, terminating")
        self.runner_process.terminate()
        await to_thread.run_sync(self.runner_process.join, 5)
        if not self.runner_process.is_alive():
            return
        self._ev_recv.close()
        self._task_sender.close()
        self._event_sender.close()
        self._cancel_sender.close()

        logger.critical("Runner process didn't respond to SIGTERM, killing")
        self.runner_process.kill()
        await to_thread.run_sync(self.runner_process.join, 10)
        if not self.runner_process.is_alive():
            return

        await to_thread.run_sync(self.runner_process.join, 5)
        if not self.runner_process.is_alive():
            return
        # This is overkill but it's not technically bad, just unnecessary.
        logger.warning("Runner process didn't shut down successfully, terminating")
        self.runner_process.terminate()
        await to_thread.run_sync(self.runner_process.join, 5)
        if not self.runner_process.is_alive():
            return

        logger.critical(
            "Runner process didn't respond to SIGKILL. System resources may have leaked"
        )
        logger.critical("Runner process didn't respond to SIGTERM, killing")
        self.runner_process.kill()

    def shutdown(self):
        assert self._tg
        await to_thread.run_sync(self.runner_process.join, 5)
        if not self.runner_process.is_alive():
            return

        logger.critical(
            "Runner process didn't respond to SIGKILL. System resources may have leaked"
        )

    async def shutdown(self):
        await self._cancel_sender.send_async(TaskId("CANCEL_CURRENT_TASK"))
        self._tg.cancel_scope.cancel()

    async def start_task(self, task: Task):
        if task.task_id in self.pending:
            logger.warning(
                f"Skipping invalid task {task} as it has already been submitted"
            )
            return
        if task.task_id in self.completed:
            logger.warning(
            logger.info(
                f"Skipping invalid task {task} as it has already been completed"
            )
            return
@@ -141,12 +144,19 @@ class RunnerSupervisor:
        event = anyio.Event()
        self.pending[task.task_id] = event
        try:
            await self._task_sender.send_async(task)
            self._task_sender.send(task)
        except ClosedResourceError:
            logger.warning(f"Task {task} dropped, runner closed communication.")
            return
        await event.wait()

    async def cancel_task(self, task_id: TaskId):
        if task_id in self.completed:
            logger.info(f"Unable to cancel {task_id} as it has been completed")
            return
        self.cancelled.add(task_id)
        await self._cancel_sender.send_async(task_id)

    async def _forward_events(self):
        with self._ev_recv as events:
            try:
@@ -211,4 +221,4 @@ class RunnerSupervisor:
                    runner_status=RunnerFailed(error_message=f"Terminated ({cause})"),
                )
            )
        self.shutdown()
        await self.shutdown()

@@ -1,545 +0,0 @@
# type: ignore
import time
from typing import cast
from unittest.mock import patch

import mlx.core as mx
import pytest
from mlx_lm.models.cache import KVCache
from mlx_lm.sample_utils import make_sampler

from exo.shared.types.api import ChatCompletionMessage
from exo.shared.types.common import ModelId
from exo.shared.types.tasks import ChatCompletionTaskParams
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.cache import (
    KVPrefixCache,
    cache_length,
    encode_prompt,
    get_prefix_length,
    make_kv_cache,
)
from exo.worker.engines.mlx.generator.generate import mlx_generate, prefill
from exo.worker.engines.mlx.utils_mlx import apply_chat_template
from exo.worker.tests.unittests.test_mlx.conftest import (
    DEFAULT_GPT_OSS_CONFIG,
    DEFAULT_GPT_OSS_MODEL_ID,
)


def _check_model_exists() -> bool:
    return DEFAULT_GPT_OSS_CONFIG.model_path.exists()


class TestGetPrefixLength:
    def test_identical_arrays(self):
        a = mx.array([1, 2, 3, 4, 5])
        b = mx.array([1, 2, 3, 4, 5])
        assert get_prefix_length(a, b) == 5

    def test_no_common_prefix(self):
        a = mx.array([1, 2, 3])
        b = mx.array([4, 5, 6])
        assert get_prefix_length(a, b) == 0

    def test_partial_prefix(self):
        a = mx.array([1, 2, 3, 4, 5])
        b = mx.array([1, 2, 3, 7, 8])
        assert get_prefix_length(a, b) == 3

    def test_prompt_longer_than_cached(self):
        a = mx.array([1, 2, 3, 4, 5])
        b = mx.array([1, 2, 3])
        assert get_prefix_length(a, b) == 3

    def test_cached_longer_than_prompt(self):
        a = mx.array([1, 2, 3])
        b = mx.array([1, 2, 3, 4, 5])
        assert get_prefix_length(a, b) == 3

    def test_single_token_match(self):
        a = mx.array([1, 2, 3])
        b = mx.array([1, 5, 6])
        assert get_prefix_length(a, b) == 1

    def test_empty_prompt(self):
        a = mx.array([]).astype(mx.int32)
        b = mx.array([1, 2, 3])
        assert get_prefix_length(a, b) == 0

    def test_empty_cached(self):
        a = mx.array([1, 2, 3])
        b = mx.array([]).astype(mx.int32)
        assert get_prefix_length(a, b) == 0

    def test_both_empty(self):
        a = mx.array([]).astype(mx.int32)
        b = mx.array([]).astype(mx.int32)
        assert get_prefix_length(a, b) == 0
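
The tests above pin down the contract of get_prefix_length. The shipped implementation lives in exo.worker.engines.mlx.cache; the following is only a minimal sketch consistent with these tests, not the real code:

def get_prefix_length_sketch(a: mx.array, b: mx.array) -> int:
    # Compare elementwise up to the shorter length; the cumulative
    # product drops to 0 at the first mismatch, so its sum equals the
    # length of the shared prefix.
    n = min(len(a), len(b))
    if n == 0:
        return 0
    matches = mx.cumprod((a[:n] == b[:n]).astype(mx.int32))
    return int(mx.sum(matches).item())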


class TestKVPrefix:
    @pytest.fixture
    def mock_tokenizer(self):
        """Create a minimal mock tokenizer for tests that don't need real tokenization."""
        from unittest.mock import MagicMock

        tokenizer = MagicMock()
        tokenizer.encode.return_value = [1, 2, 3]
        return tokenizer

    def test_starts_empty(self, mock_tokenizer):
        cache = KVPrefixCache(mock_tokenizer)
        assert len(cache.prompts) == 0
        assert len(cache.caches) == 0

    def test_clear_empties_cache(self, mock_tokenizer):
        cache = KVPrefixCache(mock_tokenizer)
        cache.prompts.append(mx.array([1, 2, 3]))
        cache.caches.append([KVCache()])
        cache.clear()
        assert len(cache.prompts) == 0
        assert len(cache.caches) == 0

    def test_clear_on_empty_cache(self, mock_tokenizer):
        cache = KVPrefixCache(mock_tokenizer)
        cache.clear()
        assert len(cache.prompts) == 0


def _load_gpt_oss() -> tuple[Model, object]:
    from mlx_lm.utils import load_model

    from exo.worker.engines.mlx.utils_mlx import load_tokenizer_for_model_id

    model_path = DEFAULT_GPT_OSS_CONFIG.model_path
    model_id = ModelId(DEFAULT_GPT_OSS_MODEL_ID)

    model, _ = load_model(model_path, lazy=False)
    tokenizer = load_tokenizer_for_model_id(model_id, model_path)
    return cast(Model, model), tokenizer


@pytest.mark.slow
@pytest.mark.skipif(
    not _check_model_exists(),
    reason=f"GPT-OSS model not found at {DEFAULT_GPT_OSS_CONFIG.model_path}",
)
class TestKVPrefixCacheWithModel:
    @pytest.fixture(scope="class")
    def model_and_tokenizer(self):
        model, tokenizer = _load_gpt_oss()
        return model, tokenizer

    def test_prefill_populates_cache(self, model_and_tokenizer):
        model, tokenizer = model_and_tokenizer

        task = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[ChatCompletionMessage(role="user", content="Hello!!")],
            max_tokens=1,
        )
        prompt = apply_chat_template(tokenizer, task)
        tokens = encode_prompt(tokenizer, prompt)
        cache = make_kv_cache(model)

        prefill(model, tokenizer, make_sampler(0.0), tokens, cache)

        # Cache should now hold the prompt tokens
        assert cache_length(cache) == len(tokens)

    def test_add_and_get_exact_match(self, model_and_tokenizer):
        model, tokenizer = model_and_tokenizer

        task = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[ChatCompletionMessage(role="user", content="Test exact")],
            max_tokens=1,
        )
        prompt = apply_chat_template(tokenizer, task)
        tokens = encode_prompt(tokenizer, prompt)
        cache = make_kv_cache(model)

        prefill(model, tokenizer, make_sampler(0.0), tokens, cache)

        kv_prefix_cache = KVPrefixCache(tokenizer)
        kv_prefix_cache.add_kv_cache(prompt, cache)

        assert len(kv_prefix_cache.prompts) == 1
        stored_length = cache_length(kv_prefix_cache.caches[0])
        assert stored_length > 0

        # Retrieve with same prompt: exact match
        result_cache, remaining_tokens, matched_index = kv_prefix_cache.get_kv_cache(
            model, prompt
        )
        assert matched_index == 0

        # Exact match returns only the last token
        assert len(remaining_tokens) == 1
        assert mx.array_equal(remaining_tokens, tokens[-1:])

    def test_add_and_get_prefix_match(self, model_and_tokenizer):
        """get_kv_cache with a longer prompt sharing a prefix should return a partial match."""
        model, tokenizer = model_and_tokenizer

        short_task = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[ChatCompletionMessage(role="user", content="Hi")],
            max_tokens=1,
        )
        short_prompt = apply_chat_template(tokenizer, short_task)
        short_tokens = encode_prompt(tokenizer, short_prompt)
        cache = make_kv_cache(model)

        prefill(model, tokenizer, make_sampler(0.0), short_tokens, cache)

        kv_prefix_cache = KVPrefixCache(tokenizer)
        kv_prefix_cache.add_kv_cache(short_prompt, cache)

        # Query with a longer prompt that shares the chat template prefix
        long_task = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[
                ChatCompletionMessage(role="user", content="Hi there, how are you?")
            ],
            max_tokens=1,
        )
        long_prompt = apply_chat_template(tokenizer, long_task)
        long_tokens = encode_prompt(tokenizer, long_prompt)

        # The prompts share a prefix (chat template preamble + "Hi")
        expected_prefix = get_prefix_length(long_tokens, short_tokens)
        assert expected_prefix > 0, (
            "Prompts should share a prefix from the chat template"
        )

        result_cache, remaining_tokens, matched_index = kv_prefix_cache.get_kv_cache(
            model, long_prompt
        )
        assert matched_index == 0

        # remaining_tokens should be the suffix after the shared prefix
        assert len(remaining_tokens) == len(long_tokens) - expected_prefix
        assert mx.array_equal(remaining_tokens, long_tokens[expected_prefix:])

    def test_stored_cache_not_mutated_after_get_and_generation(
        self, model_and_tokenizer
    ):
        """Getting a cache and then mutating it (as generation does) must not corrupt the stored cache."""
        model, tokenizer = model_and_tokenizer

        task = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[ChatCompletionMessage(role="user", content="Mutation test")],
            max_tokens=1,
        )
        prompt = apply_chat_template(tokenizer, task)
        tokens = encode_prompt(tokenizer, prompt)
        cache = make_kv_cache(model)

        prefill(model, tokenizer, make_sampler(0.0), tokens, cache)

        kv_prefix_cache = KVPrefixCache(tokenizer)
        kv_prefix_cache.add_kv_cache(prompt, cache)

        stored_length = cache_length(kv_prefix_cache.caches[0])

        # Get cache and mutate it (simulating what generation does)
        result_cache, _, matched_index = kv_prefix_cache.get_kv_cache(model, prompt)
        assert matched_index == 0

        # Simulate generation: feed many additional tokens through the cache
        head_dim = result_cache[0].keys.shape[-1]
        num_heads = result_cache[0].keys.shape[1]
        extra_keys = mx.random.normal((1, num_heads, 50, head_dim))
        extra_values = mx.random.normal((1, num_heads, 50, head_dim))
        for layer_cache in result_cache:
            layer_cache.update_and_fetch(extra_keys, extra_values)
        mx.eval([c.keys for c in result_cache])

        # Stored cache must be unchanged
        assert cache_length(kv_prefix_cache.caches[0]) == stored_length

    def test_stored_cache_survives_repeated_get_mutate_cycles(
        self, model_and_tokenizer
    ):
        """Multiple get+mutate cycles (like repeated user requests) must not corrupt the cache."""
        model, tokenizer = model_and_tokenizer

        task = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[ChatCompletionMessage(role="user", content="Repeat test")],
            max_tokens=1,
        )
        prompt = apply_chat_template(tokenizer, task)
        tokens = encode_prompt(tokenizer, prompt)
        cache = make_kv_cache(model)

        prefill(model, tokenizer, make_sampler(0.0), tokens, cache)

        kv_prefix_cache = KVPrefixCache(tokenizer)
        kv_prefix_cache.add_kv_cache(prompt, cache)

        stored_length = cache_length(kv_prefix_cache.caches[0])

        for i in range(3):
            result_cache, _, _ = kv_prefix_cache.get_kv_cache(model, prompt)

            head_dim = result_cache[0].keys.shape[-1]
            num_heads = result_cache[0].keys.shape[1]
            extra = mx.random.normal((1, num_heads, 30, head_dim))
            for layer_cache in result_cache:
                layer_cache.update_and_fetch(extra, extra)
            mx.eval([c.keys for c in result_cache])

            assert cache_length(kv_prefix_cache.caches[0]) == stored_length, (
                f"Failed on loop {i}"
            )

    def test_mlx_generate_populates_cache(self, model_and_tokenizer):
        """mlx_generate should save the cache after generation completes."""
        model, tokenizer = model_and_tokenizer

        kv_prefix_cache = KVPrefixCache(tokenizer)
        task = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[ChatCompletionMessage(role="user", content="Hello")],
            max_tokens=5,
        )
        prompt = apply_chat_template(tokenizer, task)
        prompt_tokens = encode_prompt(tokenizer, prompt)

        # Consume the entire generator so the cache-saving code after yield runs
        generated_tokens = 0
        for _response in mlx_generate(
            model=model,
            tokenizer=tokenizer,
            task=task,
            prompt=prompt,
            kv_prefix_cache=kv_prefix_cache,
        ):
            generated_tokens += 1

        assert len(kv_prefix_cache.prompts) == 1
        assert len(kv_prefix_cache.caches) == 1
        # Cache should contain prompt + generated tokens
        expected_length = len(prompt_tokens) + generated_tokens
        assert cache_length(kv_prefix_cache.caches[0]) == expected_length

    def test_mlx_generate_second_call_gets_prefix_hit(self, model_and_tokenizer):
        """A second mlx_generate call with the same prompt should get a prefix hit from the stored cache."""
        model, tokenizer = model_and_tokenizer

        kv_prefix_cache = KVPrefixCache(tokenizer)
        task = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[ChatCompletionMessage(role="user", content="Reuse test")],
            max_tokens=5,
        )
        prompt = apply_chat_template(tokenizer, task)
        prompt_tokens = encode_prompt(tokenizer, prompt)

        # First generation populates cache
        for _response in mlx_generate(
            model=model,
            tokenizer=tokenizer,
            task=task,
            prompt=prompt,
            kv_prefix_cache=kv_prefix_cache,
        ):
            pass

        assert len(kv_prefix_cache.prompts) == 1

        # Second call should find a prefix match (the stored cache contains
        # prompt + generated tokens, which shares the prompt prefix)
        result_cache, remaining_tokens, matched_index = kv_prefix_cache.get_kv_cache(
            model, prompt
        )
        # The stored cache is longer than the prompt (it includes generated tokens),
        # so this is a prefix match where our prompt is fully contained
        assert matched_index == 0
        # Exact match: remaining_tokens is just the last token
        assert len(remaining_tokens) == 1
        assert mx.array_equal(remaining_tokens, prompt_tokens[-1:])

    def test_mlx_generate_long_prompt_updates_cache_in_place(self, model_and_tokenizer):
        """With a prompt > 1000 tokens, a second generation should update the cache entry in place."""
        model, tokenizer = model_and_tokenizer

        kv_prefix_cache = KVPrefixCache(tokenizer)

        # Build a long user message (> 1000 tokens) to exceed _MIN_PREFIX_HIT_TO_UPDATE
        base_text = "The quick brown fox jumps over the lazy dog. "
        base_tokens = tokenizer.encode(base_text)
        repeats = (1200 // len(base_tokens)) + 2
        long_content = base_text * repeats

        task1 = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[ChatCompletionMessage(role="user", content=long_content)],
            max_tokens=5,
        )
        prompt1 = apply_chat_template(tokenizer, task1)
        prompt1_tokens = encode_prompt(tokenizer, prompt1)
        assert len(prompt1_tokens) > 1000, (
            "Prompt must exceed _MIN_PREFIX_HIT_TO_UPDATE"
        )

        # First generation populates the cache (must prefill all tokens)
        t0 = time.perf_counter()
        for _response in mlx_generate(
            model=model,
            tokenizer=tokenizer,
            task=task1,
            prompt=prompt1,
            kv_prefix_cache=kv_prefix_cache,
        ):
            pass
        first_gen_time = time.perf_counter() - t0

        assert len(kv_prefix_cache.prompts) == 1
        first_cache_length = cache_length(kv_prefix_cache.caches[0])

        # Second generation: same long prompt + extra content (simulating multi-turn)
        task2 = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[
                ChatCompletionMessage(role="user", content=long_content),
                ChatCompletionMessage(role="assistant", content="Sure, I can help."),
                ChatCompletionMessage(role="user", content="Tell me more."),
            ],
            max_tokens=5,
        )
        prompt2 = apply_chat_template(tokenizer, task2)
        prompt2_tokens = encode_prompt(tokenizer, prompt2)

        # Verify the prompts share a long prefix
        prefix_len = get_prefix_length(prompt2_tokens, prompt1_tokens)
        assert prefix_len > 1000, "Prompts must share > 1000 token prefix"

        # Second generation should reuse the cached prefix (only prefill new tokens)
        t0 = time.perf_counter()
        for _response in mlx_generate(
            model=model,
            tokenizer=tokenizer,
            task=task2,
            prompt=prompt2,
            kv_prefix_cache=kv_prefix_cache,
        ):
            pass
        second_gen_time = time.perf_counter() - t0

        # Second generation should be significantly faster due to the prefix cache hit - hopefully not flaky
        assert second_gen_time < first_gen_time * 0.5, (
            f"Expected prefix cache speedup: "
            f"first={first_gen_time:.2f}s, second={second_gen_time:.2f}s"
        )

        # With prefix_hit > 1000, should update in place (not add a second entry)
        assert len(kv_prefix_cache.prompts) == 1
        # Updated cache should be longer (prompt2 + generated > prompt1 + generated)
        updated_cache_length = cache_length(kv_prefix_cache.caches[0])
        assert updated_cache_length > first_cache_length

    def test_mlx_generate_stored_cache_not_mutated(self, model_and_tokenizer):
        """After mlx_generate saves a cache, a second generation must not corrupt the stored copy."""
        model, tokenizer = model_and_tokenizer

        kv_prefix_cache = KVPrefixCache(tokenizer)
        task = ChatCompletionTaskParams(
            model=DEFAULT_GPT_OSS_MODEL_ID,
            messages=[ChatCompletionMessage(role="user", content="Immutable test")],
            max_tokens=5,
        )
        prompt = apply_chat_template(tokenizer, task)

        # First generation populates cache
        for _response in mlx_generate(
            model=model,
            tokenizer=tokenizer,
            task=task,
            prompt=prompt,
            kv_prefix_cache=kv_prefix_cache,
        ):
            pass

        first_cache_length = cache_length(kv_prefix_cache.caches[0])

        # Second generation gets the cache and mutates it during generation
        for _response in mlx_generate(
            model=model,
            tokenizer=tokenizer,
            task=task,
            prompt=prompt,
            kv_prefix_cache=kv_prefix_cache,
        ):
            pass

        # The first stored cache must not have been mutated by the second generation
        assert cache_length(kv_prefix_cache.caches[0]) == first_cache_length

    def test_evicts_lru_entry_under_memory_pressure(self, model_and_tokenizer):
        """Under memory pressure, adding a new cache entry evicts the least recently used one."""
        model, tokenizer = model_and_tokenizer

        kv_prefix_cache = KVPrefixCache(tokenizer)

        # Add three cache entries with different prompts
        prompts = ["First entry", "Second entry", "Third entry"]
        for i, content in enumerate(prompts):
            task = ChatCompletionTaskParams(
                model=DEFAULT_GPT_OSS_MODEL_ID,
                messages=[ChatCompletionMessage(role="user", content=content)],
                max_tokens=1,
            )
            prompt = apply_chat_template(tokenizer, task)
            tokens = encode_prompt(tokenizer, prompt)
            cache = make_kv_cache(model)
            prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
            kv_prefix_cache.add_kv_cache(prompt, cache)
            # Stagger _last_used so LRU order is deterministic
            kv_prefix_cache._last_used[i] = float(i)

        assert len(kv_prefix_cache.prompts) == 3

        # Access the third entry to make it most recently used
        kv_prefix_cache._last_used[2] = 100.0
        # Entry 0 (_last_used=0.0) is LRU, entry 1 (_last_used=1.0) is next

        # Simulate memory pressure: active memory exceeds threshold
        fake_limit = 1000
        fake_active = int(fake_limit * 0.90)  # Above _MEMORY_THRESHOLD (0.85)

        with (
            patch(
                "exo.worker.engines.mlx.cache.mx.metal.get_active_memory",
                return_value=fake_active,
            ),
            patch(
                "exo.worker.engines.mlx.cache.mx.metal.device_info",
                return_value={"max_recommended_working_set_size": fake_limit},
            ),
        ):
            # Trigger eviction by adding a new entry
            task = ChatCompletionTaskParams(
                model=DEFAULT_GPT_OSS_MODEL_ID,
                messages=[ChatCompletionMessage(role="user", content="New entry")],
                max_tokens=1,
            )
            prompt = apply_chat_template(tokenizer, task)
            tokens = encode_prompt(tokenizer, prompt)
            cache = make_kv_cache(model)
            prefill(model, tokenizer, make_sampler(0.0), tokens, cache)
            kv_prefix_cache.add_kv_cache(prompt, cache)

        # LRU entries should have been evicted (entries 0, 1, 2 in order of _last_used)
        # Since fake_active stays above threshold after each eviction (we don't change it),
        # all old entries get evicted, leaving only the newly added one
        assert len(kv_prefix_cache.prompts) == 1
        # The surviving entry should be the newly added one
        new_tokens = encode_prompt(tokenizer, prompt)
        assert get_prefix_length(kv_prefix_cache.prompts[0], new_tokens) == len(
            new_tokens
        )
|
||||
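
# Illustrative sketch (hypothetical, not the exo cache implementation): the eviction
# behavior the test above exercises: drop least-recently-used entries while active
# memory stays above the threshold. With a mocked reading that never drops, every old
# entry is evicted, which is why only the newly added one survives.
_MEMORY_THRESHOLD = 0.85

def _evict_lru(last_used: dict[int, float], active: int, limit: int) -> list[int]:
    evicted: list[int] = []
    remaining = dict(last_used)
    while remaining and active > limit * _MEMORY_THRESHOLD:
        lru_key = min(remaining, key=lambda k: remaining[k])  # oldest timestamp first
        evicted.append(lru_key)
        del remaining[lru_key]
        # A real implementation would free the evicted cache and re-read active memory here.
    return evicted

assert _evict_lru({0: 0.0, 1: 1.0, 2: 100.0}, active=900, limit=1000) == [0, 1, 2]
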
@@ -109,8 +109,8 @@ def assert_events_equal(test_events: Iterable[Event], true_events: Iterable[Even

@pytest.fixture
def patch_out_mlx(monkeypatch: pytest.MonkeyPatch):
    # initialize_mlx returns a mock group
    monkeypatch.setattr(mlx_runner, "initialize_mlx", make_nothin(MockGroup()))
    # initialize_mlx returns a "group" equal to 1
    monkeypatch.setattr(mlx_runner, "initialize_mlx", make_nothin(1))
    monkeypatch.setattr(mlx_runner, "load_mlx_items", make_nothin((1, MockTokenizer)))
    monkeypatch.setattr(mlx_runner, "warmup_inference", make_nothin(1))
    monkeypatch.setattr(mlx_runner, "_check_for_debug_prompts", nothin)
@@ -120,7 +120,7 @@ def patch_out_mlx(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setattr(mlx_runner, "detect_thinking_prompt_suffix", make_nothin(False))

    def fake_generate(*_1: object, **_2: object):
        yield GenerationResponse(token=0, text="hi", finish_reason="stop", usage=None)
        yield GenerationResponse(token=0, text="hi", finish_reason="stop")

    monkeypatch.setattr(mlx_runner, "mlx_generate", fake_generate)

@@ -147,14 +147,6 @@ class MockTokenizer:
    has_tool_calling = False


class MockGroup:
    def rank(self) -> int:
        return 0

    def size(self) -> int:
        return 1


def _run(tasks: Iterable[Task]):
    bound_instance = get_bound_mlx_ring_instance(
        instance_id=INSTANCE_1_ID,
@@ -190,8 +182,6 @@ def test_events_processed_in_correct_order(patch_out_mlx: pytest.MonkeyPatch):
            text="hi",
            token_id=0,
            finish_reason="stop",
            usage=None,
            stats=None,
        ),
    )

@@ -11,6 +11,7 @@ if [[ $# -lt 2 ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
kind=$1
|
||||
shift
|
||||
|
||||
@@ -30,14 +31,14 @@ for name in "${hostnames[@]}"; do
|
||||
weaved+=("$name" "$ip")
|
||||
done
|
||||
|
||||
devs_raw=$(printf '["%s", "%s"], ' "${weaved[@]}")
|
||||
devs_raw=$(printf "[\"%s\", \"%s\"], " "${weaved[@]}")
|
||||
devs="[${devs_raw%, }]"
|
||||
|
||||
model_ids=("qwen3-30b" "gpt-oss-120b-MXFP4-Q8" "kimi-k2-thinking")
|
||||
|
||||
for model_id in "${model_ids[@]}"; do
|
||||
for i in "${!ips[@]}"; do
|
||||
{
|
||||
for i in "${!ips[@]}"; do
|
||||
{
|
||||
req="{
|
||||
\"model_id\": \"${model_id}\",
|
||||
\"devs\": ${devs},
|
||||
@@ -47,8 +48,9 @@ for model_id in "${model_ids[@]}"; do
|
||||
curl -sN \
|
||||
-X POST "http://${ips[$i]}:52415/${kind}" \
|
||||
-H "Content-Type: application/json" -d "$req" \
|
||||
2>&1 | sed "s/^/\n${hostnames[$i]}@${ips[$i]}: /" || echo "curl to ${hostnames[$i]} failed" && exit 1
|
||||
2>&1 | sed "s/^/\n${hostnames[$i]}@${ips[$i]}: /" || echo "curl to ${hostnames[$i]} failed" && exit 1
|
||||
} &
|
||||
done
|
||||
wait
|
||||
done
|
||||
|
||||
|
||||
@@ -1,18 +0,0 @@
{
  "$schema": "https://opencode.ai/config.json",
  "model": "exo/mlx-community/gpt-oss-120b-MXFP4-Q8",
  "provider": {
    "exo": {
      "api": "http://localhost:52415/v1",
      "models": {
        "mlx-community/gpt-oss-120b-MXFP4-Q8": {
          "name": "GPT OSS 120B",
          "limit": {
            "context": 32768,
            "output": 8192
          }
        }
      }
    }
  }
}

@@ -1,47 +0,0 @@
#!/usr/bin/env bash

set -euo pipefail

PREFS="/Library/Preferences/SystemConfiguration/preferences.plist"

# Remove bridge0 interface
ifconfig bridge0 &>/dev/null && {
  ifconfig bridge0 | grep -q 'member' && {
    ifconfig bridge0 | awk '/member/ {print $2}' | xargs -n1 ifconfig bridge0 deletem 2>/dev/null || true
  }
  ifconfig bridge0 destroy 2>/dev/null || true
}

# Remove Thunderbolt Bridge from VirtualNetworkInterfaces in preferences.plist
/usr/libexec/PlistBuddy -c "Delete :VirtualNetworkInterfaces:Bridge:bridge0" "$PREFS" 2>/dev/null || true

networksetup -listlocations | grep -q exo || {
  networksetup -createlocation exo
}

networksetup -switchtolocation exo
networksetup -listallhardwareports |
  awk -F': ' '/Hardware Port: / {print $2}' |
  while IFS=":" read -r name; do
    case "$name" in
    "Ethernet Adapter"*) ;;
    "Thunderbolt Bridge") ;;
    "Thunderbolt "*)
      networksetup -listallnetworkservices |
        grep -q "EXO $name" ||
        networksetup -createnetworkservice "EXO $name" "$name" 2>/dev/null ||
        continue
      networksetup -setdhcp "EXO $name"
      ;;
    *)
      networksetup -listallnetworkservices |
        grep -q "$name" ||
        networksetup -createnetworkservice "$name" "$name" 2>/dev/null ||
        continue
      ;;
    esac
  done

networksetup -listnetworkservices | grep -q "Thunderbolt Bridge" && {
  networksetup -setnetworkserviceenabled "Thunderbolt Bridge" off
} || true