Merge pull request #619 from exo-explore/runners2

fix readme images
2026-02-07 04:32:28 -05:00 · 2025-01-23 02:18:33 +00:00 · 2025-01-23 02:17:58 +00:00 · 2025-01-23 02:05:17 +00:00 · 2025-01-23 01:55:14 +00:00 · 2025-01-23 01:44:57 +00:00
46 changed files with 2515 additions and 832 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -254,6 +254,33 @@ jobs:
          prompt: "Keep responses concise. Who was the king of pop?"
          expected_output: "Michael Jackson"

+  # ChatGPT-API integration test for the tinygrad inference engine on a Linux
+  # machine executor (Ubuntu 22.04 image, xlarge resource class).
+  chatgpt_api_integration_test_tinygrad_linux:
+    machine:
+      image: ubuntu-2204:current
+    resource_class: xlarge
+    steps:
+      - checkout
+      - run:
+          name: Set up Python
+          # Installs Python 3.12 from the deadsnakes PPA plus clang, then
+          # creates the virtualenv used by the following steps.
+          command: |
+            sudo apt-get update
+            sudo add-apt-repository -y ppa:deadsnakes/ppa
+            sudo apt-get update
+            sudo apt-get install -y python3.12 python3.12-venv clang
+            python3.12 -m venv env
+            source env/bin/activate
+      - run:
+          name: Install dependencies
+          # The virtualenv is activated again here rather than relying on the
+          # activation performed in the previous step.
+          command: |
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install .
+      # run_chatgpt_api_test is not defined in this hunk; presumably a reusable
+      # command declared elsewhere in this config - verify its parameters there.
+      - run_chatgpt_api_test:
+          inference_engine: tinygrad
+          model_id: llama-3.2-1b
+          prompt: "Keep responses concise. Who was the king of pop?"
+          expected_output: "Michael Jackson"
+
  measure_pip_sizes:
    macos:
      xcode: "16.0.0"
@@ -342,5 +369,6 @@ workflows:
      - discovery_integration_test
      - chatgpt_api_integration_test_mlx
      - chatgpt_api_integration_test_tinygrad
+      - chatgpt_api_integration_test_tinygrad_linux
      - chatgpt_api_integration_test_dummy
      - measure_pip_sizes
--- a/.github/bench.py
+++ b/.github/bench.py
@@ -0,0 +1,401 @@
+import aiohttp
+import asyncio
+import time
+import json
+import os
+import boto3
+from typing import Dict, Any
+from datetime import datetime
+import subprocess
+import psutil
+import platform
+from pathlib import Path
+
+
+def check_system_state():
+    """Print a diagnostic snapshot of the host before benchmarking.
+
+    Sections are printed in this order: macOS power/thermal/architecture/MLX
+    checks, CPU, memory, GPU, process priority and system load. Each section
+    runs inside its own try/except, so a failing probe prints an error message
+    instead of raising. Nothing is returned; all output goes to stdout.
+    """
+    print("\n=== System State Check ===", flush=True)
+    
+    # macOS-specific checks. There is no platform guard here: if one of the
+    # tools below cannot be run, the exception is caught by the outer handler
+    # and the remaining checks in this group are skipped.
+    try:
+        # Check powermetrics with sudo
+        try:
+            power_metrics = subprocess.run(
+                ['sudo', 'powermetrics', '-n', '1', '-i', '1000', '--samplers', 'cpu_power'],
+                capture_output=True, text=True
+            )
+            print("\nPower Metrics:", power_metrics.stdout, flush=True)
+        except Exception as e:
+            print(f"Error getting power metrics: {e}", flush=True)
+        
+        # Check thermal state
+        thermal_state = subprocess.run(['pmset', '-g', 'therm'], capture_output=True, text=True)
+        print("\nThermal State:", thermal_state.stdout, flush=True)
+        
+        # Check if running under Rosetta (only the raw `arch` output is printed)
+        arch = subprocess.run(['arch'], capture_output=True, text=True)
+        print("\nArchitecture:", arch.stdout, flush=True)
+        
+        # Check MLX compilation mode - only if mlx is available
+        try:
+            import mlx.core as mx
+            if hasattr(mx, 'build_info'):
+                print("\nMLX Build Info:", mx.build_info(), flush=True)
+            else:
+                print("\nMLX Build Info: Not available in this version", flush=True)
+        except ImportError:
+            print("\nMLX: Not installed", flush=True)
+        except Exception as e:
+            print(f"\nError checking MLX: {e}", flush=True)
+        
+    except Exception as e:
+        print(f"Error in macOS checks: {e}", flush=True)
+
+    # CPU Info
+    print("\nCPU Information:", flush=True)
+    try:
+        if platform.system() == 'Darwin' and platform.processor() == 'arm':
+            # Use sysctl for Apple Silicon Macs
+            cpu_info = subprocess.run(['sysctl', 'machdep.cpu'], capture_output=True, text=True)
+            if cpu_info.returncode == 0:
+                print(f"CPU Info (Apple Silicon):", cpu_info.stdout, flush=True)
+            
+            # Parse powermetrics output for clearer CPU frequency display
+            try:
+                power_metrics = subprocess.run(
+                    ['sudo', 'powermetrics', '-n', '1', '-i', '100', '--samplers', 'cpu_power'],
+                    capture_output=True, text=True
+                )
+                if power_metrics.returncode == 0:
+                    output = power_metrics.stdout
+                    print("\nDetailed CPU Frequency Information:")
+                    
+                    # Extract cluster frequencies and max frequencies
+                    current_cluster = None
+                    max_freqs = {'E': 0, 'P0': 0, 'P1': 0}
+                    
+                    for line in output.split('\n'):
+                        # Track which cluster we're processing
+                        if "E-Cluster" in line:
+                            current_cluster = 'E'
+                        elif "P0-Cluster" in line:
+                            current_cluster = 'P0'
+                        elif "P1-Cluster" in line:
+                            current_cluster = 'P1'
+                            
+                        # Get current frequencies: the text between the first and
+                        # second ':' on the line. current_cluster is still None
+                        # here if no cluster header has been seen yet.
+                        if "HW active frequency:" in line:
+                            freq = line.split(':')[1].strip()
+                            if freq != "0 MHz":
+                                print(f"Current {current_cluster}-Cluster Frequency: {freq}")
+                        
+                        # Get max frequencies from residency lines
+                        if current_cluster and "active residency:" in line and "MHz:" in line:
+                            try:
+                                # Extract all frequency values
+                                freqs = []
+                                parts = line.split('MHz:')[:-1]  # Skip last part as it's not a frequency
+                                for part in parts:
+                                    # The last whitespace-separated token before
+                                    # each 'MHz:' is taken as the frequency value.
+                                    freq_str = part.split()[-1]
+                                    try:
+                                        freq = float(freq_str)
+                                        freqs.append(freq)
+                                    except ValueError:
+                                        continue
+                                if freqs:
+                                    max_freqs[current_cluster] = max(max_freqs[current_cluster], max(freqs))
+                            except Exception:
+                                continue
+                    
+                    # Print max frequencies (clusters that never reported a value stay at 0 and are skipped)
+                    print("\nMaximum Available Frequencies:")
+                    for cluster, max_freq in max_freqs.items():
+                        if max_freq > 0:
+                            print(f"{cluster}-Cluster Max: {max_freq:.0f} MHz")
+                            
+            except Exception as e:
+                print(f"Error parsing powermetrics: {e}", flush=True)
+        else:
+            # Use psutil for other systems
+            cpu_freq = psutil.cpu_freq()
+            print(f"CPU Frequency - Current: {cpu_freq.current:.2f}MHz, Min: {cpu_freq.min:.2f}MHz, Max: {cpu_freq.max:.2f}MHz", flush=True)
+        
+        print(f"\nCPU Usage per Core: {psutil.cpu_percent(percpu=True)}%", flush=True)
+        
+        # Check if running in low power mode (the full `pmset -g` output is
+        # printed as-is; nothing is parsed here)
+        power_mode = subprocess.run(['pmset', '-g'], capture_output=True, text=True)
+        print("\nPower Settings:", power_mode.stdout, flush=True)
+    except Exception as e:
+        print(f"Error getting CPU info: {e}", flush=True)
+
+    # Memory Info (byte counts are converted to GB by dividing by 1024 three times)
+    print("\nMemory Information:", flush=True)
+    try:
+        mem = psutil.virtual_memory()
+        print(f"Total: {mem.total/1024/1024/1024:.2f}GB", flush=True)
+        print(f"Available: {mem.available/1024/1024/1024:.2f}GB", flush=True)
+        print(f"Used: {mem.used/1024/1024/1024:.2f}GB ({mem.percent}%)", flush=True)
+        
+        # Check swap
+        swap = psutil.swap_memory()
+        print(f"Swap Used: {swap.used/1024/1024/1024:.2f}GB of {swap.total/1024/1024/1024:.2f}GB", flush=True)
+    except Exception as e:
+        print(f"Error getting memory info: {e}", flush=True)
+
+    # GPU Info
+    print("\nGPU Information:", flush=True)
+    try:
+        # Check MLX GPU settings: every environment variable whose name starts with 'MLX'
+        print("MLX Environment Variables:", flush=True)
+        mlx_vars = {k: v for k, v in os.environ.items() if k.startswith('MLX')}
+        print(json.dumps(mlx_vars, indent=2), flush=True)
+        
+        # Check Metal GPU memory allocation
+        gpu_mem = subprocess.run(['sysctl', 'iogpu'], capture_output=True, text=True)
+        print("GPU Memory Settings:", gpu_mem.stdout, flush=True)
+    except Exception as e:
+        print(f"Error getting GPU info: {e}", flush=True)
+
+    # Process Priority
+    print("\nProcess Priority Information:", flush=True)
+    try:
+        current_process = psutil.Process()
+        print(f"Process Nice Value: {current_process.nice()}", flush=True)
+        # Only try to get ionice if the platform supports it
+        if hasattr(current_process, 'ionice'):
+            print(f"Process IO Nice Value: {current_process.ionice()}", flush=True)
+    except Exception as e:
+        print(f"Error getting process priority info: {e}", flush=True)
+
+    # System Load
+    print("\nSystem Load:", flush=True)
+    try:
+        load_avg = psutil.getloadavg()
+        print(f"Load Average: {load_avg}", flush=True)
+        
+        # Get top processes by CPU and Memory
+        print("\nTop Processes by CPU Usage:", flush=True)
+        processes = []
+        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
+            try:
+                pinfo = proc.info
+                # Keep only entries where both percentages were reported.
+                if pinfo['cpu_percent'] is not None and pinfo['memory_percent'] is not None:
+                    processes.append(pinfo)
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                continue
+        
+        # Sort and display top 5 CPU-consuming processes
+        sorted_by_cpu = sorted(processes, key=lambda x: x['cpu_percent'] or 0, reverse=True)[:5]
+        for proc in sorted_by_cpu:
+            print(f"PID: {proc['pid']}, Name: {proc['name']}, CPU: {proc['cpu_percent']}%, Memory: {proc['memory_percent']:.1f}%")
+    except Exception as e:
+        print(f"Error getting system load info: {e}", flush=True)
+
+    print("\n=== End System State Check ===\n", flush=True)
+
+
+def check_gpu_access():
+    try:
+        # Check if MLX can see the GPU
+        import mlx.core as mx
+        print("MLX device info:", mx.default_device())
+        
+        # Check Metal device availability
+        result = subprocess.run(['system_profiler', 'SPDisplaysDataType'], capture_output=True, text=True)
+        print("GPU Info:", result.stdout)
+    except Exception as e:
+        print(f"Failed to check GPU access: {e}")
+
+
+async def measure_performance(api_endpoint: str, prompt: str, model: str) -> Dict[str, Any]:
+    """
+    Measures the performance of an API endpoint by sending a prompt and recording metrics.
+
+    Args:
+        api_endpoint (str): The API endpoint URL.
+        prompt (str): The prompt to send to the API.
+
+    Returns:
+        Dict[str, Any]: A dictionary containing performance metrics or error information.
+    """
+
+    results = {
+        'model': model,
+        'run_id': os.environ.get('GITHUB_RUN_ID', 'unknown'),
+        'branch': os.environ.get('GITHUB_REF_NAME', 'unknown'),
+        'commit': os.environ.get('GITHUB_SHA', 'unknown'),
+        'configuration': json.loads(os.environ.get('HARDWARE_CONFIG', '{}'))
+    }
+
+    # Get token count
+    session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=600, connect=10, sock_read=600, sock_connect=10))
+    try:
+        response = await session.post(
+            "http://localhost:52415/v1/chat/token/encode",
+            json={
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}]
+            }
+        )
+        response.raise_for_status()
+        token_data = await response.json()
+        results['prompt_len'] = token_data['num_tokens']
+    except Exception as e:
+        await session.close()
+        raise RuntimeError(f"Failed to get token count: {str(e)}")
+
+    # Measure completion performance
+    try:
+        start_time = time.time()
+        response = await session.post(
+            api_endpoint,
+            json={
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "temperature": 0,
+                "stream": True
+            }
+        )
+        response.raise_for_status()
+
+        first_token_time = None
+        total_tokens = 0
+
+        async for line in response.content.iter_chunks():
+            line = line[0].decode('utf-8').strip()
+            if not line.startswith('data: '):
+                continue
+
+            data = json.loads(line[6:])  # Skip 'data: ' prefix
+            if content := data.get('choices', [{}])[0].get('delta', {}).get('content'):
+                print(f"Received content: {content}", flush=True)
+                if first_token_time is None:
+                    first_token_time = time.time()
+                    ttft = first_token_time - start_time
+                    results.update({
+                        'ttft': ttft,
+                        'prompt_tps': results['prompt_len'] / ttft
+                    })
+                total_tokens += 1
+
+        total_time = time.time() - start_time
+        results.update({
+            'generation_tps': total_tokens / total_time,
+            'response_len': total_tokens,
+            'total_time': total_time
+        })
+
+    except Exception as e:
+        raise RuntimeError(f"Performance measurement failed: {str(e)}")
+    finally:
+        await session.close()
+
+    return results
+
+
+async def main() -> None:
+    api_endpoint = "http://localhost:52415/v1/chat/completions"
+
+    # Define prompts
+    prompt_warmup = "what is the capital of France?"
+    prompt_essay = "write an essay about cats"
+
+    model = os.environ.get('model', 'llama-3.2-1b')
+    # Warmup request
+    print("\nPerforming warmup request...", flush=True)
+    try:
+        warmup_results = await measure_performance(api_endpoint, prompt_warmup, model)
+        print("Warmup completed successfully", flush=True)
+    except Exception as e:
+        print(f"Warmup request failed: {e}", flush=True)
+
+    # Measure performance for the essay prompt
+    print("\nMeasuring performance for the essay prompt...", flush=True)
+    results = await measure_performance(api_endpoint, prompt_essay, model)
+
+    try:
+        s3_client = boto3.client(
+            's3',
+            aws_access_key_id=os.environ.get('aws_access_key_id'),
+            aws_secret_access_key=os.environ.get('aws_secret_key')
+        )
+        job_name = os.environ.get('GITHUB_JOB')
+
+        # Create S3 key with timestamp and commit info
+        now = datetime.utcnow()
+        timestamp = now.strftime('%H-%M-%S')
+        commit_sha = os.environ.get('GITHUB_SHA', 'unknown')[:7]
+        s3_key = f"{job_name}/{model}/{now.year}/{now.month}/{now.day}/{timestamp}_{commit_sha}.json"
+
+        # Upload to S3
+        s3_client.put_object(
+            Bucket='exo-benchmarks',
+            Key=s3_key,
+            Body=json.dumps(results),
+            ContentType='application/json'
+        )
+        print(f"Performance metrics uploaded to S3: s3://exo-benchmarks/{s3_key}", flush=True)
+    except Exception as e:
+        print(f"Failed to upload metrics to S3: {e}", flush=True)
+
+    # Optionally print the metrics for visibility
+    print("Performance metrics:", flush=True)
+    print(json.dumps(results, indent=4), flush=True)
+
+
+def optimize_system_performance():
+    """Set optimal system performance settings before running benchmark."""
+    try:
+        # Try to set high performance power mode
+        subprocess.run(['sudo', 'pmset', '-a', 'powermode', '2'], check=False)
+        
+        # Ensure MLX uses performance cores and GPU
+        os.environ['MLX_FORCE_P_CORES'] = '1'
+        os.environ['MLX_METAL_PREWARM'] = '1'
+        os.environ['MLX_USE_GPU'] = '1'
+        
+        # Set process priority
+        current_process = psutil.Process()
+        try:
+            # Set highest priority
+            subprocess.run(['sudo', 'renice', '-n', '-20', '-p', str(current_process.pid)], check=False)
+            
+            # Print current process state
+            print("\nProcess State Before Benchmark:", flush=True)
+            proc_info = subprocess.run(
+                ['ps', '-o', 'pid,ppid,user,%cpu,%mem,nice,stat,pri,command', '-p', str(current_process.pid)],
+                capture_output=True, text=True
+            )
+            print(proc_info.stdout, flush=True)
+            
+            # Verify power mode
+            power_info = subprocess.run(['pmset', '-g'], capture_output=True, text=True)
+            if 'powermode            0' in power_info.stdout:
+                print("\nWarning: System still in normal power mode. Trying to set high performance mode again...", flush=True)
+                subprocess.run(['sudo', 'pmset', '-a', 'powermode', '2'], check=False)
+            
+        except Exception as e:
+            print(f"Warning: Could not set process priority: {e}", flush=True)
+            
+    except Exception as e:
+        print(f"Warning: Could not optimize system performance: {e}", flush=True)
+    
+    # Print optimization status
+    print("\nOptimization Settings:", flush=True)
+    print("MLX Environment Variables:", flush=True)
+    for var in ['MLX_FORCE_P_CORES', 'MLX_METAL_PREWARM', 'MLX_USE_GPU']:
+        print(f"{var}: {os.environ.get(var, 'Not set')}", flush=True)
+    
+    try:
+        nice_value = psutil.Process().nice()
+        print(f"Process Nice Value: {nice_value}", flush=True)
+        if nice_value != -20:
+            print("Warning: Process not running at highest priority", flush=True)
+    except Exception:
+        pass
+
+
+# Script entry point: print diagnostics first, then apply the performance
+# settings, and only then run the benchmark itself.
+if __name__ == "__main__":
+    check_system_state()
+    check_gpu_access()
+    optimize_system_performance()
+    asyncio.run(main())
--- a/.github/bootstrap.sh
+++ b/.github/bootstrap.sh
@@ -0,0 +1,330 @@
+#!/bin/bash
+set -e
+
+command_exists() {
+    command -v "$1" >/dev/null 2>&1
+}
+
+# Print "$1" to stdout, prefixed with the current local timestamp in brackets.
+log() {
+    local stamp
+    stamp=$(date '+%Y-%m-%d %H:%M:%S')
+    echo "[${stamp}] $1"
+}
+
+if [ "$EUID" -eq 0 ]; then 
+    log "Please do not run as root. Run as regular user with sudo access."
+    exit 1
+fi
+
+# Check for required arguments
+if [ -z "$1" ]; then
+    log "Error: Runner token is required"
+    log "Usage: $0 <runner-token> [tailscale-auth-key]"
+    exit 1
+fi
+
+RUNNER_TOKEN=$1
+TAILSCALE_AUTH_KEY=$2
+REPO="exo-explore/exo"
+
+# Add sudoers configuration
+log "Configuring sudo access..."
+SUDOERS_CONTENT="$(whoami) ALL=(ALL) NOPASSWD: ALL"
+echo "$SUDOERS_CONTENT" | sudo tee /etc/sudoers.d/github-runner > /dev/null
+sudo chmod 440 /etc/sudoers.d/github-runner
+
+log "Configuring privacy permissions..."
+sudo tccutil reset All
+sudo tccutil reset SystemPolicyAllFiles
+sudo tccutil reset SystemPolicyNetworkVolumes
+
+# Configure power management for maximum performance
+log "Configuring power management..."
+sudo pmset -a powermode 2  # Force highest performance mode
+sudo pmset -a gpuswitch 2  # Force discrete/high-performance GPU
+sudo pmset -a lowpowermode 0
+sudo pmset -a lessbright 0
+sudo pmset -a disablesleep 1
+sudo pmset -a sleep 0
+sudo pmset -a hibernatemode 0
+sudo pmset -a autopoweroff 0
+sudo pmset -a standby 0
+sudo pmset -a powernap 0
+
+# Set Homebrew paths based on architecture
+if [ "$(uname -p)" = "arm" ]; then
+    BREW_PREFIX="/opt/homebrew"
+else
+    BREW_PREFIX="/usr/local"
+fi
+
+# Install Homebrew if not present
+if ! command_exists brew; then
+    log "Installing Homebrew..."
+    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+    # Use the detected prefix rather than a hard-coded /opt/homebrew.
+    echo "eval \"\$(${BREW_PREFIX}/bin/brew shellenv)\"" >> ~/.zshrc
+    eval "$("${BREW_PREFIX}/bin/brew" shellenv)"
+fi
+
+# Install required packages
+log "Installing required packages..."
+export HOMEBREW_NO_AUTO_UPDATE=1
+brew install python@3.12 coreutils
+
+# For Python specifically.
+# This must run after the install above: the script uses `set -e`, so chmod on
+# an interpreter that does not exist yet would abort the whole bootstrap.
+PYTHON_PATH="${BREW_PREFIX}/bin/python3.12"
+sudo chmod 755 "$PYTHON_PATH"
+
+# Add to firewall
+log "Configuring firewall access..."
+sudo /usr/libexec/ApplicationFirewall/socketfilterfw --add "$PYTHON_PATH"
+sudo /usr/libexec/ApplicationFirewall/socketfilterfw --unblock "$PYTHON_PATH"
+
+# Optional Tailscale setup if auth key is provided
+if [ -n "$TAILSCALE_AUTH_KEY" ]; then
+    log "Installing and configuring Tailscale..."
+    brew install --quiet tailscale
+    # Stop the service and remove its saved state file before starting it again.
+    sudo brew services stop tailscale 2>/dev/null || true
+    sudo rm -f /var/db/tailscale/tailscaled.state 2>/dev/null || true
+    sudo brew services start tailscale
+    sleep 2
+    # Quoted so the key always reaches tailscale as one argument
+    # (no word splitting or glob expansion).
+    sudo tailscale up --authkey="$TAILSCALE_AUTH_KEY"
+
+    # Enable SSH and Screen Sharing
+    log "Enabling remote access services..."
+    sudo launchctl load -w /System/Library/LaunchDaemons/ssh.plist
+    sudo /System/Library/CoreServices/RemoteManagement/ARDAgent.app/Contents/Resources/kickstart \
+        -activate \
+        -configure -access -on \
+        -configure -allowAccessFor -allUsers \
+        -configure -restart -agent -privs -all
+
+    # Create launch daemon for remote access.
+    # The heredoc delimiter is quoted, so the plist is written verbatim.
+    sudo bash -c 'cat > /Library/LaunchDaemons/com.remote.access.setup.plist' << 'EOL'
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>com.remote.access.setup</string>
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>-c</string>
+        <string>
+            launchctl load -w /System/Library/LaunchDaemons/ssh.plist;
+            /System/Library/CoreServices/RemoteManagement/ARDAgent.app/Contents/Resources/kickstart -activate -configure -access -on
+        </string>
+    </array>
+    <key>RunAtLoad</key>
+    <true/>
+</dict>
+</plist>
+EOL
+
+    sudo chmod 644 /Library/LaunchDaemons/com.remote.access.setup.plist
+    sudo launchctl load -w /Library/LaunchDaemons/com.remote.access.setup.plist
+fi
+
+# Configure GitHub Actions Runner
+log "Gathering system metadata..."
+MACHINE_NAME=$(scutil --get ComputerName)
+MACHINE_NAME="runner-$(echo -n "$MACHINE_NAME" | tr '[:upper:]' '[:lower:]' | tr -cd '[:alnum:]-')"
+
+# Enhanced Apple Silicon detection
+MACHINE_INFO=$(system_profiler SPHardwareDataType)
+CHIP_FULL=$(echo "$MACHINE_INFO" | grep "Chip" | cut -d: -f2 | xargs)
+if [[ $CHIP_FULL =~ "Apple" ]]; then
+    CHIP_MODEL=$(echo "$CHIP_FULL" | sed 's/^Apple //' | tr -d ' ' | tr '[:lower:]' '[:upper:]')
+    GPU_CORES=$(ioreg -l | grep "gpu-core-count" | awk -F'= ' '{print $2}')
+    if [ -z "$GPU_CORES" ]; then
+        GPU_CORES="N/A"
+    fi
+else
+    CHIP_MODEL="Intel"
+    GPU_CORES="N/A"
+fi
+
+MEMORY=$(($(sysctl -n hw.memsize) / 1024 / 1024 / 1024))
+
+# Set up GitHub Runner
+RUNNER_DIR="$HOME/actions-runner"
+
+# Check if runner is already configured
+if [ -f "$RUNNER_DIR/.runner" ]; then
+  log "Runner already configured. Stopping existing service..."
+  sudo launchctl unload /Library/LaunchDaemons/com.github.runner.plist 2>/dev/null || true
+fi
+
+# Create runner directory if it doesn't exist
+mkdir -p "$RUNNER_DIR"
+cd "$RUNNER_DIR"
+
+CUSTOM_LABELS="self-hosted,macos,arm64,${CHIP_MODEL}_GPU${GPU_CORES}_${MEMORY}GB"
+
+# Only download and extract if not already present or if forced
+if [ ! -f "$RUNNER_DIR/run.sh" ] || [ "${FORCE_SETUP:-false}" = "true" ]; then
+  log "Downloading GitHub Actions runner..."
+  RUNNER_VERSION=$(curl -s https://api.github.com/repos/actions/runner/releases/latest | grep '"tag_name":' | cut -d'"' -f4)
+  # An API error or rate limit leaves the tag empty; stop here with a clear
+  # message instead of building a bogus download URL.
+  if [ -z "$RUNNER_VERSION" ]; then
+    log "Error: Could not determine the latest GitHub Actions runner version"
+    exit 1
+  fi
+  # -f makes curl fail on an HTTP error rather than saving the error page as the tarball.
+  curl -f -o actions-runner.tar.gz -L "https://github.com/actions/runner/releases/download/${RUNNER_VERSION}/actions-runner-osx-arm64-${RUNNER_VERSION#v}.tar.gz"
+  tar xzf actions-runner.tar.gz
+  rm actions-runner.tar.gz
+else
+  log "Runner already downloaded, skipping download step"
+fi
+
+log "Configuring runner with labels: $CUSTOM_LABELS"
+./config.sh --unattended \
+    --url "https://github.com/${REPO}" \
+    --token "${RUNNER_TOKEN}" \
+    --name "${MACHINE_NAME}" \
+    --labels "${CUSTOM_LABELS}" \
+    --work "_work"
+
+# Set optimal performance settings
+log "Configuring system for optimal performance..."
+
+# Configure CPU performance
+log "Setting CPU performance controls..."
+# Disable timer coalescing
+sudo sysctl -w kern.timer.coalescing_enabled=0
+sudo sysctl -w kern.timer_coalesce_bg_scale=-5
+sudo sysctl -w kern.timer_resort_threshold_ns=0
+# Set minimum timer intervals
+sudo sysctl -w kern.wq_max_timer_interval_usecs=1000
+sudo sysctl -w kern.timer_coalesce_bg_ns_max=1000
+# Set minimum timer coalescing for all tiers
+sudo sysctl -w kern.timer_coalesce_tier0_scale=-5
+sudo sysctl -w kern.timer_coalesce_tier0_ns_max=1000
+sudo sysctl -w kern.timer_coalesce_tier1_scale=-5
+sudo sysctl -w kern.timer_coalesce_tier1_ns_max=1000
+sudo sysctl -w kern.timer_coalesce_tier2_scale=-5
+sudo sysctl -w kern.timer_coalesce_tier2_ns_max=1000
+sudo sysctl -w kern.timer_coalesce_tier3_scale=-5
+sudo sysctl -w kern.timer_coalesce_tier3_ns_max=1000
+sudo sysctl -w kern.timer_coalesce_tier4_scale=-5
+sudo sysctl -w kern.timer_coalesce_tier4_ns_max=1000
+# Disable QoS restrictions
+sudo sysctl -w net.qos.policy.restricted=0
+sudo sysctl -w net.qos.policy.restrict_avapps=0
+sudo sysctl -w net.qos.policy.wifi_enabled=0
+sudo sysctl -w net.qos.policy.capable_enabled=0
+# Set scheduler parameters
+sudo sysctl -w kern.sched_rt_avoid_cpu0=0
+sudo sysctl -w debug.sched=2
+sudo sysctl -w net.pktsched.netem.sched_output_ival_ms=1
+
+# Clean up any existing runner services
+log "Cleaning up existing runner services..."
+for service in com.github.runner com.github.runner.monitor com.github.runner.cpuaffinity com.github.runner.affinity; do
+    sudo launchctl bootout system/$service 2>/dev/null || true
+    sudo rm -f /Library/LaunchDaemons/$service.plist
+done
+
+# Create a simple runner service configuration
+sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+    <dict>
+        <key>Label</key>
+        <string>com.github.runner</string>
+        <key>UserName</key>
+        <string>$(whoami)</string>
+        <key>GroupName</key>
+        <string>staff</string>
+        <key>WorkingDirectory</key>
+        <string>$RUNNER_DIR</string>
+        <key>ProgramArguments</key>
+        <array>
+            <string>$RUNNER_DIR/run.sh</string>
+        </array>
+        <key>RunAtLoad</key>
+        <true/>
+        <key>KeepAlive</key>
+        <dict>
+            <key>SuccessfulExit</key>
+            <false/>
+            <key>Crashed</key>
+            <true/>
+        </dict>
+        <key>ProcessType</key>
+        <string>Interactive</string>
+        <key>LowPriorityIO</key>
+        <false/>
+        <key>AbandonProcessGroup</key>
+        <false/>
+        <key>EnableTransactions</key>
+        <true/>
+        <key>ThrottleInterval</key>
+        <integer>0</integer>
+        <key>HardResourceLimits</key>
+        <dict>
+            <key>NumberOfFiles</key>
+            <integer>524288</integer>
+            <key>MemoryLock</key>
+            <integer>-1</integer>
+        </dict>
+        <key>SoftResourceLimits</key>
+        <dict>
+            <key>NumberOfFiles</key>
+            <integer>524288</integer>
+            <key>MemoryLock</key>
+            <integer>-1</integer>
+        </dict>
+        <key>QOSClass</key>
+        <string>User-Interactive</string>
+        <key>StandardOutPath</key>
+        <string>$RUNNER_DIR/_diag/runner.log</string>
+        <key>StandardErrorPath</key>
+        <string>$RUNNER_DIR/_diag/runner.err</string>
+        <key>EnvironmentVariables</key>
+        <dict>
+            <key>PATH</key>
+            <string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
+        </dict>
+        <key>Nice</key>
+        <integer>-20</integer>
+    </dict>
+</plist>
+EOF
+
+# Set proper permissions for the LaunchDaemon
+sudo chown root:wheel /Library/LaunchDaemons/com.github.runner.plist
+sudo chmod 644 /Library/LaunchDaemons/com.github.runner.plist
+
+# Remove any existing service
+sudo launchctl bootout system/com.github.runner 2>/dev/null || true
+
+# Load the new service using bootstrap
+sudo launchctl bootstrap system /Library/LaunchDaemons/com.github.runner.plist
+
+# Add Runner.Listener permissions (after runner installation)
+RUNNER_PATH="$RUNNER_DIR/bin/Runner.Listener"
+sudo chmod 755 "$RUNNER_PATH"
+sudo /usr/libexec/ApplicationFirewall/socketfilterfw --add "$RUNNER_PATH"
+sudo /usr/libexec/ApplicationFirewall/socketfilterfw --unblock "$RUNNER_PATH"
+
+# Create connection info file if Tailscale is configured
+if [ -n "$TAILSCALE_AUTH_KEY" ]; then
+    # Ask for the IPv4 address only: plain `tailscale ip` prints the IPv4 and
+    # IPv6 addresses on separate lines, which would break the lines below.
+    TAILSCALE_IP=$(tailscale ip -4)
+    cat > "$HOME/remote_access_info.txt" << EOL
+Mac Remote Access Information
+============================
+Computer Name: $MACHINE_NAME
+Username: $USER
+Tailscale IP: $TAILSCALE_IP
+
+SSH Command: ssh $USER@$TAILSCALE_IP
+Screen Sharing: vnc://$TAILSCALE_IP
+EOL
+    # Readable and writable by the owner only.
+    chmod 600 "$HOME/remote_access_info.txt"
+fi
+
+log "Verifying runner service status..."
+if sudo launchctl list | grep com.github.runner > /dev/null; then
+    log "GitHub Actions runner service is running successfully!"
+    log "Runner labels: $CUSTOM_LABELS"
+    [ -n "$TAILSCALE_AUTH_KEY" ] && log "Remote access details saved to: $HOME/remote_access_info.txt"
+else
+    log "Error: Failed to start GitHub Actions runner service"
+    exit 1
+fi
--- a/.github/optimize_performance.sh
+++ b/.github/optimize_performance.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+set -e
+
+# Function to log with timestamp
+log() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
+}
+
+log "Applying comprehensive performance optimizations..."
+
+# System-wide power management
+log "Configuring power management..."
+sudo pmset -a lessbright 0
+sudo pmset -a disablesleep 1
+sudo pmset -a sleep 0
+sudo pmset -a hibernatemode 0
+sudo pmset -a autopoweroff 0
+sudo pmset -a standby 0
+sudo pmset -a powernap 0
+sudo pmset -a proximitywake 0
+sudo pmset -a tcpkeepalive 1
+sudo pmset -a powermode 2
+sudo pmset -a gpuswitch 2
+sudo pmset -a displaysleep 0
+sudo pmset -a disksleep 0
+
+# Memory and kernel optimizations
+log "Configuring memory and kernel settings..."
+sudo sysctl -w kern.memorystatus_purge_on_warning=0
+sudo sysctl -w kern.memorystatus_purge_on_critical=0
+sudo sysctl -w kern.timer.coalescing_enabled=0
+
+# Metal and GPU optimizations
+log "Configuring Metal and GPU settings..."
+defaults write com.apple.CoreML MPSEnableGPUValidation -bool false
+defaults write com.apple.CoreML MPSEnableMetalValidation -bool false
+defaults write com.apple.CoreML MPSEnableGPUDebug -bool false
+defaults write com.apple.Metal GPUDebug -bool false
+defaults write com.apple.Metal GPUValidation -bool false
+defaults write com.apple.Metal MetalValidation -bool false
+defaults write com.apple.Metal MetalCaptureEnabled -bool false
+defaults write com.apple.Metal MTLValidationBehavior -string "Disabled"
+defaults write com.apple.Metal EnableMTLDebugLayer -bool false
+defaults write com.apple.Metal MTLDebugLevel -int 0
+defaults write com.apple.Metal PreferIntegratedGPU -bool false
+defaults write com.apple.Metal ForceMaximumPerformance -bool true
+defaults write com.apple.Metal MTLPreferredDeviceGPUFrame -bool true
+
+# Create MPS cache directory: world-writable, with the sticky bit so users cannot remove each other's cache files
+sudo mkdir -p /tmp/mps_cache
+sudo chmod 1777 /tmp/mps_cache
+
+# Process and resource limits
+log "Configuring process limits..."
+sudo launchctl limit maxfiles 524288 524288
+ulimit -n 524288 || log "Warning: Could not set file descriptor limit"
+ulimit -c 0
+ulimit -l unlimited || log "Warning: Could not set memory lock limit"
+
+# Export performance-related environment variables
+cat << 'EOF' > /tmp/performance_env.sh
+# Metal optimizations
+export MTL_DEBUG_LAYER=0
+export METAL_DEVICE_WRAPPER_TYPE=1
+export METAL_DEBUG_ERROR_MODE=0
+export METAL_FORCE_PERFORMANCE_MODE=1
+export METAL_DEVICE_PRIORITY=high
+export METAL_MAX_COMMAND_QUEUES=1024
+export METAL_LOAD_LIMIT=0
+export METAL_VALIDATION_ENABLED=0
+export METAL_ENABLE_VALIDATION_LAYER=0
+export OBJC_DEBUG_MISSING_POOLS=NO
+export MPS_CACHEDIR=/tmp/mps_cache
+
+# MLX optimizations
+export MLX_USE_GPU=1
+export MLX_METAL_COMPILE_ASYNC=1
+export MLX_METAL_PREALLOCATE=1
+export MLX_METAL_MEMORY_GUARD=0
+export MLX_METAL_CACHE_KERNELS=1
+export MLX_PLACEMENT_POLICY=metal
+export MLX_METAL_VALIDATION=0
+export MLX_METAL_DEBUG=0
+export MLX_FORCE_P_CORES=1
+export MLX_METAL_MEMORY_BUDGET=0
+export MLX_METAL_PREWARM=1
+
+# Python optimizations
+export PYTHONUNBUFFERED=1
+export PYTHONOPTIMIZE=2
+export PYTHONHASHSEED=0
+export PYTHONDONTWRITEBYTECODE=1
+EOF
+
+log "Performance optimizations completed. Environment variables written to /tmp/performance_env.sh"
--- a/.github/workflows/bench_job.yml
+++ b/.github/workflows/bench_job.yml
@@ -0,0 +1,206 @@
+# This is the reusable workflow file
+name: Distributed Job Runner
+
+on:
+  workflow_call:
+    inputs:
+      config:
+        required: true
+        type: string
+      model:
+        required: true
+        type: string
+      calling_job_name:
+        required: true
+        type: string
+      network_interface:
+        required: true
+        type: string
+jobs:
+  generate-matrix:  # expands the {label: count} config into a matrix with one "cpu" entry per requested machine
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - id: set-matrix
+        env:
+          CONFIG: ${{ inputs.config }}
+        run: |
+          MATRIX=$(printf '%s' "$CONFIG" | jq -c '{cpu: [to_entries | .[] | .key as $k | range(.value) | $k]}')  # quoted so the JSON is not word-split or glob-expanded
+          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
+
+  run-distributed-job:
+    needs: generate-matrix
+    strategy:
+      matrix: ${{fromJson(needs.generate-matrix.outputs.matrix)}}
+    runs-on: ['self-hosted', 'macOS', '${{ matrix.cpu }}']
+    env:
+      HARDWARE_CONFIG: ${{ inputs.config }}
+      model: ${{ inputs.model }}
+      # Add performance-related environment variables
+      MTL_DEBUG_LAYER: 0
+      METAL_VALIDATION_ENABLED: 0
+      MLX_METAL_VALIDATION: 0
+      MLX_METAL_DEBUG: 0
+      MLX_FORCE_P_CORES: 1
+      MLX_METAL_PREWARM: 1
+      PYTHONOPTIMIZE: 2
+    steps:
+      - name: Cleanup workspace
+        run: |
+          sudo rm -rf "$GITHUB_WORKSPACE"
+          sudo mkdir -p "$GITHUB_WORKSPACE"
+          sudo chown -R $(whoami):$(id -g) "$GITHUB_WORKSPACE"
+
+      - uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
+          python3.12 -m venv .venv || {
+            echo "Failed to find python3.12. Checking installation locations:"
+            ls -l /usr/local/bin/python* /opt/homebrew/bin/python* 2>/dev/null || true
+            exit 1
+          }
+          source .venv/bin/activate
+          pip install --upgrade pip
+          pip install -e .
+          pip install boto3==1.35.76
+
+      - name: Apply Performance Optimizations
+        run: |
+          # Export performance-related environment variables
+          cat << 'EOF' > /tmp/performance_env.sh
+          # MLX and Metal optimizations
+          export MTL_DEBUG_LAYER=0
+          export METAL_VALIDATION_ENABLED=0
+          export MLX_METAL_VALIDATION=0
+          export MLX_METAL_DEBUG=0
+          export MLX_FORCE_P_CORES=1
+          export MLX_METAL_PREWARM=1
+          export PYTHONOPTIMIZE=2
+          EOF
+          
+          # Source the performance environment variables
+          source /tmp/performance_env.sh
+
+          # MLX Memory Settings
+          ./configure_mlx.sh
+          
+          # Verify optimizations
+          echo "Verifying performance settings..."
+          env | grep -E "MLX_|METAL_|MTL_"
+
+      - name: Run exo
+        env:
+          aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
+          aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
+        run: |
+          # Source performance environment variables
+          source /tmp/performance_env.sh
+          
+          # Debug information
+          echo "Current commit SHA: $GITHUB_SHA"
+          git rev-parse HEAD
+          git status
+          
+          CALLING_JOB="${{ inputs.calling_job_name }}"
+          UNIQUE_JOB_ID="${CALLING_JOB}_${model}_${GITHUB_RUN_ID}"
+          ALL_NODE_IDS=$(for i in $(seq $(( ${{ strategy.job-total }} - 1 )) -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')  # job indexes run 0..job-total-1
+          MY_NODE_ID="${UNIQUE_JOB_ID}_${{ strategy.job-index }}"
+          
+          source .venv/bin/activate
+          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
+          
+          echo "=== Before starting exo ==="
+          ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | head -1
+          ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | grep -i python || true  # grep exits 1 on no match; a diagnostic must not abort the step
+          
+          echo "Starting exo daemon..."
+          
+          echo "Power mode settings:"
+          sudo pmset -g
+          
+          # Start exo with explicit process control
+          sudo taskpolicy -d default -g default -a -t 0 -l 0 .venv/bin/exo \
+            --node-id="${MY_NODE_ID}" \
+            --node-id-filter="${ALL_NODE_IDS}" \
+            --interface-type-filter="${{ inputs.network_interface }}" \
+            --disable-tui \
+            --max-generate-tokens 250 \
+            --chatgpt-api-port 52415 > output1.log 2>&1 &
+          PID1=$!
+          
+          echo "Exo process started with PID: $PID1"
+          tail -f output1.log &
+          TAIL1=$!
+
+          # Give process time to start
+          sleep 2
+          
+          # Set additional process priorities
+          sudo renice -n -20 -p $PID1
+          sudo taskpolicy -t 4 -p $PID1
+          
+          echo "=== After starting exo ==="
+          ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | head -1
+          ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | grep $PID1
+          
+          echo "Additional process details:"
+          sudo powermetrics -n 1 -i 1000 --show-process-energy | grep -A 5 $PID1 || true
+
+          # Only one EXIT trap can be active (a later one replaces the earlier), so stop both the log tail and exo in a single handler
+          trap 'kill $TAIL1 $PID1' EXIT
+
+          echo "Waiting for all nodes to connect..."
+          for i in {1..20}; do
+            echo "Attempt $i: Checking node count..."
+            nodes=$(curl -s http://localhost:52415/topology | jq ".nodes | length")
+            echo "Current node count: $nodes"
+            if [ "${nodes:-0}" -eq "${{ strategy.job-total }}" ]; then  # nodes is empty while the API is not answering yet; treat that as 0
+              echo "All nodes connected successfully!"
+              break
+            fi
+            if [ $i -eq 20 ]; then
+              echo "ERROR: Failed to connect all nodes after 20 attempts. Expected ${{ strategy.job-total }} nodes, but got $nodes"
+              exit 1
+            fi
+            sleep 5
+          done
+
+          if ! kill -0 $PID1 2>/dev/null; then
+              echo "ERROR: Instance (PID $PID1) died unexpectedly. Full log output:"
+              cat output1.log
+              exit 1
+          fi
+
+          if [ "${{ strategy.job-index }}" -eq "0" ]; then
+            sleep 10
+            echo "This is the primary node (index 0). Running benchmark..."
+            GITHUB_JOB=$CALLING_JOB python .github/bench.py
+          else
+            echo "This is a secondary node (index ${{ strategy.job-index }}). Waiting for completion..."
+            sleep 10
+            while true; do
+              echo "Checking if primary node is still running..."
+              nodes=$(curl -s http://localhost:52415/topology | jq ".nodes | length")
+              echo "Current node count: $nodes"
+              if [ "$nodes" -lt "${{ strategy.job-total }}" ]; then
+                echo "Primary node completed, exiting..."
+                break
+              fi
+              sleep 5
+            done
+          fi
+
+      - name: Check Final System State
+        if: always()
+        run: |
+          echo "=== Final System State ==="
+          sudo pmset -g
+          sudo powermetrics -n 1 -i 1000 --show-process-energy || true
+          system_profiler SPDisplaysDataType
+          sysctl iogpu
+          ps -eo pid,ppid,user,%cpu,%mem,nice,state,command | grep -i python || true  # grep exits 1 on no match; diagnostics must not fail the job
+          env | grep -E "MLX_|METAL_|MTL_" || true
+          echo "=== End Final System State ==="
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,71 @@
+name: Build and Test
+
+on:
+  push:
+    branches: [ '*' ]
+    tags: [ '*' ]
+  pull_request:
+    branches: [ '*' ]
+
+jobs:
+  single-m4-pro:
+    strategy:
+      matrix:
+        model: ['llama-3.2-1b', 'llama-3.2-3b', 'llama-3.1-8b']
+    uses: ./.github/workflows/bench_job.yml
+    with:
+      config: '{"M4PRO_GPU16_24GB": 1}'
+      model: ${{ matrix.model }}
+      calling_job_name: 'single-m4-pro'
+      network_interface: 'Ethernet'
+    secrets: inherit
+
+  two-m4-pro-cluster:
+    strategy:
+      matrix:
+        model: ['llama-3.2-1b', 'llama-3.2-3b', 'llama-3.1-8b']
+    uses: ./.github/workflows/bench_job.yml
+    with:
+      config: '{"M4PRO_GPU16_24GB": 2}'
+      model: ${{ matrix.model }}
+      calling_job_name: 'two-m4-pro-cluster'
+      network_interface: 'Ethernet'
+    secrets: inherit
+
+  # two-m4-pro-cluster-thunderbolt:
+  #   strategy:
+  #     matrix:
+  #       model: ['llama-3.2-1b', 'llama-3.2-3b', 'llama-3.1-8b']
+  #   uses: ./.github/workflows/bench_job.yml
+  #   with:
+  #     config: '{"M4PRO_GPU16_24GB": 2}'
+  #     model: ${{ matrix.model }}
+  #     calling_job_name: 'two-m4-pro-cluster-thunderbolt'
+  #     network_interface: 'Thunderbolt'
+  #   secrets: inherit
+
+  three-m4-pro-cluster:
+    strategy:
+      matrix:
+        model: ['llama-3.2-1b', 'llama-3.2-3b', 'llama-3.1-8b', 'llama-3.3-70b']
+      fail-fast: false
+    uses: ./.github/workflows/bench_job.yml
+    with:
+      config: '{"M4PRO_GPU16_24GB": 3}'
+      model: ${{ matrix.model }}
+      calling_job_name: 'three-m4-pro-cluster'
+      network_interface: 'Ethernet'
+    secrets: inherit
+
+  # test-m3-single-node:
+  #   strategy:
+  #     matrix:
+  #       model: ['llama-3.2-1b']
+  #     fail-fast: false
+  #   uses: ./.github/workflows/bench_job.yml
+  #   with:
+  #     config: '{"M3MAX_GPU40_128GB": 1}'
+  #     model: ${{ matrix.model }}
+  #     calling_job_name: 'test-m3-cluster'
+  #     network_interface: 'Ethernet'
+  #   secrets: inherit
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ Unlike other distributed inference frameworks, exo does not use a master-worker

 Exo supports different [partitioning strategies](exo/topology/partitioning_strategy.py) to split up a model across devices. The default partitioning strategy is [ring memory weighted partitioning](exo/topology/ring_memory_weighted_partitioning_strategy.py). This runs an inference in a ring where each device runs a number of model layers proportional to the memory of the device.

-!["A screenshot of exo running 5 nodes](docs/exo-screenshot.png)
+![A screenshot of exo running 5 nodes](docs/exo-screenshot.jpg)

 ## Installation

--- a/configure_mlx.sh
+++ b/configure_mlx.sh
@@ -3,16 +3,41 @@
 # Get the total memory in MB
 TOTAL_MEM_MB=$(($(sysctl -n hw.memsize) / 1024 / 1024))

-# Set WIRED_LIMIT_MB to 80%
-WIRED_LIMIT_MB=$(($TOTAL_MEM_MB * 80 / 100))
-# Set  WIRED_LWM_MB to 70%
-WIRED_LWM_MB=$(($TOTAL_MEM_MB * 70 / 100))
+# Candidate wired limits in MB: 80% of total memory, and total memory minus 5 GB (5120 MB)
+EIGHTY_PERCENT=$(($TOTAL_MEM_MB * 80 / 100))
+MINUS_5GB=$((($TOTAL_MEM_MB - 5120)))
+
+# Candidate lower bounds in MB: 70% of total memory, and total memory minus 8 GB (8192 MB)
+SEVENTY_PERCENT=$(($TOTAL_MEM_MB * 70 / 100))
+MINUS_8GB=$((($TOTAL_MEM_MB - 8192)))
+
+# Set WIRED_LIMIT_MB to the larger of EIGHTY_PERCENT and MINUS_5GB
+if [ $EIGHTY_PERCENT -gt $MINUS_5GB ]; then
+  WIRED_LIMIT_MB=$EIGHTY_PERCENT
+else
+  WIRED_LIMIT_MB=$MINUS_5GB
+fi
+
+# Set WIRED_LWM_MB to the larger of SEVENTY_PERCENT and MINUS_8GB
+if [ $SEVENTY_PERCENT -gt $MINUS_8GB ]; then
+  WIRED_LWM_MB=$SEVENTY_PERCENT
+else
+  WIRED_LWM_MB=$MINUS_8GB
+fi

 # Display the calculated values
 echo "Total memory: $TOTAL_MEM_MB MB"
 echo "Maximum limit (iogpu.wired_limit_mb): $WIRED_LIMIT_MB MB"
 echo "Lower bound (iogpu.wired_lwm_mb): $WIRED_LWM_MB MB"

-# Apply the values with sysctl
-sudo sysctl -w iogpu.wired_limit_mb=$WIRED_LIMIT_MB
-sudo sysctl -w iogpu.wired_lwm_mb=$WIRED_LWM_MB
+# Apply the values with sysctl, but check if we're already root
+if [ "$EUID" -eq 0 ]; then
+  sysctl -w iogpu.wired_limit_mb=$WIRED_LIMIT_MB
+  sysctl -w iogpu.wired_lwm_mb=$WIRED_LWM_MB
+else
+  # Try without sudo first, fall back to sudo if needed
+  sysctl -w iogpu.wired_limit_mb=$WIRED_LIMIT_MB 2>/dev/null || \
+    sudo sysctl -w iogpu.wired_limit_mb=$WIRED_LIMIT_MB
+  sysctl -w iogpu.wired_lwm_mb=$WIRED_LWM_MB 2>/dev/null || \
+    sudo sysctl -w iogpu.wired_lwm_mb=$WIRED_LWM_MB
+fi
--- a/docs/exo-screenshot.jpg
+++ b/docs/exo-screenshot.jpg
--- a/docs/exo-screenshot.png
+++ b/docs/exo-screenshot.png
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:561ec71a226a154503b1d70553c9d57c7cd45dbb3bb0e1244ed5b00edbf0523d
-size 479724
--- a/docs/ring-topology.png
+++ b/docs/ring-topology.png
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3f57b11f2d3aefb3887c5266994c4b4335501830c77a6a53fa6901c8725d0f6c
-size 55857
--- a/exo/api/chatgpt_api.py
+++ b/exo/api/chatgpt_api.py
@@ -21,12 +21,20 @@ from PIL import Image
 import numpy as np
 import base64
 from io import BytesIO
-import mlx.core as mx
+import platform
+
+if platform.system().lower() == "darwin" and platform.machine().lower() == "arm64":
+  import mlx.core as mx
+else:
+  import numpy as mx
+
 import tempfile
 from exo.download.hf.hf_shard_download import HFShardDownloader
 import shutil
 from exo.download.hf.hf_helpers import get_hf_home, get_repo_root
 from exo.apputil import create_animation_mp4
+from collections import defaultdict
+

 class Message:
  def __init__(self, role: str, content: Union[str, List[Dict[str, Union[str, Dict[str, str]]]]], tools: Optional[List[Dict]] = None):
@@ -41,7 +49,6 @@ class Message:
    return data


-
 class ChatCompletionRequest:
  def __init__(self, model: str, messages: List[Message], temperature: float, tools: Optional[List[Dict]] = None):
    self.model = model
@@ -132,16 +139,24 @@ def remap_messages(messages: List[Message]) -> List[Message]:

 def build_prompt(tokenizer, _messages: List[Message], tools: Optional[List[Dict]] = None):
  messages = remap_messages(_messages)
-  chat_template_args = {
-    "conversation": [m.to_dict() for m in messages],
-    "tokenize": False,
-    "add_generation_prompt": True
-  }
-  if tools: chat_template_args["tools"] = tools
+  chat_template_args = {"conversation": [m.to_dict() for m in messages], "tokenize": False, "add_generation_prompt": True}
+  if tools: 
+    chat_template_args["tools"] = tools

-  prompt = tokenizer.apply_chat_template(**chat_template_args)
-  print(f"!!! Prompt: {prompt}")
-  return prompt
+  try:
+    prompt = tokenizer.apply_chat_template(**chat_template_args)
+    if DEBUG >= 3: print(f"!!! Prompt: {prompt}")
+    return prompt
+  except UnicodeEncodeError:
+    # Retry with un-encodable characters (e.g. lone surrogates) replaced; a strict encode/decode round trip is a no-op or raises again
+    chat_template_args["conversation"] = [
+      {k: v.encode('utf-8', errors='replace').decode('utf-8') if isinstance(v, str) else v
+       for k, v in m.to_dict().items()}
+      for m in messages
+    ]
+    prompt = tokenizer.apply_chat_template(**chat_template_args)
+    if DEBUG >= 3: print(f"!!! Prompt (UTF-8 encoded): {prompt}")
+    return prompt


 def parse_message(data: dict):
@@ -165,8 +180,17 @@ class PromptSession:
    self.timestamp = timestamp
    self.prompt = prompt

+
 class ChatGPTAPI:
-  def __init__(self, node: Node, inference_engine_classname: str, response_timeout: int = 90, on_chat_completion_request: Callable[[str, ChatCompletionRequest, str], None] = None, default_model: Optional[str] = None, system_prompt: Optional[str] = None):
+  def __init__(
+    self,
+    node: Node,
+    inference_engine_classname: str,
+    response_timeout: int = 90,
+    on_chat_completion_request: Callable[[str, ChatCompletionRequest, str], None] = None,
+    default_model: Optional[str] = None,
+    system_prompt: Optional[str] = None
+  ):
    self.node = node
    self.inference_engine_classname = inference_engine_classname
    self.response_timeout = response_timeout
@@ -176,6 +200,11 @@ class ChatGPTAPI:
    self.prev_token_lens: Dict[str, int] = {}
    self.stream_tasks: Dict[str, asyncio.Task] = {}
    self.default_model = default_model or "llama-3.2-1b"
+    self.token_queues = defaultdict(asyncio.Queue)
+
+    # Get the callback system and register our handler
+    self.token_callback = node.on_token.register("chatgpt-api-token-handler")
+    self.token_callback.on_next(lambda _request_id, tokens, is_finished: asyncio.create_task(self.handle_tokens(_request_id, tokens, is_finished)))
    self.system_prompt = system_prompt

    cors = aiohttp_cors.setup(self.app)
@@ -200,20 +229,25 @@ class ChatGPTAPI:
    cors.add(self.app.router.add_get("/initial_models", self.handle_get_initial_models), {"*": cors_options})
    cors.add(self.app.router.add_post("/create_animation", self.handle_create_animation), {"*": cors_options})
    cors.add(self.app.router.add_post("/download", self.handle_post_download), {"*": cors_options})
+    cors.add(self.app.router.add_get("/v1/topology", self.handle_get_topology), {"*": cors_options})
    cors.add(self.app.router.add_get("/topology", self.handle_get_topology), {"*": cors_options})

-      
+    # Add static routes
    if "__compiled__" not in globals():
      self.static_dir = Path(__file__).parent.parent/"tinychat"
      self.app.router.add_get("/", self.handle_root)
      self.app.router.add_static("/", self.static_dir, name="static")
-      self.app.router.add_static('/images/', get_exo_images_dir(), name='static_images')
+      
+    # Always add images route, regardless of compilation status
+    self.images_dir = get_exo_images_dir()
+    self.images_dir.mkdir(parents=True, exist_ok=True)
+    self.app.router.add_static('/images/', self.images_dir, name='static_images')

    self.app.middlewares.append(self.timeout_middleware)
    self.app.middlewares.append(self.log_request)

  async def handle_quit(self, request):
-    if DEBUG>=1: print("Received quit signal")
+    if DEBUG >= 1: print("Received quit signal")
    response = web.json_response({"detail": "Quit signal received"}, status=200)
    await response.prepare(request)
    await response.write_eof()
@@ -243,61 +277,48 @@ class ChatGPTAPI:

  async def handle_model_support(self, request):
    try:
-        response = web.StreamResponse(
-            status=200,
-            reason='OK',
-            headers={
-                'Content-Type': 'text/event-stream',
-                'Cache-Control': 'no-cache',
-                'Connection': 'keep-alive',
-            }
-        )
-        await response.prepare(request)
+      response = web.StreamResponse(status=200, reason='OK', headers={
+        'Content-Type': 'text/event-stream',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+      })
+      await response.prepare(request)

-        async def process_model(model_name, pretty):
-            if model_name in model_cards:
-                model_info = model_cards[model_name]
+      async def process_model(model_name, pretty):
+        if model_name in model_cards:
+          model_info = model_cards[model_name]

-                if self.inference_engine_classname in model_info.get("repo", {}):
-                    shard = build_base_shard(model_name, self.inference_engine_classname)
-                    if shard:
-                        downloader = HFShardDownloader(quick_check=True)
-                        downloader.current_shard = shard
-                        downloader.current_repo_id = get_repo(shard.model_id, self.inference_engine_classname)
-                        status = await downloader.get_shard_download_status()
+          if self.inference_engine_classname in model_info.get("repo", {}):
+            shard = build_base_shard(model_name, self.inference_engine_classname)
+            if shard:
+              downloader = HFShardDownloader(quick_check=True)
+              downloader.current_shard = shard
+              downloader.current_repo_id = get_repo(shard.model_id, self.inference_engine_classname)
+              status = await downloader.get_shard_download_status()

-                        download_percentage = status.get("overall") if status else None
-                        total_size = status.get("total_size") if status else None
-                        total_downloaded = status.get("total_downloaded") if status else False
+              download_percentage = status.get("overall") if status else None
+              total_size = status.get("total_size") if status else None
+              total_downloaded = status.get("total_downloaded") if status else False

-                        model_data = {
-                            model_name: {
-                                "name": pretty,
-                                "downloaded": download_percentage == 100 if download_percentage is not None else False,
-                                "download_percentage": download_percentage,
-                                "total_size": total_size,
-                                "total_downloaded": total_downloaded
-                            }
-                        }
+              model_data = {
+                model_name: {
+                  "name": pretty, "downloaded": download_percentage == 100 if download_percentage is not None else False, "download_percentage": download_percentage, "total_size": total_size,
+                  "total_downloaded": total_downloaded
+                }
+              }

-                        await response.write(f"data: {json.dumps(model_data)}\n\n".encode())
+              await response.write(f"data: {json.dumps(model_data)}\n\n".encode())

-        # Process all models in parallel
-        await asyncio.gather(*[
-            process_model(model_name, pretty)
-            for model_name, pretty in pretty_name.items()
-        ])
+      # Process all models in parallel
+      await asyncio.gather(*[process_model(model_name, pretty) for model_name, pretty in pretty_name.items()])

-        await response.write(b"data: [DONE]\n\n")
-        return response
+      await response.write(b"data: [DONE]\n\n")
+      return response

    except Exception as e:
-        print(f"Error in handle_model_support: {str(e)}")
-        traceback.print_exc()
-        return web.json_response(
-            {"detail": f"Server error: {str(e)}"},
-            status=500
-        )
+      print(f"Error in handle_model_support: {str(e)}")
+      traceback.print_exc()
+      return web.json_response({"detail": f"Server error: {str(e)}"}, status=500)

  async def handle_get_models(self, request):
    models_list = [{"id": model_name, "object": "model", "owned_by": "exo", "ready": True} for model_name, _ in model_cards.items()]
@@ -334,13 +355,13 @@ class ChatGPTAPI:

  async def handle_post_chat_completions(self, request):
    data = await request.json()
-    if DEBUG >= 2: print(f"Handling chat completions request from {request.remote}: {data}")
+    if DEBUG >= 2: print(f"[ChatGPTAPI] Handling chat completions request from {request.remote}: {data}")
    stream = data.get("stream", False)
    chat_request = parse_chat_request(data, self.default_model)
    if chat_request.model and chat_request.model.startswith("gpt-"):  # to be compatible with ChatGPT tools, point all gpt- model requests to default model
      chat_request.model = self.default_model
    if not chat_request.model or chat_request.model not in model_cards:
-      if DEBUG >= 1: print(f"Invalid model: {chat_request.model}. Supported: {list(model_cards.keys())}. Defaulting to {self.default_model}")
+      if DEBUG >= 1: print(f"[ChatGPTAPI] Invalid model: {chat_request.model}. Supported: {list(model_cards.keys())}. Defaulting to {self.default_model}")
      chat_request.model = self.default_model
    shard = build_base_shard(chat_request.model, self.inference_engine_classname)
    if not shard:
@@ -351,7 +372,7 @@ class ChatGPTAPI:
      )

    tokenizer = await resolve_tokenizer(get_repo(shard.model_id, self.inference_engine_classname))
-    if DEBUG >= 4: print(f"Resolved tokenizer: {tokenizer}")
+    if DEBUG >= 4: print(f"[ChatGPTAPI] Resolved tokenizer: {tokenizer}")

    # Add system prompt if set
    if self.system_prompt and not any(msg.role == "system" for msg in chat_request.messages):
@@ -364,28 +385,13 @@ class ChatGPTAPI:
        self.on_chat_completion_request(request_id, chat_request, prompt)
      except Exception as e:
        if DEBUG >= 2: traceback.print_exc()
-    # request_id = None
-    # match = self.prompts.find_longest_prefix(prompt)
-    # if match and len(prompt) > len(match[1].prompt):
-    #     if DEBUG >= 2:
-    #       print(f"Prompt for request starts with previous prompt {len(match[1].prompt)} of {len(prompt)}: {match[1].prompt}")
-    #     request_id = match[1].request_id
-    #     self.prompts.add(prompt, PromptSession(request_id=request_id, timestamp=int(time.time()), prompt=prompt))
-    #     # remove the matching prefix from the prompt
-    #     prompt = prompt[len(match[1].prompt):]
-    # else:
-    #   request_id = str(uuid.uuid4())
-    #   self.prompts.add(prompt, PromptSession(request_id=request_id, timestamp=int(time.time()), prompt=prompt))

-    callback_id = f"chatgpt-api-wait-response-{request_id}"
-    callback = self.node.on_token.register(callback_id)
-
-    if DEBUG >= 2: print(f"Sending prompt from ChatGPT api {request_id=} {shard=} {prompt=}")
+    if DEBUG >= 2: print(f"[ChatGPTAPI] Processing prompt: {request_id=} {shard=} {prompt=}")

    try:
      await asyncio.wait_for(asyncio.shield(asyncio.create_task(self.node.process_prompt(shard, prompt, request_id=request_id))), timeout=self.response_timeout)

-      if DEBUG >= 2: print(f"Waiting for response to finish. timeout={self.response_timeout}s")
+      if DEBUG >= 2: print(f"[ChatGPTAPI] Waiting for response to finish. timeout={self.response_timeout}s")

      if stream:
        response = web.StreamResponse(
@@ -398,62 +404,74 @@ class ChatGPTAPI:
        )
        await response.prepare(request)

-        async def stream_result(_request_id: str, tokens: List[int], is_finished: bool):
-          prev_last_tokens_len = self.prev_token_lens.get(_request_id, 0)
-          self.prev_token_lens[_request_id] = max(prev_last_tokens_len, len(tokens))
-          new_tokens = tokens[prev_last_tokens_len:]
-          finish_reason = None
-          eos_token_id = tokenizer.special_tokens_map.get("eos_token_id") if hasattr(tokenizer, "_tokenizer") and isinstance(tokenizer._tokenizer,
-                                                                                                                             AutoTokenizer) else getattr(tokenizer, "eos_token_id", None)
-          if len(new_tokens) > 0 and new_tokens[-1] == eos_token_id:
-            new_tokens = new_tokens[:-1]
-            if is_finished:
-              finish_reason = "stop"
-          if is_finished and not finish_reason:
-            finish_reason = "length"
+        try:
+          # Stream tokens while waiting for inference to complete
+          while True:
+            if DEBUG >= 2: print(f"[ChatGPTAPI] Waiting for token from queue: {request_id=}")
+            tokens, is_finished = await asyncio.wait_for(
+              self.token_queues[request_id].get(),
+              timeout=self.response_timeout
+            )
+            if DEBUG >= 2: print(f"[ChatGPTAPI] Got token from queue: {request_id=} {tokens=} {is_finished=}")
+
+            eos_token_id = None  # compared with `is None` below so that a legitimate token id of 0 is kept
+            if eos_token_id is None and hasattr(tokenizer, "eos_token_id"): eos_token_id = tokenizer.eos_token_id
+            if eos_token_id is None and hasattr(tokenizer, "_tokenizer"): eos_token_id = tokenizer.special_tokens_map.get("eos_token_id")
+
+            finish_reason = None  # stays None for intermediate chunks; an empty final chunk counts as "length"
+            if is_finished: finish_reason = "stop" if len(tokens) > 0 and tokens[-1] == eos_token_id else "length"
+            if DEBUG >= 2: print(f"{eos_token_id=} {tokens[-1:]=} {finish_reason=}")
+
+            completion = generate_completion(
+              chat_request,
+              tokenizer,
+              prompt,
+              request_id,
+              tokens,
+              stream,
+              finish_reason,
+              "chat.completion",
+            )

-          completion = generate_completion(
-            chat_request,
-            tokenizer,
-            prompt,
-            request_id,
-            new_tokens,
-            stream,
-            finish_reason,
-            "chat.completion",
-          )
-          if DEBUG >= 2: print(f"Streaming completion: {completion}")
-          try:
            await response.write(f"data: {json.dumps(completion)}\n\n".encode())
-          except Exception as e:
-            if DEBUG >= 2: print(f"Error streaming completion: {e}")
-            if DEBUG >= 2: traceback.print_exc()

-        def on_result(_request_id: str, tokens: List[int], is_finished: bool):
-          if _request_id == request_id: self.stream_tasks[_request_id] = asyncio.create_task(stream_result(_request_id, tokens, is_finished))
+            if is_finished:
+              break

-          return _request_id == request_id and is_finished
+          await response.write_eof()
+          return response

-        _, tokens, _ = await callback.wait(on_result, timeout=self.response_timeout)
-        if request_id in self.stream_tasks:  # in case there is still a stream task running, wait for it to complete
-          if DEBUG >= 2: print("Pending stream task. Waiting for stream task to complete.")
-          try:
-            await asyncio.wait_for(self.stream_tasks[request_id], timeout=30)
-          except asyncio.TimeoutError:
-            print("WARNING: Stream task timed out. This should not happen.")
-        await response.write_eof()
-        return response
+        except asyncio.TimeoutError:
+          if DEBUG >= 2: print(f"[ChatGPTAPI] Timeout waiting for token: {request_id=}")
+          return web.json_response({"detail": "Response generation timed out"}, status=408)
+
+        except Exception as e:
+          if DEBUG >= 2: 
+            print(f"[ChatGPTAPI] Error processing prompt: {e}")
+            traceback.print_exc()
+          return web.json_response(
+            {"detail": f"Error processing prompt: {str(e)}"},
+            status=500
+          )
+
+        finally:
+          # Clean up the queue for this request
+          if request_id in self.token_queues:
+            if DEBUG >= 2: print(f"[ChatGPTAPI] Cleaning up token queue: {request_id=}")
+            del self.token_queues[request_id]
      else:
-        _, tokens, _ = await callback.wait(
-          lambda _request_id, tokens, is_finished: _request_id == request_id and is_finished,
-          timeout=self.response_timeout,
-        )
-
+        tokens = []
+        while True:
+          _tokens, is_finished = await asyncio.wait_for(self.token_queues[request_id].get(), timeout=self.response_timeout)
+          tokens.extend(_tokens)
+          if is_finished:
+            break
        finish_reason = "length"
-        eos_token_id = tokenizer.special_tokens_map.get("eos_token_id") if isinstance(getattr(tokenizer, "_tokenizer", None), AutoTokenizer) else tokenizer.eos_token_id
+        eos_token_id = None  # compared with `is None` below so that a legitimate token id of 0 is kept
+        if eos_token_id is None and hasattr(tokenizer, "eos_token_id"): eos_token_id = tokenizer.eos_token_id
+        if eos_token_id is None and hasattr(tokenizer, "_tokenizer"): eos_token_id = tokenizer.special_tokens_map.get("eos_token_id")
        if DEBUG >= 2: print(f"Checking if end of tokens result {tokens[-1]=} is {eos_token_id=}")
        if tokens[-1] == eos_token_id:
-          tokens = tokens[:-1]
          finish_reason = "stop"

        return web.json_response(generate_completion(chat_request, tokenizer, prompt, request_id, tokens, stream, finish_reason, "chat.completion"))
@@ -462,11 +480,7 @@ class ChatGPTAPI:
    except Exception as e:
      if DEBUG >= 2: traceback.print_exc()
      return web.json_response({"detail": f"Error processing prompt (see logs with DEBUG>=2): {str(e)}"}, status=500)
-    finally:
-      deregistered_callback = self.node.on_token.deregister(callback_id)
-      if DEBUG >= 2: print(f"Deregister {callback_id=} {deregistered_callback=}")

-  
  async def handle_post_image_generations(self, request):
    data = await request.json()

@@ -479,7 +493,7 @@ class ChatGPTAPI:
    shard = build_base_shard(model, self.inference_engine_classname)
    if DEBUG >= 2: print(f"shard: {shard}")
    if not shard:
-        return web.json_response({"error": f"Unsupported model: {model} with inference engine {self.inference_engine_classname}"}, status=400)
+      return web.json_response({"error": f"Unsupported model: {model} with inference engine {self.inference_engine_classname}"}, status=400)

    request_id = str(uuid.uuid4())
    callback_id = f"chatgpt-api-wait-response-{request_id}"
@@ -491,77 +505,85 @@ class ChatGPTAPI:
        img = None
      await asyncio.wait_for(asyncio.shield(asyncio.create_task(self.node.process_prompt(shard, prompt, request_id=request_id, inference_state={"image": img}))), timeout=self.response_timeout)

-
-      response = web.StreamResponse(status=200, reason='OK', headers={'Content-Type': 'application/octet-stream',"Cache-Control": "no-cache",})
+      response = web.StreamResponse(status=200, reason='OK', headers={
+        'Content-Type': 'application/octet-stream',
+        "Cache-Control": "no-cache",
+      })
      await response.prepare(request)

      def get_progress_bar(current_step, total_steps, bar_length=50):
        # Calculate the percentage of completion
-        percent = float(current_step) / total_steps
+        percent = float(current_step)/total_steps
        # Calculate the number of hashes to display
-        arrow = '-' * int(round(percent * bar_length) - 1) + '>'
-        spaces = ' ' * (bar_length - len(arrow))
-        
+        arrow = '-'*int(round(percent*bar_length) - 1) + '>'
+        spaces = ' '*(bar_length - len(arrow))
+
        # Create the progress bar string
        progress_bar = f'Progress: [{arrow}{spaces}] {int(percent * 100)}% ({current_step}/{total_steps})'
        return progress_bar

      async def stream_image(_request_id: str, result, is_finished: bool):
-          if isinstance(result, list):
-              await response.write(json.dumps({'progress': get_progress_bar((result[0]), (result[1]))}).encode('utf-8') + b'\n')
+        if isinstance(result, list):
+          await response.write(json.dumps({'progress': get_progress_bar((result[0]), (result[1]))}).encode('utf-8') + b'\n')

-          elif isinstance(result, np.ndarray):
+        elif isinstance(result, np.ndarray):
+          try:
            im = Image.fromarray(np.array(result))
-            images_folder = get_exo_images_dir()
            # Save the image to a file
            image_filename = f"{_request_id}.png"
-            image_path = images_folder / image_filename
+            image_path = self.images_dir/image_filename
            im.save(image_path)
-            image_url = request.app.router['static_images'].url_for(filename=image_filename)
-            base_url = f"{request.scheme}://{request.host}"
-            # Construct the full URL correctly
-            full_image_url = base_url + str(image_url)
            
-            await response.write(json.dumps({'images': [{'url': str(full_image_url), 'content_type': 'image/png'}]}).encode('utf-8') + b'\n')
+            # Get URL for the saved image
+            try:
+              image_url = request.app.router['static_images'].url_for(filename=image_filename)
+              base_url = f"{request.scheme}://{request.host}"
+              full_image_url = base_url + str(image_url)
+              
+              await response.write(json.dumps({'images': [{'url': str(full_image_url), 'content_type': 'image/png'}]}).encode('utf-8') + b'\n')
+            except KeyError as e:
+              if DEBUG >= 2: print(f"Error getting image URL: {e}")
+              # Fallback to direct file path if URL generation fails
+              await response.write(json.dumps({'images': [{'url': str(image_path), 'content_type': 'image/png'}]}).encode('utf-8') + b'\n')
+            
            if is_finished:
              await response.write_eof()
-              
+            
+          except Exception as e:
+            if DEBUG >= 2: print(f"Error processing image: {e}")
+            if DEBUG >= 2: traceback.print_exc()
+            await response.write(json.dumps({'error': str(e)}).encode('utf-8') + b'\n')

      stream_task = None
+
      def on_result(_request_id: str, result, is_finished: bool):
-          nonlocal stream_task
-          stream_task = asyncio.create_task(stream_image(_request_id, result, is_finished))
-          return _request_id == request_id and is_finished
+        nonlocal stream_task
+        stream_task = asyncio.create_task(stream_image(_request_id, result, is_finished))
+        return _request_id == request_id and is_finished

      await callback.wait(on_result, timeout=self.response_timeout*10)
-      
+
      if stream_task:
-          # Wait for the stream task to complete before returning
-          await stream_task
+        # Wait for the stream task to complete before returning
+        await stream_task

      return response

    except Exception as e:
-        if DEBUG >= 2: traceback.print_exc()
-        return web.json_response({"detail": f"Error processing prompt (see logs with DEBUG>=2): {str(e)}"}, status=500)
-  
+      if DEBUG >= 2: traceback.print_exc()
+      return web.json_response({"detail": f"Error processing prompt (see logs with DEBUG>=2): {str(e)}"}, status=500)
+
  async def handle_delete_model(self, request):
    try:
      model_name = request.match_info.get('model_name')
      if DEBUG >= 2: print(f"Attempting to delete model: {model_name}")

      if not model_name or model_name not in model_cards:
-        return web.json_response(
-          {"detail": f"Invalid model name: {model_name}"},
-          status=400
-          )
+        return web.json_response({"detail": f"Invalid model name: {model_name}"}, status=400)

      shard = build_base_shard(model_name, self.inference_engine_classname)
      if not shard:
-        return web.json_response(
-          {"detail": "Could not build shard for model"},
-          status=400
-        )
+        return web.json_response({"detail": "Could not build shard for model"}, status=400)

      repo_id = get_repo(shard.model_id, self.inference_engine_classname)
      if DEBUG >= 2: print(f"Repo ID for model: {repo_id}")
@@ -576,38 +598,28 @@ class ChatGPTAPI:
        if DEBUG >= 2: print(f"Found model files at {cache_dir}, deleting...")
        try:
          shutil.rmtree(cache_dir)
-          return web.json_response({
-            "status": "success",
-            "message": f"Model {model_name} deleted successfully",
-            "path": str(cache_dir)
-          })
+          return web.json_response({"status": "success", "message": f"Model {model_name} deleted successfully", "path": str(cache_dir)})
        except Exception as e:
-          return web.json_response({
-            "detail": f"Failed to delete model files: {str(e)}"
-          }, status=500)
+          return web.json_response({"detail": f"Failed to delete model files: {str(e)}"}, status=500)
      else:
-        return web.json_response({
-          "detail": f"Model files not found at {cache_dir}"
-        }, status=404)
+        return web.json_response({"detail": f"Model files not found at {cache_dir}"}, status=404)

    except Exception as e:
-        print(f"Error in handle_delete_model: {str(e)}")
-        traceback.print_exc()
-        return web.json_response({
-            "detail": f"Server error: {str(e)}"
-        }, status=500)
+      print(f"Error in handle_delete_model: {str(e)}")
+      traceback.print_exc()
+      return web.json_response({"detail": f"Server error: {str(e)}"}, status=500)

  async def handle_get_initial_models(self, request):
    model_data = {}
    for model_name, pretty in pretty_name.items():
-        model_data[model_name] = {
-            "name": pretty,
-            "downloaded": None,  # Initially unknown
-            "download_percentage": None,  # Change from 0 to null
-            "total_size": None,
-            "total_downloaded": None,
-            "loading": True  # Add loading state
-        }
+      model_data[model_name] = {
+        "name": pretty,
+        "downloaded": None,  # Initially unknown
+        "download_percentage": None,  # Change from 0 to null
+        "total_size": None,
+        "total_downloaded": None,
+        "loading": True  # Add loading state
+      }
    return web.json_response(model_data)

  async def handle_create_animation(self, request):
@@ -633,17 +645,9 @@ class ChatGPTAPI:
      if DEBUG >= 2: print(f"Animation temp directory: {tmp_dir}, output file: {output_path}, directory exists: {tmp_dir.exists()}, directory permissions: {oct(tmp_dir.stat().st_mode)[-3:]}")

      # Create the animation
-      create_animation_mp4(
-        replacement_image_path,
-        output_path,
-        device_name,
-        prompt_text
-      )
+      create_animation_mp4(replacement_image_path, output_path, device_name, prompt_text)

-      return web.json_response({
-        "status": "success",
-        "output_path": output_path
-      })
+      return web.json_response({"status": "success", "output_path": output_path})

    except Exception as e:
      if DEBUG >= 2: traceback.print_exc()
@@ -659,10 +663,7 @@ class ChatGPTAPI:
      if not shard: return web.json_response({"error": f"Could not build shard for model {model_name}"}, status=400)
      asyncio.create_task(self.node.inference_engine.shard_downloader.ensure_shard(shard, self.inference_engine_classname))

-      return web.json_response({
-        "status": "success",
-        "message": f"Download started for model: {model_name}"
-      })
+      return web.json_response({"status": "success", "message": f"Download started for model: {model_name}"})
    except Exception as e:
      if DEBUG >= 2: traceback.print_exc()
      return web.json_response({"error": str(e)}, status=500)
@@ -676,10 +677,10 @@ class ChatGPTAPI:
        return web.json_response({})
    except Exception as e:
      if DEBUG >= 2: traceback.print_exc()
-      return web.json_response(
-        {"detail": f"Error getting topology: {str(e)}"},
-        status=500
-      )
+      return web.json_response({"detail": f"Error getting topology: {str(e)}"}, status=500)
+
+  async def handle_tokens(self, request_id: str, tokens: List[int], is_finished: bool):
+    await self.token_queues[request_id].put((tokens, is_finished))

  async def run(self, host: str = "0.0.0.0", port: int = 52415):
    runner = web.AppRunner(self.app)
@@ -690,15 +691,14 @@ class ChatGPTAPI:
  def base64_decode(self, base64_string):
    #decode and reshape image
    if base64_string.startswith('data:image'):
-        base64_string = base64_string.split(',')[1]
+      base64_string = base64_string.split(',')[1]
    image_data = base64.b64decode(base64_string)
    img = Image.open(BytesIO(image_data))
-    W, H = (dim - dim % 64 for dim in (img.width, img.height))
+    W, H = (dim - dim%64 for dim in (img.width, img.height))
    if W != img.width or H != img.height:
-        if DEBUG >= 2: print(f"Warning: image shape is not divisible by 64, downsampling to {W}x{H}")
-        img = img.resize((W, H), Image.NEAREST)  # use desired downsampling filter
+      if DEBUG >= 2: print(f"Warning: image shape is not divisible by 64, downsampling to {W}x{H}")
+      img = img.resize((W, H), Image.NEAREST)  # use desired downsampling filter
    img = mx.array(np.array(img))
-    img = (img[:, :, :3].astype(mx.float32) / 255) * 2 - 1
+    img = (img[:, :, :3].astype(mx.float32)/255)*2 - 1
    img = img[None]
    return img
-  
--- a/exo/apputil/anim.py
+++ b/exo/apputil/anim.py
@@ -2,6 +2,7 @@ from PIL import Image, ImageDraw, ImageFont, ImageFilter
 import os
 import numpy as np
 import cv2
+import sys

 def draw_rounded_rectangle(draw, coords, radius, fill):
  left, top, right, bottom = coords
@@ -80,14 +81,20 @@ def create_animation_mp4(
    font = ImageFont.load_default()
    promptfont = ImageFont.load_default()

+  # Get the base directory for images when running as a bundled app
+  if hasattr(sys, '_MEIPASS'):
+    base_dir = os.path.join(sys._MEIPASS, "exo", "apputil", "baseimages")
+  else:
+    base_dir = os.path.join(os.path.dirname(__file__), "baseimages")
+
  # Process first frame
-  base_img = Image.open(os.path.join(os.path.dirname(__file__), "baseimages", "image1.png"))
+  base_img = Image.open(os.path.join(base_dir, "image1.png"))
  draw = ImageDraw.Draw(base_img)
  draw_centered_text_rounded(draw, device_name, font, device_coords)
  frames.extend([crop_image(base_img)] * 30)  # 1 second at 30fps

  # Process second frame with typing animation
-  base_img2 = Image.open(os.path.join(os.path.dirname(__file__), "baseimages", "image2.png"))
+  base_img2 = Image.open(os.path.join(base_dir, "image2.png"))
  for i in range(len(prompt_text) + 1):
    current_frame = base_img2.copy()
    draw = ImageDraw.Draw(current_frame)
@@ -101,7 +108,7 @@ def create_animation_mp4(

  # Create blur sequence
  replacement_img = Image.open(replacement_image_path)
-  base_img = Image.open(os.path.join(os.path.dirname(__file__), "baseimages", "image3.png"))
+  base_img = Image.open(os.path.join(base_dir, "image3.png"))
  blur_steps = [int(80 * (1 - i/8)) for i in range(9)]

  for i, blur_amount in enumerate(blur_steps):
@@ -123,7 +130,7 @@ def create_animation_mp4(
    frames.extend([crop_image(new_frame)] * 15)  # 0.5 seconds at 30fps

  # Create and add final frame (image4)
-  final_base = Image.open(os.path.join(os.path.dirname(__file__), "baseimages", "image4.png"))
+  final_base = Image.open(os.path.join(base_dir, "image4.png"))
  draw = ImageDraw.Draw(final_base)

  draw_centered_text_rounded(draw, device_name, font, device_coords)
@@ -158,4 +165,4 @@ def create_animation_mp4(
      out.write(frame_array)
    
    out.release()
-    print(f"Video saved successfully to {output_path}")
+    print(f"Video saved successfully to {output_path}")
--- a/exo/download/hf/hf_helpers.py
+++ b/exo/download/hf/hf_helpers.py
@@ -441,7 +441,7 @@ def get_allow_patterns(weight_map: Dict[str, str], shard: Shard) -> List[str]:
      shard_specific_patterns.add(sorted_file_names[-1])
  else:
    shard_specific_patterns = set(["*.safetensors"])
-  if DEBUG >= 2: print(f"get_allow_patterns {weight_map=} {shard=} {shard_specific_patterns=}")
+  if DEBUG >= 3: print(f"get_allow_patterns {weight_map=} {shard=} {shard_specific_patterns=}")
  return list(default_patterns | shard_specific_patterns)

 async def get_file_download_percentage(
--- a/exo/download/hf/hf_shard_download.py
+++ b/exo/download/hf/hf_shard_download.py
@@ -159,13 +159,14 @@ class HFShardDownloader(ShardDownloader):
          print(f"Download calculation for {self.current_repo_id}:")
          print(f"Total bytes: {total_bytes}")
          print(f"Downloaded bytes: {downloaded_bytes}")
+        if DEBUG >= 3:
          for file in relevant_files:
            print(f"File {file['path']}: size={file['size']}, percentage={status[file['path']]}")

      return status

    except Exception as e:
-      if DEBUG >= 2:
+      if DEBUG >= 3:
        print(f"Error getting shard download status: {e}")
        traceback.print_exc()
      return None
--- a/exo/helpers.py
+++ b/exo/helpers.py
@@ -7,12 +7,14 @@ import random
 import platform
 import psutil
 import uuid
-import netifaces
+from scapy.all import get_if_addr, get_if_list
+import re
 import subprocess
 from pathlib import Path
 import tempfile
 import json
 from concurrent.futures import ThreadPoolExecutor
+import traceback

 DEBUG = int(os.getenv("DEBUG", default="0"))
 DEBUG_DISCOVERY = int(os.getenv("DEBUG_DISCOVERY", default="0"))
@@ -229,28 +231,29 @@ def pretty_print_bytes_per_second(bytes_per_second: int) -> str:


 def get_all_ip_addresses_and_interfaces():
-  try:
    ip_addresses = []
-    for interface in netifaces.interfaces():
-      ifaddresses = netifaces.ifaddresses(interface)
-      if netifaces.AF_INET in ifaddresses:
-        for link in ifaddresses[netifaces.AF_INET]:
-          ip = link['addr']
-          ip_addresses.append((ip, interface))
+    for interface in get_if_list():
+      try:
+        ip = get_if_addr(interface)
+        if ip.startswith("0.0."): continue
+        simplified_interface = re.sub(r'^\\Device\\NPF_', '', interface)
+        ip_addresses.append((ip, simplified_interface))
+      except:
+        if DEBUG >= 1: print(f"Failed to get IP address for interface {interface}")
+        if DEBUG >= 1: traceback.print_exc()
+    if not ip_addresses:
+      if DEBUG >= 1: print("Failed to get any IP addresses. Defaulting to localhost.")
+      return [("localhost", "lo")]
    return list(set(ip_addresses))
-  except:
-    if DEBUG >= 1: print("Failed to get all IP addresses. Defaulting to localhost.")
-    return [("localhost", "lo")]
+
+

 async def get_macos_interface_type(ifname: str) -> Optional[Tuple[int, str]]:
  try:
    # Use the shared subprocess_pool
-    output = await asyncio.get_running_loop().run_in_executor(subprocess_pool, lambda: subprocess.run(
-      ['system_profiler', 'SPNetworkDataType', '-json'],
-      capture_output=True,
-      text=True,
-      close_fds=True
-    ).stdout)
+    output = await asyncio.get_running_loop().run_in_executor(
+      subprocess_pool, lambda: subprocess.run(['system_profiler', 'SPNetworkDataType', '-json'], capture_output=True, text=True, close_fds=True).stdout
+    )

    data = json.loads(output)

@@ -276,6 +279,7 @@ async def get_macos_interface_type(ifname: str) -> Optional[Tuple[int, str]]:

  return None

+
 async def get_interface_priority_and_type(ifname: str) -> Tuple[int, str]:
  # On macOS, try to get interface type using networksetup
  if psutil.MACOS:
@@ -283,8 +287,7 @@ async def get_interface_priority_and_type(ifname: str) -> Tuple[int, str]:
    if macos_type is not None: return macos_type

  # Local container/virtual interfaces
-  if (ifname.startswith(('docker', 'br-', 'veth', 'cni', 'flannel', 'calico', 'weave')) or
-    'bridge' in ifname):
+  if (ifname.startswith(('docker', 'br-', 'veth', 'cni', 'flannel', 'calico', 'weave')) or 'bridge' in ifname):
    return (7, "Container Virtual")

  # Loopback interface
@@ -310,6 +313,7 @@ async def get_interface_priority_and_type(ifname: str) -> Tuple[int, str]:
  # Other physical interfaces
  return (2, "Other")

+
 async def shutdown(signal, loop, server):
  """Gracefully shutdown the server and close the asyncio loop."""
  print(f"Received exit signal {signal.name}...")
@@ -327,18 +331,42 @@ def is_frozen():
    or ('Contents/MacOS' in str(os.path.dirname(sys.executable))) \
    or '__nuitka__' in globals() or getattr(sys, '__compiled__', False)

+async def get_mac_system_info() -> Tuple[str, str, int]:
+    """Get Mac system information using system_profiler."""
+    try:
+        output = await asyncio.get_running_loop().run_in_executor(
+            subprocess_pool,
+            lambda: subprocess.check_output(["system_profiler", "SPHardwareDataType"]).decode("utf-8")
+        )
+        
+        model_line = next((line for line in output.split("\n") if "Model Name" in line), None)
+        model_id = model_line.split(": ")[1] if model_line else "Unknown Model"
+        
+        chip_line = next((line for line in output.split("\n") if "Chip" in line), None)
+        chip_id = chip_line.split(": ")[1] if chip_line else "Unknown Chip"
+        
+        memory_line = next((line for line in output.split("\n") if "Memory" in line), None)
+        memory_str = memory_line.split(": ")[1] if memory_line else "Unknown Memory"
+        memory_units = memory_str.split()
+        memory_value = int(memory_units[0])
+        memory = memory_value * 1024 if memory_units[1] == "GB" else memory_value
+        
+        return model_id, chip_id, memory
+    except Exception as e:
+        if DEBUG >= 2: print(f"Error getting Mac system info: {e}")
+        return "Unknown Model", "Unknown Chip", 0

 def get_exo_home() -> Path:
-  if psutil.WINDOWS: docs_folder = Path(os.environ["USERPROFILE"]) / "Documents"
-  else: docs_folder = Path.home() / "Documents"
+  if psutil.WINDOWS: docs_folder = Path(os.environ["USERPROFILE"])/"Documents"
+  else: docs_folder = Path.home()/"Documents"
  if not docs_folder.exists(): docs_folder.mkdir(exist_ok=True)
-  exo_folder = docs_folder / "Exo"
+  exo_folder = docs_folder/"Exo"
  if not exo_folder.exists(): exo_folder.mkdir(exist_ok=True)
  return exo_folder

+
 def get_exo_images_dir() -> Path:
  exo_home = get_exo_home()
-  images_dir = exo_home / "Images"
+  images_dir = exo_home/"Images"
  if not images_dir.exists(): images_dir.mkdir(exist_ok=True)
  return images_dir
-  
--- a/exo/inference/inference_engine.py
+++ b/exo/inference/inference_engine.py
@@ -5,6 +5,7 @@ from exo.helpers import DEBUG  # Make sure to import DEBUG
 from typing import Tuple, Optional
 from abc import ABC, abstractmethod
 from .shard import Shard
+from exo.download.shard_download import ShardDownloader


 class InferenceEngine(ABC):
@@ -13,7 +14,7 @@ class InferenceEngine(ABC):
  @abstractmethod
  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
    pass
-  
+
  @abstractmethod
  async def sample(self, x: np.ndarray) -> np.ndarray:
    pass
@@ -32,13 +33,13 @@ class InferenceEngine(ABC):

  async def save_checkpoint(self, shard: Shard, path: str):
    pass
-  
+
  async def save_session(self, key, value):
    self.session[key] = value
-  
+
  async def clear_session(self):
    self.session.empty()
-  
+
  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, inference_state: Optional[dict] = None) -> tuple[np.ndarray, Optional[dict]]:
    tokens = await self.encode(shard, prompt)
    if shard.model_id != 'stable-diffusion-2-1-base':
@@ -49,13 +50,15 @@ class InferenceEngine(ABC):

    return output_data, inference_state

+
 inference_engine_classes = {
  "mlx": "MLXDynamicShardInferenceEngine",
  "tinygrad": "TinygradDynamicShardInferenceEngine",
  "dummy": "DummyInferenceEngine",
 }

-def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDownloader'):
+
+def get_inference_engine(inference_engine_name: str, shard_downloader: ShardDownloader):
  if DEBUG >= 2:
    print(f"get_inference_engine called with: {inference_engine_name}")
  if inference_engine_name == "mlx":
--- a/exo/inference/mlx/perf_improvements.md
+++ b/exo/inference/mlx/perf_improvements.md
@@ -0,0 +1,7 @@
+# Perf improvements
+
+Target: 460 tok/sec
+- removing sample goes from 369 -> 402
+- performance degrades as we generate more tokens
+- make mlx inference engine synchronous, removing thread pool executor: 402 -> 413
+- remove self.on_opaque_status.trigger_all: 413 -> 418
--- a/exo/inference/mlx/sharded_inference_engine.py
+++ b/exo/inference/mlx/sharded_inference_engine.py
@@ -1,155 +1,167 @@
 import numpy as np
 import mlx.core as mx
 import mlx.nn as nn
-from mlx_lm.sample_utils import top_p_sampling
+from mlx_lm.sample_utils import top_p_sampling, make_sampler
 import mlx.optimizers as optim
 from ..inference_engine import InferenceEngine
 from .sharded_utils import load_shard, get_image_from_str
-from .losses import loss_fns 
+from .losses import loss_fns
 from ..shard import Shard
 from typing import Dict, Optional, Tuple
 from exo.download.shard_download import ShardDownloader
 import asyncio
-from concurrent.futures import ThreadPoolExecutor
-from functools import partial
 from collections import OrderedDict
 from mlx_lm.models.cache import make_prompt_cache
-
-def sample_logits(
-  logits: mx.array,
-  temp: float = 0.0,
-  top_p: float = 1.0,
-  logit_bias: Optional[Dict[int, float]] = None
-) -> Tuple[mx.array, float]:
-  if logit_bias:
-    indices = mx.array(list(logit_bias.keys()))
-    values = mx.array(list(logit_bias.values()))
-    logits[:, indices] += values
-
-  if temp == 0:
-    token = mx.argmax(logits, axis=-1)
-  else:
-    if top_p > 0 and top_p < 1.0:
-      token = top_p_sampling(logits, top_p, temp)
-    else:
-      token = mx.random.categorical(logits*(1/temp))
-
-  return token
+from concurrent.futures import ThreadPoolExecutor

 class MLXDynamicShardInferenceEngine(InferenceEngine):
  def __init__(self, shard_downloader: ShardDownloader):
    self.shard = None
    self.shard_downloader = shard_downloader
-    self.executor = ThreadPoolExecutor(max_workers=1)
    self.caches = OrderedDict()
+    self.sampler_params: tuple[float, float] = (0.0, 0.0, 0.0, 1)
+    self.sampler = make_sampler(*self.sampler_params)
+    self._mlx_thread = ThreadPoolExecutor(max_workers=1, thread_name_prefix="mlx")
+    self._tokenizer_thread = ThreadPoolExecutor(max_workers=1, thread_name_prefix="tokenizer")
+    self.session = {}
+
+  async def _eval_mlx(self, *args):
+    await asyncio.get_running_loop().run_in_executor(self._mlx_thread, mx.eval, *args)

  async def poll_state(self, request_id: str, max_caches=2):
    if request_id in self.caches:
      self.caches.move_to_end(request_id)
    else:
-      newcache = await asyncio.get_running_loop().run_in_executor(self.executor, make_prompt_cache, self.model)
+      newcache = make_prompt_cache(self.model)
      if len(self.caches) > max_caches:
        self.caches.popitem(last=False)
      self.caches[request_id] = newcache
    return {"cache": self.caches[request_id]}

-  async def sample(self, x, temp: float = 0.0, top_p: float = 1.0) -> np.ndarray:
-    y = mx.array(x)
-    logits = y[:, -1, :]
-    out = np.array(sample_logits(logits, temp=temp, top_p=top_p), dtype=int)
-    return out
+  async def sample(self, x: np.ndarray, temp: float = 0.0, top_p: float = 1.0) -> np.ndarray:
+    if (temp, top_p, 0.0, 1) != self.sampler_params:
+      self.sampler_params = (temp, top_p, 0.0, 1)
+      self.sampler = make_sampler(*self.sampler_params)
+    logits = mx.array(x)
+    logits = logits[:, -1, :]
+    logprobs = logits - mx.logsumexp(logits, keepdims=True)
+    result = self.sampler(logprobs)
+    await self._eval_mlx(result)
+    return np.asarray(result, dtype=int)

  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
    await self.ensure_shard(shard)
-    tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.encode, prompt)
-    return np.array(tokens)
+    return np.asarray(
+      await asyncio.get_running_loop().run_in_executor(
+        self._tokenizer_thread,
+        self.tokenizer.encode,
+        prompt
+      )
+    )

  async def decode(self, shard: Shard, tokens) -> str:
    await self.ensure_shard(shard)
-    tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.decode, tokens)
-    return tokens
+    return await asyncio.get_running_loop().run_in_executor(
+      self._tokenizer_thread,
+      self.tokenizer.decode,
+      tokens
+    )

  async def save_checkpoint(self, shard: Shard, path: str):
    await self.ensure_shard(shard)
-    await asyncio.get_running_loop().run_in_executor(self.executor, self.model.save_weights, path)
+    await asyncio.get_running_loop().run_in_executor(self._mlx_thread, lambda: self.model.save_weights(path))

  async def load_checkpoint(self, shard: Shard, path: str):
    await self.ensure_shard(shard)
-    await asyncio.get_running_loop().run_in_executor(self.executor, self.model.load_weights, path)
-    
+    await asyncio.get_running_loop().run_in_executor(self._mlx_thread, lambda: self.model.load_weights(path))
+
  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[dict] = None) -> tuple[np.ndarray, Optional[dict]]:
    await self.ensure_shard(shard)
-    loop = asyncio.get_running_loop()
    state = await self.poll_state(request_id) if self.model.model_type != 'StableDiffusionPipeline' else {}
    x = mx.array(input_data)
+
    if self.model.model_type != 'StableDiffusionPipeline':
-      output_data = await loop.run_in_executor(self.executor, lambda: self.model(x, **state, **(inference_state or {})))
+      output_data = await asyncio.get_running_loop().run_in_executor(
+        self._mlx_thread,
+        lambda: self.model(x, **state, **(inference_state or {}))
+      )
+      inference_state = None
    else:
-      output_data, inference_state = await loop.run_in_executor(self.executor, lambda: self.model(x, **state, **(inference_state or {})))
-    output_data = np.array(output_data)
+      result = await asyncio.get_running_loop().run_in_executor(
+        self._mlx_thread,
+        lambda: self.model(x, **state, **(inference_state or {}))
+      )
+      output_data, inference_state = result
+
+    output_data = np.array(output_data, copy=False)
    return output_data, inference_state

  async def evaluate(self, request_id: str, shard: Shard, inputs, targets, lengths, loss: str = "length_masked_ce"):
    await self.ensure_shard(shard)
    await self.save_session('loss', loss_fns[loss])
-    loop = asyncio.get_running_loop()
-    #print(f"evaluate in <- {inputs}")
    x = mx.array(inputs)
    y = mx.array(targets)
    l = mx.array(lengths)
-    score = await loop.run_in_executor(self.executor, self.session['loss'], self.model, x, y, l)
-    #print(f"evaluate out -> {score}")
+
+    score = await asyncio.get_running_loop().run_in_executor(
+      self._mlx_thread,
+      lambda: self.session['loss'](self.model, x, y, l)
+    )
    return score

  async def ensure_train(self, shard: Shard, loss: str, opt=optim.SGD, lr=1e-5, trainable_layers=['input_layernorm', 'gate_proj']):
    await self.ensure_shard(shard)
+
    if 'train_layers' not in self.session or self.session['train_layers'] != trainable_layers:
      await self.save_session('train_layers', trainable_layers)
-      self.model.freeze()
-      self.model.apply_to_modules(lambda k, v: v.unfreeze() if any(lambda: k.endswith(i) for i in trainable_layers) else None)
+      def freeze_unfreeze():
+        self.model.freeze()
+        self.model.apply_to_modules(
+          lambda k, v: v.unfreeze() if any(k.endswith(layer_name) for layer_name in trainable_layers) else None
+        )
+      await asyncio.get_running_loop().run_in_executor(self._mlx_thread, freeze_unfreeze)
+
    if 'lossname' not in self.session or 'LVaG' not in self.session or self.session['lossname'] != loss:
      await self.save_session('lossname', loss)
      await self.save_session('LVaG', nn.value_and_grad(self.model, loss_fns[loss]))
+
    if 'opt' not in self.session:
      await self.save_session('opt', opt(lr))
    return True

  async def train(self, request_id: str, shard: Shard, inputs, targets, lengths, loss: str = "length_masked_ce", opt=optim.SGD, lr=1e-5):
-    loop = asyncio.get_running_loop()
-    nothin = await self.ensure_train(shard, loss, opt, lr)
+    await self.ensure_train(shard, loss, opt, lr)
+
    def train_step(inp, tar, lng):
      lval, grad = self.session['LVaG'](self.model, inp, tar, lng)
      gradlayers = grad['model']['layers']
      self.session['opt'].update(self.model, grad)
-      mx.eval(self.model.parameters(), self.session['opt'].state, lval)
-      return lval, gradlayers
+      return lval, gradlayers, (self.model.parameters(), self.session['opt'].state, lval)

    x = mx.array(inputs)
    y = mx.array(targets)
    l = mx.array(lengths)
+    score, gradients, eval_args = await asyncio.get_running_loop().run_in_executor(
+      self._mlx_thread,
+      lambda: train_step(x, y, l)
+    )
+    await self._eval_mlx(*eval_args)

-    score, gradients = await loop.run_in_executor(self.executor, train_step, x, y, l)
-    #print(f"{score=}")
-      
-    layers = [{k: v["weight"] for k,v in l.items() if 'weight' in v} for l in gradients if l]
-    #print(layers[0])
-
-    return score, np.array(layers[0]['input_layernorm'])
+    layers = [{k: v["weight"] for k, v in layer.items() if 'weight' in v} for layer in gradients if layer]
+    first_layer = np.array(layers[0]['input_layernorm'], copy=False)
+    await self._eval_mlx(first_layer)
+    return score, first_layer

  async def ensure_shard(self, shard: Shard):
    if self.shard == shard:
      return
-
    model_path = await self.shard_downloader.ensure_shard(shard, self.__class__.__name__)
-
    if self.shard != shard:
-
-      def load_shard_wrapper():
-        return asyncio.run(load_shard(model_path, shard))
-
-      model_shard, self.tokenizer = await asyncio.get_running_loop().run_in_executor(self.executor, load_shard_wrapper)
+      model_shard, self.tokenizer = await load_shard(model_path, shard)
      self.shard = shard
-      self.model = model_shard 
+      self.model = model_shard
      self.caches = OrderedDict()
      self.session = {}

+  async def cleanup(self):
+    self._mlx_thread.shutdown(wait=True)
--- a/exo/inference/mlx/test_non_blocking.py
+++ b/exo/inference/mlx/test_non_blocking.py
@@ -0,0 +1,81 @@
+import asyncio
+import time
+import numpy as np
+from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine
+from exo.download.hf.hf_shard_download import HFShardDownloader
+from exo.inference.shard import Shard
+from exo.models import build_base_shard
+from collections import deque
+from statistics import mean, median
+
+async def test_non_blocking():
+    # Setup
+    shard_downloader = HFShardDownloader()
+    engine = MLXDynamicShardInferenceEngine(shard_downloader)
+    _shard = build_base_shard("llama-3.1-8b", "MLXDynamicShardInferenceEngine")
+    shard = Shard(_shard.model_id, _shard.start_layer, _shard.n_layers - 1, _shard.n_layers)
+    await engine.ensure_shard(shard)
+    
+    queue = asyncio.Queue()
+    measurements = deque(maxlen=1000000)
+    running = True
+
+    async def mlx_worker():  # Run inference back-to-back for up to ~5 s and report throughput.
+        count = 0  # bound before the try so the finally block can always read it
+        start_time = time.time()
+        try:
+            while running and (time.time() - start_time) < 5:  # Hard time limit
+                start = time.perf_counter_ns()
+                await engine.infer_prompt("req1", shard, "test prompt")
+                duration = (time.perf_counter_ns() - start) / 1_000_000  # Convert to ms
+                count += 1
+                print(f"MLX operation {count} took: {duration:.3f}ms")
+        except asyncio.CancelledError:
+            pass
+        finally:  # divide by the real elapsed time: the loop can stop early or overrun the 5 s budget
+            print(f"\nTotal MLX operations completed: {count}")
+            print(f"Average rate: {count/max(time.time() - start_time, 1e-9):.1f} ops/second")
+
+    async def latency_producer():
+        try:
+            start_time = time.perf_counter_ns()
+            count = 0
+            while running:
+                await queue.put(time.perf_counter_ns())
+                count += 1
+                await asyncio.sleep(0)  # Yield to event loop without delay
+            duration = (time.perf_counter_ns() - start_time) / 1e9  # Convert to seconds
+            print(f"\nProducer iterations: {count}")
+            print(f"Producer rate: {count/duration:.1f} iterations/second")
+        except asyncio.CancelledError:
+            pass
+
+    async def latency_consumer():
+        try:
+            while running:
+                timestamp = await queue.get()
+                latency = (time.perf_counter_ns() - timestamp) / 1_000_000  # Convert to ms
+                measurements.append(latency)
+                queue.task_done()
+        except asyncio.CancelledError:
+            pass
+
+    tasks = [
+        asyncio.create_task(mlx_worker()),
+        asyncio.create_task(latency_producer()),
+        asyncio.create_task(latency_consumer())
+    ]
+    
+    try:
+        await asyncio.wait_for(asyncio.gather(*tasks), timeout=6)
+    except asyncio.TimeoutError:
+        print("\nTest timed out")
+    finally:
+        running = False
+        for task in tasks:
+            task.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+        print(f"\nFinal measurement count: {len(measurements)}")
+
+if __name__ == "__main__":
+    asyncio.run(test_non_blocking())
--- a/exo/main.py
+++ b/exo/main.py
@@ -13,7 +13,6 @@ import uuid
 import numpy as np
 from functools import partial
 from tqdm import tqdm
-from tqdm.asyncio import tqdm_asyncio
 from exo.train.dataset import load_dataset, iterate_batches, compose
 from exo.networking.manual.manual_discovery import ManualDiscovery
 from exo.networking.manual.network_topology_config import NetworkTopology
@@ -33,6 +32,46 @@ from exo.inference.tokenizers import resolve_tokenizer
 from exo.models import build_base_shard, get_repo
 from exo.viz.topology_viz import TopologyViz
 from exo.download.hf.hf_helpers import has_hf_home_read_access, has_hf_home_write_access, get_hf_home, move_models_to_hf
+import uvloop
+from contextlib import asynccontextmanager
+import concurrent.futures
+import socket
+import resource
+import psutil
+
+# TODO: figure out why this is happening
+os.environ["GRPC_VERBOSITY"] = "error"
+os.environ["TRANSFORMERS_VERBOSITY"] = "error"
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+# Configure uvloop for maximum performance
+def configure_uvloop():
+  """Install uvloop, create and register a new event loop, raise the fd limit, and return the loop."""
+  # Make uvloop the event loop policy before any loop is created.
+  uvloop.install()
+  loop = asyncio.new_event_loop()
+  asyncio.set_event_loop(loop)
+
+  # Raise the soft open-file limit up to the hard limit (non-Windows only).
+  if not psutil.WINDOWS:
+    _, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+    try:
+      resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
+    except (ValueError, OSError):
+      # soft == hard was rejected; retry with a fixed soft limit.
+      try:
+        resource.setrlimit(resource.RLIMIT_NOFILE, (8192, hard))
+      except (ValueError, OSError):
+        pass  # best effort: keep the current limits
+
+  # Default executor used for blocking work handed to the loop.
+  loop.set_default_executor(
+    concurrent.futures.ThreadPoolExecutor(
+      max_workers=min(32, (os.cpu_count() or 1) * 4)
+    )
+  )
+
+  return loop

 # parse args
 parser = argparse.ArgumentParser(description="Initialize GRPC Discovery")
@@ -52,7 +91,6 @@ parser.add_argument("--models-seed-dir", type=str, default=None, help="Model see
 parser.add_argument("--listen-port", type=int, default=5678, help="Listening port for discovery")
 parser.add_argument("--download-quick-check", action="store_true", help="Quick check local path for model shards download")
 parser.add_argument("--max-parallel-downloads", type=int, default=4, help="Max parallel downloads for model shards download")
-parser.add_argument("--prometheus-client-port", type=int, default=None, help="Prometheus client port")
 parser.add_argument("--broadcast-port", type=int, default=5678, help="Broadcast port for discovery")
 parser.add_argument("--discovery-module", type=str, choices=["udp", "tailscale", "manual"], default="udp", help="Discovery module to use")
 parser.add_argument("--discovery-timeout", type=int, default=30, help="Discovery timeout in seconds")
@@ -69,6 +107,7 @@ parser.add_argument("--default-temp", type=float, help="Default token sampling t
 parser.add_argument("--tailscale-api-key", type=str, default=None, help="Tailscale API key")
 parser.add_argument("--tailnet-name", type=str, default=None, help="Tailnet name")
 parser.add_argument("--node-id-filter", type=str, default=None, help="Comma separated list of allowed node IDs (only for UDP and Tailscale discovery)")
+parser.add_argument("--interface-type-filter", type=str, default=None, help="Comma separated list of allowed interface types (only for UDP discovery)")
 parser.add_argument("--system-prompt", type=str, default=None, help="System prompt for the ChatGPT API")
 args = parser.parse_args()
 print(f"Selected inference engine: {args.inference_engine}")
@@ -101,8 +140,9 @@ if DEBUG >= 0:
  for chatgpt_api_endpoint in chatgpt_api_endpoints:
    print(f" - {terminal_link(chatgpt_api_endpoint)}")

-# Convert node-id-filter to list if provided
+# Convert node-id-filter and interface-type-filter to lists if provided
 allowed_node_ids = args.node_id_filter.split(',') if args.node_id_filter else None
+allowed_interface_types = args.interface_type_filter.split(',') if args.interface_type_filter else None

 if args.discovery_module == "udp":
  discovery = UDPDiscovery(
@@ -112,7 +152,8 @@ if args.discovery_module == "udp":
    args.broadcast_port,
    lambda peer_id, address, description, device_capabilities: GRPCPeerHandle(peer_id, address, description, device_capabilities),
    discovery_timeout=args.discovery_timeout,
-    allowed_node_ids=allowed_node_ids
+    allowed_node_ids=allowed_node_ids,
+    allowed_interface_types=allowed_interface_types
  )
 elif args.discovery_module == "tailscale":
  discovery = TailscaleDiscovery(
@@ -150,9 +191,16 @@ api = ChatGPTAPI(
  default_model=args.default_model,
  system_prompt=args.system_prompt
 )
-node.on_token.register("update_topology_viz").on_next(
-  lambda req_id, tokens, __: topology_viz.update_prompt_output(req_id, inference_engine.tokenizer.decode(tokens)) if topology_viz and hasattr(inference_engine, "tokenizer") and inference_engine.shard.model_id != 'stable-diffusion-2-1-base' else None
-)
+buffered_token_output = {}
+def update_topology_viz(req_id, tokens, __):
+  # on_token callback: accumulate the tokens seen for req_id and show the decoded text so far.
+  if not topology_viz: return
+  if not inference_engine.shard: return
+  if inference_engine.shard.model_id == 'stable-diffusion-2-1-base': return
+  # Extend a list owned by the buffer; storing `tokens` itself would alias and later mutate the emitter's list.
+  buffered_token_output.setdefault(req_id, []).extend(tokens)
+  topology_viz.update_prompt_output(req_id, inference_engine.tokenizer.decode(buffered_token_output[req_id]))
+node.on_token.register("update_topology_viz").on_next(update_topology_viz)

 def preemptively_start_download(request_id: str, opaque_status: str):
  try:
@@ -169,10 +217,6 @@ def preemptively_start_download(request_id: str, opaque_status: str):

 node.on_opaque_status.register("start_download").on_next(preemptively_start_download)

-if args.prometheus_client_port:
-  from exo.stats.metrics import start_metrics_server
-  start_metrics_server(node, args.prometheus_client_port)
-
 last_broadcast_time = 0


@@ -204,7 +248,11 @@ async def run_model_cli(node: Node, inference_engine: InferenceEngine, model_nam
    print(f"Processing prompt: {prompt}")
    await node.process_prompt(shard, prompt, request_id=request_id)

-    _, tokens, _ = await callback.wait(lambda _request_id, tokens, is_finished: _request_id == request_id and is_finished, timeout=300)
+    tokens = []
+    def on_token(_request_id, _tokens, _is_finished):
+      if _request_id == request_id: tokens.extend(_tokens)  # ignore tokens emitted for other requests
+      return _request_id == request_id and _is_finished
+    await callback.wait(on_token, timeout=300)

    print("\nGenerated response:")
    print(tokenizer.decode(tokens))
@@ -223,7 +271,7 @@ def clean_path(path):
 async def hold_outstanding(node: Node):
  while node.outstanding_requests:
    await asyncio.sleep(.5)
-  return 
+  return

 async def run_iter(node: Node, shard: Shard, train: bool, data, batch_size=1):
  losses = []
@@ -234,7 +282,7 @@ async def run_iter(node: Node, shard: Shard, train: bool, data, batch_size=1):
    tokens.append(np.sum(lengths))
  total_tokens = np.sum(tokens)
  total_loss = np.sum(losses) / total_tokens
-  
+
  return total_loss, total_tokens

 async def eval_model_cli(node: Node, inference_engine: InferenceEngine, model_name, dataloader, batch_size, num_batches=-1):
@@ -270,7 +318,7 @@ async def train_model_cli(node: Node, inference_engine: InferenceEngine, model_n
      await hold_outstanding(node)
  await hold_outstanding(node)

-  
+
 async def main():
  loop = asyncio.get_running_loop()

@@ -285,7 +333,7 @@ async def main():
          {"❌ No read access" if not has_read else ""}
          {"❌ No write access" if not has_write else ""}
          """)
-    
+
  if not args.models_seed_dir is None:
    try:
      models_seed_dir = clean_path(args.models_seed_dir)
@@ -330,29 +378,31 @@ async def main():
        print("Error: This train ain't leaving the station without a model")
        return
      await train_model_cli(node, inference_engine, model_name, dataloader, args.batch_size, args.iters, save_interval=args.save_every, checkpoint_dir=args.save_checkpoint_dir)
-    
+
  else:
    asyncio.create_task(api.run(port=args.chatgpt_api_port))  # Start the API server as a non-blocking task
    await asyncio.Event().wait()
-  
+
  if args.wait_for_peers > 0:
    print("Cooldown to allow peers to exit gracefully")
    for i in tqdm(range(50)):
      await asyncio.sleep(.1)

+@asynccontextmanager
+async def setup_node(args):
+  # Placeholder: no setup/teardown yet. The bare yield makes this an async generator, which @asynccontextmanager requires.
+  yield

 def run():
-  loop = asyncio.new_event_loop()
-  asyncio.set_event_loop(loop)
-  try:
-    loop.run_until_complete(main())
-      
-  except KeyboardInterrupt:
-    print("Received keyboard interrupt. Shutting down...")
-  finally:
-    loop.run_until_complete(shutdown(signal.SIGTERM, loop, node.server))
-    loop.close()
-
+    loop = None
+    try:
+        loop = configure_uvloop()
+        loop.run_until_complete(main())
+    except KeyboardInterrupt:
+        print("\nShutdown requested... exiting")
+    finally:
+        if loop:
+            loop.close()

 if __name__ == "__main__":
  run()
--- a/exo/networking/grpc/grpc_peer_handle.py
+++ b/exo/networking/grpc/grpc_peer_handle.py
@@ -12,7 +12,13 @@ from exo.topology.topology import Topology
 from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
 from exo.helpers import DEBUG
 import json
-import mlx.core as mx
+import platform
+
+if platform.system().lower() == "darwin" and platform.machine().lower() == "arm64":
+  import mlx.core as mx
+else:
+  import numpy as mx
+

 class GRPCPeerHandle(PeerHandle):
  def __init__(self, _id: str, address: str, desc: str, device_capabilities: DeviceCapabilities):
@@ -22,6 +28,19 @@ class GRPCPeerHandle(PeerHandle):
    self._device_capabilities = device_capabilities
    self.channel = None
    self.stub = None
+    self.channel_options = [
+      ("grpc.max_metadata_size", 64 * 1024 * 1024),
+      ("grpc.max_receive_message_length", 256 * 1024 * 1024),
+      ("grpc.max_send_message_length", 256 * 1024 * 1024),
+      ("grpc.max_concurrent_streams", 100),
+      ("grpc.http2.min_time_between_pings_ms", 10000),
+      ("grpc.keepalive_time_ms", 20000),
+      ("grpc.keepalive_timeout_ms", 10000),
+      ("grpc.keepalive_permit_without_calls", 1),
+      ("grpc.http2.max_pings_without_data", 0),
+      ("grpc.tcp_nodelay", 1),
+      ("grpc.optimization_target", "throughput"),
+    ]

  def id(self) -> str:
    return self._id
@@ -37,11 +56,11 @@ class GRPCPeerHandle(PeerHandle):

  async def connect(self):
    if self.channel is None:
-      self.channel = grpc.aio.insecure_channel(self.address, options=[
-        ("grpc.max_metadata_size", 32*1024*1024),
-        ('grpc.max_receive_message_length', 32*1024*1024),
-        ('grpc.max_send_message_length', 32*1024*1024)
-      ])
+      self.channel = grpc.aio.insecure_channel(
+        self.address,
+        options=self.channel_options,
+        compression=grpc.Compression.Gzip
+      )
      self.stub = node_service_pb2_grpc.NodeServiceStub(self.channel)
    await self.channel.channel_ready()

@@ -55,7 +74,13 @@ class GRPCPeerHandle(PeerHandle):
    self.stub = None

  async def _ensure_connected(self):
-    if not await self.is_connected(): await asyncio.wait_for(self.connect(), timeout=5)
+    if not await self.is_connected():
+      try:
+        await asyncio.wait_for(self.connect(), timeout=10.0)
+      except asyncio.TimeoutError:
+        if DEBUG >= 2: print(f"Connection timeout for {self._id}@{self.address}")
+        await self.disconnect()
+        raise

  async def health_check(self) -> bool:
    try:
@@ -84,12 +109,7 @@ class GRPCPeerHandle(PeerHandle):
      request_id=request_id,
      inference_state=None if inference_state is None else self.serialize_inference_state(inference_state)
    )
-    response = await self.stub.SendPrompt(request)
-
-    if not response.tensor_data or not response.shape or not response.dtype:
-      return None
-
-    return np.frombuffer(response.tensor_data, dtype=np.dtype(response.dtype)).reshape(response.shape)
+    await self.stub.SendPrompt(request)

  async def send_tensor(self, shard: Shard, tensor: np.ndarray, inference_state: Optional[dict] = None, request_id: Optional[str] = None) -> Optional[np.array]:
    request = node_service_pb2.TensorRequest(
@@ -109,7 +129,7 @@ class GRPCPeerHandle(PeerHandle):
      return None

    return np.frombuffer(response.tensor_data, dtype=np.dtype(response.dtype)).reshape(response.shape)
-  
+
  async def send_example(self, shard: Shard, example: np.ndarray, target: np.ndarray, length: np.ndarray, train: bool, request_id: Optional[str] = None) -> Optional[np.array]:
    request = node_service_pb2.ExampleRequest(
      shard=node_service_pb2.Shard(
@@ -131,7 +151,7 @@ class GRPCPeerHandle(PeerHandle):
      return loss, grads
    else:
      return loss
-  
+
  async def send_loss(self, shard: Shard, tensor: np.ndarray, request_id: Optional[str] = None) -> Optional[np.array]:
    request = node_service_pb2.TensorRequest(
      shard=node_service_pb2.Shard(
@@ -150,26 +170,13 @@ class GRPCPeerHandle(PeerHandle):

    return np.frombuffer(response.tensor_data, dtype=np.dtype(response.dtype)).reshape(response.shape)

-  async def get_inference_result(self, request_id: str) -> Tuple[Optional[np.ndarray], bool]:
-    request = node_service_pb2.GetInferenceResultRequest(request_id=request_id)
-    response = await self.stub.GetInferenceResult(request)
-    if response.tensor is None:
-      return None, response.is_finished
-    return (
-      np.frombuffer(response.tensor.tensor_data, dtype=np.dtype(response.tensor.dtype)).reshape(response.tensor.shape),
-      response.is_finished,
-    )
-
  async def collect_topology(self, visited: set[str], max_depth: int) -> Topology:
    request = node_service_pb2.CollectTopologyRequest(visited=visited, max_depth=max_depth)
    response = await self.stub.CollectTopology(request)
    topology = Topology()
    for node_id, capabilities in response.nodes.items():
      device_capabilities = DeviceCapabilities(
-        model=capabilities.model,
-        chip=capabilities.chip,
-        memory=capabilities.memory,
-        flops=DeviceFlops(fp16=capabilities.flops.fp16, fp32=capabilities.flops.fp32, int8=capabilities.flops.int8)
+        model=capabilities.model, chip=capabilities.chip, memory=capabilities.memory, flops=DeviceFlops(fp16=capabilities.flops.fp16, fp32=capabilities.flops.fp32, int8=capabilities.flops.int8)
      )
      topology.update_node(node_id, device_capabilities)
    for node_id, peer_connections in response.peer_graph.items():
@@ -193,28 +200,20 @@ class GRPCPeerHandle(PeerHandle):
    proto_inference_state = node_service_pb2.InferenceState()
    other_data = {}
    for k, v in inference_state.items():
-        if isinstance(v, mx.array):
-            np_array = np.array(v)
-            tensor_data = node_service_pb2.Tensor(
-                tensor_data=np_array.tobytes(),
-                shape=list(np_array.shape),
-                dtype=str(np_array.dtype)
-            )
-            proto_inference_state.tensor_data[k].CopyFrom(tensor_data)
-        elif isinstance(v, list) and all(isinstance(item, mx.array) for item in v):
-            tensor_list = node_service_pb2.TensorList()
-            for tensor in v:
-                np_array = np.array(tensor)
-                tensor_data = node_service_pb2.Tensor(
-                    tensor_data=np_array.tobytes(),
-                    shape=list(np_array.shape),
-                    dtype=str(np_array.dtype)
-                )
-                tensor_list.tensors.append(tensor_data)
-            proto_inference_state.tensor_list_data[k].CopyFrom(tensor_list)
-        else:
-            # For non-tensor data, we'll still use JSON
-            other_data[k] = v
+      if isinstance(v, mx.array):
+        np_array = np.array(v)
+        tensor_data = node_service_pb2.Tensor(tensor_data=np_array.tobytes(), shape=list(np_array.shape), dtype=str(np_array.dtype))
+        proto_inference_state.tensor_data[k].CopyFrom(tensor_data)
+      elif isinstance(v, list) and all(isinstance(item, mx.array) for item in v):
+        tensor_list = node_service_pb2.TensorList()
+        for tensor in v:
+          np_array = np.array(tensor)
+          tensor_data = node_service_pb2.Tensor(tensor_data=np_array.tobytes(), shape=list(np_array.shape), dtype=str(np_array.dtype))
+          tensor_list.tensors.append(tensor_data)
+        proto_inference_state.tensor_list_data[k].CopyFrom(tensor_list)
+      else:
+        # For non-tensor data, we'll still use JSON
+        other_data[k] = v
    if other_data:
      proto_inference_state.other_data_json = json.dumps(other_data)
    return proto_inference_state
--- a/exo/networking/grpc/grpc_server.py
+++ b/exo/networking/grpc/grpc_server.py
@@ -3,13 +3,19 @@ from concurrent import futures
 import numpy as np
 from asyncio import CancelledError

+import platform
+
 from . import node_service_pb2
 from . import node_service_pb2_grpc
 from exo import DEBUG
 from exo.inference.shard import Shard
 from exo.orchestration import Node
 import json
-import mlx.core as mx
+
+if platform.system().lower() == "darwin" and platform.machine().lower() == "arm64":
+  import mlx.core as mx
+else:
+  import numpy as mx


 class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
@@ -21,11 +27,19 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):

  async def start(self) -> None:
    self.server = grpc.aio.server(
-      futures.ThreadPoolExecutor(max_workers=10),
+      futures.ThreadPoolExecutor(max_workers=32),
      options=[
        ("grpc.max_metadata_size", 32*1024*1024),
-        ("grpc.max_send_message_length", 128*1024*1024),
-        ("grpc.max_receive_message_length", 128*1024*1024),
+        ("grpc.max_send_message_length", 256*1024*1024),
+        ("grpc.max_receive_message_length", 256*1024*1024),
+        ("grpc.keepalive_time_ms", 10000),
+        ("grpc.keepalive_timeout_ms", 5000),
+        ("grpc.http2.max_pings_without_data", 0),
+        ("grpc.http2.min_time_between_pings_ms", 10000),
+        ("grpc.http2.min_ping_interval_without_data_ms", 5000),
+        ("grpc.max_concurrent_streams", 100),
+        ("grpc.tcp_nodelay", 1),
+        ("grpc.optimization_target", "throughput"),
      ],
    )
    node_service_pb2_grpc.add_NodeServiceServicer_to_server(self, self.server)
@@ -74,7 +88,7 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
    if DEBUG >= 5: print(f"SendTensor tensor {shard=} {tensor=} {request_id=} result: {result}")
    tensor_data = result.tobytes() if result is not None else None
    return node_service_pb2.Tensor(tensor_data=tensor_data, shape=result.shape, dtype=str(result.dtype)) if result is not None else node_service_pb2.Tensor()
-  
+
  async def SendExample(self, request, context):
    shard = Shard(
      model_id=request.shard.model_id,
@@ -96,7 +110,7 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
    else:
      loss = await self.node.process_example(shard, example, target, length, train, request_id)
      return node_service_pb2.Loss(loss=loss, grads=None)
-    
+
  async def CollectTopology(self, request, context):
    max_depth = request.max_depth
    visited = set(request.visited)
@@ -112,12 +126,7 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
      for node_id, cap in topology.nodes.items()
    }
    peer_graph = {
-      node_id: node_service_pb2.PeerConnections(
-        connections=[
-          node_service_pb2.PeerConnection(to_id=conn.to_id, description=conn.description)
-          for conn in connections
-        ]
-      )
+      node_id: node_service_pb2.PeerConnections(connections=[node_service_pb2.PeerConnection(to_id=conn.to_id, description=conn.description) for conn in connections])
      for node_id, connections in topology.peer_graph.items()
    }
    if DEBUG >= 5: print(f"CollectTopology {max_depth=} {visited=} {nodes=} {peer_graph=}")
@@ -131,7 +140,7 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
    if DEBUG >= 5: print(f"Received SendResult request: {request_id=} {result=} {is_finished=}")
    result = list(result)
    if len(img.tensor_data) > 0:
-      result=np.frombuffer(img.tensor_data, dtype=np.dtype(img.dtype)).reshape(img.shape)
+      result = np.frombuffer(img.tensor_data, dtype=np.dtype(img.dtype)).reshape(img.shape)
    self.node.on_token.trigger_all(request_id, result, is_finished)
    return node_service_pb2.Empty()

@@ -145,21 +154,18 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
  async def HealthCheck(self, request, context):
    return node_service_pb2.HealthCheckResponse(is_healthy=True)

-  def deserialize_inference_state(self,inference_state_proto: node_service_pb2.InferenceState) -> dict:
+  def deserialize_inference_state(self, inference_state_proto: node_service_pb2.InferenceState) -> dict:
    inference_state = {}
-    
+
    for k, tensor_data in inference_state_proto.tensor_data.items():
-        np_array = np.frombuffer(tensor_data.tensor_data, dtype=tensor_data.dtype).reshape(tensor_data.shape)
-        inference_state[k] = mx.array(np_array)
-    
+      np_array = np.frombuffer(tensor_data.tensor_data, dtype=tensor_data.dtype).reshape(tensor_data.shape)
+      inference_state[k] = mx.array(np_array)
+
    for k, tensor_list in inference_state_proto.tensor_list_data.items():
-        inference_state[k] = [
-            mx.array(np.frombuffer(tensor.tensor_data, dtype=tensor.dtype).reshape(tensor.shape))
-            for tensor in tensor_list.tensors
-        ]
-    
+      inference_state[k] = [mx.array(np.frombuffer(tensor.tensor_data, dtype=tensor.dtype).reshape(tensor.shape)) for tensor in tensor_list.tensors]
+
    if inference_state_proto.other_data_json:
-        other_data = json.loads(inference_state_proto.other_data_json)
-        inference_state.update(other_data)
-    
+      other_data = json.loads(inference_state_proto.other_data_json)
+      inference_state.update(other_data)
+
    return inference_state
--- a/exo/networking/grpc/node_service.proto
+++ b/exo/networking/grpc/node_service.proto
@@ -6,7 +6,6 @@ service NodeService {
  rpc SendPrompt (PromptRequest) returns (Tensor) {}
  rpc SendTensor (TensorRequest) returns (Tensor) {}
  rpc SendExample (ExampleRequest) returns (Loss) {}
-  rpc GetInferenceResult (GetInferenceResultRequest) returns (InferenceResult) {}
  rpc CollectTopology (CollectTopologyRequest) returns (Topology) {}
  rpc SendResult (SendResultRequest) returns (Empty) {}
  rpc SendOpaqueStatus (SendOpaqueStatusRequest) returns (Empty) {}
@@ -47,15 +46,6 @@ message Loss {
  float loss = 1;
  optional Tensor grads = 2;
 }
-  
-message GetInferenceResultRequest {
-  string request_id = 1;
-}
-
-message InferenceResult {
-  optional Tensor tensor = 1;
-  bool is_finished = 2;
-}

 message Tensor {
  bytes tensor_data = 1;
--- a/exo/networking/grpc/node_service_pb2.py
+++ b/exo/networking/grpc/node_service_pb2.py
--- a/exo/networking/grpc/node_service_pb2_grpc.py
+++ b/exo/networking/grpc/node_service_pb2_grpc.py
@@ -3,7 +3,7 @@
 import grpc
 import warnings

-from exo.networking.grpc import node_service_pb2 as exo_dot_networking_dot_grpc_dot_node__service__pb2
+from . import node_service_pb2 as node__service__pb2

 GRPC_GENERATED_VERSION = '1.68.0'
 GRPC_VERSION = grpc.__version__
@@ -18,7 +18,7 @@ except ImportError:
 if _version_not_supported:
    raise RuntimeError(
        f'The grpc package installed is at version {GRPC_VERSION},'
-        + f' but the generated code in exo/networking/grpc/node_service_pb2_grpc.py depends on'
+        + f' but the generated code in node_service_pb2_grpc.py depends on'
        + f' grpcio>={GRPC_GENERATED_VERSION}.'
        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
@@ -36,43 +36,38 @@ class NodeServiceStub(object):
        """
        self.SendPrompt = channel.unary_unary(
                '/node_service.NodeService/SendPrompt',
-                request_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.PromptRequest.SerializeToString,
-                response_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Tensor.FromString,
+                request_serializer=node__service__pb2.PromptRequest.SerializeToString,
+                response_deserializer=node__service__pb2.Tensor.FromString,
                _registered_method=True)
        self.SendTensor = channel.unary_unary(
                '/node_service.NodeService/SendTensor',
-                request_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.TensorRequest.SerializeToString,
-                response_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Tensor.FromString,
+                request_serializer=node__service__pb2.TensorRequest.SerializeToString,
+                response_deserializer=node__service__pb2.Tensor.FromString,
                _registered_method=True)
        self.SendExample = channel.unary_unary(
                '/node_service.NodeService/SendExample',
-                request_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.ExampleRequest.SerializeToString,
-                response_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Loss.FromString,
-                _registered_method=True)
-        self.GetInferenceResult = channel.unary_unary(
-                '/node_service.NodeService/GetInferenceResult',
-                request_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.GetInferenceResultRequest.SerializeToString,
-                response_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.InferenceResult.FromString,
+                request_serializer=node__service__pb2.ExampleRequest.SerializeToString,
+                response_deserializer=node__service__pb2.Loss.FromString,
                _registered_method=True)
        self.CollectTopology = channel.unary_unary(
                '/node_service.NodeService/CollectTopology',
-                request_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.CollectTopologyRequest.SerializeToString,
-                response_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Topology.FromString,
+                request_serializer=node__service__pb2.CollectTopologyRequest.SerializeToString,
+                response_deserializer=node__service__pb2.Topology.FromString,
                _registered_method=True)
        self.SendResult = channel.unary_unary(
                '/node_service.NodeService/SendResult',
-                request_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.SendResultRequest.SerializeToString,
-                response_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Empty.FromString,
+                request_serializer=node__service__pb2.SendResultRequest.SerializeToString,
+                response_deserializer=node__service__pb2.Empty.FromString,
                _registered_method=True)
        self.SendOpaqueStatus = channel.unary_unary(
                '/node_service.NodeService/SendOpaqueStatus',
-                request_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.SendOpaqueStatusRequest.SerializeToString,
-                response_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Empty.FromString,
+                request_serializer=node__service__pb2.SendOpaqueStatusRequest.SerializeToString,
+                response_deserializer=node__service__pb2.Empty.FromString,
                _registered_method=True)
        self.HealthCheck = channel.unary_unary(
                '/node_service.NodeService/HealthCheck',
-                request_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.HealthCheckRequest.SerializeToString,
-                response_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.HealthCheckResponse.FromString,
+                request_serializer=node__service__pb2.HealthCheckRequest.SerializeToString,
+                response_deserializer=node__service__pb2.HealthCheckResponse.FromString,
                _registered_method=True)


@@ -97,12 +92,6 @@ class NodeServiceServicer(object):
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

-    def GetInferenceResult(self, request, context):
-        """Missing associated documentation comment in .proto file."""
-        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
-        context.set_details('Method not implemented!')
-        raise NotImplementedError('Method not implemented!')
-
    def CollectTopology(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
@@ -132,43 +121,38 @@ def add_NodeServiceServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'SendPrompt': grpc.unary_unary_rpc_method_handler(
                    servicer.SendPrompt,
-                    request_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.PromptRequest.FromString,
-                    response_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Tensor.SerializeToString,
+                    request_deserializer=node__service__pb2.PromptRequest.FromString,
+                    response_serializer=node__service__pb2.Tensor.SerializeToString,
            ),
            'SendTensor': grpc.unary_unary_rpc_method_handler(
                    servicer.SendTensor,
-                    request_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.TensorRequest.FromString,
-                    response_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Tensor.SerializeToString,
+                    request_deserializer=node__service__pb2.TensorRequest.FromString,
+                    response_serializer=node__service__pb2.Tensor.SerializeToString,
            ),
            'SendExample': grpc.unary_unary_rpc_method_handler(
                    servicer.SendExample,
-                    request_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.ExampleRequest.FromString,
-                    response_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Loss.SerializeToString,
-            ),
-            'GetInferenceResult': grpc.unary_unary_rpc_method_handler(
-                    servicer.GetInferenceResult,
-                    request_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.GetInferenceResultRequest.FromString,
-                    response_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.InferenceResult.SerializeToString,
+                    request_deserializer=node__service__pb2.ExampleRequest.FromString,
+                    response_serializer=node__service__pb2.Loss.SerializeToString,
            ),
            'CollectTopology': grpc.unary_unary_rpc_method_handler(
                    servicer.CollectTopology,
-                    request_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.CollectTopologyRequest.FromString,
-                    response_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Topology.SerializeToString,
+                    request_deserializer=node__service__pb2.CollectTopologyRequest.FromString,
+                    response_serializer=node__service__pb2.Topology.SerializeToString,
            ),
            'SendResult': grpc.unary_unary_rpc_method_handler(
                    servicer.SendResult,
-                    request_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.SendResultRequest.FromString,
-                    response_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Empty.SerializeToString,
+                    request_deserializer=node__service__pb2.SendResultRequest.FromString,
+                    response_serializer=node__service__pb2.Empty.SerializeToString,
            ),
            'SendOpaqueStatus': grpc.unary_unary_rpc_method_handler(
                    servicer.SendOpaqueStatus,
-                    request_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.SendOpaqueStatusRequest.FromString,
-                    response_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.Empty.SerializeToString,
+                    request_deserializer=node__service__pb2.SendOpaqueStatusRequest.FromString,
+                    response_serializer=node__service__pb2.Empty.SerializeToString,
            ),
            'HealthCheck': grpc.unary_unary_rpc_method_handler(
                    servicer.HealthCheck,
-                    request_deserializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.HealthCheckRequest.FromString,
-                    response_serializer=exo_dot_networking_dot_grpc_dot_node__service__pb2.HealthCheckResponse.SerializeToString,
+                    request_deserializer=node__service__pb2.HealthCheckRequest.FromString,
+                    response_serializer=node__service__pb2.HealthCheckResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
@@ -196,8 +180,8 @@ class NodeService(object):
            request,
            target,
            '/node_service.NodeService/SendPrompt',
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.PromptRequest.SerializeToString,
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.Tensor.FromString,
+            node__service__pb2.PromptRequest.SerializeToString,
+            node__service__pb2.Tensor.FromString,
            options,
            channel_credentials,
            insecure,
@@ -223,8 +207,8 @@ class NodeService(object):
            request,
            target,
            '/node_service.NodeService/SendTensor',
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.TensorRequest.SerializeToString,
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.Tensor.FromString,
+            node__service__pb2.TensorRequest.SerializeToString,
+            node__service__pb2.Tensor.FromString,
            options,
            channel_credentials,
            insecure,
@@ -250,35 +234,8 @@ class NodeService(object):
            request,
            target,
            '/node_service.NodeService/SendExample',
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.ExampleRequest.SerializeToString,
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.Loss.FromString,
-            options,
-            channel_credentials,
-            insecure,
-            call_credentials,
-            compression,
-            wait_for_ready,
-            timeout,
-            metadata,
-            _registered_method=True)
-
-    @staticmethod
-    def GetInferenceResult(request,
-            target,
-            options=(),
-            channel_credentials=None,
-            call_credentials=None,
-            insecure=False,
-            compression=None,
-            wait_for_ready=None,
-            timeout=None,
-            metadata=None):
-        return grpc.experimental.unary_unary(
-            request,
-            target,
-            '/node_service.NodeService/GetInferenceResult',
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.GetInferenceResultRequest.SerializeToString,
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.InferenceResult.FromString,
+            node__service__pb2.ExampleRequest.SerializeToString,
+            node__service__pb2.Loss.FromString,
            options,
            channel_credentials,
            insecure,
@@ -304,8 +261,8 @@ class NodeService(object):
            request,
            target,
            '/node_service.NodeService/CollectTopology',
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.CollectTopologyRequest.SerializeToString,
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.Topology.FromString,
+            node__service__pb2.CollectTopologyRequest.SerializeToString,
+            node__service__pb2.Topology.FromString,
            options,
            channel_credentials,
            insecure,
@@ -331,8 +288,8 @@ class NodeService(object):
            request,
            target,
            '/node_service.NodeService/SendResult',
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.SendResultRequest.SerializeToString,
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.Empty.FromString,
+            node__service__pb2.SendResultRequest.SerializeToString,
+            node__service__pb2.Empty.FromString,
            options,
            channel_credentials,
            insecure,
@@ -358,8 +315,8 @@ class NodeService(object):
            request,
            target,
            '/node_service.NodeService/SendOpaqueStatus',
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.SendOpaqueStatusRequest.SerializeToString,
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.Empty.FromString,
+            node__service__pb2.SendOpaqueStatusRequest.SerializeToString,
+            node__service__pb2.Empty.FromString,
            options,
            channel_credentials,
            insecure,
@@ -385,8 +342,8 @@ class NodeService(object):
            request,
            target,
            '/node_service.NodeService/HealthCheck',
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.HealthCheckRequest.SerializeToString,
-            exo_dot_networking_dot_grpc_dot_node__service__pb2.HealthCheckResponse.FromString,
+            node__service__pb2.HealthCheckRequest.SerializeToString,
+            node__service__pb2.HealthCheckResponse.FromString,
            options,
            channel_credentials,
            insecure,
--- a/exo/networking/manual/manual_discovery.py
+++ b/exo/networking/manual/manual_discovery.py
@@ -63,8 +63,7 @@ class ManualDiscovery(Discovery):
            print(f"{peer_id=} at {peer_config.address}:{peer_config.port} is not healthy. Removing.")
        except Exception as e:
          if DEBUG_DISCOVERY >= 2: print(f"Exception occured when attempting to add {peer_id=}: {e}")
-      self.known_peers = new_known_peers
-      await asyncio.sleep(1.0)
+      await asyncio.sleep(5.0)

      if DEBUG_DISCOVERY >= 2: print(f"Current known peers: {[peer.id() for peer in self.known_peers.values()]}")

--- a/exo/networking/peer_handle.py
+++ b/exo/networking/peer_handle.py
@@ -51,10 +51,6 @@ class PeerHandle(ABC):
  async def send_result(self, request_id: str, result: List[int], is_finished: bool) -> None:
    pass

-  @abstractmethod
-  async def get_inference_result(self, request_id: str) -> Tuple[Optional[np.ndarray], bool]:
-    pass
-
  @abstractmethod
  async def collect_topology(self, visited: set[str], max_depth: int) -> Topology:
    pass
--- a/exo/networking/tailscale/tailscale_discovery.py
+++ b/exo/networking/tailscale/tailscale_discovery.py
@@ -40,7 +40,7 @@ class TailscaleDiscovery(Discovery):
    self.update_task = None

  async def start(self):
-    self.device_capabilities = device_capabilities()
+    self.device_capabilities = await device_capabilities()
    self.discovery_task = asyncio.create_task(self.task_discover_peers())
    self.cleanup_task = asyncio.create_task(self.task_cleanup_peers())
    self.update_task = asyncio.create_task(self.task_update_device_posture_attributes())
--- a/exo/networking/udp/udp_discovery.py
+++ b/exo/networking/udp/udp_discovery.py
@@ -3,7 +3,7 @@ import json
 import socket
 import time
 import traceback
-from typing import List, Dict, Callable, Tuple, Coroutine
+from typing import List, Dict, Callable, Tuple, Coroutine, Optional
 from exo.networking.discovery import Discovery
 from exo.networking.peer_handle import PeerHandle
 from exo.topology.device_capabilities import DeviceCapabilities, device_capabilities, UNKNOWN_DEVICE_CAPABILITIES
@@ -23,15 +23,29 @@ class ListenProtocol(asyncio.DatagramProtocol):
    asyncio.create_task(self.on_message(data, addr))


+def get_broadcast_address(ip_addr: str) -> str:
+  """Return the broadcast address for the subnet that `ip_addr` belongs to.
+
+  The subnet is assumed to be a /24: the first three dot-separated parts of
+  `ip_addr` are kept and the last one is replaced with 255. Input that does
+  not have at least three dot-separated parts (or is not a string at all)
+  falls back to the global broadcast address "255.255.255.255".
+  """
+  try:
+    # Split IP into octets and create broadcast address for the subnet
+    ip_parts = ip_addr.split('.')
+    return f"{ip_parts[0]}.{ip_parts[1]}.{ip_parts[2]}.255"
+  except (AttributeError, IndexError):
+    # AttributeError: ip_addr is not a string; IndexError: fewer than three parts.
+    return "255.255.255.255"
+
+
 class BroadcastProtocol(asyncio.DatagramProtocol):
-  def __init__(self, message: str, broadcast_port: int):
+  def __init__(self, message: str, broadcast_port: int, source_ip: str):
    self.message = message
    self.broadcast_port = broadcast_port
+    self.source_ip = source_ip

  def connection_made(self, transport):
    sock = transport.get_extra_info("socket")
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
-    transport.sendto(self.message.encode("utf-8"), ("<broadcast>", self.broadcast_port))
+    # Try both subnet-specific and global broadcast
+    broadcast_addr = get_broadcast_address(self.source_ip)
+    transport.sendto(self.message.encode("utf-8"), (broadcast_addr, self.broadcast_port))
+    if broadcast_addr != "255.255.255.255":
+      transport.sendto(self.message.encode("utf-8"), ("255.255.255.255", self.broadcast_port))


 class UDPDiscovery(Discovery):
@@ -45,7 +59,8 @@ class UDPDiscovery(Discovery):
    broadcast_interval: int = 2.5,
    discovery_timeout: int = 30,
    device_capabilities: DeviceCapabilities = UNKNOWN_DEVICE_CAPABILITIES,
-    allowed_node_ids: List[str] = None,
+    allowed_node_ids: Optional[List[str]] = None,
+    allowed_interface_types: Optional[List[str]] = None,
  ):
    self.node_id = node_id
    self.node_port = node_port
@@ -56,13 +71,14 @@ class UDPDiscovery(Discovery):
    self.discovery_timeout = discovery_timeout
    self.device_capabilities = device_capabilities
    self.allowed_node_ids = allowed_node_ids
+    self.allowed_interface_types = allowed_interface_types
    self.known_peers: Dict[str, Tuple[PeerHandle, float, float, int]] = {}
    self.broadcast_task = None
    self.listen_task = None
    self.cleanup_task = None

  async def start(self):
-    self.device_capabilities = device_capabilities()
+    self.device_capabilities = await device_capabilities()
    self.broadcast_task = asyncio.create_task(self.task_broadcast_presence())
    self.listen_task = asyncio.create_task(self.task_listen_for_peers())
    self.cleanup_task = asyncio.create_task(self.task_cleanup_peers())
@@ -82,11 +98,7 @@ class UDPDiscovery(Discovery):
    return [peer_handle for peer_handle, _, _, _ in self.known_peers.values()]

  async def task_broadcast_presence(self):
-    if DEBUG_DISCOVERY >= 2: print("Starting task_broadcast_presence...")
-
    while True:
-      # Explicitly broadcasting on all assigned ips since broadcasting on `0.0.0.0` on MacOS does not broadcast over
-      # the Thunderbolt bridge when other connection modalities exist such as WiFi or Ethernet
      for addr, interface_name in get_all_ip_addresses_and_interfaces():
        interface_priority, interface_type = await get_interface_priority_and_type(interface_name)
        message = json.dumps({
@@ -94,16 +106,26 @@ class UDPDiscovery(Discovery):
          "node_id": self.node_id,
          "grpc_port": self.node_port,
          "device_capabilities": self.device_capabilities.to_dict(),
-          "priority": interface_priority, # TODO: Prioritise interfaces based on bandwidth, latency, and jitter e.g. prioritise Thunderbolt over WiFi.
+          "priority": interface_priority,
          "interface_name": interface_name,
          "interface_type": interface_type,
        })
-        if DEBUG_DISCOVERY >= 3: print(f"Broadcasting presence at ({addr} - {interface_name} - {interface_priority}): {message}")

        transport = None
        try:
-          transport, _ = await asyncio.get_event_loop().create_datagram_endpoint(lambda: BroadcastProtocol(message, self.broadcast_port), local_addr=(addr, 0), family=socket.AF_INET)
-          if DEBUG_DISCOVERY >= 3: print(f"Broadcasting presence at ({addr} - {interface_name} - {interface_priority})")
+          sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+          sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
+          sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+          try:
+            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
+          except AttributeError:
+            pass
+          sock.bind((addr, 0))
+          
+          transport, _ = await asyncio.get_event_loop().create_datagram_endpoint(
+            lambda: BroadcastProtocol(message, self.broadcast_port, addr),
+            sock=sock
+          )
        except Exception as e:
          print(f"Error in broadcast presence ({addr} - {interface_name} - {interface_priority}): {e}")
        finally:
@@ -111,7 +133,7 @@ class UDPDiscovery(Discovery):
            try: transport.close()
            except Exception as e:
              if DEBUG_DISCOVERY >= 2: print(f"Error closing transport: {e}")
-              if DEBUG_DISCOVERY >= 2: traceback.print_exc()
+
      await asyncio.sleep(self.broadcast_interval)

  async def on_listen_message(self, data, addr):
@@ -147,6 +169,12 @@ class UDPDiscovery(Discovery):
      peer_prio = message["priority"]
      peer_interface_name = message["interface_name"]
      peer_interface_type = message["interface_type"]
+
+      # Skip if interface type is not in allowed list
+      if self.allowed_interface_types and peer_interface_type not in self.allowed_interface_types:
+        if DEBUG_DISCOVERY >= 2: print(f"Ignoring peer {peer_id} as its interface type {peer_interface_type} is not in the allowed interface types list")
+        return
+
      device_capabilities = DeviceCapabilities(**message["device_capabilities"])

      if peer_id not in self.known_peers or self.known_peers[peer_id][0].addr() != f"{peer_host}:{peer_port}":
--- a/exo/orchestration/node.py
+++ b/exo/orchestration/node.py
@@ -8,7 +8,7 @@ from typing import List, Dict, Optional, Tuple, Union, Set
 from exo.networking import Discovery, PeerHandle, Server
 from exo.inference.inference_engine import InferenceEngine, Shard
 from exo.topology.topology import Topology
-from exo.topology.device_capabilities import device_capabilities
+from exo.topology.device_capabilities import device_capabilities, UNKNOWN_DEVICE_CAPABILITIES
 from exo.topology.partitioning_strategy import Partition, PartitioningStrategy, map_partitions_to_shards
 from exo import DEBUG
 from exo.helpers import AsyncCallbackSystem
@@ -37,7 +37,7 @@ class Node:
    self.partitioning_strategy = partitioning_strategy
    self.peers: List[PeerHandle] = {}
    self.topology: Topology = Topology()
-    self.device_capabilities = device_capabilities()
+    self.device_capabilities = UNKNOWN_DEVICE_CAPABILITIES
    self.buffered_token_output: Dict[str, Tuple[List[int], bool]] = {}
    self.buffered_logits: Dict[str, List[np.ndarray]] = {}
    self.buffered_inputs: Dict[str, List[np.ndarray]] = {}
@@ -56,6 +56,7 @@ class Node:
    self.outstanding_requests = {}

  async def start(self, wait_for_peers: int = 0) -> None:
+    self.device_capabilities = await device_capabilities()
    await self.server.start()
    await self.discovery.start()
    await self.update_peers(wait_for_peers)
@@ -70,25 +71,28 @@ class Node:
  def on_node_status(self, request_id, opaque_status):
    try:
      status_data = json.loads(opaque_status)
-      if status_data.get("type", "") == "supported_inference_engines":
+      status_type = status_data.get("type", "")
+      if status_type == "supported_inference_engines":
        node_id = status_data.get("node_id")
        engines = status_data.get("engines", [])
        self.topology_inference_engines_pool.append(engines)
-      if status_data.get("type", "") == "node_status":
+      elif status_type == "node_status":
        if status_data.get("status", "").startswith("start_"):
          self.current_topology.active_node_id = status_data.get("node_id")
        elif status_data.get("status", "").startswith("end_"):
          if status_data.get("node_id") == self.current_topology.active_node_id:
            self.current_topology.active_node_id = None
+
      download_progress = None
-      if status_data.get("type", "") == "download_progress":
+      if status_type == "download_progress":
        if DEBUG >= 8: print(f"Download progress from {status_data.get('node_id')}: {status_data.get('progress')}")
        download_progress = RepoProgressEvent.from_dict(status_data.get('progress'))
        self.node_download_progress[status_data.get('node_id')] = download_progress
+
      if self.topology_viz:
        self.topology_viz.update_visualization(self.topology, self.partitioning_strategy.partition(self.topology), self.id, self.node_download_progress)
    except Exception as e:
-      if DEBUG >= 1: print(f"Error updating visualization: {e}")
+      if DEBUG >= 1: print(f"Error on_node_status: {e}")
      if DEBUG >= 1: traceback.print_exc()

  def get_supported_inference_engines(self):
@@ -107,6 +111,8 @@ class Node:
  def get_topology_inference_engines(self) -> List[List[str]]:
    return self.topology_inference_engines_pool
  
+  token_count = 0
+  first_token_time = 0
  async def process_inference_result(
    self,
    shard,
@@ -124,9 +130,8 @@ class Node:
        self.buffered_token_output[request_id][0].append(token.item())
        is_finished = token.item() == self.inference_engine.tokenizer.eos_token_id or is_finished or len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
        if DEBUG >= 2: print(f"[{request_id}] result size: {result.size}, is finished: {is_finished}, buffered tokens: {len(self.buffered_token_output[request_id][0])}")
-        asyncio.create_task(self.broadcast_result(request_id, *self.buffered_token_output[request_id]))
        forward = token.reshape(1, -1)
-        intermediate_result = self.buffered_token_output[request_id][0]
+        intermediate_result = [self.buffered_token_output[request_id][0][-1]]
      else:
        forward = result
    else:
@@ -157,6 +162,7 @@ class Node:
    inference_state: Optional[dict] = {},
  ) -> Optional[np.ndarray]:
    shard = self.get_current_shard(base_shard)
+    start_time = time.perf_counter_ns()
    asyncio.create_task(
      self.broadcast_opaque_status(
        request_id,
@@ -187,18 +193,17 @@ class Node:
          "prompt": prompt,
          "request_id": request_id,
          "elapsed_time_ns": elapsed_time_ns,
-          "result_size": resp.size if resp is not None else 0,
        }),
      )
    )
-    return resp
+    if DEBUG >= 2: print(f"[{request_id}] process prompt: {base_shard=} {shard=} {prompt=} {elapsed_time_ns=}")

  async def _process_prompt(self, base_shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[dict] = None) -> Optional[np.ndarray]:
    if request_id is None:
      request_id = str(uuid.uuid4())
    shard = self.get_current_shard(base_shard)
-
    if DEBUG >= 2: print(f"[{request_id}] process prompt: {base_shard=} {shard=} {prompt=}")
+
    if not shard.is_first_layer():
      if DEBUG >= 2: print(f"[{request_id}] forwarding to next shard: {base_shard=} {shard=} {prompt=}")
      self.outstanding_requests[request_id] = "waiting"
@@ -355,41 +360,11 @@ class Node:
    inference_state: Optional[dict] = None,
  ) -> Optional[np.ndarray]:
    shard = self.get_current_shard(base_shard)
-    asyncio.create_task(
-      self.broadcast_opaque_status(
-        request_id,
-        json.dumps({
-          "type": "node_status",
-          "node_id": self.id,
-          "status": "start_process_tensor",
-          "base_shard": base_shard.to_dict(),
-          "shard": shard.to_dict(),
-          "tensor_size": tensor.size,
-          "tensor_shape": tensor.shape,
-          "request_id": request_id,
-        }),
-      )
-    )
    start_time = time.perf_counter_ns()
    resp = await self._process_tensor(shard, tensor, request_id, inference_state)
    end_time = time.perf_counter_ns()
    elapsed_time_ns = end_time - start_time
-    asyncio.create_task(
-      self.broadcast_opaque_status(
-        request_id,
-        json.dumps({
-          "type": "node_status",
-          "node_id": self.id,
-          "status": "end_process_tensor",
-          "base_shard": base_shard.to_dict(),
-          "shard": shard.to_dict(),
-          "request_id": request_id,
-          "elapsed_time_ns": elapsed_time_ns,
-          "result_size": resp.size if resp is not None else 0,
-        }),
-      )
-    )
-    return resp
+    if DEBUG >= 2: print(f"[{request_id}] process_tensor: {base_shard=} {shard=} {tensor.size=} {tensor.shape=} {elapsed_time_ns=}")

  async def _process_tensor(
    self,
@@ -402,7 +377,6 @@ class Node:
      request_id = str(uuid.uuid4())
    shard = self.get_current_shard(base_shard)

-    if DEBUG >= 1: print(f"[{request_id}] process_tensor: {tensor.size=} {tensor.shape=}")
    try:
      self.outstanding_requests[request_id] = "processing"
      result, inference_state = await self.inference_engine.infer_tensor(request_id, shard, tensor, inference_state)
@@ -412,7 +386,6 @@ class Node:
      self.outstanding_requests.pop(request_id)
      print(f"Error processing tensor for shard {shard}: {e}")
      traceback.print_exc()
-      return None
  
  async def forward_example(
    self,
@@ -558,18 +531,13 @@ class Node:
      try:
        did_peers_change = await self.update_peers()
        if DEBUG >= 2: print(f"{did_peers_change=}")
+        await self.collect_topology(set())
        if did_peers_change:
-          await self.collect_topology(set())
          await self.select_best_inference_engine()
      except Exception as e:
        print(f"Error collecting topology: {e}")
        traceback.print_exc()

-  async def get_inference_result(self, request_id: str) -> Tuple[Optional[np.ndarray], bool]:
-    if request_id not in self.buffered_token_output:
-      return None, False
-    return np.array(self.buffered_token_output[request_id][0]), self.buffered_token_output[request_id][1]
-
  async def collect_topology(self, visited: set[str], max_depth: int = 4) -> Topology:
    next_topology = Topology()
    next_topology.update_node(self.id, self.device_capabilities)
@@ -614,7 +582,7 @@ class Node:
    return self._on_opaque_status

  def trigger_on_token_callbacks(self, request_id: str, tokens: List[int], is_finished: bool) -> None:
-    if DEBUG >= 2: print(f"Triggering all on_token callbacks with {request_id=} num_tokens={len(tokens)} {is_finished=}")
+    if DEBUG >= 2: print(f"Triggering all on_token callbacks with {request_id=} {tokens=} {is_finished=}")
    self.on_token.trigger_all(request_id, tokens, is_finished)
  
  async def broadcast_result(self, request_id: str, result: List[int], is_finished: bool) -> None:
--- a/exo/orchestration/test_node.py
+++ b/exo/orchestration/test_node.py
@@ -1,6 +1,7 @@
 import unittest
 from unittest.mock import Mock, AsyncMock
 import numpy as np
+import pytest

 from .node import Node
 from exo.networking.peer_handle import PeerHandle
@@ -55,3 +56,11 @@ class TestNode(unittest.IsolatedAsyncioTestCase):
    await self.node.process_tensor(input_tensor, None)

    self.node.inference_engine.process_shard.assert_called_once_with(input_tensor)
+
+  @pytest.mark.asyncio
+  async def test_node_capabilities(self):
+    """Check that a freshly initialised node reports a non-empty device model."""
+    # `self` is required: this is a method of TestNode, and the test runner
+    # invokes it as a bound method.
+    # NOTE(review): Node() is constructed without arguments, and initialize() /
+    # get_device_capabilities() are not visible in node.py here - confirm they exist.
+    node = Node()
+    await node.initialize()
+    caps = await node.get_device_capabilities()
+    assert caps is not None
+    assert caps.model != ""
--- a/exo/orchestration/tracing.py
+++ b/exo/orchestration/tracing.py
@@ -0,0 +1,166 @@
+from dataclasses import dataclass
+from typing import Dict, Optional, Any
+from opentelemetry import trace, context
+from opentelemetry.trace import Status, StatusCode, SpanContext
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+from contextlib import contextmanager
+import time
+from threading import Lock
+
+@dataclass
+class TraceContext:
+  """Mutable tracing state kept for a single request."""
+  request_id: str
+  sequence_number: int
+  # Span currently considered active for this request; Tracer.start_span swaps
+  # it in while its body runs and restores the previous value afterwards.
+  current_span: Optional[trace.Span] = None
+  # Serialized "traceparent" value used to rebuild the parent context.
+  trace_parent: Optional[str] = None
+  # Open span for the token group being filled, or None between groups.
+  token_group_span: Optional[trace.Span] = None
+  # Number of tokens passed to Tracer.handle_token so far.
+  token_count: int = 0
+  token_group_size: int = 10  # Default group size
+  request_span: Optional[trace.Span] = None  # Track the main request span
+
+class Tracer:
+  """Span bookkeeping for requests, keyed by request id.
+
+  Stores one TraceContext per request in `self.contexts` (guarded by a lock)
+  and provides helpers to serialize/restore a "traceparent" value, to group
+  generated tokens into spans, and to open child spans under a request.
+  """
+
+  def __init__(self):
+    self.tracer = trace.get_tracer("exo")
+    self.contexts: Dict[str, TraceContext] = {}
+    # Guards every read and write of self.contexts.
+    self._lock = Lock()
+    self.propagator = TraceContextTextMapPropagator()
+
+  def get_context(self, request_id: str) -> Optional[TraceContext]:
+    """Return the stored context for `request_id`, or None if there is none."""
+    with self._lock:
+      return self.contexts.get(request_id)
+
+  def set_context(self, request_id: str, context: TraceContext):
+    """Store `context` for `request_id`, replacing any previous entry."""
+    with self._lock:
+      self.contexts[request_id] = context
+
+  def inject_context(self, span: trace.Span) -> str:
+    """Serialize `span`'s context and return its "traceparent" value.
+
+    Returns an empty string when the propagator wrote no "traceparent" entry.
+    """
+    carrier = {}
+    ctx = trace.set_span_in_context(span)
+    self.propagator.inject(carrier, context=ctx)
+    return carrier.get("traceparent", "")
+
+  def extract_context(self, trace_parent: str) -> Optional[context.Context]:
+    """Rebuild a context from a "traceparent" value; None for an empty value."""
+    if not trace_parent:
+      return None
+    carrier = {"traceparent": trace_parent}
+    return self.propagator.extract(carrier)
+
+  def create_context_from_parent(self, request_id: str, trace_parent: str, sequence_number: int = 0) -> TraceContext:
+    """Create a new TraceContext whose request span hangs off `trace_parent`.
+
+    When no usable parent context can be extracted, the returned TraceContext
+    carries only the request id and sequence number (no spans).
+    """
+    parent_ctx = self.extract_context(trace_parent)
+    if not parent_ctx:
+      return TraceContext(request_id=request_id, sequence_number=sequence_number)
+
+    # Create a new request span that links to the parent context
+    request_span = self.tracer.start_span(
+      "request",
+      context=parent_ctx,
+      attributes={
+        "request_id": request_id,
+        "sequence_number": sequence_number
+      }
+    )
+    return TraceContext(
+      request_id=request_id,
+      sequence_number=sequence_number,
+      request_span=request_span,
+      current_span=request_span,
+      trace_parent=trace_parent
+    )
+
+  def handle_token(self, context: TraceContext, token: int, is_finished: bool = False):
+    """Record one generated token and manage the token group spans.
+
+    Tokens are attached as attributes to a "token_group_<n>" span that is a
+    child of the request span. A group span is closed once it holds
+    `context.token_group_size` tokens or when `is_finished` is true. Without a
+    request span only the token counter is advanced.
+    """
+    context.token_count += 1
+
+    # Start a new token group span if needed
+    if not context.token_group_span and context.request_span:
+      group_number = (context.token_count - 1) // context.token_group_size + 1
+
+      # Create token group span as child of request span
+      parent_ctx = trace.set_span_in_context(context.request_span)
+      context.token_group_span = self.tracer.start_span(
+        f"token_group_{group_number}",
+        context=parent_ctx,
+        attributes={
+          "request_id": context.request_id,
+          "group.number": group_number,
+          "group.start_token": context.token_count,
+          "group.max_tokens": context.token_group_size
+        }
+      )
+
+    # Add token to current group span
+    if context.token_group_span:
+      # 1-based position of this token inside its group.
+      relative_pos = ((context.token_count - 1) % context.token_group_size) + 1
+      context.token_group_span.set_attribute(f"token.{relative_pos}", token)
+      context.token_group_span.set_attribute("token.count", relative_pos)
+
+      # End current group span if we've reached the group size or if generation is finished
+      if context.token_count % context.token_group_size == 0 or is_finished:
+        context.token_group_span.set_attribute("token.final_count", relative_pos)
+        context.token_group_span.end()
+        context.token_group_span = None
+
+  @contextmanager
+  def start_span(self, name: str, context: TraceContext, extra_attributes: Optional[Dict[str, Any]] = None):
+    """Open span `name` for the duration of the `with` body and yield it.
+
+    Parent selection, in order: the request span; otherwise a context rebuilt
+    from `context.trace_parent` (creating and storing a request span when one
+    can be rebuilt); otherwise `context.current_span`. While the body runs the
+    new span is `context.current_span`; the previous value is restored on exit.
+
+    The span gets status OK on normal exit, or status ERROR (with the exception
+    text) when the body raises an Exception, which is re-raised. `duration_s`
+    is recorded and the span is ended in both cases.
+    """
+    attributes = {
+      "request_id": context.request_id,
+      "sequence_number": context.sequence_number
+    }
+    if extra_attributes:
+      attributes.update(extra_attributes)
+
+    # Use request span as parent if available
+    parent_ctx = None
+    if context.request_span:
+      parent_ctx = trace.set_span_in_context(context.request_span)
+    elif context.trace_parent:
+      parent_ctx = self.extract_context(context.trace_parent)
+      # request_span is known to be unset in this branch, so only the
+      # extraction result needs checking.
+      if parent_ctx:
+        # Create a new request span that links to the parent context
+        context.request_span = self.tracer.start_span(
+          "request",
+          context=parent_ctx,
+          attributes={
+            "request_id": context.request_id,
+            "sequence_number": context.sequence_number
+          }
+        )
+        parent_ctx = trace.set_span_in_context(context.request_span)
+    elif context.current_span:
+      parent_ctx = trace.set_span_in_context(context.current_span)
+
+    # Create span with parent context if it exists; otherwise let the tracer
+    # pick its default parent by not passing a context at all.
+    if parent_ctx:
+      span = self.tracer.start_span(
+        name,
+        context=parent_ctx,
+        attributes=attributes
+      )
+    else:
+      span = self.tracer.start_span(
+        name,
+        attributes=attributes
+      )
+
+    # Update context with current span
+    prev_span = context.current_span
+    context.current_span = span
+
+    start_time = time.perf_counter()
+    try:
+      yield span
+      span.set_status(Status(StatusCode.OK))
+    except Exception as e:
+      span.set_status(Status(StatusCode.ERROR, str(e)))
+      raise
+    finally:
+      # Record the duration on failures too, before the span is closed.
+      span.set_attribute("duration_s", time.perf_counter() - start_time)
+      span.end()
+      context.current_span = prev_span
+
+# Global tracer instance
+tracer = Tracer() 
--- a/exo/stats/init.py
+++ b/exo/stats/init.py
--- a/exo/stats/docker-compose-stats.yml
+++ b/exo/stats/docker-compose-stats.yml
@@ -1,27 +0,0 @@
-version: '3.8'
-
-services:
-  prometheus:
-    image: prom/prometheus:latest
-    container_name: prometheus
-    volumes:
-      - ./prometheus.yml:/etc/prometheus/prometheus.yml
-    command:
-      - '--config.file=/etc/prometheus/prometheus.yml'
-    ports:
-      - "9090:9090"
-    networks:
-      - monitoring
-
-  grafana:
-    image: grafana/grafana:latest
-    container_name: grafana
-    ports:
-      - "3000:3000"
-    networks:
-      - monitoring
-    depends_on:
-      - prometheus
-
-networks:
-  monitoring:
--- a/exo/stats/metrics.py
+++ b/exo/stats/metrics.py
@@ -1,29 +0,0 @@
-from exo.orchestration import Node
-from prometheus_client import start_http_server, Counter, Histogram
-import json
-
-# Create metrics to track time spent and requests made.
-PROCESS_PROMPT_COUNTER = Counter("process_prompt_total", "Total number of prompts processed", ["node_id"])
-PROCESS_TENSOR_COUNTER = Counter("process_tensor_total", "Total number of tensors processed", ["node_id"])
-PROCESS_TENSOR_TIME = Histogram("process_tensor_seconds", "Time spent processing tensor", ["node_id"])
-
-
-def start_metrics_server(node: Node, port: int):
-  start_http_server(port)
-
-  def _on_opaque_status(request_id, opaque_status: str):
-    status_data = json.loads(opaque_status)
-    _type = status_data.get("type", "")
-    node_id = status_data.get("node_id", "")
-    if _type != "node_status":
-      return
-    status = status_data.get("status", "")
-
-    if status == "end_process_prompt":
-      PROCESS_PROMPT_COUNTER.labels(node_id=node_id).inc()
-    elif status == "end_process_tensor":
-      elapsed_time_ns = status_data.get("elapsed_time_ns", 0)
-      PROCESS_TENSOR_COUNTER.labels(node_id=node_id).inc()
-      PROCESS_TENSOR_TIME.labels(node_id=node_id).observe(elapsed_time_ns/1e9)  # Convert ns to seconds
-
-  node.on_opaque_status.register("stats").on_next(_on_opaque_status)
--- a/exo/stats/prometheus.yml
+++ b/exo/stats/prometheus.yml
@@ -1,7 +0,0 @@
-global:
-  scrape_interval: 15s
-
-scrape_configs:
-  - job_name: 'exo-node'
-    static_configs:
-      - targets: ['host.docker.internal:8005']
--- a/exo/tinychat/index.css
+++ b/exo/tinychat/index.css
@@ -654,4 +654,92 @@ main {

 .model-download-button i {
  font-size: 0.9em;
+}
+
+/* ---- Network topology panel ---- */
+
+/* Outer card wrapping the topology heading and visualization. */
+.topology-section {
+  margin-bottom: 30px;
+  padding: 15px;
+  background: rgba(255, 255, 255, 0.05);
+  border-radius: 8px;
+}
+
+/* Container that the topology node list is rendered into. */
+.topology-visualization {
+  min-height: 150px;
+  position: relative;
+  margin-top: 10px;
+}
+
+/* Spinner + label row shown while no topology is available. */
+.topology-loading {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+  color: #666;
+  font-size: 0.9em;
+}
+
+/* One card per node: info row, detail rows and peer list stacked vertically. */
+.topology-node {
+  padding: 8px;
+  background: rgba(255, 255, 255, 0.05);
+  border-radius: 4px;
+  margin: 4px 0;
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+}
+
+/* Header row of a node card: status dot followed by the node label. */
+.node-info {
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  font-size: 0.9em;
+}
+
+/* Small round status indicator; colour comes from the .active/.inactive modifiers below. */
+.topology-node .status {
+  width: 6px;
+  height: 6px;
+  border-radius: 50%;
+  flex-shrink: 0;
+}
+
+/* Green dot for the active node. */
+.topology-node .status.active {
+  background: #4CAF50;
+}
+
+/* Grey dot for every other node. */
+.topology-node .status.inactive {
+  background: #666;
+}
+
+/* De-emphasised, indented column of per-node details. */
+.node-details {
+  padding-left: 12px;
+  display: flex;
+  flex-direction: column;
+  gap: 2px;
+  font-size: 0.8em;
+  opacity: 0.6;
+}
+
+.node-details span {
+  display: flex;
+  align-items: center;
+}
+
+/* Indented list of a node's peer connections. */
+.peer-connections {
+  margin-top: 8px;
+  padding-left: 12px;
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+}
+
+/* Single peer row: icon followed by the connection text. */
+.peer-connection {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-size: 0.85em;
+  color: #a0a0a0;
+}
+
+/* Icon inside a peer row. */
+.peer-connection i {
+  font-size: 0.8em;
+  color: #666;
 }
--- a/exo/tinychat/index.html
+++ b/exo/tinychat/index.html
@@ -26,21 +26,36 @@
 <body>
 <main x-data="state" x-init="console.log(endpoint)">
  <div class="sidebar">
+    <!-- Network topology section -->
+    <div class="topology-section">
+      <h2 class="megrim-regular">Network Topology</h2>
+      <div class="topology-visualization"
+           x-init="initTopology()"
+           x-ref="topologyViz">
+        <!-- Loading indicator for topology -->
+        <div class="topology-loading" x-show="!topology">
+          <i class="fas fa-spinner fa-spin"></i>
+          <span>Loading topology...</span>
+        </div>
+        <!-- Topology visualization will be rendered here -->
+      </div>
+    </div>
+
    <h2 class="megrim-regular" style="margin-bottom: 20px;">Models</h2>
-    
+
    <!-- Loading indicator -->
    <div class="loading-container" x-show="Object.keys(models).length === 0">
        <i class="fas fa-spinner fa-spin"></i>
        <span>Loading models...</span>
    </div>
-    
+
    <template x-for="(model, key) in models" :key="key">
-        <div class="model-option" 
+        <div class="model-option"
             :class="{ 'selected': cstate.selectedModel === key }"
             @click="cstate.selectedModel = key">
            <div class="model-header">
                <div class="model-name" x-text="model.name"></div>
-                <button 
+                <button
                    @click.stop="deleteModel(key, model)"
                    class="model-delete-button"
                    x-show="model.download_percentage > 0">
@@ -56,7 +71,7 @@
                        <template x-if="!model.loading && model.download_percentage != null">
                            <span>
                                <!-- Check if there's an active download for this model -->
-                                <template x-if="downloadProgress?.some(p => 
+                                <template x-if="downloadProgress?.some(p =>
                                    p.repo_id && p.repo_id.toLowerCase().includes(key.toLowerCase()) && !p.isComplete
                                )">
                                    <i class="fas fa-circle-notch fa-spin"></i>
@@ -65,7 +80,7 @@
                            </span>
                        </template>
                        <template x-if="!model.loading && (model.download_percentage === null || model.download_percentage < 100) && !downloadProgress?.some(p => !p.isComplete)">
-                            <button 
+                            <button
                                @click.stop="handleDownload(key)"
                                class="model-download-button">
                                <i class="fas fa-download"></i>
@@ -75,22 +90,22 @@
                    </div>
                </div>
                <template x-if="model.total_size">
-                    <div class="model-size" x-text="model.total_downloaded ? 
-                        `${formatBytes(model.total_downloaded)} / ${formatBytes(model.total_size)}` : 
+                    <div class="model-size" x-text="model.total_downloaded ?
+                        `${formatBytes(model.total_downloaded)} / ${formatBytes(model.total_size)}` :
                        formatBytes(model.total_size)">
                    </div>
                </template>
            </div>
        </div>
    </template>
-  </div> 
+  </div>
    <!-- Error Toast -->
    <div x-show="errorMessage !== null" x-transition.opacity class="toast">
        <div class="toast-header">
            <span class="toast-error-message" x-text="errorMessage?.basic || ''"></span>
            <div class="toast-header-buttons">
-                <button @click="errorExpanded = !errorExpanded; if (errorTimeout) { clearTimeout(errorTimeout); errorTimeout = null; }" 
-                        class="toast-expand-button" 
+                <button @click="errorExpanded = !errorExpanded; if (errorTimeout) { clearTimeout(errorTimeout); errorTimeout = null; }"
+                        class="toast-expand-button"
                        x-show="errorMessage?.stack">
                    <span x-text="errorExpanded ? 'Hide Details' : 'Show Details'"></span>
                </button>
@@ -119,8 +134,8 @@
    " x-show="home === 0" x-transition="">
 <h1 class="title megrim-regular">tinychat</h1>
 <template x-if="histories.length">
-  <button 
-    @click="if(confirm('Are you sure you want to clear all history?')) clearAllHistory();" 
+  <button
+    @click="if(confirm('Are you sure you want to clear all history?')) clearAllHistory();"
    class="clear-history-button">
    <i class="fas fa-trash"></i> Clear All History
  </button>
@@ -162,14 +177,14 @@
 </template>
 </div>
 </div>
-<button 
+<button
    @click="
        home = 0;
        cstate = { time: null, messages: [], selectedModel: cstate.selectedModel };
        time_till_first = 0;
        tokens_per_second = 0;
        total_tokens = 0;
-    " 
+    "
    class="back-button"
    x-show="home === 2">
    <i class="fas fa-arrow-left"></i>
@@ -250,7 +265,7 @@
        <p><strong>Model:</strong> <span x-text="progress.repo_id + '@' + progress.repo_revision"></span></p>
        <p><strong>Status:</strong> <span x-text="progress.status"></span></p>
        <div class="progress-bar-container">
-          <div class="progress-bar" 
+          <div class="progress-bar"
               :class="progress.isComplete ? 'complete' : 'in-progress'"
               :style="`width: ${progress.percentage}%;`">
          </div>
@@ -294,10 +309,10 @@
 <i class="fas fa-times"></i>
 </button>
 </div>
-<textarea 
-    :disabled="generating || (downloadProgress?.length > 0 && downloadProgress.some(p => !p.isComplete))" 
+<textarea
+    :disabled="generating || (downloadProgress?.length > 0 && downloadProgress.some(p => !p.isComplete))"
    :placeholder="
-        generating ? 'Generating...' : 
+        generating ? 'Generating...' :
        (downloadProgress?.length > 0 && downloadProgress.some(p => !p.isComplete)) ? 'Download in progress...' :
        'Say something'
    "
@@ -329,9 +344,9 @@
        });
    "
    x-ref="inputForm"></textarea>
-<button 
-    :disabled="generating || (downloadProgress?.length > 0 && downloadProgress.some(p => !p.isComplete))" 
-    @click="await handleSend()" 
+<button
+    :disabled="generating || (downloadProgress?.length > 0 && downloadProgress.some(p => !p.isComplete))"
+    @click="await handleSend()"
    class="input-button">
    <i :class="generating ? 'fa-spinner fa-spin' : 'fa-paper-plane'" class="fas"></i>
 </button>
--- a/exo/tinychat/index.js
+++ b/exo/tinychat/index.js
@@ -5,7 +5,7 @@ document.addEventListener("alpine:init", () => {
      time: null,
      messages: [],
      selectedModel: 'llama-3.2-1b',
-    },    
+    },

    // historical state
    histories: JSON.parse(localStorage.getItem("histories")) || [],
@@ -13,7 +13,7 @@ document.addEventListener("alpine:init", () => {
    home: 0,
    generating: false,
    endpoint: `${window.location.origin}/v1`,
-    
+
    // Initialize error message structure
    errorMessage: null,
    errorExpanded: false,
@@ -39,6 +39,9 @@ document.addEventListener("alpine:init", () => {
    // Add models state alongside existing state
    models: {},

+    topology: null,
+    topologyInterval: null,
+
    init() {
      // Clean up any pending messages
      localStorage.removeItem("pendingMessage");
@@ -48,7 +51,7 @@ document.addEventListener("alpine:init", () => {

      // Start polling for download progress
      this.startDownloadProgressPolling();
-      
+
      // Start model polling with the new pattern
      this.startModelPolling();
    },
@@ -82,14 +85,14 @@ document.addEventListener("alpine:init", () => {
    async populateSelector() {
      return new Promise((resolve, reject) => {
        const evtSource = new EventSource(`${window.location.origin}/modelpool`);
-        
+
        evtSource.onmessage = (event) => {
          if (event.data === "[DONE]") {
            evtSource.close();
            resolve();
            return;
          }
-          
+
          const modelData = JSON.parse(event.data);
          // Update existing model data while preserving other properties
          Object.entries(modelData).forEach(([modelName, data]) => {
@@ -102,7 +105,7 @@ document.addEventListener("alpine:init", () => {
            }
          });
        };
-        
+
        evtSource.onerror = (error) => {
          console.error('EventSource failed:', error);
          evtSource.close();
@@ -509,7 +512,7 @@ document.addEventListener("alpine:init", () => {
        stack: error.stack || ""
      };
      this.errorExpanded = false;
-      
+
      if (this.errorTimeout) {
        clearTimeout(this.errorTimeout);
      }
@@ -524,10 +527,10 @@ document.addEventListener("alpine:init", () => {

    async deleteModel(modelName, model) {
      const downloadedSize = model.total_downloaded || 0;
-      const sizeMessage = downloadedSize > 0 ? 
+      const sizeMessage = downloadedSize > 0 ?
        `This will free up ${this.formatBytes(downloadedSize)} of space.` :
        'This will remove any partially downloaded files.';
-      
+
      if (!confirm(`Are you sure you want to delete ${model.name}? ${sizeMessage}`)) {
        return;
      }
@@ -541,7 +544,7 @@ document.addEventListener("alpine:init", () => {
        });

        const data = await response.json();
-        
+
        if (!response.ok) {
          throw new Error(data.detail || 'Failed to delete model');
        }
@@ -600,6 +603,71 @@ document.addEventListener("alpine:init", () => {
        console.error('Error starting download:', error);
        this.setError(error);
      }
+    },
+
+    async fetchTopology() {
+      try {
+        const response = await fetch(`${this.endpoint}/topology`);
+        if (!response.ok) throw new Error('Failed to fetch topology');
+        return await response.json();
+      } catch (error) {
+        console.error('Topology fetch error:', error);
+        return null;
+      }
+    },
+
+    initTopology() {
+      // Initial fetch
+      this.updateTopology();
+
+      // Set up periodic updates
+      this.topologyInterval = setInterval(() => this.updateTopology(), 5000);
+
+      // Cleanup on page unload
+      window.addEventListener('beforeunload', () => {
+        if (this.topologyInterval) {
+          clearInterval(this.topologyInterval);
+        }
+      });
+    },
+
+    async updateTopology() {
+      const topologyData = await this.fetchTopology();
+      if (!topologyData) return;
+
+      const vizElement = this.$refs.topologyViz;
+      vizElement.innerHTML = ''; // Clear existing visualization
+
+      // Create nodes from object
+      Object.entries(topologyData.nodes).forEach(([nodeId, node]) => {
+        const nodeElement = document.createElement('div');
+        nodeElement.className = 'topology-node';
+
+        // Get peer connections for this node
+        const peerConnections = topologyData.peer_graph[nodeId] || [];
+        const peerConnectionsHtml = peerConnections.map(peer => `
+          <div class="peer-connection">
+            <i class="fas fa-arrow-right"></i>
+            <span>To ${peer.to_id}: ${peer.description}</span>
+          </div>
+        `).join('');
+
+        nodeElement.innerHTML = `
+          <div class="node-info">
+            <span class="status ${nodeId === topologyData.active_node_id ? 'active' : 'inactive'}"></span>
+            <span>${node.model}</span>
+          </div>
+          <div class="node-details">
+            <span>${node.chip}</span>
+            <span>${(node.memory / 1024).toFixed(1)}GB RAM</span>
+            <span>${node.flops.fp32.toFixed(1)} TF</span>
+          </div>
+          <div class="peer-connections">
+            ${peerConnectionsHtml}
+          </div>
+        `;
+        vizElement.appendChild(nodeElement);
+      });
    }
  }));
 });
--- a/exo/topology/device_capabilities.py
+++ b/exo/topology/device_capabilities.py
@@ -3,6 +3,8 @@ from pydantic import BaseModel
 from exo import DEBUG
 import subprocess
 import psutil
+import asyncio
+from exo.helpers import get_mac_system_info, subprocess_pool

 TFLOPS = 1.00

@@ -144,11 +146,13 @@ CHIP_FLOPS.update({f"{key} LAPTOP GPU": value for key, value in CHIP_FLOPS.items
 CHIP_FLOPS.update({f"{key} Laptop GPU": value for key, value in CHIP_FLOPS.items()})


-def device_capabilities() -> DeviceCapabilities:
+async def device_capabilities() -> DeviceCapabilities:
  if psutil.MACOS:
-    return mac_device_capabilities()
+    return await mac_device_capabilities()
  elif psutil.LINUX:
-    return linux_device_capabilities()
+    return await linux_device_capabilities()
+  elif psutil.WINDOWS:
+    return await windows_device_capabilities()
  else:
    return DeviceCapabilities(
      model="Unknown Device",
@@ -158,27 +162,18 @@ def device_capabilities() -> DeviceCapabilities:
    )


-def mac_device_capabilities() -> DeviceCapabilities:
-  # Fetch the model of the Mac using system_profiler
-  model = subprocess.check_output(["system_profiler", "SPHardwareDataType"]).decode("utf-8")
-  model_line = next((line for line in model.split("\n") if "Model Name" in line), None)
-  model_id = model_line.split(": ")[1] if model_line else "Unknown Model"
-  chip_line = next((line for line in model.split("\n") if "Chip" in line), None)
-  chip_id = chip_line.split(": ")[1] if chip_line else "Unknown Chip"
-  memory_line = next((line for line in model.split("\n") if "Memory" in line), None)
-  memory_str = memory_line.split(": ")[1] if memory_line else "Unknown Memory"
-  memory_units = memory_str.split()
-  memory_value = int(memory_units[0])
-  if memory_units[1] == "GB":
-    memory = memory_value*1024
-  else:
-    memory = memory_value
-
-  # Assuming static values for other attributes for demonstration
-  return DeviceCapabilities(model=model_id, chip=chip_id, memory=memory, flops=CHIP_FLOPS.get(chip_id, DeviceFlops(fp32=0, fp16=0, int8=0)))
+async def mac_device_capabilities() -> DeviceCapabilities:
+  model_id, chip_id, memory = await get_mac_system_info()
+  
+  return DeviceCapabilities(
+    model=model_id,
+    chip=chip_id,
+    memory=memory,
+    flops=CHIP_FLOPS.get(chip_id, DeviceFlops(fp32=0, fp16=0, int8=0))
+  )


-def linux_device_capabilities() -> DeviceCapabilities:
+async def linux_device_capabilities() -> DeviceCapabilities:
  import psutil
  from tinygrad import Device

@@ -194,6 +189,8 @@ def linux_device_capabilities() -> DeviceCapabilities:

    if DEBUG >= 2: print(f"NVIDIA device {gpu_name=} {gpu_memory_info=}")

+    pynvml.nvmlShutdown()
+
    return DeviceCapabilities(
      model=f"Linux Box ({gpu_name})",
      chip=gpu_name,
@@ -201,13 +198,24 @@ def linux_device_capabilities() -> DeviceCapabilities:
      flops=CHIP_FLOPS.get(gpu_name, DeviceFlops(fp32=0, fp16=0, int8=0)),
    )
  elif Device.DEFAULT == "AMD":
-    # TODO AMD support
+    # For AMD GPUs, pyrsmi is the way (Official python package for rocm-smi)
+    from pyrsmi import rocml
+
+    rocml.smi_initialize()
+    gpu_name = rocml.smi_get_device_name(0).upper()
+    gpu_memory_info = rocml.smi_get_device_memory_total(0)
+
+    if DEBUG >= 2: print(f"AMD device {gpu_name=} {gpu_memory_info=}")
+
+    rocml.smi_shutdown()
+
    return DeviceCapabilities(
-      model="Linux Box (AMD)",
-      chip="Unknown AMD",
-      memory=psutil.virtual_memory().total // 2**20,
+      model="Linux Box ({gpu_name})",
+      chip={gpu_name},
+      memory=gpu_memory_info.total // 2**20,
      flops=DeviceFlops(fp32=0, fp16=0, int8=0),
    )
+
  else:
    return DeviceCapabilities(
      model=f"Linux Box (Device: {Device.DEFAULT})",
@@ -215,3 +223,74 @@ def linux_device_capabilities() -> DeviceCapabilities:
      memory=psutil.virtual_memory().total // 2**20,
      flops=DeviceFlops(fp32=0, fp16=0, int8=0),
    )
+
+
+def windows_device_capabilities() -> DeviceCapabilities:
+  import psutil
+
+  def get_gpu_info():
+    import win32com.client  # install pywin32
+
+    wmiObj = win32com.client.GetObject("winmgmts:\\\\.\\root\\cimv2")
+    gpus = wmiObj.ExecQuery("SELECT * FROM Win32_VideoController")
+
+    gpu_info = []
+    for gpu in gpus:
+      info = {
+        "Name": gpu.Name,
+        "AdapterRAM": gpu.AdapterRAM,  # Bug in this property, returns -ve for VRAM > 4GB (uint32 overflow)
+        "DriverVersion": gpu.DriverVersion,
+        "VideoProcessor": gpu.VideoProcessor
+      }
+      gpu_info.append(info)
+
+    return gpu_info
+
+  gpus_info = get_gpu_info()
+  gpu_names = [gpu['Name'] for gpu in gpus_info]
+
+  contains_nvidia = any('nvidia' in gpu_name.lower() for gpu_name in gpu_names)
+  contains_amd = any('amd' in gpu_name.lower() for gpu_name in gpu_names)
+
+  if contains_nvidia:
+    import pynvml
+
+    pynvml.nvmlInit()
+    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+    gpu_raw_name = pynvml.nvmlDeviceGetName(handle).upper()
+    gpu_name = gpu_raw_name.rsplit(" ", 1)[0] if gpu_raw_name.endswith("GB") else gpu_raw_name
+    gpu_memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+
+    if DEBUG >= 2: print(f"NVIDIA device {gpu_name=} {gpu_memory_info=}")
+
+    return DeviceCapabilities(
+      model=f"Windows Box ({gpu_name})",
+      chip=gpu_name,
+      memory=gpu_memory_info.total // 2**20,
+      flops=CHIP_FLOPS.get(gpu_name, DeviceFlops(fp32=0, fp16=0, int8=0)),
+    )
+  elif contains_amd:
+    # For AMD GPUs, pyrsmi is the way (Official python package for rocm-smi)
+    from pyrsmi import rocml
+
+    rocml.smi_initialize()
+    gpu_name = rocml.smi_get_device_name(0).upper()
+    gpu_memory_info = rocml.smi_get_device_memory_total(0)
+
+    if DEBUG >= 2: print(f"AMD device {gpu_name=} {gpu_memory_info=}")
+
+    rocml.smi_shutdown()
+
+    return DeviceCapabilities(
+      model="Windows Box ({gpu_name})",
+      chip={gpu_name},
+      memory=gpu_memory_info.total // 2**20,
+      flops=DeviceFlops(fp32=0, fp16=0, int8=0),
+    )
+  else:
+    return DeviceCapabilities(
+      model=f"Windows Box (Device: Unknown)",
+      chip=f"Unknown Chip (Device(s): {gpu_names})",
+      memory=psutil.virtual_memory().total // 2**20,
+      flops=DeviceFlops(fp32=0, fp16=0, int8=0),
+    )
--- a/exo/topology/partitioning_strategy.py
+++ b/exo/topology/partitioning_strategy.py
@@ -1,8 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import List
+from typing import List, Dict
 from dataclasses import dataclass
 from .topology import Topology
 from exo.inference.shard import Shard
+from exo.topology.device_capabilities import device_capabilities
+import asyncio


 # Partitions shard-space into pieces of contiguous shards, represented by floating point range [start, end) between 0 and 1
--- a/exo/topology/test_device_capabilities.py
+++ b/exo/topology/test_device_capabilities.py
@@ -1,11 +1,11 @@
-import unittest
+import pytest
 from unittest.mock import patch
-from exo.topology.device_capabilities import mac_device_capabilities, DeviceCapabilities, DeviceFlops, TFLOPS
+from exo.topology.device_capabilities import mac_device_capabilities, DeviceCapabilities, DeviceFlops, TFLOPS, device_capabilities


-class TestMacDeviceCapabilities(unittest.TestCase):
-  @patch("subprocess.check_output")
-  def test_mac_device_capabilities_pro(self, mock_check_output):
+@pytest.mark.asyncio
+@patch("subprocess.check_output")
+async def test_mac_device_capabilities_pro(mock_check_output):
    # Mock the subprocess output
    mock_check_output.return_value = b"""
 Hardware:
@@ -27,20 +27,19 @@ Activation Lock Status: Enabled
 """

    # Call the function
-    result = mac_device_capabilities()
+    result = await mac_device_capabilities()

    # Check the results
-    self.assertIsInstance(result, DeviceCapabilities)
-    self.assertEqual(result.model, "MacBook Pro")
-    self.assertEqual(result.chip, "Apple M3 Max")
-    self.assertEqual(result.memory, 131072)  # 16 GB in MB
-    self.assertEqual(
-      str(result),
-      "Model: MacBook Pro. Chip: Apple M3 Max. Memory: 131072MB. Flops: 14.20 TFLOPS, fp16: 28.40 TFLOPS, int8: 56.80 TFLOPS",
-    )
+    assert isinstance(result, DeviceCapabilities)
+    assert result.model == "MacBook Pro"
+    assert result.chip == "Apple M3 Max"
+    assert result.memory == 131072  # 128 GB in MB
+    assert str(result) == "Model: MacBook Pro. Chip: Apple M3 Max. Memory: 131072MB. Flops: 14.20 TFLOPS, fp16: 28.40 TFLOPS, int8: 56.80 TFLOPS"

-  @patch("subprocess.check_output")
-  def test_mac_device_capabilities_air(self, mock_check_output):
+
+@pytest.mark.asyncio
+@patch("subprocess.check_output")
+async def test_mac_device_capabilities_air(mock_check_output):
    # Mock the subprocess output
    mock_check_output.return_value = b"""
 Hardware:
@@ -62,30 +61,34 @@ Activation Lock Status: Disabled
 """

    # Call the function
-    result = mac_device_capabilities()
+    result = await mac_device_capabilities()

    # Check the results
-    self.assertIsInstance(result, DeviceCapabilities)
-    self.assertEqual(result.model, "MacBook Air")
-    self.assertEqual(result.chip, "Apple M2")
-    self.assertEqual(result.memory, 8192)  # 8 GB in MB
+    assert isinstance(result, DeviceCapabilities)
+    assert result.model == "MacBook Air"
+    assert result.chip == "Apple M2"
+    assert result.memory == 8192  # 8 GB in MB

-  @unittest.skip("Unskip this test when running on a MacBook Pro, Apple M3 Max, 128GB")
-  def test_mac_device_capabilities_real(self):
+
+@pytest.mark.skip(reason="Unskip this test when running on a MacBook Pro, Apple M3 Max, 128GB")
+@pytest.mark.asyncio
+async def test_mac_device_capabilities_real():
    # Call the function without mocking
-    result = mac_device_capabilities()
+    result = await mac_device_capabilities()

    # Check the results
-    self.assertIsInstance(result, DeviceCapabilities)
-    self.assertEqual(result.model, "MacBook Pro")
-    self.assertEqual(result.chip, "Apple M3 Max")
-    self.assertEqual(result.memory, 131072)  # 128 GB in MB
-    self.assertEqual(result.flops, DeviceFlops(fp32=14.20*TFLOPS, fp16=28.40*TFLOPS, int8=56.80*TFLOPS))
-    self.assertEqual(
-      str(result),
-      "Model: MacBook Pro. Chip: Apple M3 Max. Memory: 131072MB. Flops: 14.20 TFLOPS, fp16: 28.40 TFLOPS, int8: 56.80 TFLOPS",
-    )
+    assert isinstance(result, DeviceCapabilities)
+    assert result.model == "MacBook Pro"
+    assert result.chip == "Apple M3 Max"
+    assert result.memory == 131072  # 128 GB in MB
+    assert result.flops == DeviceFlops(fp32=14.20*TFLOPS, fp16=28.40*TFLOPS, int8=56.80*TFLOPS)
+    assert str(result) == "Model: MacBook Pro. Chip: Apple M3 Max. Memory: 131072MB. Flops: 14.20 TFLOPS, fp16: 28.40 TFLOPS, int8: 56.80 TFLOPS"


-if __name__ == "__main__":
-  unittest.main()
+@pytest.mark.asyncio
+async def test_device_capabilities():
+    caps = await device_capabilities()
+    assert caps.model != ""
+    assert caps.chip != ""
+    assert caps.memory > 0
+    assert caps.flops is not None
--- a/extra/line_counter.py
+++ b/extra/line_counter.py
@@ -74,9 +74,9 @@ def gen_diff(table_old, table_new):

 def create_json_report(table, is_diff=False):
    timestamp = datetime.now(timezone.utc).isoformat()
-    commit_sha = os.environ.get('CIRCLE_SHA1', 'unknown')
-    branch = os.environ.get('CIRCLE_BRANCH', 'unknown')
-    pr_number = os.environ.get('CIRCLE_PR_NUMBER', '')
+    commit_sha = os.environ.get('GITHUB_SHA', 'unknown')
+    branch = os.environ.get('GITHUB_REF_NAME', 'unknown')
+    pr_number = os.environ.get('GITHUB_EVENT_NUMBER', '')

    if is_diff:
        files = [{
--- a/scripts/build_exo.py
+++ b/scripts/build_exo.py
@@ -6,6 +6,9 @@ import pkgutil

 def run():
    site_packages = site.getsitepackages()[0]
+    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    baseimages_dir = os.path.join(base_dir, "exo", "apputil", "baseimages")
+    
    command = [
        f"{sys.executable}", "-m", "nuitka", "exo/main.py",
        "--company-name=exolabs",
@@ -15,7 +18,8 @@ def run():
        "--standalone",
        "--output-filename=exo",
        "--python-flag=no_site",
-        "--onefile"
+        "--onefile",
+        f"--include-data-dir={baseimages_dir}=exo/apputil/baseimages"
    ]

    if sys.platform == "darwin": 
@@ -23,7 +27,7 @@ def run():
            "--macos-app-name=exo",
            "--macos-app-mode=gui",
            "--macos-app-version=0.0.1",
-            "--macos-signed-app-name=com.exolabs.exo",
+            "--macos-signed-app-name=net.exolabs.exo",
            "--include-distribution-meta=mlx",
            "--include-module=mlx._reprlib_fix",
            "--include-module=mlx._os_warning",
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
 import sys
 import platform
+import subprocess

 from setuptools import find_packages, setup

@@ -11,7 +12,6 @@ install_requires = [
  "grpcio==1.68.0",
  "grpcio-tools==1.68.0",
  "Jinja2==3.1.4",
-  "netifaces==0.11.0",
  "numpy==2.0.0",
  "nuitka==2.5.1",
  "nvidia-ml-py==12.560.30",
@@ -23,27 +23,61 @@ install_requires = [
  "pydantic==2.9.2",
  "requests==2.32.3",
  "rich==13.7.1",
+  "scapy==2.6.1",
  "tenacity==9.0.0",
  "tqdm==4.66.4",
  "transformers==4.46.3",
  "uuid==1.30",
+  "uvloop==0.21.0",
  "tinygrad @ git+https://github.com/tinygrad/tinygrad.git@3b26e51fcebfc6576f4e0f99693e6f1406d61d79",
 ]

 extras_require = {
-  "formatting": [
-    "yapf==0.40.2",
-  ],
+  "formatting": ["yapf==0.40.2",],
  "apple_silicon": [
-    "mlx==0.20.0",
-    "mlx-lm==0.19.3",
+    "mlx==0.21.1",
+    "mlx-lm==0.20.4",
  ],
+  "windows": ["pywin32==308",],
+  "nvidia-gpu": ["nvidia-ml-py==12.560.30",],
+  "amd-gpu": ["pyrsmi==0.2.0"],
 }

 # Check if running on macOS with Apple Silicon
 if sys.platform.startswith("darwin") and platform.machine() == "arm64":
  install_requires.extend(extras_require["apple_silicon"])

+# Check if running Windows
+if sys.platform.startswith("win32"):
+  install_requires.extend(extras_require["windows"])
+
+
+def _add_gpu_requires():
+  """Append GPU extras to install_requires when a vendor SMI tool runs successfully.
+
+  Each probe runs the tool and treats exit code 0 as "GPU present"; a missing
+  or non-executable tool counts as "not present" instead of raising.
+  """
+  def _probe(cmd):
+    # No shell=True: with an argument list, POSIX shells receive only cmd[0] as
+    # the command and the remaining items are silently dropped.
+    try:
+      return subprocess.run(cmd, text=True, capture_output=True, check=False).returncode == 0
+    except OSError:
+      return False
+
+  # Add Nvidia-GPU
+  if _probe(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader']):
+    install_requires.extend(extras_require["nvidia-gpu"])
+
+  # Add AMD-GPU (mostly Linux only; amd/rocm-smi is not yet supported on Windows).
+  # rocm-smi is only tried when amd-smi is missing or fails.
+  if _probe(['amd-smi', 'list', '--csv']) or _probe(['rocm-smi', '--showid']):
+    install_requires.extend(extras_require["amd-gpu"])
+
+
+_add_gpu_requires()
+
 setup(
  name="exo",
  version="0.0.1",
Author	SHA1	Message	Date
Alex Cheema	a635b23044	Merge pull request #619 from exo-explore/runners2 fix readme images	2025-01-23 02:18:33 +00:00
Alex Cheema	ad0e0d02d8	fix readme images	2025-01-23 02:17:58 +00:00
Alex Cheema	2644fd02c8	Merge pull request #617 from exo-explore/runners2 Lots of fixes and QoL improvements.	2025-01-23 02:05:17 +00:00
Alex Cheema	88ac12df6c	install clang test	2025-01-23 01:55:14 +00:00
Alex Cheema	dfd9d3eb48	linux install	2025-01-23 01:44:57 +00:00
Alex Cheema	200ff4d713	linux install	2025-01-23 01:43:00 +00:00
Alex Cheema	b2764f177f	linux install	2025-01-23 01:40:59 +00:00
Alex Cheema	e57fa1dfa0	xlarge	2025-01-23 01:40:13 +00:00
Alex Cheema	209163c595	add linux tinygrad test	2025-01-23 01:38:10 +00:00
Alex Cheema	495987b50b	beef up the instance	2025-01-23 01:37:38 +00:00
Alex Cheema	8484eb4165	fix config	2025-01-23 01:37:01 +00:00
Alex Cheema	790c08afd4	add linux tinygrad test	2025-01-23 01:31:44 +00:00
Alex Cheema	a8a9e3ffa1	explicitly enable TOKENIZERS_PARALLELISM=true	2025-01-23 01:26:27 +00:00
Alex Cheema	5c9bcb8620	set GRPC_VERBOSITY=error; TRANSFORMERS_VERBOSITY=error	2025-01-23 01:22:19 +00:00
Alex Cheema	d54e19c20a	runners back	2025-01-23 00:55:52 +00:00
Alex Cheema	cc78738e24	remove kern scan intervals	2025-01-23 00:49:32 +00:00
Alex Cheema	2391051c11	remove kern.timer.scan_interval from bootstrap.sh	2025-01-23 00:41:40 +00:00
Alex Cheema	112dea1582	add back the benchmarks baby	2025-01-23 00:15:54 +00:00
Alex Cheema	dc5cdc4d78	add back opaque	2025-01-22 23:59:39 +00:00
Alex Cheema	f8db4e131e	fix check for sd2.1	2025-01-22 23:53:42 +00:00
Alex Cheema	bbb6856988	fix check for sd2.1	2025-01-22 23:51:09 +00:00
Alex Cheema	9ba8bbbcf8	fix filter to include 169.254.* since thats what mac uses for ethernet	2025-01-22 23:47:43 +00:00
Alex Cheema	8ab9977f01	fix stable diffusion case for tui, make mlx run on its own thread again and non-blocking	2025-01-22 23:22:53 +00:00
Alex Cheema	3a4bae0dab	fix issue with eos_token_id	2025-01-22 22:58:09 +00:00
Alex Cheema	87d1271d33	fix stream: false completion	2025-01-22 22:46:04 +00:00
Alex Cheema	55d1846f5e	clean up DEBUG=2 logs, a few fixes for token	2025-01-22 22:27:02 +00:00
Alex Cheema	9954ce8e4d	fix treating token as a list	2025-01-22 22:13:13 +00:00
Alex Cheema	09e12d8673	temporarily disable github runner benchmarks	2025-01-22 22:00:13 +00:00
Alex Cheema	98d6e986bd	add back .circleci	2025-01-22 21:58:46 +00:00
Alex Cheema	d80324fe20	disable test-m3-single-node	2025-01-22 21:58:40 +00:00
Alex Cheema	97f3bad38f	fix peer_handle	2025-01-22 21:07:49 +00:00
Alex Cheema	461e4f37cb	Merge remote-tracking branch 'origin/main' into runners2	2025-01-22 21:06:12 +00:00
Alex Cheema	07ceb19f0a	Merge pull request #614 from samiamjidkhan/main animation fix	2025-01-22 14:59:54 +00:00
Sami Khan	27b4577f38	directory for images	2025-01-22 05:47:25 -05:00
Sami Khan	a70943f8d2	base images for animation	2025-01-22 05:46:38 -05:00
Alex Cheema	410d901505	Merge pull request #613 from samiamjidkhan/dmg-backend image and text mode fix	2025-01-21 13:12:08 +00:00
Sami Khan	5c4ce5392c	image and text mode fix	2025-01-21 04:33:54 -05:00
Alex Cheema	819ec7626e	Merge pull request #611 from exo-explore/fixbuildname fix scripts/build_exo.py: com.exolabs.exo -> net.exolabs.exo	2025-01-21 05:36:34 +00:00
Alex Cheema	ba5bb3e171	fix scripts/build_exo.py: com.exolabs.exo -> net.exolabs.exo	2025-01-21 05:36:02 +00:00
Alex Cheema	f4bbcf4c8f	Merge pull request #607 from tensorsofthewall/smol_fix Fixes for cross-platform operability	2025-01-21 02:21:18 +00:00
Alex Cheema	6b8cd0577e	fix some issues with results	2025-01-20 16:30:16 +00:00
Alex Cheema	218c1e79d9	Merge branch 'main' into runners2	2025-01-20 16:12:55 +00:00
Sandesh Bharadwaj	b9eccedc3d	Formatting	2025-01-17 05:40:42 -05:00
Sandesh Bharadwaj	5f06aa2759	Replace netifaces (unmaintained,outdated) with scapy + add dependencies for previous fixes	2025-01-17 05:37:01 -05:00
Sandesh Bharadwaj	349b5344eb	Minor fix for Shard typing	2025-01-16 14:36:46 -05:00
Sandesh Bharadwaj	df3624d27a	Add AMD GPU querying + Windows device capabilities	2025-01-14 20:37:02 -05:00
Sandesh Bharadwaj	6737e36e23	Fixed MLX import blocking native Windows execution of exo. (Not Final)	2025-01-14 20:35:21 -05:00
Alex Cheema	023ddc207e	support different network interface tests	2024-12-17 21:03:00 +00:00
Alex Cheema	2f0b543a1e	add peer connection info to tinychat	2024-12-17 17:37:40 +00:00
Alex Cheema	7ac4004392	change it back to collecting topology periodically even if peers dont change	2024-12-17 17:32:18 +00:00
Alex Cheema	198308b1eb	more robust udp broadcast	2024-12-17 17:28:55 +00:00
Alex Cheema	1f108a06ff	remove test sleep	2024-12-17 16:47:05 +00:00
Alex Cheema	3a58576f8c	make sure this is actually doing something	2024-12-17 16:22:22 +00:00
Alex Cheema	0a07223074	switch to uvloop (faster asyncio event loop) and optimise grpc settings	2024-12-17 16:10:56 +00:00
Alex Cheema	58f0a0f547	optimise grpc parameters	2024-12-17 14:50:52 +00:00
Alex Cheema	e2474c3f15	fail if we never get the desired node count	2024-12-16 21:59:02 +00:00
Alex Cheema	1b14be6013	make device_capabilities async running on a thread pool	2024-12-16 21:17:30 +00:00
Alex Cheema	036224f877	add topology to tinychat ui	2024-12-16 21:17:12 +00:00
Alex Cheema	b17faa8199	dont broadcast every single process_tensor	2024-12-16 20:54:38 +00:00
Alex Cheema	35d90d947c	Merge remote-tracking branch 'origin/main' into runners	2024-12-16 20:04:03 +00:00
Alex Cheema	8d94b8ae12	trigger test	2024-12-16 20:03:22 +00:00
Alex Cheema	99a70f1045	Merge commit: trigger test	2024-12-16 20:01:23 +00:00
Alex Cheema	bd0febe35f	Merge commit: trigger test	2024-12-16 20:01:09 +00:00
Alex Cheema	34ecbbe01c	Merge commit: trigger test	2024-12-16 20:00:50 +00:00
Alex Cheema	427d0718b3	Merge commit: trigger test	2024-12-16 20:00:39 +00:00
Alex Cheema	b49c4ca0e5	Merge commit: trigger test	2024-12-16 20:00:21 +00:00
Alex Cheema	41eaaec5a9	Merge commit: trigger test	2024-12-16 20:00:10 +00:00
Alex Cheema	bf1aafdea7	Merge commit: trigger test	2024-12-16 19:59:51 +00:00
Alex Cheema	bfa06ee9f3	Merge commit: trigger test	2024-12-16 19:59:39 +00:00
Alex Cheema	c0534b67c3	Merge commit: trigger test	2024-12-16 19:59:08 +00:00
Alex Cheema	063964aab3	remove redundant sample_logits, put back opaque status for process_prompt so we have a way of preemptively starting downloads	2024-12-16 19:50:36 +00:00
Alex Cheema	804ad4705a	upgrade mlx	2024-12-16 19:50:33 +00:00
Alex Cheema	c9ded9ba96	optimise networking, remove bloat	2024-12-16 19:50:29 +00:00
Alex Cheema	64365d684f	one two and three m4 pro clusters	2024-12-16 19:50:24 +00:00
Alex Cheema	9397464fad	add commit to results	2024-12-16 19:50:19 +00:00
Nel Nibcord	08912d1b64	Only collect topology if peers changed	2024-12-16 19:50:18 +00:00
Alex Cheema	06c2e236b8	rip out stats bloat	2024-12-16 19:50:17 +00:00
Alex Cheema	cb4615c95d	fix SendNewToken	2024-12-16 19:50:14 +00:00
Alex Cheema	f55a53ae7e	one token at a time	2024-12-16 19:49:52 +00:00
Gary	25b4af70e0	Merge branch 'main' into runners	2024-12-14 20:48:58 +00:00
Alex Cheema	a93092105c	set max-generate-tokens to 250	2024-12-14 19:10:03 +00:00
Alex Cheema	0c6ab35333	increase timeout of http request in bench.py up to 10 mins	2024-12-14 18:33:41 +00:00
Alex Cheema	e5d54c77a9	add llama-3.3-70b to 3 M4 Pro cluster	2024-12-12 18:51:26 +00:00
Alex Cheema	2ff4638122	Merge remote-tracking branch 'origin/main' into runners	2024-12-12 17:14:40 +00:00
Alex Cheema	b6f2385c41	run llama-3.1-8b on 3 m4 pro cluster	2024-12-12 15:13:10 +00:00
Alex Cheema	9472ab0d2c	t	2024-12-12 15:05:55 +00:00
Alex Cheema	dbb7ad3c08	run with three m4 pro	2024-12-12 14:36:18 +00:00
Alex Cheema	2abe57be21	grasping at straws	2024-12-12 12:03:20 +00:00
Alex Cheema	eeecdcb409	try a different taskpolicy	2024-12-12 11:45:01 +00:00
Alex Cheema	f9f76129a1	better bench system info	2024-12-12 11:34:37 +00:00
Alex Cheema	8c6d37d9b8	m4 cluster test	2024-12-12 11:13:13 +00:00
Alex Cheema	1194db6e65	m3	2024-12-12 00:02:20 +00:00
Alex Cheema	8cb7327da2	re-enable m4 cluster run	2024-12-12 00:01:14 +00:00
Alex Cheema	bba0aa0877	single node test 20	2024-12-11 22:58:44 +00:00
Alex Cheema	279354a1fd	single node test 19	2024-12-11 22:58:38 +00:00
Alex Cheema	92e2b74902	single node test 18	2024-12-11 22:58:33 +00:00
Alex Cheema	76196b8c2f	single node test 17	2024-12-11 22:58:27 +00:00
Alex Cheema	8408c8499f	single node test 16	2024-12-11 22:58:21 +00:00
Alex Cheema	c65d1d9141	single node test 15	2024-12-11 22:58:16 +00:00
Alex Cheema	0bd44c0f78	single node test 14	2024-12-11 22:58:10 +00:00
Alex Cheema	f22bc99f2c	single node test 13	2024-12-11 22:58:04 +00:00
Alex Cheema	3fda05aa39	single node test 12	2024-12-11 22:57:58 +00:00
Alex Cheema	6c322ac070	single node test 11	2024-12-11 22:57:53 +00:00
Alex Cheema	c5c27a32af	single node test 10	2024-12-11 22:57:47 +00:00
Alex Cheema	9f1393dc7f	single node test 9	2024-12-11 22:57:42 +00:00
Alex Cheema	32ff3ef9af	single node test 8	2024-12-11 22:57:36 +00:00
Alex Cheema	b23c3fdaad	single node test 7	2024-12-11 22:57:31 +00:00
Alex Cheema	8b47a9d017	single node test 6	2024-12-11 22:57:25 +00:00
Alex Cheema	f89b85b3f2	single node test 5	2024-12-11 22:57:19 +00:00
Alex Cheema	6f097c9321	single node test 4	2024-12-11 22:57:14 +00:00
Alex Cheema	fb7a0defe1	single node test 3	2024-12-11 22:57:08 +00:00
Alex Cheema	fe506a53d9	single node test 2	2024-12-11 22:57:02 +00:00
Alex Cheema	3f6ef1c763	single node test 1	2024-12-11 22:56:56 +00:00
Alex Cheema	e63c224c71	testtt	2024-12-11 22:53:02 +00:00
Alex Cheema	20e3065e57	les goh	2024-12-11 22:49:29 +00:00
Alex Cheema	83892d5b7e	t	2024-12-11 22:45:59 +00:00
Alex Cheema	83470a98b4	t	2024-12-11 22:42:02 +00:00
Alex Cheema	92edfa5efc	t	2024-12-11 22:40:47 +00:00
Alex Cheema	225dcba788	t	2024-12-11 22:37:11 +00:00
Alex Cheema	6249bee793	tes	2024-12-11 22:35:30 +00:00
Alex Cheema	741c31836e	test	2024-12-11 22:27:10 +00:00
Alex Cheema	d0b7f1b4bb	t	2024-12-11 22:11:01 +00:00
Alex Cheema	90677415c7	t	2024-12-11 22:01:29 +00:00
Alex Cheema	6cf2af39e8	t	2024-12-11 21:55:24 +00:00
Alex Cheema	5a1a0f5fd2	t	2024-12-11 21:45:53 +00:00
Alex Cheema	dd3fd279dc	t	2024-12-11 21:42:01 +00:00
Alex Cheema	61c09631c0	t	2024-12-11 21:40:47 +00:00
Alex Cheema	e698ef6ab1	t	2024-12-11 21:39:27 +00:00
Alex Cheema	26351e719d	t	2024-12-11 21:36:59 +00:00
Alex Cheema	5dee5e55fe	t	2024-12-11 21:33:03 +00:00
Alex Cheema	6acfb81860	t	2024-12-11 20:28:07 +00:00
Alex Cheema	b1142d4ff4	t	2024-12-11 19:39:58 +00:00
Alex Cheema	a932afc01c	oi	2024-12-11 19:30:28 +00:00
Alex Cheema	cdae702673	t	2024-12-11 19:24:43 +00:00
Alex Cheema	d95f40b6c8	a	2024-12-11 19:07:36 +00:00
Alex Cheema	97ffb83e86	t	2024-12-11 19:01:24 +00:00
Alex Cheema	9a11e27c93	ttt	2024-12-11 18:54:51 +00:00
Alex Cheema	d6c2146dd9	t	2024-12-11 18:34:35 +00:00
Alex Cheema	63da9fc194	a	2024-12-11 18:30:02 +00:00
Alex Cheema	7c0c5ef7fc	ttttttt	2024-12-11 18:23:59 +00:00
Alex Cheema	739b7d178e	tttttt	2024-12-11 18:02:22 +00:00
Alex Cheema	cacf50cd57	tttt	2024-12-11 18:00:28 +00:00
Alex Cheema	0904cda3ac	ttt	2024-12-11 17:58:59 +00:00
Alex Cheema	6bb38939ec	tt	2024-12-11 17:56:22 +00:00
Alex Cheema	1dbe11caf9	t	2024-12-11 17:54:41 +00:00
Alex Cheema	8d9e3b88d3	t	2024-12-11 17:52:07 +00:00
Alex Cheema	9dd33d37f2	t	2024-12-11 17:44:14 +00:00
Alex Cheema	a4bb4bb6ac	update bootstrap	2024-12-11 17:37:38 +00:00
Alex Cheema	7b99cb4a12	t	2024-12-11 17:30:50 +00:00
Alex Cheema	9848a45da5	TT	2024-12-11 17:27:53 +00:00
Alex Cheema	378975813c	t	2024-12-11 17:15:39 +00:00
Alex Cheema	e680e8a1ed	fix name	2024-12-11 17:07:45 +00:00
Alex Cheema	7b2282d300	run without debug flag	2024-12-11 17:07:19 +00:00
Alex Cheema	3b1ea1933b	use .venv exo	2024-12-11 17:02:58 +00:00
Alex Cheema	668766fc4b	t	2024-12-11 16:55:57 +00:00
Alex Cheema	e501eeaf91	tweak install	2024-12-11 16:52:07 +00:00
Alex Cheema	41902f716f	tweaks	2024-12-11 16:40:21 +00:00
Alex Cheema	b7bab80ec8	test2	2024-12-11 16:36:50 +00:00
Alex Cheema	6169996c70	test	2024-12-11 16:35:26 +00:00
Alex Cheema	bbb58460f8	Test on m4	2024-12-11 16:29:52 +00:00
Alex Cheema	cff03fc6c5	perf diag	2024-12-11 16:19:47 +00:00
Alex Cheema	f7122d400d	add system_status check to bench	2024-12-11 16:13:53 +00:00
Alex Cheema	c938efb531	t	2024-12-11 16:06:14 +00:00
Alex Cheema	e2d3a90832	runner-token typo	2024-12-11 15:47:10 +00:00
Alex Cheema	ba96413a63	bootstrap script tweaks	2024-12-11 15:45:05 +00:00
Alex Cheema	cb40eb23ce	more robust configure_mlx.sh	2024-12-11 15:38:45 +00:00
Alex Cheema	afe71c01da	check gpu usage	2024-12-11 15:28:57 +00:00
Alex Cheema	a84cba4e3a	Merge remote-tracking branch 'origin/main' into runners	2024-12-11 15:22:35 +00:00
Alex Cheema	23158a42ad	add branch name to results	2024-12-11 12:59:55 +00:00
Alex Cheema	18e7919971	test 30	2024-12-11 12:55:05 +00:00
Alex Cheema	0e32a625d7	test 29	2024-12-11 12:54:59 +00:00
Alex Cheema	04bc163fea	test 28	2024-12-11 12:54:52 +00:00
Alex Cheema	949055dec0	test 27	2024-12-11 12:54:45 +00:00
Alex Cheema	070b163cc7	test 26	2024-12-11 12:54:38 +00:00
Alex Cheema	fc26ad4006	test 25	2024-12-11 12:54:27 +00:00
Alex Cheema	5d3be3c6ed	test 24	2024-12-11 12:54:20 +00:00
Alex Cheema	23dd5de3ae	test 23	2024-12-11 12:54:14 +00:00
Alex Cheema	6030b39964	test 22	2024-12-11 12:54:08 +00:00
Alex Cheema	4f4ac0fa52	test 21	2024-12-11 12:54:01 +00:00
Alex Cheema	16d9839071	test {i}	2024-12-11 12:53:55 +00:00
Alex Cheema	8269b4b190	t	2024-12-11 12:38:51 +00:00
Alex Cheema	1e869a0f15	trigger test	2024-12-10 02:04:52 +00:00
Alex Cheema	5a4d128db6	trigger test	2024-12-09 08:02:29 +00:00
Alex Cheema	8a5d212cfc	test 20	2024-12-08 23:38:30 +00:00
Alex Cheema	53edb8508b	test 19	2024-12-08 23:38:24 +00:00
Alex Cheema	29d9df04bf	test 18	2024-12-08 23:38:18 +00:00
Alex Cheema	4d6af6e6ca	test 17	2024-12-08 23:38:13 +00:00
Alex Cheema	8c7c156f57	test 16	2024-12-08 23:38:07 +00:00
Alex Cheema	310843487f	test 15	2024-12-08 23:38:01 +00:00
Alex Cheema	a4b221d0a0	test 14	2024-12-08 23:37:55 +00:00
Alex Cheema	286db875de	test 13	2024-12-08 23:37:49 +00:00
Alex Cheema	d714e40f62	test 12	2024-12-08 23:37:43 +00:00
Alex Cheema	e78ef75531	test 11	2024-12-08 23:37:37 +00:00
Alex Cheema	38eaecf087	test 10	2024-12-08 23:37:31 +00:00
Alex Cheema	3cf28f8452	test 9	2024-12-08 23:37:26 +00:00
Alex Cheema	9ba8bbdd70	test 8	2024-12-08 23:37:20 +00:00
Alex Cheema	af6048e373	test 7	2024-12-08 23:37:14 +00:00
Alex Cheema	d93b8e8948	test 6	2024-12-08 23:37:08 +00:00
Alex Cheema	b69cb49a46	test 5	2024-12-08 23:37:02 +00:00
Alex Cheema	cc74b1f9b3	test 4	2024-12-08 23:36:57 +00:00
Alex Cheema	e78a52de5f	test 3	2024-12-08 23:36:51 +00:00
Alex Cheema	f6c2c37c4b	test 2	2024-12-08 23:36:45 +00:00
Alex Cheema	314a5d9781	test 1	2024-12-08 23:36:22 +00:00
Alex Cheema	b4e885bbd2	test range	2024-12-08 23:36:14 +00:00
Alex Cheema	bd9d11861b	sleep before bench	2024-12-08 23:24:46 +00:00
Alex Cheema	571b26c50e	allowed interface types	2024-12-08 23:20:08 +00:00
Glen	b21681931d	remove	2024-12-08 23:13:10 +00:00
Alex Cheema	f584e86d8e	get rid of lfs stuff	2024-12-08 22:55:19 +00:00
Alex Cheema	fd05bca1c8	lfs	2024-12-08 22:46:49 +00:00
Alex Cheema	cbac4d6a3e	git version	2024-12-08 22:44:32 +00:00
Alex Cheema	b0977f97ab	t	2024-12-08 22:43:23 +00:00
Glen	1716f637f7	test	2024-12-08 22:32:03 +00:00
Glen	903a5aabf7	fix	2024-12-08 22:26:44 +00:00
Glen	b4f86496ea	bootstrap	2024-12-08 22:23:28 +00:00
Alex Cheema	8e57f3385c	trigger test	2024-12-08 22:14:23 +00:00
Alex Cheema	3ccbdf19de	add DEBUG_DISCOVERY	2024-12-08 22:07:48 +00:00
Alex Cheema	3687ba18df	bench logs	2024-12-08 22:02:39 +00:00
Alex Cheema	6bb7c11bbb	enable debug	2024-12-08 21:54:24 +00:00
Glen	c8f93721c5	model matrix	2024-12-08 21:14:36 +00:00
Alex Cheema	fb8d87025f	t	2024-12-08 21:02:42 +00:00
Alex Cheema	87865f0cd9	list exo processes before test, warmup req in bench	2024-12-08 20:58:44 +00:00
Glen	755dd477dd	jobname	2024-12-08 20:37:50 +00:00
Alex Cheema	fb44eb086c	simplify bench	2024-12-08 20:30:07 +00:00
Alex Cheema	be8cbc0f56	trigger test	2024-12-08 19:28:55 +00:00
Glen	fe8074929f	fix	2024-12-08 19:08:47 +00:00
Glen	c3c80c61c9	name	2024-12-08 19:02:53 +00:00
Glen	c138de0875	job_name	2024-12-08 18:56:37 +00:00
Glen	38bd00390c	fix	2024-12-08 18:32:38 +00:00
Glen	732ba915aa	new_conf	2024-12-08 18:32:06 +00:00
Glen	785710355f	aws	2024-12-07 19:28:54 +00:00
Glen	320892dccc	maxtok	2024-12-07 19:28:54 +00:00
Glen	6dae3a4719	conf	2024-12-07 19:28:54 +00:00
Glen	7b77ef000e	flush	2024-12-07 19:28:54 +00:00
Glen	6c08b32350	nodebug	2024-12-07 19:28:54 +00:00
Glen	4dd617ad37	shorter	2024-12-07 19:28:54 +00:00
Glen	acdee16aee	debug	2024-12-07 19:28:54 +00:00
Glen	9fc33587da	path	2024-12-07 19:28:54 +00:00
Glen	f087c0ac99	fix	2024-12-07 19:28:54 +00:00
Glen	16b126d890	fix	2024-12-07 19:28:54 +00:00
Glen	faf0aaedba	jq	2024-12-07 19:28:54 +00:00
Glen	4cac1bb151	quotes	2024-12-07 19:28:54 +00:00
Glen	cb3c1477bb	fix	2024-12-07 19:28:54 +00:00
Glen	19a7d5a5cf	fix	2024-12-07 19:28:54 +00:00
Glen	f7e0348f62	activate	2024-12-07 19:28:54 +00:00
Glen	c3dfac60a6	debug	2024-12-07 19:28:54 +00:00
Glen	64954aacfe	fixed	2024-12-07 19:28:54 +00:00
Glen	ccc5415cc6	try	2024-12-07 19:28:54 +00:00
Glen	1dcc731b43	fix	2024-12-07 19:28:54 +00:00
Glen	3662ec402a	fix	2024-12-07 19:28:54 +00:00
Glen	0739dc9564	fix	2024-12-07 19:28:54 +00:00
Glen	d16280ddfc	debug	2024-12-07 19:28:54 +00:00
Glen	f9c23617a7	fix3	2024-12-07 19:28:54 +00:00
Glen	ce2ccddc93	fix2	2024-12-07 19:28:54 +00:00
Glen	1af28cb5a1	fix	2024-12-07 19:28:54 +00:00
Glen	6b61fc6660	tweak python install	2024-12-07 19:28:54 +00:00
Glen	bdf417f25e	tweak	2024-12-07 19:28:54 +00:00
Glen	d154d37ac4	add exo run	2024-12-07 19:28:54 +00:00
Glen	90fd5c13a4	matrix	2024-12-07 19:28:54 +00:00
Glen	7d223a0095	matrix	2024-12-07 19:28:54 +00:00
Glen	cb3d89eb48	test runner	2024-12-07 19:28:54 +00:00
Glen	8302fd0aae	test runner	2024-12-07 19:28:54 +00:00
Alex Cheema	deb80d2577	clang for tinygrad	2024-12-07 19:28:54 +00:00
Alex Cheema	976e5f2fdb	disable mlx test for now..plan to run this on a self-hosted runner	2024-12-07 19:28:54 +00:00
Alex Cheema	9dc76ef03b	tooonygrad	2024-12-07 19:28:54 +00:00
Alex Cheema	32cd1f1d72	give this a goh	2024-12-07 19:28:54 +00:00
Alex Cheema	6b54188140	cond	2024-12-07 19:28:54 +00:00
Alex Cheema	58bcf5b429	check discovery on integration tests too	2024-12-07 19:28:54 +00:00
Alex Cheema	3c0297c3e9	more robust discovery log check	2024-12-07 19:28:54 +00:00
Alex Cheema	8d433e6579	run tinygrad and discovery integratrion tests on linux	2024-12-07 19:28:54 +00:00
Alex Cheema	676125bfe6	job	2024-12-07 19:28:54 +00:00
Alex Cheema	902e0d35e1	github env vars	2024-12-07 19:28:54 +00:00
Alex Cheema	972aea446c	macos 15	2024-12-07 19:28:53 +00:00
Alex Cheema	0d0338f871	migrate from circleci to github actions	2024-12-07 19:28:53 +00:00
Alex Cheema	f94c9067e2	trigger test	2024-12-04 03:09:12 +00:00
Alex Cheema	f0bb515d1d	trigger test	2024-12-02 11:20:21 +00:00
Alex Cheema	71db641fe4	trigger test	2024-12-02 04:11:43 +00:00
Alex Cheema	f339f74fe3	trigger test	2024-12-01 17:39:53 +00:00
Alex Cheema	7dc0a7467b	trigger test	2024-12-01 14:31:23 +00:00