python: add hermetic basedpyright typecheck to nix flake check

The existing CI typecheck job used `uv run basedpyright` which depends on a non-hermetic uv sync step. This replaces it with a fully hermetic typecheck as a Nix flake check using the uv2nix virtual environment. Added a `typecheckVenv` with dev dependencies, a `linuxOverlay` to ignore native shared library deps (NVIDIA, torch, triton, mlx) that aren't needed at type-check time, and `passthru` preservation plus `.pyi` stub copying on the `exo-pyo3-bindings` overlay so basedpyright can resolve the Rust bindings types. Also guarded the `mlx` Nix build override to macOS only since it requires Metal. Removed the old non-hermetic `typecheck` CI job since `nix flake check` now covers it. The hermetic check ensures type checking uses exactly the locked dependency versions and catches type errors without requiring a working uv/pip environment. Test plan: - CI (`nix flake check` runs on x86_64-linux, aarch64-linux, aarch64-darwin) - Verified `nix build ".#checks.x86_64-linux.typecheck"` passes with 0 errors
2026-02-14 16:15:43 -05:00 · 2026-02-14 14:21:49 +00:00
21 changed files with 43 additions and 998 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,15 +0,0 @@
-.venv/
-.direnv/
-target/
-.git/
-.idea/
-.pytest_cache/
-.ruff_cache/
-dashboard/node_modules/
-dashboard/.svelte-kit/
-dashboard/build/
-dist/
-*.pdb
-**/__pycache__
-**/.DS_Store
-.mlx_typings/
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -1,42 +0,0 @@
-name: e2e-tests
-
-on:
-  push:
-  pull_request:
-    branches:
-      - staging
-      - main
-
-jobs:
-  e2e:
-    runs-on: ubuntu-latest
-    timeout-minutes: 45
-    steps:
-      - name: Free up disk space
-        run: |
-          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
-            /opt/hostedtoolcache /usr/local/share/boost /usr/share/swift \
-            /opt/microsoft /opt/az
-          docker system prune -af
-          df -h /
-
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          lfs: false
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Build E2E image with cache
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: e2e/Dockerfile
-          tags: exo-e2e:latest
-          load: true
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-
-      - name: Run E2E tests
-        run: python3 e2e/run_all.py
--- a/.github/workflows/pipeline.yml
+++ b/.github/workflows/pipeline.yml
@@ -8,33 +8,6 @@ on:
      - main

 jobs:
-  typecheck:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          lfs: false
-
-      - uses: cachix/install-nix-action@v31
-        with:
-          nix_path: nixpkgs=channel:nixos-unstable
-
-      - uses: cachix/cachix-action@v14
-        name: Configure Cachix
-        with:
-          name: exo
-          authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}"
-
-      - name: Load nix develop environment
-        run: nix run github:nicknovitski/nix-develop/v1
-
-      - name: Sync dependencies
-        run: uv sync --all-packages
-
-      - name: Run type checker
-        run: uv run basedpyright --project pyproject.toml
-
  nix:
    name: Build and check (${{ matrix.system }})
    runs-on: ${{ matrix.runner }}
--- a/conftest.py
+++ b/conftest.py
@@ -1 +0,0 @@
-collect_ignore = ["tests/start_distributed_test.py"]
--- a/e2e/Dockerfile
+++ b/e2e/Dockerfile
@@ -1,58 +0,0 @@
-# Stage 1: Build the dashboard
-FROM node:22-slim AS dashboard
-WORKDIR /app/dashboard
-COPY dashboard/package.json dashboard/package-lock.json ./
-RUN npm ci
-COPY dashboard/ .
-RUN npm run build
-
-# Stage 2: Build and run exo
-FROM python:3.13-slim
-
-# Install system dependencies
-# libblas-dev/liblapack-dev/liblapacke-dev are required by MLX CPU backend on Linux
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    pkg-config \
-    libssl-dev \
-    libblas-dev \
-    liblapack-dev \
-    liblapacke-dev \
-    curl \
-    protobuf-compiler \
-    iptables \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install Rust nightly
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain nightly
-ENV PATH="/root/.cargo/bin:${PATH}"
-
-# Wrap g++ with -fpermissive to fix MLX CPU JIT compilation with GCC 14
-# (GCC 14 treats _Float128/_Float32/_Float64 as built-in types, conflicting with MLX-generated code)
-# Must be done BEFORE uv sync so any source builds also get the fix
-RUN mv /usr/bin/g++ /usr/bin/g++.real && \
-    printf '#!/bin/sh\nexec /usr/bin/g++.real -fpermissive "$@"\n' > /usr/bin/g++ && \
-    chmod +x /usr/bin/g++
-
-# Install uv
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
-
-WORKDIR /app
-
-# Copy dependency files first for better layer caching
-COPY pyproject.toml Cargo.toml uv.lock README.md ./
-COPY rust/ ./rust/
-COPY bench/pyproject.toml ./bench/pyproject.toml
-
-# Copy source and resources
-COPY src/ ./src/
-COPY resources/ ./resources/
-
-# Copy built dashboard from stage 1
-COPY --from=dashboard /app/dashboard/build ./dashboard/build/
-
-# Install Python deps and build Rust bindings, then clean up build artifacts
-# to keep the layer small (Rust target/ and cargo registry can be 1-2 GB)
-RUN uv sync && rm -rf /app/rust/target /root/.cargo/registry /root/.cargo/git
-
-CMD [".venv/bin/exo", "-v"]
--- a/e2e/conftest.py
+++ b/e2e/conftest.py
@@ -1,195 +0,0 @@
-"""Shared E2E test infrastructure for exo cluster tests."""
-
-import asyncio
-import json
-import os
-import sys
-from pathlib import Path
-from urllib.error import URLError
-from urllib.request import Request, urlopen
-
-E2E_DIR = Path(__file__).parent.resolve()
-TIMEOUT = int(os.environ.get("E2E_TIMEOUT", "120"))
-
-
-class Cluster:
-    """Async wrapper around a docker compose exo cluster."""
-
-    def __init__(self, name: str, overrides: list[str] | None = None):
-        self.name = name
-        self.project = f"e2e-{name}"
-        compose_files = [str(E2E_DIR / "docker-compose.yml")]
-        for path in overrides or []:
-            compose_files.append(str(E2E_DIR / path))
-        self._compose_base = [
-            "docker",
-            "compose",
-            "-p",
-            self.project,
-            *[arg for f in compose_files for arg in ("-f", f)],
-        ]
-
-    async def __aenter__(self):
-        return self
-
-    async def __aexit__(self, *exc):
-        await self.stop()
-
-    async def _run(self, *args: str, check: bool = True) -> str:
-        proc = await asyncio.create_subprocess_exec(
-            *self._compose_base,
-            *args,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.STDOUT,
-        )
-        stdout, _ = await proc.communicate()
-        output = stdout.decode()
-        if check and proc.returncode != 0:
-            print(output, file=sys.stderr)
-            raise RuntimeError(
-                f"docker compose {' '.join(args)} failed (rc={proc.returncode})"
-            )
-        return output
-
-    async def build(self):
-        # Skip build if the image was pre-built (e.g. in CI with buildx cache)
-        proc = await asyncio.create_subprocess_exec(
-            "docker",
-            "image",
-            "inspect",
-            "exo-e2e:latest",
-            stdout=asyncio.subprocess.DEVNULL,
-            stderr=asyncio.subprocess.DEVNULL,
-        )
-        await proc.wait()
-        if proc.returncode == 0:
-            print("  Using pre-built image (exo-e2e:latest)")
-            return
-        print("  Building images...")
-        await self._run("build", "--quiet")
-
-    async def start(self):
-        print("  Starting cluster...")
-        await self._run("up", "-d")
-
-    async def stop(self):
-        print("  Cleaning up...")
-        await self._run("down", "--timeout", "5", check=False)
-
-    async def logs(self) -> str:
-        return await self._run("logs", check=False)
-
-    async def exec(
-        self, service: str, *cmd: str, check: bool = True
-    ) -> tuple[int, str]:
-        """Run a command inside a running container. Returns (returncode, output)."""
-        proc = await asyncio.create_subprocess_exec(
-            *self._compose_base,
-            "exec",
-            "-T",
-            service,
-            *cmd,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.STDOUT,
-        )
-        stdout, _ = await proc.communicate()
-        output = stdout.decode()
-        if check and proc.returncode != 0:
-            raise RuntimeError(
-                f"exec {' '.join(cmd)} in {service} failed (rc={proc.returncode})"
-            )
-        return proc.returncode, output
-
-    async def wait_for(self, description: str, check_fn, timeout: int = TIMEOUT):
-        """Poll check_fn every 2s until it returns True or timeout expires."""
-        print(f"  Waiting for {description}...")
-        deadline = asyncio.get_event_loop().time() + timeout
-        while asyncio.get_event_loop().time() < deadline:
-            if await check_fn():
-                print(f"  {description}")
-                return
-            await asyncio.sleep(2)
-        output = await self.logs()
-        print(f"--- cluster logs ---\n{output}\n---", file=sys.stderr)
-        raise TimeoutError(f"Timed out waiting for {description}")
-
-    async def assert_healthy(self):
-        """Verify the cluster formed correctly: nodes started, discovered each other, elected a master, API responds."""
-
-        async def both_nodes_started():
-            log = await self.logs()
-            return log.count("Starting node") >= 2
-
-        async def nodes_discovered():
-            log = await self.logs()
-            return log.count("ConnectionMessageType.Connected") >= 2
-
-        async def master_elected():
-            log = await self.logs()
-            return "demoting self" in log
-
-        async def api_responding():
-            try:
-                with urlopen("http://localhost:52415/v1/models", timeout=3) as resp:
-                    return resp.status == 200
-            except (URLError, OSError):
-                return False
-
-        await self.wait_for("Both nodes started", both_nodes_started)
-        await self.wait_for("Nodes discovered each other", nodes_discovered)
-        await self.wait_for("Master election resolved", master_elected)
-        await self.wait_for("API responding", api_responding)
-
-    async def _api(
-        self, method: str, path: str, body: dict | None = None, timeout: int = 30
-    ) -> dict:
-        """Make an API request to the cluster. Returns parsed JSON."""
-        url = f"http://localhost:52415{path}"
-        data = json.dumps(body).encode() if body else None
-        req = Request(
-            url, data=data, headers={"Content-Type": "application/json"}, method=method
-        )
-        loop = asyncio.get_event_loop()
-        resp_bytes = await loop.run_in_executor(
-            None, lambda: urlopen(req, timeout=timeout).read()
-        )
-        return json.loads(resp_bytes)
-
-    async def place_model(self, model: str, timeout: int = 600):
-        """Place a model instance on the cluster (triggers download) and wait until it's ready."""
-        await self._api("POST", "/place_instance", {"model_id": model})
-
-        async def model_ready():
-            try:
-                resp = await self._api("GET", "/v1/models")
-                return any(m.get("id") == model for m in resp.get("data", []))
-            except Exception:
-                return False
-
-        await self.wait_for(f"Model {model} ready", model_ready, timeout=timeout)
-
-    async def chat(
-        self, model: str, messages: list[dict], timeout: int = 600, **kwargs
-    ) -> dict:
-        """Send a chat completion request. Retries until model is downloaded and inference completes."""
-        body = json.dumps({"model": model, "messages": messages, **kwargs}).encode()
-        deadline = asyncio.get_event_loop().time() + timeout
-        last_error = None
-
-        while asyncio.get_event_loop().time() < deadline:
-            try:
-                req = Request(
-                    "http://localhost:52415/v1/chat/completions",
-                    data=body,
-                    headers={"Content-Type": "application/json"},
-                )
-                loop = asyncio.get_event_loop()
-                resp_bytes = await loop.run_in_executor(
-                    None, lambda r=req: urlopen(r, timeout=300).read()
-                )
-                return json.loads(resp_bytes)
-            except Exception as e:
-                last_error = e
-                await asyncio.sleep(5)
-
-        raise TimeoutError(f"Chat request failed after {timeout}s: {last_error}")
--- a/e2e/docker-compose.yml
+++ b/e2e/docker-compose.yml
@@ -1,20 +0,0 @@
-services:
-  exo-node-1:
-    image: exo-e2e:latest
-    build:
-      context: ..
-      dockerfile: e2e/Dockerfile
-    environment:
-      - EXO_LIBP2P_NAMESPACE=docker-e2e
-    command: [".venv/bin/exo", "-v"]
-    ports:
-      - "52415:52415"
-
-  exo-node-2:
-    image: exo-e2e:latest
-    build:
-      context: ..
-      dockerfile: e2e/Dockerfile
-    environment:
-      - EXO_LIBP2P_NAMESPACE=docker-e2e
-    command: [".venv/bin/exo", "-v"]
--- a/e2e/run_all.py
+++ b/e2e/run_all.py
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-"""Discovers and runs all E2E tests in e2e/test_*.py.
-
-Tests with '# slow' on the first line of their docstring are skipped
-unless --slow is passed or E2E_SLOW=1 is set.
-"""
-
-import os
-import subprocess
-import sys
-from pathlib import Path
-
-E2E_DIR = Path(__file__).parent.resolve()
-
-
-def is_slow(test_file: Path) -> bool:
-    """Check if the test file is marked as slow (has '# slow' in first 3 lines)."""
-    with open(test_file) as f:
-        for line in f:
-            if line.strip().startswith("#"):
-                continue
-            if line.strip().startswith('"""') or line.strip().startswith("'''"):
-                # Read into the docstring
-                for doc_line in f:
-                    if "slow" in doc_line.lower() and doc_line.strip().startswith(
-                        "slow"
-                    ):
-                        return True
-                    if '"""' in doc_line or "'''" in doc_line:
-                        break
-            break
-    return False
-
-
-def main():
-    run_slow = "--slow" in sys.argv or os.environ.get("E2E_SLOW") == "1"
-    if "--update-snapshots" in sys.argv:
-        os.environ["UPDATE_SNAPSHOTS"] = "1"
-    test_files = sorted(E2E_DIR.glob("test_*.py"))
-    if not test_files:
-        print("No test files found")
-        sys.exit(1)
-
-    passed = 0
-    failed = 0
-    skipped = 0
-    failures = []
-
-    for test_file in test_files:
-        name = test_file.stem
-        if is_slow(test_file) and not run_slow:
-            print(f"=== {name} === SKIPPED (slow, use --slow to run)")
-            skipped += 1
-            continue
-
-        print(f"=== {name} ===")
-        result = subprocess.run([sys.executable, str(test_file)])
-        if result.returncode == 0:
-            passed += 1
-        else:
-            failed += 1
-            failures.append(name)
-        print()
-
-    total = passed + failed + skipped
-    print("================================")
-    print(
-        f"{passed}/{total} tests passed" + (f", {skipped} skipped" if skipped else "")
-    )
-
-    if failed:
-        print(f"Failed: {' '.join(failures)}")
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
--- a/e2e/snapshot.py
+++ b/e2e/snapshot.py
@@ -1,69 +0,0 @@
-"""Snapshot testing infrastructure for E2E tests.
-
-Provides deterministic regression testing by comparing inference output
-against saved snapshots. On first run, snapshots are created automatically.
-Set UPDATE_SNAPSHOTS=1 to regenerate snapshots when output intentionally changes.
-
-Snapshots are stored per-architecture (e.g. snapshots/x86_64/, snapshots/arm64/)
-since floating-point results differ between CPU architectures.
-"""
-
-import difflib
-import json
-import os
-import platform
-from pathlib import Path
-
-ARCH = platform.machine()
-SNAPSHOTS_DIR = Path(__file__).parent / "snapshots" / ARCH
-
-
-def assert_snapshot(
-    name: str,
-    content: str,
-    metadata: dict,
-) -> None:
-    """Compare content against a saved snapshot, or create one if missing.
-
-    Args:
-        name: Snapshot identifier (used as filename: snapshots/{arch}/{name}.json).
-        content: The actual inference output to compare.
-        metadata: Additional context stored alongside content (model, seed, etc.).
-                  Not used for comparison -- purely documentary.
-
-    Raises:
-        AssertionError: If content doesn't match the saved snapshot.
-
-    Environment:
-        UPDATE_SNAPSHOTS=1: Overwrite existing snapshot with actual content.
-    """
-    snapshot_file = SNAPSHOTS_DIR / f"{name}.json"
-    update = os.environ.get("UPDATE_SNAPSHOTS") == "1"
-
-    if snapshot_file.exists() and not update:
-        snapshot = json.loads(snapshot_file.read_text())
-        expected = snapshot["content"]
-        if content != expected:
-            diff = "\n".join(
-                difflib.unified_diff(
-                    expected.splitlines(),
-                    content.splitlines(),
-                    fromfile=f"expected ({snapshot_file.relative_to(SNAPSHOTS_DIR.parent.parent)})",
-                    tofile="actual",
-                    lineterm="",
-                )
-            )
-            raise AssertionError(
-                f"Snapshot mismatch for '{name}' on {ARCH}!\n\n"
-                f"{diff}\n\n"
-                f"Expected: {expected!r}\n"
-                f"Actual:   {content!r}\n\n"
-                f"To update: UPDATE_SNAPSHOTS=1 python3 e2e/run_all.py"
-            )
-        print(f"  Output matches snapshot ({ARCH}/{snapshot_file.name})")
-    else:
-        SNAPSHOTS_DIR.mkdir(parents=True, exist_ok=True)
-        snapshot_data = {**metadata, "arch": ARCH, "content": content}
-        snapshot_file.write_text(json.dumps(snapshot_data, indent=2) + "\n")
-        action = "Updated" if update else "Created"
-        print(f"  {action} snapshot: {ARCH}/{snapshot_file.name}")
--- a/e2e/test_cluster_formation.py
+++ b/e2e/test_cluster_formation.py
@@ -1,22 +0,0 @@
-"""Test: Basic cluster formation.
-
-Verifies two nodes discover each other, elect a master, and the API responds.
-"""
-
-import asyncio
-import sys
-
-sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent))
-from conftest import Cluster
-
-
-async def main():
-    async with Cluster("cluster_formation") as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-        print("PASSED: cluster_formation")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/test_inference_snapshot.py
+++ b/e2e/test_inference_snapshot.py
@@ -1,60 +0,0 @@
-"""Test: Deterministic inference output (snapshot test).
-
-Sends a chat completion request with a fixed seed,
-then verifies the output matches a known-good snapshot. This ensures
-inference produces consistent results across runs.
-
-Uses MLX CPU backend in Docker on x86 Linux.
-"""
-
-import asyncio
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent))
-from snapshot import assert_snapshot
-
-from conftest import Cluster
-
-MODEL = "mlx-community/Qwen3-0.6B-4bit"
-SEED = 42
-PROMPT = "What is 2+2? Reply with just the number."
-MAX_TOKENS = 32
-
-
-async def main():
-    async with Cluster("inference_snapshot") as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-
-        print(f"  Launching model {MODEL}...")
-        await cluster.place_model(MODEL)
-
-        print(f"  Sending chat completion (seed={SEED})...")
-        resp = await cluster.chat(
-            model=MODEL,
-            messages=[{"role": "user", "content": PROMPT}],
-            seed=SEED,
-            max_tokens=MAX_TOKENS,
-        )
-
-        content = resp["choices"][0]["message"]["content"]
-        print(f"  Response: {content!r}")
-
-        assert_snapshot(
-            name="inference_snapshot",
-            content=content,
-            metadata={
-                "model": MODEL,
-                "seed": SEED,
-                "prompt": PROMPT,
-                "max_tokens": MAX_TOKENS,
-            },
-        )
-
-        print("PASSED: inference_snapshot")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/test_no_internet.py
+++ b/e2e/test_no_internet.py
@@ -1,47 +0,0 @@
-"""Test: Cluster works without internet access.
-
-Verifies exo functions correctly when containers can talk to each other
-but cannot reach the internet. Uses iptables to block all outbound traffic
-except private subnets and multicast (for mDNS discovery).
-"""
-
-import asyncio
-import sys
-
-sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent))
-from conftest import Cluster
-
-
-async def main():
-    async with Cluster(
-        "no_internet",
-        overrides=["tests/no_internet/docker-compose.override.yml"],
-    ) as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-
-        # Verify internet is actually blocked from inside the containers
-        for node in ["exo-node-1", "exo-node-2"]:
-            rc, _ = await cluster.exec(
-                node,
-                "curl",
-                "-sf",
-                "--max-time",
-                "3",
-                "https://huggingface.co",
-                check=False,
-            )
-            assert rc != 0, f"{node} should not be able to reach the internet"
-            print(f"  {node}: internet correctly blocked")
-
-        # Verify exo detected no internet connectivity
-        log = await cluster.logs()
-        assert "Internet connectivity: False" in log, "exo should detect no internet"
-        print("  exo correctly detected no internet connectivity")
-
-        print("PASSED: no_internet")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/test_snapshot_code_gen.py
+++ b/e2e/test_snapshot_code_gen.py
@@ -1,58 +0,0 @@
-"""Test: Code generation snapshot.
-
-Verifies deterministic output for a code generation prompt.
-"""
-
-import asyncio
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent))
-from snapshot import assert_snapshot
-
-from conftest import Cluster
-
-MODEL = "mlx-community/Qwen3-0.6B-4bit"
-SEED = 42
-PROMPT = (
-    "Write a Python function to reverse a string. Only output the code, no explanation."
-)
-MAX_TOKENS = 64
-
-
-async def main():
-    async with Cluster("snapshot_code_gen") as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-
-        print(f"  Launching model {MODEL}...")
-        await cluster.place_model(MODEL)
-
-        print(f"  Sending chat completion (seed={SEED})...")
-        resp = await cluster.chat(
-            model=MODEL,
-            messages=[{"role": "user", "content": PROMPT}],
-            seed=SEED,
-            max_tokens=MAX_TOKENS,
-        )
-
-        content = resp["choices"][0]["message"]["content"]
-        print(f"  Response: {content!r}")
-
-        assert_snapshot(
-            name="snapshot_code_gen",
-            content=content,
-            metadata={
-                "model": MODEL,
-                "seed": SEED,
-                "prompt": PROMPT,
-                "max_tokens": MAX_TOKENS,
-            },
-        )
-
-        print("PASSED: snapshot_code_gen")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/test_snapshot_edge.py
+++ b/e2e/test_snapshot_edge.py
@@ -1,63 +0,0 @@
-"""Test: Edge case snapshots.
-
-Verifies deterministic output for edge-case prompts: single word input,
-special characters, and unicode.
-"""
-
-import asyncio
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent))
-from snapshot import assert_snapshot
-
-from conftest import Cluster
-
-MODEL = "mlx-community/Qwen3-0.6B-4bit"
-SEED = 42
-MAX_TOKENS = 32
-
-CASES = [
-    ("edge_single_word", "Hi"),
-    ("edge_special_chars", "What does 2 * (3 + 4) / 7 - 1 equal? Use <math> tags."),
-    ("edge_unicode", "Translate 'hello' to Japanese, Chinese, and Korean."),
-]
-
-
-async def main():
-    async with Cluster("snapshot_edge") as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-
-        print(f"  Launching model {MODEL}...")
-        await cluster.place_model(MODEL)
-
-        for snapshot_name, prompt in CASES:
-            print(f"  [{snapshot_name}] Sending: {prompt!r}")
-            resp = await cluster.chat(
-                model=MODEL,
-                messages=[{"role": "user", "content": prompt}],
-                seed=SEED,
-                max_tokens=MAX_TOKENS,
-            )
-
-            content = resp["choices"][0]["message"]["content"]
-            print(f"  [{snapshot_name}] Response: {content!r}")
-
-            assert_snapshot(
-                name=snapshot_name,
-                content=content,
-                metadata={
-                    "model": MODEL,
-                    "seed": SEED,
-                    "prompt": prompt,
-                    "max_tokens": MAX_TOKENS,
-                },
-            )
-
-        print("PASSED: snapshot_edge")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/test_snapshot_long_output.py
+++ b/e2e/test_snapshot_long_output.py
@@ -1,56 +0,0 @@
-"""Test: Longer output snapshot.
-
-Verifies deterministic output with a higher max_tokens (128).
-"""
-
-import asyncio
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent))
-from snapshot import assert_snapshot
-
-from conftest import Cluster
-
-MODEL = "mlx-community/Qwen3-0.6B-4bit"
-SEED = 42
-PROMPT = "Explain how a binary search algorithm works."
-MAX_TOKENS = 128
-
-
-async def main():
-    async with Cluster("snapshot_long_output") as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-
-        print(f"  Launching model {MODEL}...")
-        await cluster.place_model(MODEL)
-
-        print(f"  Sending chat completion (seed={SEED}, max_tokens={MAX_TOKENS})...")
-        resp = await cluster.chat(
-            model=MODEL,
-            messages=[{"role": "user", "content": PROMPT}],
-            seed=SEED,
-            max_tokens=MAX_TOKENS,
-        )
-
-        content = resp["choices"][0]["message"]["content"]
-        print(f"  Response: {content!r}")
-
-        assert_snapshot(
-            name="snapshot_long_output",
-            content=content,
-            metadata={
-                "model": MODEL,
-                "seed": SEED,
-                "prompt": PROMPT,
-                "max_tokens": MAX_TOKENS,
-            },
-        )
-
-        print("PASSED: snapshot_long_output")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/test_snapshot_multi_model.py
+++ b/e2e/test_snapshot_multi_model.py
@@ -1,72 +0,0 @@
-"""Test: Multi-model snapshot tests.
-slow
-
-Verifies deterministic output across different model architectures to catch
-model-specific regressions. Each model uses its own snapshot file.
-Run with: python3 e2e/run_all.py --slow  or  E2E_SLOW=1 python3 e2e/run_all.py
-"""
-
-import asyncio
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent))
-from snapshot import assert_snapshot
-
-from conftest import Cluster
-
-SEED = 42
-PROMPT = "What is the capital of France?"
-MAX_TOKENS = 32
-
-MODELS = [
-    "mlx-community/SmolLM2-135M-Instruct",
-    "mlx-community/Llama-3.2-1B-Instruct-4bit",
-    "mlx-community/gemma-2-2b-it-4bit",
-]
-
-
-async def main():
-    async with Cluster("snapshot_multi_model") as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-
-        for model in MODELS:
-            short_name = (
-                model.split("/")[-1].lower().replace("-", "_").replace(".", "_")
-            )
-            snapshot_name = f"snapshot_multi_{short_name}"
-
-            print(f"  Launching model {model}...")
-            await cluster.place_model(model)
-
-            print(f"  Sending chat completion (seed={SEED})...")
-            resp = await cluster.chat(
-                model=model,
-                messages=[{"role": "user", "content": PROMPT}],
-                seed=SEED,
-                max_tokens=MAX_TOKENS,
-            )
-
-            content = resp["choices"][0]["message"]["content"]
-            print(f"  [{short_name}] Response: {content!r}")
-
-            assert_snapshot(
-                name=snapshot_name,
-                content=content,
-                metadata={
-                    "model": model,
-                    "seed": SEED,
-                    "prompt": PROMPT,
-                    "max_tokens": MAX_TOKENS,
-                },
-            )
-
-            print(f"  [{short_name}] PASSED")
-
-        print("PASSED: snapshot_multi_model")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/test_snapshot_reasoning.py
+++ b/e2e/test_snapshot_reasoning.py
@@ -1,56 +0,0 @@
-"""Test: Reasoning/math snapshot.
-
-Verifies deterministic output for a simple reasoning prompt.
-"""
-
-import asyncio
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent))
-from snapshot import assert_snapshot
-
-from conftest import Cluster
-
-MODEL = "mlx-community/Qwen3-0.6B-4bit"
-SEED = 42
-PROMPT = "If I have 3 apples and give away 1, how many do I have? Think step by step."
-MAX_TOKENS = 64
-
-
-async def main():
-    async with Cluster("snapshot_reasoning") as cluster:
-        await cluster.build()
-        await cluster.start()
-        await cluster.assert_healthy()
-
-        print(f"  Launching model {MODEL}...")
-        await cluster.place_model(MODEL)
-
-        print(f"  Sending chat completion (seed={SEED})...")
-        resp = await cluster.chat(
-            model=MODEL,
-            messages=[{"role": "user", "content": PROMPT}],
-            seed=SEED,
-            max_tokens=MAX_TOKENS,
-        )
-
-        content = resp["choices"][0]["message"]["content"]
-        print(f"  Response: {content!r}")
-
-        assert_snapshot(
-            name="snapshot_reasoning",
-            content=content,
-            metadata={
-                "model": MODEL,
-                "seed": SEED,
-                "prompt": PROMPT,
-                "max_tokens": MAX_TOKENS,
-            },
-        )
-
-        print("PASSED: snapshot_reasoning")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/e2e/tests/no_internet/docker-compose.override.yml
+++ b/e2e/tests/no_internet/docker-compose.override.yml
@@ -1,32 +0,0 @@
-# Block all outbound internet traffic using iptables while preserving:
-#   - Multicast (224.0.0.0/4) for mDNS peer discovery
-#   - Private subnets (10/8, 172.16/12, 192.168/16) for inter-container communication
-#   - Loopback (127/8)
-# Requires NET_ADMIN capability for iptables.
-services:
-  exo-node-1:
-    cap_add:
-      - NET_ADMIN
-    entrypoint: ["/bin/sh", "-c"]
-    command:
-      - |
-        iptables -A OUTPUT -d 127.0.0.0/8 -j ACCEPT
-        iptables -A OUTPUT -d 10.0.0.0/8 -j ACCEPT
-        iptables -A OUTPUT -d 172.16.0.0/12 -j ACCEPT
-        iptables -A OUTPUT -d 192.168.0.0/16 -j ACCEPT
-        iptables -A OUTPUT -d 224.0.0.0/4 -j ACCEPT
-        iptables -A OUTPUT -j REJECT
-        exec .venv/bin/exo -v
-  exo-node-2:
-    cap_add:
-      - NET_ADMIN
-    entrypoint: ["/bin/sh", "-c"]
-    command:
-      - |
-        iptables -A OUTPUT -d 127.0.0.0/8 -j ACCEPT
-        iptables -A OUTPUT -d 10.0.0.0/8 -j ACCEPT
-        iptables -A OUTPUT -d 172.16.0.0/12 -j ACCEPT
-        iptables -A OUTPUT -d 192.168.0.0/16 -j ACCEPT
-        iptables -A OUTPUT -d 224.0.0.0/4 -j ACCEPT
-        iptables -A OUTPUT -j REJECT
-        exec .venv/bin/exo -v
--- a/python/parts.nix
+++ b/python/parts.nix
@@ -14,7 +14,9 @@

      # Override overlay to inject Nix-built components
      exoOverlay = final: prev: {
-        # Replace workspace exo_pyo3_bindings with Nix-built wheel
+        # Replace workspace exo_pyo3_bindings with Nix-built wheel.
+        # Preserve passthru so mkVirtualEnv can resolve dependency groups.
+        # Copy .pyi stub + py.typed marker so basedpyright can find the types.
        exo-pyo3-bindings = pkgs.stdenv.mkDerivation {
          pname = "exo-pyo3-bindings";
          version = "0.1.0";
@@ -22,6 +24,12 @@
          # Install from pre-built wheel
          nativeBuildInputs = [ final.pyprojectWheelHook ];
          dontStrip = true;
+          passthru = prev.exo-pyo3-bindings.passthru or { };
+          postInstall = ''
+            local siteDir=$out/${final.python.sitePackages}/exo_pyo3_bindings
+            cp ${inputs.self}/rust/exo_pyo3_bindings/exo_pyo3_bindings.pyi $siteDir/
+            touch $siteDir/py.typed
+          '';
        };
      };

@@ -29,17 +37,32 @@

      # Overlay to provide build systems and custom packages
      buildSystemsOverlay = final: prev: {
-        # Use our pure Nix-built MLX with Metal support
-        mlx = self'.packages.mlx;
-
        # mlx-lm is a git dependency that needs setuptools
        mlx-lm = prev.mlx-lm.overrideAttrs (old: {
          nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [
            final.setuptools
          ];
        });
+      } // lib.optionalAttrs pkgs.stdenv.hostPlatform.isDarwin {
+        # Use our pure Nix-built MLX with Metal support (macOS only)
+        mlx = self'.packages.mlx;
      };

+      # Additional overlay for Linux-specific fixes (type checking env).
+      # Native wheels have shared lib dependencies we don't need at type-check time.
+      linuxOverlay = final: prev:
+        let
+          ignoreMissing = drv: drv.overrideAttrs { autoPatchelfIgnoreMissingDeps = [ "*" ]; };
+          nvidiaPackages = lib.filterAttrs (name: _: lib.hasPrefix "nvidia-" name) prev;
+        in
+        lib.optionalAttrs pkgs.stdenv.hostPlatform.isLinux (
+          (lib.mapAttrs (_: ignoreMissing) nvidiaPackages) // {
+            mlx = ignoreMissing prev.mlx;
+            torch = ignoreMissing prev.torch;
+            triton = ignoreMissing prev.triton;
+          }
+        );
+
      pythonSet = (pkgs.callPackage inputs.pyproject-nix.build.packages {
        inherit python;
      }).overrideScope (
@@ -48,6 +71,7 @@
          overlay
          exoOverlay
          buildSystemsOverlay
+          linuxOverlay
        ]
      );
      exoVenv = pythonSet.mkVirtualEnv "exo-env" workspace.deps.default;
@@ -118,6 +142,21 @@
          ${pkgs.ruff}/bin/ruff check ${inputs.self}
          touch $out
        '';
+
+        # Hermetic basedpyright type checking
+        typecheck = pkgs.runCommand "typecheck"
+          {
+            nativeBuildInputs = [
+              testVenv
+              pkgs.basedpyright
+            ];
+          }
+          ''
+            cd ${inputs.self}
+            export HOME=$TMPDIR
+            basedpyright --pythonpath ${testVenv}/bin/python
+            touch $out
+          '';
      };
    };
 }
--- a/resources/inference_model_cards/mlx-community--SmolLM2-135M-Instruct.toml
+++ b/resources/inference_model_cards/mlx-community--SmolLM2-135M-Instruct.toml
@@ -1,12 +0,0 @@
-model_id = "mlx-community/SmolLM2-135M-Instruct"
-n_layers = 30
-hidden_size = 576
-supports_tensor = true
-tasks = ["TextGeneration"]
-family = "llama"
-quantization = "bf16"
-base_model = "SmolLM2 135M"
-capabilities = ["text"]
-
-[storage_size]
-in_bytes = 269060381
--- a/resources/inference_model_cards/mlx-community--gemma-2-2b-it-4bit.toml
+++ b/resources/inference_model_cards/mlx-community--gemma-2-2b-it-4bit.toml
@@ -1,12 +0,0 @@
-model_id = "mlx-community/gemma-2-2b-it-4bit"
-n_layers = 26
-hidden_size = 2304
-supports_tensor = false
-tasks = ["TextGeneration"]
-family = "gemma2"
-quantization = "4bit"
-base_model = "Gemma 2 2B"
-capabilities = ["text"]
-
-[storage_size]
-in_bytes = 1492755242
				`@@ -1 +0,0 @@`
				`collect_ignore = ["tests/start_distributed_test.py"]`