aaa

switch from synchronous threaded pinging to an async implementation (#1170 )
still seeing churn in our networking - lets properly rate limit it ## changes added an httpx client with max connections with a persistent AsyncClient ## testing deployed on cluster, discovery VASTLY more stable (the only deleted edges were those discovered by mdns)
2026-01-16 09:59:43 -05:00 · 2026-01-16 14:20:34 +00:00 · 2026-01-16 13:20:03 +00:00 · 2026-01-16 13:10:50 +00:00 · 2026-01-16 12:39:22 +00:00 · 2026-01-16 12:34:28 +00:00
19 changed files with 822 additions and 1344 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -30,17 +30,14 @@ uv run pytest src/exo/shared/tests/test_election.py
 # Run a specific test function
 uv run pytest src/exo/shared/tests/test_election.py::test_function_name

-# Type checking (strict mode) - MUST pass before committing
-uv run basedpyright --project pyproject.toml
+# Type checking (strict mode)
+uv run basedpyright

 # Linting
 uv run ruff check

 # Format code (using nix)
 nix fmt
-
-# Run all checks (do this before committing)
-uv run basedpyright --project pyproject.toml && uv run ruff check && nix fmt
 ```

 ## Architecture
@@ -94,10 +91,6 @@ From .cursorrules:
 - Catch exceptions only where you can handle them meaningfully
 - Use `@final` and immutability wherever applicable

-## File Locations
-
- **Downloaded models**: `~/.exo/models/` (NOT in huggingface cache)
-
 ## Testing

 Tests use pytest-asyncio with `asyncio_mode = "auto"`. Tests are in `tests/` subdirectories alongside the code they test. The `EXO_TESTS=1` env var is set during tests.
--- a/app/EXO/EXO/ContentView.swift
+++ b/app/EXO/EXO/ContentView.swift
@@ -56,6 +56,11 @@ struct ContentView: View {
    }

    private var shouldShowLocalNetworkWarning: Bool {
+        // Show warning if local network is not working and EXO is running.
+        // The checker uses a longer timeout on first launch to allow time for
+        // the permission prompt, so this correctly handles both:
+        // 1. User denied permission on first launch
+        // 2. Permission broke after restart (macOS TCC bug)
        if case .notWorking = localNetworkChecker.status {
            return controller.status != .stopped
        }
--- a/app/EXO/EXO/Services/LocalNetworkChecker.swift
+++ b/app/EXO/EXO/Services/LocalNetworkChecker.swift
@@ -5,8 +5,8 @@ import os.log
 /// Checks if the app's local network permission is actually functional.
 ///
 /// macOS local network permission can appear enabled in System Preferences but not
-/// actually work after a restart. This service detects this by creating a UDP
-/// connection to the mDNS multicast address (224.0.0.251:5353).
+/// actually work after a restart. This service uses NWConnection to mDNS multicast
+/// to verify actual connectivity.
@MainActor
 final class LocalNetworkChecker: ObservableObject {
    enum Status: Equatable {
@@ -35,30 +35,43 @@ final class LocalNetworkChecker: ObservableObject {
    }

    private static let logger = Logger(subsystem: "io.exo.EXO", category: "LocalNetworkChecker")
+    private static let hasCompletedInitialCheckKey = "LocalNetworkChecker.hasCompletedInitialCheck"

    @Published private(set) var status: Status = .unknown
-    @Published private(set) var lastConnectionState: String = "none"

    private var connection: NWConnection?
    private var checkTask: Task<Void, Never>?

+    /// Whether we've completed at least one check (stored in UserDefaults)
+    private var hasCompletedInitialCheck: Bool {
+        get { UserDefaults.standard.bool(forKey: Self.hasCompletedInitialCheckKey) }
+        set { UserDefaults.standard.set(newValue, forKey: Self.hasCompletedInitialCheckKey) }
+    }
+
    /// Checks if local network access is working.
    func check() {
        checkTask?.cancel()
        status = .checking
-        lastConnectionState = "connecting"
+
+        // Use longer timeout on first launch to allow time for permission prompt
+        let isFirstCheck = !hasCompletedInitialCheck
+        let timeout: UInt64 = isFirstCheck ? 30_000_000_000 : 3_000_000_000

        checkTask = Task { [weak self] in
            guard let self else { return }
-            let result = await self.performCheck()
+
+            Self.logger.info("Checking local network connectivity (first check: \(isFirstCheck))")
+            let result = await self.checkConnectivity(timeout: timeout)
            self.status = result
+            self.hasCompletedInitialCheck = true
+
            Self.logger.info("Local network check complete: \(result.displayText)")
        }
    }

-    private func performCheck() async -> Status {
-        Self.logger.info("Checking local network access via UDP multicast")
-
+    /// Checks connectivity using NWConnection to mDNS multicast.
+    /// The connection attempt triggers the permission prompt if not yet shown.
+    private func checkConnectivity(timeout: UInt64) async -> Status {
        connection?.cancel()
        connection = nil

@@ -84,22 +97,7 @@ final class LocalNetworkChecker: ObservableObject {
                continuation.resume(returning: status)
            }

-            conn.stateUpdateHandler = { [weak self] state in
-                let stateStr: String
-                switch state {
-                case .setup: stateStr = "setup"
-                case .preparing: stateStr = "preparing"
-                case .ready: stateStr = "ready"
-                case .waiting(let e): stateStr = "waiting(\(e))"
-                case .failed(let e): stateStr = "failed(\(e))"
-                case .cancelled: stateStr = "cancelled"
-                @unknown default: stateStr = "unknown"
-                }
-
-                Task { @MainActor in
-                    self?.lastConnectionState = stateStr
-                }
-
+            conn.stateUpdateHandler = { state in
                switch state {
                case .ready:
                    resumeOnce(.working)
@@ -108,6 +106,7 @@ final class LocalNetworkChecker: ObservableObject {
                    if errorStr.contains("54") || errorStr.contains("ECONNRESET") {
                        resumeOnce(.notWorking(reason: "Connection blocked"))
                    }
+                // Otherwise keep waiting - might be showing permission prompt
                case .failed(let error):
                    let errorStr = "\(error)"
                    if errorStr.contains("65") || errorStr.contains("EHOSTUNREACH")
@@ -127,7 +126,7 @@ final class LocalNetworkChecker: ObservableObject {
            conn.start(queue: .main)

            Task {
-                try? await Task.sleep(nanoseconds: 3_000_000_000)
+                try? await Task.sleep(nanoseconds: timeout)
                let state = conn.state
                switch state {
                case .ready:
--- a/bench/exo_bench.py
+++ b/bench/exo_bench.py
@@ -241,6 +241,9 @@ class PromptSizer:
            ids = tokenizer.apply_chat_template(
                messages, tokenize=True, add_generation_prompt=True
            )
+            # Fix for transformers 5.x
+            if hasattr(ids, "input_ids"):
+                ids = ids.input_ids
            return int(len(ids))

        return count_fn
--- a/dashboard/src/lib/components/ChatForm.svelte
+++ b/dashboard/src/lib/components/ChatForm.svelte
@@ -60,12 +60,39 @@
 		return models;
 	});

-	// Auto-select the first available model if none is selected
+	// Track previous model IDs to detect newly added models (plain variable to avoid reactive loop)
+	let previousModelIds: Set<string> = new Set();
+
+	// Auto-select the first available model if none is selected, if current selection is stale, or if a new model is added
 	$effect(() => {
 		const models = availableModels();
-		if (models.length > 0 && !currentModel) {
-			setSelectedChatModel(models[0].id);
+		const currentModelIds = new Set(models.map(m => m.id));
+
+		if (models.length > 0) {
+			// Find newly added models (in current but not in previous)
+			const newModels = models.filter(m => !previousModelIds.has(m.id));
+
+			// If no model selected, select the first available
+			if (!currentModel) {
+				setSelectedChatModel(models[0].id);
+			}
+			// If current model is stale (no longer has a running instance), reset to first available
+			else if (!models.some(m => m.id === currentModel)) {
+				setSelectedChatModel(models[0].id);
+			}
+			// If a new model was just added, select it
+			else if (newModels.length > 0 && previousModelIds.size > 0) {
+				setSelectedChatModel(newModels[0].id);
+			}
+		} else {
+			// No instances running - clear the selected model
+			if (currentModel) {
+				setSelectedChatModel('');
+			}
 		}
+
+		// Update previous model IDs for next comparison
+		previousModelIds = currentModelIds;
 	});

 	function getInstanceModelId(instanceWrapped: unknown): string {
--- a/dashboard/src/routes/+page.svelte
+++ b/dashboard/src/routes/+page.svelte
@@ -400,10 +400,8 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
 				const errorText = await response.text();
 				console.error('Failed to launch instance:', errorText);
 			} else {
-				// Auto-select the launched model only if no model is currently selected
-				if (!selectedChatModel()) {
-					setSelectedChatModel(modelId);
-				}
+				// Always auto-select the newly launched model so the user chats to what they just launched
+				setSelectedChatModel(modelId);
 				
 				// Scroll to the bottom of instances container to show the new instance
 				// Use multiple attempts to ensure DOM has updated with the new instance
@@ -763,6 +761,10 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
 	async function deleteInstance(instanceId: string) {
 		if (!confirm(`Delete instance ${instanceId.slice(0, 8)}...?`)) return;
 		
+		// Get the model ID of the instance being deleted before we delete it
+		const deletedInstanceModelId = getInstanceModelId(instanceData[instanceId]);
+		const wasSelected = selectedChatModel() === deletedInstanceModelId;
+		
 		try {
 			const response = await fetch(`/instance/${instanceId}`, {
 				method: 'DELETE',
@@ -771,6 +773,24 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
 			
 			if (!response.ok) {
 				console.error('Failed to delete instance:', response.status);
+			} else if (wasSelected) {
+				// If we deleted the currently selected model, switch to another available model
+				// Find another instance that isn't the one we just deleted
+				const remainingInstances = Object.entries(instanceData).filter(([id]) => id !== instanceId);
+				if (remainingInstances.length > 0) {
+					// Select the last instance (most recently added, since objects preserve insertion order)
+					const [, lastInstance] = remainingInstances[remainingInstances.length - 1];
+					const newModelId = getInstanceModelId(lastInstance);
+					if (newModelId && newModelId !== 'Unknown' && newModelId !== 'Unknown Model') {
+						setSelectedChatModel(newModelId);
+					} else {
+						// Clear selection if no valid model found
+						setSelectedChatModel('');
+					}
+				} else {
+					// No more instances, clear the selection
+					setSelectedChatModel('');
+				}
 			}
 		} catch (error) {
 			console.error('Error deleting instance:', error);
--- a/2
+++ b/2
@@ -1,3 +1,5 @@
+export NIX_CONFIG := "extra-experimental-features = nix-command flakes"
+
 fmt:
    nix fmt

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,8 +6,6 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
    "aiofiles>=24.1.0",
-    "aiohttp>=3.12.14",
-    "types-aiofiles>=24.1.0.20250708",
    "pydantic>=2.11.7",
    "fastapi>=0.116.1",
    "filelock>=3.18.0",
@@ -23,6 +21,7 @@ dependencies = [
    "tiktoken>=0.12.0", # required for kimi k2 tokenizer
    "hypercorn>=0.18.0",
    "openai-harmony>=0.0.8",
+    "httpx>=0.28.1",
 ]

 [project.scripts]
--- a/src/exo/master/api.py
+++ b/src/exo/master/api.py
@@ -13,12 +13,6 @@ from hypercorn.asyncio import serve  # pyright: ignore[reportUnknownVariableType
 from hypercorn.config import Config
 from hypercorn.typing import ASGIFramework
 from loguru import logger
-from openai_harmony import (  # pyright: ignore[reportMissingTypeStubs]
-    HarmonyEncodingName,
-    Role,
-    StreamableParser,
-    load_harmony_encoding,
-)

 from exo.master.placement import place_instance as get_instance_placements
 from exo.shared.apply import apply
@@ -67,8 +61,6 @@ from exo.utils.channels import Receiver, Sender, channel
 from exo.utils.dashboard_path import find_dashboard
 from exo.utils.event_buffer import OrderedBuffer

-encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
-

 def chunk_to_response(
    chunk: TokenChunk, command_id: CommandId
@@ -381,35 +373,8 @@ class API:
            instance_id=instance_id,
        )

-    async def _process_gpt_oss(self, token_chunks: Receiver[TokenChunk]):
-        stream = StreamableParser(encoding, role=Role.ASSISTANT)
-        thinking = False
-
-        async for chunk in token_chunks:
-            stream.process(chunk.token_id)
-
-            delta = stream.last_content_delta
-            ch = stream.current_channel
-
-            if ch == "analysis" and not thinking:
-                thinking = True
-                yield chunk.model_copy(update={"text": "<think>"})
-
-            if ch != "analysis" and thinking:
-                thinking = False
-                yield chunk.model_copy(update={"text": "</think>"})
-
-            if delta:
-                yield chunk.model_copy(update={"text": delta})
-
-            if chunk.finish_reason is not None:
-                if thinking:
-                    yield chunk.model_copy(update={"text": "</think>"})
-                yield chunk
-                break
-
    async def _chat_chunk_stream(
-        self, command_id: CommandId, parse_gpt_oss: bool
+        self, command_id: CommandId
    ) -> AsyncGenerator[TokenChunk, None]:
        """Yield `TokenChunk`s for a given command until completion."""

@@ -417,16 +382,10 @@ class API:
            self._chat_completion_queues[command_id], recv = channel[TokenChunk]()

            with recv as token_chunks:
-                if parse_gpt_oss:
-                    async for chunk in self._process_gpt_oss(token_chunks):
-                        yield chunk
-                        if chunk.finish_reason is not None:
-                            break
-                else:
-                    async for chunk in token_chunks:
-                        yield chunk
-                        if chunk.finish_reason is not None:
-                            break
+                async for chunk in token_chunks:
+                    yield chunk
+                    if chunk.finish_reason is not None:
+                        break

        except anyio.get_cancelled_exc_class():
            # TODO: TaskCancelled
@@ -442,11 +401,11 @@ class API:
            del self._chat_completion_queues[command_id]

    async def _generate_chat_stream(
-        self, command_id: CommandId, parse_gpt_oss: bool
+        self, command_id: CommandId
    ) -> AsyncGenerator[str, None]:
        """Generate chat completion stream as JSON strings."""

-        async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
+        async for chunk in self._chat_chunk_stream(command_id):
            chunk_response: ChatCompletionResponse = chunk_to_response(
                chunk, command_id
            )
@@ -458,7 +417,7 @@ class API:
                yield "data: [DONE]\n\n"

    async def _collect_chat_completion(
-        self, command_id: CommandId, parse_gpt_oss: bool
+        self, command_id: CommandId
    ) -> ChatCompletionResponse:
        """Collect all token chunks for a chat completion and return a single response."""

@@ -466,7 +425,7 @@ class API:
        model: str | None = None
        finish_reason: FinishReason | None = None

-        async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
+        async for chunk in self._chat_chunk_stream(command_id):
            if model is None:
                model = chunk.model

@@ -495,7 +454,7 @@ class API:
        )

    async def _collect_chat_completion_with_stats(
-        self, command_id: CommandId, parse_gpt_oss: bool
+        self, command_id: CommandId
    ) -> BenchChatCompletionResponse:
        text_parts: list[str] = []
        model: str | None = None
@@ -503,7 +462,7 @@ class API:

        stats: GenerationStats | None = None

-        async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
+        async for chunk in self._chat_chunk_stream(command_id):
            if model is None:
                model = chunk.model

@@ -544,8 +503,6 @@ class API:
        """Handle chat completions, supporting both streaming and non-streaming responses."""
        model_meta = await resolve_model_meta(payload.model)
        payload.model = model_meta.model_id
-        parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
-        logger.info(f"{parse_gpt_oss=}")

        if not any(
            instance.shard_assignments.model_id == payload.model
@@ -562,17 +519,16 @@ class API:
        await self._send(command)
        if payload.stream:
            return StreamingResponse(
-                self._generate_chat_stream(command.command_id, parse_gpt_oss),
+                self._generate_chat_stream(command.command_id),
                media_type="text/event-stream",
            )

-        return await self._collect_chat_completion(command.command_id, parse_gpt_oss)
+        return await self._collect_chat_completion(command.command_id)

    async def bench_chat_completions(
        self, payload: BenchChatCompletionTaskParams
    ) -> BenchChatCompletionResponse:
        model_meta = await resolve_model_meta(payload.model)
-        parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
        payload.model = model_meta.model_id

        if not any(
@@ -589,10 +545,7 @@ class API:
        command = ChatCompletion(request_params=payload)
        await self._send(command)

-        response = await self._collect_chat_completion_with_stats(
-            command.command_id,
-            parse_gpt_oss,
-        )
+        response = await self._collect_chat_completion_with_stats(command.command_id)
        return response

    def _calculate_total_available_memory(self) -> Memory:
--- a/src/exo/shared/logging.py
+++ b/src/exo/shared/logging.py
@@ -29,6 +29,11 @@ class _InterceptHandler(logging.Handler):

 def logger_setup(log_file: Path | None, verbosity: int = 0):
    """Set up logging for this process - formatting, file handles, verbosity and output"""
+
+    logging.getLogger("exo_pyo3_bindings").setLevel(logging.WARNING)
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+    logging.getLogger("httpcore").setLevel(logging.WARNING)
+
    logger.remove()

    # replace all stdlib loggers with _InterceptHandlers that log to loguru
--- a/src/exo/shared/models/model_cards.py
+++ b/src/exo/shared/models/model_cards.py
@@ -425,15 +425,15 @@ MODEL_CARDS: dict[str, ModelCard] = {
            supports_tensor=True,
        ),
    ),
-    "gpt-oss-20b-4bit": ModelCard(
-        short_id="gpt-oss-20b-4bit",
-        model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
-        name="GPT-OSS 20B (MXFP4-Q4, MLX)",
-        description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this MLX variant uses MXFP4 4-bit quantization.""",
+    "gpt-oss-20b-MXFP4-Q8": ModelCard(
+        short_id="gpt-oss-20b-MXFP4-Q8",
+        model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
+        name="GPT-OSS 20B (MXFP4-Q8, MLX)",
+        description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this variant is a 4-bit MLX conversion for Apple Silicon.""",
        tags=[],
        metadata=ModelMetadata(
-            model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
-            pretty_name="GPT-OSS 20B (MXFP4-Q4, MLX)",
+            model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
+            pretty_name="GPT-OSS 20B (MXFP4-Q8, MLX)",
            storage_size=Memory.from_kb(11_744_051),
            n_layers=24,
            hidden_size=2880,
--- a/src/exo/worker/download/download_utils.py
+++ b/src/exo/worker/download/download_utils.py
@@ -7,13 +7,13 @@ import time
 import traceback
 from datetime import timedelta
 from pathlib import Path
-from typing import Callable, Literal
+from typing import Callable, Literal, cast
 from urllib.parse import urljoin

 import aiofiles
 import aiofiles.os as aios
-import aiohttp
 import certifi
+import httpx
 from loguru import logger
 from pydantic import (
    BaseModel,
@@ -207,23 +207,22 @@ async def _fetch_file_list(
    headers = await get_download_headers()
    async with (
        create_http_session(timeout_profile="short") as session,
-        session.get(url, headers=headers) as response,
    ):
-        if response.status == 200:
-            data_json = await response.text()
-            data = TypeAdapter(list[FileListEntry]).validate_json(data_json)
-            files: list[FileListEntry] = []
-            for item in data:
-                if item.type == "file":
-                    files.append(FileListEntry.model_validate(item))
-                elif item.type == "directory" and recursive:
-                    subfiles = await _fetch_file_list(
-                        repo_id, revision, item.path, recursive
-                    )
-                    files.extend(subfiles)
-            return files
-        else:
-            raise Exception(f"Failed to fetch file list: {response.status}")
+        response = await session.get(url, headers=headers)
+        if response.status_code != 200:
+            raise Exception(f"Failed to fetch file list: {response.status_code}")
+
+        data = TypeAdapter(list[FileListEntry]).validate_json(response.text)
+        files: list[FileListEntry] = []
+        for item in data:
+            if item.type == "file":
+                files.append(FileListEntry.model_validate(item))
+            elif item.type == "directory" and recursive:
+                subfiles = await _fetch_file_list(
+                    repo_id, revision, item.path, recursive
+                )
+                files.extend(subfiles)
+        return files


 async def get_download_headers() -> dict[str, str]:
@@ -231,31 +230,25 @@ async def get_download_headers() -> dict[str, str]:


 def create_http_session(
-    auto_decompress: bool = False,
    timeout_profile: Literal["short", "long"] = "long",
-) -> aiohttp.ClientSession:
+) -> httpx.AsyncClient:
    if timeout_profile == "short":
        total_timeout = 30
        connect_timeout = 10
-        sock_read_timeout = 30
-        sock_connect_timeout = 10
+        read_timeout = 30
    else:
        total_timeout = 1800
        connect_timeout = 60
-        sock_read_timeout = 1800
-        sock_connect_timeout = 60
+        read_timeout = 1800

    ssl_context = ssl.create_default_context(cafile=certifi.where())
-    connector = aiohttp.TCPConnector(ssl=ssl_context)

-    return aiohttp.ClientSession(
-        auto_decompress=auto_decompress,
-        connector=connector,
-        timeout=aiohttp.ClientTimeout(
-            total=total_timeout,
+    return httpx.AsyncClient(
+        verify=ssl_context,
+        timeout=httpx.Timeout(
            connect=connect_timeout,
-            sock_read=sock_read_timeout,
-            sock_connect=sock_connect_timeout,
+            read=read_timeout,
+            write=total_timeout,
        ),
    )

@@ -282,23 +275,25 @@ async def file_meta(
    headers = await get_download_headers()
    async with (
        create_http_session(timeout_profile="short") as session,
-        session.head(url, headers=headers) as r,
    ):
-        if r.status == 307:
+        r = await session.head(url, headers=headers)
+        if r.status_code == 307:
            # On redirect, only trust Hugging Face's x-linked-* headers.
-            x_linked_size = r.headers.get("x-linked-size")
-            x_linked_etag = r.headers.get("x-linked-etag")
+            x_linked_size = cast(str | None, r.headers.get("x-linked-size"))
+            x_linked_etag = cast(str | None, r.headers.get("x-linked-etag"))
            if x_linked_size and x_linked_etag:
                content_length = int(x_linked_size)
                etag = trim_etag(x_linked_etag)
                return content_length, etag
            # Otherwise, follow the redirect to get authoritative size/hash
-            redirected_location = r.headers.get("location")
+            redirected_location = cast(str | None, r.headers.get("location"))
            return await file_meta(repo_id, revision, path, redirected_location)
-        content_length = int(
-            r.headers.get("x-linked-size") or r.headers.get("content-length") or 0
+        content_length = cast(
+            str | None,
+            r.headers.get("x-linked-size") or r.headers.get("content-length"),
        )
-        etag = r.headers.get("x-linked-etag") or r.headers.get("etag")
+        content_length = 0 if content_length is None else int(content_length)
+        etag = cast(str | None, r.headers.get("x-linked-etag") or r.headers.get("etag"))
        assert content_length > 0, f"No content length for {url}"
        assert etag is not None, f"No remote hash for {url}"
        etag = trim_etag(etag)
@@ -357,17 +352,17 @@ async def _download_file(
        n_read = resume_byte_pos or 0
        async with (
            create_http_session(timeout_profile="long") as session,
-            session.get(url, headers=headers) as r,
        ):
-            if r.status == 404:
+            r = await session.get(url, headers=headers)
+            if r.status_code == 404:
                raise FileNotFoundError(f"File not found: {url}")
-            assert r.status in [200, 206], (
-                f"Failed to download {path} from {url}: {r.status}"
+            assert r.status_code in [200, 206], (
+                f"Failed to download {path} from {url}: {r.status_code}"
            )
            async with aiofiles.open(
                partial_path, "ab" if resume_byte_pos else "wb"
            ) as f:
-                while chunk := await r.content.read(8 * 1024 * 1024):
+                async for chunk in r.aiter_bytes(8 * 1024 * 1024):
                    n_read = n_read + (await f.write(chunk))
                    on_progress(n_read, length, False)

--- a/src/exo/worker/engines/mlx/cache.py
+++ b/src/exo/worker/engines/mlx/cache.py
@@ -1,189 +1,104 @@
-"""KV prefix cache for reusing computed prompt prefixes across requests."""
-
-from collections import OrderedDict
+# type: ignore
+# TODO: Fix this file, including types!
 from copy import deepcopy
-from typing import Any, Callable, Protocol
+from typing import Callable

 import mlx.core as mx
-import numpy as np
-from mlx_lm.models.cache import trim_prompt_cache
+from mlx_lm import stream_generate
+from mlx_lm.models.cache import _BaseCache, trim_prompt_cache
+from mlx_lm.tokenizer_utils import TokenizerWrapper

 from exo.worker.engines.mlx import Model
+from exo.worker.engines.mlx.constants import KEEP_KV_SIZE, KV_BITS, KV_GROUP_SIZE
 from exo.worker.engines.mlx.utils_mlx import make_kv_cache
-from exo.worker.runner.bootstrap import logger
-
-# Type alias for KV cache - the actual type is _BaseCache but it's private
-KVCacheType = Any
-
-
-class TokenizerProtocol(Protocol):
-    """Protocol for tokenizers used with KVPrefixCache."""
-
-    bos_token: str | None
-
-    def encode(self, text: str, **kwargs: bool) -> list[int]: ...


 class KVPrefixCache:
-    """Cache for common prompt prefixes to avoid re-processing.
+    def __init__(self):
+        # Only one prefix cache per runner.
+        self.prompts: list[mx.array] = []  # mx array of tokens (ints)
+        self.caches: list[list[_BaseCache]] = []

-    Uses LRU eviction when capacity is reached. Stores tokenized prompts
-    and their corresponding KV caches for reuse.
-    """
-
-    def __init__(self, max_size: int = 10):
-        """Initialize prefix cache.
-
-        Args:
-            max_size: Maximum number of cached entries before LRU eviction.
-        """
-        self.max_size = max_size
-        # OrderedDict maintains insertion order for LRU - most recent at end
-        # Key: token bytes, Value: (tokens as mx.array, KV cache)
-        self._cache: OrderedDict[bytes, tuple[mx.array, list[KVCacheType]]] = (
-            OrderedDict()
-        )
-
-    def _token_key(self, tokens: mx.array) -> bytes:
-        """Create hashable key from token array."""
-        return np.array(tokens.tolist(), dtype=np.int32).tobytes()
-
-    def _encode_prompt(self, tokenizer: TokenizerProtocol, prompt: str) -> mx.array:
-        """Tokenize prompt string to mx.array."""
-        add_special_tokens = tokenizer.bos_token is None or not prompt.startswith(
-            tokenizer.bos_token
-        )
-        tokenized = tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
-        return mx.array(tokenized)
-
-    def _find_best_prefix(self, tokens: mx.array) -> tuple[bytes | None, int]:
-        """Find cached entry with longest matching prefix.
-
-        Returns:
-            Tuple of (cache_key, prefix_length). cache_key is None if no match found.
-        """
-        best_key: bytes | None = None
-        best_length = 0
-        target_len = tokens.shape[0]
-
-        for key, (cached_tokens, _cache) in self._cache.items():
-            prefix_len = get_prefix_length(tokens, cached_tokens)
-
-            # Exact match - return immediately
-            if prefix_len == target_len and prefix_len == cached_tokens.shape[0]:
-                return key, prefix_len
-
-            # Better prefix match
-            if prefix_len > best_length:
-                best_key = key
-                best_length = prefix_len
-
-        return best_key, best_length
+    def add_kv_cache(
+        self, tokenizer: TokenizerWrapper, prompt: str, cache: list[_BaseCache]
+    ):
+        tokenized_prompt = self.encode_prompt(tokenizer, prompt)
+        self.prompts.append(tokenized_prompt)
+        self.caches.append(deepcopy(cache))

    def get_kv_cache(
        self,
        model: Model,
-        tokenizer: TokenizerProtocol,
+        tokenizer: TokenizerWrapper,
        sampler: Callable[[mx.array], mx.array],
        prompt: str,
-    ) -> tuple[list[KVCacheType], int]:
-        """Get KV cache for prompt, reusing prefix if available.
+    ) -> list[_BaseCache]:
+        tokenized_prompt = self.encode_prompt(tokenizer, prompt)
+        max_length = len(tokenized_prompt)

-        Args:
-            model: The model to create cache for.
-            tokenizer: Tokenizer for encoding prompt.
-            sampler: Sampler function for prefill.
-            prompt: The prompt string to process.
+        best_snapshot_index, best_snapshot_length = None, 0

-        Returns:
-            Tuple of (kv_cache, tokens_reused). tokens_reused indicates how many
-            tokens were reused from cache (0 if no cache hit).
-        """
-        tokens = self._encode_prompt(tokenizer, prompt)
-        target_len = int(tokens.shape[0])
+        for i, cached_prompt in enumerate(self.prompts):
+            length = _get_prefix_length(tokenized_prompt, cached_prompt)

-        # Find best prefix match
-        best_key, prefix_len = self._find_best_prefix(tokens)
+            if length == max_length:
+                return self.caches[i]

-        if best_key is not None and prefix_len > 0:
-            cached_tokens, cached_kv = self._cache[best_key]
-            cached_len = int(cached_tokens.shape[0])
+            if length > best_snapshot_length:
+                best_snapshot_index, best_snapshot_length = i, length

-            # Move to end (most recently used)
-            self._cache.move_to_end(best_key)
+        if best_snapshot_index is not None:
+            prompt_cache = deepcopy(self.caches[best_snapshot_index])
+            trim_prompt_cache(prompt_cache, max_length - best_snapshot_length)
+            tokenized_prompt = tokenized_prompt[best_snapshot_index:]

-            if prefix_len == target_len and prefix_len == cached_len:
-                # Exact match - return deepcopy directly
-                logger.debug(f"Prefix cache: exact match, reusing {prefix_len} tokens")
-                return deepcopy(cached_kv), prefix_len
-
-            # Partial match - need to trim and/or extend
-            prompt_cache = deepcopy(cached_kv)
-
-            if cached_len > prefix_len:
-                # Cached prompt is longer - trim to prefix length
-                num_to_trim = cached_len - prefix_len
-                trim_prompt_cache(prompt_cache, num_to_trim)
-                logger.debug(
-                    f"Prefix cache: trimmed {num_to_trim} tokens from cached entry"
-                )
-
-            # Note: We don't prefill remaining tokens here - stream_generate will do it
-            # when processing the full prompt with this partial cache
-            return prompt_cache, prefix_len
-
-        # No cache hit - return fresh cache (stream_generate will prefill)
-        logger.debug(
-            f"Prefix cache: miss, will prefill {target_len} tokens during generation"
-        )
-        prompt_cache = make_kv_cache(model=model)
-
-        return prompt_cache, 0
-
-    def put(
-        self, tokenizer: TokenizerProtocol, prompt: str, cache: list[KVCacheType]
-    ) -> None:
-        """Store KV cache for prompt after generation completes.
-
-        Args:
-            tokenizer: Tokenizer for encoding prompt.
-            prompt: The prompt string that was processed.
-            cache: The KV cache to store.
-        """
-        tokens = self._encode_prompt(tokenizer, prompt)
-        key = self._token_key(tokens)
-
-        # If already in cache, just move to end
-        if key in self._cache:
-            self._cache.move_to_end(key)
-            return
-
-        # Evict LRU entry if at capacity
-        if len(self._cache) >= self.max_size:
-            evicted_key, _ = self._cache.popitem(last=False)
-            logger.debug(
-                f"Prefix cache: evicted LRU entry ({len(evicted_key)} token bytes)"
+        else:
+            prompt_cache = make_kv_cache(
+                model,
+                # max_kv_size=MAX_KV_SIZE,
+                # keep=KEEP_KV_SIZE
            )

-        # Store deepcopy
-        self._cache[key] = (tokens, deepcopy(cache))
-        logger.debug(f"Prefix cache: stored entry with {tokens.shape[0]} tokens")
+        prefill(model, tokenizer, sampler, tokenized_prompt, prompt_cache)

-    def clear(self) -> None:
-        """Clear all cached entries."""
-        self._cache.clear()
+        return prompt_cache

-    def __len__(self) -> int:
-        """Return number of cached entries."""
-        return len(self._cache)
+    def encode_prompt(self, tokenizer: TokenizerWrapper, prompt: str) -> mx.array:
+        add_special_tokens = tokenizer.bos_token is None or not prompt.startswith(
+            tokenizer.bos_token
+        )
+        tokenized_prompt = tokenizer.encode(
+            prompt, add_special_tokens=add_special_tokens
+        )
+        return mx.array(tokenized_prompt)


-def get_prefix_length(prompt: mx.array, cached_prompt: mx.array) -> int:
-    """Calculate length of matching prefix between two token arrays."""
-    n = min(int(prompt.shape[0]), int(cached_prompt.shape[0]))
+def _get_prefix_length(prompt: mx.array, cached_prompt: mx.array) -> int:
+    n = min(int(prompt.shape[0]), int(cached_prompt.shape[0]), KEEP_KV_SIZE)
    if n == 0:
        return 0

-    equal = mx.equal(prompt[:n], cached_prompt[:n]).astype(mx.int32)
+    equal = (prompt[:n] == cached_prompt[:n]).astype(mx.int32)
    prefix_mask = mx.cumprod(equal)  # stays 1 until first mismatch, then 0 forever
    return int(mx.sum(prefix_mask).item())
+
+
+def prefill(
+    model: Model,
+    tokenizer: TokenizerWrapper,
+    sampler: Callable[[mx.array], mx.array],
+    prompt: mx.array,
+    cache: list[_BaseCache],
+) -> None:
+    for _ in stream_generate(
+        model=model,
+        tokenizer=tokenizer,
+        prompt=prompt,
+        max_tokens=0,
+        sampler=sampler,
+        prompt_cache=cache,
+        prefill_step_size=2048,
+        kv_group_size=KV_GROUP_SIZE,
+        kv_bits=KV_BITS,
+    ):
+        pass
--- a/src/exo/worker/engines/mlx/generator/generate.py
+++ b/src/exo/worker/engines/mlx/generator/generate.py
@@ -6,6 +6,7 @@ from mlx_lm.models.cache import KVCache
 from mlx_lm.sample_utils import make_sampler
 from mlx_lm.tokenizer_utils import TokenizerWrapper

+# from exo.engines.mlx.cache import KVPrefixCache
 from exo.shared.types.api import (
    BenchChatCompletionTaskParams,
    ChatCompletionMessage,
@@ -18,7 +19,6 @@ from exo.shared.types.worker.runner_response import (
    GenerationResponse,
 )
 from exo.worker.engines.mlx import Model
-from exo.worker.engines.mlx.cache import KVPrefixCache
 from exo.worker.engines.mlx.constants import KV_BITS, KV_GROUP_SIZE, MAX_TOKENS
 from exo.worker.engines.mlx.utils_mlx import (
    apply_chat_template,
@@ -119,7 +119,6 @@ def mlx_generate(
    model: Model,
    tokenizer: TokenizerWrapper,
    task: ChatCompletionTaskParams,
-    prefix_cache: KVPrefixCache | None = None,
 ) -> Generator[GenerationResponse]:
    # Ensure that generation stats only contains peak memory for this generation
    mx.reset_peak_memory()
@@ -136,6 +135,8 @@ def mlx_generate(
        chat_task_data=task,
    )

+    caches = make_kv_cache(model=model)
+
    logits_processors: list[Callable[[mx.array, mx.array], mx.array]] = []
    if is_bench:
        # Only sample length eos tokens
@@ -147,20 +148,6 @@ def mlx_generate(
        top_p=task.top_p if task.top_p is not None else 1.0,
    )

-    # Get KV cache - either from prefix cache or fresh
-    tokens_reused = 0
-    if prefix_cache is not None:
-        caches, tokens_reused = prefix_cache.get_kv_cache(
-            model=model,
-            tokenizer=tokenizer,
-            sampler=sampler,
-            prompt=prompt,
-        )
-        if tokens_reused > 0:
-            logger.info(f"Prefix cache hit: reused {tokens_reused} tokens")
-    else:
-        caches = make_kv_cache(model=model)
-
    max_tokens = task.max_tokens or MAX_TOKENS
    for out in stream_generate(
        model=model,
@@ -202,9 +189,6 @@ def mlx_generate(
        )

        if out.finish_reason is not None:
-            # Store in prefix cache for future reuse
-            if prefix_cache is not None:
-                prefix_cache.put(tokenizer=tokenizer, prompt=prompt, cache=caches)
            break

        # TODO: Do we want an mx_barrier?
--- a/src/exo/worker/engines/mlx/utils_mlx.py
+++ b/src/exo/worker/engines/mlx/utils_mlx.py
@@ -20,6 +20,7 @@ except ImportError:

 from mlx_lm.models.cache import KVCache, QuantizedKVCache, RotatingKVCache
 from mlx_lm.models.deepseek_v3 import DeepseekV3Model
+from mlx_lm.models.gpt_oss import Model as GptOssModel
 from mlx_lm.tokenizer_utils import TokenizerWrapper

 from exo.worker.engines.mlx.constants import (
@@ -365,6 +366,8 @@ def apply_chat_template(
        tools=chat_task_data.tools,
    )

+    logger.info(prompt)
+
    return prompt


@@ -396,6 +399,11 @@ def make_kv_cache(
 ) -> list[KVCache | RotatingKVCache | QuantizedKVCache]:
    assert hasattr(model, "layers")

+    # TODO: Do this for all models
+    if hasattr(model, "make_cache") and isinstance(model, GptOssModel):
+        logger.info("Using MLX LM's make cache")
+        return model.make_cache()  # type: ignore
+
    if max_kv_size is None:
        if KV_CACHE_BITS is None:
            logger.info("Using default KV cache")
--- a/src/exo/worker/runner/runner.py
+++ b/src/exo/worker/runner/runner.py
@@ -1,6 +1,15 @@
 import time
+from collections.abc import Generator
+from functools import cache

 import mlx.core as mx
+from mlx_lm.models.gpt_oss import Model as GptOssModel
+from openai_harmony import (  # pyright: ignore[reportMissingTypeStubs]
+    HarmonyEncodingName,
+    Role,
+    StreamableParser,
+    load_harmony_encoding,
+)

 from exo.shared.types.api import ChatCompletionMessageText
 from exo.shared.types.chunks import TokenChunk
@@ -39,7 +48,6 @@ from exo.shared.types.worker.runners import (
    RunnerWarmingUp,
 )
 from exo.utils.channels import MpReceiver, MpSender
-from exo.worker.engines.mlx.cache import KVPrefixCache
 from exo.worker.engines.mlx.generator.generate import mlx_generate, warmup_inference
 from exo.worker.engines.mlx.utils_mlx import (
    initialize_mlx,
@@ -70,7 +78,6 @@ def main(
    model = None
    tokenizer = None
    group = None
-    prefix_cache: KVPrefixCache | None = None

    current_status: RunnerStatus = RunnerIdle()
    logger.info("runner created")
@@ -112,8 +119,6 @@ def main(
                    )

                    model, tokenizer = load_mlx_items(bound_instance, group)
-                    prefix_cache = KVPrefixCache(max_size=10)
-                    logger.info("prefix cache initialized")

                    current_status = RunnerLoaded()
                    logger.info("runner loaded")
@@ -157,12 +162,19 @@ def main(
                    _check_for_debug_prompts(task_params.messages[0].content)

                    # Generate responses using the actual MLX generation
-                    for response in mlx_generate(
+                    mlx_generator = mlx_generate(
                        model=model,
                        tokenizer=tokenizer,
                        task=task_params,
-                        prefix_cache=prefix_cache,
-                    ):
+                    )
+
+                    # GPT-OSS specific parsing to match other model formats.
+                    if isinstance(model, GptOssModel):
+                        mlx_generator = parse_gpt_oss(mlx_generator)
+
+                    # TODO: Add tool call parser here
+
+                    for response in mlx_generator:
                        match response:
                            case GenerationResponse():
                                if shard_metadata.device_rank == 0:
@@ -212,6 +224,43 @@ def main(
                break


+@cache
+def get_gpt_oss_encoding():
+    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+    return encoding
+
+
+def parse_gpt_oss(
+    responses: Generator[GenerationResponse],
+) -> Generator[GenerationResponse]:
+    encoding = get_gpt_oss_encoding()
+    stream = StreamableParser(encoding, role=Role.ASSISTANT)
+    thinking = False
+
+    for response in responses:
+        stream.process(response.token)
+
+        delta = stream.last_content_delta
+        ch = stream.current_channel
+
+        if ch == "analysis" and not thinking:
+            thinking = True
+            yield response.model_copy(update={"text": "<think>"})
+
+        if ch != "analysis" and thinking:
+            thinking = False
+            yield response.model_copy(update={"text": "</think>"})
+
+        if delta:
+            yield response.model_copy(update={"text": delta})
+
+        if response.finish_reason is not None:
+            if thinking:
+                yield response.model_copy(update={"text": "</think>"})
+            yield response
+            break
+
+
 EXO_RUNNER_MUST_FAIL = "EXO RUNNER MUST FAIL"
 EXO_RUNNER_MUST_OOM = "EXO RUNNER MUST OOM"
 EXO_RUNNER_MUST_TIMEOUT = "EXO RUNNER MUST TIMEOUT"
--- a/src/exo/worker/tests/unittests/test_mlx/test_prefix_cache.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_prefix_cache.py
@@ -1,141 +0,0 @@
-"""Tests for KVPrefixCache."""
-
-import mlx.core as mx
-import pytest
-
-from exo.worker.engines.mlx.cache import (
-    KVCacheType,
-    KVPrefixCache,
-    TokenizerProtocol,
-    get_prefix_length,
-)
-
-
-class MockTokenizer(TokenizerProtocol):
-    """Mock tokenizer that converts string to list of char codes."""
-
-    bos_token: str | None = None
-
-    def encode(self, text: str, **kwargs: bool) -> list[int]:
-        """Encode text to list of character codes."""
-        del kwargs  # unused
-        return [ord(c) for c in text]
-
-
-class TestGetPrefixLength:
-    """Tests for the core prefix matching algorithm."""
-
-    def test_identical_arrays(self) -> None:
-        a = mx.array([1, 2, 3, 4, 5])
-        b = mx.array([1, 2, 3, 4, 5])
-        assert get_prefix_length(a, b) == 5
-
-    def test_partial_match(self) -> None:
-        a = mx.array([1, 2, 3, 4, 5])
-        b = mx.array([1, 2, 3, 9, 9])
-        assert get_prefix_length(a, b) == 3
-
-    def test_no_match(self) -> None:
-        a = mx.array([1, 2, 3])
-        b = mx.array([9, 9, 9])
-        assert get_prefix_length(a, b) == 0
-
-    def test_different_lengths(self) -> None:
-        short = mx.array([1, 2, 3])
-        long = mx.array([1, 2, 3, 4, 5])
-        # Should return length of shorter when they match
-        assert get_prefix_length(short, long) == 3
-        assert get_prefix_length(long, short) == 3
-
-    def test_empty_array(self) -> None:
-        empty: mx.array = mx.array([])
-        tokens = mx.array([1, 2, 3])
-        assert get_prefix_length(empty, tokens) == 0
-        assert get_prefix_length(tokens, empty) == 0
-
-
-class TestKVPrefixCache:
-    """Tests for the KV prefix cache."""
-
-    @pytest.fixture
-    def tokenizer(self) -> MockTokenizer:
-        """Mock tokenizer that converts string to list of char codes."""
-        return MockTokenizer()
-
-    @pytest.fixture
-    def fake_kv(self) -> list[KVCacheType]:
-        """Fake KV cache for testing."""
-        return [object()]
-
-    def test_put_stores_entry(
-        self, tokenizer: MockTokenizer, fake_kv: list[KVCacheType]
-    ) -> None:
-        cache = KVPrefixCache(max_size=10)
-
-        cache.put(tokenizer, "hello", fake_kv)
-
-        assert len(cache) == 1
-
-    def test_put_same_prompt_twice_does_not_duplicate(
-        self, tokenizer: MockTokenizer, fake_kv: list[KVCacheType]
-    ) -> None:
-        cache = KVPrefixCache(max_size=10)
-
-        cache.put(tokenizer, "hello", fake_kv)
-        cache.put(tokenizer, "hello", fake_kv)
-
-        assert len(cache) == 1
-
-    def test_lru_eviction(
-        self, tokenizer: MockTokenizer, fake_kv: list[KVCacheType]
-    ) -> None:
-        cache = KVPrefixCache(max_size=2)
-
-        # Fill cache
-        cache.put(tokenizer, "first", fake_kv)
-        cache.put(tokenizer, "second", fake_kv)
-        assert len(cache) == 2
-
-        # Add third - should evict "first" (oldest)
-        cache.put(tokenizer, "third", fake_kv)
-        assert len(cache) == 2
-
-        # Add "first" again - if it was evicted, cache size stays 2
-        # If it wasn't evicted, this would be a no-op
-        cache.put(tokenizer, "first", fake_kv)
-        # Now add fourth - if "first" was re-added, size is still 2
-        cache.put(tokenizer, "fourth", fake_kv)
-        assert len(cache) == 2
-
-    def test_lru_access_refreshes_entry(
-        self, tokenizer: MockTokenizer, fake_kv: list[KVCacheType]
-    ) -> None:
-        cache = KVPrefixCache(max_size=2)
-
-        # Add two entries
-        cache.put(tokenizer, "first", fake_kv)
-        cache.put(tokenizer, "second", fake_kv)
-
-        # Access "first" again (moves to end of LRU)
-        cache.put(tokenizer, "first", fake_kv)
-
-        # Add third - should evict "second" now (oldest)
-        cache.put(tokenizer, "third", fake_kv)
-
-        # Add "second" again - this will add it as new entry
-        cache.put(tokenizer, "second", fake_kv)
-        # Now "first" is oldest, adding fourth should evict it
-        cache.put(tokenizer, "fourth", fake_kv)
-
-        # Cache should have "second" and "fourth", not "first"
-        assert len(cache) == 2
-
-    def test_clear(self, tokenizer: MockTokenizer, fake_kv: list[KVCacheType]) -> None:
-        cache = KVPrefixCache()
-        cache.put(tokenizer, "hello", fake_kv)
-        cache.put(tokenizer, "world", fake_kv)
-        assert len(cache) == 2
-
-        cache.clear()
-
-        assert len(cache) == 0
--- a/src/exo/worker/utils/net_profile.py
+++ b/src/exo/worker/utils/net_profile.py
@@ -1,60 +1,63 @@
-import http.client
-import time
-
-from anyio import create_task_group, to_thread
+import anyio
+import httpx
+from anyio import create_task_group
 from loguru import logger

 from exo.shared.topology import Topology
 from exo.shared.types.common import NodeId

-BAD_STATUSLINE_ATTEMPTS = 3
+REACHABILITY_ATTEMPTS = 3


 async def check_reachability(
    target_ip: str,
    expected_node_id: NodeId,
-    self_node_id: NodeId,
    out: dict[NodeId, set[str]],
+    client: httpx.AsyncClient,
 ) -> None:
    """Check if a node is reachable at the given IP and verify its identity."""
+    if ":" in target_ip:
+        # TODO: use real IpAddress types
+        target_ip = f"[{target_ip}]"
+    url = f"http://{target_ip}:52415/node_id"

-    # TODO: use an async http client
-    def _fetch_remote_node_id(*, attempt: int = 1) -> NodeId | None:
-        connection = http.client.HTTPConnection(target_ip, 52415, timeout=3)
+    remote_node_id = None
+
+    last_error = None
+
+    for _ in range(REACHABILITY_ATTEMPTS):
        try:
-            connection.request("GET", "/node_id")
-            response = connection.getresponse()
-            if response.status != 200:
-                return None
+            r = await client.get(url)
+            if r.status_code != 200:
+                await anyio.sleep(1)
+                continue

-            body = response.read().decode("utf-8").strip()
+            body = r.text.strip().strip('"')
+            if not body:
+                await anyio.sleep(1)
+                continue

-            # Strip quotes if present (JSON string response)
-            if body.startswith('"') and body.endswith('"') and len(body) >= 2:
-                body = body[1:-1]
+            remote_node_id = NodeId(body)
+            break

-            return NodeId(body) or None
-        except OSError:
-            return None
-        except http.client.BadStatusLine:
-            if attempt >= BAD_STATUSLINE_ATTEMPTS:
-                logger.warning(
-                    f"BadStatusLine from {target_ip}, after {attempt} attempts, assuming connection to {expected_node_id} has dropped"
-                )
-                return None
-            time.sleep(1)
-            return _fetch_remote_node_id(attempt=attempt + 1)
-        except http.client.HTTPException as e:
-            logger.warning(f"HTTPException from {target_ip}: {type(e).__name__}: {e}")
-            return None
-        finally:
-            connection.close()
+        except (
+            httpx.ConnectError,
+            httpx.ConnectTimeout,
+            httpx.ReadTimeout,
+            httpx.RemoteProtocolError,
+        ) as e:
+            last_error = e
+            await anyio.sleep(1)

-    remote_node_id = await to_thread.run_sync(_fetch_remote_node_id)
-    if remote_node_id is None:
-        return
-
-    if remote_node_id == self_node_id:
+    else:
+        if last_error is not None:
+            logger.warning(
+                f"connect error {type(last_error).__name__} from {target_ip} after {REACHABILITY_ATTEMPTS} attempts; treating as down"
+            )
+        else:
+            logger.warning(
+                f"malformed response from {target_ip} after {REACHABILITY_ATTEMPTS} attempts; treating as down"
+            )
        return

    if remote_node_id != expected_node_id:
@@ -74,18 +77,33 @@ async def check_reachable(
    topology: Topology, self_node_id: NodeId
 ) -> dict[NodeId, set[str]]:
    """Check which nodes are reachable and return their IPs."""
+
    reachable: dict[NodeId, set[str]] = {}
-    async with create_task_group() as tg:
+
+    # these are intentionally httpx's defaults so we can tune them later
+    timeout = httpx.Timeout(timeout=5.0)
+    limits = httpx.Limits(
+        max_connections=100,
+        max_keepalive_connections=20,
+        keepalive_expiry=5,
+    )
+
+    async with (
+        httpx.AsyncClient(timeout=timeout, limits=limits) as client,
+        create_task_group() as tg,
+    ):
        for node in topology.list_nodes():
            if not node.node_profile:
                continue
+            if node.node_id == self_node_id:
+                continue
            for iface in node.node_profile.network_interfaces:
                tg.start_soon(
                    check_reachability,
                    iface.ip_address,
                    node.node_id,
-                    self_node_id,
                    reachable,
+                    client,
                )

    return reachable
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
Evan	1f28b22bf7	aaa	2026-01-16 14:20:34 +00:00
Evan Quiney	39ee2bf7bd	switch from synchronous threaded pinging to an async implementation (#1170 ) still seeing churn in our networking - lets properly rate limit it ## changes added an httpx client with max connections with a persistent AsyncClient ## testing deployed on cluster, discovery VASTLY more stable (the only deleted edges were those discovered by mdns)	2026-01-16 13:20:03 +00:00
Sami Khan	991adfbd6f	fix local network warning (#1136 ) ## Motivation Local network warning banner was showing on fresh install even though mDNS was working. The check would fail before the user had a chance to grant permission via the macOS prompt. ## Changes - Added `hasWorkedBefore` flag persisted in UserDefaults - Only show warning if permission previously worked but now doesn't ## Why It Works On fresh install, the check may fail (no permission yet), but `hasWorkedBefore` is false so no warning shows. Once the user grants permission and a check succeeds, we record it. Future failures (zombie permission after restart) will show the warning since `hasWorkedBefore` is now true. ## Test Plan ### Manual Testing Run locally ### Automated Testing N/A	2026-01-16 13:10:50 +00:00
rltakashige	4b3de6b984	Fix exo bench for transformers 5.x (#1168 ) ## Motivation Prompt Sizer was broken as transformers 5.x tokenizers create BatchEncodings which are essentially a dictionary of {input_ids: []} instead of the list of input ids. ## Test Plan ### Manual Testing Tested that exo bench runs as expected. ### Automated Testing <!-- Describe changes to automated tests, or how existing tests cover this change --> <!-- - -->	2026-01-16 12:39:22 +00:00
Evan	c8de3b90ea	quiet rust logs rust logs were too verbose - now only warnings propagate to python entirely happy not to merge this and to clean up rust logging instead, but this felt saner right now	2026-01-16 12:34:28 +00:00
Sami Khan	6e6567a802	resolve issue #1070 (#1076 ) ## Motivation https://github.com/exo-explore/exo/issues/1070 ## Changes Added check in ChatForm.svelte to reset selectedChatModel when it no longer matches any running instance. ## Why It Works The $effect now detects when the selected model is stale (not in availableModels()) and resets to the first available model. ## Test Plan ### Manual Testing 1. Create instance of Model A → Delete it → Create instance of Model B → Chat 2. Verify request goes to Model B (not Model A) --------- Co-authored-by: Alex Cheema <41707476+AlexCheema@users.noreply.github.com>	2026-01-15 20:00:41 +00:00
rltakashige	a735dad667	Parse GPT OSS in runner (#1160 ) ## Motivation Simplification of API + moving model specific code to the runner <!-- Why is this change needed? What problem does it solve? --> <!-- If it fixes an open issue, please link to the issue here --> ## Test Plan ### Manual Testing Tested that GPT OSS outputs are parsed correctly on the dashboard. ### Automated Testing <!-- Describe changes to automated tests, or how existing tests cover this change --> <!-- - -->	2026-01-15 19:53:55 +00:00
rltakashige	aaf4e36bc3	FIX GPT OSS (#1165 ) ## Motivation Adds several unmerged fixes for GPT OSS. Also adds GPT OSS 20B MXFP4 Q8 instead of Q4 for numerical stability (as this is unstable for MLX LM too) <!-- Why is this change needed? What problem does it solve? --> <!-- If it fixes an open issue, please link to the issue here --> ## Test Plan ### Manual Testing Manually tested. No further gibberish responses. ### Automated Testing Ran EXO Bench - pipeline, tensor and single node work on both 20B and 120B models	2026-01-15 19:20:17 +00:00