aaa

switch from synchronous threaded pinging to an async implementation (#1170)
Still seeing churn in our networking - let's properly rate limit it. ## changes Added an httpx client with a max-connections limit, using a persistent AsyncClient. ## testing Deployed on cluster; discovery is VASTLY more stable (the only deleted edges were those discovered by mDNS).
2026-01-16 09:59:43 -05:00 · 2026-01-16 14:20:34 +00:00 · 2026-01-16 13:20:03 +00:00 · 2026-01-16 13:10:50 +00:00 · 2026-01-16 12:39:22 +00:00 · 2026-01-16 12:34:28 +00:00
17 changed files with 759 additions and 1078 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -91,51 +91,6 @@ From .cursorrules:
 - Catch exceptions only where you can handle them meaningfully
 - Use `@final` and immutability wherever applicable

-## API Reference
-
-The API is served at `http://localhost:52415` by default. Key files:
- `docs/api.md`: Full API documentation
- `src/exo/master/api.py`: FastAPI implementation
- `src/exo/shared/types/api.py`: Request/response Pydantic models
-
-### Key Endpoints
-
-```
-GET  /node_id              # Current master node ID
-GET  /state                # Full cluster state (topology, instances, downloads, etc.)
-GET  /events               # Event log for debugging
-
-POST /instance             # Create model instance
-GET  /instance/{id}        # Get instance details
-DELETE /instance/{id}      # Delete instance
-GET  /instance/previews    # Preview placements for a model
-GET  /instance/placement   # Compute placement without creating
-
-GET  /models               # List available models
-GET  /v1/models            # OpenAI-compatible model list
-
-POST /v1/chat/completions  # OpenAI-compatible chat completions (streaming/non-streaming)
-POST /bench/chat/completions # Chat completions with performance stats
-```
-
-### Useful curl Commands
-
-```bash
-# Check cluster state
-curl -s http://localhost:52415/state | python3 -m json.tool
-
-# List models
-curl -s http://localhost:52415/models | python3 -m json.tool
-
-# Preview placements for a model
-curl -s "http://localhost:52415/instance/previews?model_id=llama-3.2-1b" | python3 -m json.tool
-
-# Chat completion
-curl -X POST http://localhost:52415/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{"model": "llama-3.2-1b", "messages": [{"role": "user", "content": "Hello"}]}'
-```
-
 ## Testing

 Tests use pytest-asyncio with `asyncio_mode = "auto"`. Tests are in `tests/` subdirectories alongside the code they test. The `EXO_TESTS=1` env var is set during tests.
--- a/app/EXO/EXO/ContentView.swift
+++ b/app/EXO/EXO/ContentView.swift
@@ -56,6 +56,11 @@ struct ContentView: View {
    }

    private var shouldShowLocalNetworkWarning: Bool {
+        // Show warning if local network is not working and EXO is running.
+        // The checker uses a longer timeout on first launch to allow time for
+        // the permission prompt, so this correctly handles both:
+        // 1. User denied permission on first launch
+        // 2. Permission broke after restart (macOS TCC bug)
        if case .notWorking = localNetworkChecker.status {
            return controller.status != .stopped
        }
--- a/app/EXO/EXO/Services/LocalNetworkChecker.swift
+++ b/app/EXO/EXO/Services/LocalNetworkChecker.swift
@@ -5,8 +5,8 @@ import os.log
 /// Checks if the app's local network permission is actually functional.
 ///
 /// macOS local network permission can appear enabled in System Preferences but not
-/// actually work after a restart. This service detects this by creating a UDP
-/// connection to the mDNS multicast address (224.0.0.251:5353).
+/// actually work after a restart. This service opens an NWConnection to the mDNS
+/// multicast address to verify actual connectivity.
@MainActor
 final class LocalNetworkChecker: ObservableObject {
    enum Status: Equatable {
@@ -35,30 +35,43 @@ final class LocalNetworkChecker: ObservableObject {
    }

    private static let logger = Logger(subsystem: "io.exo.EXO", category: "LocalNetworkChecker")
+    private static let hasCompletedInitialCheckKey = "LocalNetworkChecker.hasCompletedInitialCheck"

    @Published private(set) var status: Status = .unknown
-    @Published private(set) var lastConnectionState: String = "none"

    private var connection: NWConnection?
    private var checkTask: Task<Void, Never>?

+    /// Whether we've completed at least one check (stored in UserDefaults)
+    private var hasCompletedInitialCheck: Bool {
+        get { UserDefaults.standard.bool(forKey: Self.hasCompletedInitialCheckKey) }
+        set { UserDefaults.standard.set(newValue, forKey: Self.hasCompletedInitialCheckKey) }
+    }
+
    /// Checks if local network access is working.
    func check() {
        checkTask?.cancel()
        status = .checking
-        lastConnectionState = "connecting"
+
+        // Use longer timeout on first launch to allow time for permission prompt
+        let isFirstCheck = !hasCompletedInitialCheck
+        let timeout: UInt64 = isFirstCheck ? 30_000_000_000 : 3_000_000_000

        checkTask = Task { [weak self] in
            guard let self else { return }
-            let result = await self.performCheck()
+
+            Self.logger.info("Checking local network connectivity (first check: \(isFirstCheck))")
+            let result = await self.checkConnectivity(timeout: timeout)
            self.status = result
+            self.hasCompletedInitialCheck = true
+
            Self.logger.info("Local network check complete: \(result.displayText)")
        }
    }

-    private func performCheck() async -> Status {
-        Self.logger.info("Checking local network access via UDP multicast")
-
+    /// Checks connectivity using NWConnection to mDNS multicast.
+    /// The connection attempt triggers the permission prompt if not yet shown.
+    private func checkConnectivity(timeout: UInt64) async -> Status {
        connection?.cancel()
        connection = nil

@@ -84,22 +97,7 @@ final class LocalNetworkChecker: ObservableObject {
                continuation.resume(returning: status)
            }

-            conn.stateUpdateHandler = { [weak self] state in
-                let stateStr: String
-                switch state {
-                case .setup: stateStr = "setup"
-                case .preparing: stateStr = "preparing"
-                case .ready: stateStr = "ready"
-                case .waiting(let e): stateStr = "waiting(\(e))"
-                case .failed(let e): stateStr = "failed(\(e))"
-                case .cancelled: stateStr = "cancelled"
-                @unknown default: stateStr = "unknown"
-                }
-
-                Task { @MainActor in
-                    self?.lastConnectionState = stateStr
-                }
-
+            conn.stateUpdateHandler = { state in
                switch state {
                case .ready:
                    resumeOnce(.working)
@@ -108,6 +106,7 @@ final class LocalNetworkChecker: ObservableObject {
                    if errorStr.contains("54") || errorStr.contains("ECONNRESET") {
                        resumeOnce(.notWorking(reason: "Connection blocked"))
                    }
+                // Otherwise keep waiting - might be showing permission prompt
                case .failed(let error):
                    let errorStr = "\(error)"
                    if errorStr.contains("65") || errorStr.contains("EHOSTUNREACH")
@@ -127,7 +126,7 @@ final class LocalNetworkChecker: ObservableObject {
            conn.start(queue: .main)

            Task {
-                try? await Task.sleep(nanoseconds: 3_000_000_000)
+                try? await Task.sleep(nanoseconds: timeout)
                let state = conn.state
                switch state {
                case .ready:
--- a/bench/exo_bench.py
+++ b/bench/exo_bench.py
@@ -241,6 +241,9 @@ class PromptSizer:
            ids = tokenizer.apply_chat_template(
                messages, tokenize=True, add_generation_prompt=True
            )
+            # Fix for transformers 5.x
+            if hasattr(ids, "input_ids"):
+                ids = ids.input_ids
            return int(len(ids))

        return count_fn
--- a/dashboard/src/lib/components/ChatForm.svelte
+++ b/dashboard/src/lib/components/ChatForm.svelte
@@ -60,12 +60,39 @@
 		return models;
 	});

-	// Auto-select the first available model if none is selected
+	// Track previous model IDs to detect newly added models (plain variable to avoid reactive loop)
+	let previousModelIds: Set<string> = new Set();
+
+	// Auto-select the first available model if none is selected, if current selection is stale, or if a new model is added
 	$effect(() => {
 		const models = availableModels();
-		if (models.length > 0 && !currentModel) {
-			setSelectedChatModel(models[0].id);
+		const currentModelIds = new Set(models.map(m => m.id));
+
+		if (models.length > 0) {
+			// Find newly added models (in current but not in previous)
+			const newModels = models.filter(m => !previousModelIds.has(m.id));
+
+			// If no model selected, select the first available
+			if (!currentModel) {
+				setSelectedChatModel(models[0].id);
+			}
+			// If current model is stale (no longer has a running instance), reset to first available
+			else if (!models.some(m => m.id === currentModel)) {
+				setSelectedChatModel(models[0].id);
+			}
+			// If a new model was just added, select it
+			else if (newModels.length > 0 && previousModelIds.size > 0) {
+				setSelectedChatModel(newModels[0].id);
+			}
+		} else {
+			// No instances running - clear the selected model
+			if (currentModel) {
+				setSelectedChatModel('');
+			}
 		}
+
+		// Update previous model IDs for next comparison
+		previousModelIds = currentModelIds;
 	});

 	function getInstanceModelId(instanceWrapped: unknown): string {
--- a/dashboard/src/routes/+page.svelte
+++ b/dashboard/src/routes/+page.svelte
@@ -400,10 +400,8 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
 				const errorText = await response.text();
 				console.error('Failed to launch instance:', errorText);
 			} else {
-				// Auto-select the launched model only if no model is currently selected
-				if (!selectedChatModel()) {
-					setSelectedChatModel(modelId);
-				}
+				// Always auto-select the newly launched model so the user chats to what they just launched
+				setSelectedChatModel(modelId);
 				
 				// Scroll to the bottom of instances container to show the new instance
 				// Use multiple attempts to ensure DOM has updated with the new instance
@@ -763,6 +761,10 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
 	async function deleteInstance(instanceId: string) {
 		if (!confirm(`Delete instance ${instanceId.slice(0, 8)}...?`)) return;
 		
+		// Get the model ID of the instance being deleted before we delete it
+		const deletedInstanceModelId = getInstanceModelId(instanceData[instanceId]);
+		const wasSelected = selectedChatModel() === deletedInstanceModelId;
+		
 		try {
 			const response = await fetch(`/instance/${instanceId}`, {
 				method: 'DELETE',
@@ -771,6 +773,24 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
 			
 			if (!response.ok) {
 				console.error('Failed to delete instance:', response.status);
+			} else if (wasSelected) {
+				// If we deleted the currently selected model, switch to another available model
+				// Find another instance that isn't the one we just deleted
+				const remainingInstances = Object.entries(instanceData).filter(([id]) => id !== instanceId);
+				if (remainingInstances.length > 0) {
+					// Select the last instance (most recently added, since objects preserve insertion order)
+					const [, lastInstance] = remainingInstances[remainingInstances.length - 1];
+					const newModelId = getInstanceModelId(lastInstance);
+					if (newModelId && newModelId !== 'Unknown' && newModelId !== 'Unknown Model') {
+						setSelectedChatModel(newModelId);
+					} else {
+						// Clear selection if no valid model found
+						setSelectedChatModel('');
+					}
+				} else {
+					// No more instances, clear the selection
+					setSelectedChatModel('');
+				}
 			}
 		} catch (error) {
 			console.error('Error deleting instance:', error);
--- a/2
+++ b/2
@@ -1,3 +1,5 @@
+export NIX_CONFIG := "extra-experimental-features = nix-command flakes"
+
 fmt:
    nix fmt

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,8 +6,6 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
    "aiofiles>=24.1.0",
-    "aiohttp>=3.12.14",
-    "types-aiofiles>=24.1.0.20250708",
    "pydantic>=2.11.7",
    "fastapi>=0.116.1",
    "filelock>=3.18.0",
@@ -23,6 +21,7 @@ dependencies = [
    "tiktoken>=0.12.0", # required for kimi k2 tokenizer
    "hypercorn>=0.18.0",
    "openai-harmony>=0.0.8",
+    "httpx>=0.28.1",
 ]

 [project.scripts]
--- a/src/exo/master/api.py
+++ b/src/exo/master/api.py
@@ -13,12 +13,6 @@ from hypercorn.asyncio import serve  # pyright: ignore[reportUnknownVariableType
 from hypercorn.config import Config
 from hypercorn.typing import ASGIFramework
 from loguru import logger
-from openai_harmony import (  # pyright: ignore[reportMissingTypeStubs]
-    HarmonyEncodingName,
-    Role,
-    StreamableParser,
-    load_harmony_encoding,
-)

 from exo.master.placement import place_instance as get_instance_placements
 from exo.shared.apply import apply
@@ -67,8 +61,6 @@ from exo.utils.channels import Receiver, Sender, channel
 from exo.utils.dashboard_path import find_dashboard
 from exo.utils.event_buffer import OrderedBuffer

-encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
-

 def chunk_to_response(
    chunk: TokenChunk, command_id: CommandId
@@ -381,35 +373,8 @@ class API:
            instance_id=instance_id,
        )

-    async def _process_gpt_oss(self, token_chunks: Receiver[TokenChunk]):
-        stream = StreamableParser(encoding, role=Role.ASSISTANT)
-        thinking = False
-
-        async for chunk in token_chunks:
-            stream.process(chunk.token_id)
-
-            delta = stream.last_content_delta
-            ch = stream.current_channel
-
-            if ch == "analysis" and not thinking:
-                thinking = True
-                yield chunk.model_copy(update={"text": "<think>"})
-
-            if ch != "analysis" and thinking:
-                thinking = False
-                yield chunk.model_copy(update={"text": "</think>"})
-
-            if delta:
-                yield chunk.model_copy(update={"text": delta})
-
-            if chunk.finish_reason is not None:
-                if thinking:
-                    yield chunk.model_copy(update={"text": "</think>"})
-                yield chunk
-                break
-
    async def _chat_chunk_stream(
-        self, command_id: CommandId, parse_gpt_oss: bool
+        self, command_id: CommandId
    ) -> AsyncGenerator[TokenChunk, None]:
        """Yield `TokenChunk`s for a given command until completion."""

@@ -417,16 +382,10 @@ class API:
            self._chat_completion_queues[command_id], recv = channel[TokenChunk]()

            with recv as token_chunks:
-                if parse_gpt_oss:
-                    async for chunk in self._process_gpt_oss(token_chunks):
-                        yield chunk
-                        if chunk.finish_reason is not None:
-                            break
-                else:
-                    async for chunk in token_chunks:
-                        yield chunk
-                        if chunk.finish_reason is not None:
-                            break
+                async for chunk in token_chunks:
+                    yield chunk
+                    if chunk.finish_reason is not None:
+                        break

        except anyio.get_cancelled_exc_class():
            # TODO: TaskCancelled
@@ -442,11 +401,11 @@ class API:
            del self._chat_completion_queues[command_id]

    async def _generate_chat_stream(
-        self, command_id: CommandId, parse_gpt_oss: bool
+        self, command_id: CommandId
    ) -> AsyncGenerator[str, None]:
        """Generate chat completion stream as JSON strings."""

-        async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
+        async for chunk in self._chat_chunk_stream(command_id):
            chunk_response: ChatCompletionResponse = chunk_to_response(
                chunk, command_id
            )
@@ -458,7 +417,7 @@ class API:
                yield "data: [DONE]\n\n"

    async def _collect_chat_completion(
-        self, command_id: CommandId, parse_gpt_oss: bool
+        self, command_id: CommandId
    ) -> ChatCompletionResponse:
        """Collect all token chunks for a chat completion and return a single response."""

@@ -466,7 +425,7 @@ class API:
        model: str | None = None
        finish_reason: FinishReason | None = None

-        async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
+        async for chunk in self._chat_chunk_stream(command_id):
            if model is None:
                model = chunk.model

@@ -495,7 +454,7 @@ class API:
        )

    async def _collect_chat_completion_with_stats(
-        self, command_id: CommandId, parse_gpt_oss: bool
+        self, command_id: CommandId
    ) -> BenchChatCompletionResponse:
        text_parts: list[str] = []
        model: str | None = None
@@ -503,7 +462,7 @@ class API:

        stats: GenerationStats | None = None

-        async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
+        async for chunk in self._chat_chunk_stream(command_id):
            if model is None:
                model = chunk.model

@@ -544,8 +503,6 @@ class API:
        """Handle chat completions, supporting both streaming and non-streaming responses."""
        model_meta = await resolve_model_meta(payload.model)
        payload.model = model_meta.model_id
-        parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
-        logger.info(f"{parse_gpt_oss=}")

        if not any(
            instance.shard_assignments.model_id == payload.model
@@ -562,17 +519,16 @@ class API:
        await self._send(command)
        if payload.stream:
            return StreamingResponse(
-                self._generate_chat_stream(command.command_id, parse_gpt_oss),
+                self._generate_chat_stream(command.command_id),
                media_type="text/event-stream",
            )

-        return await self._collect_chat_completion(command.command_id, parse_gpt_oss)
+        return await self._collect_chat_completion(command.command_id)

    async def bench_chat_completions(
        self, payload: BenchChatCompletionTaskParams
    ) -> BenchChatCompletionResponse:
        model_meta = await resolve_model_meta(payload.model)
-        parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
        payload.model = model_meta.model_id

        if not any(
@@ -589,10 +545,7 @@ class API:
        command = ChatCompletion(request_params=payload)
        await self._send(command)

-        response = await self._collect_chat_completion_with_stats(
-            command.command_id,
-            parse_gpt_oss,
-        )
+        response = await self._collect_chat_completion_with_stats(command.command_id)
        return response

    def _calculate_total_available_memory(self) -> Memory:
--- a/src/exo/shared/logging.py
+++ b/src/exo/shared/logging.py
@@ -29,6 +29,11 @@ class _InterceptHandler(logging.Handler):

 def logger_setup(log_file: Path | None, verbosity: int = 0):
    """Set up logging for this process - formatting, file handles, verbosity and output"""
+
+    logging.getLogger("exo_pyo3_bindings").setLevel(logging.WARNING)
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+    logging.getLogger("httpcore").setLevel(logging.WARNING)
+
    logger.remove()

    # replace all stdlib loggers with _InterceptHandlers that log to loguru
--- a/src/exo/shared/models/model_cards.py
+++ b/src/exo/shared/models/model_cards.py
@@ -425,15 +425,15 @@ MODEL_CARDS: dict[str, ModelCard] = {
            supports_tensor=True,
        ),
    ),
-    "gpt-oss-20b-4bit": ModelCard(
-        short_id="gpt-oss-20b-4bit",
-        model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
-        name="GPT-OSS 20B (MXFP4-Q4, MLX)",
-        description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this MLX variant uses MXFP4 4-bit quantization.""",
+    "gpt-oss-20b-MXFP4-Q8": ModelCard(
+        short_id="gpt-oss-20b-MXFP4-Q8",
+        model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
+        name="GPT-OSS 20B (MXFP4-Q8, MLX)",
+        description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this variant is a 4-bit MLX conversion for Apple Silicon.""",
        tags=[],
        metadata=ModelMetadata(
-            model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
-            pretty_name="GPT-OSS 20B (MXFP4-Q4, MLX)",
+            model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
+            pretty_name="GPT-OSS 20B (MXFP4-Q8, MLX)",
            storage_size=Memory.from_kb(11_744_051),
            n_layers=24,
            hidden_size=2880,
--- a/src/exo/worker/download/download_utils.py
+++ b/src/exo/worker/download/download_utils.py
@@ -7,13 +7,13 @@ import time
 import traceback
 from datetime import timedelta
 from pathlib import Path
-from typing import Callable, Literal
+from typing import Callable, Literal, cast
 from urllib.parse import urljoin

 import aiofiles
 import aiofiles.os as aios
-import aiohttp
 import certifi
+import httpx
 from loguru import logger
 from pydantic import (
    BaseModel,
@@ -207,23 +207,22 @@ async def _fetch_file_list(
    headers = await get_download_headers()
    async with (
        create_http_session(timeout_profile="short") as session,
-        session.get(url, headers=headers) as response,
    ):
-        if response.status == 200:
-            data_json = await response.text()
-            data = TypeAdapter(list[FileListEntry]).validate_json(data_json)
-            files: list[FileListEntry] = []
-            for item in data:
-                if item.type == "file":
-                    files.append(FileListEntry.model_validate(item))
-                elif item.type == "directory" and recursive:
-                    subfiles = await _fetch_file_list(
-                        repo_id, revision, item.path, recursive
-                    )
-                    files.extend(subfiles)
-            return files
-        else:
-            raise Exception(f"Failed to fetch file list: {response.status}")
+        response = await session.get(url, headers=headers)
+        if response.status_code != 200:
+            raise Exception(f"Failed to fetch file list: {response.status_code}")
+
+        data = TypeAdapter(list[FileListEntry]).validate_json(response.text)
+        files: list[FileListEntry] = []
+        for item in data:
+            if item.type == "file":
+                files.append(FileListEntry.model_validate(item))
+            elif item.type == "directory" and recursive:
+                subfiles = await _fetch_file_list(
+                    repo_id, revision, item.path, recursive
+                )
+                files.extend(subfiles)
+        return files


 async def get_download_headers() -> dict[str, str]:
@@ -231,31 +230,25 @@ async def get_download_headers() -> dict[str, str]:


 def create_http_session(
-    auto_decompress: bool = False,
    timeout_profile: Literal["short", "long"] = "long",
-) -> aiohttp.ClientSession:
+) -> httpx.AsyncClient:
    if timeout_profile == "short":
        total_timeout = 30
        connect_timeout = 10
-        sock_read_timeout = 30
-        sock_connect_timeout = 10
+        read_timeout = 30
    else:
        total_timeout = 1800
        connect_timeout = 60
-        sock_read_timeout = 1800
-        sock_connect_timeout = 60
+        read_timeout = 1800

    ssl_context = ssl.create_default_context(cafile=certifi.where())
-    connector = aiohttp.TCPConnector(ssl=ssl_context)

-    return aiohttp.ClientSession(
-        auto_decompress=auto_decompress,
-        connector=connector,
-        timeout=aiohttp.ClientTimeout(
-            total=total_timeout,
+    return httpx.AsyncClient(
+        verify=ssl_context,
+        timeout=httpx.Timeout(
+            total_timeout,  # default for write/pool; httpx rejects a partial Timeout
            connect=connect_timeout,
-            sock_read=sock_read_timeout,
-            sock_connect=sock_connect_timeout,
+            read=read_timeout,
        ),
    )

@@ -282,23 +275,25 @@ async def file_meta(
    headers = await get_download_headers()
    async with (
        create_http_session(timeout_profile="short") as session,
-        session.head(url, headers=headers) as r,
    ):
-        if r.status == 307:
+        r = await session.head(url, headers=headers)
+        if r.status_code == 307:
            # On redirect, only trust Hugging Face's x-linked-* headers.
-            x_linked_size = r.headers.get("x-linked-size")
-            x_linked_etag = r.headers.get("x-linked-etag")
+            x_linked_size = cast(str | None, r.headers.get("x-linked-size"))
+            x_linked_etag = cast(str | None, r.headers.get("x-linked-etag"))
            if x_linked_size and x_linked_etag:
                content_length = int(x_linked_size)
                etag = trim_etag(x_linked_etag)
                return content_length, etag
            # Otherwise, follow the redirect to get authoritative size/hash
-            redirected_location = r.headers.get("location")
+            redirected_location = cast(str | None, r.headers.get("location"))
            return await file_meta(repo_id, revision, path, redirected_location)
-        content_length = int(
-            r.headers.get("x-linked-size") or r.headers.get("content-length") or 0
+        content_length = cast(
+            str | None,
+            r.headers.get("x-linked-size") or r.headers.get("content-length"),
        )
-        etag = r.headers.get("x-linked-etag") or r.headers.get("etag")
+        content_length = 0 if content_length is None else int(content_length)
+        etag = cast(str | None, r.headers.get("x-linked-etag") or r.headers.get("etag"))
        assert content_length > 0, f"No content length for {url}"
        assert etag is not None, f"No remote hash for {url}"
        etag = trim_etag(etag)
@@ -357,17 +352,17 @@ async def _download_file(
        n_read = resume_byte_pos or 0
        async with (
            create_http_session(timeout_profile="long") as session,
-            session.get(url, headers=headers) as r,
        ):
-            if r.status == 404:
+            r = await session.get(url, headers=headers)
+            if r.status_code == 404:
                raise FileNotFoundError(f"File not found: {url}")
-            assert r.status in [200, 206], (
-                f"Failed to download {path} from {url}: {r.status}"
+            assert r.status_code in [200, 206], (
+                f"Failed to download {path} from {url}: {r.status_code}"
            )
            async with aiofiles.open(
                partial_path, "ab" if resume_byte_pos else "wb"
            ) as f:
-                while chunk := await r.content.read(8 * 1024 * 1024):
+                async for chunk in r.aiter_bytes(8 * 1024 * 1024):
                    n_read = n_read + (await f.write(chunk))
                    on_progress(n_read, length, False)

--- a/src/exo/worker/engines/mlx/auto_parallel.py
+++ b/src/exo/worker/engines/mlx/auto_parallel.py
@@ -228,10 +228,15 @@ def tensor_auto_parallel(
        group=group,
    )

-    logger.info(f"tensor_auto_parallel: model type = {type(model).__name__}")
+    if hasattr(model, "shard"):
+        try:
+            model.shard(group)  # type: ignore
+            return model
+        except (AttributeError, TypeError, NameError):
+            pass

    if isinstance(model, (LlamaModel, Ministral3Model)):
-        logger.info("Using LlamaShardingStrategy")
+        logger.warning("shouldn't be hit - upstream sharding exists")
        tensor_parallel_sharding_strategy = LlamaShardingStrategy(
            group,
            all_to_sharded_linear,
@@ -240,7 +245,7 @@ def tensor_auto_parallel(
            sharded_to_all_linear_in_place,
        )
    elif isinstance(model, (DeepseekV3Model, DeepseekV32Model)):
-        logger.info("Using DeepSeekShardingStrategy")
+        logger.warning("shouldn't be hit - upstream sharding exists")
        tensor_parallel_sharding_strategy = DeepSeekShardingStrategy(
            group,
            all_to_sharded_linear,
@@ -249,7 +254,6 @@ def tensor_auto_parallel(
            sharded_to_all_linear_in_place,
        )
    elif isinstance(model, MiniMaxModel):
-        logger.info("Using MiniMaxShardingStrategy")
        tensor_parallel_sharding_strategy = MiniMaxShardingStrategy(
            group,
            all_to_sharded_linear,
@@ -258,7 +262,6 @@ def tensor_auto_parallel(
            sharded_to_all_linear_in_place,
        )
    elif isinstance(model, (Qwen3MoeModel, Glm4MoeModel, Qwen3NextModel)):
-        logger.info("Using QwenShardingStrategy")
        tensor_parallel_sharding_strategy = QwenShardingStrategy(
            group,
            all_to_sharded_linear,
@@ -267,7 +270,6 @@ def tensor_auto_parallel(
            sharded_to_all_linear_in_place,
        )
    elif isinstance(model, GptOssModel):
-        logger.info("Using GptOssShardingStrategy for tensor parallelism")
        tensor_parallel_sharding_strategy = GptOssShardingStrategy(
            group,
            all_to_sharded_linear,
@@ -350,8 +352,6 @@ def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
 class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
    def shard_model(self, model: nn.Module) -> nn.Module:
        model = cast(DeepseekV3Model, model)
-        dense_count = 0
-        moe_count = 0
        for layer in model.layers:
            # Shard the self attention
            if layer.self_attn.q_lora_rank is None:
@@ -370,7 +370,6 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):

            # Shard the MLP
            if isinstance(layer.mlp, (DeepseekV3MLP, DeepseekV32MLP)):
-                dense_count += 1
                layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
                layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
                layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
@@ -378,7 +377,6 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
            # Shard the MoE. Shard in place since the MoE should be responsible
            # for aggregating the results.
            else:
-                moe_count += 1
                self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.gate_proj)
                self.sharded_to_all_linear_in_place(layer.mlp.shared_experts.down_proj)
                self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.up_proj)
@@ -388,7 +386,6 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
                layer.mlp = ShardedDeepseekV3MoE(layer.mlp)  # type: ignore
                layer.mlp.sharding_group = self.group

-        logger.info(f"DeepSeekShardingStrategy: {dense_count} dense layers (shard_linear), {moe_count} MoE layers (shard_inplace)")
        return model


@@ -484,6 +481,7 @@ class ShardedQwenMoE(CustomMlxLayer):
 class GptOssShardingStrategy(TensorParallelShardingStrategy):
    def shard_model(self, model: nn.Module) -> nn.Module:
        model = cast(GptOssMoeModel, model)
+
        for layer in model.layers:
            layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
            layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
--- a/src/exo/worker/engines/mlx/utils_mlx.py
+++ b/src/exo/worker/engines/mlx/utils_mlx.py
@@ -20,6 +20,7 @@ except ImportError:

 from mlx_lm.models.cache import KVCache, QuantizedKVCache, RotatingKVCache
 from mlx_lm.models.deepseek_v3 import DeepseekV3Model
+from mlx_lm.models.gpt_oss import Model as GptOssModel
 from mlx_lm.tokenizer_utils import TokenizerWrapper

 from exo.worker.engines.mlx.constants import (
@@ -162,9 +163,7 @@ def mlx_distributed_init(
                os.environ["MLX_IBV_DEVICES"] = coordination_file
                os.environ["MLX_RANK"] = str(rank)
                os.environ["MLX_JACCL_COORDINATOR"] = jaccl_coordinator
-                logger.info(f"rank {rank} BEFORE mx.distributed.init(backend='jaccl')")
                group = mx.distributed.init(backend="jaccl", strict=True)
-                logger.info(f"rank {rank} AFTER mx.distributed.init - group created")

        logger.info(f"Rank {rank} mlx distributed initialization complete")

@@ -201,12 +200,10 @@ def load_mlx_items(
        tokenizer = get_tokenizer(model_path, bound_instance.bound_shard)

    else:
-        logger.info("Starting distributed shard_and_load")
+        logger.info("Starting distributed init")
        start_time = time.perf_counter()
-        logger.info(f"BEFORE shard_and_load for model {bound_instance.bound_shard.model_meta.model_id}")
        model, tokenizer = shard_and_load(bound_instance.bound_shard, group=group)
        end_time = time.perf_counter()
-        logger.info(f"AFTER shard_and_load completed")
        logger.info(
            f"Time taken to shard and load model: {(end_time - start_time):.2f}s"
        )
@@ -221,10 +218,8 @@ def shard_and_load(
    group: Group,
 ) -> tuple[nn.Module, TokenizerWrapper]:
    model_path = build_model_path(shard_metadata.model_meta.model_id)
-    logger.info(f"shard_and_load: model_path={model_path}")
-    logger.info("BEFORE load_model (lazy=True)")
+
    model, _ = load_model(model_path, lazy=True, strict=False)
-    logger.info("AFTER load_model")
    logger.debug(model)
    if hasattr(model, "model") and isinstance(model.model, DeepseekV3Model):  # type: ignore
        pass
@@ -258,6 +253,8 @@ def shard_and_load(
            model = pipeline_auto_parallel(model, group, shard_metadata)

    mx.eval(model.parameters())
+
+    # TODO: Do we need this?
    mx.eval(model)

    logger.debug("SHARDED")
@@ -369,6 +366,8 @@ def apply_chat_template(
        tools=chat_task_data.tools,
    )

+    logger.info(prompt)
+
    return prompt


@@ -400,6 +399,11 @@ def make_kv_cache(
 ) -> list[KVCache | RotatingKVCache | QuantizedKVCache]:
    assert hasattr(model, "layers")

+    # TODO: Do this for all models
+    if hasattr(model, "make_cache") and isinstance(model, GptOssModel):
+        logger.info("Using MLX LM's make cache")
+        return model.make_cache()  # type: ignore
+
    if max_kv_size is None:
        if KV_CACHE_BITS is None:
            logger.info("Using default KV cache")
--- a/src/exo/worker/runner/runner.py
+++ b/src/exo/worker/runner/runner.py
@@ -1,6 +1,15 @@
 import time
+from collections.abc import Generator
+from functools import cache

 import mlx.core as mx
+from mlx_lm.models.gpt_oss import Model as GptOssModel
+from openai_harmony import (  # pyright: ignore[reportMissingTypeStubs]
+    HarmonyEncodingName,
+    Role,
+    StreamableParser,
+    load_harmony_encoding,
+)

 from exo.shared.types.api import ChatCompletionMessageText
 from exo.shared.types.chunks import TokenChunk
@@ -153,11 +162,19 @@ def main(
                    _check_for_debug_prompts(task_params.messages[0].content)

                    # Generate responses using the actual MLX generation
-                    for response in mlx_generate(
+                    mlx_generator = mlx_generate(
                        model=model,
                        tokenizer=tokenizer,
                        task=task_params,
-                    ):
+                    )
+
+                    # GPT-OSS specific parsing to match other model formats.
+                    if isinstance(model, GptOssModel):
+                        mlx_generator = parse_gpt_oss(mlx_generator)
+
+                    # TODO: Add tool call parser here
+
+                    for response in mlx_generator:
                        match response:
                            case GenerationResponse():
                                if shard_metadata.device_rank == 0:
@@ -207,6 +224,43 @@ def main(
                break


+@cache
+def get_gpt_oss_encoding():
+    """Return the Harmony GPT-OSS encoding; `@cache` makes repeat calls reuse it."""
+    return load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+
+
+def parse_gpt_oss(
+    responses: Generator[GenerationResponse],
+) -> Generator[GenerationResponse]:
+    """Turn raw GPT-OSS responses into parsed text chunks.
+
+    "analysis"-channel content is bracketed by <think>/</think> chunks; the
+    response carrying a finish reason is yielded unmodified last.
+    """
+    parser = StreamableParser(get_gpt_oss_encoding(), role=Role.ASSISTANT)
+    in_analysis = False
+
+    for response in responses:
+        parser.process(response.token)
+        content_delta = parser.last_content_delta
+        on_analysis = parser.current_channel == "analysis"
+
+        # Emit a marker chunk whenever the parser enters or leaves "analysis".
+        if on_analysis != in_analysis:
+            in_analysis = on_analysis
+            marker = "<think>" if on_analysis else "</think>"
+            yield response.model_copy(update={"text": marker})
+        if content_delta:
+            yield response.model_copy(update={"text": content_delta})
+        if response.finish_reason is None:
+            continue
+        if in_analysis:  # close a still-open thinking section before finishing
+            yield response.model_copy(update={"text": "</think>"})
+        yield response
+        return
+
+
 EXO_RUNNER_MUST_FAIL = "EXO RUNNER MUST FAIL"
 EXO_RUNNER_MUST_OOM = "EXO RUNNER MUST OOM"
 EXO_RUNNER_MUST_TIMEOUT = "EXO RUNNER MUST TIMEOUT"
--- a/src/exo/worker/utils/net_profile.py
+++ b/src/exo/worker/utils/net_profile.py
@@ -1,60 +1,63 @@
-import http.client
-import time
-
-from anyio import create_task_group, to_thread
+import anyio
+import httpx
+from anyio import create_task_group
 from loguru import logger

 from exo.shared.topology import Topology
 from exo.shared.types.common import NodeId

-BAD_STATUSLINE_ATTEMPTS = 3
+REACHABILITY_ATTEMPTS = 3


 async def check_reachability(
    target_ip: str,
    expected_node_id: NodeId,
-    self_node_id: NodeId,
    out: dict[NodeId, set[str]],
+    client: httpx.AsyncClient,
 ) -> None:
    """Check if a node is reachable at the given IP and verify its identity."""
+    if ":" in target_ip:
+        # TODO: use real IpAddress types
+        target_ip = f"[{target_ip}]"
+    url = f"http://{target_ip}:52415/node_id"

-    # TODO: use an async http client
-    def _fetch_remote_node_id(*, attempt: int = 1) -> NodeId | None:
-        connection = http.client.HTTPConnection(target_ip, 52415, timeout=3)
+    remote_node_id = None
+
+    last_error = None
+
+    for _ in range(REACHABILITY_ATTEMPTS):
        try:
-            connection.request("GET", "/node_id")
-            response = connection.getresponse()
-            if response.status != 200:
-                return None
+            r = await client.get(url)
+            if r.status_code != 200:
+                await anyio.sleep(1)
+                continue

-            body = response.read().decode("utf-8").strip()
+            body = r.text.strip().strip('"')
+            if not body:
+                await anyio.sleep(1)
+                continue

-            # Strip quotes if present (JSON string response)
-            if body.startswith('"') and body.endswith('"') and len(body) >= 2:
-                body = body[1:-1]
+            remote_node_id = NodeId(body)
+            break

-            return NodeId(body) or None
-        except OSError:
-            return None
-        except http.client.BadStatusLine:
-            if attempt >= BAD_STATUSLINE_ATTEMPTS:
-                logger.warning(
-                    f"BadStatusLine from {target_ip}, after {attempt} attempts, assuming connection to {expected_node_id} has dropped"
-                )
-                return None
-            time.sleep(1)
-            return _fetch_remote_node_id(attempt=attempt + 1)
-        except http.client.HTTPException as e:
-            logger.warning(f"HTTPException from {target_ip}: {type(e).__name__}: {e}")
-            return None
-        finally:
-            connection.close()
+        except (
+            httpx.ConnectError,
+            httpx.ConnectTimeout,
+            httpx.ReadTimeout,
+            httpx.RemoteProtocolError,
+        ) as e:
+            last_error = e
+            await anyio.sleep(1)

-    remote_node_id = await to_thread.run_sync(_fetch_remote_node_id)
-    if remote_node_id is None:
-        return
-
-    if remote_node_id == self_node_id:
+    else:
+        if last_error is not None:
+            logger.warning(
+                f"connect error {type(last_error).__name__} from {target_ip} after {REACHABILITY_ATTEMPTS} attempts; treating as down"
+            )
+        else:
+            logger.warning(
+                f"malformed response from {target_ip} after {REACHABILITY_ATTEMPTS} attempts; treating as down"
+            )
        return

    if remote_node_id != expected_node_id:
@@ -74,18 +77,33 @@ async def check_reachable(
    topology: Topology, self_node_id: NodeId
 ) -> dict[NodeId, set[str]]:
    """Check which nodes are reachable and return their IPs."""
+
    reachable: dict[NodeId, set[str]] = {}
-    async with create_task_group() as tg:
+
+    # these are intentionally httpx's defaults so we can tune them later
+    timeout = httpx.Timeout(timeout=5.0)
+    limits = httpx.Limits(
+        max_connections=100,
+        max_keepalive_connections=20,
+        keepalive_expiry=5,
+    )
+
+    async with (
+        httpx.AsyncClient(timeout=timeout, limits=limits) as client,
+        create_task_group() as tg,
+    ):
        for node in topology.list_nodes():
            if not node.node_profile:
                continue
+            if node.node_id == self_node_id:
+                continue
            for iface in node.node_profile.network_interfaces:
                tg.start_soon(
                    check_reachability,
                    iface.ip_address,
                    node.node_id,
-                    self_node_id,
                    reachable,
+                    client,
                )

    return reachable
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
Evan	1f28b22bf7	aaa	2026-01-16 14:20:34 +00:00
Evan Quiney	39ee2bf7bd	switch from synchronous threaded pinging to an async implementation (#1170 ) still seeing churn in our networking - lets properly rate limit it ## changes added an httpx client with max connections with a persistent AsyncClient ## testing deployed on cluster, discovery VASTLY more stable (the only deleted edges were those discovered by mdns)	2026-01-16 13:20:03 +00:00
Sami Khan	991adfbd6f	fix local network warning (#1136 ) ## Motivation Local network warning banner was showing on fresh install even though mDNS was working. The check would fail before the user had a chance to grant permission via the macOS prompt. ## Changes - Added `hasWorkedBefore` flag persisted in UserDefaults - Only show warning if permission previously worked but now doesn't ## Why It Works On fresh install, the check may fail (no permission yet), but `hasWorkedBefore` is false so no warning shows. Once the user grants permission and a check succeeds, we record it. Future failures (zombie permission after restart) will show the warning since `hasWorkedBefore` is now true. ## Test Plan ### Manual Testing Run locally ### Automated Testing N/A	2026-01-16 13:10:50 +00:00
rltakashige	4b3de6b984	Fix exo bench for transformers 5.x (#1168 ) ## Motivation Prompt Sizer was broken as transformers 5.x tokenizers create BatchEncodings which are essentially a dictionary of {input_ids: []} instead of the list of input ids. ## Test Plan ### Manual Testing Tested that exo bench runs as expected. ### Automated Testing <!-- Describe changes to automated tests, or how existing tests cover this change --> <!-- - -->	2026-01-16 12:39:22 +00:00
Evan	c8de3b90ea	quiet rust logs rust logs were too verbose - now only warnings propagate to python entirely happy not to merge this and to clean up rust logging instead, but this felt saner right now	2026-01-16 12:34:28 +00:00
Sami Khan	6e6567a802	resolve issue #1070 (#1076 ) ## Motivation https://github.com/exo-explore/exo/issues/1070 ## Changes Added check in ChatForm.svelte to reset selectedChatModel when it no longer matches any running instance. ## Why It Works The $effect now detects when the selected model is stale (not in availableModels()) and resets to the first available model. ## Test Plan ### Manual Testing 1. Create instance of Model A → Delete it → Create instance of Model B → Chat 2. Verify request goes to Model B (not Model A) --------- Co-authored-by: Alex Cheema <41707476+AlexCheema@users.noreply.github.com>	2026-01-15 20:00:41 +00:00
rltakashige	a735dad667	Parse GPT OSS in runner (#1160 ) ## Motivation Simplification of API + moving model specific code to the runner <!-- Why is this change needed? What problem does it solve? --> <!-- If it fixes an open issue, please link to the issue here --> ## Test Plan ### Manual Testing Tested that GPT OSS outputs are parsed correctly on the dashboard. ### Automated Testing <!-- Describe changes to automated tests, or how existing tests cover this change --> <!-- - -->	2026-01-15 19:53:55 +00:00
rltakashige	aaf4e36bc3	FIX GPT OSS (#1165 ) ## Motivation Adds several unmerged fixes for GPT OSS. Also adds GPT OSS 20B MXFP4 Q8 instead of Q4 for numerical stability (as this is unstable for MLX LM too) <!-- Why is this change needed? What problem does it solve? --> <!-- If it fixes an open issue, please link to the issue here --> ## Test Plan ### Manual Testing Manually tested. No further gibberish responses. ### Automated Testing Ran EXO Bench - pipeline, tensor and single node work on both 20B and 120B models	2026-01-15 19:20:17 +00:00