Compare commits

2 Commits

Author     SHA1        Message                                  Date
ciaranbor  77fbffcebe  Ensure unique model Id for each quant    2026-01-23 20:08:45 +00:00
ciaranbor  4feb3cde86  Enable image model quantization          2026-01-23 19:58:50 +00:00
11 changed files with 544 additions and 834 deletions

View File

@@ -45,8 +45,8 @@ struct EXOApp: App {
let thunderboltBridge = ThunderboltBridgeService(clusterStateService: service)
_thunderboltBridgeService = StateObject(wrappedValue: thunderboltBridge)
enableLaunchAtLoginIfNeeded()
// Install LaunchDaemon to disable Thunderbolt Bridge on startup (prevents network loops)
NetworkSetupHelper.promptAndInstallIfNeeded()
// Remove old LaunchDaemon components if they exist (from previous versions)
cleanupLegacyNetworkSetup()
// Check local network access periodically (warning disappears when user grants permission)
localNetwork.startPeriodicChecking(interval: 10)
controller.scheduleLaunch(after: 15)
@@ -136,6 +136,36 @@ struct EXOApp: App {
}
}
private func cleanupLegacyNetworkSetup() {
guard NetworkSetupHelper.hasInstalledComponents() else { return }
// Dispatch async to ensure app is ready before showing alert
DispatchQueue.main.async {
let alert = NSAlert()
alert.messageText = "EXO Network Configuration"
alert.informativeText =
"EXO needs to configure local network discovery on your device. This requires granting permission once."
alert.alertStyle = .informational
alert.addButton(withTitle: "Continue")
alert.addButton(withTitle: "Later")
let response = alert.runModal()
guard response == .alertFirstButtonReturn else {
Logger().info("User deferred legacy network setup cleanup")
return
}
do {
try NetworkSetupHelper.uninstall()
Logger().info("Cleaned up legacy network setup components")
} catch {
// Non-fatal: user may have cancelled admin prompt or cleanup may have
// partially succeeded. The app will continue normally.
Logger().warning(
"Could not clean up legacy network setup (non-fatal): \(error.localizedDescription)"
)
}
}
}
}
/// Helper for managing EXO's launch-at-login registration

View File

@@ -11,68 +11,6 @@ enum NetworkSetupHelper {
private static let legacyScriptDestination =
"/Library/Application Support/EXO/disable_bridge_enable_dhcp.sh"
private static let plistDestination = "/Library/LaunchDaemons/io.exo.networksetup.plist"
private static let requiredStartInterval: Int = 1786
private static let setupScript = """
#!/usr/bin/env bash
set -euo pipefail
PREFS="/Library/Preferences/SystemConfiguration/preferences.plist"
# Remove bridge0 interface
ifconfig bridge0 &>/dev/null && {
ifconfig bridge0 | grep -q 'member' && {
ifconfig bridge0 | awk '/member/ {print $2}' | xargs -n1 ifconfig bridge0 deletem 2>/dev/null || true
}
ifconfig bridge0 destroy 2>/dev/null || true
}
# Remove Thunderbolt Bridge from VirtualNetworkInterfaces in preferences.plist
/usr/libexec/PlistBuddy -c "Delete :VirtualNetworkInterfaces:Bridge:bridge0" "$PREFS" 2>/dev/null || true
networksetup -listnetworkservices | grep -q "Thunderbolt Bridge" && {
networksetup -setnetworkserviceenabled "Thunderbolt Bridge" off
} || true
"""
/// Prompts user and installs the LaunchDaemon if not already installed.
/// Shows an alert explaining what will be installed before requesting admin privileges.
static func promptAndInstallIfNeeded() {
// Use .utility priority to match NSAppleScript's internal QoS and avoid priority inversion
Task.detached(priority: .utility) {
// If already correctly installed, skip
if daemonAlreadyInstalled() {
return
}
// Show alert on main thread
let shouldInstall = await MainActor.run {
let alert = NSAlert()
alert.messageText = "EXO Network Configuration"
alert.informativeText =
"EXO needs to install a system service to automatically disable Thunderbolt Bridge on startup. This prevents network loops when connecting multiple Macs via Thunderbolt.\n\nYou will be prompted for your administrator password."
alert.alertStyle = .informational
alert.addButton(withTitle: "Install")
alert.addButton(withTitle: "Not Now")
return alert.runModal() == .alertFirstButtonReturn
}
guard shouldInstall else {
logger.info("User deferred network setup daemon installation")
return
}
do {
try installLaunchDaemon()
logger.info("Network setup launch daemon installed and started")
} catch {
logger.error(
"Network setup launch daemon failed: \(error.localizedDescription, privacy: .public)"
)
}
}
}
/// Removes all EXO network setup components from the system.
/// This includes the LaunchDaemon, scripts, logs, and network location.
@@ -92,100 +30,6 @@ enum NetworkSetupHelper {
return scriptExists || legacyScriptExists || plistExists
}
private static func daemonAlreadyInstalled() -> Bool {
let manager = FileManager.default
let scriptExists = manager.fileExists(atPath: scriptDestination)
let plistExists = manager.fileExists(atPath: plistDestination)
guard scriptExists, plistExists else { return false }
guard
let installedScript = try? String(contentsOfFile: scriptDestination, encoding: .utf8),
installedScript.trimmingCharacters(in: .whitespacesAndNewlines)
== setupScript.trimmingCharacters(in: .whitespacesAndNewlines)
else {
return false
}
guard
let data = try? Data(contentsOf: URL(fileURLWithPath: plistDestination)),
let plist = try? PropertyListSerialization.propertyList(
from: data, options: [], format: nil) as? [String: Any]
else {
return false
}
guard
let interval = plist["StartInterval"] as? Int,
interval == requiredStartInterval
else {
return false
}
if let programArgs = plist["ProgramArguments"] as? [String],
programArgs.contains(scriptDestination) == false
{
return false
}
return true
}
private static func installLaunchDaemon() throws {
let installerScript = makeInstallerScript()
try runShellAsAdmin(installerScript)
}
private static func makeInstallerScript() -> String {
"""
set -euo pipefail
LABEL="\(daemonLabel)"
SCRIPT_DEST="\(scriptDestination)"
LEGACY_SCRIPT_DEST="\(legacyScriptDestination)"
PLIST_DEST="\(plistDestination)"
LOG_OUT="/var/log/\(daemonLabel).log"
LOG_ERR="/var/log/\(daemonLabel).err.log"
# First, completely remove any existing installation
launchctl bootout system/"$LABEL" 2>/dev/null || true
rm -f "$PLIST_DEST"
rm -f "$SCRIPT_DEST"
rm -f "$LEGACY_SCRIPT_DEST"
rm -f "$LOG_OUT" "$LOG_ERR"
# Install fresh
mkdir -p "$(dirname "$SCRIPT_DEST")"
cat > "$SCRIPT_DEST" <<'EOF_SCRIPT'
\(setupScript)
EOF_SCRIPT
chmod 755 "$SCRIPT_DEST"
cat > "$PLIST_DEST" <<'EOF_PLIST'
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>\(daemonLabel)</string>
<key>ProgramArguments</key>
<array>
<string>/bin/bash</string>
<string>\(scriptDestination)</string>
</array>
<key>StartInterval</key>
<integer>\(requiredStartInterval)</integer>
<key>RunAtLoad</key>
<true/>
<key>StandardOutPath</key>
<string>/var/log/\(daemonLabel).log</string>
<key>StandardErrorPath</key>
<string>/var/log/\(daemonLabel).err.log</string>
</dict>
</plist>
EOF_PLIST
launchctl bootstrap system "$PLIST_DEST"
launchctl enable system/"$LABEL"
launchctl kickstart -k system/"$LABEL"
"""
}
private static func makeUninstallScript() -> String {
"""
set -euo pipefail

View File

File diff suppressed because it is too large

View File

@@ -257,13 +257,7 @@ def _find_ip_prioritised(
ip_to_type = {
iface.ip_address: iface.interface_type for iface in other_network.interfaces
}
priority = {
"ethernet": 0,
"wifi": 1,
"unknown": 2,
"maybe_ethernet": 3,
"thunderbolt": 4,
}
priority = {"ethernet": 0, "wifi": 1, "unknown": 2, "thunderbolt": 3}
return min(ips, key=lambda ip: priority.get(ip_to_type.get(ip, "unknown"), 2))
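For context, the selection this hunk changes can be exercised standalone. A minimal sketch in plain Python, with made-up interface data (the real ip_to_type mapping is built from the peer node's NetworkInterfaceInfo list):

# Hypothetical data for illustration only.
ip_to_type = {
    "10.0.0.5": "ethernet",
    "192.168.1.10": "wifi",
    "169.254.3.2": "thunderbolt",
}
ips = list(ip_to_type)

# After this change the "maybe_ethernet" tier is gone; anything
# unrecognised falls back to the middle priority (2) via the second
# argument of .get(), beating thunderbolt but losing to ethernet/wifi.
priority = {"ethernet": 0, "wifi": 1, "unknown": 2, "thunderbolt": 3}
best = min(ips, key=lambda ip: priority.get(ip_to_type.get(ip, "unknown"), 2))
print(best)  # 10.0.0.5, the ethernet address wins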

View File

@@ -40,6 +40,7 @@ class ModelCard(CamelCaseModel):
supports_tensor: bool
tasks: list[ModelTask]
components: list[ComponentInfo] | None = None
quantization: int | None = None
@field_validator("tasks", mode="before")
@classmethod
@@ -413,7 +414,7 @@ MODEL_CARDS: dict[str, ModelCard] = {
),
}
_IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
_IMAGE_BASE_MODEL_CARDS: dict[str, ModelCard] = {
"flux1-schnell": ModelCard(
model_id=ModelId("black-forest-labs/FLUX.1-schnell"),
storage_size=Memory.from_bytes(23782357120 + 9524621312),
@@ -428,7 +429,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
storage_size=Memory.from_kb(0),
n_layers=12,
can_shard=False,
safetensors_index_filename=None, # Single file
safetensors_index_filename=None,
),
ComponentInfo(
component_name="text_encoder_2",
@@ -442,7 +443,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
component_name="transformer",
component_path="transformer/",
storage_size=Memory.from_bytes(23782357120),
n_layers=57, # 19 transformer_blocks + 38 single_transformer_blocks
n_layers=57,
can_shard=True,
safetensors_index_filename="diffusion_pytorch_model.safetensors.index.json",
),
@@ -458,7 +459,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
),
"flux1-dev": ModelCard(
model_id=ModelId("black-forest-labs/FLUX.1-dev"),
storage_size=Memory.from_bytes(23782357120 + 9524621312),
storage_size=Memory.from_bytes(23802816640 + 9524621312),
n_layers=57,
hidden_size=1,
supports_tensor=False,
@@ -470,7 +471,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
storage_size=Memory.from_kb(0),
n_layers=12,
can_shard=False,
safetensors_index_filename=None, # Single file
safetensors_index_filename=None,
),
ComponentInfo(
component_name="text_encoder_2",
@@ -484,7 +485,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
component_name="transformer",
component_path="transformer/",
storage_size=Memory.from_bytes(23802816640),
n_layers=57, # 19 transformer_blocks + 38 single_transformer_blocks
n_layers=57,
can_shard=True,
safetensors_index_filename="diffusion_pytorch_model.safetensors.index.json",
),
@@ -543,7 +544,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
"qwen-image": ModelCard(
model_id=ModelId("Qwen/Qwen-Image"),
storage_size=Memory.from_bytes(16584333312 + 40860802176),
n_layers=60, # Qwen has 60 transformer blocks (all joint-style)
n_layers=60,
hidden_size=1,
supports_tensor=False,
tasks=[ModelTask.TextToImage],
@@ -551,10 +552,10 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
ComponentInfo(
component_name="text_encoder",
component_path="text_encoder/",
storage_size=Memory.from_kb(16584333312),
storage_size=Memory.from_bytes(16584333312),
n_layers=12,
can_shard=False,
safetensors_index_filename=None, # Single file
safetensors_index_filename=None,
),
ComponentInfo(
component_name="transformer",
@@ -577,7 +578,7 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
"qwen-image-edit-2509": ModelCard(
model_id=ModelId("Qwen/Qwen-Image-Edit-2509"),
storage_size=Memory.from_bytes(16584333312 + 40860802176),
n_layers=60, # Qwen has 60 transformer blocks (all joint-style)
n_layers=60,
hidden_size=1,
supports_tensor=False,
tasks=[ModelTask.ImageToImage],
@@ -585,10 +586,10 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
ComponentInfo(
component_name="text_encoder",
component_path="text_encoder/",
storage_size=Memory.from_kb(16584333312),
storage_size=Memory.from_bytes(16584333312),
n_layers=12,
can_shard=False,
safetensors_index_filename=None, # Single file
safetensors_index_filename=None,
),
ComponentInfo(
component_name="transformer",
@@ -610,6 +611,93 @@ _IMAGE_MODEL_CARDS: dict[str, ModelCard] = {
),
}
def _create_image_model_quant_variants(
base_name: str,
base_card: ModelCard,
) -> dict[str, ModelCard]:
"""Create quantized variants of an image model card.
Only the transformer component is quantized; text encoders stay at bf16.
Sizes are calculated exactly from the base card's component sizes.
"""
if base_card.components is None:
raise ValueError(f"Image model {base_name} must have components defined")
quantizations = [8, 6, 5, 4, 3]
num_transformer_bytes = next(
c.storage_size.in_bytes
for c in base_card.components
if c.component_name == "transformer"
)
transformer_bytes = Memory.from_bytes(num_transformer_bytes)
remaining_bytes = Memory.from_bytes(
sum(
c.storage_size.in_bytes
for c in base_card.components
if c.component_name != "transformer"
)
)
def with_transformer_size(new_size: Memory) -> list[ComponentInfo]:
assert base_card.components is not None
return [
ComponentInfo(
component_name=c.component_name,
component_path=c.component_path,
storage_size=new_size
if c.component_name == "transformer"
else c.storage_size,
n_layers=c.n_layers,
can_shard=c.can_shard,
safetensors_index_filename=c.safetensors_index_filename,
)
for c in base_card.components
]
variants = {
base_name: ModelCard(
model_id=base_card.model_id,
storage_size=transformer_bytes + remaining_bytes,
n_layers=base_card.n_layers,
hidden_size=base_card.hidden_size,
supports_tensor=base_card.supports_tensor,
tasks=base_card.tasks,
components=with_transformer_size(transformer_bytes),
quantization=None,
)
}
for quant in quantizations:
quant_transformer_bytes = Memory.from_bytes(
(num_transformer_bytes * quant) // 16
)
total_bytes = remaining_bytes + quant_transformer_bytes
model_id = base_card.model_id + f"-{quant}bit"
variants[f"{base_name}-{quant}bit"] = ModelCard(
model_id=ModelId(model_id),
storage_size=total_bytes,
n_layers=base_card.n_layers,
hidden_size=base_card.hidden_size,
supports_tensor=base_card.supports_tensor,
tasks=base_card.tasks,
components=with_transformer_size(quant_transformer_bytes),
quantization=quant,
)
return variants
_image_model_cards: dict[str, ModelCard] = {}
for _base_name, _base_card in _IMAGE_BASE_MODEL_CARDS.items():
_image_model_cards |= _create_image_model_quant_variants(_base_name, _base_card)
_IMAGE_MODEL_CARDS = _image_model_cards
if EXO_ENABLE_IMAGE_MODELS:
MODEL_CARDS.update(_IMAGE_MODEL_CARDS)
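To make the quantization arithmetic above concrete, here is the size calculation from _create_image_model_quant_variants applied to the flux1-dev numbers in this file (a worked example, not code from the repo):

# Sizes in bytes, taken from the flux1-dev card above.
transformer_bf16 = 23802816640  # quantized per variant
text_encoders = 9524621312      # kept at bf16 in every variant

# bf16 weights use 16 bits each, so an n-bit quant scales the
# transformer by n/16, using the same integer division as the source.
for quant in [8, 6, 5, 4, 3]:
    transformer_q = (transformer_bf16 * quant) // 16
    print(f"{quant}-bit: {transformer_q + text_encoders:,} bytes total")

# The 8-bit variant halves the transformer to 11,901,408,320 bytes, and
# each variant is registered under a distinct id such as
# "black-forest-labs/FLUX.1-dev-8bit", per the first commit.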

View File

@@ -48,7 +48,7 @@ class SystemPerformanceProfile(CamelCaseModel):
ecpu_usage: float = 0.0
InterfaceType = Literal["wifi", "ethernet", "maybe_ethernet", "thunderbolt", "unknown"]
InterfaceType = Literal["wifi", "ethernet", "thunderbolt", "unknown"]
class NetworkInterfaceInfo(CamelCaseModel):

View File

@@ -400,7 +400,7 @@ class InfoGatherer:
return
old_nics = []
while True:
nics = await get_network_interfaces()
nics = get_network_interfaces()
if nics != old_nics:
old_nics = nics
await self.info_sender.send(NodeNetworkInterfaces(ifaces=nics))
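For reference, the loop this hunk touches is a plain poll-and-diff: fetch the interface list and only push an update when it changed. A compressed sketch under assumed types, with the repo's async send channel replaced by a plain callback and time.sleep for illustration:

import time
from typing import Callable

def watch_interfaces(
    get_nics: Callable[[], list],
    send: Callable[[list], None],
    interval: float = 5.0,
) -> None:
    # Mirrors the InfoGatherer loop: after this change
    # get_network_interfaces() is a plain synchronous call, no await.
    old_nics: list = []
    while True:
        nics = get_nics()
        if nics != old_nics:
            old_nics = nics
            send(nics)
        time.sleep(interval)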

View File

@@ -1,6 +1,6 @@
import socket
import sys
from subprocess import CalledProcessError
from subprocess import CalledProcessError, run
import psutil
from anyio import run_process
@@ -16,7 +16,8 @@ async def get_friendly_name() -> str:
"""
hostname = socket.gethostname()
if sys.platform != "darwin":
# TODO: better non mac support
if sys.platform != "darwin": # 'darwin' is the platform name for macOS
return hostname
try:
@@ -27,20 +28,21 @@ async def get_friendly_name() -> str:
return process.stdout.decode("utf-8", errors="replace").strip() or hostname
async def _get_interface_types_from_networksetup() -> dict[str, InterfaceType]:
def _get_interface_types_from_networksetup() -> dict[str, InterfaceType]:
"""Parse networksetup -listallhardwareports to get interface types."""
if sys.platform != "darwin":
return {}
try:
result = await run_process(["networksetup", "-listallhardwareports"])
except CalledProcessError:
result = run(
["networksetup", "-listallhardwareports"], capture_output=True, text=True
)
except Exception:
return {}
types: dict[str, InterfaceType] = {}
current_type: InterfaceType = "unknown"
for line in result.stdout.decode().splitlines():
for line in result.stdout.splitlines():
if line.startswith("Hardware Port:"):
port_name = line.split(":", 1)[1].strip()
if "Wi-Fi" in port_name:
@@ -53,15 +55,12 @@ async def _get_interface_types_from_networksetup() -> dict[str, InterfaceType]:
current_type = "unknown"
elif line.startswith("Device:"):
device = line.split(":", 1)[1].strip()
# enX is ethernet adapters or thunderbolt - these must be deprioritised
if device.startswith("en") and device not in ["en0", "en1"]:
current_type = "maybe_ethernet"
types[device] = current_type
return types
async def get_network_interfaces() -> list[NetworkInterfaceInfo]:
def get_network_interfaces() -> list[NetworkInterfaceInfo]:
"""
Retrieves detailed network interface information on macOS.
Parses output from 'networksetup -listallhardwareports' and 'ifconfig'
@@ -69,7 +68,7 @@ async def get_network_interfaces() -> list[NetworkInterfaceInfo]:
Returns a list of NetworkInterfaceInfo objects.
"""
interfaces_info: list[NetworkInterfaceInfo] = []
interface_types = await _get_interface_types_from_networksetup()
interface_types = _get_interface_types_from_networksetup()
for iface, services in psutil.net_if_addrs().items():
for service in services:

View File

@@ -71,8 +71,10 @@ class DistributedImageModel:
def from_bound_instance(
cls, bound_instance: BoundInstance
) -> "DistributedImageModel":
model_id = bound_instance.bound_shard.model_card.model_id
model_card = bound_instance.bound_shard.model_card
model_id = model_card.model_id
model_path = build_model_path(model_id)
quantize = model_card.quantization
shard_metadata = bound_instance.bound_shard
if not isinstance(shard_metadata, PipelineShardMetadata):
@@ -93,6 +95,7 @@ class DistributedImageModel:
local_path=model_path,
shard_metadata=shard_metadata,
group=group,
quantize=quantize,
)
def get_steps_for_quality(self, quality: Literal["low", "medium", "high"]) -> int:
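Taken together with the model-card change, the flow here is: a quant variant is picked by name, and its card carries both a unique id and the quantization level the loader needs. A toy illustration with a stand-in type (the real classes are ModelCard and BoundInstance):

from dataclasses import dataclass

@dataclass
class Card:  # stand-in for ModelCard, for illustration only
    model_id: str
    quantization: int | None

card = Card("black-forest-labs/FLUX.1-dev-4bit", 4)
model_id = card.model_id      # unique per variant, so caches don't collide
quantize = card.quantization  # forwarded as quantize= to the sharded model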

View File

@@ -240,6 +240,10 @@ def main(
prompt=prompt,
)
# GPT-OSS specific parsing to match other model formats.
if isinstance(model, GptOssModel):
mlx_generator = parse_gpt_oss(mlx_generator)
# For other thinking models (GLM, etc.), check if we need to
# prepend the thinking tag that was consumed by the chat template
if detect_thinking_prompt_suffix(prompt, tokenizer):
@@ -253,16 +257,10 @@ def main(
patch_kimi_tokenizer(tokenizer)
# GLM models need patched parser (upstream has bug with None regex match)
elif "glm" in shard_metadata.model_card.model_id.lower():
if "glm" in shard_metadata.model_card.model_id.lower():
patch_glm_tokenizer(tokenizer)
# GPT-OSS specific parsing to match other model formats.
elif isinstance(model, GptOssModel):
mlx_generator = parse_gpt_oss(mlx_generator)
if tokenizer.has_tool_calling and not isinstance(
model, GptOssModel
):
if tokenizer.has_tool_calling:
assert tokenizer.tool_call_start
assert tokenizer.tool_call_end
assert tokenizer.tool_parser # pyright: ignore[reportAny]
@@ -491,10 +489,9 @@ def get_gpt_oss_encoding():
def filter_kimi_tokens(
responses: Generator[GenerationResponse | ToolCallResponse],
responses: Generator[GenerationResponse],
) -> Generator[GenerationResponse]:
for resp in responses:
assert isinstance(resp, GenerationResponse)
if (
resp.text == "<|tool_calls_section_begin|>"
or resp.text == "<|tool_calls_section_end|>"
@@ -504,44 +501,17 @@ def filter_kimi_tokens(
def parse_gpt_oss(
responses: Generator[GenerationResponse | ToolCallResponse],
) -> Generator[GenerationResponse | ToolCallResponse]:
responses: Generator[GenerationResponse],
) -> Generator[GenerationResponse]:
encoding = get_gpt_oss_encoding()
stream = StreamableParser(encoding, role=Role.ASSISTANT)
thinking = False
current_tool_name: str | None = None
tool_arg_parts: list[str] = []
for response in responses:
assert isinstance(response, GenerationResponse)
stream.process(response.token)
delta = stream.last_content_delta
ch = stream.current_channel
recipient = stream.current_recipient
if recipient != current_tool_name:
if current_tool_name is not None:
prefix = "functions."
if current_tool_name.startswith(prefix):
current_tool_name = current_tool_name[len(prefix) :]
yield ToolCallResponse(
tool_calls=[
ToolCallItem(
name=current_tool_name,
arguments="".join(tool_arg_parts).strip(),
)
]
)
tool_arg_parts = []
break
current_tool_name = recipient
# If inside a tool call, accumulate arguments
if current_tool_name is not None:
if delta:
tool_arg_parts.append(delta)
continue
if ch == "analysis" and not thinking:
thinking = True
@@ -558,12 +528,13 @@ def parse_gpt_oss(
if thinking:
yield response.model_copy(update={"text": "</think>"})
yield response
break
def parse_thinking_models(
responses: Generator[GenerationResponse | ToolCallResponse],
responses: Generator[GenerationResponse],
tokenizer: TokenizerWrapper,
) -> Generator[GenerationResponse | ToolCallResponse]:
) -> Generator[GenerationResponse]:
"""
For models that inject thinking tags in the prompt (like GLM-4.7),
prepend the thinking tag to the output stream so the frontend
@@ -571,9 +542,6 @@ def parse_thinking_models(
"""
first = True
for response in responses:
if isinstance(response, ToolCallResponse):
yield response
continue
if first:
first = False
yield response.model_copy(
@@ -654,7 +622,7 @@ def _process_image_response(
def parse_tool_calls(
responses: Generator[GenerationResponse | ToolCallResponse],
responses: Generator[GenerationResponse],
tool_call_start: str,
tool_call_end: str,
tool_parser: Callable[[str], dict[str, Any] | list[dict[str, Any]]],
@@ -662,7 +630,6 @@ def parse_tool_calls(
in_tool_call = False
tool_call_text_parts: list[str] = []
for response in responses:
assert isinstance(response, GenerationResponse)
# assumption: the tool call start is one token
if response.text == tool_call_start:
in_tool_call = True
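All of the parsers in this file share one shape after this refactor: consume a Generator[GenerationResponse], rewrite or drop items, and re-yield. A minimal sketch of the pattern with a toy response type, loosely following parse_thinking_models (in the real code the tag text comes from the tokenizer, not a literal):

from dataclasses import dataclass, replace
from typing import Generator, Iterable

@dataclass
class Resp:  # toy stand-in for GenerationResponse
    text: str

def prepend_thinking(responses: Iterable[Resp]) -> Generator[Resp, None, None]:
    first = True
    for r in responses:
        if first:
            first = False
            # Re-attach the thinking tag the chat template consumed.
            yield replace(r, text="<think>" + r.text)
        else:
            yield r

print([r.text for r in prepend_thinking([Resp("a"), Resp("b")])])
# ['<think>a', 'b']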

View File

@@ -154,7 +154,7 @@ def test_plan_does_not_request_download_when_shard_already_downloaded():
tasks={},
)
assert not isinstance(result, plan_mod.DownloadModel)
assert result is None
def test_plan_does_not_load_model_until_all_shards_downloaded_globally():