Compare commits

...

13 Commits

Author SHA1 Message Date
Alex Cheema
4d414556d5 Use 2GB buffer for more accurate bandwidth measurement
- Increase buffer size from 512MB to 2GB to better saturate memory bus
- Use 2D array shape to avoid issues with very large 1D arrays
- Improves accuracy from ~75% to ~82% of theoretical peak
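For intuition, a rough sketch of the arithmetic behind these numbers (the timing value is hypothetical, not taken from the commit): a copy reads the source and writes the destination, so a 2 GiB buffer moves roughly 4 GiB per pass.

```python
# Illustrative only: turning a timed 2 GiB copy into a bandwidth figure.
buffer_bytes = 2 * 1024**3              # 2 GiB float32 source array
bytes_moved = 2 * buffer_bytes          # copy = read source + write destination
elapsed_s = 0.0064                      # hypothetical wall-clock time for one copy
print(f"{bytes_moved / elapsed_s / 1e9:.0f} GB/s")  # ~671 GB/s, ~82% of an ~819 GB/s peak
```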

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 15:33:18 +00:00
Alex Cheema
d1f80c9e86 Improve warmup for memory bandwidth profiling
- Add 3 full warmup iterations before benchmarking
- Increase benchmark runs to 4 and take best result
- Fixes slow first run issue on M3 Ultra
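A minimal sketch of the warm-up plus best-of-N timing pattern described here (generic Python, not the MLX profiler itself):

```python
import time

def best_time(op, warmup: int = 3, runs: int = 4) -> float:
    # Warm-up: run the full operation so caches and GPU clocks settle first.
    for _ in range(warmup):
        op()
    # Keep the fastest run -- it is the least polluted by background noise.
    best = float("inf")
    for _ in range(runs):
        start = time.perf_counter()
        op()
        best = min(best, time.perf_counter() - start)
    return best
```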

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 15:30:32 +00:00
Alex Cheema
ae3086167f Merge latest main into feat/bandwidth-aware-placement 2026-01-16 15:01:08 +00:00
Evan
83c5285a80 reduce logs
previous commits' logs were too verbose; this tones them down a bit
2026-01-16 14:05:47 +00:00
Evan Quiney
39ee2bf7bd switch from synchronous threaded pinging to an async implementation (#1170)
still seeing churn in our networking - let's properly rate limit it

## changes

added a persistent httpx AsyncClient with a cap on max connections
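Roughly the pattern, as a self-contained sketch (function names here are illustrative, not the exo API):

```python
import anyio
import httpx

async def probe_all(urls: list[str]) -> dict[str, int]:
    results: dict[str, int] = {}
    # One persistent client shared by every probe, with bounded connections so a
    # large topology cannot open an unbounded number of sockets at once.
    limits = httpx.Limits(max_connections=100, max_keepalive_connections=20)
    async with httpx.AsyncClient(timeout=5.0, limits=limits) as client:
        async with anyio.create_task_group() as tg:
            for url in urls:
                tg.start_soon(probe_one, client, url, results)
    return results

async def probe_one(client: httpx.AsyncClient, url: str, out: dict[str, int]) -> None:
    try:
        out[url] = (await client.get(url)).status_code
    except httpx.HTTPError:
        pass  # unreachable nodes are simply left out of the result
```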

## testing

deployed on a cluster; discovery is VASTLY more stable (the only deleted
edges were those discovered by mDNS)
2026-01-16 13:20:03 +00:00
Sami Khan
991adfbd6f fix local network warning (#1136)
## Motivation

Local network warning banner was showing on fresh install even though
mDNS was working. The check would fail before the user had a chance to
grant permission via the macOS prompt.

## Changes

- Added `hasWorkedBefore` flag persisted in UserDefaults
- Only show warning if permission previously worked but now doesn't

## Why It Works

On fresh install, the check may fail (no permission yet), but
`hasWorkedBefore` is false so no warning shows. Once the user grants
permission and a check succeeds, we record it. Future failures (zombie
permission after restart) will show the warning since `hasWorkedBefore`
is now true.
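The decision reduces to a small predicate over two pieces of state; a hedged sketch (Python for brevity, hypothetical names):

```python
def should_show_warning(check_succeeded: bool, has_worked_before: bool) -> bool:
    # Fresh install: the check may fail before permission is granted, but
    # has_worked_before is still False, so stay quiet.
    # Zombie permission after a restart: it worked once and now fails -> warn.
    return has_worked_before and not check_succeeded
```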

## Test Plan

### Manual Testing
Run locally

### Automated Testing
N/A
2026-01-16 13:10:50 +00:00
rltakashige
4b3de6b984 Fix exo bench for transformers 5.x (#1168)
## Motivation
Prompt Sizer was broken because transformers 5.x tokenizers return
BatchEncoding objects, which are essentially a dictionary of {input_ids: [...]},
instead of a plain list of input ids.
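The shape of the fix (mirroring the diff below; the tokenizer and messages are stand-ins):

```python
def prompt_length(tokenizer, messages) -> int:
    ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
    # transformers 5.x returns a BatchEncoding ({"input_ids": [...]}) rather than a plain list
    if hasattr(ids, "input_ids"):
        ids = ids.input_ids
    return int(len(ids))
```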

## Test Plan

### Manual Testing
Tested that exo bench runs as expected.

### Automated Testing
2026-01-16 12:39:22 +00:00
Evan
c8de3b90ea quiet rust logs
rust logs were too verbose - now only warnings propagate to python

entirely happy not to merge this and to clean up rust logging instead,
but this felt saner right now
2026-01-16 12:34:28 +00:00
Alex Cheema
a480df40bf Merge latest main into feat/bandwidth-aware-placement 2026-01-15 21:15:57 +00:00
Jake Abendroth
a8a0fa1bd8 Merge branch 'main' into feat/bandwidth-aware-placement 2026-01-08 17:28:37 -08:00
Jake Abendroth
9c6f9a6080 feat: enhance memory bandwidth profiling and update shard assignment logic 2026-01-08 17:27:39 -08:00
Jake Abendroth
ab31491786 Merge branch 'main' into feat/bandwidth-aware-placement 2026-01-05 04:04:18 -08:00
Jake Abendroth
9e8d5b759c feat: implement bandwidth-aware shard assignment for pipeline parallelism
This PR implements bandwidth-aware shard assignment for pipeline parallelism to minimize total inference time, aligning with Issue #957.

Changes:

- Added `memory_bandwidth` to `NodePerformanceProfile`.

- Added Apple Silicon bandwidth data.

- Implemented greedy assignment algorithm in `placement_utils.py` (sketched below).

- Added verification tests.
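
A compact sketch of that greedy idea (a hypothetical standalone function, not the `placement_utils.py` implementation):

```python
def assign_layers(total_layers: int, bandwidths: list[float], ram_cap: list[int]) -> list[int]:
    """Reserve one layer per node for connectivity, then hand the remaining layers
    to the highest-bandwidth nodes first, capped by each node's RAM estimate."""
    n = len(bandwidths)
    layers = [1] * n
    remaining = total_layers - n
    for i in sorted(range(n), key=lambda i: bandwidths[i], reverse=True):
        take = min(remaining, max(0, ram_cap[i] - layers[i]))
        layers[i] += take
        remaining -= take
    return layers

# 30 layers, bandwidths 400/200/100 GB/s, ample RAM everywhere -> [28, 1, 1]
print(assign_layers(30, [400.0, 200.0, 100.0], [100, 100, 100]))
```

The verification test in the diff further down asserts exactly this 28/1/1 split.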
2026-01-03 05:13:14 -08:00
12 changed files with 1209 additions and 815 deletions

View File

@@ -56,6 +56,11 @@ struct ContentView: View {
}
private var shouldShowLocalNetworkWarning: Bool {
// Show warning if local network is not working and EXO is running.
// The checker uses a longer timeout on first launch to allow time for
// the permission prompt, so this correctly handles both:
// 1. User denied permission on first launch
// 2. Permission broke after restart (macOS TCC bug)
if case .notWorking = localNetworkChecker.status {
return controller.status != .stopped
}

View File

@@ -5,8 +5,8 @@ import os.log
/// Checks if the app's local network permission is actually functional.
///
/// macOS local network permission can appear enabled in System Preferences but not
/// actually work after a restart. This service detects this by creating a UDP
/// connection to the mDNS multicast address (224.0.0.251:5353).
/// actually work after a restart. This service uses NWConnection to mDNS multicast
/// to verify actual connectivity.
@MainActor
final class LocalNetworkChecker: ObservableObject {
enum Status: Equatable {
@@ -35,30 +35,43 @@ final class LocalNetworkChecker: ObservableObject {
}
private static let logger = Logger(subsystem: "io.exo.EXO", category: "LocalNetworkChecker")
private static let hasCompletedInitialCheckKey = "LocalNetworkChecker.hasCompletedInitialCheck"
@Published private(set) var status: Status = .unknown
@Published private(set) var lastConnectionState: String = "none"
private var connection: NWConnection?
private var checkTask: Task<Void, Never>?
/// Whether we've completed at least one check (stored in UserDefaults)
private var hasCompletedInitialCheck: Bool {
get { UserDefaults.standard.bool(forKey: Self.hasCompletedInitialCheckKey) }
set { UserDefaults.standard.set(newValue, forKey: Self.hasCompletedInitialCheckKey) }
}
/// Checks if local network access is working.
func check() {
checkTask?.cancel()
status = .checking
lastConnectionState = "connecting"
// Use longer timeout on first launch to allow time for permission prompt
let isFirstCheck = !hasCompletedInitialCheck
let timeout: UInt64 = isFirstCheck ? 30_000_000_000 : 3_000_000_000
checkTask = Task { [weak self] in
guard let self else { return }
let result = await self.performCheck()
Self.logger.info("Checking local network connectivity (first check: \(isFirstCheck))")
let result = await self.checkConnectivity(timeout: timeout)
self.status = result
self.hasCompletedInitialCheck = true
Self.logger.info("Local network check complete: \(result.displayText)")
}
}
private func performCheck() async -> Status {
Self.logger.info("Checking local network access via UDP multicast")
/// Checks connectivity using NWConnection to mDNS multicast.
/// The connection attempt triggers the permission prompt if not yet shown.
private func checkConnectivity(timeout: UInt64) async -> Status {
connection?.cancel()
connection = nil
@@ -84,22 +97,7 @@ final class LocalNetworkChecker: ObservableObject {
continuation.resume(returning: status)
}
conn.stateUpdateHandler = { [weak self] state in
let stateStr: String
switch state {
case .setup: stateStr = "setup"
case .preparing: stateStr = "preparing"
case .ready: stateStr = "ready"
case .waiting(let e): stateStr = "waiting(\(e))"
case .failed(let e): stateStr = "failed(\(e))"
case .cancelled: stateStr = "cancelled"
@unknown default: stateStr = "unknown"
}
Task { @MainActor in
self?.lastConnectionState = stateStr
}
conn.stateUpdateHandler = { state in
switch state {
case .ready:
resumeOnce(.working)
@@ -108,6 +106,7 @@ final class LocalNetworkChecker: ObservableObject {
if errorStr.contains("54") || errorStr.contains("ECONNRESET") {
resumeOnce(.notWorking(reason: "Connection blocked"))
}
// Otherwise keep waiting - might be showing permission prompt
case .failed(let error):
let errorStr = "\(error)"
if errorStr.contains("65") || errorStr.contains("EHOSTUNREACH")
@@ -127,7 +126,7 @@ final class LocalNetworkChecker: ObservableObject {
conn.start(queue: .main)
Task {
try? await Task.sleep(nanoseconds: 3_000_000_000)
try? await Task.sleep(nanoseconds: timeout)
let state = conn.state
switch state {
case .ready:

View File

@@ -241,6 +241,9 @@ class PromptSizer:
ids = tokenizer.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True
)
# Fix for transformers 5.x
if hasattr(ids, "input_ids"):
ids = ids.input_ids
return int(len(ids))
return count_fn

View File

@@ -23,6 +23,7 @@ dependencies = [
"tiktoken>=0.12.0", # required for kimi k2 tokenizer
"hypercorn>=0.18.0",
"openai-harmony>=0.0.8",
"httpx>=0.28.1",
]
[project.scripts]

View File

@@ -49,20 +49,22 @@ def get_smallest_cycles(cycles: list[list[NodeInfo]]) -> list[list[NodeInfo]]:
return [cycle for cycle in cycles if len(cycle) == min_nodes]
def get_shard_assignments_for_pipeline_parallel(
def _assign_layers_by_ram(
model_meta: ModelMetadata,
selected_cycle: list[NodeWithProfile],
):
cycle_memory = sum(
(node.node_profile.memory.ram_available for node in selected_cycle),
start=Memory(),
)
) -> ShardAssignments:
"""Assign layers proportionally based on available RAM."""
total_layers = model_meta.n_layers
world_size = len(selected_cycle)
runner_to_shard: dict[RunnerId, ShardMetadata] = {}
node_to_runner: dict[NodeId, RunnerId] = {}
cycle_memory = sum(
(node.node_profile.memory.ram_available for node in selected_cycle),
start=Memory(),
)
layers_assigned = 0
for i, node in enumerate(selected_cycle):
if i == len(selected_cycle) - 1:
node_layers = total_layers - layers_assigned
@@ -77,7 +79,6 @@ def get_shard_assignments_for_pipeline_parallel(
node_layers = max(1, node_layers)
runner_id = RunnerId()
shard = PipelineShardMetadata(
model_meta=model_meta,
device_rank=i,
@@ -86,18 +87,143 @@ def get_shard_assignments_for_pipeline_parallel(
end_layer=layers_assigned + node_layers,
n_layers=total_layers,
)
runner_to_shard[runner_id] = shard
node_to_runner[node.node_id] = runner_id
layers_assigned += node_layers
shard_assignments = ShardAssignments(
return ShardAssignments(
model_id=model_meta.model_id,
runner_to_shard=runner_to_shard,
node_to_runner=node_to_runner,
)
return shard_assignments
def _reserve_base_layers(world_size: int, total_layers: int) -> dict[int, int]:
"""Reserve 1 layer per node to ensure connectivity."""
assignments = {i: 0 for i in range(world_size)}
remaining_layers = total_layers
for i in range(world_size):
assignments[i] = 1
remaining_layers -= 1
if remaining_layers < 0:
logger.warning(
"Fewer layers than nodes! Reducing to 1 layer per node where possible."
)
assignments = {i: 1 if i < total_layers else 0 for i in range(world_size)}
remaining_layers = 0
return assignments
def _distribute_layers_by_bandwidth(
selected_cycle: list[NodeWithProfile],
assignments: dict[int, int],
remaining_layers: int,
model_meta: ModelMetadata,
) -> None:
"""Distribute remaining layers based on bandwidth and RAM capacity."""
indexed_nodes = list(enumerate(selected_cycle))
sorted_nodes = sorted(
indexed_nodes,
key=lambda x: x[1].node_profile.memory_bandwidth or 0,
reverse=True,
)
for original_idx, node in sorted_nodes:
if remaining_layers <= 0:
break
layer_size_bytes = model_meta.storage_size.in_bytes / model_meta.n_layers
max_layers_by_ram = int(
node.node_profile.memory.ram_available.in_bytes // layer_size_bytes
)
can_take = max(0, max_layers_by_ram - assignments[original_idx])
take = min(can_take, remaining_layers)
assignments[original_idx] += take
remaining_layers -= take
if remaining_layers > 0:
logger.warning(
"All nodes maxed out on RAM estimation, dumping remaining layers on fastest nodes."
)
for original_idx, _ in sorted_nodes:
assignments[original_idx] += 1
remaining_layers -= 1
if remaining_layers == 0:
break
def _create_shard_assignments(
model_meta: ModelMetadata,
selected_cycle: list[NodeWithProfile],
assignments: dict[int, int],
) -> ShardAssignments:
"""Create shard assignments from layer assignments."""
world_size = len(selected_cycle)
runner_to_shard: dict[RunnerId, ShardMetadata] = {}
node_to_runner: dict[NodeId, RunnerId] = {}
current_start = 0
for i, node in enumerate(selected_cycle):
count = assignments[i]
runner_id = RunnerId()
shard = PipelineShardMetadata(
model_meta=model_meta,
device_rank=i,
world_size=world_size,
start_layer=current_start,
end_layer=current_start + count,
n_layers=model_meta.n_layers,
)
runner_to_shard[runner_id] = shard
node_to_runner[node.node_id] = runner_id
current_start += count
return ShardAssignments(
model_id=model_meta.model_id,
runner_to_shard=runner_to_shard,
node_to_runner=node_to_runner,
)
def _assign_layers_by_bandwidth(
model_meta: ModelMetadata,
selected_cycle: list[NodeWithProfile],
) -> ShardAssignments:
"""Assign layers based on memory bandwidth."""
logger.info("Using bandwidth-aware shard assignment")
total_layers = model_meta.n_layers
world_size = len(selected_cycle)
assignments = _reserve_base_layers(world_size, total_layers)
remaining_layers = total_layers - sum(assignments.values())
if remaining_layers > 0:
_distribute_layers_by_bandwidth(
selected_cycle, assignments, remaining_layers, model_meta
)
return _create_shard_assignments(model_meta, selected_cycle, assignments)
def get_shard_assignments_for_pipeline_parallel(
model_meta: ModelMetadata,
selected_cycle: list[NodeWithProfile],
):
has_bandwidth = all(
node.node_profile.memory_bandwidth is not None for node in selected_cycle
)
if not has_bandwidth:
logger.info(
"Bandwidth data missing for some nodes, falling back to RAM-proportional assignment"
)
return _assign_layers_by_ram(model_meta, selected_cycle)
return _assign_layers_by_bandwidth(model_meta, selected_cycle)
def get_shard_assignments_for_tensor_parallel(

View File

@@ -397,3 +397,106 @@ def test_get_mlx_jaccl_coordinators(
assert coordinators[node_c_id] == (
f"{conn_c_a.send_back_multiaddr.ip_address}:5000"
), "node_c should use the IP from conn_c_a"
def test_get_shard_assignments_bandwidth_aware(
topology: Topology,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId], Connection],
):
# arrange
node_a_id = NodeId()
node_b_id = NodeId()
node_c_id = NodeId()
# Create nodes with identical RAM (plenty of it)
# Using 1GB to ensure no RAM constraints (model is small)
node_a = create_node(1024 * 1024 * 1024, node_a_id)
node_b = create_node(1024 * 1024 * 1024, node_b_id)
node_c = create_node(1024 * 1024 * 1024, node_c_id)
# Set Bandwidths: A=400 (Fastest), B=200, C=100 (Slowest)
assert node_a.node_profile is not None
assert node_b.node_profile is not None
assert node_c.node_profile is not None
node_a.node_profile.memory_bandwidth = 400_000_000_000
node_b.node_profile.memory_bandwidth = 200_000_000_000
node_c.node_profile.memory_bandwidth = 100_000_000_000
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_node(node_c)
topology.add_connection(create_connection(node_a_id, node_b_id))
topology.add_connection(create_connection(node_b_id, node_c_id))
topology.add_connection(create_connection(node_c_id, node_a_id))
# Add connections in both directions, following the pattern used by
# test_filter_cycles_by_memory, so get_cycles finds a full cycle.
topology.add_connection(create_connection(node_b_id, node_a_id))
topology.add_connection(create_connection(node_c_id, node_b_id))
topology.add_connection(create_connection(node_a_id, node_c_id))
model_meta = ModelMetadata(
model_id=ModelId("test-model"),
pretty_name="Test Model",
n_layers=30,
# Model is 10KB per layer -- far below each node's available RAM, so
# capacity never constrains the assignment.
storage_size=Memory.from_kb(300),
hidden_size=1000,
supports_tensor=True,
)
cycles = topology.get_cycles()
# get_cycles may return multiple cycles depending on insertion order; memory
# filtering normally happens in the master, so just pick the first one here.
selected_cycle = cycles[0]
# act
shard_assignments = get_shard_assignments(
model_meta, selected_cycle, Sharding.Pipeline
)
# assert
runner_id_a = shard_assignments.node_to_runner[node_a_id]
runner_id_b = shard_assignments.node_to_runner[node_b_id]
runner_id_c = shard_assignments.node_to_runner[node_c_id]
# Get layer counts
layers_a = (
shard_assignments.runner_to_shard[runner_id_a].end_layer
- shard_assignments.runner_to_shard[runner_id_a].start_layer
)
layers_b = (
shard_assignments.runner_to_shard[runner_id_b].end_layer
- shard_assignments.runner_to_shard[runner_id_b].start_layer
)
layers_c = (
shard_assignments.runner_to_shard[runner_id_c].end_layer
- shard_assignments.runner_to_shard[runner_id_c].start_layer
)
# Check total
assert layers_a + layers_b + layers_c == 30
# Check that the fastest node (A with 400GB/s) gets saturated first.
# With strict greedy assignment and plenty of RAM:
# 1. Reserve: A=1, B=1, C=1. Remaining=27.
# 2. Sort: [A, B, C]
# 3. A takes min(remaining=27, capacity=huge) = 27.
# 4. A=28, B=1, C=1.
assert layers_a == 28
assert layers_b == 1
assert layers_c == 1

View File

@@ -29,6 +29,11 @@ class _InterceptHandler(logging.Handler):
def logger_setup(log_file: Path | None, verbosity: int = 0):
"""Set up logging for this process - formatting, file handles, verbosity and output"""
logging.getLogger("exo_pyo3_bindings").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
logger.remove()
# replace all stdlib loggers with _InterceptHandlers that log to loguru

View File

@@ -57,6 +57,7 @@ class NodePerformanceProfile(CamelCaseModel):
chip_id: str
friendly_name: str
memory: MemoryPerformanceProfile
memory_bandwidth: int | None = None
network_interfaces: list[NetworkInterfaceInfo] = []
system: SystemPerformanceProfile

View File

@@ -1,62 +1,64 @@
import http.client
import time
from anyio import create_task_group, to_thread
import anyio
import httpx
from anyio import create_task_group
from loguru import logger
from exo.shared.topology import Topology
from exo.shared.types.common import NodeId
BAD_STATUSLINE_ATTEMPTS = 3
REACHABILITY_ATTEMPTS = 3
async def check_reachability(
target_ip: str,
expected_node_id: NodeId,
self_node_id: NodeId,
out: dict[NodeId, set[str]],
client: httpx.AsyncClient,
) -> None:
"""Check if a node is reachable at the given IP and verify its identity."""
if ":" in target_ip:
# TODO: use real IpAddress types
target_ip = f"[{target_ip}]"
url = f"http://{target_ip}:52415/node_id"
# TODO: use an async http client
def _fetch_remote_node_id(*, attempt: int = 1) -> NodeId | None:
connection = http.client.HTTPConnection(target_ip, 52415, timeout=3)
remote_node_id = None
last_error = None
for _ in range(REACHABILITY_ATTEMPTS):
try:
connection.request("GET", "/node_id")
response = connection.getresponse()
if response.status != 200:
return None
r = await client.get(url)
if r.status_code != 200:
await anyio.sleep(1)
continue
body = response.read().decode("utf-8").strip()
body = r.text.strip().strip('"')
if not body:
await anyio.sleep(1)
continue
# Strip quotes if present (JSON string response)
if body.startswith('"') and body.endswith('"') and len(body) >= 2:
body = body[1:-1]
remote_node_id = NodeId(body)
break
return NodeId(body) or None
except OSError:
return None
except http.client.BadStatusLine:
if attempt >= BAD_STATUSLINE_ATTEMPTS:
logger.warning(
f"BadStatusLine from {target_ip}, after {attempt} attempts, assuming connection to {expected_node_id} has dropped"
)
return None
time.sleep(1)
return _fetch_remote_node_id(attempt=attempt + 1)
except http.client.HTTPException as e:
logger.warning(f"HTTPException from {target_ip}: {type(e).__name__}: {e}")
return None
finally:
connection.close()
# expected failure cases
except (
httpx.TimeoutException,
httpx.NetworkError,
):
await anyio.sleep(1)
# other failures should be logged on last attempt
except httpx.HTTPError as e:
last_error = e
await anyio.sleep(1)
if last_error is not None:
logger.warning(
f"connect error {type(last_error).__name__} from {target_ip} after {REACHABILITY_ATTEMPTS} attempts; treating as down"
)
remote_node_id = await to_thread.run_sync(_fetch_remote_node_id)
if remote_node_id is None:
return
if remote_node_id == self_node_id:
return
if remote_node_id != expected_node_id:
logger.warning(
f"Discovered node with unexpected node_id; "
@@ -74,18 +76,33 @@ async def check_reachable(
topology: Topology, self_node_id: NodeId
) -> dict[NodeId, set[str]]:
"""Check which nodes are reachable and return their IPs."""
reachable: dict[NodeId, set[str]] = {}
async with create_task_group() as tg:
# these are intentionally httpx's defaults so we can tune them later
timeout = httpx.Timeout(timeout=5.0)
limits = httpx.Limits(
max_connections=100,
max_keepalive_connections=20,
keepalive_expiry=5,
)
async with (
httpx.AsyncClient(timeout=timeout, limits=limits) as client,
create_task_group() as tg,
):
for node in topology.list_nodes():
if not node.node_profile:
continue
if node.node_id == self_node_id:
continue
for iface in node.node_profile.network_interfaces:
tg.start_soon(
check_reachability,
iface.ip_address,
node.node_id,
self_node_id,
reachable,
client,
)
return reachable

View File

@@ -4,6 +4,7 @@ import platform
from typing import Any, Callable, Coroutine
import anyio
from anyio import to_thread
from loguru import logger
from exo.shared.types.memory import Memory
@@ -24,8 +25,61 @@ from .system_info import (
get_friendly_name,
get_model_and_chip,
get_network_interfaces,
profile_memory_bandwidth,
)
# Module-level cache for memory bandwidth (doesn't change at runtime)
_cached_bandwidth: int | None = None
_bandwidth_profiled: bool = False
_bandwidth_profiling_task: asyncio.Task[int | None] | None = None
async def profile_bandwidth_once() -> int | None:
"""Profile bandwidth once in a background thread and cache the result.
This function is non-blocking - it runs the profiling in a thread pool.
Subsequent calls return the cached result immediately.
"""
global _cached_bandwidth, _bandwidth_profiled, _bandwidth_profiling_task
# Already profiled, return cached value
if _bandwidth_profiled:
return _cached_bandwidth
# Profiling already in progress, wait for it
if _bandwidth_profiling_task is not None:
return await _bandwidth_profiling_task
# Start profiling in background thread
async def _do_profile() -> int | None:
global _cached_bandwidth, _bandwidth_profiled
try:
logger.info("Starting memory bandwidth profiling in background thread...")
bandwidth = await to_thread.run_sync(profile_memory_bandwidth, cancellable=True)
_cached_bandwidth = bandwidth
_bandwidth_profiled = True
if bandwidth:
logger.info(f"Memory bandwidth profiled: {bandwidth / 1e9:.1f} GB/s")
else:
logger.warning("Memory bandwidth profiling returned None")
return bandwidth
except Exception as e:
logger.opt(exception=e).error("Memory bandwidth profiling failed")
_bandwidth_profiled = True # Mark as done to avoid retrying
return None
_bandwidth_profiling_task = asyncio.create_task(_do_profile())
return await _bandwidth_profiling_task
def get_memory_bandwidth_cached() -> int | None:
"""Return cached bandwidth or None if not yet profiled.
This is a non-blocking synchronous function that returns immediately.
Call profile_bandwidth_once() first to trigger profiling.
"""
return _cached_bandwidth if _bandwidth_profiled else None
async def get_metrics_async() -> Metrics | None:
"""Return detailed Metrics on macOS or a minimal fallback elsewhere."""
@@ -71,6 +125,8 @@ async def start_polling_node_metrics(
callback: Callable[[NodePerformanceProfile], Coroutine[Any, Any, None]],
):
poll_interval_s = 1.0
bandwidth_profile_started = False
while True:
try:
metrics = await get_metrics_async()
@@ -85,6 +141,15 @@ async def start_polling_node_metrics(
# do the memory profile last to get a fresh reading to not conflict with the other memory profiling loop
memory_profile = get_memory_profile()
# Start bandwidth profiling in background on first poll (non-blocking)
if not bandwidth_profile_started:
bandwidth_profile_started = True
# Fire and forget - don't await, let it run in background
asyncio.create_task(profile_bandwidth_once())
# Use cached bandwidth (None until profiling completes)
memory_bandwidth = get_memory_bandwidth_cached()
await callback(
NodePerformanceProfile(
model_id=model_id,
@@ -92,6 +157,7 @@ async def start_polling_node_metrics(
friendly_name=friendly_name,
network_interfaces=network_interfaces,
memory=memory_profile,
memory_bandwidth=memory_bandwidth,
system=SystemPerformanceProfile(
gpu_usage=metrics.gpu_usage[1],
temp=metrics.temp.gpu_temp_avg,

View File

@@ -1,5 +1,6 @@
import socket
import sys
import time
from subprocess import CalledProcessError
import psutil
@@ -81,3 +82,68 @@ async def get_model_and_chip() -> tuple[str, str]:
chip = chip_line.split(": ")[1] if chip_line else "Unknown Chip"
return (model, chip)
def profile_memory_bandwidth() -> int | None:
"""
Profile device memory bandwidth using MLX GPU operations.
Uses a large array copy on the GPU to measure unified memory bandwidth.
Returns measured bandwidth in bytes/second, or None if MLX is unavailable.
"""
try:
import mlx.core as mx
if not mx.metal.is_available():
return None
# Use 2GB buffer to better saturate memory bandwidth
# Use 2D shape to avoid potential issues with very large 1D arrays
size_bytes = 2 * 1024 * 1024 * 1024
side = int((size_bytes // 4) ** 0.5) # Square 2D array of float32
shape = (side, side)
actual_bytes = side * side * 4
bytes_transferred = actual_bytes * 2 # read + write
# Warm-up: run the full benchmark operation multiple times to stabilize GPU
for _ in range(3):
src = mx.random.uniform(shape=shape, dtype=mx.float32)
mx.eval(src)
dst = src + 0.0
mx.eval(dst)
mx.synchronize()
del src, dst
# Benchmark: measure time to copy array
best_bandwidth = 0.0
num_runs = 4
for _ in range(num_runs):
src = mx.random.uniform(shape=shape, dtype=mx.float32)
mx.eval(src)
mx.synchronize()
# Time the copy operation (src + 0.0 forces read of src, write of dst)
start = time.perf_counter()
dst = src + 0.0
mx.eval(dst)
mx.synchronize()
end = time.perf_counter()
bandwidth = bytes_transferred / (end - start)
best_bandwidth = max(best_bandwidth, bandwidth)
del src, dst
return int(best_bandwidth)
except Exception:
return None
def get_memory_bandwidth(_chip_id: str) -> int | None:
"""
Returns measured memory bandwidth in bytes/second.
Uses MLX GPU operations for accurate unified memory bandwidth measurement.
"""
return profile_memory_bandwidth()

1484
uv.lock generated
View File

File diff suppressed because it is too large