3 Commits

Author SHA1 Message Date
Matiwos Kebede
eabdcab978 Fix linux docs (#1022)
This PR updates the "Run from Source (Mac & Linux)" section in README.md
to clarify Linux instructions.

Changes include:
- Split the section into macOS and Linux subsections.
- Added native Linux package manager commands (apt, dnf, pacman) for
dependencies: uv, node, npm.
- Clarified that macmon is macOS-only.
- Noted that Homebrew on Linux is optional, with native package managers
preferred.

These changes improve clarity for Linux users and clear up the confusion
caused by the previous macOS-centric instructions.
2025-12-27 19:56:44 +00:00
Evan Quiney
8e9332d6a7 Separate out the Runner's behaviour into a "connect" phase and a "load" phase (#1006)
## Motivation

We should ensure all runners are connected before loading the model.
This gives the workers planning mechanism finer-grained control over
runner state in the future.

## Changes

- Introduced task ConnectToGroup, preceding LoadModel
- Introduced runner statuses Idle, Connecting, Connected
- Separated out initialize_mlx from shard_and_load
- Single instances never go through the connecting phase (sketched below)
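
A minimal sketch of the intended ordering (the statuses and task names are the ones listed above; `bring_up`, `runner`, and its methods are hypothetical stand-ins, not exo's actual code):

```python
from enum import Enum, auto


class RunnerStatus(Enum):
    Idle = auto()
    Connecting = auto()
    Connected = auto()


async def bring_up(runner, single_instance: bool) -> None:
    # ConnectToGroup precedes LoadModel; single instances skip the
    # connecting phase entirely.
    if not single_instance:
        runner.status = RunnerStatus.Connecting
        await runner.connect_to_group()  # hypothetical stand-in for ConnectToGroup
        runner.status = RunnerStatus.Connected
    await runner.load_model()  # hypothetical stand-in for LoadModel
```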

## Test Plan

### Automated Testing
Added a test for checking event ordering in a standard workflow.

### Manual Testing
Tested that Llama 3.2 1B and Kimi K2 Thinking load and shut down repeatedly
on multiple configurations.
Not exhaustive, however.

---------

Co-authored-by: rltakashige <rl.takashige@gmail.com>
2025-12-27 16:28:42 +00:00
Heath Dutton🕴️
4b65d5f896 Fix race condition in mlx_distributed_init with concurrent instances (#1012)
## Motivation

Fixes #1005

When multiple instances initialize concurrently with the same rank, they
overwrite each other's coordination files (`hosts_{rank}.json`), causing
"[jaccl] Malformed device file" errors and initialization failures.

## Changes

- Changed coordination filename from `./hosts_{rank}.json` to
`./hosts_{instance_id}_{rank}.json` to make it unique per instance
- Added cleanup in a finally block to remove coordination files after
initialization completes
- Applied fix to both MlxRingInstance and MlxJacclInstance cases

## Why It Works

Each instance now gets a unique coordination file based on its
instance_id, preventing concurrent instances from overwriting each
other's files. The cleanup logic ensures files are removed after use,
preventing accumulation and handling both success and failure cases.
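
The shape of the fix, as a minimal sketch (the filename pattern and the finally-block cleanup follow the description above; the function itself is illustrative, not exo's actual code):

```python
import json
import os


def write_coordination_file(instance_id: str, rank: int, hosts: list[dict]) -> None:
    # Unique per instance *and* rank, so concurrent instances with the
    # same rank no longer clobber each other's coordination files.
    coord_file = f"./hosts_{instance_id}_{rank}.json"
    try:
        with open(coord_file, "w") as f:
            json.dump(hosts, f)
        ...  # hand the file to the jaccl backend and wait for initialization
    finally:
        # Runs on success and failure alike, so files never accumulate.
        if os.path.exists(coord_file):
            os.remove(coord_file)
```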

## Test Plan

### Manual Testing
Code review and logic verification. The fix prevents the race condition
by ensuring filename uniqueness per instance.

### Automated Testing
No new tests added. Existing tests continue to pass.

---------

Co-authored-by: Ryuichi Leo Takashige <rl.takashige@gmail.com>
2025-12-27 16:13:26 +00:00
55 changed files with 1819 additions and 1306 deletions

.gitignore

@@ -7,6 +7,8 @@ digest.txt
# nix
.direnv/
# IDEA (PyCharm)
.idea
# xcode / macos
*.xcuserstate


@@ -61,10 +61,10 @@ Devices running exo automatically discover each other, without needing any manua
There are two ways to run exo:
### Run from Source (Mac & Linux)
### Run from Source (macOS)
**Prerequisites:**
- [brew](https://github.com/Homebrew/brew) (for simple package management on MacOS)
- [brew](https://github.com/Homebrew/brew) (for simple package management on macOS)
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
@@ -98,6 +98,62 @@ uv run exo
This starts the exo dashboard and API at http://localhost:52415/
### Run from Source (Linux)
**Prerequisites:**
- [uv](https://github.com/astral-sh/uv) (for Python dependency management)
- [node](https://github.com/nodejs/node) (for building the dashboard) - version 18 or higher
- [rust](https://github.com/rust-lang/rustup) (to build Rust bindings, nightly for now)
**Installation methods:**
**Option 1: Using system package manager (Ubuntu/Debian example):**
```bash
# Install Node.js and npm
sudo apt update
sudo apt install nodejs npm
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
# Install Rust (using rustup)
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
rustup toolchain install nightly
```
**Option 2: Using Homebrew on Linux (if preferred):**
```bash
# Install Homebrew on Linux
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
# Install dependencies
brew install uv node
# Install Rust (using rustup)
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
rustup toolchain install nightly
```
**Note:** The `macmon` package is macOS-only and not required for Linux.
Clone the repo, build the dashboard, and run exo:
```bash
# Clone exo
git clone https://github.com/exo-explore/exo
# Build dashboard
cd exo/dashboard && npm install && npm run build && cd ..
# Run exo
uv run exo
```
This starts the exo dashboard and API at http://localhost:52415/
**Important note for Linux users:** Currently, exo runs on CPU on Linux. GPU support for Linux platforms is under development. If you'd like to see support for your specific Linux hardware, please [search for existing feature requests](https://github.com/exo-explore/exo/issues) or create a new one.
### macOS App
exo ships a macOS app that runs in the background on your Mac.


@@ -19,7 +19,6 @@
25. Rethink retry logic
26. Task cancellation. When API http request gets cancelled, it should cancel corresponding task.
27. Log cleanup - per-module log filters and default to DEBUG log levels
28. Validate RDMA connections with ibv_devinfo in the info gatherer
Potential refactors:


@@ -96,7 +96,7 @@ interface RawNodeProfile {
interface RawTopologyNode {
nodeId: string;
nodeProfile?: RawNodeProfile;
nodeProfile: RawNodeProfile;
}
interface RawTopologyConnection {
@@ -105,13 +105,9 @@ interface RawTopologyConnection {
sendBackMultiaddr?: { multiaddr?: string; address?: string; ip_address?: string } | string;
}
// Connection can be an object or a tuple [source, target, metadata]
type RawConnectionItem = RawTopologyConnection | [string, string, { sinkMultiaddr?: { ip_address?: string; address?: string } }?];
interface RawTopology {
// nodes can be array of strings (node IDs) or array of objects with nodeId/nodeProfile
nodes: (string | RawTopologyNode)[];
connections?: RawConnectionItem[];
nodes: RawTopologyNode[];
connections?: RawTopologyConnection[];
}
type RawNodeProfiles = Record<string, RawNodeProfile>;
@@ -202,17 +198,9 @@ function transformTopology(raw: RawTopology, profiles?: RawNodeProfiles): Topolo
const nodes: Record<string, NodeInfo> = {};
const edges: TopologyEdge[] = [];
// Handle nodes - can be array of strings (node IDs) or array of objects with nodeId/nodeProfile
for (const node of raw.nodes || []) {
// Determine the node ID - could be a string or an object with nodeId property
const nodeId = typeof node === 'string' ? node : node.nodeId;
if (!nodeId) continue;
// Get the profile - from the separate profiles map or from the node object itself
const profileFromMap = profiles?.[nodeId];
const profileFromNode = typeof node === 'object' ? node.nodeProfile : undefined;
const profile = { ...(profileFromNode ?? {}), ...(profileFromMap ?? {}) };
const mergedProfile = profiles?.[node.nodeId];
const profile = { ...(node.nodeProfile ?? {}), ...(mergedProfile ?? {}) };
const ramTotal = profile?.memory?.ramTotal?.inBytes ?? 0;
const ramAvailable = profile?.memory?.ramAvailable?.inBytes ?? 0;
const ramUsage = Math.max(ramTotal - ramAvailable, 0);
@@ -250,7 +238,7 @@ function transformTopology(raw: RawTopology, profiles?: RawNodeProfiles): Topolo
}
}
nodes[nodeId] = {
nodes[node.nodeId] = {
system_info: {
model_id: profile?.modelId ?? 'Unknown',
chip: profile?.chipId,
@@ -272,34 +260,14 @@ function transformTopology(raw: RawTopology, profiles?: RawNodeProfiles): Topolo
};
}
// Handle connections - can be objects with localNodeId/sendBackNodeId or tuples [source, target, metadata]
for (const conn of raw.connections || []) {
let localNodeId: string | undefined;
let sendBackNodeId: string | undefined;
let sendBackMultiaddr: { multiaddr?: string; address?: string; ip_address?: string } | string | undefined;
// Check if it's a tuple format [source, target, metadata]
if (Array.isArray(conn)) {
localNodeId = conn[0] as string;
sendBackNodeId = conn[1] as string;
const metadata = conn[2] as { sinkMultiaddr?: { ip_address?: string; address?: string } } | undefined;
if (metadata?.sinkMultiaddr) {
sendBackMultiaddr = metadata.sinkMultiaddr;
}
} else {
// Object format with localNodeId/sendBackNodeId
localNodeId = conn.localNodeId;
sendBackNodeId = conn.sendBackNodeId;
sendBackMultiaddr = conn.sendBackMultiaddr;
}
if (!localNodeId || !sendBackNodeId) continue;
if (localNodeId === sendBackNodeId) continue;
if (!nodes[localNodeId] || !nodes[sendBackNodeId]) continue;
if (!conn.localNodeId || !conn.sendBackNodeId) continue;
if (conn.localNodeId === conn.sendBackNodeId) continue;
if (!nodes[conn.localNodeId] || !nodes[conn.sendBackNodeId]) continue;
let sendBackIp: string | undefined;
if (sendBackMultiaddr) {
const multi = sendBackMultiaddr;
if (conn.sendBackMultiaddr) {
const multi = conn.sendBackMultiaddr;
if (typeof multi === 'string') {
sendBackIp = extractIpFromMultiaddr(multi);
} else {
@@ -308,8 +276,8 @@ function transformTopology(raw: RawTopology, profiles?: RawNodeProfiles): Topolo
}
edges.push({
source: localNodeId,
target: sendBackNodeId,
source: conn.localNodeId,
target: conn.sendBackNodeId,
sendBackIp
});
}


@@ -29,7 +29,8 @@ dependencies = [
"exo_pyo3_bindings", # rust bindings
"anyio==4.11.0",
"bidict>=0.23.1",
"mlx>=0.30.1",
"mlx>=0.30.1; sys_platform == 'darwin'",
"mlx[cpu]>=0.30.1; sys_platform == 'linux'",
"mlx-lm>=0.28.3",
"tiktoken>=0.12.0", # required for kimi k2 tokenizer
"hypercorn>=0.18.0",


@@ -207,7 +207,6 @@ class API:
instance_meta=instance_meta,
min_nodes=min_nodes,
),
node_profiles=self.state.node_profiles,
topology=self.state.topology,
current_instances=self.state.instances,
)
@@ -263,7 +262,6 @@ class API:
instance_meta=instance_meta,
min_nodes=min_nodes,
),
node_profiles=self.state.node_profiles,
topology=self.state.topology,
current_instances=self.state.instances,
)
@@ -428,8 +426,9 @@ class API:
"""Calculate total available memory across all nodes in bytes."""
total_available = Memory()
for profile in self.state.node_profiles.values():
total_available += profile.memory.ram_available
for node in self.state.topology.list_nodes():
if node.node_profile is not None:
total_available += node.node_profile.memory.ram_available
return total_available


@@ -158,7 +158,6 @@ class Master:
command,
self.state.topology,
self.state.instances,
self.state.node_profiles,
)
transition_events = get_transition_events(
self.state.instances, placement
@@ -201,7 +200,9 @@ class Master:
async def _plan(self) -> None:
while True:
# kill broken instances
connected_node_ids = set([x for x in self.state.topology.list_nodes()])
connected_node_ids = set(
[x.node_id for x in self.state.topology.list_nodes()]
)
for instance_id, instance in self.state.instances.items():
for node_id in instance.shard_assignments.node_to_runner:
if node_id not in connected_node_ids:


@@ -6,11 +6,10 @@ from typing import Sequence
from loguru import logger
from exo.master.placement_utils import (
NodeWithProfile,
filter_cycles_by_memory,
get_hosts_from_subgraph,
get_mlx_ibv_devices_matrix,
get_mlx_jaccl_coordinators,
get_mlx_jaccl_devices_matrix,
get_shard_assignments,
get_smallest_cycles,
)
@@ -20,10 +19,10 @@ from exo.shared.types.commands import (
DeleteInstance,
PlaceInstance,
)
from exo.shared.types.common import Host, NodeId
from exo.shared.types.common import Host
from exo.shared.types.events import Event, InstanceCreated, InstanceDeleted
from exo.shared.types.memory import Memory
from exo.shared.types.profiling import NodePerformanceProfile
from exo.shared.types.topology import NodeInfo
from exo.shared.types.worker.instances import (
Instance,
InstanceId,
@@ -52,16 +51,19 @@ def place_instance(
command: PlaceInstance,
topology: Topology,
current_instances: Mapping[InstanceId, Instance],
node_profiles: Mapping[NodeId, NodePerformanceProfile],
) -> dict[InstanceId, Instance]:
all_nodes = list(topology.list_nodes())
cycles = topology.get_cycles() + [[node] for node in all_nodes]
candidate_cycles = list(filter(lambda it: len(it) >= command.min_nodes, cycles))
cycles_with_sufficient_memory = filter_cycles_by_memory(
candidate_cycles, node_profiles, command.model_meta.storage_size
logger.info("finding cycles:")
cycles = topology.get_cycles()
singleton_cycles = [[node] for node in all_nodes]
candidate_cycles = list(
filter(lambda it: len(it) >= command.min_nodes, cycles + singleton_cycles)
)
if len(cycles_with_sufficient_memory) == 0:
cycles_with_sufficient_memory = filter_cycles_by_memory(
candidate_cycles, command.model_meta.storage_size
)
if not cycles_with_sufficient_memory:
raise ValueError("No cycles found with sufficient memory")
smallest_cycles = get_smallest_cycles(cycles_with_sufficient_memory)
@@ -69,15 +71,13 @@ def place_instance(
smallest_tb_cycles = [
cycle
for cycle in smallest_cycles
if topology.get_subgraph_from_nodes(
[node.node_id for node in cycle]
).is_thunderbolt_cycle([node.node_id for node in cycle])
if topology.get_subgraph_from_nodes(cycle).is_thunderbolt_cycle(cycle)
]
if smallest_tb_cycles != []:
smallest_cycles = smallest_tb_cycles
cycles_with_leaf_nodes: list[list[NodeWithProfile]] = [
cycles_with_leaf_nodes: list[list[NodeInfo]] = [
cycle
for cycle in smallest_cycles
if any(topology.node_is_leaf(node.node_id) for node in cycle)
@@ -86,7 +86,11 @@ def place_instance(
selected_cycle = max(
cycles_with_leaf_nodes if cycles_with_leaf_nodes != [] else smallest_cycles,
key=lambda cycle: sum(
(node.node_profile.memory.ram_available for node in cycle),
(
node.node_profile.memory.ram_available
for node in cycle
if node.node_profile is not None
),
start=Memory(),
),
)
@@ -95,16 +99,14 @@ def place_instance(
command.model_meta, selected_cycle, command.sharding
)
cycle_digraph: Topology = topology.get_subgraph_from_nodes(
[node.node_id for node in selected_cycle]
)
cycle_digraph: Topology = topology.get_subgraph_from_nodes(selected_cycle)
instance_id = InstanceId()
target_instances = dict(deepcopy(current_instances))
if len(selected_cycle) == 1:
logger.warning(
"You have likely selected jaccl for a single node instance; falling back to MlxRing"
"You have likely selected ibv for a single node instance; falling back to MlxRing"
)
command.instance_meta = InstanceMeta.MlxRing
@@ -112,18 +114,19 @@ def place_instance(
# TODO: Single node instances
match command.instance_meta:
case InstanceMeta.MlxJaccl:
mlx_jaccl_devices = get_mlx_jaccl_devices_matrix(
mlx_ibv_devices = get_mlx_ibv_devices_matrix(
selected_cycle,
cycle_digraph,
)
mlx_jaccl_coordinators = get_mlx_jaccl_coordinators(
coordinator=selected_cycle[0].node_id,
selected_cycle,
coordinator_port=random_ephemeral_port(),
cycle_digraph=cycle_digraph,
)
target_instances[instance_id] = MlxJacclInstance(
instance_id=instance_id,
shard_assignments=shard_assignments,
jaccl_devices=mlx_jaccl_devices,
ibv_devices=mlx_ibv_devices,
jaccl_coordinators=mlx_jaccl_coordinators,
)
case InstanceMeta.MlxRing:


@@ -1,4 +1,5 @@
from collections.abc import Generator, Mapping
from collections.abc import Generator
from typing import TypeGuard, cast
from loguru import logger
from pydantic import BaseModel
@@ -8,7 +9,7 @@ from exo.shared.types.common import Host, NodeId
from exo.shared.types.memory import Memory
from exo.shared.types.models import ModelMetadata
from exo.shared.types.profiling import NodePerformanceProfile
from exo.shared.types.topology import RDMAConnection, SocketConnection
from exo.shared.types.topology import NodeInfo
from exo.shared.types.worker.runners import RunnerId, ShardAssignments
from exo.shared.types.worker.shards import (
PipelineShardMetadata,
@@ -23,32 +24,27 @@ class NodeWithProfile(BaseModel):
node_profile: NodePerformanceProfile
def narrow_all_nodes(nodes: list[NodeInfo]) -> TypeGuard[list[NodeWithProfile]]:
return all(node.node_profile is not None for node in nodes)
def filter_cycles_by_memory(
cycles: list[list[NodeId]],
node_profiles: Mapping[NodeId, NodePerformanceProfile],
required_memory: Memory,
) -> list[list[NodeWithProfile]]:
filtered_cycles: list[list[NodeWithProfile]] = []
cycles: list[list[NodeInfo]], required_memory: Memory
) -> list[list[NodeInfo]]:
filtered_cycles: list[list[NodeInfo]] = []
for cycle in cycles:
if not all(node in node_profiles for node in cycle):
if not narrow_all_nodes(cycle):
continue
total_mem = sum(
(node_profiles[node].memory.ram_available for node in cycle), start=Memory()
(node.node_profile.memory.ram_available for node in cycle), start=Memory()
)
if total_mem >= required_memory:
filtered_cycles.append(
[
NodeWithProfile(node_id=node, node_profile=node_profiles[node])
for node in cycle
]
)
filtered_cycles.append(cast(list[NodeInfo], cycle))
return filtered_cycles
def get_smallest_cycles(
cycles: list[list[NodeWithProfile]],
) -> list[list[NodeWithProfile]]:
def get_smallest_cycles(cycles: list[list[NodeInfo]]) -> list[list[NodeInfo]]:
min_nodes = min(len(cycle) for cycle in cycles)
return [cycle for cycle in cycles if len(cycle) == min_nodes]
@@ -139,9 +135,11 @@ def get_shard_assignments_for_tensor_parallel(
def get_shard_assignments(
model_meta: ModelMetadata,
selected_cycle: list[NodeWithProfile],
selected_cycle: list[NodeInfo],
sharding: Sharding,
) -> ShardAssignments:
if not narrow_all_nodes(selected_cycle):
raise ValueError("All nodes must have profiles to create shard assignments")
match sharding:
case Sharding.Pipeline:
return get_shard_assignments_for_pipeline_parallel(
@@ -178,16 +176,17 @@ def get_hosts_from_subgraph(cycle_digraph: Topology) -> list[Host]:
current_node = cycle[i]
next_node = cycle[(i + 1) % len(cycle)]
for src, sink, connection in cycle_digraph.list_connections():
if not isinstance(connection, SocketConnection):
continue
if src == current_node and sink == next_node:
for connection in cycle_digraph.list_connections():
if (
connection.local_node_id == current_node.node_id
and connection.send_back_node_id == next_node.node_id
):
if get_thunderbolt and not connection.is_thunderbolt():
continue
assert connection.send_back_multiaddr is not None
host = Host(
ip=connection.sink_multiaddr.ip_address,
port=connection.sink_multiaddr.port,
ip=connection.send_back_multiaddr.ip_address,
port=connection.send_back_multiaddr.port,
)
hosts.append(host)
break
@@ -195,7 +194,8 @@ def get_hosts_from_subgraph(cycle_digraph: Topology) -> list[Host]:
return hosts
def get_mlx_jaccl_devices_matrix(
def get_mlx_ibv_devices_matrix(
selected_cycle: list[NodeInfo],
cycle_digraph: Topology,
) -> list[list[str | None]]:
"""Build connectivity matrix mapping device i to device j via RDMA interface names.
@@ -204,7 +204,6 @@ def get_mlx_jaccl_devices_matrix(
to device j, or None if no connection exists or no interface name is found.
Diagonal elements are always None.
"""
selected_cycle = list(cycle_digraph.list_nodes())
num_nodes = len(selected_cycle)
matrix: list[list[str | None]] = [
[None for _ in range(num_nodes)] for _ in range(num_nodes)
@@ -215,55 +214,86 @@ def get_mlx_jaccl_devices_matrix(
if i == j:
continue
for conn in cycle_digraph.get_all_connections_between(node_i, node_j):
if isinstance(conn, RDMAConnection):
matrix[i][j] = conn.source_rdma_iface
# Find the IP J uses to talk to I
for connection_ip in _find_connection_ip(node_j, node_i, cycle_digraph):
# This is a local IP on I, which is attached to an interface: find that interface
if interface_name := _find_interface_name_for_ip(connection_ip, node_i):
matrix[i][j] = interface_name
logger.info(
f"Interface name for {connection_ip} on {node_i.node_id}: {interface_name}"
)
break
else:
logger.warning(
f"Failed to find interface name between {node_i.node_id} and {node_j.node_id}"
)
raise ValueError(
"Current jaccl backend requires all-to-all RDMA connections"
"Current ibv backend requires all-to-all rdma connections"
)
return matrix
def _find_connection_ip(
node_i: NodeId,
node_j: NodeId,
node_i: NodeInfo,
node_j: NodeInfo,
cycle_digraph: Topology,
) -> Generator[str]:
"""Find all IP addresses that connect node i to node j."""
# TODO: Prioritise ETHERNET > ??WIFI > TB for coordinator
for connection in cycle_digraph.get_all_connections_between(node_i, node_j):
if isinstance(connection, SocketConnection):
yield connection.sink_multiaddr.ip_address
for connection in cycle_digraph.list_connections():
if (
connection.local_node_id == node_i.node_id
and connection.send_back_node_id == node_j.node_id
):
yield connection.send_back_multiaddr.ip_address
def _find_interface_name_for_ip(
ip_address: str,
node_info: NodeInfo,
) -> str | None:
if node_info.node_profile is None:
return None
logger.info(f"Searching {node_info.node_id} for ip {ip_address}:")
for interface in node_info.node_profile.network_interfaces:
if interface.name not in ["en2", "en3", "en4", "en5", "en6", "en7"]:
continue
logger.info(f" | {interface.name}: {interface.ip_address}")
if interface.ip_address != ip_address:
continue
logger.info("Found")
return f"rdma_{interface.name}"
return None
def get_mlx_jaccl_coordinators(
coordinator: NodeId,
selected_cycle: list[NodeInfo],
coordinator_port: int,
cycle_digraph: Topology,
) -> dict[NodeId, str]:
"""Get the coordinator addresses for MLX JACCL (rank 0 device).
"""Get the coordinator addresses for MLX Jaccl (rank 0 device).
Select an IP address that each node can reach for the rank 0 node. Returns
address in format "X.X.X.X:PORT" per node.
"""
selected_cycle = list(cycle_digraph.list_nodes())
logger.info(f"Selecting coordinator: {coordinator}")
rank_0_node = selected_cycle[0]
logger.info(f"Selecting coordinator from rank 0 node: {rank_0_node.node_id}")
def get_ip_for_node(n: NodeId) -> str:
if n == coordinator:
def get_ip_for_node(n: NodeInfo) -> str:
if n.node_id == rank_0_node.node_id:
return "0.0.0.0"
for ip in _find_connection_ip(n, coordinator, cycle_digraph):
for ip in _find_connection_ip(n, rank_0_node, cycle_digraph):
return ip
logger.warning(
f"Failed to find directly connected ip between {n} and {coordinator}"
)
raise ValueError(
"Current jaccl backend requires all participating devices to be able to communicate"
f"Failed to find directly connected ip between {n.node_id} and {rank_0_node.node_id}"
)
raise ValueError("Current ibv backend requires all-to-all rdma connections")
return {n: f"{get_ip_for_node(n)}:{coordinator_port}" for n in selected_cycle}
return {
n.node_id: f"{get_ip_for_node(n)}:{coordinator_port}" for n in selected_cycle
}


@@ -1,36 +1,67 @@
from typing import Callable
import pytest
from exo.shared.types.common import NodeId
from exo.shared.types.multiaddr import Multiaddr
from exo.shared.types.profiling import (
MemoryUsage,
MemoryPerformanceProfile,
NodePerformanceProfile,
SystemPerformanceProfile,
)
from exo.shared.types.topology import RDMAConnection, SocketConnection
from exo.shared.types.topology import Connection, ConnectionProfile, NodeInfo
def create_node_profile(memory: int) -> NodePerformanceProfile:
return NodePerformanceProfile(
model_id="test",
chip_id="test",
friendly_name="test",
memory=MemoryUsage.from_bytes(
ram_total=1000,
ram_available=memory,
swap_total=1000,
swap_available=1000,
),
network_interfaces=[],
system=SystemPerformanceProfile(),
)
@pytest.fixture
def create_node():
def _create_node(memory: int, node_id: NodeId | None = None) -> NodeInfo:
if node_id is None:
node_id = NodeId()
return NodeInfo(
node_id=node_id,
node_profile=NodePerformanceProfile(
model_id="test",
chip_id="test",
friendly_name="test",
memory=MemoryPerformanceProfile.from_bytes(
ram_total=1000,
ram_available=memory,
swap_total=1000,
swap_available=1000,
),
network_interfaces=[],
system=SystemPerformanceProfile(),
),
)
return _create_node
# TODO: this is a hack to get the port for the send_back_multiaddr
def create_connection(ip: int, sink_port: int = 1234) -> SocketConnection:
return SocketConnection(
sink_multiaddr=Multiaddr(address=f"/ip4/169.254.0.{ip}/tcp/{sink_port}"),
)
@pytest.fixture
def create_connection() -> Callable[[NodeId, NodeId, int | None], Connection]:
port_counter = 1235
ip_counter = 1
def _create_connection(
source_node_id: NodeId, sink_node_id: NodeId, send_back_port: int | None = None
) -> Connection:
nonlocal port_counter
nonlocal ip_counter
# assign unique ips
ip_counter += 1
if send_back_port is None:
send_back_port = port_counter
port_counter += 1
return Connection(
local_node_id=source_node_id,
send_back_node_id=sink_node_id,
send_back_multiaddr=Multiaddr(
address=f"/ip4/169.254.0.{ip_counter}/tcp/{send_back_port}"
),
connection_profile=ConnectionProfile(
throughput=1000, latency=1000, jitter=1000
),
)
def create_rdma_connection(iface: int) -> RDMAConnection:
return RDMAConnection(
source_rdma_iface=f"rdma_en{iface}", sink_rdma_iface=f"rdma_en{iface}"
)
return _create_connection


@@ -19,13 +19,15 @@ from exo.shared.types.events import (
ForwarderEvent,
IndexedEvent,
InstanceCreated,
NodeGatheredInfo,
NodePerformanceMeasured,
TaskCreated,
)
from exo.shared.types.memory import Memory
from exo.shared.types.models import ModelId, ModelMetadata
from exo.shared.types.profiling import (
MemoryUsage,
MemoryPerformanceProfile,
NodePerformanceProfile,
SystemPerformanceProfile,
)
from exo.shared.types.tasks import ChatCompletion as ChatCompletionTask
from exo.shared.types.tasks import TaskStatus
@@ -81,14 +83,21 @@ async def test_master():
origin=sender_node_id,
session=session_id,
event=(
NodeGatheredInfo(
NodePerformanceMeasured(
when=str(datetime.now(tz=timezone.utc)),
node_id=node_id,
info=MemoryUsage(
ram_total=Memory.from_bytes(678948 * 1024),
ram_available=Memory.from_bytes(678948 * 1024),
swap_total=Memory.from_bytes(0),
swap_available=Memory.from_bytes(0),
node_profile=NodePerformanceProfile(
model_id="maccy",
chip_id="arm",
friendly_name="test",
memory=MemoryPerformanceProfile(
ram_total=Memory.from_bytes(678948 * 1024),
ram_available=Memory.from_bytes(678948 * 1024),
swap_total=Memory.from_bytes(0),
swap_available=Memory.from_bytes(0),
),
network_interfaces=[],
system=SystemPerformanceProfile(),
),
)
),
@@ -152,7 +161,7 @@ async def test_master():
assert events[0].idx == 0
assert events[1].idx == 1
assert events[2].idx == 2
assert isinstance(events[0].event, NodeGatheredInfo)
assert isinstance(events[0].event, NodePerformanceMeasured)
assert isinstance(events[1].event, InstanceCreated)
runner_id = list(
events[1].event.instance.shard_assignments.runner_to_shard.keys()


@@ -1,3 +1,5 @@
from typing import Callable
import pytest
from loguru import logger
@@ -5,20 +7,14 @@ from exo.master.placement import (
get_transition_events,
place_instance,
)
from exo.master.tests.conftest import (
create_connection,
create_node_profile,
create_rdma_connection,
)
from exo.shared.topology import Topology
from exo.shared.types.commands import PlaceInstance
from exo.shared.types.common import CommandId, NodeId
from exo.shared.types.events import InstanceCreated, InstanceDeleted
from exo.shared.types.memory import Memory
from exo.shared.types.models import ModelId, ModelMetadata
from exo.shared.types.multiaddr import Multiaddr
from exo.shared.types.profiling import NetworkInterfaceInfo
from exo.shared.types.topology import SocketConnection
from exo.shared.types.profiling import NetworkInterfaceInfo, NodePerformanceProfile
from exo.shared.types.topology import Connection, NodeInfo
from exo.shared.types.worker.instances import (
Instance,
InstanceId,
@@ -30,6 +26,11 @@ from exo.shared.types.worker.runners import ShardAssignments
from exo.shared.types.worker.shards import Sharding
@pytest.fixture
def topology() -> Topology:
return Topology()
@pytest.fixture
def instance() -> Instance:
return MlxRingInstance(
@@ -73,33 +74,30 @@ def test_get_instance_placements_create_instance(
available_memory: tuple[int, int, int],
total_layers: int,
expected_layers: tuple[int, int, int],
topology: Topology,
model_meta: ModelMetadata,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId], Connection],
):
# arrange
model_meta.n_layers = total_layers
model_meta.storage_size.in_bytes = sum(
available_memory
) # make it exactly fit across all nodes
topology = Topology()
cic = place_instance_command(model_meta)
node_id_a = NodeId()
node_id_b = NodeId()
node_id_c = NodeId()
profiles = {
node_id_a: create_node_profile(available_memory[0]),
node_id_b: create_node_profile(available_memory[1]),
node_id_c: create_node_profile(available_memory[2]),
}
topology.add_node(node_id_a)
topology.add_node(node_id_b)
topology.add_node(node_id_c)
topology.add_connection(node_id_a, node_id_b, create_connection(1))
topology.add_connection(node_id_b, node_id_c, create_connection(2))
topology.add_connection(node_id_c, node_id_a, create_connection(3))
topology.add_node(create_node(available_memory[0], node_id_a))
topology.add_node(create_node(available_memory[1], node_id_b))
topology.add_node(create_node(available_memory[2], node_id_c))
topology.add_connection(create_connection(node_id_a, node_id_b))
topology.add_connection(create_connection(node_id_b, node_id_c))
topology.add_connection(create_connection(node_id_c, node_id_a))
# act
placements = place_instance(cic, topology, {}, profiles)
placements = place_instance(cic, topology, {})
# assert
assert len(placements) == 1
@@ -125,11 +123,12 @@ def test_get_instance_placements_create_instance(
assert shards_sorted[-1].end_layer == total_layers
def test_get_instance_placements_one_node_exact_fit() -> None:
def test_get_instance_placements_one_node_exact_fit(
create_node: Callable[[int, NodeId | None], NodeInfo],
) -> None:
topology = Topology()
node_id = NodeId()
topology.add_node(node_id)
profiles = {node_id: create_node_profile(1000 * 1024)}
topology.add_node(create_node(1000 * 1024, node_id))
cic = place_instance_command(
ModelMetadata(
model_id=ModelId("test-model"),
@@ -138,7 +137,7 @@ def test_get_instance_placements_one_node_exact_fit() -> None:
n_layers=10,
),
)
placements = place_instance(cic, topology, {}, profiles)
placements = place_instance(cic, topology, {})
assert len(placements) == 1
instance_id = list(placements.keys())[0]
@@ -149,11 +148,12 @@ def test_get_instance_placements_one_node_exact_fit() -> None:
assert len(instance.shard_assignments.runner_to_shard) == 1
def test_get_instance_placements_one_node_fits_with_extra_memory() -> None:
def test_get_instance_placements_one_node_fits_with_extra_memory(
create_node: Callable[[int, NodeId | None], NodeInfo],
) -> None:
topology = Topology()
node_id = NodeId()
topology.add_node(node_id)
profiles = {node_id: create_node_profile(1001 * 1024)}
topology.add_node(create_node(1001 * 1024, node_id))
cic = place_instance_command(
ModelMetadata(
model_id=ModelId("test-model"),
@@ -162,7 +162,7 @@ def test_get_instance_placements_one_node_fits_with_extra_memory() -> None:
n_layers=10,
),
)
placements = place_instance(cic, topology, {}, profiles)
placements = place_instance(cic, topology, {})
assert len(placements) == 1
instance_id = list(placements.keys())[0]
@@ -173,11 +173,12 @@ def test_get_instance_placements_one_node_fits_with_extra_memory() -> None:
assert len(instance.shard_assignments.runner_to_shard) == 1
def test_get_instance_placements_one_node_not_fit() -> None:
def test_get_instance_placements_one_node_not_fit(
create_node: Callable[[int, NodeId | None], NodeInfo],
) -> None:
topology = Topology()
node_id = NodeId()
topology.add_node(node_id)
profiles = {node_id: create_node_profile(1000 * 1024)}
topology.add_node(create_node(1000 * 1024, node_id))
cic = place_instance_command(
model_meta=ModelMetadata(
model_id=ModelId("test-model"),
@@ -188,7 +189,7 @@ def test_get_instance_placements_one_node_not_fit() -> None:
)
with pytest.raises(ValueError, match="No cycles found with sufficient memory"):
place_instance(cic, topology, {}, profiles)
place_instance(cic, topology, {})
def test_get_transition_events_no_change(instance: Instance):
@@ -234,102 +235,190 @@ def test_get_transition_events_delete_instance(instance: Instance):
def test_placement_prioritizes_leaf_cycle_with_less_memory(
topology: Topology,
model_meta: ModelMetadata,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId], Connection],
):
# arrange
topology = Topology()
# Arrange two 3-node cycles. The A-B-C cycle has a leaf node (only one outgoing
# neighbor per node). The D-E-F cycle has extra outgoing edges making its nodes
# non-leaves. Ensure both cycles have sufficient total memory, with the A-B-C
# cycle having LESS total memory than D-E-F. The algorithm should still choose
# the cycle that contains a leaf node.
model_meta.storage_size = Memory.from_bytes(1000)
# Model requires more than any single node but fits within a 3-node cycle
model_meta.storage_size.in_bytes = 1500
model_meta.n_layers = 12
# Create node ids
node_id_a = NodeId()
node_id_b = NodeId()
node_id_c = NodeId()
node_id_d = NodeId()
node_id_e = NodeId()
node_id_f = NodeId()
profiles = {
node_id_a: create_node_profile(500),
node_id_b: create_node_profile(600),
node_id_c: create_node_profile(600),
node_id_d: create_node_profile(500),
}
# Extra sink nodes to make D/E/F non-leaf via additional outgoing edges
node_id_x = NodeId()
node_id_y = NodeId()
node_id_z = NodeId()
topology.add_node(node_id_a)
topology.add_node(node_id_b)
topology.add_node(node_id_c)
topology.add_node(node_id_d)
# A-B-C cycle total memory = 1600 (< D-E-F total)
topology.add_node(create_node(400, node_id_a))
topology.add_node(create_node(400, node_id_b))
topology.add_node(create_node(800, node_id_c))
# Daisy chain topology
topology.add_connection(node_id_a, node_id_b, create_connection(1))
topology.add_connection(node_id_b, node_id_a, create_connection(1))
topology.add_connection(node_id_b, node_id_c, create_connection(1))
topology.add_connection(node_id_c, node_id_b, create_connection(1))
topology.add_connection(node_id_c, node_id_d, create_connection(1))
topology.add_connection(node_id_d, node_id_c, create_connection(1))
# D-E-F cycle total memory = 1800 (> A-B-C total)
topology.add_node(create_node(600, node_id_d))
topology.add_node(create_node(600, node_id_e))
topology.add_node(create_node(600, node_id_f))
logger.info(list(topology.list_connections()))
# Extra nodes with tiny memory so they can't form singleton placements
topology.add_node(create_node(10, node_id_x))
topology.add_node(create_node(10, node_id_y))
topology.add_node(create_node(10, node_id_z))
# Build directed cycles
topology.add_connection(create_connection(node_id_a, node_id_b))
topology.add_connection(create_connection(node_id_b, node_id_c))
topology.add_connection(create_connection(node_id_c, node_id_a))
topology.add_connection(create_connection(node_id_d, node_id_e))
topology.add_connection(create_connection(node_id_e, node_id_f))
topology.add_connection(create_connection(node_id_f, node_id_d))
# Add extra outgoing edges from D/E/F so none of them are leaves
topology.add_connection(create_connection(node_id_d, node_id_x))
topology.add_connection(create_connection(node_id_e, node_id_y))
topology.add_connection(create_connection(node_id_f, node_id_z))
cic = place_instance_command(
model_meta=model_meta,
)
# act
placements = place_instance(cic, topology, {}, profiles)
# Act
placements = place_instance(cic, topology, {})
# assert
# Assert the chosen cycle is A-B-C (contains at least one leaf node), even though
# D-E-F has more total memory.
assert len(placements) == 1
instance = list(placements.values())[0]
instance_id = list(placements.keys())[0]
instance = placements[instance_id]
assigned_nodes = set(instance.shard_assignments.node_to_runner.keys())
assert assigned_nodes == set((node_id_a, node_id_b)) or assigned_nodes == set(
(node_id_c, node_id_d)
)
expected_leaf_cycle_nodes = {node_id_a, node_id_b, node_id_c}
non_leaf_cycle_nodes = {node_id_d, node_id_e, node_id_f}
assert expected_leaf_cycle_nodes.issubset(assigned_nodes)
assert assigned_nodes.isdisjoint(non_leaf_cycle_nodes)
def test_tensor_rdma_backend_connectivity_matrix(
topology: Topology,
model_meta: ModelMetadata,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId], Connection],
):
topology = Topology()
model_meta.n_layers = 12
model_meta.storage_size.in_bytes = 1500
node_a = NodeId()
node_b = NodeId()
node_c = NodeId()
node_id_a = NodeId()
node_id_b = NodeId()
node_id_c = NodeId()
profiles = {
node_a: create_node_profile(500),
node_b: create_node_profile(500),
node_c: create_node_profile(500),
}
node_a = create_node(500, node_id_a)
node_b = create_node(500, node_id_b)
node_c = create_node(500, node_id_c)
ethernet_interface = NetworkInterfaceInfo(
name="en0",
ip_address="192.168.1.100",
)
ethernet_conn = SocketConnection(
sink_multiaddr=Multiaddr(address=f"/ip4/192.168.1.{100}/tcp/{8000}")
)
profiles[node_a].network_interfaces = [ethernet_interface]
profiles[node_b].network_interfaces = [ethernet_interface]
profiles[node_c].network_interfaces = [ethernet_interface]
assert node_a.node_profile is not None
assert node_b.node_profile is not None
assert node_c.node_profile is not None
conn_a_b = create_connection(node_id_a, node_id_b)
conn_b_c = create_connection(node_id_b, node_id_c)
conn_c_a = create_connection(node_id_c, node_id_a)
conn_b_a = create_connection(node_id_b, node_id_a)
conn_c_b = create_connection(node_id_c, node_id_b)
conn_a_c = create_connection(node_id_a, node_id_c)
assert conn_a_b.send_back_multiaddr is not None
assert conn_b_c.send_back_multiaddr is not None
assert conn_c_a.send_back_multiaddr is not None
assert conn_b_a.send_back_multiaddr is not None
assert conn_c_b.send_back_multiaddr is not None
assert conn_a_c.send_back_multiaddr is not None
node_a.node_profile = NodePerformanceProfile(
model_id="test",
chip_id="test",
friendly_name="test",
memory=node_a.node_profile.memory,
network_interfaces=[
NetworkInterfaceInfo(
name="en3",
ip_address=conn_c_a.send_back_multiaddr.ip_address,
),
NetworkInterfaceInfo(
name="en4",
ip_address=conn_b_a.send_back_multiaddr.ip_address,
),
ethernet_interface,
],
system=node_a.node_profile.system,
)
node_b.node_profile = NodePerformanceProfile(
model_id="test",
chip_id="test",
friendly_name="test",
memory=node_b.node_profile.memory,
network_interfaces=[
NetworkInterfaceInfo(
name="en3",
ip_address=conn_c_b.send_back_multiaddr.ip_address,
),
NetworkInterfaceInfo(
name="en4",
ip_address=conn_a_b.send_back_multiaddr.ip_address,
),
ethernet_interface,
],
system=node_b.node_profile.system,
)
node_c.node_profile = NodePerformanceProfile(
model_id="test",
chip_id="test",
friendly_name="test",
memory=node_c.node_profile.memory,
network_interfaces=[
NetworkInterfaceInfo(
name="en3",
ip_address=conn_a_c.send_back_multiaddr.ip_address,
),
NetworkInterfaceInfo(
name="en4",
ip_address=conn_b_c.send_back_multiaddr.ip_address,
),
ethernet_interface,
],
system=node_c.node_profile.system,
)
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_node(node_c)
topology.add_connection(node_a, node_b, create_rdma_connection(3))
topology.add_connection(node_b, node_c, create_rdma_connection(4))
topology.add_connection(node_c, node_a, create_rdma_connection(5))
topology.add_connection(node_b, node_a, create_rdma_connection(3))
topology.add_connection(node_c, node_b, create_rdma_connection(4))
topology.add_connection(node_a, node_c, create_rdma_connection(5))
topology.add_connection(node_a, node_b, ethernet_conn)
topology.add_connection(node_b, node_c, ethernet_conn)
topology.add_connection(node_c, node_a, ethernet_conn)
topology.add_connection(node_a, node_c, ethernet_conn)
topology.add_connection(node_b, node_a, ethernet_conn)
topology.add_connection(node_c, node_b, ethernet_conn)
topology.add_connection(conn_a_b)
topology.add_connection(conn_b_c)
topology.add_connection(conn_c_a)
topology.add_connection(conn_b_a)
topology.add_connection(conn_c_b)
topology.add_connection(conn_a_c)
cic = PlaceInstance(
sharding=Sharding.Tensor,
@@ -339,7 +428,7 @@ def test_tensor_rdma_backend_connectivity_matrix(
min_nodes=1,
)
placements = place_instance(cic, topology, {}, profiles)
placements = place_instance(cic, topology, {})
assert len(placements) == 1
instance_id = list(placements.keys())[0]
@@ -347,10 +436,10 @@ def test_tensor_rdma_backend_connectivity_matrix(
assert isinstance(instance, MlxJacclInstance)
assert instance.jaccl_devices is not None
assert instance.ibv_devices is not None
assert instance.jaccl_coordinators is not None
matrix = instance.jaccl_devices
matrix = instance.ibv_devices
assert len(matrix) == 3
for i in range(3):
@@ -359,15 +448,15 @@ def test_tensor_rdma_backend_connectivity_matrix(
assigned_nodes = list(instance.shard_assignments.node_to_runner.keys())
node_to_idx = {node_id: idx for idx, node_id in enumerate(assigned_nodes)}
idx_a = node_to_idx[node_a]
idx_b = node_to_idx[node_b]
idx_c = node_to_idx[node_c]
idx_a = node_to_idx[node_id_a]
idx_b = node_to_idx[node_id_b]
idx_c = node_to_idx[node_id_c]
logger.info(matrix)
assert matrix[idx_a][idx_b] == "rdma_en3"
assert matrix[idx_b][idx_c] == "rdma_en4"
assert matrix[idx_c][idx_a] == "rdma_en5"
assert matrix[idx_a][idx_b] == "rdma_en4"
assert matrix[idx_b][idx_c] == "rdma_en3"
assert matrix[idx_c][idx_a] == "rdma_en3"
# Verify coordinators are set for all nodes
assert len(instance.jaccl_coordinators) == 3


@@ -1,48 +1,56 @@
from typing import Callable
import pytest
from exo.master.placement_utils import (
NodeWithProfile,
filter_cycles_by_memory,
get_hosts_from_subgraph,
get_mlx_jaccl_coordinators,
get_shard_assignments,
get_smallest_cycles,
)
from exo.master.tests.conftest import create_connection, create_node_profile
from exo.shared.topology import Topology
from exo.shared.types.common import Host, NodeId
from exo.shared.types.memory import Memory
from exo.shared.types.models import ModelId, ModelMetadata
from exo.shared.types.profiling import NetworkInterfaceInfo, NodePerformanceProfile
from exo.shared.types.topology import Connection, NodeInfo
from exo.shared.types.worker.shards import Sharding
def test_filter_cycles_by_memory():
@pytest.fixture
def topology() -> Topology:
topology = Topology()
return topology
def test_filter_cycles_by_memory(
topology: Topology,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId], Connection],
):
# arrange
node1_id = NodeId()
node2_id = NodeId()
topology = Topology()
node1 = create_node_profile(1000 * 1024)
node2 = create_node_profile(1000 * 1024)
node_profiles = {node1_id: node1, node2_id: node2}
node1 = create_node(1000 * 1024, node1_id)
node2 = create_node(1000 * 1024, node2_id)
topology.add_node(node1_id)
topology.add_node(node2_id)
topology.add_node(node1)
topology.add_node(node2)
connection1 = create_connection(1)
connection2 = create_connection(2)
connection1 = create_connection(node1_id, node2_id)
connection2 = create_connection(node2_id, node1_id)
topology.add_connection(node1_id, node2_id, connection1)
topology.add_connection(node2_id, node1_id, connection2)
topology.add_connection(connection1)
topology.add_connection(connection2)
cycles = topology.get_cycles()
assert len(cycles) == 1
assert len(cycles[0]) == 2
# act
filtered_cycles = filter_cycles_by_memory(
cycles, node_profiles, Memory.from_bytes(1)
)
filtered_cycles = filter_cycles_by_memory(cycles, Memory.from_bytes(1))
# assert
assert len(filtered_cycles) == 1
@@ -50,65 +58,64 @@ def test_filter_cycles_by_memory():
assert set(n.node_id for n in filtered_cycles[0]) == {node1_id, node2_id}
def test_filter_cycles_by_insufficient_memory():
def test_filter_cycles_by_insufficient_memory(
topology: Topology,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId], Connection],
):
# arrange
node1_id = NodeId()
node2_id = NodeId()
topology = Topology()
node1 = create_node_profile(1000 * 1024)
node2 = create_node_profile(1000 * 1024)
node_profiles = {node1_id: node1, node2_id: node2}
node1 = create_node(1000 * 1024, node1_id)
node2 = create_node(1000 * 1024, node2_id)
topology.add_node(node1_id)
topology.add_node(node2_id)
topology.add_node(node1)
topology.add_node(node2)
connection1 = create_connection(1)
connection2 = create_connection(2)
connection1 = create_connection(node1_id, node2_id)
connection2 = create_connection(node2_id, node1_id)
topology.add_connection(node1_id, node2_id, connection1)
topology.add_connection(node2_id, node1_id, connection2)
topology.add_connection(connection1)
topology.add_connection(connection2)
# act
filtered_cycles = filter_cycles_by_memory(
topology.get_cycles(), node_profiles, Memory.from_kb(2001)
topology.get_cycles(), Memory.from_kb(2001)
)
# assert
assert len(filtered_cycles) == 0
def test_filter_multiple_cycles_by_memory():
def test_filter_multiple_cycles_by_memory(
topology: Topology,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId], Connection],
):
# arrange
node_a_id = NodeId()
node_b_id = NodeId()
node_c_id = NodeId()
topology = Topology()
node_a = create_node_profile(500 * 1024)
node_b = create_node_profile(500 * 1024)
node_c = create_node_profile(1000 * 1024)
node_profiles = {
node_a_id: node_a,
node_b_id: node_b,
node_c_id: node_c,
}
node_a = create_node(500 * 1024, node_a_id)
node_b = create_node(500 * 1024, node_b_id)
node_c = create_node(1000 * 1024, node_c_id)
topology.add_node(node_a_id)
topology.add_node(node_b_id)
topology.add_node(node_c_id)
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_node(node_c)
topology.add_connection(node_a_id, node_b_id, create_connection(1))
topology.add_connection(node_b_id, node_a_id, create_connection(2))
topology.add_connection(node_a_id, node_c_id, create_connection(3))
topology.add_connection(node_c_id, node_b_id, create_connection(4))
topology.add_connection(create_connection(node_a_id, node_b_id))
topology.add_connection(create_connection(node_b_id, node_a_id))
topology.add_connection(create_connection(node_a_id, node_c_id))
topology.add_connection(create_connection(node_c_id, node_b_id))
cycles = topology.get_cycles()
# act
filtered_cycles = filter_cycles_by_memory(
cycles, node_profiles, Memory.from_kb(1500)
)
filtered_cycles = filter_cycles_by_memory(cycles, Memory.from_kb(1500))
# assert
assert len(filtered_cycles) == 1
@@ -120,38 +127,31 @@ def test_filter_multiple_cycles_by_memory():
}
def test_get_smallest_cycles():
def test_get_smallest_cycles(
topology: Topology,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId], Connection],
):
# arrange
node_a_id = NodeId()
node_b_id = NodeId()
node_c_id = NodeId()
topology = Topology()
node_a = create_node_profile(500 * 1024)
node_b = create_node_profile(500 * 1024)
node_c = create_node_profile(1000 * 1024)
node_profiles = {
node_a_id: node_a,
node_b_id: node_b,
node_c_id: node_c,
}
node_a = create_node(500 * 1024, node_a_id)
node_b = create_node(500 * 1024, node_b_id)
node_c = create_node(1000 * 1024, node_c_id)
topology.add_node(node_a_id)
topology.add_node(node_b_id)
topology.add_node(node_c_id)
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_node(node_c)
topology.add_connection(node_a_id, node_b_id, create_connection(1))
topology.add_connection(node_b_id, node_a_id, create_connection(2))
topology.add_connection(node_a_id, node_c_id, create_connection(3))
topology.add_connection(node_c_id, node_b_id, create_connection(4))
cycles = [
[NodeWithProfile(node_id=nid, node_profile=node_profiles[nid]) for nid in cycle]
for cycle in topology.get_cycles()
]
topology.add_connection(create_connection(node_a_id, node_b_id))
topology.add_connection(create_connection(node_b_id, node_c_id))
topology.add_connection(create_connection(node_c_id, node_a_id))
topology.add_connection(create_connection(node_b_id, node_a_id))
# act
smallest_cycles = get_smallest_cycles(cycles)
smallest_cycles = get_smallest_cycles(topology.get_cycles())
# assert
assert len(smallest_cycles) == 1
@@ -168,6 +168,9 @@ def test_get_smallest_cycles():
],
)
def test_get_shard_assignments(
topology: Topology,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId], Connection],
available_memory: tuple[int, int, int],
total_layers: int,
expected_layers: tuple[int, int, int],
@@ -176,25 +179,19 @@ def test_get_shard_assignments(
node_a_id = NodeId()
node_b_id = NodeId()
node_c_id = NodeId()
topology = Topology()
node_a = create_node_profile(available_memory[0] * 1024)
node_b = create_node_profile(available_memory[1] * 1024)
node_c = create_node_profile(available_memory[2] * 1024)
node_profiles = {
node_a_id: node_a,
node_b_id: node_b,
node_c_id: node_c,
}
node_a = create_node(available_memory[0] * 1024, node_a_id)
node_b = create_node(available_memory[1] * 1024, node_b_id)
node_c = create_node(available_memory[2] * 1024, node_c_id)
topology.add_node(node_a_id)
topology.add_node(node_b_id)
topology.add_node(node_c_id)
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_node(node_c)
topology.add_connection(node_a_id, node_b_id, create_connection(1))
topology.add_connection(node_b_id, node_c_id, create_connection(2))
topology.add_connection(node_c_id, node_a_id, create_connection(3))
topology.add_connection(node_b_id, node_a_id, create_connection(4))
topology.add_connection(create_connection(node_a_id, node_b_id))
topology.add_connection(create_connection(node_b_id, node_c_id))
topology.add_connection(create_connection(node_c_id, node_a_id))
topology.add_connection(create_connection(node_b_id, node_a_id))
model_meta = ModelMetadata(
model_id=ModelId("test-model"),
@@ -202,11 +199,7 @@ def test_get_shard_assignments(
n_layers=total_layers,
storage_size=Memory.from_kb(1000),
)
cycles = [
[NodeWithProfile(node_id=nid, node_profile=node_profiles[nid]) for nid in cycle]
for cycle in topology.get_cycles()
]
cycles = topology.get_cycles()
selected_cycle = cycles[0]
# act
@@ -235,21 +228,28 @@ def test_get_shard_assignments(
)
def test_get_hosts_from_subgraph():
def test_get_hosts_from_subgraph(
topology: Topology,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId, int | None], Connection],
):
# arrange
node_a_id = NodeId()
node_b_id = NodeId()
node_c_id = NodeId()
topology = Topology()
topology.add_node(node_a_id)
topology.add_node(node_b_id)
topology.add_node(node_c_id)
node_a = create_node(500, node_a_id)
node_b = create_node(500, node_b_id)
node_c = create_node(1000, node_c_id)
topology.add_connection(node_a_id, node_b_id, create_connection(1))
topology.add_connection(node_b_id, node_a_id, create_connection(2))
topology.add_connection(node_a_id, node_c_id, create_connection(3))
topology.add_connection(node_c_id, node_b_id, create_connection(4))
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_node(node_c)
topology.add_connection(create_connection(node_a_id, node_b_id, 5001))
topology.add_connection(create_connection(node_b_id, node_c_id, 5002))
topology.add_connection(create_connection(node_c_id, node_a_id, 5003))
topology.add_connection(create_connection(node_b_id, node_a_id, 5004))
# act
hosts = get_hosts_from_subgraph(topology)
@@ -257,47 +257,108 @@ def test_get_hosts_from_subgraph():
# assert
assert len(hosts) == 3
expected_hosts = [
Host(ip=("169.254.0.2"), port=1234),
Host(ip=("169.254.0.3"), port=1234),
Host(ip=("169.254.0.4"), port=1234),
Host(ip=("169.254.0.2"), port=5001),
Host(ip=("169.254.0.3"), port=5002),
Host(ip=("169.254.0.4"), port=5003),
]
for expected_host in expected_hosts:
assert expected_host in hosts
def test_get_mlx_jaccl_coordinators():
def test_get_mlx_jaccl_coordinators(
topology: Topology,
create_node: Callable[[int, NodeId | None], NodeInfo],
create_connection: Callable[[NodeId, NodeId, int | None], Connection],
):
# arrange
node_a_id = NodeId()
node_b_id = NodeId()
node_c_id = NodeId()
topology = Topology()
topology.add_node(node_a_id)
topology.add_node(node_b_id)
topology.add_node(node_c_id)
node_a = create_node(500 * 1024, node_a_id)
node_b = create_node(500 * 1024, node_b_id)
node_c = create_node(1000 * 1024, node_c_id)
topology.add_connection(node_a_id, node_b_id, create_connection(1))
topology.add_connection(node_b_id, node_a_id, create_connection(2))
topology.add_connection(node_a_id, node_c_id, create_connection(3))
topology.add_connection(node_c_id, node_b_id, create_connection(4))
conn_a_b = create_connection(node_a_id, node_b_id, 5001)
conn_b_a = create_connection(node_b_id, node_a_id, 5002)
conn_b_c = create_connection(node_b_id, node_c_id, 5003)
conn_c_b = create_connection(node_c_id, node_b_id, 5004)
conn_c_a = create_connection(node_c_id, node_a_id, 5005)
conn_a_c = create_connection(node_a_id, node_c_id, 5006)
conn_a_b = create_connection(1)
conn_b_a = create_connection(2)
conn_b_c = create_connection(3)
conn_c_b = create_connection(4)
conn_c_a = create_connection(5)
conn_a_c = create_connection(6)
# Update node profiles with network interfaces before adding to topology
assert node_a.node_profile is not None
assert node_b.node_profile is not None
assert node_c.node_profile is not None
topology.add_connection(node_a_id, node_b_id, conn_a_b)
topology.add_connection(node_b_id, node_a_id, conn_b_a)
topology.add_connection(node_b_id, node_c_id, conn_b_c)
topology.add_connection(node_c_id, node_b_id, conn_c_b)
topology.add_connection(node_c_id, node_a_id, conn_c_a)
topology.add_connection(node_a_id, node_c_id, conn_a_c)
node_a.node_profile = NodePerformanceProfile(
model_id="test",
chip_id="test",
friendly_name="test",
memory=node_a.node_profile.memory,
network_interfaces=[
NetworkInterfaceInfo(
name="en3",
ip_address=conn_a_b.send_back_multiaddr.ip_address,
),
NetworkInterfaceInfo(
name="en4",
ip_address=conn_a_c.send_back_multiaddr.ip_address,
),
],
system=node_a.node_profile.system,
)
node_b.node_profile = NodePerformanceProfile(
model_id="test",
chip_id="test",
friendly_name="test",
memory=node_b.node_profile.memory,
network_interfaces=[
NetworkInterfaceInfo(
name="en3",
ip_address=conn_b_a.send_back_multiaddr.ip_address,
),
NetworkInterfaceInfo(
name="en4",
ip_address=conn_b_c.send_back_multiaddr.ip_address,
),
],
system=node_b.node_profile.system,
)
node_c.node_profile = NodePerformanceProfile(
model_id="test",
chip_id="test",
friendly_name="test",
memory=node_c.node_profile.memory,
network_interfaces=[
NetworkInterfaceInfo(
name="en3",
ip_address=conn_c_b.send_back_multiaddr.ip_address,
),
NetworkInterfaceInfo(
name="en4",
ip_address=conn_c_a.send_back_multiaddr.ip_address,
),
],
system=node_c.node_profile.system,
)
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_node(node_c)
topology.add_connection(conn_a_b)
topology.add_connection(conn_b_a)
topology.add_connection(conn_b_c)
topology.add_connection(conn_c_b)
topology.add_connection(conn_c_a)
topology.add_connection(conn_a_c)
cycle = [node_a, node_b, node_c]
# act
coordinators = get_mlx_jaccl_coordinators(
node_a_id, coordinator_port=5000, cycle_digraph=topology
cycle, coordinator_port=5000, cycle_digraph=topology
)
# assert
@@ -326,11 +387,11 @@ def test_get_mlx_jaccl_coordinators():
# Non-rank-0 nodes should use the specific IP from their connection to rank 0
# node_b uses the IP from conn_b_a (node_b -> node_a)
assert coordinators[node_b_id] == (f"{conn_b_a.sink_multiaddr.ip_address}:5000"), (
"node_b should use the IP from conn_b_a"
)
assert coordinators[node_b_id] == (
f"{conn_b_a.send_back_multiaddr.ip_address}:5000"
), "node_b should use the IP from conn_b_a"
# node_c uses the IP from conn_c_a (node_c -> node_a)
assert coordinators[node_c_id] == (f"{conn_c_a.sink_multiaddr.ip_address}:5000"), (
"node_c should use the IP from conn_c_a"
)
assert coordinators[node_c_id] == (
f"{conn_c_a.send_back_multiaddr.ip_address}:5000"
), "node_c should use the IP from conn_c_a"


@@ -1,14 +1,13 @@
import pytest
from exo.shared.topology import Topology
from exo.shared.types.common import NodeId
from exo.shared.types.multiaddr import Multiaddr
from exo.shared.types.profiling import (
MemoryUsage,
MemoryPerformanceProfile,
NodePerformanceProfile,
SystemPerformanceProfile,
)
from exo.shared.types.topology import SocketConnection
from exo.shared.types.topology import Connection, ConnectionProfile, NodeId, NodeInfo
@pytest.fixture
@@ -17,15 +16,20 @@ def topology() -> Topology:
@pytest.fixture
def connection() -> SocketConnection:
return SocketConnection(
sink_multiaddr=Multiaddr(address="/ip4/127.0.0.1/tcp/1235"),
def connection() -> Connection:
return Connection(
local_node_id=NodeId(),
send_back_node_id=NodeId(),
send_back_multiaddr=Multiaddr(address="/ip4/127.0.0.1/tcp/1235"),
connection_profile=ConnectionProfile(
throughput=1000, latency=1000, jitter=1000
),
)
@pytest.fixture
def node_profile() -> NodePerformanceProfile:
memory_profile = MemoryUsage.from_bytes(
memory_profile = MemoryPerformanceProfile.from_bytes(
ram_total=1000, ram_available=1000, swap_total=1000, swap_available=1000
)
system_profile = SystemPerformanceProfile()
@@ -39,85 +43,162 @@ def node_profile() -> NodePerformanceProfile:
)
def test_add_node(topology: Topology):
@pytest.fixture
def connection_profile() -> ConnectionProfile:
return ConnectionProfile(throughput=1000, latency=1000, jitter=1000)
def test_add_node(topology: Topology, node_profile: NodePerformanceProfile):
# arrange
node_id = NodeId()
# act
topology.add_node(node_id)
topology.add_node(NodeInfo(node_id=node_id, node_profile=node_profile))
# assert
assert topology.node_is_leaf(node_id)
data = topology.get_node_profile(node_id)
assert data == node_profile
def test_add_connection(topology: Topology, connection: SocketConnection):
def test_add_connection(
topology: Topology, node_profile: NodePerformanceProfile, connection: Connection
):
# arrange
node_a = NodeId()
node_b = NodeId()
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_connection(node_a, node_b, connection)
topology.add_node(
NodeInfo(node_id=connection.local_node_id, node_profile=node_profile)
)
topology.add_node(
NodeInfo(node_id=connection.send_back_node_id, node_profile=node_profile)
)
topology.add_connection(connection)
# act
data = list(conn for _, _, conn in topology.list_connections())
data = topology.get_connection_profile(connection)
# assert
assert data == [connection]
assert data == connection.connection_profile
assert topology.node_is_leaf(node_a)
assert topology.node_is_leaf(node_b)
def test_update_node_profile(
topology: Topology, node_profile: NodePerformanceProfile, connection: Connection
):
# arrange
topology.add_node(
NodeInfo(node_id=connection.local_node_id, node_profile=node_profile)
)
topology.add_node(
NodeInfo(node_id=connection.send_back_node_id, node_profile=node_profile)
)
topology.add_connection(connection)
new_node_profile = NodePerformanceProfile(
model_id="test",
chip_id="test",
friendly_name="test",
memory=MemoryPerformanceProfile.from_bytes(
ram_total=1000, ram_available=1000, swap_total=1000, swap_available=1000
),
network_interfaces=[],
system=SystemPerformanceProfile(),
)
# act
topology.update_node_profile(
connection.local_node_id, node_profile=new_node_profile
)
# assert
data = topology.get_node_profile(connection.local_node_id)
assert data == new_node_profile
def test_update_connection_profile(
topology: Topology, node_profile: NodePerformanceProfile, connection: Connection
):
# arrange
topology.add_node(
NodeInfo(node_id=connection.local_node_id, node_profile=node_profile)
)
topology.add_node(
NodeInfo(node_id=connection.send_back_node_id, node_profile=node_profile)
)
topology.add_connection(connection)
new_connection_profile = ConnectionProfile(
throughput=2000, latency=2000, jitter=2000
)
connection = Connection(
local_node_id=connection.local_node_id,
send_back_node_id=connection.send_back_node_id,
send_back_multiaddr=connection.send_back_multiaddr,
connection_profile=new_connection_profile,
)
# act
topology.update_connection_profile(connection)
# assert
data = topology.get_connection_profile(connection)
assert data == new_connection_profile
def test_remove_connection_still_connected(
topology: Topology, connection: SocketConnection
topology: Topology, node_profile: NodePerformanceProfile, connection: Connection
):
# arrange
node_a = NodeId()
node_b = NodeId()
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_connection(node_a, node_b, connection)
topology.add_node(
NodeInfo(node_id=connection.local_node_id, node_profile=node_profile)
)
topology.add_node(
NodeInfo(node_id=connection.send_back_node_id, node_profile=node_profile)
)
topology.add_connection(connection)
# act
topology.remove_connection(node_a, node_b, connection)
topology.remove_connection(connection)
# assert
assert list(topology.get_all_connections_between(node_a, node_b)) == []
assert topology.get_connection_profile(connection) is None
def test_remove_node_still_connected(topology: Topology, connection: SocketConnection):
def test_remove_node_still_connected(
topology: Topology, node_profile: NodePerformanceProfile, connection: Connection
):
# arrange
node_a = NodeId()
node_b = NodeId()
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_connection(node_a, node_b, connection)
assert list(topology.out_edges(node_a)) == [(node_b, connection)]
topology.add_node(
NodeInfo(node_id=connection.local_node_id, node_profile=node_profile)
)
topology.add_node(
NodeInfo(node_id=connection.send_back_node_id, node_profile=node_profile)
)
topology.add_connection(connection)
# act
topology.remove_node(node_b)
topology.remove_node(connection.local_node_id)
# assert
assert list(topology.out_edges(node_a)) == []
assert topology.get_node_profile(connection.local_node_id) is None
def test_list_nodes(topology: Topology, connection: SocketConnection):
def test_list_nodes(
topology: Topology, node_profile: NodePerformanceProfile, connection: Connection
):
# arrange
node_a = NodeId()
node_b = NodeId()
topology.add_node(node_a)
topology.add_node(node_b)
topology.add_connection(node_a, node_b, connection)
assert list(topology.out_edges(node_a)) == [(node_b, connection)]
topology.add_node(
NodeInfo(node_id=connection.local_node_id, node_profile=node_profile)
)
topology.add_node(
NodeInfo(node_id=connection.send_back_node_id, node_profile=node_profile)
)
topology.add_connection(connection)
# act
nodes = list(topology.list_nodes())
# assert
assert len(nodes) == 2
assert all(isinstance(node, NodeId) for node in nodes)
assert {node for node in nodes} == {node_a, node_b}
assert all(isinstance(node, NodeInfo) for node in nodes)
assert {node.node_id for node in nodes} == {
connection.local_node_id,
connection.send_back_node_id,
}
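
A minimal sketch of the profile-carrying API these fixtures exercise, assuming the same import paths as the test above:

from exo.shared.topology import Topology
from exo.shared.types.multiaddr import Multiaddr
from exo.shared.types.topology import Connection, ConnectionProfile, NodeId, NodeInfo

topology = Topology()
conn = Connection(
    local_node_id=NodeId(),
    send_back_node_id=NodeId(),
    send_back_multiaddr=Multiaddr(address="/ip4/127.0.0.1/tcp/1235"),
    connection_profile=ConnectionProfile(throughput=1000, latency=1000, jitter=1000),
)
# add_connection auto-creates missing endpoints as profile-less NodeInfo entries
topology.add_connection(conn)
assert topology.get_connection_profile(conn) == conn.connection_profile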

View File

@@ -11,8 +11,10 @@ from exo.shared.types.events import (
IndexedEvent,
InstanceCreated,
InstanceDeleted,
NodeCreated,
NodeDownloadProgress,
NodeGatheredInfo,
NodeMemoryMeasured,
NodePerformanceMeasured,
NodeTimedOut,
RunnerDeleted,
RunnerStatusUpdated,
@@ -25,23 +27,13 @@ from exo.shared.types.events import (
TopologyEdgeCreated,
TopologyEdgeDeleted,
)
from exo.shared.types.profiling import NodePerformanceProfile
from exo.shared.types.profiling import NodePerformanceProfile, SystemPerformanceProfile
from exo.shared.types.state import State
from exo.shared.types.tasks import Task, TaskId, TaskStatus
from exo.shared.types.topology import RDMAConnection
from exo.shared.types.topology import NodeInfo
from exo.shared.types.worker.downloads import DownloadProgress
from exo.shared.types.worker.instances import Instance, InstanceId
from exo.shared.types.worker.runners import RunnerId, RunnerStatus
from exo.utils.info_gatherer.info_gatherer import (
MacmonMetrics,
MacTBConnections,
MacTBIdentifiers,
MemoryUsage,
MiscData,
NodeConfig,
NodeNetworkInterfaces,
StaticNodeInformation,
)
def event_apply(event: Event, state: State) -> State:
@@ -55,12 +47,16 @@ def event_apply(event: Event, state: State) -> State:
return apply_instance_created(event, state)
case InstanceDeleted():
return apply_instance_deleted(event, state)
case NodeCreated():
return apply_topology_node_created(event, state)
case NodeTimedOut():
return apply_node_timed_out(event, state)
case NodePerformanceMeasured():
return apply_node_performance_measured(event, state)
case NodeDownloadProgress():
return apply_node_download_progress(event, state)
case NodeGatheredInfo():
return apply_node_gathered_info(event, state)
case NodeMemoryMeasured():
return apply_node_memory_measured(event, state)
case RunnerDeleted():
return apply_runner_deleted(event, state)
case RunnerStatusUpdated():
@@ -192,7 +188,7 @@ def apply_runner_deleted(event: RunnerDeleted, state: State) -> State:
def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
topology = copy.deepcopy(state.topology)
topology = copy.copy(state.topology)
state.topology.remove_node(event.node_id)
node_profiles = {
key: value for key, value in state.node_profiles.items() if key != event.node_id
@@ -200,12 +196,8 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
last_seen = {
key: value for key, value in state.last_seen.items() if key != event.node_id
}
downloads = {
key: value for key, value in state.downloads.items() if key != event.node_id
}
return state.model_copy(
update={
"downloads": downloads,
"topology": topology,
"node_profiles": node_profiles,
"last_seen": last_seen,
@@ -213,69 +205,103 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
)
def apply_node_gathered_info(event: NodeGatheredInfo, state: State) -> State:
topology = copy.deepcopy(state.topology)
topology.add_node(event.node_id)
info = event.info
profile = state.node_profiles.get(event.node_id, NodePerformanceProfile())
# TODO: should be broken up into individual events instead of this monster
match info:
case MacmonMetrics():
profile.system = info.system_profile
profile.memory = info.memory
case MemoryUsage():
profile.memory = info
case NodeConfig():
pass
case MiscData():
profile.friendly_name = info.friendly_name
case StaticNodeInformation():
profile.model_id = info.model
profile.chip_id = info.chip
# TODO: makes me slightly sad
case NodeNetworkInterfaces():
profile.network_interfaces = info.ifaces
case MacTBIdentifiers():
profile.tb_interfaces = info.idents
case MacTBConnections():
conn_map = {
tb_ident.domain_uuid: (nid, tb_ident.rdma_interface)
for nid in state.node_profiles
for tb_ident in state.node_profiles[nid].tb_interfaces
}
as_rdma_conns = [
(
conn_map[tb_conn.sink_uuid][0],
RDMAConnection(
source_rdma_iface=conn_map[tb_conn.source_uuid][1],
sink_rdma_iface=conn_map[tb_conn.sink_uuid][1],
),
)
for tb_conn in info.conns
if tb_conn.source_uuid in conn_map
if tb_conn.sink_uuid in conn_map
]
topology.replace_all_out_tb_connections(event.node_id, as_rdma_conns)
last_seen = {**state.last_seen, event.node_id: datetime.fromisoformat(event.when)}
new_profiles = {**state.node_profiles, event.node_id: profile}
def apply_node_performance_measured(
event: NodePerformanceMeasured, state: State
) -> State:
new_profiles: Mapping[NodeId, NodePerformanceProfile] = {
**state.node_profiles,
event.node_id: event.node_profile,
}
last_seen: Mapping[NodeId, datetime] = {
**state.last_seen,
event.node_id: datetime.fromisoformat(event.when),
}
state = state.model_copy(update={"node_profiles": new_profiles})
topology = copy.copy(state.topology)
# TODO: NodeCreated
if not topology.contains_node(event.node_id):
topology.add_node(NodeInfo(node_id=event.node_id))
topology.update_node_profile(event.node_id, event.node_profile)
return state.model_copy(
update={
"node_profiles": new_profiles,
"last_seen": last_seen,
"topology": topology,
"last_seen": last_seen,
}
)
def apply_node_memory_measured(event: NodeMemoryMeasured, state: State) -> State:
existing = state.node_profiles.get(event.node_id)
topology = copy.copy(state.topology)
if existing is None:
created = NodePerformanceProfile(
model_id="unknown",
chip_id="unknown",
friendly_name="Unknown",
memory=event.memory,
network_interfaces=[],
system=SystemPerformanceProfile(
# TODO: flops_fp16=0.0,
gpu_usage=0.0,
temp=0.0,
sys_power=0.0,
pcpu_usage=0.0,
ecpu_usage=0.0,
ane_power=0.0,
),
)
created_profiles: Mapping[NodeId, NodePerformanceProfile] = {
**state.node_profiles,
event.node_id: created,
}
last_seen: Mapping[NodeId, datetime] = {
**state.last_seen,
event.node_id: datetime.fromisoformat(event.when),
}
if not topology.contains_node(event.node_id):
topology.add_node(NodeInfo(node_id=event.node_id))
# TODO: NodeCreated
topology.update_node_profile(event.node_id, created)
return state.model_copy(
update={
"node_profiles": created_profiles,
"topology": topology,
"last_seen": last_seen,
}
)
updated = existing.model_copy(update={"memory": event.memory})
updated_profiles: Mapping[NodeId, NodePerformanceProfile] = {
**state.node_profiles,
event.node_id: updated,
}
# TODO: NodeCreated
if not topology.contains_node(event.node_id):
topology.add_node(NodeInfo(node_id=event.node_id))
topology.update_node_profile(event.node_id, updated)
return state.model_copy(
update={"node_profiles": updated_profiles, "topology": topology}
)
def apply_topology_node_created(event: NodeCreated, state: State) -> State:
topology = copy.copy(state.topology)
topology.add_node(NodeInfo(node_id=event.node_id))
return state.model_copy(update={"topology": topology})
def apply_topology_edge_created(event: TopologyEdgeCreated, state: State) -> State:
topology = copy.deepcopy(state.topology)
topology.add_connection(event.source, event.sink, event.edge)
topology = copy.copy(state.topology)
topology.add_connection(event.edge)
return state.model_copy(update={"topology": topology})
def apply_topology_edge_deleted(event: TopologyEdgeDeleted, state: State) -> State:
topology = copy.deepcopy(state.topology)
topology.remove_connection(event.sink, event.source, event.edge)
topology = copy.copy(state.topology)
if not topology.contains_connection(event.edge):
return state
topology.remove_connection(event.edge)
# TODO: Clean up removing the reverse connection
return state.model_copy(update={"topology": topology})
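
Each reducer above returns a model_copy rather than mutating State in place. A rough illustration of the dispatch, assuming this module's event_apply and the imports at its top:

from datetime import datetime, timezone
from exo.shared.types.common import NodeId
from exo.shared.types.events import NodeMemoryMeasured
from exo.shared.types.profiling import MemoryPerformanceProfile
from exo.shared.types.state import State

event = NodeMemoryMeasured(
    node_id=NodeId("node-1"),
    when=str(datetime.now(tz=timezone.utc)),
    memory=MemoryPerformanceProfile.from_bytes(
        ram_total=1000, ram_available=1000, swap_total=1000, swap_available=1000
    ),
)
# A first NodeMemoryMeasured for an unknown node synthesizes a placeholder
# profile and registers the node in the topology
new_state = event_apply(event, State())
assert new_state.topology.contains_node(event.node_id)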

View File

@@ -38,7 +38,6 @@ EXO_TEST_LOG = EXO_CACHE_HOME / "exo_test.log"
# Identity (config)
EXO_NODE_ID_KEYPAIR = EXO_CONFIG_HOME / "node_id.keypair"
EXO_CONFIG_FILE = EXO_CONFIG_HOME / "config.toml"
# libp2p topics for event forwarding
LIBP2P_LOCAL_EVENTS_TOPIC = "worker_events"

View File

@@ -24,8 +24,6 @@ class _InterceptHandler(logging.Handler):
except ValueError:
level = record.levelno
return
logger.opt(depth=3, exception=record.exc_info).log(level, record.getMessage())

View File

@@ -39,4 +39,7 @@ def test_apply_two_node_download_progress():
NodeDownloadProgress(download_progress=event2), state
)
# TODO: This test is failing. We should support the following:
# 1. Downloading multiple models concurrently on the same node (one per runner is fine).
# 2. Downloading a model, it completes, then downloading a different model on the same node.
assert new_state.downloads == {NodeId("node-1"): [event1, event2]}

View File

@@ -1,7 +1,7 @@
from exo.shared.types.common import NodeId
from exo.shared.types.multiaddr import Multiaddr
from exo.shared.types.state import State
from exo.shared.types.topology import SocketConnection
from exo.shared.types.topology import Connection
def test_state_serialization_roundtrip() -> None:
@@ -11,12 +11,14 @@ def test_state_serialization_roundtrip() -> None:
node_a = NodeId("node-a")
node_b = NodeId("node-b")
connection = SocketConnection(
sink_multiaddr=Multiaddr(address="/ip4/127.0.0.1/tcp/10001"),
connection = Connection(
local_node_id=node_a,
send_back_node_id=node_b,
send_back_multiaddr=Multiaddr(address="/ip4/127.0.0.1/tcp/10001"),
)
state = State()
state.topology.add_connection(node_a, node_b, connection)
state.topology.add_connection(connection)
json_repr = state.model_dump_json()
restored_state = State.model_validate_json(json_repr)

View File

@@ -1,219 +1,203 @@
import contextlib
from collections.abc import Mapping, Sequence
from dataclasses import dataclass, field
from typing import Iterable
import rustworkx as rx
from pydantic import BaseModel, ConfigDict
from exo.shared.types.common import NodeId
from exo.shared.types.topology import RDMAConnection, SocketConnection
from exo.shared.types.profiling import ConnectionProfile, NodePerformanceProfile
from exo.shared.types.topology import Connection, NodeInfo
class TopologySnapshot(BaseModel):
nodes: Sequence[NodeId]
connections: Mapping[
NodeId, Mapping[NodeId, Sequence[SocketConnection | RDMAConnection]]
]
nodes: list[NodeInfo]
connections: list[Connection]
model_config = ConfigDict(frozen=True, extra="forbid")
model_config = ConfigDict(frozen=True, extra="forbid", strict=True)
@dataclass
class Topology:
# the _graph can be used as a int -> NodeId map.
_graph: rx.PyDiGraph[NodeId, SocketConnection | RDMAConnection] = field(
init=False, default_factory=rx.PyDiGraph
)
_vertex_indices: dict[NodeId, int] = field(init=False, default_factory=dict)
def __init__(self) -> None:
self._graph: rx.PyDiGraph[NodeInfo, Connection] = rx.PyDiGraph()
self._node_id_to_rx_id_map: dict[NodeId, int] = dict()
self._rx_id_to_node_id_map: dict[int, NodeId] = dict()
self._edge_id_to_rx_id_map: dict[Connection, int] = dict()
def to_snapshot(self) -> TopologySnapshot:
return TopologySnapshot(
nodes=list(self.list_nodes()), connections=self.map_connections()
nodes=list(self.list_nodes()),
connections=list(self.list_connections()),
)
@classmethod
def from_snapshot(cls, snapshot: TopologySnapshot) -> "Topology":
topology = cls()
for node_id in snapshot.nodes:
for node in snapshot.nodes:
with contextlib.suppress(ValueError):
topology.add_node(node_id)
topology.add_node(node)
for source in snapshot.connections:
for sink in snapshot.connections[source]:
for conn in snapshot.connections[source][sink]:
topology.add_connection(source, sink, conn)
for connection in snapshot.connections:
topology.add_connection(connection)
return topology
def add_node(self, node_id: NodeId) -> None:
if node_id in self._vertex_indices:
def add_node(self, node: NodeInfo) -> None:
if node.node_id in self._node_id_to_rx_id_map:
return
rx_id = self._graph.add_node(node_id)
self._vertex_indices[node_id] = rx_id
rx_id = self._graph.add_node(node)
self._node_id_to_rx_id_map[node.node_id] = rx_id
self._rx_id_to_node_id_map[rx_id] = node.node_id
def node_is_leaf(self, node_id: NodeId) -> bool:
return (
node_id in self._vertex_indices
and len(self._graph.neighbors(self._vertex_indices[node_id])) <= 1
node_id in self._node_id_to_rx_id_map
and len(self._graph.neighbors(self._node_id_to_rx_id_map[node_id])) == 1
)
def neighbours(self, node_id: NodeId) -> list[NodeId]:
return [
self._graph[rx_id]
for rx_id in self._graph.neighbors(self._vertex_indices[node_id])
self._rx_id_to_node_id_map[rx_id]
for rx_id in self._graph.neighbors(self._node_id_to_rx_id_map[node_id])
]
def out_edges(
self, node_id: NodeId
) -> Iterable[tuple[NodeId, SocketConnection | RDMAConnection]]:
if node_id not in self._vertex_indices:
def out_edges(self, node_id: NodeId) -> list[tuple[NodeId, Connection]]:
if node_id not in self._node_id_to_rx_id_map:
return []
return (
(self._graph[nid], conn)
for _, nid, conn in self._graph.out_edges(self._vertex_indices[node_id])
)
return [
(self._rx_id_to_node_id_map[nid], conn)
for _, nid, conn in self._graph.out_edges(
self._node_id_to_rx_id_map[node_id]
)
]
def contains_node(self, node_id: NodeId) -> bool:
return node_id in self._vertex_indices
return node_id in self._node_id_to_rx_id_map
def contains_connection(self, connection: Connection) -> bool:
return connection in self._edge_id_to_rx_id_map
def add_connection(
self,
source: NodeId,
sink: NodeId,
connection: SocketConnection | RDMAConnection,
connection: Connection,
) -> None:
if connection in self.get_all_connections_between(source, sink):
if connection.local_node_id not in self._node_id_to_rx_id_map:
self.add_node(NodeInfo(node_id=connection.local_node_id))
if connection.send_back_node_id not in self._node_id_to_rx_id_map:
self.add_node(NodeInfo(node_id=connection.send_back_node_id))
if connection in self._edge_id_to_rx_id_map:
return
if source not in self._vertex_indices:
self.add_node(source)
if sink not in self._vertex_indices:
self.add_node(sink)
src_id = self._node_id_to_rx_id_map[connection.local_node_id]
sink_id = self._node_id_to_rx_id_map[connection.send_back_node_id]
src_id = self._vertex_indices[source]
sink_id = self._vertex_indices[sink]
rx_id = self._graph.add_edge(src_id, sink_id, connection)
self._edge_id_to_rx_id_map[connection] = rx_id
_ = self._graph.add_edge(src_id, sink_id, connection)
def list_nodes(self) -> Iterable[NodeInfo]:
return (self._graph[i] for i in self._graph.node_indices())
def get_all_connections_between(
self, source: NodeId, sink: NodeId
) -> Iterable[SocketConnection | RDMAConnection]:
if source not in self._vertex_indices:
return []
if sink not in self._vertex_indices:
return []
def list_connections(self) -> Iterable[Connection]:
return (connection for _, _, connection in self._graph.weighted_edge_list())
src_id = self._vertex_indices[source]
sink_id = self._vertex_indices[sink]
def get_node_profile(self, node_id: NodeId) -> NodePerformanceProfile | None:
try:
return self._graph.get_all_edge_data(src_id, sink_id)
except rx.NoEdgeBetweenNodes:
return []
rx_idx = self._node_id_to_rx_id_map[node_id]
return self._graph.get_node_data(rx_idx).node_profile
except KeyError:
return None
def list_nodes(self) -> Iterable[NodeId]:
return self._graph.nodes()
def update_node_profile(
self, node_id: NodeId, node_profile: NodePerformanceProfile
) -> None:
rx_idx = self._node_id_to_rx_id_map[node_id]
self._graph[rx_idx].node_profile = node_profile
def map_connections(
self,
) -> Mapping[NodeId, Mapping[NodeId, Sequence[SocketConnection | RDMAConnection]]]:
base: dict[NodeId, dict[NodeId, list[SocketConnection | RDMAConnection]]] = {}
for src_id, sink_id, connection in self._graph.weighted_edge_list():
source = self._graph[src_id]
sink = self._graph[sink_id]
if source not in base:
base[source] = {}
if sink not in base[source]:
base[source][sink] = []
base[source][sink].append(connection)
return base
def update_connection_profile(self, connection: Connection) -> None:
rx_idx = self._edge_id_to_rx_id_map[connection]
self._graph.update_edge_by_index(rx_idx, connection)
def list_connections(
self,
) -> Iterable[tuple[NodeId, NodeId, SocketConnection | RDMAConnection]]:
return (
(
self._graph[src_id],
self._graph[sink_id],
connection,
)
for src_id, sink_id, connection in self._graph.weighted_edge_list()
)
def get_connection_profile(
self, connection: Connection
) -> ConnectionProfile | None:
try:
rx_idx = self._edge_id_to_rx_id_map[connection]
return self._graph.get_edge_data_by_index(rx_idx).connection_profile
except KeyError:
return None
def remove_node(self, node_id: NodeId) -> None:
if node_id not in self._vertex_indices:
if node_id not in self._node_id_to_rx_id_map:
return
rx_idx = self._vertex_indices[node_id]
for connection in self.list_connections():
if (
connection.local_node_id == node_id
or connection.send_back_node_id == node_id
):
self.remove_connection(connection)
rx_idx = self._node_id_to_rx_id_map[node_id]
self._graph.remove_node(rx_idx)
del self._vertex_indices[node_id]
del self._node_id_to_rx_id_map[node_id]
del self._rx_id_to_node_id_map[rx_idx]
def replace_all_out_tb_connections(
self, source: NodeId, new_connections: Sequence[tuple[NodeId, RDMAConnection]]
) -> None:
for conn_idx in self._graph.out_edge_indices(self._vertex_indices[source]):
if isinstance(self._graph.get_edge_data_by_index(conn_idx), RDMAConnection):
self._graph.remove_edge_from_index(conn_idx)
for sink, conn in new_connections:
self.add_connection(source, sink, conn)
def remove_connection(
self, source: NodeId, sink: NodeId, edge: SocketConnection | RDMAConnection
) -> None:
if source not in self._vertex_indices or sink not in self._vertex_indices:
def remove_connection(self, connection: Connection) -> None:
if connection not in self._edge_id_to_rx_id_map:
return
for conn_idx in self._graph.edge_indices_from_endpoints(
self._vertex_indices[source], self._vertex_indices[sink]
):
if self._graph.get_edge_data_by_index(conn_idx) == edge:
self._graph.remove_edge_from_index(conn_idx)
rx_idx = self._edge_id_to_rx_id_map[connection]
self._graph.remove_edge_from_index(rx_idx)
del self._edge_id_to_rx_id_map[connection]
def get_cycles(self) -> list[list[NodeId]]:
def get_cycles(self) -> list[list[NodeInfo]]:
cycle_idxs = rx.simple_cycles(self._graph)
cycles: list[list[NodeId]] = []
cycles: list[list[NodeInfo]] = []
for cycle_idx in cycle_idxs:
cycle = [self._graph[idx] for idx in cycle_idx]
cycles.append(cycle)
return cycles
def get_cycles_tb(self) -> list[list[NodeId]]:
def get_cycles_tb(self) -> list[list[NodeInfo]]:
tb_edges = [
(u, v, conn)
for u, v, conn in self._graph.weighted_edge_list()
if conn.is_thunderbolt()
]
tb_graph: rx.PyDiGraph[NodeId, SocketConnection] = rx.PyDiGraph()
tb_graph: rx.PyDiGraph[NodeInfo, Connection] = rx.PyDiGraph()
tb_graph.add_nodes_from(self._graph.nodes())
for u, v, conn in tb_edges:
if isinstance(conn, SocketConnection):
tb_graph.add_edge(u, v, conn)
tb_graph.add_edge(u, v, conn)
cycle_idxs = rx.simple_cycles(tb_graph)
cycles: list[list[NodeId]] = []
cycles: list[list[NodeInfo]] = []
for cycle_idx in cycle_idxs:
cycle = [tb_graph[idx] for idx in cycle_idx]
cycles.append(cycle)
return cycles
def get_subgraph_from_nodes(self, node_ids: list[NodeId]) -> "Topology":
rx_idxs = [self._vertex_indices[idx] for idx in node_ids]
def get_subgraph_from_nodes(self, nodes: list[NodeInfo]) -> "Topology":
node_idxs = [node.node_id for node in nodes]
rx_idxs = [self._node_id_to_rx_id_map[idx] for idx in node_idxs]
topology = Topology()
for rx_idx in rx_idxs:
topology.add_node(self._graph[rx_idx])
for source, sink, connection in self.list_connections():
if source in node_ids and sink in node_ids:
topology.add_connection(source, sink, connection)
for connection in self.list_connections():
if (
connection.local_node_id in node_idxs
and connection.send_back_node_id in node_idxs
):
topology.add_connection(connection)
return topology
def is_thunderbolt_cycle(self, cycle: list[NodeId]) -> bool:
node_idxs = [node for node in cycle]
rx_idxs = [self._vertex_indices[idx] for idx in node_idxs]
def is_thunderbolt_cycle(self, cycle: list[NodeInfo]) -> bool:
node_idxs = [node.node_id for node in cycle]
rx_idxs = [self._node_id_to_rx_id_map[idx] for idx in node_idxs]
for rid in rx_idxs:
for neighbor_rid in self._graph.neighbors(rid):
if neighbor_rid not in rx_idxs:
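
Because NodeInfo carries its own profile and Connection is hashable, TopologySnapshot reduces to two flat lists, which keeps serialization trivial. Reusing the topology built in the earlier sketch (TopologySnapshot lives in exo.shared.topology):

snapshot = topology.to_snapshot()
json_repr = snapshot.model_dump_json()  # TopologySnapshot is a frozen pydantic model
restored = Topology.from_snapshot(TopologySnapshot.model_validate_json(json_repr))
assert {n.node_id for n in restored.list_nodes()} == {
    n.node_id for n in topology.list_nodes()
}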

View File

@@ -2,14 +2,14 @@ from datetime import datetime
from pydantic import Field
from exo.shared.topology import SocketConnection
from exo.shared.topology import Connection, NodePerformanceProfile
from exo.shared.types.chunks import GenerationChunk
from exo.shared.types.common import CommandId, Id, NodeId, SessionId
from exo.shared.types.profiling import MemoryPerformanceProfile
from exo.shared.types.tasks import Task, TaskId, TaskStatus
from exo.shared.types.worker.downloads import DownloadProgress
from exo.shared.types.worker.instances import Instance, InstanceId
from exo.shared.types.worker.runners import RunnerId, RunnerStatus
from exo.utils.info_gatherer.info_gatherer import GatheredInfo
from exo.utils.pydantic_ext import CamelCaseModel, TaggedModel
@@ -76,15 +76,25 @@ class RunnerDeleted(BaseEvent):
runner_id: RunnerId
# TODO
class NodeCreated(BaseEvent):
node_id: NodeId
class NodeTimedOut(BaseEvent):
node_id: NodeId
# TODO: bikeshed this name
class NodeGatheredInfo(BaseEvent):
class NodePerformanceMeasured(BaseEvent):
node_id: NodeId
when: str # this is a manually cast datetime overridden by the master when the event is indexed, rather than the local time on the device
info: GatheredInfo # NB: this model is UNTAGGED!!! be warned for ser/de errors.
node_profile: NodePerformanceProfile
class NodeMemoryMeasured(BaseEvent):
node_id: NodeId
when: str # this is a manually cast datetime overridden by the master when the event is indexed, rather than the local time on the device
memory: MemoryPerformanceProfile
class NodeDownloadProgress(BaseEvent):
@@ -97,15 +107,11 @@ class ChunkGenerated(BaseEvent):
class TopologyEdgeCreated(BaseEvent):
source: NodeId
sink: NodeId
edge: SocketConnection
edge: Connection
class TopologyEdgeDeleted(BaseEvent):
source: NodeId
sink: NodeId
edge: SocketConnection
edge: Connection
Event = (
@@ -119,8 +125,10 @@ Event = (
| InstanceDeleted
| RunnerStatusUpdated
| RunnerDeleted
| NodeCreated
| NodeTimedOut
| NodeGatheredInfo
| NodePerformanceMeasured
| NodeMemoryMeasured
| NodeDownloadProgress
| ChunkGenerated
| TopologyEdgeCreated

View File

@@ -1,11 +1,10 @@
import re
from typing import ClassVar
from pydantic import BaseModel, ConfigDict, computed_field, field_validator
from pydantic import BaseModel, computed_field, field_validator
class Multiaddr(BaseModel):
model_config = ConfigDict(frozen=True)
address: str
PATTERNS: ClassVar[list[str]] = [

View File

@@ -1,14 +1,12 @@
from collections.abc import Sequence
from typing import Self
import psutil
from exo.shared.types.memory import Memory
from exo.shared.types.thunderbolt import TBIdentifier
from exo.utils.pydantic_ext import CamelCaseModel
class MemoryUsage(CamelCaseModel):
class MemoryPerformanceProfile(CamelCaseModel):
ram_total: Memory
ram_available: Memory
swap_total: Memory
@@ -46,6 +44,7 @@ class SystemPerformanceProfile(CamelCaseModel):
sys_power: float = 0.0
pcpu_usage: float = 0.0
ecpu_usage: float = 0.0
ane_power: float = 0.0
class NetworkInterfaceInfo(CamelCaseModel):
@@ -54,16 +53,15 @@ class NetworkInterfaceInfo(CamelCaseModel):
class NodePerformanceProfile(CamelCaseModel):
model_id: str = "Unknown"
chip_id: str = "Unknown"
friendly_name: str = "Unknown"
memory: MemoryUsage = MemoryUsage.from_bytes(
ram_total=0, ram_available=0, swap_total=0, swap_available=0
)
network_interfaces: Sequence[NetworkInterfaceInfo] = []
tb_interfaces: Sequence[TBIdentifier] = []
system: SystemPerformanceProfile = SystemPerformanceProfile()
model_id: str
chip_id: str
friendly_name: str
memory: MemoryPerformanceProfile
network_interfaces: list[NetworkInterfaceInfo] = []
system: SystemPerformanceProfile
class ConnectionProfile(CamelCaseModel):
pass
throughput: float
latency: float
jitter: float
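
With the implicit defaults removed, a NodePerformanceProfile must now be constructed explicitly. A minimal example (the hardware strings here are illustrative, not real values):

profile = NodePerformanceProfile(
    model_id="Mac16,10",
    chip_id="Apple M4",
    friendly_name="studio-1",
    memory=MemoryPerformanceProfile.from_bytes(
        ram_total=64_000_000_000,
        ram_available=48_000_000_000,
        swap_total=0,
        swap_available=0,
    ),
    system=SystemPerformanceProfile(),  # the system fields still default to 0.0
)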

View File

@@ -40,6 +40,10 @@ class LoadModel(BaseTask): # emitted by Worker
pass
class ConnectToGroup(BaseTask): # emitted by Worker
pass
class StartWarmup(BaseTask): # emitted by Worker
pass
@@ -57,5 +61,11 @@ class Shutdown(BaseTask): # emitted by Worker
Task = (
CreateRunner | DownloadModel | LoadModel | StartWarmup | ChatCompletion | Shutdown
CreateRunner
| DownloadModel
| ConnectToGroup
| LoadModel
| StartWarmup
| ChatCompletion
| Shutdown
)
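
ConnectToGroup slots between DownloadModel and LoadModel in a runner's lifecycle. A sketch of exhaustive handling over the widened union (not the actual runner code):

def describe(task: Task) -> str:
    match task:
        case CreateRunner():
            return "spawn the runner"
        case DownloadModel():
            return "fetch the shard's weights"
        case ConnectToGroup():
            return "join the distributed group before any weights load"
        case LoadModel():
            return "shard and load the model"
        case StartWarmup():
            return "run a warmup pass"
        case ChatCompletion():
            return "serve a generation request"
        case Shutdown():
            return "tear the runner down"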

View File

@@ -1,64 +0,0 @@
import anyio
from pydantic import BaseModel, Field
from exo.utils.pydantic_ext import CamelCaseModel
class TBConnection(CamelCaseModel):
source_uuid: str
sink_uuid: str
class TBIdentifier(CamelCaseModel):
rdma_interface: str
domain_uuid: str
# Intentionally minimal, only collecting data we care about - there's a lot more
class TBReceptacleTag(BaseModel, extra="ignore"):
receptacle_id_key: str
class TBConnectivityItem(BaseModel, extra="ignore"):
domain_uuid_key: str | None
class TBConnectivityData(BaseModel, extra="ignore"):
domain_uuid_key: str | None
device_name_key: str
items: list[TBConnectivityItem] | None = Field(None, alias="_items")
receptacle_1_tag: TBReceptacleTag
def ident(self, ifaces: dict[str, str]) -> TBIdentifier | None:
if self.domain_uuid_key is None:
return
tag = f"Thunderbolt {self.receptacle_1_tag.receptacle_id_key}"
iface = f"rdma_{ifaces[tag]}"
return TBIdentifier(rdma_interface=iface, domain_uuid=self.domain_uuid_key)
def conn(self) -> TBConnection | None:
if self.domain_uuid_key is None or self.items is None:
return
sink_key = next(
item.domain_uuid_key
for item in self.items
if item.domain_uuid_key is not None
)
return TBConnection(source_uuid=self.domain_uuid_key, sink_uuid=sink_key)
class TBConnectivity(BaseModel):
SPThunderboltDataType: list[TBConnectivityData]
@classmethod
async def gather(cls) -> list[TBConnectivityData] | None:
proc = await anyio.run_process(
["system_profiler", "SPThunderboltDataType", "-json"], check=False
)
if proc.returncode != 0:
return None
# Saving you from PascalCase while avoiding too much pydantic
return TBConnectivity.model_validate_json(proc.stdout).SPThunderboltDataType

View File

@@ -1,32 +1,37 @@
from enum import Enum
from loguru import logger
from exo.shared.types.common import NodeId
from exo.shared.types.multiaddr import Multiaddr
from exo.utils.pydantic_ext import FrozenModel
from exo.shared.types.profiling import ConnectionProfile, NodePerformanceProfile
from exo.utils.pydantic_ext import CamelCaseModel
class RDMAConnection(FrozenModel):
source_rdma_iface: str
sink_rdma_iface: str
class NodeInfo(CamelCaseModel):
node_id: NodeId
node_profile: NodePerformanceProfile | None = None
class Connection(CamelCaseModel):
local_node_id: NodeId
send_back_node_id: NodeId
send_back_multiaddr: Multiaddr
connection_profile: ConnectionProfile | None = None
def __hash__(self) -> int:
return hash(
(
self.local_node_id,
self.send_back_node_id,
self.send_back_multiaddr.address,
)
)
def __eq__(self, other: object) -> bool:
if not isinstance(other, Connection):
raise ValueError("Cannot compare Connection with non-Connection")
return (
self.local_node_id == other.local_node_id
and self.send_back_node_id == other.send_back_node_id
and self.send_back_multiaddr == other.send_back_multiaddr
)
def is_thunderbolt(self) -> bool:
logger.warning("duh")
return True
# TODO
class LinkType(str, Enum):
Thunderbolt = "Thunderbolt"
Ethernet = "Ethernet"
WiFi = "WiFi"
class SocketConnection(FrozenModel):
sink_multiaddr: Multiaddr
def __hash__(self):
return hash(self.sink_multiaddr.ip_address)
def is_thunderbolt(self) -> bool:
return str(self.sink_multiaddr.ipv4_address).startswith("169.254")
return str(self.send_back_multiaddr.ipv4_address).startswith("169.254")
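
Hashing and equality deliberately exclude connection_profile, so a re-profiled edge still denotes the same topology entry; this is what lets Topology key _edge_id_to_rx_id_map on the Connection itself. A sketch, assuming node_a and node_b are NodeIds:

a = Connection(
    local_node_id=node_a,
    send_back_node_id=node_b,
    send_back_multiaddr=Multiaddr(address="/ip4/169.254.1.2/tcp/52415"),
)
b = a.model_copy(
    update={"connection_profile": ConnectionProfile(throughput=2000, latency=1, jitter=1)}
)
assert a == b and hash(a) == hash(b)  # profile is not part of identity
assert a.is_thunderbolt()  # link-local 169.254.x.x heuristic

Returning NotImplemented from __eq__ would be the more conventional choice than raising, but the ValueError surfaces accidental cross-type comparisons loudly.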

View File

@@ -29,7 +29,7 @@ class MlxRingInstance(BaseInstance):
class MlxJacclInstance(BaseInstance):
jaccl_devices: list[list[str | None]]
ibv_devices: list[list[str | None]]
jaccl_coordinators: dict[NodeId, str]

View File

@@ -0,0 +1,43 @@
import asyncio
from abc import ABC, abstractmethod
from collections.abc import Coroutine
from typing import Callable
from exo.shared.types.profiling import (
MemoryPerformanceProfile,
SystemPerformanceProfile,
)
class ResourceCollector(ABC):
@abstractmethod
async def collect(self) -> SystemPerformanceProfile | MemoryPerformanceProfile: ...
class SystemResourceCollector(ResourceCollector):
async def collect(self) -> SystemPerformanceProfile: ...
class MemoryResourceCollector(ResourceCollector):
async def collect(self) -> MemoryPerformanceProfile: ...
class ResourceMonitor:
data_collectors: list[ResourceCollector]
effect_handlers: set[
Callable[[SystemPerformanceProfile | MemoryPerformanceProfile], None]
]
async def _collect(
self,
) -> list[SystemPerformanceProfile | MemoryPerformanceProfile]:
tasks: list[
Coroutine[None, None, SystemPerformanceProfile | MemoryPerformanceProfile]
] = [collector.collect() for collector in self.data_collectors]
return await asyncio.gather(*tasks)
async def collect(self) -> None:
profiles = await self._collect()
for profile in profiles:
for effect_handler in self.effect_handlers:
effect_handler(profile)
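
The collectors above are stubs; a concrete one only needs to implement collect. A hypothetical psutil-backed memory collector wired into the monitor might look like:

import asyncio
import psutil

class PsutilMemoryCollector(MemoryResourceCollector):
    async def collect(self) -> MemoryPerformanceProfile:
        vm, swap = psutil.virtual_memory(), psutil.swap_memory()
        return MemoryPerformanceProfile.from_bytes(
            ram_total=vm.total,
            ram_available=vm.available,
            swap_total=swap.total,
            swap_available=swap.free,
        )

monitor = ResourceMonitor()
monitor.data_collectors = [PsutilMemoryCollector()]
monitor.effect_handlers = {print}  # any callable accepting a profile
asyncio.run(monitor.collect())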

View File

@@ -21,7 +21,15 @@ class BaseRunnerStatus(TaggedModel):
return isinstance(self, RunnerRunning)
class RunnerWaitingForModel(BaseRunnerStatus):
class RunnerIdle(BaseRunnerStatus):
pass
class RunnerConnecting(BaseRunnerStatus):
pass
class RunnerConnected(BaseRunnerStatus):
pass
@@ -54,7 +62,9 @@ class RunnerFailed(BaseRunnerStatus):
RunnerStatus = (
RunnerWaitingForModel
RunnerIdle
| RunnerConnecting
| RunnerConnected
| RunnerLoading
| RunnerLoaded
| RunnerWarmingUp

View File

@@ -1,231 +0,0 @@
import os
import shutil
import sys
import tomllib
from collections.abc import Sequence
from dataclasses import dataclass, field
from subprocess import CalledProcessError
from typing import Self, cast
import anyio
from anyio import create_task_group, open_process
from anyio.abc import TaskGroup
from anyio.streams.buffered import BufferedByteReceiveStream
from anyio.streams.text import TextReceiveStream
from loguru import logger
from exo.shared.constants import EXO_CONFIG_FILE
from exo.shared.types.memory import Memory
from exo.shared.types.profiling import (
MemoryUsage,
NetworkInterfaceInfo,
)
from exo.shared.types.thunderbolt import TBConnection, TBConnectivity, TBIdentifier
from exo.utils.channels import Sender
from exo.utils.pydantic_ext import TaggedModel
from .macmon import MacmonMetrics
from .system_info import get_friendly_name, get_model_and_chip, get_network_interfaces
IS_DARWIN = sys.platform == "darwin"
class StaticNodeInformation(TaggedModel):
"""Node information that should NEVER change, to be gathered once at startup"""
model: str
chip: str
@classmethod
async def gather(cls) -> Self:
model, chip = await get_model_and_chip()
return cls(model=model, chip=chip)
class NodeNetworkInterfaces(TaggedModel):
ifaces: Sequence[NetworkInterfaceInfo]
class MacTBIdentifiers(TaggedModel):
idents: Sequence[TBIdentifier]
class MacTBConnections(TaggedModel):
conns: Sequence[TBConnection]
class NodeConfig(TaggedModel):
"""Node configuration from EXO_CONFIG_FILE, reloaded from the file only at startup. Other changes should come in through the API and propagate from there"""
# TODO
@classmethod
async def gather(cls) -> Self | None:
cfg_file = anyio.Path(EXO_CONFIG_FILE)
await cfg_file.touch(exist_ok=True)
async with await cfg_file.open("rb") as f:
try:
contents = (await f.read()).decode("utf-8")
data = tomllib.loads(contents)
return cls.model_validate(data)
except (tomllib.TOMLDecodeError, UnicodeDecodeError):
logger.warning("Invalid config file, skipping...")
return None
class MiscData(TaggedModel):
"""Node information that may slowly change that doesn't fall into the other categories"""
friendly_name: str
@classmethod
async def gather(cls) -> Self:
return cls(friendly_name=await get_friendly_name())
async def _gather_iface_map() -> dict[str, str] | None:
proc = await anyio.run_process(
["networksetup", "-listallhardwareports"], check=False
)
if proc.returncode != 0:
return None
ports: dict[str, str] = {}
port = ""
for line in proc.stdout.decode("utf-8").split("\n"):
if line.startswith("Hardware Port:"):
port = line.split(": ")[1]
elif line.startswith("Device:"):
ports[port] = line.split(": ")[1]
port = ""
if "" in ports:
del ports[""]
return ports
GatheredInfo = (
MacmonMetrics
| MemoryUsage
| NodeNetworkInterfaces
| MacTBIdentifiers
| MacTBConnections
| NodeConfig
| MiscData
| StaticNodeInformation
)
@dataclass
class InfoGatherer:
info_sender: Sender[GatheredInfo]
interface_watcher_interval: float | None = 10
misc_poll_interval: float | None = 60
system_profiler_interval: float | None = 5 if IS_DARWIN else None
memory_poll_rate: float | None = None if IS_DARWIN else 1
macmon_interval: float | None = 1 if IS_DARWIN else None
_tg: TaskGroup = field(init=False, default_factory=create_task_group)
async def run(self):
async with self._tg as tg:
if (macmon_path := shutil.which("macmon")) is not None:
tg.start_soon(self._monitor_macmon, macmon_path)
if IS_DARWIN:
tg.start_soon(self._monitor_system_profiler)
tg.start_soon(self._watch_system_info)
tg.start_soon(self._monitor_memory_usage)
tg.start_soon(self._monitor_misc)
nc = await NodeConfig.gather()
if nc is not None:
await self.info_sender.send(nc)
sni = await StaticNodeInformation.gather()
await self.info_sender.send(sni)
def shutdown(self):
self._tg.cancel_scope.cancel()
async def _monitor_misc(self):
if self.misc_poll_interval is None:
return
prev = await MiscData.gather()
while True:
curr = await MiscData.gather()
if prev != curr:
prev = curr
await self.info_sender.send(curr)
await anyio.sleep(self.misc_poll_interval)
async def _monitor_system_profiler(self):
if self.system_profiler_interval is None:
return
iface_map = await _gather_iface_map()
if iface_map is None:
return
old_idents = []
while True:
data = await TBConnectivity.gather()
assert data is not None
idents = [it for i in data if (it := i.ident(iface_map)) is not None]
if idents != old_idents:
await self.info_sender.send(MacTBIdentifiers(idents=idents))
old_idents = idents
conns = [it for i in data if (it := i.conn()) is not None]
await self.info_sender.send(MacTBConnections(conns=conns))
await anyio.sleep(self.system_profiler_interval)
async def _monitor_memory_usage(self):
override_memory_env = os.getenv("OVERRIDE_MEMORY_MB")
override_memory: int | None = (
Memory.from_mb(int(override_memory_env)).in_bytes
if override_memory_env
else None
)
if self.memory_poll_rate is None:
return
while True:
await self.info_sender.send(
MemoryUsage.from_psutil(override_memory=override_memory)
)
await anyio.sleep(self.memory_poll_rate)
async def _watch_system_info(self):
if self.interface_watcher_interval is None:
return
old_nics = []
while True:
nics = get_network_interfaces()
if nics != old_nics:
old_nics = nics
await self.info_sender.send(NodeNetworkInterfaces(ifaces=nics))
await anyio.sleep(self.interface_watcher_interval)
async def _monitor_macmon(self, macmon_path: str):
if self.macmon_interval is None:
return
# macmon pipe --interval [interval in ms]
try:
async with await open_process(
[macmon_path, "pipe", "--interval", str(self.macmon_interval * 1000)]
) as p:
if not p.stdout:
logger.critical("MacMon closed stdout")
return
async for text in TextReceiveStream(
BufferedByteReceiveStream(p.stdout)
):
await self.info_sender.send(MacmonMetrics.from_raw_json(text))
except CalledProcessError as e:
stderr_msg = "no stderr"
stderr_output = cast(bytes | str | None, e.stderr)
if stderr_output is not None:
stderr_msg = (
stderr_output.decode()
if isinstance(stderr_output, bytes)
else str(stderr_output)
)
logger.warning(
f"MacMon failed with return code {e.returncode}: {stderr_msg}"
)

View File

@@ -1,70 +0,0 @@
from typing import Self
from pydantic import BaseModel
from exo.shared.types.profiling import MemoryUsage, SystemPerformanceProfile
from exo.utils.pydantic_ext import TaggedModel
class _TempMetrics(BaseModel, extra="ignore"):
"""Temperature-related metrics returned by macmon."""
cpu_temp_avg: float
gpu_temp_avg: float
class _MemoryMetrics(BaseModel, extra="ignore"):
"""Memory-related metrics returned by macmon."""
ram_total: int
ram_usage: int
swap_total: int
swap_usage: int
class RawMacmonMetrics(BaseModel, extra="ignore"):
"""Complete set of metrics returned by macmon.
Unknown fields are ignored for forward-compatibility.
"""
timestamp: str # ignored
temp: _TempMetrics
memory: _MemoryMetrics
ecpu_usage: tuple[int, float] # freq mhz, usage %
pcpu_usage: tuple[int, float] # freq mhz, usage %
gpu_usage: tuple[int, float] # freq mhz, usage %
all_power: float
ane_power: float
cpu_power: float
gpu_power: float
gpu_ram_power: float
ram_power: float
sys_power: float
class MacmonMetrics(TaggedModel):
system_profile: SystemPerformanceProfile
memory: MemoryUsage
@classmethod
def from_raw(cls, raw: RawMacmonMetrics) -> Self:
return cls(
system_profile=SystemPerformanceProfile(
gpu_usage=raw.gpu_usage[1],
temp=raw.temp.gpu_temp_avg,
sys_power=raw.sys_power,
pcpu_usage=raw.pcpu_usage[1],
ecpu_usage=raw.ecpu_usage[1],
),
memory=MemoryUsage.from_bytes(
ram_total=raw.memory.ram_total,
ram_available=(raw.memory.ram_total - raw.memory.ram_usage),
swap_total=raw.memory.swap_total,
swap_available=(raw.memory.swap_total - raw.memory.swap_usage),
),
)
@classmethod
def from_raw_json(cls, json: str) -> Self:
return cls.from_raw(RawMacmonMetrics.model_validate_json(json))

View File

@@ -1,56 +0,0 @@
import socket
from collections.abc import Mapping
from ipaddress import ip_address
from anyio import create_task_group, to_thread
from exo.shared.topology import Topology
from exo.shared.types.common import NodeId
from exo.shared.types.profiling import NodePerformanceProfile
# TODO: ref. api port
async def check_reachability(
target_ip: str, target_node_id: NodeId, out: dict[NodeId, set[str]]
) -> None:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(1) # 1 second timeout
try:
result = await to_thread.run_sync(sock.connect_ex, (target_ip, 52415))
except socket.gaierror:
# seems to throw on ipv6 loopback. oh well
# logger.warning(f"invalid {target_ip=}")
return
finally:
sock.close()
if result == 0:
if target_node_id not in out:
out[target_node_id] = set()
out[target_node_id].add(target_ip)
async def check_reachable(
our_node_id: NodeId,
topology: Topology,
profiles: Mapping[NodeId, NodePerformanceProfile],
) -> Mapping[NodeId, set[str]]:
reachable: dict[NodeId, set[str]] = {}
our_profile = profiles.get(our_node_id, None)
if our_profile is None:
return {}
our_interfaces = our_profile.network_interfaces
async with create_task_group() as tg:
for node_id in topology.list_nodes():
if node_id not in profiles or node_id == our_node_id:
continue
for iface in profiles[node_id].network_interfaces:
if ip_address(iface.ip_address).is_loopback:
# Definitely a loopback address
continue
if iface in our_interfaces:
# Skip duplicates with our own interfaces
continue
tg.start_soon(check_reachability, iface.ip_address, node_id, reachable)
return reachable

View File

@@ -19,20 +19,11 @@ class CamelCaseModel(BaseModel):
alias_generator=to_camel,
validate_by_name=True,
extra="forbid",
# I want to reenable this ASAP, but it's causing an issue with TaskStatus
strict=True,
)
class FrozenModel(BaseModel):
model_config = ConfigDict(
alias_generator=to_camel,
validate_by_name=True,
extra="forbid",
strict=True,
frozen=True,
)
class TaggedModel(CamelCaseModel):
@model_serializer(mode="wrap")
def _serialize(self, handler: SerializerFunctionWrapHandler):

View File

@@ -28,8 +28,9 @@ def bar(send: MpSender[str]):
send.close()
# not async, just want the fail_after
@pytest.mark.anyio
async def test_channel_ipc():
async def test_channel_setup():
with fail_after(0.5):
s, r = mp_channel[str]()
p1 = mp.Process(target=foo, args=(r,))

View File

@@ -10,7 +10,6 @@ KEEP_KV_SIZE: int | None = 1600
QUANTIZE_MODEL_MODE: str | None = "affine"
CACHE_GROUP_SIZE: int = 64
KV_CACHE_BITS: int | None = 8
TEMPERATURE: float = 1.0
# TODO: We should really make this opt-in, but Kimi requires trust_remote_code=True
TRUST_REMOTE_CODE: bool = True

View File

@@ -13,7 +13,6 @@ from mlx_lm.tokenizer_utils import TokenizerWrapper
from exo.worker.engines.mlx.constants import (
CACHE_GROUP_SIZE,
KV_CACHE_BITS,
TEMPERATURE,
TRUST_REMOTE_CODE,
)
@@ -21,6 +20,8 @@ try:
from mlx_lm.tokenizer_utils import load_tokenizer
except ImportError:
from mlx_lm.tokenizer_utils import load as load_tokenizer # type: ignore
import contextlib
import mlx.core as mx
import mlx.nn as nn
from mlx_lm.utils import load_model
@@ -48,6 +49,7 @@ from exo.worker.engines.mlx.auto_parallel import (
)
from exo.worker.runner.bootstrap import logger
Group = mx.distributed.Group
# Needed for 8 bit model
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, 4096))
@@ -67,7 +69,7 @@ def get_weights_size(model_shard_meta: ShardMetadata) -> Memory:
)
def mx_barrier(group: mx.distributed.Group | None = None):
def mx_barrier(group: Group | None = None):
mx.eval(
mx.distributed.all_sum(
mx.array(1.0),
@@ -77,7 +79,7 @@ def mx_barrier(group: mx.distributed.Group | None = None):
)
def broadcast_from_zero(value: int, group: mx.distributed.Group | None = None):
def broadcast_from_zero(value: int, group: Group | None = None):
if group is None:
return value
@@ -99,85 +101,96 @@ class HostList(RootModel[list[str]]):
def mlx_distributed_init(
bound_instance: BoundInstance,
) -> mx.distributed.Group:
) -> Group:
"""
Initialize the MLX distributed
Initialize MLX distributed.
"""
rank = bound_instance.bound_shard.device_rank
logger.info(f"Starting initialization for rank {rank}")
# TODO: singleton instances
match bound_instance.instance:
case MlxRingInstance(hosts=hosts):
hostfile = f"./hosts_{rank}.json"
hosts_json = HostList.from_hosts(hosts).model_dump_json()
coordination_file = None
try:
# TODO: singleton instances
match bound_instance.instance:
case MlxRingInstance(hosts=hosts):
coordination_file = (
f"./hosts_{bound_instance.instance.instance_id}_{rank}.json"
)
hosts_json = HostList.from_hosts(hosts).model_dump_json()
with open(hostfile, "w") as f:
_ = f.write(hosts_json)
with open(coordination_file, "w") as f:
_ = f.write(hosts_json)
logger.info(f"rank {rank} hostfile: {hostfile} hosts: {hosts_json}")
logger.info(
f"rank {rank} hostfile: {coordination_file} hosts: {hosts_json}"
)
os.environ["MLX_HOSTFILE"] = hostfile
os.environ["MLX_RANK"] = str(rank)
os.environ["MLX_RING_VERBOSE"] = "1"
group = mx.distributed.init(backend="ring", strict=True)
os.environ["MLX_HOSTFILE"] = coordination_file
os.environ["MLX_RANK"] = str(rank)
os.environ["MLX_RING_VERBOSE"] = "1"
group = mx.distributed.init(backend="ring", strict=True)
case MlxJacclInstance(
jaccl_devices=jaccl_devices, jaccl_coordinators=jaccl_coordinators
):
# Use RDMA connectivity matrix
devices_file = f"./hosts_{rank}.json"
jaccl_devices_json = json.dumps(jaccl_devices)
case MlxJacclInstance(
ibv_devices=ibv_devices, jaccl_coordinators=jaccl_coordinators
):
# Use RDMA connectivity matrix
coordination_file = (
f"./hosts_{bound_instance.instance.instance_id}_{rank}.json"
)
ibv_devices_json = json.dumps(ibv_devices)
with open(devices_file, "w") as f:
_ = f.write(jaccl_devices_json)
with open(coordination_file, "w") as f:
_ = f.write(ibv_devices_json)
jaccl_coordinator = jaccl_coordinators[bound_instance.bound_node_id]
jaccl_coordinator = jaccl_coordinators[bound_instance.bound_node_id]
logger.info(f"rank {rank} MLX_JACCL_DEVICES: {jaccl_devices_json}")
logger.info(f"rank {rank} MLX_JACCL_COORDINATOR: {jaccl_coordinator}")
os.environ["MLX_JACCL_DEVICES"] = devices_file
os.environ["MLX_RANK"] = str(rank)
os.environ["MLX_JACCL_COORDINATOR"] = jaccl_coordinator
group = mx.distributed.init(backend="jaccl", strict=True)
logger.info(f"rank {rank} MLX_IBV_DEVICES: {ibv_devices_json}")
logger.info(f"rank {rank} MLX_JACCL_COORDINATOR: {jaccl_coordinator}")
os.environ["MLX_IBV_DEVICES"] = coordination_file
os.environ["MLX_RANK"] = str(rank)
os.environ["MLX_JACCL_COORDINATOR"] = jaccl_coordinator
group = mx.distributed.init(backend="jaccl", strict=True)
logger.info(f"Rank {rank} mlx distributed initialization complete")
logger.info(f"Rank {rank} mlx distributed initialization complete")
return group
return group
finally:
with contextlib.suppress(FileNotFoundError):
if coordination_file:
os.remove(coordination_file)
def initialize_mlx(
bound_instance: BoundInstance,
) -> tuple[Model, TokenizerWrapper, Callable[[mx.array], mx.array]]:
"""
Initialize the MLX model, tokenizer, and sampler. Runs in the MLX thread.
"""
) -> Group:
# should we unseed it?
# TODO: pass in seed from params
mx.random.seed(42)
set_wired_limit_for_model(get_weights_size(bound_instance.bound_shard))
assert len(bound_instance.instance.shard_assignments.node_to_runner) > 1, (
"Tried to initialize mlx for a single node instance"
)
return mlx_distributed_init(bound_instance)
sampler: Callable[[mx.array], mx.array] = make_sampler(temp=TEMPERATURE)
def load_mlx_items(
bound_instance: BoundInstance, group: Group | None
) -> tuple[Model, TokenizerWrapper, Callable[[mx.array], mx.array]]:
# TODO: pass temperature
sampler: Callable[[mx.array], mx.array] = make_sampler(temp=0.7)
logger.info("Created a sampler")
if len(bound_instance.instance.shard_assignments.node_to_runner) <= 1:
if group is None:
logger.info(f"Single device used for {bound_instance.instance}")
model_path = build_model_path(bound_instance.bound_shard.model_meta.model_id)
start_time = time.perf_counter()
model, _ = load_model(model_path, strict=True)
end_time = time.perf_counter()
logger.info(f"Time taken to load model: {(end_time - start_time):.2f}s")
if hasattr(model, "model") and isinstance(model.model, DeepseekV3Model): # type: ignore
pass
# model, config = quantize_model(
# model, config, group_size=KV_GROUP_SIZE, bits=ATTENTION_KV_BITS, quant_predicate=quant_predicate, mode=QUANTIZE_MODEL_MODE
# )
tokenizer = get_tokenizer(model_path, bound_instance.bound_shard)
else:
logger.info("Starting distributed init")
group = mlx_distributed_init(bound_instance)
start_time = time.perf_counter()
model, tokenizer = shard_and_load(bound_instance.bound_shard, group=group)
end_time = time.perf_counter()
@@ -187,14 +200,12 @@ def initialize_mlx(
set_wired_limit_for_model(get_weights_size(bound_instance.bound_shard))
logger.debug(model)
return cast(Model, model), tokenizer, sampler
def shard_and_load(
shard_metadata: ShardMetadata,
group: mx.distributed.Group,
group: Group,
) -> tuple[nn.Module, TokenizerWrapper]:
model_path = build_model_path(shard_metadata.model_meta.model_id)
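
The race fix in mlx_distributed_init reduces to a write-use-cleanup pattern keyed on instance_id: each concurrent instance writes its own coordination file and removes it however init exits. Distilled, with the exo-specific pieces stubbed out:

import contextlib
import json
import os

def write_coordination_file(instance_id: str, rank: int, payload: object) -> str:
    # unique per instance and rank, so concurrent instances never collide
    path = f"./hosts_{instance_id}_{rank}.json"
    with open(path, "w") as f:
        f.write(json.dumps(payload))
    return path

path = write_coordination_file("inst-abc", 0, [["rdma0", None]])  # illustrative values
try:
    os.environ["MLX_IBV_DEVICES"] = path
    # mx.distributed.init(backend="jaccl", strict=True) would run here
finally:
    with contextlib.suppress(FileNotFoundError):
        os.remove(path)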

View File

@@ -16,13 +16,15 @@ from exo.shared.types.events import (
ForwarderEvent,
IndexedEvent,
NodeDownloadProgress,
NodeGatheredInfo,
NodeMemoryMeasured,
NodePerformanceMeasured,
TaskCreated,
TaskStatusUpdated,
TopologyEdgeCreated,
TopologyEdgeDeleted,
)
from exo.shared.types.multiaddr import Multiaddr
from exo.shared.types.profiling import MemoryPerformanceProfile, NodePerformanceProfile
from exo.shared.types.state import State
from exo.shared.types.tasks import (
CreateRunner,
@@ -31,7 +33,7 @@ from exo.shared.types.tasks import (
Task,
TaskStatus,
)
from exo.shared.types.topology import SocketConnection
from exo.shared.types.topology import Connection
from exo.shared.types.worker.downloads import (
DownloadCompleted,
DownloadOngoing,
@@ -42,14 +44,14 @@ from exo.shared.types.worker.runners import RunnerId
from exo.shared.types.worker.shards import ShardMetadata
from exo.utils.channels import Receiver, Sender, channel
from exo.utils.event_buffer import OrderedBuffer
from exo.utils.info_gatherer.info_gatherer import GatheredInfo, InfoGatherer
from exo.utils.info_gatherer.net_profile import check_reachable
from exo.worker.download.download_utils import (
map_repo_download_progress_to_download_progress_data,
)
from exo.worker.download.shard_downloader import RepoDownloadProgress, ShardDownloader
from exo.worker.plan import plan
from exo.worker.runner.runner_supervisor import RunnerSupervisor
from exo.worker.utils import start_polling_memory_metrics, start_polling_node_metrics
from exo.worker.utils.net_profile import check_reachable
class Worker:
@@ -83,7 +85,7 @@ class Worker:
self.state: State = State()
self.download_status: dict[ShardMetadata, DownloadProgress] = {}
self.runners: dict[RunnerId, RunnerSupervisor] = {}
self._tg: TaskGroup = create_task_group()
self._tg: TaskGroup | None = None
self._nack_cancel_scope: CancelScope | None = None
self._nack_attempts: int = 0
@@ -95,13 +97,37 @@ class Worker:
async def run(self):
logger.info("Starting Worker")
info_send, info_recv = channel[GatheredInfo]()
info_gatherer: InfoGatherer = InfoGatherer(info_send)
# TODO: CLEANUP HEADER
async def resource_monitor_callback(
node_performance_profile: NodePerformanceProfile,
) -> None:
await self.event_sender.send(
NodePerformanceMeasured(
node_id=self.node_id,
node_profile=node_performance_profile,
when=str(datetime.now(tz=timezone.utc)),
),
)
async with self._tg as tg:
tg.start_soon(info_gatherer.run)
tg.start_soon(self._forward_info, info_recv)
async def memory_monitor_callback(
memory_profile: MemoryPerformanceProfile,
) -> None:
await self.event_sender.send(
NodeMemoryMeasured(
node_id=self.node_id,
memory=memory_profile,
when=str(datetime.now(tz=timezone.utc)),
)
)
# END CLEANUP
async with create_task_group() as tg:
self._tg = tg
tg.start_soon(self.plan_step)
tg.start_soon(start_polling_node_metrics, resource_monitor_callback)
tg.start_soon(start_polling_memory_metrics, memory_monitor_callback)
tg.start_soon(self._connection_message_event_writer)
tg.start_soon(self._resend_out_for_delivery)
tg.start_soon(self._event_applier)
@@ -114,17 +140,6 @@ class Worker:
for runner in self.runners.values():
runner.shutdown()
async def _forward_info(self, recv: Receiver[GatheredInfo]):
with recv as info_stream:
async for info in info_stream:
await self.event_sender.send(
NodeGatheredInfo(
node_id=self.node_id,
when=str(datetime.now(tz=timezone.utc)),
info=info,
)
)
async def _event_applier(self):
with self.global_event_receiver as events:
async for f_event in events:
@@ -144,6 +159,7 @@ class Worker:
self._nack_cancel_scope is None
or self._nack_cancel_scope.cancel_called
):
assert self._tg
# Request the next index.
self._tg.start_soon(
self._nack_request, self.state.last_event_applied_idx + 1
@@ -212,7 +228,7 @@ class Worker:
)
)
else:
self.event_sender.send_nowait(
await self.event_sender.send(
TaskStatusUpdated(
task_id=task.task_id, task_status=TaskStatus.Running
)
@@ -232,7 +248,8 @@ class Worker:
await self.runners[self._task_to_runner_id(task)].start_task(task)
def shutdown(self):
self._tg.cancel_scope.cancel()
if self._tg:
self._tg.cancel_scope.cancel()
def _task_to_runner_id(self, task: Task):
instance = self.state.instances[task.instance_id]
@@ -249,24 +266,24 @@ class Worker:
match msg.connection_type:
case ConnectionMessageType.Connected:
return TopologyEdgeCreated(
source=self.node_id,
sink=msg.node_id,
edge=SocketConnection(
sink_multiaddr=Multiaddr(
edge=Connection(
local_node_id=self.node_id,
send_back_node_id=msg.node_id,
send_back_multiaddr=Multiaddr(
address=f"/ip4/{msg.remote_ipv4}/tcp/{msg.remote_tcp_port}"
),
),
)
)
case ConnectionMessageType.Disconnected:
return TopologyEdgeDeleted(
source=self.node_id,
sink=msg.node_id,
edge=SocketConnection(
sink_multiaddr=Multiaddr(
edge=Connection(
local_node_id=self.node_id,
send_back_node_id=msg.node_id,
send_back_multiaddr=Multiaddr(
address=f"/ip4/{msg.remote_ipv4}/tcp/{msg.remote_tcp_port}"
),
),
)
)
async def _nack_request(self, since_idx: int) -> None:
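With this change a topology edge is self-describing: `Connection` carries the local node, the peer, and the multiaddr the peer can be reached back on, so `TopologyEdgeCreated`/`TopologyEdgeDeleted` no longer need separate `source`/`sink` fields. A simplified sketch of the shape, with stand-in dataclasses (the real types live in `exo.shared.types` and carry more fields and validation):

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class Multiaddr:
    # e.g. "/ip4/192.168.1.7/tcp/52415"
    address: str

@dataclass(frozen=True)
class Connection:
    local_node_id: str
    send_back_node_id: str
    send_back_multiaddr: Multiaddr

# The edge alone identifies both endpoints:
edge = Connection(
    local_node_id="node-a",
    send_back_node_id="node-b",
    send_back_multiaddr=Multiaddr(address="/ip4/192.168.1.7/tcp/52415"),
)
assert edge.send_back_node_id == "node-b"
```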
@@ -315,6 +332,7 @@ class Worker:
event_sender=self.event_sender.clone(),
)
self.runners[task.bound_instance.bound_runner_id] = runner
assert self._tg
self._tg.start_soon(runner.run)
return runner
@@ -373,6 +391,7 @@ class Worker:
last_progress_time = current_time()
self.shard_downloader.on_progress(download_progress_callback)
assert self._tg
self._tg.start_soon(self.shard_downloader.ensure_shard, task.shard_metadata)
async def _forward_events(self) -> None:
@@ -395,35 +414,28 @@ class Worker:
while True:
# TODO: EdgeDeleted
edges = set(self.state.topology.list_connections())
conns = await check_reachable(
self.node_id, self.state.topology, self.state.node_profiles
)
conns = await check_reachable(self.state.topology)
for nid in conns:
for ip in conns[nid]:
edge = SocketConnection(
edge = Connection(
local_node_id=self.node_id,
send_back_node_id=nid,
# nonsense multiaddr
sink_multiaddr=Multiaddr(address=f"/ip4/{ip}/tcp/52415")
send_back_multiaddr=Multiaddr(address=f"/ip4/{ip}/tcp/52415")
if "." in ip
# nonsense multiaddr
else Multiaddr(address=f"/ip6/{ip}/tcp/52415"),
)
if edge not in edges:
logger.debug(f"ping discovered {edge=}")
await self.event_sender.send(
TopologyEdgeCreated(
source=self.node_id, sink=nid, edge=edge
)
)
await self.event_sender.send(TopologyEdgeCreated(edge=edge))
for nid, conn in self.state.topology.out_edges(self.node_id):
if not isinstance(conn, SocketConnection):
continue
if nid not in conns or conn.sink_multiaddr.ip_address not in conns.get(
nid, set()
if (
nid not in conns
or conn.send_back_multiaddr.ip_address not in conns.get(nid, set())
):
logger.debug(f"ping failed to discover {conn=}")
await self.event_sender.send(
TopologyEdgeDeleted(source=self.node_id, sink=nid, edge=conn)
)
await self.event_sender.send(TopologyEdgeDeleted(edge=conn))
await anyio.sleep(10)
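The loop reconciles known topology edges against a fresh reachability probe every 10 seconds: probed-but-unknown addresses become `TopologyEdgeCreated`, known-but-unreachable ones become `TopologyEdgeDeleted`. The core set logic, sketched with plain `(node_id, ip)` tuples in place of the real connection and event types:

```python
def reconcile(
    known: set[tuple[str, str]],     # (node_id, ip) edges already tracked
    reachable: dict[str, set[str]],  # node_id -> ips that answered the probe
) -> tuple[set[tuple[str, str]], set[tuple[str, str]]]:
    """Return (edges to create, edges to delete)."""
    probed = {(nid, ip) for nid, ips in reachable.items() for ip in ips}
    to_create = probed - known
    to_delete = {
        (nid, ip) for nid, ip in known if ip not in reachable.get(nid, set())
    }
    return to_create, to_delete

create, delete = reconcile(
    known={("node-b", "10.0.0.2"), ("node-c", "10.0.0.3")},
    reachable={"node-b": {"10.0.0.2", "10.0.0.9"}},
)
assert create == {("node-b", "10.0.0.9")}
assert delete == {("node-c", "10.0.0.3")}
```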


@@ -5,6 +5,7 @@ from collections.abc import Mapping, Sequence
from exo.shared.types.common import NodeId
from exo.shared.types.tasks import (
ChatCompletion,
ConnectToGroup,
CreateRunner,
DownloadModel,
LoadModel,
@@ -14,17 +15,23 @@ from exo.shared.types.tasks import (
TaskId,
TaskStatus,
)
from exo.shared.types.worker.downloads import DownloadCompleted, DownloadProgress
from exo.shared.types.worker.downloads import (
DownloadCompleted,
DownloadOngoing,
DownloadProgress,
)
from exo.shared.types.worker.instances import BoundInstance, Instance, InstanceId
from exo.shared.types.worker.runners import (
RunnerConnected,
RunnerConnecting,
RunnerFailed,
RunnerId,
RunnerIdle,
RunnerLoaded,
RunnerLoading,
RunnerReady,
RunnerRunning,
RunnerStatus,
RunnerWaitingForModel,
RunnerWarmingUp,
)
from exo.shared.types.worker.shards import ShardMetadata
@@ -48,6 +55,7 @@ def plan(
_kill_runner(runners, all_runners, instances)
or _create_runner(node_id, runners, instances)
or _model_needs_download(runners, download_status)
or _init_distributed_backend(runners, all_runners)
or _load_model(runners, all_runners, global_download_status)
or _ready_to_warmup(runners, all_runners)
or _pending_tasks(runners, tasks, all_runners)
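Because `plan()` chains its helpers with `or`, the first one to return a task wins and the rest are skipped, so inserting `_init_distributed_backend` between download and load sets the task priority order directly. A minimal sketch of the dispatch pattern (hypothetical check names and a string stand-in for the real task union):

```python
Task = str  # stand-in for the real Task union

def _needs_download(state: dict[str, bool]) -> Task | None:
    return "DownloadModel" if not state.get("downloaded") else None

def _needs_connect(state: dict[str, bool]) -> Task | None:
    return "ConnectToGroup" if not state.get("connected") else None

def _needs_load(state: dict[str, bool]) -> Task | None:
    return "LoadModel" if not state.get("loaded") else None

def plan(state: dict[str, bool]) -> Task | None:
    # `or` short-circuits: the highest-priority non-None result wins.
    return _needs_download(state) or _needs_connect(state) or _needs_load(state)

assert plan({}) == "DownloadModel"
assert plan({"downloaded": True}) == "ConnectToGroup"
assert plan({"downloaded": True, "connected": True}) == "LoadModel"
assert plan({"downloaded": True, "connected": True, "loaded": True}) is None
```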
@@ -106,9 +114,11 @@ def _model_needs_download(
download_status: Mapping[ShardMetadata, DownloadProgress],
) -> DownloadModel | None:
for runner in runners.values():
if (
isinstance(runner.status, RunnerWaitingForModel)
and runner.bound_instance.bound_shard not in download_status
if isinstance(runner.status, RunnerIdle) and (
not isinstance(
download_status.get(runner.bound_instance.bound_shard, None),
(DownloadOngoing, DownloadCompleted),
)
):
# We don't invalidate download_status randomly in case a file gets deleted on disk
return DownloadModel(
@@ -117,14 +127,54 @@ def _model_needs_download(
)
""" --- TODO!
def _init_backend(
def _init_distributed_backend(
runners: Mapping[RunnerId, RunnerSupervisor],
all_runners: Mapping[RunnerId, RunnerStatus],
) -> LoadModel | None:
for runner in runner.values()
pass
"""
):
for runner in runners.values():
instance = runner.bound_instance.instance
shard_assignments = instance.shard_assignments
is_single_node_instance = len(shard_assignments.runner_to_shard) == 1
if is_single_node_instance:
continue
runner_is_idle = isinstance(runner.status, RunnerIdle)
all_runners_connecting = all(
isinstance(
all_runners.get(global_runner_id),
(RunnerConnecting, RunnerIdle),
)
for global_runner_id in shard_assignments.runner_to_shard
)
if not (runner_is_idle and all_runners_connecting):
continue
runner_id = runner.bound_instance.bound_runner_id
shard = runner.bound_instance.bound_shard
device_rank = shard.device_rank
world_size = shard.world_size
assert device_rank < world_size
assert device_rank >= 0
accepting_ranks = device_rank < world_size - 1
# Rank = n-1
connecting_rank_ready = device_rank == world_size - 1 and all(
isinstance(all_runners.get(global_runner_id, None), RunnerConnecting)
for global_runner_id in shard_assignments.runner_to_shard
if global_runner_id != runner_id
)
if not (accepting_ranks or connecting_rank_ready):
continue
return ConnectToGroup(instance_id=instance.instance_id)
return None
def _load_model(
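The gating admits ranks `0..n-2` into the connect phase as soon as their own runner is idle, while the last rank waits until every peer is already `RunnerConnecting`; single-node instances skip the phase entirely. A pure-function sketch of that predicate, with status strings standing in for the real status classes:

```python
def may_connect(device_rank: int, world_size: int, statuses: dict[int, str]) -> bool:
    """statuses maps rank -> current status for every rank in the group."""
    assert 0 <= device_rank < world_size
    if statuses[device_rank] != "idle":
        return False
    # The whole group must be idle or already connecting.
    if not all(s in ("idle", "connecting") for s in statuses.values()):
        return False
    if device_rank < world_size - 1:
        # Accepting ranks can start listening straight away.
        return True
    # The last rank dials out, so it waits for all peers to be connecting.
    return all(s == "connecting" for r, s in statuses.items() if r != device_rank)

assert may_connect(0, 3, {0: "idle", 1: "idle", 2: "idle"})
assert not may_connect(2, 3, {0: "idle", 1: "connecting", 2: "idle"})
assert may_connect(2, 3, {0: "connecting", 1: "connecting", 2: "idle"})
```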
@@ -136,31 +186,33 @@ def _load_model(
instance = runner.bound_instance.instance
shard_assignments = instance.shard_assignments
all_downloads_complete_local = all(
all_local_downloads_complete = all(
nid in global_download_status
and any(
isinstance(dp, DownloadCompleted)
and dp.shard_metadata == shard_assignments.runner_to_shard[rid]
and dp.shard_metadata.model_meta.model_id == shard_assignments.model_id
for dp in global_download_status[nid]
)
for nid, rid in shard_assignments.node_to_runner.items()
for nid in shard_assignments.node_to_runner
)
if not all_local_downloads_complete:
continue
runner_is_waiting = isinstance(runner.status, RunnerWaitingForModel)
is_single_node_instance = len(instance.shard_assignments.runner_to_shard) == 1
if is_single_node_instance and isinstance(runner.status, RunnerIdle):
return LoadModel(instance_id=instance.instance_id)
all_runners_expecting_model = all(
is_runner_waiting = isinstance(runner.status, RunnerConnected)
all_ready_for_model = all(
isinstance(
all_runners.get(global_runner_id),
(RunnerWaitingForModel, RunnerLoading, RunnerLoaded),
all_runners.get(global_runner_id, None),
(RunnerConnected, RunnerLoading, RunnerLoaded),
)
for global_runner_id in shard_assignments.runner_to_shard
)
if (
all_downloads_complete_local
and runner_is_waiting
and all_runners_expecting_model
):
if is_runner_waiting and all_ready_for_model:
return LoadModel(instance_id=instance.instance_id)
return None
@@ -183,8 +235,9 @@ def _ready_to_warmup(
assert device_rank < world_size
assert device_rank >= 0
# Rank != n-1
accepting_ranks_ready = device_rank != world_size - 1 and all(
# TODO: Ensure these align with MLX distributed's expectations.
# Rank != 0
accepting_ranks_ready = device_rank > 0 and all(
isinstance(
all_runners.get(global_runner_id, None),
(RunnerLoaded, RunnerWarmingUp),
@@ -192,8 +245,8 @@ def _ready_to_warmup(
for global_runner_id in shard_assignments.runner_to_shard
)
# Rank = n-1
connecting_rank_ready = device_rank == world_size - 1 and all(
# Rank = 0
connecting_rank_ready = device_rank == 0 and all(
isinstance(all_runners.get(global_runner_id, None), RunnerWarmingUp)
for global_runner_id in shard_assignments.runner_to_shard
if global_runner_id != runner_id
@@ -221,6 +274,8 @@ def _pending_tasks(
if task.instance_id != runner.bound_instance.instance.instance_id:
continue
# TODO: Check ordering aligns with MLX distributed's expectations.
if isinstance(runner.status, RunnerReady) and all(
isinstance(all_runners[global_runner_id], (RunnerReady, RunnerRunning))
for global_runner_id in runner.bound_instance.instance.shard_assignments.runner_to_shard


@@ -22,7 +22,7 @@ def entrypoint(
) -> None:
if (
isinstance(bound_instance.instance, MlxJacclInstance)
and len(bound_instance.instance.jaccl_devices) >= 2
and len(bound_instance.instance.ibv_devices) >= 2
):
os.environ["MLX_METAL_FAST_SYNCH"] = "1"


@@ -11,6 +11,7 @@ from exo.shared.types.events import (
)
from exo.shared.types.tasks import (
ChatCompletion,
ConnectToGroup,
LoadModel,
Shutdown,
StartWarmup,
@@ -22,20 +23,23 @@ from exo.shared.types.worker.runner_response import (
GenerationResponse,
)
from exo.shared.types.worker.runners import (
RunnerConnected,
RunnerConnecting,
RunnerFailed,
RunnerIdle,
RunnerLoaded,
RunnerLoading,
RunnerReady,
RunnerRunning,
RunnerShutdown,
RunnerStatus,
RunnerWaitingForModel,
RunnerWarmingUp,
)
from exo.utils.channels import ClosedResourceError, MpReceiver, MpSender
from exo.worker.engines.mlx.generator.generate import mlx_generate, warmup_inference
from exo.worker.engines.mlx.utils_mlx import (
initialize_mlx,
load_mlx_items,
mlx_force_oom,
)
from exo.worker.runner.bootstrap import logger
@@ -63,9 +67,10 @@ def main(
model = None
tokenizer = None
sampler = None
group = None
current_status: RunnerStatus = RunnerWaitingForModel()
logger.info("runner waiting for model")
current_status: RunnerStatus = RunnerIdle()
logger.info("runner created")
event_sender.send(
RunnerStatusUpdated(runner_id=runner_id, runner_status=current_status)
)
@@ -78,9 +83,26 @@ def main(
)
event_sender.send(TaskAcknowledged(task_id=task.task_id))
match task:
case LoadModel() if isinstance(
current_status, (RunnerWaitingForModel, RunnerFailed)
case ConnectToGroup() if isinstance(
current_status, (RunnerIdle, RunnerFailed)
):
logger.info("runner connecting")
current_status = RunnerConnecting()
event_sender.send(
RunnerStatusUpdated(
runner_id=runner_id, runner_status=current_status
)
)
group = initialize_mlx(bound_instance)
logger.info("runner connected")
current_status = RunnerConnected()
# We load the model if the runner is connected with a group, or idle without one; we should never tell a runner to connect when it doesn't need to.
case LoadModel() if (
isinstance(current_status, RunnerConnected)
and group is not None
) or (isinstance(current_status, RunnerIdle) and group is None):
current_status = RunnerLoading()
logger.info("runner loading")
event_sender.send(
@@ -89,15 +111,12 @@ def main(
)
)
model, tokenizer, sampler = initialize_mlx(bound_instance)
model, tokenizer, sampler = load_mlx_items(
bound_instance, group
)
current_status = RunnerLoaded()
logger.info("runner loaded")
event_sender.send(
RunnerStatusUpdated(
runner_id=runner_id, runner_status=current_status
)
)
case StartWarmup() if isinstance(current_status, RunnerLoaded):
assert model
assert tokenizer
@@ -123,11 +142,6 @@ def main(
)
current_status = RunnerReady()
logger.info("runner ready")
event_sender.send(
RunnerStatusUpdated(
runner_id=runner_id, runner_status=RunnerReady()
)
)
case ChatCompletion(
task_params=task_params, command_id=command_id
) if isinstance(current_status, RunnerReady):
@@ -172,11 +186,6 @@ def main(
current_status = RunnerReady()
logger.info("runner ready")
event_sender.send(
RunnerStatusUpdated(
runner_id=runner_id, runner_status=RunnerReady()
)
)
case Shutdown():
logger.info("runner shutting down")
event_sender.send(
@@ -186,12 +195,19 @@ def main(
)
break
case _:
raise ValueError("Received task outside of state machine")
raise ValueError(
f"Received {task.__class__.__name__} outside of state machine in {current_status=}"
)
event_sender.send(
TaskStatusUpdated(
task_id=task.task_id, task_status=TaskStatus.Complete
)
)
event_sender.send(
RunnerStatusUpdated(
runner_id=runner_id, runner_status=current_status
)
)
event_sender.send(
RunnerStatusUpdated(runner_id=runner_id, runner_status=RunnerShutdown())
)
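Moving the `RunnerStatusUpdated` emit to after the `match` block means each task's `TaskStatusUpdated(Complete)` is sent before the status transition it caused, which is exactly the ordering the new test later in this diff pins down. The runner's task loop now walks a fixed state machine; a compact sketch of the multi-node transitions (single-node instances load directly from idle, which this omits):

```python
# task -> (statuses it is accepted in, status it leaves the runner in)
TRANSITIONS: dict[str, tuple[tuple[str, ...], str]] = {
    "ConnectToGroup": (("idle", "failed"), "connected"),
    "LoadModel":      (("connected",),     "loaded"),
    "StartWarmup":    (("loaded",),        "ready"),
    "ChatCompletion": (("ready",),         "ready"),
}

def step(status: str, task: str) -> str:
    allowed, after = TRANSITIONS[task]
    if status not in allowed:
        raise ValueError(f"Received {task} outside of state machine in {status=}")
    return after

status = "idle"
for task in ("ConnectToGroup", "LoadModel", "StartWarmup", "ChatCompletion"):
    status = step(status, task)
assert status == "ready"
```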


@@ -19,8 +19,8 @@ from exo.shared.types.tasks import Task, TaskId
from exo.shared.types.worker.instances import BoundInstance
from exo.shared.types.worker.runners import (
RunnerFailed,
RunnerIdle,
RunnerStatus,
RunnerWaitingForModel,
)
from exo.shared.types.worker.shards import ShardMetadata
from exo.utils.channels import MpReceiver, MpSender, Sender, mp_channel
@@ -41,7 +41,7 @@ class RunnerSupervisor:
_event_sender: Sender[Event]
# err_path: str
_tg: TaskGroup | None = field(default=None, init=False)
status: RunnerStatus = field(default_factory=RunnerWaitingForModel, init=False)
status: RunnerStatus = field(default_factory=RunnerIdle, init=False)
pending: dict[TaskId, anyio.Event] = field(default_factory=dict, init=False)
@classmethod


@@ -24,3 +24,9 @@ TASK_2_ID: Final[TaskId] = TaskId("66666666-6666-4666-8666-666666666666")
COMMAND_1_ID: Final[CommandId] = CommandId("77777777-7777-4777-8777-777777777777")
COMMAND_2_ID: Final[CommandId] = CommandId("88888888-8888-4888-8888-888888888888")
SHUTDOWN_TASK_ID = TaskId("shutdown")
CHAT_COMPLETION_TASK_ID = TaskId("chat-completion")
INITIALIZATION_TASK_ID = TaskId("initialisation")
LOAD_TASK_ID = TaskId("load")
WARMUP_TASK_ID = TaskId("warmup")


@@ -1,3 +1,5 @@
from __future__ import annotations
from dataclasses import dataclass
from exo.shared.types.common import NodeId
@@ -14,6 +16,7 @@ from exo.shared.types.worker.runners import RunnerId, RunnerStatus, ShardAssignm
from exo.shared.types.worker.shards import PipelineShardMetadata, ShardMetadata
# Runner supervisor without multiprocessing logic.
@dataclass(frozen=True)
class FakeRunnerSupervisor:
bound_instance: BoundInstance
@@ -35,6 +38,8 @@ def get_pipeline_shard_metadata(
pretty_name=str(model_id),
storage_size=Memory.from_mb(100000),
n_layers=32,
# hidden_size=2048,
# supports_tensor=False,
),
device_rank=device_rank,
world_size=world_size,
@@ -69,3 +74,24 @@ def get_mlx_ring_instance(
),
hosts=[],
)
def get_bound_mlx_ring_instance(
instance_id: InstanceId, model_id: ModelId, runner_id: RunnerId, node_id: NodeId
) -> BoundInstance:
shard = get_pipeline_shard_metadata(model_id=model_id, device_rank=0, world_size=2)
other_shard = get_pipeline_shard_metadata(
model_id=model_id, device_rank=1, world_size=2
)
instance = get_mlx_ring_instance(
instance_id=instance_id,
model_id=model_id,
node_to_runner={
node_id: runner_id,
NodeId("other_node"): RunnerId("other_runner"),
},
runner_to_shard={runner_id: shard, RunnerId("other_runner"): other_shard},
)
return BoundInstance(
instance=instance, bound_runner_id=runner_id, bound_node_id=node_id
)


@@ -4,7 +4,8 @@ from exo.shared.types.tasks import LoadModel
from exo.shared.types.worker.downloads import DownloadCompleted, DownloadProgress
from exo.shared.types.worker.instances import BoundInstance
from exo.shared.types.worker.runners import (
RunnerWaitingForModel,
RunnerConnected,
RunnerIdle,
)
from exo.shared.types.worker.shards import ShardMetadata
from exo.worker.tests.constants import (
@@ -38,13 +39,11 @@ def test_plan_requests_download_when_waiting_and_shard_not_downloaded():
bound_instance = BoundInstance(
instance=instance, bound_runner_id=RUNNER_1_ID, bound_node_id=NODE_A
)
runner = FakeRunnerSupervisor(
bound_instance=bound_instance, status=RunnerWaitingForModel()
)
runner = FakeRunnerSupervisor(bound_instance=bound_instance, status=RunnerIdle())
runners = {RUNNER_1_ID: runner}
instances = {INSTANCE_1_ID: instance}
all_runners = {RUNNER_1_ID: RunnerWaitingForModel()}
all_runners = {RUNNER_1_ID: RunnerIdle()}
# No entry for this shard -> should trigger DownloadModel
download_status: dict[ShardMetadata, DownloadProgress] = {}
@@ -82,15 +81,15 @@ def test_plan_loads_model_when_all_shards_downloaded_and_waiting():
instance=instance, bound_runner_id=RUNNER_1_ID, bound_node_id=NODE_A
)
local_runner = FakeRunnerSupervisor(
bound_instance=bound_instance, status=RunnerWaitingForModel()
bound_instance=bound_instance, status=RunnerConnected()
)
runners = {RUNNER_1_ID: local_runner}
instances = {INSTANCE_1_ID: instance}
all_runners = {
RUNNER_1_ID: RunnerWaitingForModel(),
RUNNER_2_ID: RunnerWaitingForModel(),
RUNNER_1_ID: RunnerConnected(),
RUNNER_2_ID: RunnerConnected(),
}
# Local node has already marked its shard as downloaded (not actually used by _load_model)
@@ -133,13 +132,11 @@ def test_plan_does_not_request_download_when_shard_already_downloaded():
bound_instance = BoundInstance(
instance=instance, bound_runner_id=RUNNER_1_ID, bound_node_id=NODE_A
)
runner = FakeRunnerSupervisor(
bound_instance=bound_instance, status=RunnerWaitingForModel()
)
runner = FakeRunnerSupervisor(bound_instance=bound_instance, status=RunnerIdle())
runners = {RUNNER_1_ID: runner}
instances = {INSTANCE_1_ID: instance}
all_runners = {RUNNER_1_ID: RunnerWaitingForModel()}
all_runners = {RUNNER_1_ID: RunnerIdle()}
# Local status claims the shard is downloaded already
local_download_status = {
@@ -183,14 +180,14 @@ def test_plan_does_not_load_model_until_all_shards_downloaded_globally():
instance=instance, bound_runner_id=RUNNER_1_ID, bound_node_id=NODE_A
)
local_runner = FakeRunnerSupervisor(
bound_instance=bound_instance, status=RunnerWaitingForModel()
bound_instance=bound_instance, status=RunnerConnected()
)
runners = {RUNNER_1_ID: local_runner}
instances = {INSTANCE_1_ID: instance}
all_runners = {
RUNNER_1_ID: RunnerWaitingForModel(),
RUNNER_2_ID: RunnerWaitingForModel(),
RUNNER_1_ID: RunnerConnected(),
RUNNER_2_ID: RunnerConnected(),
}
# Only NODE_A's shard is recorded as downloaded globally
@@ -213,3 +210,22 @@ def test_plan_does_not_load_model_until_all_shards_downloaded_globally():
)
assert result is None
global_download_status = {
NODE_A: [DownloadCompleted(shard_metadata=shard1, node_id=NODE_A)],
NODE_B: [
DownloadCompleted(shard_metadata=shard2, node_id=NODE_B)
], # NODE_B's download is now complete as well
}
result = plan_mod.plan(
node_id=NODE_A,
runners=runners, # type: ignore
download_status=local_download_status,
global_download_status=global_download_status,
instances=instances,
all_runners=all_runners,
tasks={},
)
assert result is not None


@@ -5,9 +5,9 @@ from exo.shared.types.api import ChatCompletionTaskParams
from exo.shared.types.tasks import ChatCompletion, Task, TaskId, TaskStatus
from exo.shared.types.worker.instances import BoundInstance, InstanceId
from exo.shared.types.worker.runners import (
RunnerIdle,
RunnerReady,
RunnerRunning,
RunnerWaitingForModel,
)
from exo.worker.tests.constants import (
COMMAND_1_ID,
@@ -99,7 +99,7 @@ def test_plan_does_not_forward_chat_completion_if_any_runner_not_ready():
instances = {INSTANCE_1_ID: instance}
all_runners = {
RUNNER_1_ID: RunnerReady(),
RUNNER_2_ID: RunnerWaitingForModel(),
RUNNER_2_ID: RunnerIdle(),
}
task = ChatCompletion(


@@ -2,8 +2,8 @@ import exo.worker.plan as plan_mod
from exo.shared.types.tasks import StartWarmup
from exo.shared.types.worker.instances import BoundInstance
from exo.shared.types.worker.runners import (
RunnerIdle,
RunnerLoaded,
RunnerWaitingForModel,
RunnerWarmingUp,
)
from exo.worker.tests.constants import (
@@ -128,7 +128,7 @@ def test_plan_does_not_start_warmup_for_non_zero_rank_until_all_loaded_or_warmin
runners = {RUNNER_2_ID: local_runner}
instances = {INSTANCE_1_ID: instance}
all_runners = {
RUNNER_1_ID: RunnerWaitingForModel(),
RUNNER_1_ID: RunnerIdle(),
RUNNER_2_ID: RunnerLoaded(),
}


@@ -0,0 +1,208 @@
# Check that tasks complete before the runner is ever marked ready.
from collections.abc import Iterable
from typing import Callable
import pytest
import exo.worker.runner.runner as mlx_runner
from exo.shared.types.api import ChatCompletionMessage
from exo.shared.types.chunks import TokenChunk
from exo.shared.types.events import (
ChunkGenerated,
Event,
RunnerStatusUpdated,
TaskAcknowledged,
TaskStatusUpdated,
)
from exo.shared.types.tasks import (
ChatCompletion,
ChatCompletionTaskParams,
ConnectToGroup,
LoadModel,
Shutdown,
StartWarmup,
Task,
TaskStatus,
)
from exo.shared.types.worker.runner_response import GenerationResponse
from exo.shared.types.worker.runners import (
RunnerConnected,
RunnerConnecting,
RunnerIdle,
RunnerLoaded,
RunnerLoading,
RunnerReady,
RunnerRunning,
RunnerShutdown,
RunnerWarmingUp,
)
from exo.utils.channels import mp_channel
from ...constants import (
CHAT_COMPLETION_TASK_ID,
COMMAND_1_ID,
INITIALIZATION_TASK_ID,
INSTANCE_1_ID,
LOAD_TASK_ID,
MODEL_A_ID,
NODE_A,
RUNNER_1_ID,
SHUTDOWN_TASK_ID,
WARMUP_TASK_ID,
)
from ..conftest import get_bound_mlx_ring_instance
def make_nothin[T, U, V](res: T) -> Callable[[], T]:
def nothin(*_1: U, **_2: V) -> T:
return res
return nothin
nothin = make_nothin(None)
INIT_TASK = ConnectToGroup(
task_id=INITIALIZATION_TASK_ID,
instance_id=INSTANCE_1_ID,
)
LOAD_TASK = LoadModel(
task_id=LOAD_TASK_ID,
instance_id=INSTANCE_1_ID,
)
WARMUP_TASK = StartWarmup(
task_id=WARMUP_TASK_ID,
instance_id=INSTANCE_1_ID,
)
SHUTDOWN_TASK = Shutdown(
task_id=SHUTDOWN_TASK_ID,
instance_id=INSTANCE_1_ID,
runner_id=RUNNER_1_ID,
)
CHAT_PARAMS = ChatCompletionTaskParams(
model=str(MODEL_A_ID),
messages=[ChatCompletionMessage(role="user", content="hello")],
stream=True,
max_tokens=4,
temperature=0.0,
)
CHAT_TASK = ChatCompletion(
task_id=CHAT_COMPLETION_TASK_ID,
command_id=COMMAND_1_ID,
task_params=CHAT_PARAMS,
instance_id=INSTANCE_1_ID,
)
def assert_events_equal(test_events: Iterable[Event], true_events: Iterable[Event]):
for test_event, true_event in zip(test_events, true_events, strict=True):
test_event.event_id = true_event.event_id
assert test_event == true_event, f"{test_event} != {true_event}"
@pytest.fixture
def patch_out_mlx(monkeypatch: pytest.MonkeyPatch):
# initialize_mlx returns a "group" equal to 1
monkeypatch.setattr(mlx_runner, "initialize_mlx", make_nothin(1))
monkeypatch.setattr(mlx_runner, "load_mlx_items", make_nothin((1, 1, 1)))
monkeypatch.setattr(mlx_runner, "warmup_inference", make_nothin(1))
monkeypatch.setattr(mlx_runner, "_check_for_debug_prompts", nothin)
def fake_generate(*_1: object, **_2: object):
yield GenerationResponse(token=0, text="hi", finish_reason="stop")
monkeypatch.setattr(mlx_runner, "mlx_generate", fake_generate)
def _run(tasks: Iterable[Task]):
bound_instance = get_bound_mlx_ring_instance(
instance_id=INSTANCE_1_ID,
model_id=MODEL_A_ID,
runner_id=RUNNER_1_ID,
node_id=NODE_A,
)
task_sender, task_receiver = mp_channel[Task]()
event_sender, event_receiver = mp_channel[Event]()
with task_sender, event_receiver:
for t in tasks:
task_sender.send(t)
# worst monkeypatch known to man
# this is some c++ nonsense
event_sender.close = nothin
event_sender.join = nothin
task_receiver.close = nothin
task_receiver.join = nothin
mlx_runner.main(bound_instance, event_sender, task_receiver)
return event_receiver.collect()
def test_events_processed_in_correct_order(patch_out_mlx: pytest.MonkeyPatch):
events = _run([INIT_TASK, LOAD_TASK, WARMUP_TASK, CHAT_TASK, SHUTDOWN_TASK])
expected_chunk = ChunkGenerated(
command_id=COMMAND_1_ID,
chunk=TokenChunk(
idx=0,
model=MODEL_A_ID,
text="hi",
token_id=0,
finish_reason="stop",
),
)
assert_events_equal(
events,
[
RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerIdle()),
TaskStatusUpdated(
task_id=INITIALIZATION_TASK_ID, task_status=TaskStatus.Running
),
TaskAcknowledged(task_id=INITIALIZATION_TASK_ID),
RunnerStatusUpdated(
runner_id=RUNNER_1_ID, runner_status=RunnerConnecting()
),
TaskStatusUpdated(
task_id=INITIALIZATION_TASK_ID, task_status=TaskStatus.Complete
),
RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerConnected()),
TaskStatusUpdated(task_id=LOAD_TASK_ID, task_status=TaskStatus.Running),
TaskAcknowledged(task_id=LOAD_TASK_ID),
RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerLoading()),
TaskStatusUpdated(task_id=LOAD_TASK_ID, task_status=TaskStatus.Complete),
RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerLoaded()),
TaskStatusUpdated(task_id=WARMUP_TASK_ID, task_status=TaskStatus.Running),
TaskAcknowledged(task_id=WARMUP_TASK_ID),
RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerWarmingUp()),
TaskStatusUpdated(task_id=WARMUP_TASK_ID, task_status=TaskStatus.Complete),
RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerReady()),
TaskStatusUpdated(
task_id=CHAT_COMPLETION_TASK_ID, task_status=TaskStatus.Running
),
TaskAcknowledged(task_id=CHAT_COMPLETION_TASK_ID),
RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerRunning()),
expected_chunk,
TaskStatusUpdated(
task_id=CHAT_COMPLETION_TASK_ID, task_status=TaskStatus.Complete
),
# CHAT COMPLETION TASK SHOULD COMPLETE BEFORE RUNNER READY
RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerReady()),
TaskStatusUpdated(task_id=SHUTDOWN_TASK_ID, task_status=TaskStatus.Running),
TaskAcknowledged(task_id=SHUTDOWN_TASK_ID),
TaskStatusUpdated(
task_id=SHUTDOWN_TASK_ID, task_status=TaskStatus.Complete
),
# SPECIAL EXCEPTION FOR RUNNER SHUTDOWN
RunnerStatusUpdated(runner_id=RUNNER_1_ID, runner_status=RunnerShutdown()),
],
)


@@ -0,0 +1 @@
# TODO:


@@ -0,0 +1,6 @@
from .profile import start_polling_memory_metrics, start_polling_node_metrics
__all__ = [
"start_polling_node_metrics",
"start_polling_memory_metrics",
]


@@ -0,0 +1,41 @@
import socket
from anyio import create_task_group, to_thread
from exo.shared.topology import Topology
from exo.shared.types.common import NodeId
# TODO: ref. api port
async def check_reachability(
target_ip: str, target_node_id: NodeId, out: dict[NodeId, set[str]]
) -> None:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(1) # 1 second timeout
try:
result = await to_thread.run_sync(sock.connect_ex, (target_ip, 52415))
except socket.gaierror:
# seems to throw on ipv6 loopback. oh well
# logger.warning(f"invalid {target_ip=}")
return
finally:
sock.close()
if result == 0:
if target_node_id not in out:
out[target_node_id] = set()
out[target_node_id].add(target_ip)
async def check_reachable(topology: Topology) -> dict[NodeId, set[str]]:
reachable: dict[NodeId, set[str]] = {}
async with create_task_group() as tg:
for node in topology.list_nodes():
if not node.node_profile:
continue
for iface in node.node_profile.network_interfaces:
tg.start_soon(
check_reachability, iface.ip_address, node.node_id, reachable
)
return reachable
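Each probe uses `connect_ex`, which returns an error code instead of raising, and runs in a worker thread so a slow peer can't stall the event loop; the task group fans the probes out concurrently. The same pattern reduced to literal IPs (a sketch; 52415 is the hard-coded API port the TODO above refers to):

```python
import socket

import anyio
from anyio import to_thread

async def probe(ip: str, port: int, out: set[str]) -> None:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(1)
    try:
        # connect_ex returns 0 on success rather than raising.
        if await to_thread.run_sync(sock.connect_ex, (ip, port)) == 0:
            out.add(ip)
    finally:
        sock.close()

async def main() -> None:
    reachable: set[str] = set()
    async with anyio.create_task_group() as tg:
        # 192.0.2.0/24 is TEST-NET-1 and should never answer.
        for ip in ("127.0.0.1", "192.0.2.1"):
            tg.start_soon(probe, ip, 52415, reachable)
    print(reachable)

anyio.run(main)
```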


@@ -0,0 +1,114 @@
import asyncio
import os
import platform
from typing import Any, Callable, Coroutine
import anyio
from loguru import logger
from exo.shared.types.memory import Memory
from exo.shared.types.profiling import (
MemoryPerformanceProfile,
NodePerformanceProfile,
SystemPerformanceProfile,
)
from .macmon import (
MacMonError,
Metrics,
)
from .macmon import (
get_metrics_async as macmon_get_metrics_async,
)
from .system_info import (
get_friendly_name,
get_model_and_chip,
get_network_interfaces,
)
async def get_metrics_async() -> Metrics | None:
"""Return detailed Metrics on macOS or a minimal fallback elsewhere."""
if platform.system().lower() == "darwin":
return await macmon_get_metrics_async()
def get_memory_profile() -> MemoryPerformanceProfile:
"""Construct a MemoryPerformanceProfile using psutil"""
override_memory_env = os.getenv("OVERRIDE_MEMORY_MB")
override_memory: int | None = (
Memory.from_mb(int(override_memory_env)).in_bytes
if override_memory_env
else None
)
return MemoryPerformanceProfile.from_psutil(override_memory=override_memory)
async def start_polling_memory_metrics(
callback: Callable[[MemoryPerformanceProfile], Coroutine[Any, Any, None]],
*,
poll_interval_s: float = 0.5,
) -> None:
"""Continuously poll and emit memory-only metrics at a faster cadence.
Parameters
- callback: coroutine called with a fresh MemoryPerformanceProfile each tick
- poll_interval_s: interval between polls
"""
while True:
try:
mem = get_memory_profile()
await callback(mem)
except MacMonError as e:
logger.opt(exception=e).error("Memory Monitor encountered error")
finally:
await anyio.sleep(poll_interval_s)
async def start_polling_node_metrics(
callback: Callable[[NodePerformanceProfile], Coroutine[Any, Any, None]],
):
poll_interval_s = 1.0
while True:
try:
metrics = await get_metrics_async()
if metrics is None:
return
network_interfaces = get_network_interfaces()
# these awaits could be joined but realistically they should be cached
model_id, chip_id = await get_model_and_chip()
friendly_name = await get_friendly_name()
# Take the memory profile last so the reading is fresh and doesn't conflict with the dedicated memory-polling loop
memory_profile = get_memory_profile()
await callback(
NodePerformanceProfile(
model_id=model_id,
chip_id=chip_id,
friendly_name=friendly_name,
network_interfaces=network_interfaces,
memory=memory_profile,
system=SystemPerformanceProfile(
gpu_usage=metrics.gpu_usage[1],
temp=metrics.temp.gpu_temp_avg,
sys_power=metrics.sys_power,
pcpu_usage=metrics.pcpu_usage[1],
ecpu_usage=metrics.ecpu_usage[1],
ane_power=metrics.ane_power,
),
)
)
except asyncio.TimeoutError:
logger.warning(
"[resource_monitor] Operation timed out after 30s, skipping this cycle."
)
except MacMonError as e:
logger.opt(exception=e).error("Resource Monitor encountered error")
return
finally:
await anyio.sleep(poll_interval_s)
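Both pollers follow the same shape: read, emit via the coroutine callback, then sleep in a `finally` so one failed tick never breaks the cadence. The worker wires them up with `tg.start_soon(start_polling_node_metrics, ...)` as shown in the `Worker.run` hunk above; a generic, dependency-free sketch of the loop and its use:

```python
from typing import Any, Callable, Coroutine

import anyio

async def poll(
    read: Callable[[], int],
    callback: Callable[[int], Coroutine[Any, Any, None]],
    poll_interval_s: float = 0.5,
) -> None:
    while True:
        try:
            await callback(read())
        except Exception as exc:  # the real code narrows this to MacMonError
            print(f"poll error: {exc}")
        finally:
            # Sleeping in `finally` keeps the cadence even after errors.
            await anyio.sleep(poll_interval_s)

async def main() -> None:
    readings: list[int] = []

    async def on_reading(value: int) -> None:
        readings.append(value)

    with anyio.move_on_after(0.3):  # the loop is infinite; cancel it after a while
        await poll(lambda: len(readings), on_reading, poll_interval_s=0.05)
    assert readings

anyio.run(main)
```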

uv.lock (generated)

@@ -334,6 +334,7 @@ dependencies = [
{ name = "hypercorn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "loguru", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "mlx", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "mlx", extra = ["cpu"], marker = "sys_platform == 'linux'" },
{ name = "mlx-lm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "networkx", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -374,7 +375,8 @@ requires-dist = [
{ name = "huggingface-hub", specifier = ">=0.33.4" },
{ name = "hypercorn", specifier = ">=0.18.0" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "mlx", specifier = ">=0.30.1" },
{ name = "mlx", marker = "sys_platform != 'linux'", specifier = ">=0.30.1" },
{ name = "mlx", extras = ["cpu"], marker = "sys_platform == 'linux'", specifier = ">=0.30.1" },
{ name = "mlx-lm", specifier = ">=0.28.3" },
{ name = "networkx", specifier = ">=3.5" },
{ name = "protobuf", specifier = ">=6.32.0" },
@@ -801,6 +803,20 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d4/ff/1e1968f107b4221a98dc26832586b1f646b27ddf3e55c95051c09d751f0a/mlx-0.30.1-cp314-cp314-manylinux_2_35_x86_64.whl", hash = "sha256:d18012d5cf0f013bc4a405cfd1e9d2d28e798f4d2dc4f15aa0fbffff73c02ba2", size = 687114, upload-time = "2025-12-18T01:55:56.506Z" },
]
[package.optional-dependencies]
cpu = [
{ name = "mlx-cpu", marker = "sys_platform == 'linux'" },
]
[[package]]
name = "mlx-cpu"
version = "0.30.1"
source = { registry = "https://pypi.org/simple" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/64/51/32903727a68a61e972383e28a775c1f5e5f0628552c85cbc6103d68c0dc4/mlx_cpu-0.30.1-py3-none-manylinux_2_35_aarch64.whl", hash = "sha256:3f5dc2e4d0849181f8253508bb6a0854250483fc63d43ac79ec614b19824b172", size = 8992394, upload-time = "2025-12-18T00:16:13.696Z" },
{ url = "https://files.pythonhosted.org/packages/0c/74/69c21bb907f3c4064881ab0653029c939ae15fc4e63a5301ef8643cb1d68/mlx_cpu-0.30.1-py3-none-manylinux_2_35_x86_64.whl", hash = "sha256:c9ea6992d8c001e1123dfd3b4d4405ff576c787eec52656ad405e3d033a8be60", size = 10553055, upload-time = "2025-12-18T00:16:16.104Z" },
]
[[package]]
name = "mlx-lm"
version = "0.28.3"