Compare commits

...

4 Commits

Author SHA1 Message Date
Alex Cheema
ae07239b57 feat: detect RDMA status via rdma_ctl and show per-node debug labels
Use `rdma_ctl status` to detect whether RDMA is actually enabled on
each node, instead of relying on TB5 link speeds. The info box now
only shows when TB5 nodes have RDMA disabled, and includes setup
instructions. Debug mode shows RDMA:ON/RDMA:OFF per node.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 11:08:01 -08:00
Alex Cheema
8ce8e1736c fix: only show TB5 RDMA info box when 2+ exo nodes have TB5
A single node with TB5 ports connected to non-exo devices shouldn't
trigger the info box since there's no second node to benefit from RDMA.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 10:37:32 -08:00
Alex Cheema
a37955526c feat: show info box when TB5 connections exist without RDMA
Detect Thunderbolt 5 connections by parsing link_speed from
system_profiler and show a dismissible blue info box when RDMA
is not enabled, prompting users to enable it for better performance.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 10:15:09 -08:00
Alex Cheema
0abc90c494 fix: show RDMA interface names instead of "? ?" in debug mode
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 09:44:56 -08:00
9 changed files with 343 additions and 18 deletions

View File

@@ -422,9 +422,16 @@
const bToACandidates: Array<{ ip: string; iface: string | null }> = [];
for (const edge of topology.edges) {
const ip = edge.sendBackIp || "?";
const iface =
edge.sendBackInterface || getInterfaceForIp(edge.source, ip);
let ip: string;
let iface: string | null;
if (edge.sourceRdmaIface || edge.sinkRdmaIface) {
ip = "RDMA";
iface = `${edge.sourceRdmaIface || "?"} \u2192 ${edge.sinkRdmaIface || "?"}`;
} else {
ip = edge.sendBackIp || "?";
iface = edge.sendBackInterface || getInterfaceForIp(edge.source, ip);
}
if (edge.source === nodeId1 && edge.target === nodeId2) {
aToBCandidates.push({ ip, iface });

View File

@@ -6,6 +6,7 @@
isTopologyMinimized,
debugMode,
nodeThunderboltBridge,
nodeRdmaCtl,
type NodeInfo,
} from "$lib/stores/app.svelte";
@@ -31,6 +32,7 @@
const data = $derived(topologyData());
const debugEnabled = $derived(debugMode());
const tbBridgeData = $derived(nodeThunderboltBridge());
const rdmaCtlData = $derived(nodeRdmaCtl());
function getNodeLabel(nodeId: string): string {
const node = data?.nodes?.[nodeId];
@@ -333,14 +335,27 @@
if (edge.source === a) entry.aToB = true;
else entry.bToA = true;
const ip = edge.sendBackIp || "?";
const ifaceInfo = getInterfaceLabel(edge.source, ip);
let ip: string;
let ifaceLabel: string;
let missingIface: boolean;
if (edge.sourceRdmaIface || edge.sinkRdmaIface) {
ip = "RDMA";
ifaceLabel = `${edge.sourceRdmaIface || "?"} \u2192 ${edge.sinkRdmaIface || "?"}`;
missingIface = false;
} else {
ip = edge.sendBackIp || "?";
const ifaceInfo = getInterfaceLabel(edge.source, ip);
ifaceLabel = ifaceInfo.label;
missingIface = ifaceInfo.missing;
}
entry.connections.push({
from: edge.source,
to: edge.target,
ip,
ifaceLabel: ifaceInfo.label,
missingIface: ifaceInfo.missing,
ifaceLabel,
missingIface,
});
pairMap.set(key, entry);
});
@@ -1120,15 +1135,17 @@
.text(` (${ramUsagePercent.toFixed(0)}%)`);
}
// Debug mode: Show TB bridge status
// Debug mode: Show TB bridge and RDMA status
if (debugEnabled) {
let debugLabelY =
nodeInfo.y +
iconBaseHeight / 2 +
(showFullLabels ? 32 : showCompactLabels ? 26 : 22);
const debugFontSize = showFullLabels ? 9 : 7;
const debugLineHeight = showFullLabels ? 11 : 9;
const tbStatus = tbBridgeData[nodeInfo.id];
if (tbStatus) {
const tbY =
nodeInfo.y +
iconBaseHeight / 2 +
(showFullLabels ? 32 : showCompactLabels ? 26 : 22);
const tbFontSize = showFullLabels ? 9 : 7;
const tbColor = tbStatus.enabled
? "rgba(234,179,8,0.9)"
: "rgba(100,100,100,0.7)";
@@ -1136,12 +1153,30 @@
nodeG
.append("text")
.attr("x", nodeInfo.x)
.attr("y", tbY)
.attr("y", debugLabelY)
.attr("text-anchor", "middle")
.attr("fill", tbColor)
.attr("font-size", tbFontSize)
.attr("font-size", debugFontSize)
.attr("font-family", "SF Mono, Monaco, monospace")
.text(tbText);
debugLabelY += debugLineHeight;
}
const rdmaStatus = rdmaCtlData[nodeInfo.id];
if (rdmaStatus !== undefined) {
const rdmaColor = rdmaStatus.enabled
? "rgba(74,222,128,0.9)"
: "rgba(100,100,100,0.7)";
const rdmaText = rdmaStatus.enabled ? "RDMA:ON" : "RDMA:OFF";
nodeG
.append("text")
.attr("x", nodeInfo.x)
.attr("y", debugLabelY)
.attr("text-anchor", "middle")
.attr("fill", rdmaColor)
.attr("font-size", debugFontSize)
.attr("font-family", "SF Mono, Monaco, monospace")
.text(rdmaText);
}
}
});

View File

@@ -56,6 +56,8 @@ export interface TopologyEdge {
target: string;
sendBackIp?: string;
sendBackInterface?: string;
sourceRdmaIface?: string;
sinkRdmaIface?: string;
}
export interface TopologyData {
@@ -225,6 +227,19 @@ interface RawStateResponse {
nodeMemory?: Record<string, RawMemoryUsage>;
nodeSystem?: Record<string, RawSystemPerformanceProfile>;
nodeNetwork?: Record<string, RawNodeNetworkInfo>;
// Thunderbolt identifiers per node
nodeThunderbolt?: Record<
string,
{
interfaces: Array<{
rdmaInterface: string;
domainUuid: string;
linkSpeed: string;
}>;
}
>;
// RDMA ctl status per node
nodeRdmaCtl?: Record<string, { enabled: boolean }>;
// Thunderbolt bridge status per node
nodeThunderboltBridge?: Record<
string,
@@ -437,6 +452,8 @@ function transformTopology(
if (!Array.isArray(edgeList)) continue;
for (const edge of edgeList) {
let sendBackIp: string | undefined;
let sourceRdmaIface: string | undefined;
let sinkRdmaIface: string | undefined;
if (edge && typeof edge === "object" && "sinkMultiaddr" in edge) {
const multiaddr = edge.sinkMultiaddr;
if (multiaddr) {
@@ -444,10 +461,23 @@ function transformTopology(
multiaddr.ip_address ||
extractIpFromMultiaddr(multiaddr.address);
}
} else if (
edge &&
typeof edge === "object" &&
"sourceRdmaIface" in edge
) {
sourceRdmaIface = edge.sourceRdmaIface;
sinkRdmaIface = edge.sinkRdmaIface;
}
if (nodes[source] && nodes[sink] && source !== sink) {
edges.push({ source, target: sink, sendBackIp });
edges.push({
source,
target: sink,
sendBackIp,
sourceRdmaIface,
sinkRdmaIface,
});
}
}
}
@@ -496,6 +526,19 @@ class AppStore {
previewNodeFilter = $state<Set<string>>(new Set());
lastUpdate = $state<number | null>(null);
thunderboltBridgeCycles = $state<string[][]>([]);
nodeThunderbolt = $state<
Record<
string,
{
interfaces: Array<{
rdmaInterface: string;
domainUuid: string;
linkSpeed: string;
}>;
}
>
>({});
nodeRdmaCtl = $state<Record<string, { enabled: boolean }>>({});
nodeThunderboltBridge = $state<
Record<
string,
@@ -1206,6 +1249,10 @@ class AppStore {
if (data.downloads) {
this.downloads = data.downloads;
}
// Thunderbolt identifiers per node
this.nodeThunderbolt = data.nodeThunderbolt ?? {};
// RDMA ctl status per node
this.nodeRdmaCtl = data.nodeRdmaCtl ?? {};
// Thunderbolt bridge cycles
this.thunderboltBridgeCycles = data.thunderboltBridgeCycles ?? [];
// Thunderbolt bridge status per node
@@ -3038,7 +3085,9 @@ export const setChatSidebarVisible = (visible: boolean) =>
appStore.setChatSidebarVisible(visible);
export const refreshState = () => appStore.fetchState();
// Thunderbolt bridge status
// Thunderbolt & RDMA status
/** Thunderbolt identifiers per node, read from the app store. */
export const nodeThunderbolt = () => {
  return appStore.nodeThunderbolt;
};

/** RDMA ctl status per node, read from the app store. */
export const nodeRdmaCtl = () => {
  return appStore.nodeRdmaCtl;
};

/** Thunderbolt bridge cycles, read from the app store. */
export const thunderboltBridgeCycles = () => {
  return appStore.thunderboltBridgeCycles;
};

/** Thunderbolt bridge status per node, read from the app store. */
export const nodeThunderboltBridge = () => {
  return appStore.nodeThunderboltBridge;
};

View File

@@ -37,6 +37,8 @@
toggleTopologyOnlyMode,
chatSidebarVisible,
toggleChatSidebarVisible,
nodeThunderbolt,
nodeRdmaCtl,
thunderboltBridgeCycles,
nodeThunderboltBridge,
type DownloadProgress,
@@ -62,8 +64,26 @@
const sidebarVisible = $derived(chatSidebarVisible());
const tbBridgeCycles = $derived(thunderboltBridgeCycles());
const tbBridgeData = $derived(nodeThunderboltBridge());
const tbIdentifiers = $derived(nodeThunderbolt());
const rdmaCtlData = $derived(nodeRdmaCtl());
const nodeFilter = $derived(previewNodeFilter());
// Drives the "RDMA AVAILABLE" hint. True when at least two nodes report one
// or more Thunderbolt interfaces AND at least one of those nodes does not have
// an `enabled: true` RDMA ctl status. A node with no RDMA ctl entry at all is
// counted as "not enabled".
const tb5WithoutRdma = $derived.by(() => {
  const statusByNode = rdmaCtlData;
  if (!statusByNode) return false;

  const thunderboltByNode = tbIdentifiers;
  if (!thunderboltByNode) return false;

  // Collect every node that exposes at least one Thunderbolt interface.
  // NOTE(review): this does not inspect linkSpeed, so "TB5" here really means
  // "any Thunderbolt interface reported" — confirm that is the intent.
  const candidateIds: string[] = [];
  for (const [nodeId, info] of Object.entries(thunderboltByNode)) {
    if (info.interfaces.length > 0) {
      candidateIds.push(nodeId);
    }
  }

  // A single Thunderbolt node has no peer to benefit from RDMA.
  if (candidateIds.length < 2) return false;

  // Any candidate whose status is missing or not explicitly enabled triggers
  // the hint.
  for (const nodeId of candidateIds) {
    if (statusByNode[nodeId]?.enabled !== true) {
      return true;
    }
  }
  return false;
});
let tb5InfoDismissed = $state(false);
// Helper to get friendly node name from node ID
function getNodeName(nodeId: string): string {
const node = data?.nodes?.[nodeId];
@@ -1800,6 +1820,53 @@
</div>
{/if}
<!-- TB5 RDMA Available Info -->
{#if tb5WithoutRdma && !tb5InfoDismissed}
<div
class="absolute left-4 flex items-center gap-2 px-3 py-2 rounded border border-blue-400/50 bg-blue-400/10 backdrop-blur-sm"
class:top-16={tbBridgeCycles.length > 0}
class:top-4={tbBridgeCycles.length === 0}
role="status"
>
<svg
class="w-5 h-5 text-blue-400 flex-shrink-0"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
/>
</svg>
<span class="text-sm font-mono text-blue-200">
RDMA AVAILABLE
</span>
<button
type="button"
onclick={() => (tb5InfoDismissed = true)}
class="ml-1 text-blue-300/60 hover:text-blue-200 transition-colors cursor-pointer"
title="Dismiss"
>
<svg
class="w-4 h-4"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M6 18L18 6M6 6l12 12"
/>
</svg>
</button>
</div>
{/if}
<!-- Exit topology-only mode button -->
<button
type="button"
@@ -1919,6 +1986,86 @@
</div>
{/if}
<!-- TB5 RDMA Available Info -->
{#if tb5WithoutRdma && !tb5InfoDismissed}
<div
class="absolute left-4 group"
class:top-16={tbBridgeCycles.length > 0}
class:top-4={tbBridgeCycles.length === 0}
role="status"
>
<div
class="flex items-center gap-2 px-3 py-2 rounded border border-blue-400/50 bg-blue-400/10 backdrop-blur-sm"
>
<svg
class="w-5 h-5 text-blue-400 flex-shrink-0"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
/>
</svg>
<span class="text-sm font-mono text-blue-200">
RDMA AVAILABLE
</span>
<button
type="button"
onclick={() => (tb5InfoDismissed = true)}
class="ml-1 text-blue-300/60 hover:text-blue-200 transition-colors cursor-pointer"
title="Dismiss"
>
<svg
class="w-4 h-4"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M6 18L18 6M6 6l12 12"
/>
</svg>
</button>
</div>
<!-- Tooltip on hover -->
<div
class="absolute top-full left-0 mt-2 w-80 p-3 rounded border border-blue-400/30 bg-exo-dark-gray/95 backdrop-blur-sm opacity-0 invisible group-hover:opacity-100 group-hover:visible transition-all duration-200 z-50 shadow-lg"
>
<p class="text-xs text-white/80 mb-2">
Thunderbolt 5 hardware detected on multiple nodes. Enable
RDMA for significantly faster inter-node communication.
</p>
<p class="text-xs text-white/60 mb-1.5">
<span class="text-blue-300">To enable:</span>
</p>
<ol
class="text-xs text-white/60 list-decimal list-inside space-y-0.5 mb-1.5"
>
<li>Connect nodes with TB5 cables</li>
<li>Boot to Recovery (hold power 10s Options)</li>
<li>
Run
<code class="text-blue-300 bg-blue-400/10 px-1 rounded"
>rdma_ctl enable</code
>
</li>
<li>Reboot</li>
</ol>
<p class="text-xs text-white/40">
Requires macOS 26.2+, TB5 cables, and matching OS versions.
</p>
</div>
</div>
{/if}
<!-- Node Filter Indicator (top-right corner) -->
{#if isFilterActive()}
<button
@@ -2797,6 +2944,33 @@
>
</div>
{/if}
<!-- TB5 RDMA Available (compact) -->
{#if tb5WithoutRdma && !tb5InfoDismissed}
<div
class="absolute left-2 flex items-center gap-1.5 px-2 py-1 rounded border border-blue-400/50 bg-blue-400/10 backdrop-blur-sm"
class:top-10={tbBridgeCycles.length > 0}
class:top-2={tbBridgeCycles.length === 0}
title="Thunderbolt 5 detected — RDMA can be enabled for better performance"
>
<svg
class="w-3.5 h-3.5 text-blue-400"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
/>
</svg>
<span class="text-[10px] font-mono text-blue-200"
>RDMA AVAILABLE</span
>
</div>
{/if}
</div>
</button>

View File

@@ -31,6 +31,7 @@ from exo.shared.types.events import (
from exo.shared.types.profiling import (
NodeIdentity,
NodeNetworkInfo,
NodeRdmaCtlStatus,
NodeThunderboltInfo,
ThunderboltBridgeStatus,
)
@@ -48,6 +49,7 @@ from exo.utils.info_gatherer.info_gatherer import (
MiscData,
NodeConfig,
NodeNetworkInterfaces,
RdmaCtlStatus,
StaticNodeInformation,
ThunderboltBridgeInfo,
)
@@ -239,6 +241,9 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
for key, value in state.node_thunderbolt_bridge.items()
if key != event.node_id
}
node_rdma_ctl = {
key: value for key, value in state.node_rdma_ctl.items() if key != event.node_id
}
# Only recompute cycles if the leaving node had TB bridge enabled
leaving_node_status = state.node_thunderbolt_bridge.get(event.node_id)
leaving_node_had_tb_enabled = (
@@ -260,6 +265,7 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
"node_network": node_network,
"node_thunderbolt": node_thunderbolt,
"node_thunderbolt_bridge": node_thunderbolt_bridge,
"node_rdma_ctl": node_rdma_ctl,
"thunderbolt_bridge_cycles": thunderbolt_bridge_cycles,
}
)
@@ -354,6 +360,11 @@ def apply_node_gathered_info(event: NodeGatheredInfo, state: State) -> State:
new_tb_bridge, state.node_network
)
)
case RdmaCtlStatus():
update["node_rdma_ctl"] = {
**state.node_rdma_ctl,
event.node_id: NodeRdmaCtlStatus(enabled=info.enabled),
}
return state.model_copy(update=update)

View File

@@ -77,6 +77,12 @@ class NodeThunderboltInfo(CamelCaseModel):
interfaces: Sequence[ThunderboltIdentifier] = []
class NodeRdmaCtlStatus(CamelCaseModel):
    """RDMA state of a single node, as reported by ``rdma_ctl``."""

    # True when the node's rdma_ctl report says RDMA is enabled.
    enabled: bool
class ThunderboltBridgeStatus(CamelCaseModel):
"""Whether the Thunderbolt Bridge network service is enabled on this node."""

View File

@@ -11,6 +11,7 @@ from exo.shared.types.profiling import (
MemoryUsage,
NodeIdentity,
NodeNetworkInfo,
NodeRdmaCtlStatus,
NodeThunderboltInfo,
SystemPerformanceProfile,
ThunderboltBridgeStatus,
@@ -53,6 +54,7 @@ class State(CamelCaseModel):
node_network: Mapping[NodeId, NodeNetworkInfo] = {}
node_thunderbolt: Mapping[NodeId, NodeThunderboltInfo] = {}
node_thunderbolt_bridge: Mapping[NodeId, ThunderboltBridgeStatus] = {}
node_rdma_ctl: Mapping[NodeId, NodeRdmaCtlStatus] = {}
# Detected cycles where all nodes have Thunderbolt bridge enabled (>2 nodes)
thunderbolt_bridge_cycles: Sequence[Sequence[NodeId]] = []

View File

@@ -12,6 +12,7 @@ class ThunderboltConnection(CamelCaseModel):
class ThunderboltIdentifier(CamelCaseModel):
rdma_interface: str
domain_uuid: str
link_speed: str = ""
## Intentionally minimal, only collecting data we care about - there's a lot more
@@ -19,6 +20,7 @@ class ThunderboltIdentifier(CamelCaseModel):
class _ReceptacleTag(BaseModel, extra="ignore"):
receptacle_id_key: str | None = None
current_speed_key: str | None = None
class _ConnectivityItem(BaseModel, extra="ignore"):
@@ -42,7 +44,9 @@ class ThunderboltConnectivityData(BaseModel, extra="ignore"):
# if tag not in ifaces: return None
iface = f"rdma_{ifaces[tag]}"
return ThunderboltIdentifier(
rdma_interface=iface, domain_uuid=self.domain_uuid_key
rdma_interface=iface,
domain_uuid=self.domain_uuid_key,
link_speed=self.receptacle_1_tag.current_speed_key or "",
)
def conn(self) -> ThunderboltConnection | None:

View File

@@ -196,6 +196,28 @@ class MacThunderboltConnections(TaggedModel):
conns: Sequence[ThunderboltConnection]
class RdmaCtlStatus(TaggedModel):
    """Gathered-info record carrying the result of ``rdma_ctl status``."""

    # True when the command output contained "enabled", False for "disabled".
    enabled: bool

    @classmethod
    async def gather(cls) -> Self | None:
        """Run ``rdma_ctl status`` and turn its stdout into a status record.

        Returns:
            An instance with ``enabled`` set from the command output, or
            ``None`` when no answer could be obtained: ``IS_DARWIN`` is false,
            the ``rdma_ctl`` binary is not on ``PATH``, the command timed out
            or could not be started, it exited non-zero, or its output
            contained neither "enabled" nor "disabled".
        """
        rdma_ctl_usable = IS_DARWIN and shutil.which("rdma_ctl") is not None
        if not rdma_ctl_usable:
            return None

        try:
            # Bound the call so a hung rdma_ctl cannot stall the caller.
            with anyio.fail_after(5):
                # check=False: a non-zero exit is handled below, not raised.
                result = await anyio.run_process(["rdma_ctl", "status"], check=False)
        except (TimeoutError, OSError):
            return None

        if result.returncode != 0:
            return None

        text = result.stdout.decode("utf-8").lower().strip()
        # "enabled" is tested first, matching the original precedence.
        for keyword, state in (("enabled", True), ("disabled", False)):
            if keyword in text:
                return cls(enabled=state)
        return None
class ThunderboltBridgeInfo(TaggedModel):
status: ThunderboltBridgeStatus
@@ -310,6 +332,7 @@ GatheredInfo = (
| NodeNetworkInterfaces
| MacThunderboltIdentifiers
| MacThunderboltConnections
| RdmaCtlStatus
| ThunderboltBridgeInfo
| NodeConfig
| MiscData
@@ -326,6 +349,7 @@ class InfoGatherer:
memory_poll_rate: float | None = None if IS_DARWIN else 1
macmon_interval: float | None = 1 if IS_DARWIN else None
thunderbolt_bridge_poll_interval: float | None = 10 if IS_DARWIN else None
rdma_ctl_poll_interval: float | None = 10 if IS_DARWIN else None
_tg: TaskGroup = field(init=False, default_factory=create_task_group)
async def run(self):
@@ -335,6 +359,7 @@ class InfoGatherer:
tg.start_soon(self._monitor_macmon, macmon_path)
tg.start_soon(self._monitor_system_profiler_thunderbolt_data)
tg.start_soon(self._monitor_thunderbolt_bridge_status)
tg.start_soon(self._monitor_rdma_ctl_status)
tg.start_soon(self._watch_system_info)
tg.start_soon(self._monitor_memory_usage)
tg.start_soon(self._monitor_misc)
@@ -420,6 +445,18 @@ class InfoGatherer:
logger.warning(f"Error gathering Thunderbolt Bridge status: {e}")
await anyio.sleep(self.thunderbolt_bridge_poll_interval)
async def _monitor_rdma_ctl_status(self):
    """Poll ``RdmaCtlStatus.gather()`` forever and forward each result.

    Returns immediately when ``rdma_ctl_poll_interval`` is ``None``. Otherwise
    each iteration gathers the status, sends it through ``info_sender`` when
    one was obtained, and then sleeps for the poll interval. Any ``Exception``
    raised while gathering or sending is logged as a warning and the loop
    carries on.
    """
    if self.rdma_ctl_poll_interval is None:
        return

    while True:
        try:
            if (status := await RdmaCtlStatus.gather()) is not None:
                await self.info_sender.send(status)
        except Exception as exc:
            logger.warning(f"Error gathering RDMA ctl status: {exc}")
        # Sleep after every attempt, successful or not.
        await anyio.sleep(self.rdma_ctl_poll_interval)
async def _monitor_macmon(self, macmon_path: str):
if self.macmon_interval is None:
return