mirror of
https://github.com/exo-explore/exo.git
synced 2026-04-17 20:40:35 -04:00
**Enabling peers to be discovered in environments where mDNS is unavailable (SSH sessions, headless servers, Docker).** ## Motivation Exo discovers peers exclusively via mDNS, which works great on a local network but breaks once you move beyond a single L2 broadcast domain: - SSH sessions on macOS — TCC blocks mDNS multicast from non-GUI sessions (#1488) - Headless servers/rack machines — #1682 ("DGX Spark does not find other nodes") - Docker Compose — mDNS is often unavailable across container networks; e.g. #1462 (E2E test framework) needs an alternative Related work: #1488 (working implementation made by @AlexCheema and closed because SSH had a GUI workaround), #1023 (Headscale WAN, later closed due to merge conflicts), #1656 (discovery cleanup, open). This PR introduces an optional bootstrap mechanism for peer discovery while leaving the existing mDNS behavior unchanged. ## Changes Adds two new CLI flags: - `--bootstrap-peers` (env: `EXO_BOOTSTRAP_PEERS`) — comma-separated libp2p multiaddrs to dial on startup and retry periodically - `--libp2p-port` — fixed TCP port for libp2p to listen on (default: OS-assigned). Required when using bootstrap peers, so other nodes know which port to dial. 8 files: - `rust/networking/src/discovery.rs`: Store bootstrap addrs, dial in existing retry loop - `rust/networking/src/swarm.rs`: Thread `bootstrap_peers` parameter to `Behaviour` - `rust/networking/examples/chatroom.rs`: Updated call site for new create_swarm signature - `rust/networking/tests/bootstrap_peers.rs`: Integration tests - `rust/exo_pyo3_bindings/src/networking.rs`: Accept optional `bootstrap_peers` in PyO3 constructor - `rust/exo_pyo3_bindings/exo_pyo3_bindings.pyi`: Update type stub - `src/exo/routing/router.py`: Pass peers to `NetworkingHandle` - `src/exo/main.py`: `--bootstrap-peers` CLI arg + `EXO_BOOTSTRAP_PEERS` env var ## Why It Works Bootstrap peers are dialed in the existing retry loop — the same path taken by mDNS-discovered peers.
The swarm handles connection, Noise handshake, and gossipsub mesh joining from there. PeerId is intentionally not required in the multiaddr, the Noise handshake discovers it. Docker Compose example: ```yaml services: exo-1: environment: EXO_BOOTSTRAP_PEERS: "/ip4/exo-2/tcp/30000" exo-2: environment: EXO_BOOTSTRAP_PEERS: "/ip4/exo-1/tcp/30000" ``` ## Test Plan ### Manual Testing <details> <summary>Docker Compose config</summary> ``` services: exo-node1: build: context: . dockerfile: Dockerfile.bootstrap-test container_name: exo-bootstrap-node1 hostname: exo-node1 command: ["-q", "--libp2p-port", "30000", "--bootstrap-peers", "/ip4/172.30.20.3/tcp/30000"] environment: - EXO_LIBP2P_NAMESPACE=bootstrap-test ports: - "52415:52415" networks: bootstrap-net: ipv4_address: 172.30.20.2 deploy: resources: limits: memory: 4g exo-node2: build: context: . dockerfile: Dockerfile.bootstrap-test container_name: exo-bootstrap-node2 hostname: exo-node2 command: ["-q", "--libp2p-port", "30000", "--bootstrap-peers", "/ip4/172.30.20.2/tcp/30000"] environment: - EXO_LIBP2P_NAMESPACE=bootstrap-test ports: - "52416:52415" networks: bootstrap-net: ipv4_address: 172.30.20.3 deploy: resources: limits: memory: 4g networks: bootstrap-net: driver: bridge ipam: config: - subnet: 172.30.20.0/24 ``` </details> Two containers on a bridge network (`172.30.20.0/24`), fixed IPs, `--libp2p-port 30000`, cross-referencing `--bootstrap-peers`. Both nodes found each other and established a connection then ran the election protocol. 
### Automated Testing 4 Rust integration tests in `rust/networking/tests/bootstrap_peers.rs` (`cargo test -p networking`): | Test | What it verifies | Result | |------|-----------------|--------| | `two_nodes_connect_via_bootstrap_peers` | Node B discovers Node A via bootstrap addr (real TCP connection) | PASS | | `create_swarm_with_empty_bootstrap_peers` | Backward compatibility — no bootstrap peers works | PASS | | `create_swarm_ignores_invalid_bootstrap_addrs` | Invalid multiaddrs silently filtered | PASS | | `create_swarm_with_fixed_port` | `listen_port` parameter works | PASS | All 4 pass. The connection test takes ~6s --------- Signed-off-by: DeepZima <deepzima@outlook.com> Co-authored-by: Evan <evanev7@gmail.com>
108 lines
3.4 KiB
Rust
108 lines
3.4 KiB
Rust
use futures_lite::StreamExt;
|
|
use networking::swarm::{FromSwarm, create_swarm};
|
|
use std::time::Duration;
|
|
use tokio::sync::mpsc;
|
|
use tokio::time::timeout;
|
|
|
|
/// Helper: find a free TCP port.
///
/// Binds a throwaway listener to `127.0.0.1:0` so the OS picks an unused
/// port, then returns that port number.
///
/// The listener is dropped when this function returns, which releases the
/// port again. The result is therefore only a hint: another process may claim
/// the port before the caller binds it.
///
/// # Panics
///
/// Panics if the loopback listener cannot be bound or its local address
/// cannot be read.
fn free_port() -> u16 {
    let listener = std::net::TcpListener::bind("127.0.0.1:0")
        .expect("failed to bind a loopback listener on an OS-assigned port");
    listener
        .local_addr()
        .expect("failed to read the local address of the bound listener")
        .port()
}
|
|
|
|
/// Two nodes connect via bootstrap peers — no mDNS needed.
|
|
///
|
|
/// Node A listens on a fixed port. Node B bootstraps to A's address.
|
|
/// We verify that B emits `FromSwarm::Discovered` for A's peer ID.
|
|
#[tokio::test]
|
|
async fn two_nodes_connect_via_bootstrap_peers() {
|
|
let port_a = free_port();
|
|
|
|
// Node A: listens on a known port, no bootstrap peers
|
|
let keypair_a = libp2p::identity::Keypair::generate_ed25519();
|
|
let peer_id_a = keypair_a.public().to_peer_id();
|
|
let (_tx_a, rx_a) = mpsc::channel(16);
|
|
let swarm_a = create_swarm(keypair_a, rx_a, vec![], port_a).expect("create swarm A");
|
|
let mut stream_a = swarm_a.into_stream();
|
|
|
|
// Node B: bootstraps to A's address
|
|
let keypair_b = libp2p::identity::Keypair::generate_ed25519();
|
|
let (_tx_b, rx_b) = mpsc::channel(16);
|
|
let swarm_b = create_swarm(
|
|
keypair_b,
|
|
rx_b,
|
|
vec![format!("/ip4/127.0.0.1/tcp/{port_a}")],
|
|
0,
|
|
)
|
|
.expect("create swarm B");
|
|
let mut stream_b = swarm_b.into_stream();
|
|
|
|
// Wait for B to discover A (connection established)
|
|
let connected = timeout(Duration::from_secs(10), async {
|
|
loop {
|
|
tokio::select! {
|
|
Some(event) = stream_a.next() => {
|
|
// A will also see B connect, but we check from B's perspective
|
|
let _ = event;
|
|
}
|
|
Some(event) = stream_b.next() => {
|
|
if let FromSwarm::Discovered { peer_id } = event {
|
|
if peer_id == peer_id_a {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
})
|
|
.await;
|
|
|
|
assert!(
|
|
connected.is_ok() && connected.unwrap(),
|
|
"Node B should discover Node A via bootstrap peer"
|
|
);
|
|
}
|
|
|
|
/// Empty bootstrap peers should work (backward compatible).
|
|
#[tokio::test]
|
|
async fn create_swarm_with_empty_bootstrap_peers() {
|
|
let keypair = libp2p::identity::Keypair::generate_ed25519();
|
|
let (_tx, rx) = mpsc::channel(16);
|
|
let swarm = create_swarm(keypair, rx, vec![], 0);
|
|
assert!(
|
|
swarm.is_ok(),
|
|
"create_swarm with no bootstrap peers should succeed"
|
|
);
|
|
}
|
|
|
|
/// Invalid multiaddr strings are silently filtered out.
|
|
#[tokio::test]
|
|
async fn create_swarm_ignores_invalid_bootstrap_addrs() {
|
|
let keypair = libp2p::identity::Keypair::generate_ed25519();
|
|
let (_tx, rx) = mpsc::channel(16);
|
|
let swarm = create_swarm(
|
|
keypair,
|
|
rx,
|
|
vec![
|
|
"not-a-valid-multiaddr".to_string(),
|
|
"".to_string(),
|
|
"/ip4/10.0.0.1/tcp/30000".to_string(), // valid
|
|
],
|
|
0,
|
|
);
|
|
assert!(
|
|
swarm.is_ok(),
|
|
"create_swarm should succeed even with invalid bootstrap addrs"
|
|
);
|
|
}
|
|
|
|
/// Fixed listen port works correctly.
|
|
#[tokio::test]
|
|
async fn create_swarm_with_fixed_port() {
|
|
let port = free_port();
|
|
let keypair = libp2p::identity::Keypair::generate_ed25519();
|
|
let (_tx, rx) = mpsc::channel(16);
|
|
let swarm = create_swarm(keypair, rx, vec![], port);
|
|
assert!(swarm.is_ok(), "create_swarm with fixed port should succeed");
|
|
}
|