mirror of
https://github.com/exo-explore/exo.git
synced 2026-03-06 07:06:28 -05:00
## Motivation

Testing multiple devices simultaneously requires coordination, and we don't necessarily want to run a full EXO instance to test single components. We need a mid-scale integration testing framework for distributed tests.

## Changes

Add a simple Python server and a Bash query script that run Jaccl and Ring tests without constructing a worker/master/networking. The query currently relies on all devices being accessible over Tailscale.

## Test Plan

Manually tested RDMA + Ring inference on 2 nodes.
53 lines
1010 B
Bash
Executable File
53 lines
1010 B
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
# Strict mode: abort on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
|
|
|
|
#######################################
# Look up a host in the output of `tailscale status`.
# The first whitespace-separated column of a row is treated as the address
# and the second as the host name; the name must match exactly.
# Arguments: $1 - host name to look up
# Outputs:   the address of the first matching row on stdout;
#            a diagnostic on stderr when no row matches
# Returns:   0 when a match was found, non-zero otherwise
#######################################
query() {
  local host=$1
  local ip

  # awk deliberately reads the whole input instead of exiting on the first
  # hit: closing the pipe early could kill `tailscale` with SIGPIPE and make
  # the pipeline fail under `set -o pipefail`.
  ip=$(tailscale status \
    | awk -v find="$host" '$2 == find && !seen { print $1; seen = 1 }') || return

  if [[ -z "$ip" ]]; then
    printf 'query: no tailscale peer named "%s"\n' "$host" >&2
    return 1
  fi

  printf '%s\n' "$ip"
}
|
|
|
|
# A test kind plus at least one host are required, so the first host is shown
# as mandatory in the synopsis. Usage errors go to stderr.
if (( $# < 2 )); then
  printf 'USAGE: %s <test kind> <host1> [host2] ...\n' "$0" >&2
  exit 1
fi
|
|
|
|
|
|
# The first argument selects the test kind (it is later used as the URL path
# of the request); every remaining argument is a host name.
kind=$1
shift

test_kinds=(ring jaccl)

# Compare against each known kind exactly. A regex/substring match would
# accept inputs such as "r", "in" or "." as valid kinds.
kind_is_known=false
for known_kind in "${test_kinds[@]}"; do
  if [[ "$kind" == "$known_kind" ]]; then
    kind_is_known=true
    break
  fi
done

if [[ "$kind_is_known" != true ]]; then
  printf '%s is not a known test kind.\nCurrent test kinds are %s\n' \
    "$kind" "${test_kinds[*]}" >&2
  exit 1
fi
|
|
|
|
#######################################
# Resolve host names to addresses via `query` and build the JSON device list.
# Globals:   hostnames (written) - the host names, in argument order
#            ips       (written) - the resolved address of each host
#            weaved    (written) - name/address pairs, flattened
#            devs      (written) - JSON array of ["name", "address"] pairs
# Arguments: host names to resolve
# Outputs:   a diagnostic on stderr for the first host that cannot be resolved
# Returns:   0 on success, 1 when a host cannot be resolved
#######################################
resolve_hosts() {
  local name ip devs_raw

  hostnames=("$@")
  weaved=()
  ips=()

  for name in "$@"; do
    # Covers both a query that fails and one that "succeeds" with no output;
    # an empty address would otherwise end up in the request URL.
    if ! ip=$(query "$name") || [[ -z "$ip" ]]; then
      printf 'could not resolve host "%s" via tailscale status\n' "$name" >&2
      return 1
    fi
    # Keep only the first line in case several rows matched the name.
    ip=${ip%%$'\n'*}
    ips+=("$ip")
    weaved+=("$name" "$ip")
  done

  # printf re-applies the format for every two arguments, producing one
  # '["name", "ip"], ' chunk per host; the trailing ", " is trimmed below.
  printf -v devs_raw '["%s", "%s"], ' "${weaved[@]}"
  devs="[${devs_raw%, }]"
}

resolve_hosts "$@" || exit 1
|
|
|
|
#######################################
# Copy stdin to stdout, emitting a blank line and "<label>: " before each
# input line. Implemented in bash because "\n" in a sed replacement is not
# portable, and a label containing "/" or "&" would corrupt a sed expression.
# Arguments: $1 - label to put in front of every line
# Outputs:   the prefixed lines on stdout
#######################################
prefix_lines() {
  local label=$1
  local line
  # The `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    printf '\n%s: %s\n' "$label" "$line"
  done
}

# The request body is identical for every host, so build it once.
req="{
  \"model_id\": \"llama-3.2-1b\",
  \"devs\": ${devs},
  \"kind\": \"inference\"
}"

# Send the request to every host in parallel; each job tags its output with
# the host it talks to.
for i in "${!ips[@]}"; do
  {
    printf 'req %s\n' "$req"
    # -sS: no progress meter, but still report errors (stderr is merged into
    # the prefixed stream). -N: do not buffer the streamed response.
    curl -sSN \
      -X POST "http://${ips[$i]}:52415/${kind}" \
      -H "Content-Type: application/json" -d "$req" \
      2>&1 | prefix_lines "${hostnames[$i]}@${ips[$i]}" \
      || echo "curl to ${hostnames[$i]} failed"
  } &
done

# Barrier: block until every background request has finished.
wait
|