mirror of
https://github.com/exo-explore/exo.git
synced 2026-02-16 09:03:55 -05:00
Compare commits
27 Commits
alexcheema
...
e2e-tests
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1e435656dc | ||
|
|
f87ebebf3f | ||
|
|
d925dfe452 | ||
|
|
85e90c5b93 | ||
|
|
8bf4d1f585 | ||
|
|
5e27e4e719 | ||
|
|
b249757116 | ||
|
|
5c0b769bf8 | ||
|
|
702886d147 | ||
|
|
2526b7d166 | ||
|
|
ffb79d88ca | ||
|
|
4f32b9f180 | ||
|
|
4842edf253 | ||
|
|
71285ddebf | ||
|
|
16a9a389e2 | ||
|
|
e8203596ab | ||
|
|
b88749a6c5 | ||
|
|
b428858318 | ||
|
|
f1fd491f5c | ||
|
|
12fb6bcfa2 | ||
|
|
dfce8f14ad | ||
|
|
4a446b2779 | ||
|
|
a82feed8e3 | ||
|
|
da6e626f6f | ||
|
|
cf23916b8b | ||
|
|
80b29ba0d9 | ||
|
|
b6214c297f |
15
.dockerignore
Normal file
15
.dockerignore
Normal file
@@ -0,0 +1,15 @@
|
||||
.venv/
|
||||
.direnv/
|
||||
target/
|
||||
.git/
|
||||
.idea/
|
||||
.pytest_cache/
|
||||
.ruff_cache/
|
||||
dashboard/node_modules/
|
||||
dashboard/.svelte-kit/
|
||||
dashboard/build/
|
||||
dist/
|
||||
*.pdb
|
||||
**/__pycache__
|
||||
**/.DS_Store
|
||||
.mlx_typings/
|
||||
5
.github/workflows/build-app.yml
vendored
5
.github/workflows/build-app.yml
vendored
@@ -303,8 +303,11 @@ jobs:
|
||||
SIGNING_IDENTITY=$(security find-identity -v -p codesigning "$BUILD_KEYCHAIN_PATH" | awk -F '"' '{print $2}')
|
||||
/usr/bin/codesign --deep --force --timestamp --options runtime \
|
||||
--sign "$SIGNING_IDENTITY" EXO.app
|
||||
mkdir -p dmg-root
|
||||
cp -R EXO.app dmg-root/
|
||||
ln -s /Applications dmg-root/Applications
|
||||
DMG_NAME="EXO-${RELEASE_VERSION}.dmg"
|
||||
bash "$GITHUB_WORKSPACE/packaging/dmg/create-dmg.sh" EXO.app "$DMG_NAME" "EXO"
|
||||
hdiutil create -volname "EXO" -srcfolder dmg-root -ov -format UDZO "$DMG_NAME"
|
||||
/usr/bin/codesign --force --timestamp --options runtime \
|
||||
--sign "$SIGNING_IDENTITY" "$DMG_NAME"
|
||||
if [[ -n "$APPLE_NOTARIZATION_USERNAME" ]]; then
|
||||
|
||||
44
.github/workflows/e2e.yml
vendored
Normal file
44
.github/workflows/e2e.yml
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
name: e2e-tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- e2e-tests
|
||||
pull_request:
|
||||
branches:
|
||||
- staging
|
||||
- main
|
||||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 45
|
||||
steps:
|
||||
- name: Free up disk space
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
|
||||
/opt/hostedtoolcache /usr/local/share/boost /usr/share/swift \
|
||||
/opt/microsoft /opt/az
|
||||
docker system prune -af
|
||||
df -h /
|
||||
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
lfs: false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Build E2E image with cache
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
file: e2e/Dockerfile
|
||||
tags: exo-e2e:latest
|
||||
load: true
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Run E2E tests
|
||||
run: python3 e2e/run_all.py
|
||||
@@ -15,12 +15,18 @@ struct ContentView: View {
|
||||
@EnvironmentObject private var localNetworkChecker: LocalNetworkChecker
|
||||
@EnvironmentObject private var updater: SparkleUpdater
|
||||
@EnvironmentObject private var thunderboltBridgeService: ThunderboltBridgeService
|
||||
@EnvironmentObject private var settingsWindowController: SettingsWindowController
|
||||
@State private var focusedNode: NodeViewModel?
|
||||
@State private var deletingInstanceIDs: Set<String> = []
|
||||
@State private var showAllNodes = false
|
||||
@State private var showAllInstances = false
|
||||
@State private var baseURLCopied = false
|
||||
@State private var showAdvanced = false
|
||||
@State private var showDebugInfo = false
|
||||
@State private var bugReportInFlight = false
|
||||
@State private var bugReportMessage: String?
|
||||
@State private var uninstallInProgress = false
|
||||
@State private var pendingNamespace: String = ""
|
||||
@State private var pendingHFToken: String = ""
|
||||
@State private var pendingEnableImageModels = false
|
||||
|
||||
var body: some View {
|
||||
VStack(alignment: .leading, spacing: 12) {
|
||||
@@ -252,79 +258,139 @@ struct ContentView: View {
|
||||
VStack(alignment: .leading, spacing: 0) {
|
||||
if controller.status != .stopped {
|
||||
dashboardButton
|
||||
baseURLRow
|
||||
Divider()
|
||||
.padding(.vertical, 8)
|
||||
} else {
|
||||
Divider()
|
||||
.padding(.vertical, 4)
|
||||
}
|
||||
HoverButton(
|
||||
title: "Settings",
|
||||
tint: .primary,
|
||||
trailingSystemImage: "gear"
|
||||
) {
|
||||
settingsWindowController.open(
|
||||
controller: controller,
|
||||
updater: updater,
|
||||
networkStatusService: networkStatusService,
|
||||
thunderboltBridgeService: thunderboltBridgeService,
|
||||
stateService: stateService
|
||||
)
|
||||
}
|
||||
HoverButton(
|
||||
title: "Check for Updates",
|
||||
tint: .primary,
|
||||
trailingSystemImage: "arrow.triangle.2.circlepath"
|
||||
) {
|
||||
updater.checkForUpdates()
|
||||
}
|
||||
.padding(.bottom, 8)
|
||||
HoverButton(title: "Quit", tint: .secondary) {
|
||||
advancedSection
|
||||
.padding(.bottom, 8)
|
||||
controlButton(title: "Quit", tint: .secondary) {
|
||||
controller.stop()
|
||||
NSApplication.shared.terminate(nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var dashboardButton: some View {
|
||||
HoverButton(
|
||||
title: "Web Dashboard",
|
||||
tint: .primary,
|
||||
trailingSystemImage: "arrow.up.right"
|
||||
) {
|
||||
guard let url = URL(string: "http://localhost:52415/") else { return }
|
||||
NSWorkspace.shared.open(url)
|
||||
private var advancedSection: some View {
|
||||
VStack(alignment: .leading, spacing: 6) {
|
||||
HStack {
|
||||
Text("Advanced")
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
Spacer()
|
||||
collapseButton(isExpanded: $showAdvanced)
|
||||
}
|
||||
.animation(nil, value: showAdvanced)
|
||||
if showAdvanced {
|
||||
VStack(alignment: .leading, spacing: 8) {
|
||||
VStack(alignment: .leading, spacing: 4) {
|
||||
Text("Cluster Namespace")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
HStack {
|
||||
TextField("optional", text: $pendingNamespace)
|
||||
.textFieldStyle(.roundedBorder)
|
||||
.font(.caption2)
|
||||
.onAppear {
|
||||
pendingNamespace = controller.customNamespace
|
||||
}
|
||||
Button("Save & Restart") {
|
||||
controller.customNamespace = pendingNamespace
|
||||
if controller.status == .running || controller.status == .starting {
|
||||
controller.restart()
|
||||
}
|
||||
}
|
||||
.font(.caption2)
|
||||
.disabled(pendingNamespace == controller.customNamespace)
|
||||
}
|
||||
}
|
||||
VStack(alignment: .leading, spacing: 4) {
|
||||
Text("HuggingFace Token")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
HStack {
|
||||
SecureField("optional", text: $pendingHFToken)
|
||||
.textFieldStyle(.roundedBorder)
|
||||
.font(.caption2)
|
||||
.onAppear {
|
||||
pendingHFToken = controller.hfToken
|
||||
}
|
||||
Button("Save & Restart") {
|
||||
controller.hfToken = pendingHFToken
|
||||
if controller.status == .running || controller.status == .starting {
|
||||
controller.restart()
|
||||
}
|
||||
}
|
||||
.font(.caption2)
|
||||
.disabled(pendingHFToken == controller.hfToken)
|
||||
}
|
||||
}
|
||||
Divider()
|
||||
HStack {
|
||||
Toggle(
|
||||
"Enable Image Models (experimental)", isOn: $pendingEnableImageModels
|
||||
)
|
||||
.toggleStyle(.switch)
|
||||
.font(.caption2)
|
||||
.onAppear {
|
||||
pendingEnableImageModels = controller.enableImageModels
|
||||
}
|
||||
|
||||
Spacer()
|
||||
|
||||
Button("Save & Restart") {
|
||||
controller.enableImageModels = pendingEnableImageModels
|
||||
if controller.status == .running || controller.status == .starting {
|
||||
controller.restart()
|
||||
}
|
||||
}
|
||||
.font(.caption2)
|
||||
.disabled(pendingEnableImageModels == controller.enableImageModels)
|
||||
}
|
||||
HoverButton(title: "Check for Updates", small: true) {
|
||||
updater.checkForUpdates()
|
||||
}
|
||||
debugSection
|
||||
HoverButton(title: "Uninstall", tint: .red, small: true) {
|
||||
showUninstallConfirmationAlert()
|
||||
}
|
||||
.disabled(uninstallInProgress)
|
||||
}
|
||||
.transition(.opacity)
|
||||
}
|
||||
}
|
||||
.animation(.easeInOut(duration: 0.25), value: showAdvanced)
|
||||
}
|
||||
|
||||
private var baseURLRow: some View {
|
||||
HStack(spacing: 6) {
|
||||
Image(systemName: "link")
|
||||
.imageScale(.small)
|
||||
.foregroundColor(.secondary)
|
||||
Text("localhost:52415/v1")
|
||||
.font(.system(.caption, design: .monospaced))
|
||||
.foregroundColor(.primary)
|
||||
Spacer()
|
||||
Button {
|
||||
NSPasteboard.general.clearContents()
|
||||
NSPasteboard.general.setString("http://localhost:52415/v1", forType: .string)
|
||||
baseURLCopied = true
|
||||
DispatchQueue.main.asyncAfter(deadline: .now() + 2) {
|
||||
baseURLCopied = false
|
||||
}
|
||||
} label: {
|
||||
Image(systemName: baseURLCopied ? "checkmark" : "doc.on.doc")
|
||||
private func controlButton(title: String, tint: Color = .primary, action: @escaping () -> Void)
|
||||
-> some View
|
||||
{
|
||||
HoverButton(title: title, tint: tint, trailingSystemImage: nil, action: action)
|
||||
}
|
||||
|
||||
private var dashboardButton: some View {
|
||||
Button {
|
||||
guard let url = URL(string: "http://localhost:52415/") else { return }
|
||||
NSWorkspace.shared.open(url)
|
||||
} label: {
|
||||
HStack {
|
||||
Image(systemName: "arrow.up.right.square")
|
||||
.imageScale(.small)
|
||||
.foregroundColor(baseURLCopied ? .green : .secondary)
|
||||
.contentTransition(.symbolEffect(.replace))
|
||||
Text("Dashboard")
|
||||
.fontWeight(.medium)
|
||||
Spacer()
|
||||
}
|
||||
.buttonStyle(.plain)
|
||||
.help("Copy API base URL")
|
||||
.padding(.vertical, 8)
|
||||
.padding(.horizontal, 10)
|
||||
.background(
|
||||
RoundedRectangle(cornerRadius: 8, style: .continuous)
|
||||
.fill(Color(red: 1.0, green: 0.87, blue: 0.0).opacity(0.2))
|
||||
)
|
||||
}
|
||||
.padding(.vertical, 4)
|
||||
.padding(.horizontal, 8)
|
||||
.buttonStyle(.plain)
|
||||
.padding(.bottom, 4)
|
||||
}
|
||||
|
||||
private func collapseButton(isExpanded: Binding<Bool>) -> some View {
|
||||
@@ -379,6 +445,207 @@ struct ContentView: View {
|
||||
}
|
||||
}
|
||||
|
||||
private var thunderboltStatusText: String {
|
||||
switch networkStatusService.status.thunderboltBridgeState {
|
||||
case .some(.disabled):
|
||||
return "Thunderbolt Bridge: Disabled"
|
||||
case .some(.deleted):
|
||||
return "Thunderbolt Bridge: Deleted"
|
||||
case .some(.enabled):
|
||||
return "Thunderbolt Bridge: Enabled"
|
||||
case nil:
|
||||
return "Thunderbolt Bridge: Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
private var thunderboltStatusColor: Color {
|
||||
switch networkStatusService.status.thunderboltBridgeState {
|
||||
case .some(.disabled), .some(.deleted):
|
||||
return .green
|
||||
case .some(.enabled):
|
||||
return .red
|
||||
case nil:
|
||||
return .secondary
|
||||
}
|
||||
}
|
||||
|
||||
/// Shows TB bridge status for all nodes from exo cluster state
|
||||
private var clusterThunderboltBridgeView: some View {
|
||||
let bridgeStatuses = stateService.latestSnapshot?.nodeThunderboltBridge ?? [:]
|
||||
let localNodeId = stateService.localNodeId
|
||||
let nodeProfiles = stateService.latestSnapshot?.nodeProfiles ?? [:]
|
||||
|
||||
return VStack(alignment: .leading, spacing: 1) {
|
||||
if bridgeStatuses.isEmpty {
|
||||
Text("Cluster TB Bridge: No data")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
} else {
|
||||
Text("Cluster TB Bridge Status:")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
ForEach(Array(bridgeStatuses.keys.sorted()), id: \.self) { nodeId in
|
||||
if let status = bridgeStatuses[nodeId] {
|
||||
let nodeName =
|
||||
nodeProfiles[nodeId]?.friendlyName ?? String(nodeId.prefix(8))
|
||||
let isLocal = nodeId == localNodeId
|
||||
let prefix = isLocal ? " \(nodeName) (local):" : " \(nodeName):"
|
||||
let statusText =
|
||||
!status.exists
|
||||
? "N/A"
|
||||
: (status.enabled ? "Enabled" : "Disabled")
|
||||
let color: Color =
|
||||
!status.exists
|
||||
? .secondary
|
||||
: (status.enabled ? .red : .green)
|
||||
Text("\(prefix) \(statusText)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(color)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var interfaceIpList: some View {
|
||||
let statuses = networkStatusService.status.interfaceStatuses
|
||||
return VStack(alignment: .leading, spacing: 1) {
|
||||
Text("Interfaces (en0–en7):")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
if statuses.isEmpty {
|
||||
Text(" Unknown")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
} else {
|
||||
ForEach(statuses, id: \.interfaceName) { status in
|
||||
let ipText = status.ipAddress ?? "No IP"
|
||||
Text(" \(status.interfaceName): \(ipText)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(status.ipAddress == nil ? .red : .green)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var debugSection: some View {
|
||||
VStack(alignment: .leading, spacing: 4) {
|
||||
HoverButton(
|
||||
title: "Debug Info",
|
||||
tint: .primary,
|
||||
trailingSystemImage: showDebugInfo ? "chevron.up" : "chevron.down",
|
||||
small: true
|
||||
) {
|
||||
showDebugInfo.toggle()
|
||||
}
|
||||
if showDebugInfo {
|
||||
VStack(alignment: .leading, spacing: 4) {
|
||||
Text("Version: \(buildTag)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
Text("Commit: \(buildCommit)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
Text(thunderboltStatusText)
|
||||
.font(.caption2)
|
||||
.foregroundColor(thunderboltStatusColor)
|
||||
clusterThunderboltBridgeView
|
||||
interfaceIpList
|
||||
rdmaStatusView
|
||||
sendBugReportButton
|
||||
.padding(.top, 6)
|
||||
}
|
||||
.padding(.leading, 8)
|
||||
.transition(.opacity)
|
||||
}
|
||||
}
|
||||
.animation(.easeInOut(duration: 0.25), value: showDebugInfo)
|
||||
}
|
||||
|
||||
private var rdmaStatusView: some View {
|
||||
let rdmaStatuses = stateService.latestSnapshot?.nodeRdmaCtl ?? [:]
|
||||
let localNodeId = stateService.localNodeId
|
||||
let nodeProfiles = stateService.latestSnapshot?.nodeProfiles ?? [:]
|
||||
let localDevices = networkStatusService.status.localRdmaDevices
|
||||
let localPorts = networkStatusService.status.localRdmaActivePorts
|
||||
|
||||
return VStack(alignment: .leading, spacing: 1) {
|
||||
if rdmaStatuses.isEmpty {
|
||||
Text("Cluster RDMA: No data")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
} else {
|
||||
Text("Cluster RDMA Status:")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
ForEach(Array(rdmaStatuses.keys.sorted()), id: \.self) { nodeId in
|
||||
if let status = rdmaStatuses[nodeId] {
|
||||
let nodeName =
|
||||
nodeProfiles[nodeId]?.friendlyName ?? String(nodeId.prefix(8))
|
||||
let isLocal = nodeId == localNodeId
|
||||
let prefix = isLocal ? " \(nodeName) (local):" : " \(nodeName):"
|
||||
let statusText = status.enabled ? "Enabled" : "Disabled"
|
||||
let color: Color = status.enabled ? .green : .orange
|
||||
Text("\(prefix) \(statusText)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(color)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !localDevices.isEmpty {
|
||||
Text(" Local Devices: \(localDevices.joined(separator: ", "))")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
}
|
||||
if !localPorts.isEmpty {
|
||||
Text(" Local Active Ports:")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
ForEach(localPorts, id: \.device) { port in
|
||||
Text(" \(port.device) port \(port.port): \(port.state)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.green)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var sendBugReportButton: some View {
|
||||
VStack(alignment: .leading, spacing: 4) {
|
||||
Button {
|
||||
Task {
|
||||
await sendBugReport()
|
||||
}
|
||||
} label: {
|
||||
HStack {
|
||||
if bugReportInFlight {
|
||||
ProgressView()
|
||||
.scaleEffect(0.6)
|
||||
}
|
||||
Text("Send Bug Report")
|
||||
.font(.caption)
|
||||
.fontWeight(.semibold)
|
||||
Spacer()
|
||||
}
|
||||
.padding(.vertical, 6)
|
||||
.padding(.horizontal, 8)
|
||||
.background(
|
||||
RoundedRectangle(cornerRadius: 6)
|
||||
.fill(Color.accentColor.opacity(0.12))
|
||||
)
|
||||
}
|
||||
.buttonStyle(.plain)
|
||||
.disabled(bugReportInFlight)
|
||||
|
||||
if let message = bugReportMessage {
|
||||
Text(message)
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
.fixedSize(horizontal: false, vertical: true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var processToggleBinding: Binding<Bool> {
|
||||
Binding(
|
||||
get: {
|
||||
@@ -419,6 +686,101 @@ struct ContentView: View {
|
||||
)
|
||||
}
|
||||
|
||||
private func sendBugReport() async {
|
||||
bugReportInFlight = true
|
||||
bugReportMessage = "Collecting logs..."
|
||||
let service = BugReportService()
|
||||
do {
|
||||
let outcome = try await service.sendReport(isManual: true)
|
||||
bugReportMessage = outcome.message
|
||||
} catch {
|
||||
bugReportMessage = error.localizedDescription
|
||||
}
|
||||
bugReportInFlight = false
|
||||
}
|
||||
|
||||
private func showUninstallConfirmationAlert() {
|
||||
let alert = NSAlert()
|
||||
alert.messageText = "Uninstall EXO"
|
||||
alert.informativeText = """
|
||||
This will remove EXO and all its system components:
|
||||
|
||||
• Network configuration daemon
|
||||
• Launch at login registration
|
||||
• EXO network location
|
||||
|
||||
The app will be moved to Trash.
|
||||
"""
|
||||
alert.alertStyle = .warning
|
||||
alert.addButton(withTitle: "Uninstall")
|
||||
alert.addButton(withTitle: "Cancel")
|
||||
|
||||
// Style the Uninstall button as destructive
|
||||
if let uninstallButton = alert.buttons.first {
|
||||
uninstallButton.hasDestructiveAction = true
|
||||
}
|
||||
|
||||
let response = alert.runModal()
|
||||
if response == .alertFirstButtonReturn {
|
||||
performUninstall()
|
||||
}
|
||||
}
|
||||
|
||||
private func performUninstall() {
|
||||
uninstallInProgress = true
|
||||
|
||||
// Stop EXO process first
|
||||
controller.cancelPendingLaunch()
|
||||
controller.stop()
|
||||
stateService.stopPolling()
|
||||
|
||||
// Run the privileged uninstall on a background thread
|
||||
// Using .utility QoS to avoid priority inversion with NSAppleScript's subprocess
|
||||
DispatchQueue.global(qos: .utility).async {
|
||||
do {
|
||||
// Remove network setup daemon and components (requires admin privileges)
|
||||
try NetworkSetupHelper.uninstall()
|
||||
|
||||
DispatchQueue.main.async {
|
||||
// Unregister from launch at login
|
||||
LaunchAtLoginHelper.disable()
|
||||
|
||||
// Move app to trash
|
||||
self.moveAppToTrash()
|
||||
|
||||
// Quit the app
|
||||
DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) {
|
||||
NSApplication.shared.terminate(nil)
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
DispatchQueue.main.async {
|
||||
self.showErrorAlert(message: error.localizedDescription)
|
||||
self.uninstallInProgress = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private func showErrorAlert(message: String) {
|
||||
let alert = NSAlert()
|
||||
alert.messageText = "Uninstall Failed"
|
||||
alert.informativeText = message
|
||||
alert.alertStyle = .critical
|
||||
alert.addButton(withTitle: "OK")
|
||||
alert.runModal()
|
||||
}
|
||||
|
||||
private func moveAppToTrash() {
|
||||
guard let appURL = Bundle.main.bundleURL as URL? else { return }
|
||||
do {
|
||||
try FileManager.default.trashItem(at: appURL, resultingItemURL: nil)
|
||||
} catch {
|
||||
// If we can't trash the app, that's OK - user can do it manually
|
||||
// The important system components have already been cleaned up
|
||||
}
|
||||
}
|
||||
|
||||
private var buildTag: String {
|
||||
Bundle.main.infoDictionary?["EXOBuildTag"] as? String ?? "unknown"
|
||||
}
|
||||
|
||||
@@ -21,9 +21,7 @@ struct EXOApp: App {
|
||||
@StateObject private var localNetworkChecker: LocalNetworkChecker
|
||||
@StateObject private var updater: SparkleUpdater
|
||||
@StateObject private var thunderboltBridgeService: ThunderboltBridgeService
|
||||
@StateObject private var settingsWindowController: SettingsWindowController
|
||||
private let terminationObserver: TerminationObserver
|
||||
private let firstLaunchPopout = FirstLaunchPopout()
|
||||
private let ciContext = CIContext(options: nil)
|
||||
|
||||
init() {
|
||||
@@ -45,13 +43,12 @@ struct EXOApp: App {
|
||||
_updater = StateObject(wrappedValue: updater)
|
||||
let thunderboltBridge = ThunderboltBridgeService(clusterStateService: service)
|
||||
_thunderboltBridgeService = StateObject(wrappedValue: thunderboltBridge)
|
||||
_settingsWindowController = StateObject(wrappedValue: SettingsWindowController())
|
||||
enableLaunchAtLoginIfNeeded()
|
||||
// Install LaunchDaemon to disable Thunderbolt Bridge on startup (prevents network loops)
|
||||
NetworkSetupHelper.promptAndInstallIfNeeded()
|
||||
// Check local network access periodically (warning disappears when user grants permission)
|
||||
localNetwork.startPeriodicChecking(interval: 10)
|
||||
controller.scheduleLaunch(after: 5)
|
||||
controller.scheduleLaunch(after: 15)
|
||||
service.startPolling()
|
||||
networkStatus.startPolling()
|
||||
}
|
||||
@@ -65,12 +62,6 @@ struct EXOApp: App {
|
||||
.environmentObject(localNetworkChecker)
|
||||
.environmentObject(updater)
|
||||
.environmentObject(thunderboltBridgeService)
|
||||
.environmentObject(settingsWindowController)
|
||||
.onReceive(controller.$isFirstLaunchReady) { ready in
|
||||
if ready {
|
||||
firstLaunchPopout.show()
|
||||
}
|
||||
}
|
||||
} label: {
|
||||
menuBarIcon
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ import Foundation
|
||||
private let customNamespaceKey = "EXOCustomNamespace"
|
||||
private let hfTokenKey = "EXOHFToken"
|
||||
private let enableImageModelsKey = "EXOEnableImageModels"
|
||||
private let hasLaunchedBeforeKey = "EXOHasLaunchedBefore"
|
||||
|
||||
@MainActor
|
||||
final class ExoProcessController: ObservableObject {
|
||||
@@ -61,9 +60,6 @@ final class ExoProcessController: ObservableObject {
|
||||
}
|
||||
}
|
||||
|
||||
/// Fires once when EXO transitions to `.running` for the very first time (fresh install).
|
||||
@Published private(set) var isFirstLaunchReady = false
|
||||
|
||||
private var process: Process?
|
||||
private var runtimeDirectoryURL: URL?
|
||||
private var pendingLaunchTask: Task<Void, Never>?
|
||||
@@ -117,12 +113,6 @@ final class ExoProcessController: ObservableObject {
|
||||
try child.run()
|
||||
process = child
|
||||
status = .running
|
||||
|
||||
// Detect first-ever launch to trigger welcome popout
|
||||
if !UserDefaults.standard.bool(forKey: hasLaunchedBeforeKey) {
|
||||
UserDefaults.standard.set(true, forKey: hasLaunchedBeforeKey)
|
||||
isFirstLaunchReady = true
|
||||
}
|
||||
} catch {
|
||||
process = nil
|
||||
status = .failed(message: "Launch error")
|
||||
|
||||
@@ -1,181 +0,0 @@
|
||||
import AppKit
|
||||
import SwiftUI
|
||||
|
||||
/// A small floating callout that drops down from the menu bar area on first launch,
|
||||
/// pointing the user to the web dashboard. Clean, minimal, speech-bubble style.
|
||||
@MainActor
|
||||
final class FirstLaunchPopout {
|
||||
private var panel: NSPanel?
|
||||
private var countdownTask: Task<Void, Never>?
|
||||
private static let dashboardURL = "http://localhost:52415/"
|
||||
|
||||
func show() {
|
||||
guard panel == nil else { return }
|
||||
|
||||
let hostingView = NSHostingView(
|
||||
rootView: WelcomeCalloutView(
|
||||
onDismiss: { [weak self] in
|
||||
self?.dismiss()
|
||||
},
|
||||
onOpen: { [weak self] in
|
||||
self?.openDashboard()
|
||||
self?.dismiss()
|
||||
}))
|
||||
hostingView.frame = NSRect(x: 0, y: 0, width: 280, height: 100)
|
||||
|
||||
let window = NSPanel(
|
||||
contentRect: NSRect(x: 0, y: 0, width: 280, height: 100),
|
||||
styleMask: [.nonactivatingPanel, .fullSizeContentView],
|
||||
backing: .buffered,
|
||||
defer: false
|
||||
)
|
||||
window.contentView = hostingView
|
||||
window.isFloatingPanel = true
|
||||
window.level = .floating
|
||||
window.hasShadow = true
|
||||
window.isOpaque = false
|
||||
window.backgroundColor = .clear
|
||||
window.isMovableByWindowBackground = false
|
||||
window.hidesOnDeactivate = false
|
||||
window.collectionBehavior = [.canJoinAllSpaces, .stationary]
|
||||
window.titleVisibility = .hidden
|
||||
window.titlebarAppearsTransparent = true
|
||||
|
||||
// Position near top-right, just below the menu bar
|
||||
if let screen = NSScreen.main {
|
||||
let screenFrame = screen.visibleFrame
|
||||
let x = screenFrame.maxX - window.frame.width - 16
|
||||
let y = screenFrame.maxY - 8
|
||||
window.setFrameOrigin(NSPoint(x: x, y: y))
|
||||
}
|
||||
|
||||
window.alphaValue = 0
|
||||
window.orderFrontRegardless()
|
||||
panel = window
|
||||
|
||||
// Fade in
|
||||
NSAnimationContext.runAnimationGroup { context in
|
||||
context.duration = 0.3
|
||||
context.timingFunction = CAMediaTimingFunction(name: .easeOut)
|
||||
window.animator().alphaValue = 1
|
||||
}
|
||||
|
||||
// Auto-open dashboard after 5s then dismiss
|
||||
countdownTask = Task {
|
||||
try? await Task.sleep(nanoseconds: 5_000_000_000)
|
||||
if !Task.isCancelled {
|
||||
openDashboard()
|
||||
try? await Task.sleep(nanoseconds: 800_000_000)
|
||||
if !Task.isCancelled {
|
||||
dismiss()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func dismiss() {
|
||||
countdownTask?.cancel()
|
||||
countdownTask = nil
|
||||
guard let window = panel else { return }
|
||||
NSAnimationContext.runAnimationGroup({ context in
|
||||
context.duration = 0.2
|
||||
context.timingFunction = CAMediaTimingFunction(name: .easeIn)
|
||||
window.animator().alphaValue = 0
|
||||
}, completionHandler: {
|
||||
Task { @MainActor in
|
||||
window.close()
|
||||
}
|
||||
})
|
||||
panel = nil
|
||||
}
|
||||
|
||||
private func openDashboard() {
|
||||
guard let url = URL(string: Self.dashboardURL) else { return }
|
||||
NSWorkspace.shared.open(url)
|
||||
}
|
||||
}
|
||||
|
||||
/// Minimal welcome callout — friendly pointer, not a wall of text.
|
||||
private struct WelcomeCalloutView: View {
|
||||
let onDismiss: () -> Void
|
||||
let onOpen: () -> Void
|
||||
@State private var countdown = 5
|
||||
@State private var timerTask: Task<Void, Never>?
|
||||
@State private var appeared = false
|
||||
|
||||
var body: some View {
|
||||
VStack(alignment: .leading, spacing: 10) {
|
||||
HStack(alignment: .top) {
|
||||
Text("Welcome to EXO!")
|
||||
.font(.system(.headline, design: .rounded))
|
||||
.fontWeight(.semibold)
|
||||
.foregroundColor(.primary)
|
||||
Spacer()
|
||||
Button {
|
||||
onDismiss()
|
||||
} label: {
|
||||
Image(systemName: "xmark.circle.fill")
|
||||
.font(.system(size: 14))
|
||||
.foregroundStyle(.tertiary)
|
||||
}
|
||||
.buttonStyle(.plain)
|
||||
}
|
||||
|
||||
Text("Run your first model here:")
|
||||
.font(.system(.subheadline, design: .default))
|
||||
.foregroundColor(.secondary)
|
||||
|
||||
HStack {
|
||||
Button {
|
||||
onOpen()
|
||||
} label: {
|
||||
Label("Open Dashboard", systemImage: "arrow.up.right.square")
|
||||
.font(.system(.caption, design: .default))
|
||||
.fontWeight(.medium)
|
||||
}
|
||||
.buttonStyle(.borderedProminent)
|
||||
.tint(.accentColor)
|
||||
.controlSize(.small)
|
||||
|
||||
Spacer()
|
||||
|
||||
if countdown > 0 {
|
||||
Text("Opening in \(countdown)s")
|
||||
.font(.system(.caption2, design: .default))
|
||||
.foregroundColor(.secondary.opacity(0.6))
|
||||
.monospacedDigit()
|
||||
}
|
||||
}
|
||||
}
|
||||
.padding(14)
|
||||
.background {
|
||||
RoundedRectangle(cornerRadius: 12, style: .continuous)
|
||||
.fill(.ultraThinMaterial)
|
||||
.shadow(color: .black.opacity(0.15), radius: 12, y: 4)
|
||||
}
|
||||
.padding(4)
|
||||
.opacity(appeared ? 1 : 0)
|
||||
.offset(y: appeared ? 0 : -8)
|
||||
.onAppear {
|
||||
withAnimation(.easeOut(duration: 0.35).delay(0.05)) {
|
||||
appeared = true
|
||||
}
|
||||
startCountdown()
|
||||
}
|
||||
.onDisappear {
|
||||
timerTask?.cancel()
|
||||
timerTask = nil
|
||||
}
|
||||
}
|
||||
|
||||
private func startCountdown() {
|
||||
timerTask = Task {
|
||||
while countdown > 0 {
|
||||
try? await Task.sleep(nanoseconds: 1_000_000_000)
|
||||
if !Task.isCancelled {
|
||||
countdown -= 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,478 +0,0 @@
|
||||
import AppKit
|
||||
import SwiftUI
|
||||
|
||||
/// Native macOS Settings window following Apple HIG.
|
||||
/// Organized into General, Model, Advanced, and About sections.
|
||||
struct SettingsView: View {
|
||||
@EnvironmentObject private var controller: ExoProcessController
|
||||
@EnvironmentObject private var updater: SparkleUpdater
|
||||
@EnvironmentObject private var networkStatusService: NetworkStatusService
|
||||
@EnvironmentObject private var thunderboltBridgeService: ThunderboltBridgeService
|
||||
@EnvironmentObject private var stateService: ClusterStateService
|
||||
|
||||
@State private var pendingNamespace: String = ""
|
||||
@State private var pendingHFToken: String = ""
|
||||
@State private var pendingEnableImageModels = false
|
||||
@State private var needsRestart = false
|
||||
@State private var bugReportInFlight = false
|
||||
@State private var bugReportMessage: String?
|
||||
@State private var uninstallInProgress = false
|
||||
|
||||
var body: some View {
|
||||
TabView {
|
||||
generalTab
|
||||
.tabItem {
|
||||
Label("General", systemImage: "gear")
|
||||
}
|
||||
modelTab
|
||||
.tabItem {
|
||||
Label("Model", systemImage: "cube")
|
||||
}
|
||||
advancedTab
|
||||
.tabItem {
|
||||
Label("Advanced", systemImage: "wrench.and.screwdriver")
|
||||
}
|
||||
aboutTab
|
||||
.tabItem {
|
||||
Label("About", systemImage: "info.circle")
|
||||
}
|
||||
}
|
||||
.frame(width: 450, height: 400)
|
||||
.onAppear {
|
||||
pendingNamespace = controller.customNamespace
|
||||
pendingHFToken = controller.hfToken
|
||||
pendingEnableImageModels = controller.enableImageModels
|
||||
needsRestart = false
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - General Tab
|
||||
|
||||
private var generalTab: some View {
|
||||
Form {
|
||||
Section {
|
||||
LabeledContent("Cluster Namespace") {
|
||||
TextField("default", text: $pendingNamespace)
|
||||
.textFieldStyle(.roundedBorder)
|
||||
.frame(width: 200)
|
||||
}
|
||||
Text("Nodes with the same namespace form a cluster. Leave empty for default.")
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
}
|
||||
|
||||
Section {
|
||||
LabeledContent("HuggingFace Token") {
|
||||
SecureField("optional", text: $pendingHFToken)
|
||||
.textFieldStyle(.roundedBorder)
|
||||
.frame(width: 200)
|
||||
}
|
||||
Text("Required for gated models. Get yours at huggingface.co/settings/tokens")
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
}
|
||||
|
||||
Section {
|
||||
HStack {
|
||||
Spacer()
|
||||
Button("Save & Restart") {
|
||||
applyGeneralSettings()
|
||||
}
|
||||
.disabled(!hasGeneralChanges)
|
||||
}
|
||||
}
|
||||
}
|
||||
.formStyle(.grouped)
|
||||
.padding()
|
||||
}
|
||||
|
||||
// MARK: - Model Tab
|
||||
|
||||
private var modelTab: some View {
|
||||
Form {
|
||||
Section {
|
||||
Toggle("Enable Image Models (experimental)", isOn: $pendingEnableImageModels)
|
||||
Text("Allow text-to-image and image-to-image models in the model picker.")
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
}
|
||||
|
||||
Section {
|
||||
HStack {
|
||||
Spacer()
|
||||
Button("Save & Restart") {
|
||||
applyModelSettings()
|
||||
}
|
||||
.disabled(!hasModelChanges)
|
||||
}
|
||||
}
|
||||
}
|
||||
.formStyle(.grouped)
|
||||
.padding()
|
||||
}
|
||||
|
||||
// MARK: - Advanced Tab
|
||||
|
||||
private var advancedTab: some View {
|
||||
Form {
|
||||
Section("Onboarding") {
|
||||
HStack {
|
||||
VStack(alignment: .leading) {
|
||||
Text("Reset Onboarding")
|
||||
Text("Opens the dashboard and resets the onboarding wizard.")
|
||||
.font(.caption)
|
||||
.foregroundColor(.secondary)
|
||||
}
|
||||
Spacer()
|
||||
Button("Reset") {
|
||||
guard let url = URL(string: "http://localhost:52415/?reset-onboarding")
|
||||
else { return }
|
||||
NSWorkspace.shared.open(url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Section("Debug Info") {
|
||||
LabeledContent("Thunderbolt Bridge") {
|
||||
Text(thunderboltStatusText)
|
||||
.foregroundColor(thunderboltStatusColor)
|
||||
}
|
||||
|
||||
VStack(alignment: .leading, spacing: 2) {
|
||||
clusterThunderboltBridgeView
|
||||
}
|
||||
|
||||
VStack(alignment: .leading, spacing: 2) {
|
||||
interfaceIpList
|
||||
}
|
||||
|
||||
VStack(alignment: .leading, spacing: 2) {
|
||||
rdmaStatusView
|
||||
}
|
||||
|
||||
sendBugReportButton
|
||||
}
|
||||
|
||||
Section("Danger Zone") {
|
||||
Button(role: .destructive) {
|
||||
showUninstallConfirmationAlert()
|
||||
} label: {
|
||||
HStack {
|
||||
Text("Uninstall EXO")
|
||||
Spacer()
|
||||
Image(systemName: "trash")
|
||||
.imageScale(.small)
|
||||
}
|
||||
}
|
||||
.disabled(uninstallInProgress)
|
||||
}
|
||||
}
|
||||
.formStyle(.grouped)
|
||||
.padding()
|
||||
}
|
||||
|
||||
// MARK: - About Tab
|
||||
|
||||
private var aboutTab: some View {
|
||||
Form {
|
||||
Section {
|
||||
LabeledContent("Version") {
|
||||
Text(buildTag)
|
||||
.textSelection(.enabled)
|
||||
}
|
||||
LabeledContent("Commit") {
|
||||
Text(buildCommit)
|
||||
.font(.system(.body, design: .monospaced))
|
||||
.textSelection(.enabled)
|
||||
}
|
||||
}
|
||||
|
||||
Section {
|
||||
Button("Check for Updates") {
|
||||
updater.checkForUpdates()
|
||||
}
|
||||
}
|
||||
}
|
||||
.formStyle(.grouped)
|
||||
.padding()
|
||||
}
|
||||
|
||||
// MARK: - Debug Info Views (moved from ContentView)
|
||||
|
||||
private var thunderboltStatusText: String {
|
||||
switch networkStatusService.status.thunderboltBridgeState {
|
||||
case .some(.disabled):
|
||||
return "Disabled"
|
||||
case .some(.deleted):
|
||||
return "Deleted"
|
||||
case .some(.enabled):
|
||||
return "Enabled"
|
||||
case nil:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
private var thunderboltStatusColor: Color {
|
||||
switch networkStatusService.status.thunderboltBridgeState {
|
||||
case .some(.disabled), .some(.deleted):
|
||||
return .green
|
||||
case .some(.enabled):
|
||||
return .red
|
||||
case nil:
|
||||
return .secondary
|
||||
}
|
||||
}
|
||||
|
||||
private var clusterThunderboltBridgeView: some View {
|
||||
let bridgeStatuses = stateService.latestSnapshot?.nodeThunderboltBridge ?? [:]
|
||||
let localNodeId = stateService.localNodeId
|
||||
let nodeProfiles = stateService.latestSnapshot?.nodeProfiles ?? [:]
|
||||
|
||||
return VStack(alignment: .leading, spacing: 1) {
|
||||
if bridgeStatuses.isEmpty {
|
||||
Text("Cluster TB Bridge: No data")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
} else {
|
||||
Text("Cluster TB Bridge Status:")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
ForEach(Array(bridgeStatuses.keys.sorted()), id: \.self) { nodeId in
|
||||
if let status = bridgeStatuses[nodeId] {
|
||||
let nodeName =
|
||||
nodeProfiles[nodeId]?.friendlyName ?? String(nodeId.prefix(8))
|
||||
let isLocal = nodeId == localNodeId
|
||||
let prefix = isLocal ? " \(nodeName) (local):" : " \(nodeName):"
|
||||
let statusText =
|
||||
!status.exists
|
||||
? "N/A"
|
||||
: (status.enabled ? "Enabled" : "Disabled")
|
||||
let color: Color =
|
||||
!status.exists
|
||||
? .secondary
|
||||
: (status.enabled ? .red : .green)
|
||||
Text("\(prefix) \(statusText)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(color)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var interfaceIpList: some View {
|
||||
let statuses = networkStatusService.status.interfaceStatuses
|
||||
return VStack(alignment: .leading, spacing: 1) {
|
||||
Text("Interfaces (en0–en7):")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
if statuses.isEmpty {
|
||||
Text(" Unknown")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
} else {
|
||||
ForEach(statuses, id: \.interfaceName) { status in
|
||||
let ipText = status.ipAddress ?? "No IP"
|
||||
Text(" \(status.interfaceName): \(ipText)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(status.ipAddress == nil ? .red : .green)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var rdmaStatusView: some View {
|
||||
let rdmaStatuses = stateService.latestSnapshot?.nodeRdmaCtl ?? [:]
|
||||
let localNodeId = stateService.localNodeId
|
||||
let nodeProfiles = stateService.latestSnapshot?.nodeProfiles ?? [:]
|
||||
let localDevices = networkStatusService.status.localRdmaDevices
|
||||
let localPorts = networkStatusService.status.localRdmaActivePorts
|
||||
|
||||
return VStack(alignment: .leading, spacing: 1) {
|
||||
if rdmaStatuses.isEmpty {
|
||||
Text("Cluster RDMA: No data")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
} else {
|
||||
Text("Cluster RDMA Status:")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
ForEach(Array(rdmaStatuses.keys.sorted()), id: \.self) { nodeId in
|
||||
if let status = rdmaStatuses[nodeId] {
|
||||
let nodeName =
|
||||
nodeProfiles[nodeId]?.friendlyName ?? String(nodeId.prefix(8))
|
||||
let isLocal = nodeId == localNodeId
|
||||
let prefix = isLocal ? " \(nodeName) (local):" : " \(nodeName):"
|
||||
let statusText = status.enabled ? "Enabled" : "Disabled"
|
||||
let color: Color = status.enabled ? .green : .orange
|
||||
Text("\(prefix) \(statusText)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(color)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !localDevices.isEmpty {
|
||||
Text(" Local Devices: \(localDevices.joined(separator: ", "))")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
}
|
||||
if !localPorts.isEmpty {
|
||||
Text(" Local Active Ports:")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
ForEach(localPorts, id: \.device) { port in
|
||||
Text(" \(port.device) port \(port.port): \(port.state)")
|
||||
.font(.caption2)
|
||||
.foregroundColor(.green)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var sendBugReportButton: some View {
|
||||
VStack(alignment: .leading, spacing: 4) {
|
||||
Button {
|
||||
Task {
|
||||
await sendBugReport()
|
||||
}
|
||||
} label: {
|
||||
HStack {
|
||||
if bugReportInFlight {
|
||||
ProgressView()
|
||||
.scaleEffect(0.6)
|
||||
}
|
||||
Text("Send Bug Report")
|
||||
.font(.caption)
|
||||
.fontWeight(.semibold)
|
||||
Spacer()
|
||||
}
|
||||
}
|
||||
.disabled(bugReportInFlight)
|
||||
|
||||
if let message = bugReportMessage {
|
||||
Text(message)
|
||||
.font(.caption2)
|
||||
.foregroundColor(.secondary)
|
||||
.fixedSize(horizontal: false, vertical: true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Actions
|
||||
|
||||
private func sendBugReport() async {
|
||||
bugReportInFlight = true
|
||||
bugReportMessage = "Collecting logs..."
|
||||
let service = BugReportService()
|
||||
do {
|
||||
let outcome = try await service.sendReport(isManual: true)
|
||||
bugReportMessage = outcome.message
|
||||
} catch {
|
||||
bugReportMessage = error.localizedDescription
|
||||
}
|
||||
bugReportInFlight = false
|
||||
}
|
||||
|
||||
private func showUninstallConfirmationAlert() {
|
||||
let alert = NSAlert()
|
||||
alert.messageText = "Uninstall EXO"
|
||||
alert.informativeText = """
|
||||
This will remove EXO and all its system components:
|
||||
|
||||
• Network configuration daemon
|
||||
• Launch at login registration
|
||||
• EXO network location
|
||||
|
||||
The app will be moved to Trash.
|
||||
"""
|
||||
alert.alertStyle = .warning
|
||||
alert.addButton(withTitle: "Uninstall")
|
||||
alert.addButton(withTitle: "Cancel")
|
||||
|
||||
if let uninstallButton = alert.buttons.first {
|
||||
uninstallButton.hasDestructiveAction = true
|
||||
}
|
||||
|
||||
let response = alert.runModal()
|
||||
if response == .alertFirstButtonReturn {
|
||||
performUninstall()
|
||||
}
|
||||
}
|
||||
|
||||
private func performUninstall() {
|
||||
uninstallInProgress = true
|
||||
|
||||
controller.cancelPendingLaunch()
|
||||
controller.stop()
|
||||
stateService.stopPolling()
|
||||
|
||||
DispatchQueue.global(qos: .utility).async {
|
||||
do {
|
||||
try NetworkSetupHelper.uninstall()
|
||||
|
||||
DispatchQueue.main.async {
|
||||
LaunchAtLoginHelper.disable()
|
||||
self.moveAppToTrash()
|
||||
|
||||
DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) {
|
||||
NSApplication.shared.terminate(nil)
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
DispatchQueue.main.async {
|
||||
let errorAlert = NSAlert()
|
||||
errorAlert.messageText = "Uninstall Failed"
|
||||
errorAlert.informativeText = error.localizedDescription
|
||||
errorAlert.alertStyle = .critical
|
||||
errorAlert.addButton(withTitle: "OK")
|
||||
errorAlert.runModal()
|
||||
self.uninstallInProgress = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private func moveAppToTrash() {
|
||||
guard let appURL = Bundle.main.bundleURL as URL? else { return }
|
||||
do {
|
||||
try FileManager.default.trashItem(at: appURL, resultingItemURL: nil)
|
||||
} catch {
|
||||
// If we can't trash the app, that's OK - user can do it manually
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Helpers
|
||||
|
||||
private var hasGeneralChanges: Bool {
|
||||
pendingNamespace != controller.customNamespace || pendingHFToken != controller.hfToken
|
||||
}
|
||||
|
||||
private var hasModelChanges: Bool {
|
||||
pendingEnableImageModels != controller.enableImageModels
|
||||
}
|
||||
|
||||
private func applyGeneralSettings() {
|
||||
controller.customNamespace = pendingNamespace
|
||||
controller.hfToken = pendingHFToken
|
||||
restartIfRunning()
|
||||
}
|
||||
|
||||
private func applyModelSettings() {
|
||||
controller.enableImageModels = pendingEnableImageModels
|
||||
restartIfRunning()
|
||||
}
|
||||
|
||||
private func restartIfRunning() {
|
||||
if controller.status == .running || controller.status == .starting {
|
||||
controller.restart()
|
||||
}
|
||||
}
|
||||
|
||||
private var buildTag: String {
|
||||
Bundle.main.infoDictionary?["EXOBuildTag"] as? String ?? "unknown"
|
||||
}
|
||||
|
||||
private var buildCommit: String {
|
||||
Bundle.main.infoDictionary?["EXOBuildCommit"] as? String ?? "unknown"
|
||||
}
|
||||
}
|
||||
@@ -1,47 +0,0 @@
|
||||
import AppKit
|
||||
import SwiftUI
|
||||
|
||||
/// Manages a standalone native macOS Settings window.
|
||||
/// Ensures only one instance exists and brings it to front on repeated opens.
|
||||
@MainActor
|
||||
final class SettingsWindowController: ObservableObject {
|
||||
private var window: NSWindow?
|
||||
|
||||
func open(
|
||||
controller: ExoProcessController,
|
||||
updater: SparkleUpdater,
|
||||
networkStatusService: NetworkStatusService,
|
||||
thunderboltBridgeService: ThunderboltBridgeService,
|
||||
stateService: ClusterStateService
|
||||
) {
|
||||
if let existing = window, existing.isVisible {
|
||||
existing.makeKeyAndOrderFront(nil)
|
||||
NSApp.activate(ignoringOtherApps: true)
|
||||
return
|
||||
}
|
||||
|
||||
let settingsView = SettingsView()
|
||||
.environmentObject(controller)
|
||||
.environmentObject(updater)
|
||||
.environmentObject(networkStatusService)
|
||||
.environmentObject(thunderboltBridgeService)
|
||||
.environmentObject(stateService)
|
||||
|
||||
let hostingView = NSHostingView(rootView: settingsView)
|
||||
|
||||
let newWindow = NSWindow(
|
||||
contentRect: NSRect(x: 0, y: 0, width: 450, height: 400),
|
||||
styleMask: [.titled, .closable],
|
||||
backing: .buffered,
|
||||
defer: false
|
||||
)
|
||||
newWindow.title = "EXO Settings"
|
||||
newWindow.contentView = hostingView
|
||||
newWindow.center()
|
||||
newWindow.isReleasedWhenClosed = false
|
||||
newWindow.makeKeyAndOrderFront(nil)
|
||||
NSApp.activate(ignoringOtherApps: true)
|
||||
|
||||
window = newWindow
|
||||
}
|
||||
}
|
||||
1
conftest.py
Normal file
1
conftest.py
Normal file
@@ -0,0 +1 @@
|
||||
collect_ignore = ["tests/start_distributed_test.py"]
|
||||
@@ -28,7 +28,6 @@
|
||||
showModelSelector?: boolean;
|
||||
modelTasks?: Record<string, string[]>;
|
||||
modelCapabilities?: Record<string, string[]>;
|
||||
onSend?: () => void;
|
||||
}
|
||||
|
||||
let {
|
||||
@@ -39,7 +38,6 @@
|
||||
showModelSelector = false,
|
||||
modelTasks = {},
|
||||
modelCapabilities = {},
|
||||
onSend,
|
||||
}: Props = $props();
|
||||
|
||||
let message = $state("");
|
||||
@@ -302,8 +300,6 @@
|
||||
);
|
||||
}
|
||||
|
||||
onSend?.();
|
||||
|
||||
// Refocus the textarea after sending
|
||||
setTimeout(() => textareaRef?.focus(), 10);
|
||||
}
|
||||
|
||||
@@ -307,7 +307,7 @@
|
||||
<div class="py-2">
|
||||
<div class="px-4 py-2">
|
||||
<span
|
||||
class="text-xs text-exo-light-gray font-mono tracking-wider uppercase"
|
||||
class="text-sm text-white/70 font-mono tracking-wider uppercase"
|
||||
>
|
||||
{searchQuery ? "SEARCH RESULTS" : "CONVERSATIONS"}
|
||||
</span>
|
||||
@@ -376,37 +376,39 @@
|
||||
onkeydown={(e) =>
|
||||
e.key === "Enter" &&
|
||||
handleSelectConversation(conversation.id)}
|
||||
class="group w-full flex items-center justify-between p-2.5 rounded-lg mb-1 transition-all text-left cursor-pointer
|
||||
class="group w-full flex items-center justify-between p-2 rounded mb-1 transition-all text-left cursor-pointer
|
||||
{activeId === conversation.id
|
||||
? 'bg-exo-yellow/5 border border-exo-yellow/30'
|
||||
: 'hover:bg-white/[0.03] hover:border-white/10 border border-transparent'}"
|
||||
? 'bg-transparent border border-exo-yellow/30'
|
||||
: 'hover:border-exo-yellow/20 border border-transparent'}"
|
||||
>
|
||||
<div class="flex-1 min-w-0 pr-2">
|
||||
<div
|
||||
class="text-sm font-medium truncate {activeId ===
|
||||
conversation.id
|
||||
class="text-sm truncate {activeId === conversation.id
|
||||
? 'text-exo-yellow'
|
||||
: 'text-white'}"
|
||||
: 'text-white/90'}"
|
||||
>
|
||||
{conversation.name}
|
||||
</div>
|
||||
<div class="text-xs text-white/60 mt-0.5">
|
||||
<div class="text-sm text-white/50 mt-0.5">
|
||||
{formatDate(conversation.updatedAt)}
|
||||
</div>
|
||||
<div class="text-xs text-exo-light-gray truncate">
|
||||
<div class="text-sm text-white/70 truncate">
|
||||
{info.modelLabel}
|
||||
</div>
|
||||
<div class="text-xs text-white/60 font-mono">
|
||||
Strategy: <span class="text-white/80"
|
||||
>{info.strategyLabel}</span
|
||||
>
|
||||
</div>
|
||||
{#if stats}
|
||||
<div class="text-xs text-white/70 font-mono mt-1">
|
||||
{#if stats.ttftMs}<span class="text-white/50">TTFT</span>
|
||||
<span class="text-exo-yellow/80"
|
||||
>{stats.ttftMs.toFixed(0)}ms</span
|
||||
>{/if}{#if stats.ttftMs && stats.tps}<span
|
||||
class="text-white/30 mx-1.5">·</span
|
||||
>{/if}{#if stats.tps}<span class="text-exo-yellow/80"
|
||||
>{stats.tps.toFixed(1)}</span
|
||||
>
|
||||
<span class="text-white/50">tok/s</span>{/if}
|
||||
<div class="text-xs text-white/60 font-mono mt-1">
|
||||
{#if stats.ttftMs}<span class="text-white/40">TTFT</span>
|
||||
{stats.ttftMs.toFixed(
|
||||
0,
|
||||
)}ms{/if}{#if stats.ttftMs && stats.tps}<span
|
||||
class="text-white/30 mx-1.5">•</span
|
||||
>{/if}{#if stats.tps}{stats.tps.toFixed(1)}
|
||||
<span class="text-white/40">tok/s</span>{/if}
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
@@ -1805,7 +1805,7 @@ class AppStore {
|
||||
assistantMessage.id,
|
||||
(msg) => {
|
||||
msg.content =
|
||||
"No model is loaded yet. Select a model from the sidebar to get started — it will download and load automatically.";
|
||||
"Error: No model available. Please launch an instance first.";
|
||||
},
|
||||
);
|
||||
this.syncActiveMessagesIfNeeded(targetConversationId);
|
||||
@@ -2243,7 +2243,7 @@ class AppStore {
|
||||
const modelToUse = this.getModelForRequest();
|
||||
if (!modelToUse) {
|
||||
throw new Error(
|
||||
"No model is loaded yet. Select a model from the sidebar to get started — it will download and load automatically.",
|
||||
"No model selected and no running instances available. Please launch an instance first.",
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
58
e2e/Dockerfile
Normal file
58
e2e/Dockerfile
Normal file
@@ -0,0 +1,58 @@
|
||||
# Stage 1: Build the dashboard
|
||||
FROM node:22-slim AS dashboard
|
||||
WORKDIR /app/dashboard
|
||||
COPY dashboard/package.json dashboard/package-lock.json ./
|
||||
RUN npm ci
|
||||
COPY dashboard/ .
|
||||
RUN npm run build
|
||||
|
||||
# Stage 2: Build and run exo
|
||||
FROM python:3.13-slim
|
||||
|
||||
# Install system dependencies
|
||||
# libblas-dev/liblapack-dev/liblapacke-dev are required by MLX CPU backend on Linux
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
pkg-config \
|
||||
libssl-dev \
|
||||
libblas-dev \
|
||||
liblapack-dev \
|
||||
liblapacke-dev \
|
||||
curl \
|
||||
protobuf-compiler \
|
||||
iptables \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Rust nightly
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain nightly
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
# Wrap g++ with -fpermissive to fix MLX CPU JIT compilation with GCC 14
|
||||
# (GCC 14 treats _Float128/_Float32/_Float64 as built-in types, conflicting with MLX-generated code)
|
||||
# Must be done BEFORE uv sync so any source builds also get the fix
|
||||
RUN mv /usr/bin/g++ /usr/bin/g++.real && \
|
||||
printf '#!/bin/sh\nexec /usr/bin/g++.real -fpermissive "$@"\n' > /usr/bin/g++ && \
|
||||
chmod +x /usr/bin/g++
|
||||
|
||||
# Install uv
|
||||
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy dependency files first for better layer caching
|
||||
COPY pyproject.toml Cargo.toml uv.lock README.md ./
|
||||
COPY rust/ ./rust/
|
||||
COPY bench/pyproject.toml ./bench/pyproject.toml
|
||||
|
||||
# Copy source and resources
|
||||
COPY src/ ./src/
|
||||
COPY resources/ ./resources/
|
||||
|
||||
# Copy built dashboard from stage 1
|
||||
COPY --from=dashboard /app/dashboard/build ./dashboard/build/
|
||||
|
||||
# Install Python deps and build Rust bindings, then clean up build artifacts
|
||||
# to keep the layer small (Rust target/ and cargo registry can be 1-2 GB)
|
||||
RUN uv sync && rm -rf /app/rust/target /root/.cargo/registry /root/.cargo/git
|
||||
|
||||
CMD [".venv/bin/exo", "-v"]
|
||||
195
e2e/conftest.py
Normal file
195
e2e/conftest.py
Normal file
@@ -0,0 +1,195 @@
|
||||
"""Shared E2E test infrastructure for exo cluster tests."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from urllib.error import URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
E2E_DIR = Path(__file__).parent.resolve()
|
||||
TIMEOUT = int(os.environ.get("E2E_TIMEOUT", "120"))
|
||||
|
||||
|
||||
class Cluster:
|
||||
"""Async wrapper around a docker compose exo cluster."""
|
||||
|
||||
def __init__(self, name: str, overrides: list[str] | None = None):
|
||||
self.name = name
|
||||
self.project = f"e2e-{name}"
|
||||
compose_files = [str(E2E_DIR / "docker-compose.yml")]
|
||||
for path in overrides or []:
|
||||
compose_files.append(str(E2E_DIR / path))
|
||||
self._compose_base = [
|
||||
"docker",
|
||||
"compose",
|
||||
"-p",
|
||||
self.project,
|
||||
*[arg for f in compose_files for arg in ("-f", f)],
|
||||
]
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def __aexit__(self, *exc):
|
||||
await self.stop()
|
||||
|
||||
async def _run(self, *args: str, check: bool = True) -> str:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*self._compose_base,
|
||||
*args,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.STDOUT,
|
||||
)
|
||||
stdout, _ = await proc.communicate()
|
||||
output = stdout.decode()
|
||||
if check and proc.returncode != 0:
|
||||
print(output, file=sys.stderr)
|
||||
raise RuntimeError(
|
||||
f"docker compose {' '.join(args)} failed (rc={proc.returncode})"
|
||||
)
|
||||
return output
|
||||
|
||||
async def build(self):
|
||||
# Skip build if the image was pre-built (e.g. in CI with buildx cache)
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
"docker",
|
||||
"image",
|
||||
"inspect",
|
||||
"exo-e2e:latest",
|
||||
stdout=asyncio.subprocess.DEVNULL,
|
||||
stderr=asyncio.subprocess.DEVNULL,
|
||||
)
|
||||
await proc.wait()
|
||||
if proc.returncode == 0:
|
||||
print(" Using pre-built image (exo-e2e:latest)")
|
||||
return
|
||||
print(" Building images...")
|
||||
await self._run("build", "--quiet")
|
||||
|
||||
async def start(self):
|
||||
print(" Starting cluster...")
|
||||
await self._run("up", "-d")
|
||||
|
||||
async def stop(self):
|
||||
print(" Cleaning up...")
|
||||
await self._run("down", "--timeout", "5", check=False)
|
||||
|
||||
async def logs(self) -> str:
|
||||
return await self._run("logs", check=False)
|
||||
|
||||
async def exec(
|
||||
self, service: str, *cmd: str, check: bool = True
|
||||
) -> tuple[int, str]:
|
||||
"""Run a command inside a running container. Returns (returncode, output)."""
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*self._compose_base,
|
||||
"exec",
|
||||
"-T",
|
||||
service,
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.STDOUT,
|
||||
)
|
||||
stdout, _ = await proc.communicate()
|
||||
output = stdout.decode()
|
||||
if check and proc.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"exec {' '.join(cmd)} in {service} failed (rc={proc.returncode})"
|
||||
)
|
||||
return proc.returncode, output
|
||||
|
||||
async def wait_for(self, description: str, check_fn, timeout: int = TIMEOUT):
|
||||
"""Poll check_fn every 2s until it returns True or timeout expires."""
|
||||
print(f" Waiting for {description}...")
|
||||
deadline = asyncio.get_event_loop().time() + timeout
|
||||
while asyncio.get_event_loop().time() < deadline:
|
||||
if await check_fn():
|
||||
print(f" {description}")
|
||||
return
|
||||
await asyncio.sleep(2)
|
||||
output = await self.logs()
|
||||
print(f"--- cluster logs ---\n{output}\n---", file=sys.stderr)
|
||||
raise TimeoutError(f"Timed out waiting for {description}")
|
||||
|
||||
async def assert_healthy(self):
|
||||
"""Verify the cluster formed correctly: nodes started, discovered each other, elected a master, API responds."""
|
||||
|
||||
async def both_nodes_started():
|
||||
log = await self.logs()
|
||||
return log.count("Starting node") >= 2
|
||||
|
||||
async def nodes_discovered():
|
||||
log = await self.logs()
|
||||
return log.count("ConnectionMessageType.Connected") >= 2
|
||||
|
||||
async def master_elected():
|
||||
log = await self.logs()
|
||||
return "demoting self" in log
|
||||
|
||||
async def api_responding():
|
||||
try:
|
||||
with urlopen("http://localhost:52415/v1/models", timeout=3) as resp:
|
||||
return resp.status == 200
|
||||
except (URLError, OSError):
|
||||
return False
|
||||
|
||||
await self.wait_for("Both nodes started", both_nodes_started)
|
||||
await self.wait_for("Nodes discovered each other", nodes_discovered)
|
||||
await self.wait_for("Master election resolved", master_elected)
|
||||
await self.wait_for("API responding", api_responding)
|
||||
|
||||
async def _api(
|
||||
self, method: str, path: str, body: dict | None = None, timeout: int = 30
|
||||
) -> dict:
|
||||
"""Make an API request to the cluster. Returns parsed JSON."""
|
||||
url = f"http://localhost:52415{path}"
|
||||
data = json.dumps(body).encode() if body else None
|
||||
req = Request(
|
||||
url, data=data, headers={"Content-Type": "application/json"}, method=method
|
||||
)
|
||||
loop = asyncio.get_event_loop()
|
||||
resp_bytes = await loop.run_in_executor(
|
||||
None, lambda: urlopen(req, timeout=timeout).read()
|
||||
)
|
||||
return json.loads(resp_bytes)
|
||||
|
||||
async def place_model(self, model: str, timeout: int = 600):
|
||||
"""Place a model instance on the cluster (triggers download) and wait until it's ready."""
|
||||
await self._api("POST", "/place_instance", {"model_id": model})
|
||||
|
||||
async def model_ready():
|
||||
try:
|
||||
resp = await self._api("GET", "/v1/models")
|
||||
return any(m.get("id") == model for m in resp.get("data", []))
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
await self.wait_for(f"Model {model} ready", model_ready, timeout=timeout)
|
||||
|
||||
async def chat(
|
||||
self, model: str, messages: list[dict], timeout: int = 600, **kwargs
|
||||
) -> dict:
|
||||
"""Send a chat completion request. Retries until model is downloaded and inference completes."""
|
||||
body = json.dumps({"model": model, "messages": messages, **kwargs}).encode()
|
||||
deadline = asyncio.get_event_loop().time() + timeout
|
||||
last_error = None
|
||||
|
||||
while asyncio.get_event_loop().time() < deadline:
|
||||
try:
|
||||
req = Request(
|
||||
"http://localhost:52415/v1/chat/completions",
|
||||
data=body,
|
||||
headers={"Content-Type": "application/json"},
|
||||
)
|
||||
loop = asyncio.get_event_loop()
|
||||
resp_bytes = await loop.run_in_executor(
|
||||
None, lambda r=req: urlopen(r, timeout=300).read()
|
||||
)
|
||||
return json.loads(resp_bytes)
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
await asyncio.sleep(5)
|
||||
|
||||
raise TimeoutError(f"Chat request failed after {timeout}s: {last_error}")
|
||||
20
e2e/docker-compose.yml
Normal file
20
e2e/docker-compose.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
services:
|
||||
exo-node-1:
|
||||
image: exo-e2e:latest
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: e2e/Dockerfile
|
||||
environment:
|
||||
- EXO_LIBP2P_NAMESPACE=docker-e2e
|
||||
command: [".venv/bin/exo", "-v"]
|
||||
ports:
|
||||
- "52415:52415"
|
||||
|
||||
exo-node-2:
|
||||
image: exo-e2e:latest
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: e2e/Dockerfile
|
||||
environment:
|
||||
- EXO_LIBP2P_NAMESPACE=docker-e2e
|
||||
command: [".venv/bin/exo", "-v"]
|
||||
83
e2e/run_all.py
Normal file
83
e2e/run_all.py
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Discovers and runs all E2E tests in e2e/test_*.py.
|
||||
|
||||
Tests with '# slow' on the first line of their docstring are skipped
|
||||
unless --slow is passed or E2E_SLOW=1 is set.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
E2E_DIR = Path(__file__).parent.resolve()
|
||||
|
||||
|
||||
def is_slow(test_file: Path) -> bool:
|
||||
"""Check if the test file is marked as slow (has '# slow' in first 3 lines)."""
|
||||
with open(test_file) as f:
|
||||
for line in f:
|
||||
if line.strip().startswith("#"):
|
||||
continue
|
||||
if line.strip().startswith('"""') or line.strip().startswith("'''"):
|
||||
# Read into the docstring
|
||||
for doc_line in f:
|
||||
if "slow" in doc_line.lower() and doc_line.strip().startswith(
|
||||
"slow"
|
||||
):
|
||||
return True
|
||||
if '"""' in doc_line or "'''" in doc_line:
|
||||
break
|
||||
break
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
run_slow = "--slow" in sys.argv or os.environ.get("E2E_SLOW") == "1"
|
||||
if "--update-snapshots" in sys.argv:
|
||||
os.environ["UPDATE_SNAPSHOTS"] = "1"
|
||||
test_files = sorted(E2E_DIR.glob("test_*.py"))
|
||||
if not test_files:
|
||||
print("No test files found")
|
||||
sys.exit(1)
|
||||
|
||||
passed = 0
|
||||
failed = 0
|
||||
skipped = 0
|
||||
failures = []
|
||||
|
||||
for test_file in test_files:
|
||||
name = test_file.stem
|
||||
if is_slow(test_file) and not run_slow:
|
||||
print(f"=== {name} === SKIPPED (slow, use --slow to run)")
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
print(f"=== {name} ===")
|
||||
result = subprocess.run([sys.executable, str(test_file)])
|
||||
if result.returncode == 0:
|
||||
passed += 1
|
||||
else:
|
||||
# Retry once — Docker networking (mDNS) can be slow on first boot
|
||||
print(f"\n=== {name} === RETRYING (attempt 2/2)")
|
||||
result = subprocess.run([sys.executable, str(test_file)])
|
||||
if result.returncode == 0:
|
||||
passed += 1
|
||||
else:
|
||||
failed += 1
|
||||
failures.append(name)
|
||||
print()
|
||||
|
||||
total = passed + failed + skipped
|
||||
print("================================")
|
||||
print(
|
||||
f"{passed}/{total} tests passed" + (f", {skipped} skipped" if skipped else "")
|
||||
)
|
||||
|
||||
if failed:
|
||||
print(f"Failed: {' '.join(failures)}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
78
e2e/snapshot.py
Normal file
78
e2e/snapshot.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Snapshot testing infrastructure for E2E tests.
|
||||
|
||||
Provides deterministic regression testing by comparing inference output
|
||||
against committed baseline snapshots. Tests FAIL if no baseline exists —
|
||||
baselines must be explicitly generated and committed.
|
||||
|
||||
Generate baselines: UPDATE_SNAPSHOTS=1 python3 e2e/run_all.py --slow
|
||||
Update after intentional changes: UPDATE_SNAPSHOTS=1 python3 e2e/run_all.py --slow
|
||||
|
||||
Snapshots are stored per-architecture (e.g. snapshots/x86_64/, snapshots/arm64/)
|
||||
since floating-point results differ between CPU architectures.
|
||||
"""
|
||||
|
||||
import difflib
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
|
||||
ARCH = platform.machine()
|
||||
SNAPSHOTS_DIR = Path(__file__).parent / "snapshots" / ARCH
|
||||
|
||||
|
||||
def assert_snapshot(
|
||||
name: str,
|
||||
content: str,
|
||||
metadata: dict,
|
||||
) -> None:
|
||||
"""Compare content against a saved snapshot, or create one if missing.
|
||||
|
||||
Args:
|
||||
name: Snapshot identifier (used as filename: snapshots/{arch}/{name}.json).
|
||||
content: The actual inference output to compare.
|
||||
metadata: Additional context stored alongside content (model, seed, etc.).
|
||||
Not used for comparison -- purely documentary.
|
||||
|
||||
Raises:
|
||||
AssertionError: If content doesn't match the saved snapshot.
|
||||
|
||||
Environment:
|
||||
UPDATE_SNAPSHOTS=1: Overwrite existing snapshot with actual content.
|
||||
"""
|
||||
snapshot_file = SNAPSHOTS_DIR / f"{name}.json"
|
||||
update = os.environ.get("UPDATE_SNAPSHOTS") == "1"
|
||||
|
||||
if update:
|
||||
# Explicitly regenerate snapshot
|
||||
SNAPSHOTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
snapshot_data = {**metadata, "arch": ARCH, "content": content}
|
||||
snapshot_file.write_text(json.dumps(snapshot_data, indent=2) + "\n")
|
||||
print(f" Updated snapshot: {ARCH}/{snapshot_file.name}")
|
||||
elif not snapshot_file.exists():
|
||||
raise AssertionError(
|
||||
f"No baseline snapshot for '{name}' on {ARCH}.\n"
|
||||
f"Expected file: {snapshot_file}\n\n"
|
||||
f"Generate baselines with: UPDATE_SNAPSHOTS=1 python3 e2e/run_all.py --slow"
|
||||
)
|
||||
else:
|
||||
snapshot = json.loads(snapshot_file.read_text())
|
||||
expected = snapshot["content"]
|
||||
if content != expected:
|
||||
diff = "\n".join(
|
||||
difflib.unified_diff(
|
||||
expected.splitlines(),
|
||||
content.splitlines(),
|
||||
fromfile=f"expected ({snapshot_file.relative_to(SNAPSHOTS_DIR.parent.parent)})",
|
||||
tofile="actual",
|
||||
lineterm="",
|
||||
)
|
||||
)
|
||||
raise AssertionError(
|
||||
f"Snapshot mismatch for '{name}' on {ARCH}!\n\n"
|
||||
f"{diff}\n\n"
|
||||
f"Expected: {expected!r}\n"
|
||||
f"Actual: {content!r}\n\n"
|
||||
f"To update: UPDATE_SNAPSHOTS=1 python3 e2e/run_all.py --slow"
|
||||
)
|
||||
print(f" Output matches snapshot ({ARCH}/{snapshot_file.name})")
|
||||
22
e2e/test_cluster_formation.py
Normal file
22
e2e/test_cluster_formation.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""Test: Basic cluster formation.
|
||||
|
||||
Verifies two nodes discover each other, elect a master, and the API responds.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent))
|
||||
from conftest import Cluster
|
||||
|
||||
|
||||
async def main():
|
||||
async with Cluster("cluster_formation") as cluster:
|
||||
await cluster.build()
|
||||
await cluster.start()
|
||||
await cluster.assert_healthy()
|
||||
print("PASSED: cluster_formation")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
62
e2e/test_inference_snapshot.py
Normal file
62
e2e/test_inference_snapshot.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""Test: Deterministic inference output (snapshot test).
|
||||
slow
|
||||
|
||||
Sends a chat completion request with a fixed seed,
|
||||
then verifies the output matches a known-good snapshot. This ensures
|
||||
inference produces consistent results across runs.
|
||||
|
||||
Uses MLX CPU backend in Docker on x86 Linux.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from snapshot import assert_snapshot
|
||||
|
||||
from conftest import Cluster
|
||||
|
||||
MODEL = "mlx-community/Qwen3-0.6B-4bit"
|
||||
SEED = 42
|
||||
PROMPT = "What is 2+2? Reply with just the number."
|
||||
MAX_TOKENS = 32
|
||||
|
||||
|
||||
async def main():
|
||||
async with Cluster("inference_snapshot") as cluster:
|
||||
await cluster.build()
|
||||
await cluster.start()
|
||||
await cluster.assert_healthy()
|
||||
|
||||
print(f" Launching model {MODEL}...")
|
||||
await cluster.place_model(MODEL)
|
||||
|
||||
print(f" Sending chat completion (seed={SEED})...")
|
||||
resp = await cluster.chat(
|
||||
model=MODEL,
|
||||
messages=[{"role": "user", "content": PROMPT}],
|
||||
seed=SEED,
|
||||
temperature=0,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
|
||||
content = resp["choices"][0]["message"]["content"]
|
||||
print(f" Response: {content!r}")
|
||||
|
||||
assert_snapshot(
|
||||
name="inference_snapshot",
|
||||
content=content,
|
||||
metadata={
|
||||
"model": MODEL,
|
||||
"seed": SEED,
|
||||
"prompt": PROMPT,
|
||||
"max_tokens": MAX_TOKENS,
|
||||
},
|
||||
)
|
||||
|
||||
print("PASSED: inference_snapshot")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
47
e2e/test_no_internet.py
Normal file
47
e2e/test_no_internet.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Test: Cluster works without internet access.
|
||||
|
||||
Verifies exo functions correctly when containers can talk to each other
|
||||
but cannot reach the internet. Uses iptables to block all outbound traffic
|
||||
except private subnets and multicast (for mDNS discovery).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent))
|
||||
from conftest import Cluster
|
||||
|
||||
|
||||
async def main():
|
||||
async with Cluster(
|
||||
"no_internet",
|
||||
overrides=["tests/no_internet/docker-compose.override.yml"],
|
||||
) as cluster:
|
||||
await cluster.build()
|
||||
await cluster.start()
|
||||
await cluster.assert_healthy()
|
||||
|
||||
# Verify internet is actually blocked from inside the containers
|
||||
for node in ["exo-node-1", "exo-node-2"]:
|
||||
rc, _ = await cluster.exec(
|
||||
node,
|
||||
"curl",
|
||||
"-sf",
|
||||
"--max-time",
|
||||
"3",
|
||||
"https://huggingface.co",
|
||||
check=False,
|
||||
)
|
||||
assert rc != 0, f"{node} should not be able to reach the internet"
|
||||
print(f" {node}: internet correctly blocked")
|
||||
|
||||
# Verify exo detected no internet connectivity
|
||||
log = await cluster.logs()
|
||||
assert "Internet connectivity: False" in log, "exo should detect no internet"
|
||||
print(" exo correctly detected no internet connectivity")
|
||||
|
||||
print("PASSED: no_internet")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
65
e2e/test_runner_chaos.py
Normal file
65
e2e/test_runner_chaos.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""Test: Runner chaos — abrupt runner death detection.
|
||||
slow
|
||||
|
||||
Sends a chat completion with the EXO_RUNNER_MUST_DIE trigger, which causes
|
||||
the runner process to call os._exit(1) (simulating an OOM kill). Verifies that
|
||||
the RunnerSupervisor health check detects the death and the system doesn't hang.
|
||||
|
||||
Requires a machine that can run MLX inference at reasonable speed (Apple Silicon).
|
||||
Run with: python3 e2e/run_all.py --slow or E2E_SLOW=1 python3 e2e/run_all.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from conftest import Cluster
|
||||
|
||||
MODEL = "mlx-community/Qwen3-0.6B-4bit"
|
||||
|
||||
|
||||
async def main():
|
||||
async with Cluster("runner_chaos") as cluster:
|
||||
await cluster.build()
|
||||
await cluster.start()
|
||||
await cluster.assert_healthy()
|
||||
|
||||
# Place the model so a runner is loaded and ready
|
||||
print(f" Launching model {MODEL}...")
|
||||
await cluster.place_model(MODEL)
|
||||
|
||||
# Send a chat request with the die trigger.
|
||||
# The runner will call os._exit(1) mid-inference, simulating OOM kill.
|
||||
# The chat request itself will fail — that's expected.
|
||||
print(" Sending EXO_RUNNER_MUST_DIE trigger...")
|
||||
with contextlib.suppress(Exception):
|
||||
await cluster.chat(
|
||||
model=MODEL,
|
||||
messages=[{"role": "user", "content": "EXO RUNNER MUST DIE"}],
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
# Wait for the health check to detect the death and emit RunnerFailed
|
||||
async def health_check_detected():
|
||||
log = await cluster.logs()
|
||||
return "runner process died unexpectedly" in log
|
||||
|
||||
await cluster.wait_for(
|
||||
"Health check detected runner death",
|
||||
health_check_detected,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
# Verify RunnerFailed was emitted (visible in logs)
|
||||
log = await cluster.logs()
|
||||
assert "runner process died unexpectedly" in log, (
|
||||
f"Expected health check to detect runner death but it didn't.\nLogs:\n{log}"
|
||||
)
|
||||
|
||||
print("PASSED: runner_chaos")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
60
e2e/test_snapshot_code_gen.py
Normal file
60
e2e/test_snapshot_code_gen.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""Test: Code generation snapshot.
|
||||
slow
|
||||
|
||||
Verifies deterministic output for a code generation prompt.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from snapshot import assert_snapshot
|
||||
|
||||
from conftest import Cluster
|
||||
|
||||
MODEL = "mlx-community/Qwen3-0.6B-4bit"
|
||||
SEED = 42
|
||||
PROMPT = (
|
||||
"Write a Python function to reverse a string. Only output the code, no explanation."
|
||||
)
|
||||
MAX_TOKENS = 64
|
||||
|
||||
|
||||
async def main():
|
||||
async with Cluster("snapshot_code_gen") as cluster:
|
||||
await cluster.build()
|
||||
await cluster.start()
|
||||
await cluster.assert_healthy()
|
||||
|
||||
print(f" Launching model {MODEL}...")
|
||||
await cluster.place_model(MODEL)
|
||||
|
||||
print(f" Sending chat completion (seed={SEED})...")
|
||||
resp = await cluster.chat(
|
||||
model=MODEL,
|
||||
messages=[{"role": "user", "content": PROMPT}],
|
||||
seed=SEED,
|
||||
temperature=0,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
|
||||
content = resp["choices"][0]["message"]["content"]
|
||||
print(f" Response: {content!r}")
|
||||
|
||||
assert_snapshot(
|
||||
name="snapshot_code_gen",
|
||||
content=content,
|
||||
metadata={
|
||||
"model": MODEL,
|
||||
"seed": SEED,
|
||||
"prompt": PROMPT,
|
||||
"max_tokens": MAX_TOKENS,
|
||||
},
|
||||
)
|
||||
|
||||
print("PASSED: snapshot_code_gen")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
65
e2e/test_snapshot_edge.py
Normal file
65
e2e/test_snapshot_edge.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""Test: Edge case snapshots.
|
||||
slow
|
||||
|
||||
Verifies deterministic output for edge-case prompts: single word input,
|
||||
special characters, and unicode.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from snapshot import assert_snapshot
|
||||
|
||||
from conftest import Cluster
|
||||
|
||||
MODEL = "mlx-community/Qwen3-0.6B-4bit"
|
||||
SEED = 42
|
||||
MAX_TOKENS = 32
|
||||
|
||||
CASES = [
|
||||
("edge_single_word", "Hi"),
|
||||
("edge_special_chars", "What does 2 * (3 + 4) / 7 - 1 equal? Use <math> tags."),
|
||||
("edge_unicode", "Translate 'hello' to Japanese, Chinese, and Korean."),
|
||||
]
|
||||
|
||||
|
||||
async def main():
|
||||
async with Cluster("snapshot_edge") as cluster:
|
||||
await cluster.build()
|
||||
await cluster.start()
|
||||
await cluster.assert_healthy()
|
||||
|
||||
print(f" Launching model {MODEL}...")
|
||||
await cluster.place_model(MODEL)
|
||||
|
||||
for snapshot_name, prompt in CASES:
|
||||
print(f" [{snapshot_name}] Sending: {prompt!r}")
|
||||
resp = await cluster.chat(
|
||||
model=MODEL,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
seed=SEED,
|
||||
temperature=0,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
|
||||
content = resp["choices"][0]["message"]["content"]
|
||||
print(f" [{snapshot_name}] Response: {content!r}")
|
||||
|
||||
assert_snapshot(
|
||||
name=snapshot_name,
|
||||
content=content,
|
||||
metadata={
|
||||
"model": MODEL,
|
||||
"seed": SEED,
|
||||
"prompt": prompt,
|
||||
"max_tokens": MAX_TOKENS,
|
||||
},
|
||||
)
|
||||
|
||||
print("PASSED: snapshot_edge")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
58
e2e/test_snapshot_long_output.py
Normal file
58
e2e/test_snapshot_long_output.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Test: Longer output snapshot.
|
||||
slow
|
||||
|
||||
Verifies deterministic output with a higher max_tokens (128).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from snapshot import assert_snapshot
|
||||
|
||||
from conftest import Cluster
|
||||
|
||||
MODEL = "mlx-community/Qwen3-0.6B-4bit"
|
||||
SEED = 42
|
||||
PROMPT = "Explain how a binary search algorithm works."
|
||||
MAX_TOKENS = 128
|
||||
|
||||
|
||||
async def main():
|
||||
async with Cluster("snapshot_long_output") as cluster:
|
||||
await cluster.build()
|
||||
await cluster.start()
|
||||
await cluster.assert_healthy()
|
||||
|
||||
print(f" Launching model {MODEL}...")
|
||||
await cluster.place_model(MODEL)
|
||||
|
||||
print(f" Sending chat completion (seed={SEED}, max_tokens={MAX_TOKENS})...")
|
||||
resp = await cluster.chat(
|
||||
model=MODEL,
|
||||
messages=[{"role": "user", "content": PROMPT}],
|
||||
seed=SEED,
|
||||
temperature=0,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
|
||||
content = resp["choices"][0]["message"]["content"]
|
||||
print(f" Response: {content!r}")
|
||||
|
||||
assert_snapshot(
|
||||
name="snapshot_long_output",
|
||||
content=content,
|
||||
metadata={
|
||||
"model": MODEL,
|
||||
"seed": SEED,
|
||||
"prompt": PROMPT,
|
||||
"max_tokens": MAX_TOKENS,
|
||||
},
|
||||
)
|
||||
|
||||
print("PASSED: snapshot_long_output")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
73
e2e/test_snapshot_multi_model.py
Normal file
73
e2e/test_snapshot_multi_model.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""Test: Multi-model snapshot tests.
|
||||
slow
|
||||
|
||||
Verifies deterministic output across different model architectures to catch
|
||||
model-specific regressions. Each model uses its own snapshot file.
|
||||
Run with: python3 e2e/run_all.py --slow or E2E_SLOW=1 python3 e2e/run_all.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from snapshot import assert_snapshot
|
||||
|
||||
from conftest import Cluster
|
||||
|
||||
SEED = 42
|
||||
PROMPT = "What is the capital of France?"
|
||||
MAX_TOKENS = 32
|
||||
|
||||
MODELS = [
|
||||
"mlx-community/SmolLM2-135M-Instruct",
|
||||
"mlx-community/Llama-3.2-1B-Instruct-4bit",
|
||||
"mlx-community/gemma-2-2b-it-4bit",
|
||||
]
|
||||
|
||||
|
||||
async def main():
|
||||
async with Cluster("snapshot_multi_model") as cluster:
|
||||
await cluster.build()
|
||||
await cluster.start()
|
||||
await cluster.assert_healthy()
|
||||
|
||||
for model in MODELS:
|
||||
short_name = (
|
||||
model.split("/")[-1].lower().replace("-", "_").replace(".", "_")
|
||||
)
|
||||
snapshot_name = f"snapshot_multi_{short_name}"
|
||||
|
||||
print(f" Launching model {model}...")
|
||||
await cluster.place_model(model)
|
||||
|
||||
print(f" Sending chat completion (seed={SEED})...")
|
||||
resp = await cluster.chat(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": PROMPT}],
|
||||
seed=SEED,
|
||||
temperature=0,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
|
||||
content = resp["choices"][0]["message"]["content"]
|
||||
print(f" [{short_name}] Response: {content!r}")
|
||||
|
||||
assert_snapshot(
|
||||
name=snapshot_name,
|
||||
content=content,
|
||||
metadata={
|
||||
"model": model,
|
||||
"seed": SEED,
|
||||
"prompt": PROMPT,
|
||||
"max_tokens": MAX_TOKENS,
|
||||
},
|
||||
)
|
||||
|
||||
print(f" [{short_name}] PASSED")
|
||||
|
||||
print("PASSED: snapshot_multi_model")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
58
e2e/test_snapshot_reasoning.py
Normal file
58
e2e/test_snapshot_reasoning.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Test: Reasoning/math snapshot.
|
||||
slow
|
||||
|
||||
Verifies deterministic output for a simple reasoning prompt.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from snapshot import assert_snapshot
|
||||
|
||||
from conftest import Cluster
|
||||
|
||||
MODEL = "mlx-community/Qwen3-0.6B-4bit"
|
||||
SEED = 42
|
||||
PROMPT = "If I have 3 apples and give away 1, how many do I have? Think step by step."
|
||||
MAX_TOKENS = 64
|
||||
|
||||
|
||||
async def main():
|
||||
async with Cluster("snapshot_reasoning") as cluster:
|
||||
await cluster.build()
|
||||
await cluster.start()
|
||||
await cluster.assert_healthy()
|
||||
|
||||
print(f" Launching model {MODEL}...")
|
||||
await cluster.place_model(MODEL)
|
||||
|
||||
print(f" Sending chat completion (seed={SEED})...")
|
||||
resp = await cluster.chat(
|
||||
model=MODEL,
|
||||
messages=[{"role": "user", "content": PROMPT}],
|
||||
seed=SEED,
|
||||
temperature=0,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
|
||||
content = resp["choices"][0]["message"]["content"]
|
||||
print(f" Response: {content!r}")
|
||||
|
||||
assert_snapshot(
|
||||
name="snapshot_reasoning",
|
||||
content=content,
|
||||
metadata={
|
||||
"model": MODEL,
|
||||
"seed": SEED,
|
||||
"prompt": PROMPT,
|
||||
"max_tokens": MAX_TOKENS,
|
||||
},
|
||||
)
|
||||
|
||||
print("PASSED: snapshot_reasoning")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
32
e2e/tests/no_internet/docker-compose.override.yml
Normal file
32
e2e/tests/no_internet/docker-compose.override.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
# Block all outbound internet traffic using iptables while preserving:
|
||||
# - Multicast (224.0.0.0/4) for mDNS peer discovery
|
||||
# - Private subnets (10/8, 172.16/12, 192.168/16) for inter-container communication
|
||||
# - Loopback (127/8)
|
||||
# Requires NET_ADMIN capability for iptables.
|
||||
services:
|
||||
exo-node-1:
|
||||
cap_add:
|
||||
- NET_ADMIN
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
iptables -A OUTPUT -d 127.0.0.0/8 -j ACCEPT
|
||||
iptables -A OUTPUT -d 10.0.0.0/8 -j ACCEPT
|
||||
iptables -A OUTPUT -d 172.16.0.0/12 -j ACCEPT
|
||||
iptables -A OUTPUT -d 192.168.0.0/16 -j ACCEPT
|
||||
iptables -A OUTPUT -d 224.0.0.0/4 -j ACCEPT
|
||||
iptables -A OUTPUT -j REJECT
|
||||
exec .venv/bin/exo -v
|
||||
exo-node-2:
|
||||
cap_add:
|
||||
- NET_ADMIN
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
iptables -A OUTPUT -d 127.0.0.0/8 -j ACCEPT
|
||||
iptables -A OUTPUT -d 10.0.0.0/8 -j ACCEPT
|
||||
iptables -A OUTPUT -d 172.16.0.0/12 -j ACCEPT
|
||||
iptables -A OUTPUT -d 192.168.0.0/16 -j ACCEPT
|
||||
iptables -A OUTPUT -d 224.0.0.0/4 -j ACCEPT
|
||||
iptables -A OUTPUT -j REJECT
|
||||
exec .venv/bin/exo -v
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 15 KiB |
@@ -1,112 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# create-dmg.sh — Build a polished macOS DMG installer for EXO
|
||||
#
|
||||
# Usage:
|
||||
# ./packaging/dmg/create-dmg.sh <app-path> <output-dmg> [volume-name]
|
||||
#
|
||||
# Example:
|
||||
# ./packaging/dmg/create-dmg.sh output/EXO.app EXO-1.0.0.dmg "EXO"
|
||||
#
|
||||
# Creates a DMG with:
|
||||
# - Custom background image with drag-to-Applications arrow
|
||||
# - App icon on left, Applications alias on right
|
||||
# - Proper window size and icon positioning
|
||||
set -euo pipefail
|
||||
|
||||
APP_PATH="${1:?Usage: create-dmg.sh <app-path> <output-dmg> [volume-name]}"
|
||||
OUTPUT_DMG="${2:?Usage: create-dmg.sh <app-path> <output-dmg> [volume-name]}"
|
||||
VOLUME_NAME="${3:-EXO}"
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
BACKGROUND_SCRIPT="${SCRIPT_DIR}/generate-background.py"
|
||||
TEMP_DIR="$(mktemp -d)"
|
||||
DMG_STAGING="${TEMP_DIR}/dmg-root"
|
||||
TEMP_DMG="${TEMP_DIR}/temp.dmg"
|
||||
BACKGROUND_PNG="${TEMP_DIR}/background.png"
|
||||
|
||||
cleanup() { rm -rf "$TEMP_DIR"; }
|
||||
trap cleanup EXIT
|
||||
|
||||
echo "==> Creating DMG installer for ${VOLUME_NAME}"
|
||||
|
||||
# ── Step 1: Generate background image ────────────────────────────────────────
|
||||
if command -v python3 &>/dev/null; then
|
||||
python3 "$BACKGROUND_SCRIPT" "$BACKGROUND_PNG"
|
||||
echo " Background image generated"
|
||||
else
|
||||
echo " Warning: python3 not found, skipping custom background"
|
||||
BACKGROUND_PNG=""
|
||||
fi
|
||||
|
||||
# ── Step 2: Prepare staging directory ─────────────────────────────────────────
|
||||
mkdir -p "$DMG_STAGING"
|
||||
cp -R "$APP_PATH" "$DMG_STAGING/"
|
||||
ln -s /Applications "$DMG_STAGING/Applications"
|
||||
|
||||
# ── Step 3: Create writable DMG ──────────────────────────────────────────────
|
||||
# Calculate required size (app size + 20MB headroom)
|
||||
APP_SIZE_KB=$(du -sk "$APP_PATH" | cut -f1)
|
||||
DMG_SIZE_KB=$((APP_SIZE_KB + 20480))
|
||||
|
||||
hdiutil create \
|
||||
-volname "$VOLUME_NAME" \
|
||||
-size "${DMG_SIZE_KB}k" \
|
||||
-fs HFS+ \
|
||||
-layout SPUD \
|
||||
"$TEMP_DMG"
|
||||
|
||||
# ── Step 4: Mount and configure ──────────────────────────────────────────────
|
||||
MOUNT_DIR=$(hdiutil attach "$TEMP_DMG" -readwrite -noverify | awk -F'\t' '/Apple_HFS/ {gsub(/^[[:space:]]+|[[:space:]]+$/, "", $NF); print $NF}')
|
||||
echo " Mounted at: $MOUNT_DIR"
|
||||
|
||||
# Copy contents
|
||||
cp -R "$DMG_STAGING/"* "$MOUNT_DIR/"
|
||||
|
||||
# Add background image
|
||||
if [[ -n $BACKGROUND_PNG && -f $BACKGROUND_PNG ]]; then
|
||||
mkdir -p "$MOUNT_DIR/.background"
|
||||
cp "$BACKGROUND_PNG" "$MOUNT_DIR/.background/background.png"
|
||||
fi
|
||||
|
||||
# ── Step 5: Configure window appearance via AppleScript ──────────────────────
|
||||
# Window: 800×400, app icon on left, Applications on right (matches Ollama layout)
|
||||
# Background image is 1600×740 (2× retina). Text size 12.
|
||||
APP_NAME="$(basename "$APP_PATH")"
|
||||
|
||||
osascript <<APPLESCRIPT
|
||||
tell application "Finder"
|
||||
tell disk "$VOLUME_NAME"
|
||||
open
|
||||
set current view of container window to icon view
|
||||
set toolbar visible of container window to false
|
||||
set statusbar visible of container window to false
|
||||
set bounds of container window to {200, 120, 1000, 520}
|
||||
set opts to icon view options of container window
|
||||
set icon size of opts to 128
|
||||
set text size of opts to 12
|
||||
set arrangement of opts to not arranged
|
||||
if exists file ".background:background.png" then
|
||||
set background picture of opts to file ".background:background.png"
|
||||
end if
|
||||
set position of item "$APP_NAME" of container window to {200, 190}
|
||||
set position of item "Applications" of container window to {600, 190}
|
||||
close
|
||||
open
|
||||
update without registering applications
|
||||
delay 1
|
||||
close
|
||||
end tell
|
||||
end tell
|
||||
APPLESCRIPT
|
||||
|
||||
echo " Window layout configured"
|
||||
|
||||
# Ensure Finder updates are flushed
|
||||
sync
|
||||
|
||||
# ── Step 6: Finalise ─────────────────────────────────────────────────────────
|
||||
hdiutil detach "$MOUNT_DIR" -quiet
|
||||
hdiutil convert "$TEMP_DMG" -format UDZO -imagekey zlib-level=9 -o "$OUTPUT_DMG"
|
||||
|
||||
echo "==> DMG created: $OUTPUT_DMG"
|
||||
echo " Size: $(du -h "$OUTPUT_DMG" | cut -f1)"
|
||||
@@ -1,34 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Copy the static DMG background image to the specified output path.
|
||||
|
||||
The background is a 1600×740 retina PNG (2× for 800×400 logical window) with a
|
||||
hand-drawn arrow and yellow bookmark accents on a white background. Based on
|
||||
Ollama's DMG background (MIT-licensed).
|
||||
|
||||
Usage:
|
||||
python3 generate-background.py output.png
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
BACKGROUND_PNG = Path(__file__).parent / "background.png"
|
||||
|
||||
|
||||
def generate_background(output_path: str) -> None:
|
||||
"""Copy the static background image to the output path."""
|
||||
if not BACKGROUND_PNG.exists():
|
||||
print(f"Error: {BACKGROUND_PNG} not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
shutil.copy2(BACKGROUND_PNG, output_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print(f"Usage: {sys.argv[0]} <output.png>", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
generate_background(sys.argv[1])
|
||||
print(f"Background image written to {sys.argv[1]}")
|
||||
@@ -0,0 +1,12 @@
|
||||
model_id = "mlx-community/SmolLM2-135M-Instruct"
|
||||
n_layers = 30
|
||||
hidden_size = 576
|
||||
supports_tensor = true
|
||||
tasks = ["TextGeneration"]
|
||||
family = "llama"
|
||||
quantization = "bf16"
|
||||
base_model = "SmolLM2 135M"
|
||||
capabilities = ["text"]
|
||||
|
||||
[storage_size]
|
||||
in_bytes = 269060381
|
||||
@@ -0,0 +1,12 @@
|
||||
model_id = "mlx-community/gemma-2-2b-it-4bit"
|
||||
n_layers = 26
|
||||
hidden_size = 2304
|
||||
supports_tensor = false
|
||||
tasks = ["TextGeneration"]
|
||||
family = "gemma2"
|
||||
quantization = "4bit"
|
||||
base_model = "Gemma 2 2B"
|
||||
capabilities = ["text"]
|
||||
|
||||
[storage_size]
|
||||
in_bytes = 1492755242
|
||||
@@ -1,27 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
import webbrowser
|
||||
|
||||
from exo.shared.constants import EXO_CONFIG_HOME
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_FIRST_RUN_MARKER = EXO_CONFIG_HOME / ".dashboard_opened"
|
||||
|
||||
|
||||
def _is_first_run() -> bool:
|
||||
return not _FIRST_RUN_MARKER.exists()
|
||||
|
||||
|
||||
def _mark_first_run_done() -> None:
|
||||
_FIRST_RUN_MARKER.parent.mkdir(parents=True, exist_ok=True)
|
||||
_FIRST_RUN_MARKER.touch()
|
||||
|
||||
|
||||
def print_startup_banner(port: int) -> None:
|
||||
"""Print a prominent startup banner with API endpoint information."""
|
||||
dashboard_url = f"http://localhost:{port}"
|
||||
first_run = _is_first_run()
|
||||
banner = f"""
|
||||
╔═══════════════════════════════════════════════════════════════════════╗
|
||||
║ ║
|
||||
@@ -49,14 +28,3 @@ def print_startup_banner(port: int) -> None:
|
||||
"""
|
||||
|
||||
print(banner)
|
||||
|
||||
if first_run:
|
||||
# Skip browser open when running inside the native macOS app —
|
||||
# FirstLaunchPopout.swift handles the auto-open with a countdown.
|
||||
if not os.environ.get("EXO_RUNTIME_DIR"):
|
||||
try:
|
||||
webbrowser.open(dashboard_url)
|
||||
logger.info("First run detected — opening dashboard in browser")
|
||||
except Exception:
|
||||
logger.debug("Could not auto-open browser", exc_info=True)
|
||||
_mark_first_run_done()
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import threading
|
||||
from multiprocessing.sharedctypes import Synchronized
|
||||
|
||||
import loguru
|
||||
|
||||
@@ -10,12 +14,22 @@ from exo.utils.channels import ClosedResourceError, MpReceiver, MpSender
|
||||
|
||||
logger: "loguru.Logger" = loguru.logger
|
||||
|
||||
HEARTBEAT_INTERVAL_SECONDS = 0.5
|
||||
|
||||
|
||||
def _heartbeat_loop(heartbeat: Synchronized[int], stop: threading.Event) -> None:
|
||||
"""Daemon thread that periodically increments the heartbeat counter."""
|
||||
while not stop.is_set():
|
||||
heartbeat.value += 1
|
||||
stop.wait(HEARTBEAT_INTERVAL_SECONDS)
|
||||
|
||||
|
||||
def entrypoint(
|
||||
bound_instance: BoundInstance,
|
||||
event_sender: MpSender[Event],
|
||||
task_receiver: MpReceiver[Task],
|
||||
_logger: "loguru.Logger",
|
||||
heartbeat: Synchronized[int] | None = None,
|
||||
) -> None:
|
||||
fast_synch_override = os.environ.get("EXO_FAST_SYNCH")
|
||||
if fast_synch_override == "on" or (
|
||||
@@ -34,6 +48,17 @@ def entrypoint(
|
||||
|
||||
logger.info(f"Fast synch flag: {os.environ['MLX_METAL_FAST_SYNCH']}")
|
||||
|
||||
# Start heartbeat thread so the supervisor can detect if we freeze.
|
||||
stop_heartbeat = threading.Event()
|
||||
heartbeat_thread: threading.Thread | None = None
|
||||
if heartbeat is not None:
|
||||
heartbeat_thread = threading.Thread(
|
||||
target=_heartbeat_loop,
|
||||
args=(heartbeat, stop_heartbeat),
|
||||
daemon=True,
|
||||
)
|
||||
heartbeat_thread.start()
|
||||
|
||||
# Import main after setting global logger - this lets us just import logger from this module
|
||||
try:
|
||||
from exo.worker.runner.runner import main
|
||||
@@ -52,6 +77,9 @@ def entrypoint(
|
||||
)
|
||||
)
|
||||
finally:
|
||||
stop_heartbeat.set()
|
||||
if heartbeat_thread is not None:
|
||||
heartbeat_thread.join(timeout=1)
|
||||
try:
|
||||
event_sender.close()
|
||||
task_receiver.close()
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import resource
|
||||
import time
|
||||
from collections.abc import Generator
|
||||
@@ -953,6 +954,7 @@ def _validate_single_tool(obj: dict[str, Any]) -> ToolCallItem:
|
||||
EXO_RUNNER_MUST_FAIL = "EXO RUNNER MUST FAIL"
|
||||
EXO_RUNNER_MUST_OOM = "EXO RUNNER MUST OOM"
|
||||
EXO_RUNNER_MUST_TIMEOUT = "EXO RUNNER MUST TIMEOUT"
|
||||
EXO_RUNNER_MUST_DIE = "EXO RUNNER MUST DIE"
|
||||
|
||||
|
||||
def _check_for_debug_prompts(task_params: TextGenerationTaskParams) -> None:
|
||||
@@ -968,6 +970,9 @@ def _check_for_debug_prompts(task_params: TextGenerationTaskParams) -> None:
|
||||
if not prompt:
|
||||
return
|
||||
|
||||
if EXO_RUNNER_MUST_DIE in prompt:
|
||||
logger.info("Abrupt process death triggered (simulates OOM kill)")
|
||||
os._exit(1)
|
||||
if EXO_RUNNER_MUST_FAIL in prompt:
|
||||
logger.info("raising exception")
|
||||
raise Exception("Artificial runner exception - for testing purposes only.")
|
||||
|
||||
@@ -1,12 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import multiprocessing
|
||||
import signal
|
||||
from dataclasses import dataclass, field
|
||||
from multiprocessing import Process
|
||||
from multiprocessing.sharedctypes import Synchronized
|
||||
from typing import Self
|
||||
|
||||
import anyio
|
||||
from anyio import (
|
||||
BrokenResourceError,
|
||||
CancelScope,
|
||||
ClosedResourceError,
|
||||
to_thread,
|
||||
)
|
||||
@@ -26,6 +31,7 @@ from exo.shared.types.worker.runners import (
|
||||
RunnerIdle,
|
||||
RunnerLoading,
|
||||
RunnerRunning,
|
||||
RunnerShutdown,
|
||||
RunnerShuttingDown,
|
||||
RunnerStatus,
|
||||
RunnerWarmingUp,
|
||||
@@ -36,6 +42,8 @@ from exo.worker.runner.bootstrap import entrypoint
|
||||
|
||||
PREFILL_TIMEOUT_SECONDS = 60
|
||||
DECODE_TIMEOUT_SECONDS = 5
|
||||
HEALTH_CHECK_INTERVAL_SECONDS = 1
|
||||
HEARTBEAT_STALE_CHECKS = 10
|
||||
|
||||
|
||||
@dataclass(eq=False)
|
||||
@@ -47,9 +55,13 @@ class RunnerSupervisor:
|
||||
_ev_recv: MpReceiver[Event]
|
||||
_task_sender: MpSender[Task]
|
||||
_event_sender: Sender[Event]
|
||||
_heartbeat: Synchronized[int]
|
||||
status: RunnerStatus = field(default_factory=RunnerIdle, init=False)
|
||||
pending: dict[TaskId, anyio.Event] = field(default_factory=dict, init=False)
|
||||
completed: set[TaskId] = field(default_factory=set, init=False)
|
||||
_death_handled: bool = field(default=False, init=False)
|
||||
_last_heartbeat_value: int = field(default=0, init=False)
|
||||
_heartbeat_stale_count: int = field(default=0, init=False)
|
||||
|
||||
@classmethod
|
||||
def create(
|
||||
@@ -63,6 +75,8 @@ class RunnerSupervisor:
|
||||
# A task is kind of a runner command
|
||||
task_sender, task_recv = mp_channel[Task]()
|
||||
|
||||
heartbeat: Synchronized[int] = multiprocessing.Value("Q", 0)
|
||||
|
||||
runner_process = Process(
|
||||
target=entrypoint,
|
||||
args=(
|
||||
@@ -70,6 +84,7 @@ class RunnerSupervisor:
|
||||
ev_send,
|
||||
task_recv,
|
||||
logger,
|
||||
heartbeat,
|
||||
),
|
||||
daemon=True,
|
||||
)
|
||||
@@ -84,13 +99,16 @@ class RunnerSupervisor:
|
||||
_ev_recv=ev_recv,
|
||||
_task_sender=task_sender,
|
||||
_event_sender=event_sender,
|
||||
_heartbeat=heartbeat,
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
async def run(self):
|
||||
self.runner_process.start()
|
||||
await self._forward_events()
|
||||
async with anyio.create_task_group() as tg:
|
||||
tg.start_soon(self._forward_events)
|
||||
tg.start_soon(self._health_check, tg.cancel_scope)
|
||||
|
||||
def shutdown(self):
|
||||
logger.info("Runner supervisor shutting down")
|
||||
@@ -168,9 +186,99 @@ class RunnerSupervisor:
|
||||
self.completed.add(event.task_id)
|
||||
await self._event_sender.send(event)
|
||||
except (ClosedResourceError, BrokenResourceError) as e:
|
||||
await self._check_runner(e)
|
||||
for tid in self.pending:
|
||||
self.pending[tid].set()
|
||||
if not self._death_handled:
|
||||
self._death_handled = True
|
||||
await self._check_runner(e)
|
||||
for tid in self.pending:
|
||||
self.pending[tid].set()
|
||||
|
||||
async def _health_check(self, cancel_scope: CancelScope) -> None:
|
||||
"""Periodically check if the runner process is alive and responsive.
|
||||
|
||||
Detects two failure modes:
|
||||
1. Process death (e.g. OOM kill) without cleanly closing the event
|
||||
channel, which would leave _forward_events blocked on queue.get().
|
||||
2. Unresponsive process (e.g. frozen by OS memory pressure, deadlock)
|
||||
detected via a stale heartbeat counter.
|
||||
"""
|
||||
while True:
|
||||
await anyio.sleep(HEALTH_CHECK_INTERVAL_SECONDS)
|
||||
|
||||
if not self.runner_process.is_alive():
|
||||
self._handle_process_exit(cancel_scope)
|
||||
return
|
||||
|
||||
# Check heartbeat counter — if it hasn't changed between
|
||||
# consecutive checks, the subprocess may be frozen.
|
||||
current = self._heartbeat.value
|
||||
if current > 0:
|
||||
if current == self._last_heartbeat_value:
|
||||
self._heartbeat_stale_count += 1
|
||||
if self._heartbeat_stale_count >= HEARTBEAT_STALE_CHECKS:
|
||||
logger.error(
|
||||
f"Health check: runner process unresponsive "
|
||||
f"(heartbeat stale for {self._heartbeat_stale_count} checks), killing"
|
||||
)
|
||||
self._handle_unresponsive(cancel_scope)
|
||||
return
|
||||
else:
|
||||
self._heartbeat_stale_count = 0
|
||||
self._last_heartbeat_value = current
|
||||
|
||||
def _handle_process_exit(self, cancel_scope: CancelScope) -> None:
|
||||
"""Handle runner process that has exited."""
|
||||
if not self._death_handled:
|
||||
self._death_handled = True
|
||||
if isinstance(
|
||||
self.status, (RunnerShutdown, RunnerShuttingDown, RunnerFailed)
|
||||
):
|
||||
logger.info("Health check: runner process exited (expected)")
|
||||
else:
|
||||
rc = self.runner_process.exitcode
|
||||
if isinstance(rc, int) and rc < 0:
|
||||
sig = -rc
|
||||
try:
|
||||
cause = f"signal={sig} ({signal.strsignal(sig)})"
|
||||
except Exception:
|
||||
cause = f"signal={sig}"
|
||||
else:
|
||||
cause = f"exitcode={rc}"
|
||||
|
||||
logger.error(
|
||||
f"Health check: runner process died unexpectedly ({cause})"
|
||||
)
|
||||
self._event_sender.send_nowait(
|
||||
RunnerStatusUpdated(
|
||||
runner_id=self.bound_instance.bound_runner_id,
|
||||
runner_status=RunnerFailed(
|
||||
error_message=f"Terminated ({cause})"
|
||||
),
|
||||
)
|
||||
)
|
||||
self.shutdown()
|
||||
|
||||
for tid in self.pending:
|
||||
self.pending[tid].set()
|
||||
|
||||
cancel_scope.cancel()
|
||||
|
||||
def _handle_unresponsive(self, cancel_scope: CancelScope) -> None:
|
||||
"""Handle runner process that is alive but unresponsive."""
|
||||
if not self._death_handled:
|
||||
self._death_handled = True
|
||||
self._event_sender.send_nowait(
|
||||
RunnerStatusUpdated(
|
||||
runner_id=self.bound_instance.bound_runner_id,
|
||||
runner_status=RunnerFailed(
|
||||
error_message="Runner process unresponsive (heartbeat timeout)"
|
||||
),
|
||||
)
|
||||
)
|
||||
for tid in self.pending:
|
||||
self.pending[tid].set()
|
||||
self.shutdown()
|
||||
|
||||
cancel_scope.cancel()
|
||||
|
||||
def __del__(self) -> None:
|
||||
if self.runner_process.is_alive():
|
||||
|
||||
@@ -1 +1,202 @@
|
||||
# TODO:
|
||||
from __future__ import annotations
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
import signal as signal_module
|
||||
from collections.abc import Callable
|
||||
from multiprocessing.sharedctypes import Synchronized
|
||||
from typing import Any
|
||||
|
||||
import anyio
|
||||
|
||||
from exo.shared.types.events import Event, RunnerStatusUpdated
|
||||
from exo.shared.types.tasks import Task, TaskId
|
||||
from exo.shared.types.worker.runners import RunnerFailed, RunnerIdle, RunnerShutdown
|
||||
from exo.utils.channels import Receiver, Sender, channel, mp_channel
|
||||
from exo.worker.runner.runner_supervisor import (
|
||||
HEALTH_CHECK_INTERVAL_SECONDS,
|
||||
HEARTBEAT_STALE_CHECKS,
|
||||
RunnerSupervisor,
|
||||
)
|
||||
|
||||
from ...constants import (
|
||||
INSTANCE_1_ID,
|
||||
MODEL_A_ID,
|
||||
NODE_A,
|
||||
RUNNER_1_ID,
|
||||
)
|
||||
from ..conftest import get_bound_mlx_ring_instance
|
||||
|
||||
|
||||
def _die_immediately() -> None:
|
||||
"""Subprocess target that exits with a non-zero code."""
|
||||
os._exit(1)
|
||||
|
||||
|
||||
def _die_with_signal() -> None:
|
||||
"""Subprocess target that kills itself with SIGKILL (simulates OOM)."""
|
||||
os.kill(os.getpid(), signal_module.SIGKILL)
|
||||
|
||||
|
||||
def _exit_cleanly() -> None:
|
||||
"""Subprocess target that exits with code 0."""
|
||||
os._exit(0)
|
||||
|
||||
|
||||
def _hang_forever() -> None:
|
||||
"""Subprocess target that hangs without updating heartbeat (simulates freeze)."""
|
||||
import time
|
||||
|
||||
# Write one heartbeat so the supervisor starts tracking, then stop.
|
||||
time.sleep(100000)
|
||||
|
||||
|
||||
def _build_supervisor(
|
||||
event_sender: Sender[Event],
|
||||
target: Callable[..., Any],
|
||||
) -> RunnerSupervisor:
|
||||
"""Build a RunnerSupervisor with a custom subprocess target.
|
||||
|
||||
Uses a clone of event_sender (matching real Worker behavior) so that
|
||||
closing the supervisor's copy doesn't close the test's receiver.
|
||||
"""
|
||||
bound_instance = get_bound_mlx_ring_instance(
|
||||
instance_id=INSTANCE_1_ID,
|
||||
model_id=MODEL_A_ID,
|
||||
runner_id=RUNNER_1_ID,
|
||||
node_id=NODE_A,
|
||||
)
|
||||
|
||||
_ev_send, ev_recv = mp_channel[Event]()
|
||||
task_sender, _task_recv = mp_channel[Task]()
|
||||
runner_process = multiprocessing.Process(target=target, daemon=True)
|
||||
heartbeat: Synchronized[int] = multiprocessing.Value("Q", 0)
|
||||
|
||||
return RunnerSupervisor(
|
||||
bound_instance=bound_instance,
|
||||
shard_metadata=bound_instance.bound_shard,
|
||||
runner_process=runner_process,
|
||||
initialize_timeout=10,
|
||||
_ev_recv=ev_recv,
|
||||
_task_sender=task_sender,
|
||||
_event_sender=event_sender.clone(),
|
||||
_heartbeat=heartbeat,
|
||||
)
|
||||
|
||||
|
||||
def _collect_failed_events(
|
||||
event_receiver: Receiver[Event],
|
||||
) -> list[RunnerFailed]:
|
||||
"""Drain the receiver and return all RunnerFailed statuses."""
|
||||
out: list[RunnerFailed] = []
|
||||
while True:
|
||||
try:
|
||||
event = event_receiver.receive_nowait()
|
||||
except Exception:
|
||||
break
|
||||
if isinstance(event, RunnerStatusUpdated) and isinstance(
|
||||
event.runner_status, RunnerFailed
|
||||
):
|
||||
out.append(event.runner_status)
|
||||
return out
|
||||
|
||||
|
||||
async def test_health_check_detects_dead_process():
|
||||
"""When the runner process dies with a non-zero exit code, the health check
|
||||
should emit a RunnerFailed event and run() should return."""
|
||||
event_sender, event_receiver = channel[Event]()
|
||||
supervisor = _build_supervisor(event_sender, _die_immediately)
|
||||
|
||||
with anyio.fail_after(HEALTH_CHECK_INTERVAL_SECONDS + 5):
|
||||
await supervisor.run()
|
||||
|
||||
failures = _collect_failed_events(event_receiver)
|
||||
assert len(failures) == 1
|
||||
assert failures[0].error_message is not None
|
||||
assert "exitcode=1" in failures[0].error_message
|
||||
|
||||
|
||||
async def test_health_check_detects_signal_death():
|
||||
"""When the runner process is killed by a signal (e.g. OOM -> SIGKILL),
|
||||
the health check should report the signal in the failure message."""
|
||||
event_sender, event_receiver = channel[Event]()
|
||||
supervisor = _build_supervisor(event_sender, _die_with_signal)
|
||||
|
||||
with anyio.fail_after(HEALTH_CHECK_INTERVAL_SECONDS + 5):
|
||||
await supervisor.run()
|
||||
|
||||
failures = _collect_failed_events(event_receiver)
|
||||
assert len(failures) == 1
|
||||
assert failures[0].error_message is not None
|
||||
assert "signal=9" in failures[0].error_message
|
||||
|
||||
|
||||
async def test_health_check_releases_pending_tasks():
|
||||
"""When the runner dies, any pending start_task() waiters should be unblocked."""
|
||||
event_sender, _event_receiver = channel[Event]()
|
||||
supervisor = _build_supervisor(event_sender, _die_immediately)
|
||||
|
||||
# Register a pending waiter as if start_task() was waiting for acknowledgement
|
||||
task_event = anyio.Event()
|
||||
tid = TaskId("pending-task")
|
||||
supervisor.pending[tid] = task_event
|
||||
|
||||
with anyio.fail_after(HEALTH_CHECK_INTERVAL_SECONDS + 5):
|
||||
await supervisor.run()
|
||||
|
||||
assert task_event.is_set()
|
||||
|
||||
|
||||
async def test_clean_exit_no_failure_when_shutdown_status():
|
||||
"""When the runner was in RunnerShutdown status and exits with code 0,
|
||||
no RunnerFailed event should be emitted."""
|
||||
event_sender, event_receiver = channel[Event]()
|
||||
supervisor = _build_supervisor(event_sender, _exit_cleanly)
|
||||
|
||||
# Simulate that the runner had already reported shutdown via events
|
||||
supervisor.status = RunnerShutdown()
|
||||
|
||||
with anyio.fail_after(HEALTH_CHECK_INTERVAL_SECONDS + 5):
|
||||
await supervisor.run()
|
||||
|
||||
failures = _collect_failed_events(event_receiver)
|
||||
assert len(failures) == 0
|
||||
|
||||
|
||||
async def test_unexpected_exit_code_zero_emits_failure():
|
||||
"""When the runner exits with code 0 but was NOT in a shutdown state,
|
||||
this is unexpected and should still emit RunnerFailed."""
|
||||
event_sender, event_receiver = channel[Event]()
|
||||
supervisor = _build_supervisor(event_sender, _exit_cleanly)
|
||||
|
||||
assert isinstance(supervisor.status, RunnerIdle)
|
||||
|
||||
with anyio.fail_after(HEALTH_CHECK_INTERVAL_SECONDS + 5):
|
||||
await supervisor.run()
|
||||
|
||||
failures = _collect_failed_events(event_receiver)
|
||||
assert len(failures) == 1
|
||||
assert failures[0].error_message is not None
|
||||
assert "exitcode=0" in failures[0].error_message
|
||||
|
||||
|
||||
async def test_heartbeat_timeout_detects_unresponsive_process():
|
||||
"""When the runner process is alive but its heartbeat goes stale,
|
||||
the health check should kill it and emit RunnerFailed."""
|
||||
event_sender, event_receiver = channel[Event]()
|
||||
supervisor = _build_supervisor(event_sender, _hang_forever)
|
||||
|
||||
# Pre-seed the heartbeat counter with a non-zero value and set the
|
||||
# supervisor's last-seen value to match so it appears stale immediately.
|
||||
# Set stale count to HEARTBEAT_STALE_CHECKS - 1 so a single check triggers.
|
||||
supervisor._heartbeat.value = 42 # pyright: ignore[reportPrivateUsage]
|
||||
supervisor._last_heartbeat_value = 42 # pyright: ignore[reportPrivateUsage]
|
||||
supervisor._heartbeat_stale_count = HEARTBEAT_STALE_CHECKS - 1 # pyright: ignore[reportPrivateUsage]
|
||||
|
||||
with anyio.fail_after(HEALTH_CHECK_INTERVAL_SECONDS + 5):
|
||||
await supervisor.run()
|
||||
|
||||
failures = _collect_failed_events(event_receiver)
|
||||
assert len(failures) == 1
|
||||
assert failures[0].error_message is not None
|
||||
assert "unresponsive" in failures[0].error_message.lower()
|
||||
|
||||
Reference in New Issue
Block a user