Add MLX compute device backend selection (cpu/gpu/auto)

Add MlxDevice enum (Auto/Cpu/Gpu) that flows from the API through PlaceInstance commands to instance creation and runner initialization. The device is set via mx.set_default_device() in bootstrap.py before the runner imports MLX, ensuring all subsequent operations use the chosen device. This enables Linux CPU-only machines to participate in inference and allows forcing CPU mode for testing or when GPU has issues. Default is Auto (current behavior) for full backward compatibility. Closes #958 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Revert "Use custom fork that resolves GPU locks" (#1502 )
2026-02-18 14:55:13 -05:00 · 2026-02-17 10:52:11 -08:00 · 2026-02-17 18:18:54 +00:00 · 2026-02-17 18:11:47 +00:00 · 2026-02-17 18:02:32 +00:00 · 2026-02-17 17:52:49 +00:00
7 changed files with 55 additions and 23 deletions
--- a/src/exo/master/adapters/responses.py
+++ b/src/exo/master/adapters/responses.py
@@ -26,7 +26,6 @@ from exo.shared.types.openai_responses import (
    ResponseOutputText,
    ResponsesRequest,
    ResponsesResponse,
-    ResponsesStreamEvent,
    ResponseTextDeltaEvent,
    ResponseTextDoneEvent,
    ResponseUsage,
@@ -34,11 +33,6 @@ from exo.shared.types.openai_responses import (
 from exo.shared.types.text_generation import InputMessage, TextGenerationTaskParams


-def _format_sse(event: ResponsesStreamEvent) -> str:
-    """Format a streaming event as an SSE message."""
-    return f"event: {event.type}\ndata: {event.model_dump_json()}\n\n"
-
-
 def _extract_content(content: str | list[ResponseContentPart]) -> str:
    """Extract plain text from a content field that may be a string or list of parts."""
    if isinstance(content, str):
@@ -213,13 +207,13 @@ async def generate_responses_stream(
    created_event = ResponseCreatedEvent(
        sequence_number=next(seq), response=initial_response
    )
-    yield _format_sse(created_event)
+    yield f"event: response.created\ndata: {created_event.model_dump_json()}\n\n"

    # response.in_progress
    in_progress_event = ResponseInProgressEvent(
        sequence_number=next(seq), response=initial_response
    )
-    yield _format_sse(in_progress_event)
+    yield f"event: response.in_progress\ndata: {in_progress_event.model_dump_json()}\n\n"

    # response.output_item.added
    initial_item = ResponseMessageItem(
@@ -230,7 +224,7 @@ async def generate_responses_stream(
    item_added = ResponseOutputItemAddedEvent(
        sequence_number=next(seq), output_index=0, item=initial_item
    )
-    yield _format_sse(item_added)
+    yield f"event: response.output_item.added\ndata: {item_added.model_dump_json()}\n\n"

    # response.content_part.added
    initial_part = ResponseOutputText(text="")
@@ -241,7 +235,7 @@ async def generate_responses_stream(
        content_index=0,
        part=initial_part,
    )
-    yield _format_sse(part_added)
+    yield f"event: response.content_part.added\ndata: {part_added.model_dump_json()}\n\n"

    accumulated_text = ""
    function_call_items: list[ResponseFunctionCallItem] = []
@@ -272,7 +266,7 @@ async def generate_responses_stream(
                    output_index=next_output_index,
                    item=fc_item,
                )
-                yield _format_sse(fc_added)
+                yield f"event: response.output_item.added\ndata: {fc_added.model_dump_json()}\n\n"

                # response.function_call_arguments.delta
                args_delta = ResponseFunctionCallArgumentsDeltaEvent(
@@ -281,7 +275,7 @@ async def generate_responses_stream(
                    output_index=next_output_index,
                    delta=tool.arguments,
                )
-                yield _format_sse(args_delta)
+                yield f"event: response.function_call_arguments.delta\ndata: {args_delta.model_dump_json()}\n\n"

                # response.function_call_arguments.done
                args_done = ResponseFunctionCallArgumentsDoneEvent(
@@ -291,7 +285,7 @@ async def generate_responses_stream(
                    name=tool.name,
                    arguments=tool.arguments,
                )
-                yield _format_sse(args_done)
+                yield f"event: response.function_call_arguments.done\ndata: {args_done.model_dump_json()}\n\n"

                # response.output_item.done
                fc_done_item = ResponseFunctionCallItem(
@@ -306,7 +300,7 @@ async def generate_responses_stream(
                    output_index=next_output_index,
                    item=fc_done_item,
                )
-                yield _format_sse(fc_item_done)
+                yield f"event: response.output_item.done\ndata: {fc_item_done.model_dump_json()}\n\n"

                function_call_items.append(fc_done_item)
                next_output_index += 1
@@ -322,7 +316,7 @@ async def generate_responses_stream(
            content_index=0,
            delta=chunk.text,
        )
-        yield _format_sse(delta_event)
+        yield f"event: response.output_text.delta\ndata: {delta_event.model_dump_json()}\n\n"

    # response.output_text.done
    text_done = ResponseTextDoneEvent(
@@ -332,7 +326,7 @@ async def generate_responses_stream(
        content_index=0,
        text=accumulated_text,
    )
-    yield _format_sse(text_done)
+    yield f"event: response.output_text.done\ndata: {text_done.model_dump_json()}\n\n"

    # response.content_part.done
    final_part = ResponseOutputText(text=accumulated_text)
@@ -343,7 +337,7 @@ async def generate_responses_stream(
        content_index=0,
        part=final_part,
    )
-    yield _format_sse(part_done)
+    yield f"event: response.content_part.done\ndata: {part_done.model_dump_json()}\n\n"

    # response.output_item.done
    final_message_item = ResponseMessageItem(
@@ -354,7 +348,7 @@ async def generate_responses_stream(
    item_done = ResponseOutputItemDoneEvent(
        sequence_number=next(seq), output_index=0, item=final_message_item
    )
-    yield _format_sse(item_done)
+    yield f"event: response.output_item.done\ndata: {item_done.model_dump_json()}\n\n"

    # Create usage from usage data if available
    usage = None
@@ -379,4 +373,4 @@ async def generate_responses_stream(
    completed_event = ResponseCompletedEvent(
        sequence_number=next(seq), response=final_response
    )
-    yield _format_sse(completed_event)
+    yield f"event: response.completed\ndata: {completed_event.model_dump_json()}\n\n"
--- a/src/exo/master/api.py
+++ b/src/exo/master/api.py
@@ -143,7 +143,12 @@ from exo.shared.types.openai_responses import (
    ResponsesResponse,
 )
 from exo.shared.types.state import State
-from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
+from exo.shared.types.worker.instances import (
+    Instance,
+    InstanceId,
+    InstanceMeta,
+    MlxDevice,
+)
 from exo.shared.types.worker.shards import Sharding
 from exo.utils.banner import print_startup_banner
 from exo.utils.channels import Receiver, Sender, channel
@@ -310,6 +315,7 @@ class API:
            sharding=payload.sharding,
            instance_meta=payload.instance_meta,
            min_nodes=payload.min_nodes,
+            mlx_device=payload.mlx_device,
        )
        await self._send(command)

@@ -350,6 +356,7 @@ class API:
        sharding: Sharding = Sharding.Pipeline,
        instance_meta: InstanceMeta = InstanceMeta.MlxRing,
        min_nodes: int = 1,
+        mlx_device: MlxDevice = MlxDevice.Auto,
    ) -> Instance:
        model_card = await ModelCard.load(model_id)

@@ -360,6 +367,7 @@ class API:
                    sharding=sharding,
                    instance_meta=instance_meta,
                    min_nodes=min_nodes,
+                    mlx_device=mlx_device,
                ),
                node_memory=self.state.node_memory,
                node_network=self.state.node_network,
--- a/src/exo/master/placement.py
+++ b/src/exo/master/placement.py
@@ -159,6 +159,7 @@ def place_instance(
                shard_assignments=shard_assignments,
                jaccl_devices=mlx_jaccl_devices,
                jaccl_coordinators=mlx_jaccl_coordinators,
+                mlx_device=command.mlx_device,
            )
        case InstanceMeta.MlxRing:
            ephemeral_port = random_ephemeral_port()
@@ -173,6 +174,7 @@ def place_instance(
                shard_assignments=shard_assignments,
                hosts_by_node=hosts_by_node,
                ephemeral_port=ephemeral_port,
+                mlx_device=command.mlx_device,
            )

    return target_instances
--- a/src/exo/shared/types/api.py
+++ b/src/exo/shared/types/api.py
@@ -8,7 +8,12 @@ from pydantic import BaseModel, Field
 from exo.shared.models.model_cards import ModelCard, ModelId
 from exo.shared.types.common import CommandId, NodeId
 from exo.shared.types.memory import Memory
-from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
+from exo.shared.types.worker.instances import (
+    Instance,
+    InstanceId,
+    InstanceMeta,
+    MlxDevice,
+)
 from exo.shared.types.worker.shards import Sharding, ShardMetadata
 from exo.utils.pydantic_ext import CamelCaseModel

@@ -226,6 +231,7 @@ class PlaceInstanceParams(BaseModel):
    sharding: Sharding = Sharding.Pipeline
    instance_meta: InstanceMeta = InstanceMeta.MlxRing
    min_nodes: int = 1
+    mlx_device: MlxDevice = MlxDevice.Auto


 class CreateInstanceParams(BaseModel):
--- a/src/exo/shared/types/commands.py
+++ b/src/exo/shared/types/commands.py
@@ -8,7 +8,12 @@ from exo.shared.types.api import (
 from exo.shared.types.chunks import InputImageChunk
 from exo.shared.types.common import CommandId, NodeId
 from exo.shared.types.text_generation import TextGenerationTaskParams
-from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
+from exo.shared.types.worker.instances import (
+    Instance,
+    InstanceId,
+    InstanceMeta,
+    MlxDevice,
+)
 from exo.shared.types.worker.shards import Sharding, ShardMetadata
 from exo.utils.pydantic_ext import CamelCaseModel, TaggedModel

@@ -38,6 +43,7 @@ class PlaceInstance(BaseCommand):
    sharding: Sharding
    instance_meta: InstanceMeta
    min_nodes: int
+    mlx_device: MlxDevice = MlxDevice.Auto


 class CreateInstance(BaseCommand):
--- a/src/exo/shared/types/worker/instances.py
+++ b/src/exo/shared/types/worker/instances.py
@@ -16,9 +16,16 @@ class InstanceMeta(str, Enum):
    MlxJaccl = "MlxJaccl"


+class MlxDevice(str, Enum):
+    Auto = "Auto"
+    Cpu = "Cpu"
+    Gpu = "Gpu"
+
+
 class BaseInstance(TaggedModel):
    instance_id: InstanceId
    shard_assignments: ShardAssignments
+    mlx_device: MlxDevice = MlxDevice.Auto

    def shard(self, runner_id: RunnerId) -> ShardMetadata | None:
        return self.shard_assignments.runner_to_shard.get(runner_id, None)
--- a/src/exo/worker/runner/bootstrap.py
+++ b/src/exo/worker/runner/bootstrap.py
@@ -4,7 +4,7 @@ import loguru

 from exo.shared.types.events import Event, RunnerStatusUpdated
 from exo.shared.types.tasks import Task, TaskId
-from exo.shared.types.worker.instances import BoundInstance, MlxJacclInstance
+from exo.shared.types.worker.instances import BoundInstance, MlxDevice, MlxJacclInstance
 from exo.shared.types.worker.runners import RunnerFailed
 from exo.utils.channels import ClosedResourceError, MpReceiver, MpSender

@@ -35,6 +35,15 @@ def entrypoint(

    logger.info(f"Fast synch flag: {os.environ['MLX_METAL_FAST_SYNCH']}")

+    # Set MLX compute device before importing runner (which imports mlx.core at module scope)
+    mlx_device = bound_instance.instance.mlx_device
+    if mlx_device != MlxDevice.Auto:
+        import mlx.core as mx
+
+        device = mx.cpu if mlx_device == MlxDevice.Cpu else mx.gpu
+        mx.set_default_device(device)
+        logger.info(f"MLX device set to: {mlx_device}")
+
    # Import main after setting global logger - this lets us just import logger from this module
    try:
        from exo.worker.runner.runner import main
Author	SHA1	Message	Date
Alex Cheema	f8748eade1	Add MLX compute device backend selection (cpu/gpu/auto) Add MlxDevice enum (Auto/Cpu/Gpu) that flows from the API through PlaceInstance commands to instance creation and runner initialization. The device is set via mx.set_default_device() in bootstrap.py before the runner imports MLX, ensuring all subsequent operations use the chosen device. This enables Linux CPU-only machines to participate in inference and allows forcing CPU mode for testing or when GPU has issues. Default is Auto (current behavior) for full backward compatibility. Closes #958 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-17 10:52:11 -08:00
rltakashige	83af8c63fa	Revert "Use custom fork that resolves GPU locks" (#1502 ) Reverts exo-explore/exo#1489 Goddammit Claude...	2026-02-17 18:18:54 +00:00
Evan Quiney	eccc6298d1	Revert "Add MetaInstance declarative layer (#1447 )" This reverts commit `a962a28afc`.	2026-02-17 18:11:47 +00:00
Evan Quiney	c8997217cf	Revert "feat: better onboarding UX for new users (#1479 )" This reverts commit `490d2e46ba`.	2026-02-17 18:02:32 +00:00
Alex Cheema	490d2e46ba	feat: better onboarding UX for new users (#1479 ) ## Summary - Auto-open dashboard in browser on first launch (uses `~/.exo/.dashboard_opened` marker) - Welcome overlay with "Choose a Model" CTA button when no model instance is running - Tutorial progress messages during model download → loading → ready lifecycle stages - Fix conversation sidebar text contrast — bumped to white text, added active state background - Simplify technical jargon — sharding/instance type/min nodes hidden behind collapsible "Advanced Options" toggle; strategy display hidden behind debug mode - Polished DMG installer with drag-to-Applications layout, custom branded background, and AppleScript-configured window positioning ## Test plan - [ ] Launch exo for the first time (delete `~/.exo/.dashboard_opened` to simulate) — browser should auto-open - [ ] Verify welcome overlay appears on topology when no model is loaded - [ ] Launch a model and verify download/loading/ready messages appear in instance cards - [ ] Check conversation sidebar text is readable (white on dark, yellow when active) - [ ] Verify "Advanced Options" toggle hides/shows sharding controls - [ ] Build DMG with `packaging/dmg/create-dmg.sh` and verify drag-to-Applications layout 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-17 17:52:49 +00:00
rltakashige	facf2d4d03	Use custom fork that resolves GPU locks (#1489 ) ## Motivation There is an issue on Macs that means that an explicit synchronization is necessary for memory to be updated from L1 cache. This means that GPU locks can occur when a spin wait does not see the updated timestamp. ## Changes Updated in my own personal fork. ## Why It Works https://github.com/ARM-software/acle/releases ## Test Plan ### Manual Testing Tested manually that no GPU locks occur (even with multiple simultaneous instances running) and that the performance differential is negligible (267 vs 269 tps on Llama 3.2 1B at an approx 10k context.) ------------------------------------------------------ I have seen a GPU lock, specifically when sending a particularly large chat completion while the model was loading. However, I have since been unable to reproduce and this may be something I did wrong. Please do create an issue and tag me if any GPU locks do occur. --------- Co-authored-by: Jake Hillion <jake@hillion.co.uk> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-17 17:48:43 +00:00
Alex Cheema	a962a28afc	Add MetaInstance declarative layer (#1447 ) ## Motivation Users currently manage instances directly, which means if a node disconnects or connections break, the instance dies and nothing recreates it. MetaInstance is a declarative primitive: "ensure an instance matching these parameters always exists." The reconciler watches for unhealthy or missing backing instances and re-places them automatically. ## Changes - MetaInstance type (`meta_instance.py`): declarative constraint with `model_id`, `min_nodes`, optional `node_ids`, and `sharding` - Reconciler (`reconcile.py`): `find_unsatisfied_meta_instances` checks which MetaInstances lack a healthy backing instance, `try_place_for_meta_instance` creates one - Master loop (`main.py`): periodically reconciles unsatisfied MetaInstances; immediate placement on `CreateMetaInstance` command - API (`api.py`): `create_meta_instance` / `delete_meta_instance` / `GET /meta_instances` endpoints; delete cascades to backing instances with task cancellation - Binding via `meta_instance_id` on Instance (`instances.py`): no separate binding event or backing map — the instance carries its parent MetaInstance ID directly, eliminating race conditions in the reconciler - Dashboard: sidebar shows MetaInstances with their backing instance status; orphan instances (created directly) still shown separately - Tests: constraint matching, connection health, unsatisfied detection, exclusive binding, cascade delete with task cancellation ### Recent improvements - fix: cancel active tasks on cascade delete — `DeleteMetaInstance` now emits `TaskStatusUpdated(Cancelled)` for any Pending/Running tasks on backing instances before emitting `InstanceDeleted`. Previously, cascade-deleting backing instances left orphaned task references in state. - Lifecycle logging — added `logger.info`/`logger.warning` for: `CreateMetaInstance` (model, min_nodes, sharding), `DeleteMetaInstance` (with cascade count), reconciler placement success/failure, and retry decisions with attempt counts in `InstanceHealthReconciler`. - GET `/meta_instances` endpoint — lists all meta-instances without needing to fetch full state. - 2 regression tests — `test_cascade_delete_cancels_active_tasks` and `test_cascade_delete_skips_completed_tasks` verify the cascade-delete event sequence. ## Why It Works Putting `meta_instance_id` on `BaseInstance` makes binding inherent to instance creation. When the reconciler creates an instance for a MetaInstance, it tags it via `model_copy`. When the instance is deleted, the binding disappears with it. This avoids the two bugs that a separate binding mechanism would introduce: 1. Stale exclusion sets — the reconciler loop can't accidentally bind two MetaInstances to the same instance 2. Delete ordering race — no window between deleting an instance and its binding where the reconciler could re-place ## Test Plan ### Manual Testing <!-- Hardware: (e.g., MacBook Pro M1 Max 32GB, Mac Mini M2 16GB, connected via Thunderbolt 4) --> - Created MetaInstance via dashboard, verified instance placed - Verified delete cascades (deleting MetaInstance removes backing instance) - Verified orphan instances still work independently ### Automated Testing - 30 tests in `test_meta_instance_edge_cases.py`: lifecycle, retry logic, error handling, concurrent operations, cascade delete with task cancellation - 24 tests in `test_reconcile.py`: constraint matching, connection health (single/multi-node, edge removal, IP changes), unsatisfied detection, exclusive binding, idempotency - All 261 tests pass - basedpyright 0 errors, ruff clean, dashboard builds --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-02-17 09:48:19 -08:00