wrrg

add model cards
introduce resources folder
2026-01-21 12:30:22 -05:00 · 2026-01-20 11:14:20 +00:00 · 2026-01-20 10:56:29 +00:00 · 2026-01-20 10:56:29 +00:00 · 2026-01-20 10:56:29 +00:00 · 2026-01-20 10:56:17 +00:00
50 changed files with 664 additions and 435 deletions
--- a/packaging/pyinstaller/exo.spec
+++ b/packaging/pyinstaller/exo.spec
@@ -10,6 +10,7 @@ PROJECT_ROOT = Path.cwd()
 SOURCE_ROOT = PROJECT_ROOT / "src"
 ENTRYPOINT = SOURCE_ROOT / "exo" / "__main__.py"
 DASHBOARD_DIR = PROJECT_ROOT / "dashboard" / "build"
+RESOURCES_DIR = PROJECT_ROOT / "resources"
 EXO_SHARED_MODELS_DIR = SOURCE_ROOT / "exo" / "shared" / "models"

 if not ENTRYPOINT.is_file():
@@ -18,6 +19,9 @@ if not ENTRYPOINT.is_file():
 if not DASHBOARD_DIR.is_dir():
    raise SystemExit(f"Dashboard assets are missing: {DASHBOARD_DIR}")

+if not RESOURCES_DIR.is_dir():
+    raise SystemExit(f"Resources are missing: {RESOURCES_DIR}")
+
 if not EXO_SHARED_MODELS_DIR.is_dir():
    raise SystemExit(f"Shared model assets are missing: {EXO_SHARED_MODELS_DIR}")

@@ -58,6 +62,7 @@ HIDDEN_IMPORTS = sorted(

 DATAS: list[tuple[str, str]] = [
    (str(DASHBOARD_DIR), "dashboard"),
+    (str(RESOURCES_DIR), "resources"),
    (str(MLX_LIB_DIR), "mlx/lib"),
    (str(EXO_SHARED_MODELS_DIR), "exo/shared/models"),
 ]
--- a/resources/mlx-community--DeepSeek-V3.1-4bit.toml
+++ b/resources/mlx-community--DeepSeek-V3.1-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/DeepSeek-V3.1-4bit"
+n_layers = 61
+hidden_size = 7168
+supports_tensor = true
+
+[storage_size]
+in_bytes = 405874409472
--- a/resources/mlx-community--DeepSeek-V3.1-8bit.toml
+++ b/resources/mlx-community--DeepSeek-V3.1-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/DeepSeek-V3.1-8bit"
+n_layers = 61
+hidden_size = 7168
+supports_tensor = true
+
+[storage_size]
+in_bytes = 765577920512
--- a/resources/mlx-community--GLM-4.5-Air-8bit.toml
+++ b/resources/mlx-community--GLM-4.5-Air-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.5-Air-8bit"
+n_layers = 46
+hidden_size = 4096
+supports_tensor = false
+
+[storage_size]
+in_bytes = 122406567936
--- a/resources/mlx-community--GLM-4.5-Air-bf16.toml
+++ b/resources/mlx-community--GLM-4.5-Air-bf16.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.5-Air-bf16"
+n_layers = 46
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 229780750336
--- a/resources/mlx-community--GLM-4.7-4bit.toml
+++ b/resources/mlx-community--GLM-4.7-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.7-4bit"
+n_layers = 91
+hidden_size = 5120
+supports_tensor = true
+
+[storage_size]
+in_bytes = 198556925568
--- a/resources/mlx-community--GLM-4.7-6bit.toml
+++ b/resources/mlx-community--GLM-4.7-6bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.7-6bit"
+n_layers = 91
+hidden_size = 5120
+supports_tensor = true
+
+[storage_size]
+in_bytes = 286737579648
--- a/resources/mlx-community--GLM-4.7-8bit-gs32.toml
+++ b/resources/mlx-community--GLM-4.7-8bit-gs32.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.7-8bit-gs32"
+n_layers = 91
+hidden_size = 5120
+supports_tensor = true
+
+[storage_size]
+in_bytes = 396963397248
--- a/resources/mlx-community--Kimi-K2-Instruct-4bit.toml
+++ b/resources/mlx-community--Kimi-K2-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Kimi-K2-Instruct-4bit"
+n_layers = 61
+hidden_size = 7168
+supports_tensor = true
+
+[storage_size]
+in_bytes = 620622774272
--- a/resources/mlx-community--Kimi-K2-Thinking.toml
+++ b/resources/mlx-community--Kimi-K2-Thinking.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Kimi-K2-Thinking"
+n_layers = 61
+hidden_size = 7168
+supports_tensor = true
+
+[storage_size]
+in_bytes = 706522120192
--- a/resources/mlx-community--Llama-3.2-1B-Instruct-4bit.toml
+++ b/resources/mlx-community--Llama-3.2-1B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
+n_layers = 16
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 729808896
--- a/resources/mlx-community--Llama-3.2-3B-Instruct-4bit.toml
+++ b/resources/mlx-community--Llama-3.2-3B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
+n_layers = 28
+hidden_size = 3072
+supports_tensor = true
+
+[storage_size]
+in_bytes = 1863319552
--- a/resources/mlx-community--Llama-3.2-3B-Instruct-8bit.toml
+++ b/resources/mlx-community--Llama-3.2-3B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
+n_layers = 28
+hidden_size = 3072
+supports_tensor = true
+
+[storage_size]
+in_bytes = 3501195264
--- a/resources/mlx-community--Llama-3.3-70B-Instruct-4bit.toml
+++ b/resources/mlx-community--Llama-3.3-70B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
+n_layers = 80
+hidden_size = 8192
+supports_tensor = true
+
+[storage_size]
+in_bytes = 40652242944
--- a/resources/mlx-community--Llama-3.3-70B-Instruct-8bit.toml
+++ b/resources/mlx-community--Llama-3.3-70B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
+n_layers = 80
+hidden_size = 8192
+supports_tensor = true
+
+[storage_size]
+in_bytes = 76799803392
--- a/resources/mlx-community--Meta-Llama-3.1-70B-Instruct-4bit.toml
+++ b/resources/mlx-community--Meta-Llama-3.1-70B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
+n_layers = 80
+hidden_size = 8192
+supports_tensor = true
+
+[storage_size]
+in_bytes = 40652242944
--- a/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-4bit.toml
+++ b/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
+n_layers = 32
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 4637851648
--- a/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-8bit.toml
+++ b/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
+n_layers = 32
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 8954839040
--- a/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-bf16.toml
+++ b/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-bf16.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
+n_layers = 32
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 16882073600
--- a/resources/mlx-community--MiniMax-M2.1-3bit.toml
+++ b/resources/mlx-community--MiniMax-M2.1-3bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/MiniMax-M2.1-3bit"
+n_layers = 61
+hidden_size = 3072
+supports_tensor = true
+
+[storage_size]
+in_bytes = 100086644736
--- a/resources/mlx-community--MiniMax-M2.1-8bit.toml
+++ b/resources/mlx-community--MiniMax-M2.1-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/MiniMax-M2.1-8bit"
+n_layers = 61
+hidden_size = 3072
+supports_tensor = true
+
+[storage_size]
+in_bytes = 242986745856
--- a/resources/mlx-community--Qwen3-0.6B-4bit.toml
+++ b/resources/mlx-community--Qwen3-0.6B-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-0.6B-4bit"
+n_layers = 28
+hidden_size = 1024
+supports_tensor = false
+
+[storage_size]
+in_bytes = 342884352
--- a/resources/mlx-community--Qwen3-0.6B-8bit.toml
+++ b/resources/mlx-community--Qwen3-0.6B-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-0.6B-8bit"
+n_layers = 28
+hidden_size = 1024
+supports_tensor = false
+
+[storage_size]
+in_bytes = 698351616
--- a/resources/mlx-community--Qwen3-235B-A22B-Instruct-2507-4bit.toml
+++ b/resources/mlx-community--Qwen3-235B-A22B-Instruct-2507-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
+n_layers = 94
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 141733920768
--- a/resources/mlx-community--Qwen3-235B-A22B-Instruct-2507-8bit.toml
+++ b/resources/mlx-community--Qwen3-235B-A22B-Instruct-2507-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
+n_layers = 94
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 268435456000
--- a/resources/mlx-community--Qwen3-30B-A3B-4bit.toml
+++ b/resources/mlx-community--Qwen3-30B-A3B-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-30B-A3B-4bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 17612931072
--- a/resources/mlx-community--Qwen3-30B-A3B-8bit.toml
+++ b/resources/mlx-community--Qwen3-30B-A3B-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-30B-A3B-8bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 33279705088
--- a/resources/mlx-community--Qwen3-Coder-480B-A35B-Instruct-4bit.toml
+++ b/resources/mlx-community--Qwen3-Coder-480B-A35B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
+n_layers = 62
+hidden_size = 6144
+supports_tensor = true
+
+[storage_size]
+in_bytes = 289910292480
--- a/resources/mlx-community--Qwen3-Coder-480B-A35B-Instruct-8bit.toml
+++ b/resources/mlx-community--Qwen3-Coder-480B-A35B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
+n_layers = 62
+hidden_size = 6144
+supports_tensor = true
+
+[storage_size]
+in_bytes = 579820584960
--- a/resources/mlx-community--Qwen3-Next-80B-A3B-Instruct-4bit.toml
+++ b/resources/mlx-community--Qwen3-Next-80B-A3B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 46976204800
--- a/resources/mlx-community--Qwen3-Next-80B-A3B-Instruct-8bit.toml
+++ b/resources/mlx-community--Qwen3-Next-80B-A3B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 88814387200
--- a/resources/mlx-community--Qwen3-Next-80B-A3B-Thinking-4bit.toml
+++ b/resources/mlx-community--Qwen3-Next-80B-A3B-Thinking-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 88814387200
--- a/resources/mlx-community--Qwen3-Next-80B-A3B-Thinking-8bit.toml
+++ b/resources/mlx-community--Qwen3-Next-80B-A3B-Thinking-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 88814387200
--- a/resources/mlx-community--gpt-oss-120b-MXFP4-Q8.toml
+++ b/resources/mlx-community--gpt-oss-120b-MXFP4-Q8.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
+n_layers = 36
+hidden_size = 2880
+supports_tensor = true
+
+[storage_size]
+in_bytes = 70652212224
--- a/resources/mlx-community--gpt-oss-20b-MXFP4-Q8.toml
+++ b/resources/mlx-community--gpt-oss-20b-MXFP4-Q8.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/gpt-oss-20b-MXFP4-Q8"
+n_layers = 24
+hidden_size = 2880
+supports_tensor = true
+
+[storage_size]
+in_bytes = 12025908224
--- a/resources/mlx-community--llama-3.3-70b-instruct-fp16.toml
+++ b/resources/mlx-community--llama-3.3-70b-instruct-fp16.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
+n_layers = 80
+hidden_size = 8192
+supports_tensor = true
+
+[storage_size]
+in_bytes = 144383672320
--- a/src/exo/master/api.py
+++ b/src/exo/master/api.py
@@ -1,5 +1,6 @@
 import time
 from collections.abc import AsyncGenerator
+from dataclasses import dataclass, field
 from http import HTTPStatus
 from typing import cast

@@ -19,8 +20,7 @@ from exo.master.placement import place_instance as get_instance_placements
 from exo.shared.apply import apply
 from exo.shared.election import ElectionMessage
 from exo.shared.logging import InterceptLogger
-from exo.shared.models.model_cards import MODEL_CARDS, ModelCard, ModelId
-from exo.shared.models.model_meta import get_model_card
+from exo.shared.models.model_cards import ModelCard, ModelId, get_model_cards
 from exo.shared.types.api import (
    BenchChatCompletionResponse,
    BenchChatCompletionTaskParams,
@@ -65,7 +65,7 @@ from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
 from exo.shared.types.worker.shards import Sharding
 from exo.utils.banner import print_startup_banner
 from exo.utils.channels import Receiver, Sender, channel
-from exo.utils.dashboard_path import find_dashboard
+from exo.utils.dashboard_path import RuntimeResources, find_directory
 from exo.utils.event_buffer import OrderedBuffer


@@ -86,57 +86,52 @@ def chunk_to_response(
    )


-async def resolve_model_card(model_id: str) -> ModelCard:
-    if model_id in MODEL_CARDS:
-        model_card = MODEL_CARDS[model_id]
-        return model_card
-    else:
-        return await get_model_card(model_id)
-
-
+@dataclass(eq=False)
 class API:
-    def __init__(
-        self,
+    node_id: NodeId
+    session_id: SessionId
+    port: int
+    app: FastAPI
+    global_event_receiver: Receiver[ForwarderEvent]  # ideally a MasterForwarderEvent, but the type system says no
+    command_sender: Sender[ForwarderCommand]
+    election_receiver: Receiver[ElectionMessage]  # lets us pause the API if an election is running
+    state: State = field(init=False, default_factory=State)  # annotation required, else @dataclass skips it
+    _event_log: list[Event] = field(init=False, default_factory=list)
+    event_buffer: OrderedBuffer[Event] = field(init=False, default_factory=OrderedBuffer)
+    _chat_completion_queues: dict[CommandId, Sender[TokenChunk]] = field(init=False, default_factory=dict)
+    _tg: TaskGroup = field(init=False, default_factory=create_task_group)
+    last_completed_election: int = field(init=False, default=0)
+    paused: bool = field(init=False, default=False)
+    paused_ev: anyio.Event = field(init=False, default_factory=anyio.Event)
+
+    @classmethod
+    async def create(
+        cls,
        node_id: NodeId,
        session_id: SessionId,
        *,
        port: int,
-        # Ideally this would be a MasterForwarderEvent but type system says no :(
        global_event_receiver: Receiver[ForwarderEvent],
        command_sender: Sender[ForwarderCommand],
-        # This lets us pause the API if an election is running
        election_receiver: Receiver[ElectionMessage],
-    ) -> None:
+    ) -> "API":
-        self.state = State()
-        self._event_log: list[Event] = []
-        self.command_sender = command_sender
-        self.global_event_receiver = global_event_receiver
-        self.election_receiver = election_receiver
-        self.event_buffer: OrderedBuffer[Event] = OrderedBuffer[Event]()
-        self.node_id: NodeId = node_id
-        self.session_id: SessionId = session_id
-        self.last_completed_election: int = 0
-        self.port = port
-
-        self.paused: bool = False
-        self.paused_ev: anyio.Event = anyio.Event()
-
-        self.app = FastAPI()
-        self._setup_exception_handlers()
-        self._setup_cors()
-        self._setup_routes()
-
-        self.app.mount(
+        app = FastAPI()
+        api = cls(node_id, session_id, port, app, global_event_receiver, command_sender, election_receiver)
+        app.mount(  # mounted after cls(): __post_init__ must register API routes before the "/" catch-all
            "/",
            StaticFiles(
-                directory=find_dashboard(),
+                directory=await find_directory(RuntimeResources.Dashboard),
                html=True,
            ),
            name="dashboard",
        )

-        self._chat_completion_queues: dict[CommandId, Sender[TokenChunk]] = {}
-        self._tg: TaskGroup | None = None
+        return api  # hand the constructed instance back to the caller instead of discarding it
+
+    def __post_init__(self) -> None:
+        self._setup_exception_handlers()
+        self._setup_cors()
+        self._setup_routes()

    def reset(self, new_session_id: SessionId, result_clock: int):
        logger.info("Resetting API State")
@@ -213,7 +208,7 @@ class API:
        self, payload: CreateInstanceParams
    ) -> CreateInstanceResponse:
        instance = payload.instance
-        model_card = await resolve_model_card(instance.shard_assignments.model_id)
+        model_card = await ModelCard.from_hf(instance.shard_assignments.model_id)
        required_memory = model_card.storage_size
        available_memory = self._calculate_total_available_memory()

@@ -279,7 +274,7 @@ class API:
        if len(list(self.state.topology.list_nodes())) == 0:
            return PlacementPreviewResponse(previews=[])

-        cards = [card for card in MODEL_CARDS.values() if card.model_id == model_id]
+        cards = [card for card in await get_model_cards() if card.short_id == model_id]
        if not cards:
            raise HTTPException(status_code=404, detail=f"Model {model_id} not found")

@@ -620,7 +615,7 @@ class API:
                    storage_size_megabytes=int(card.storage_size.in_mb),
                    supports_tensor=card.supports_tensor,
                )
-                for card in MODEL_CARDS.values()
+                for card in await get_model_cards()
            ]
        )

--- a/src/exo/master/tests/test_placement.py
+++ b/src/exo/master/tests/test_placement.py
@@ -276,7 +276,9 @@ def test_placement_selects_leaf_nodes(
    # arrange
    topology = Topology()

-    model_card.storage_size = Memory.from_bytes(1000)
+    # Model requires more than any single node but fits within a 3-node cycle
+    model_card.storage_size.in_bytes = 1500
+    model_card.n_layers = 12

    node_id_a = NodeId()
    node_id_b = NodeId()
--- a/src/exo/shared/models/model_cards.py
+++ b/src/exo/shared/models/model_cards.py
@@ -1,8 +1,24 @@
-from pydantic import PositiveInt
+from __future__ import annotations
+from typing import Annotated

+import aiofiles
+import aiofiles.os as aios
+import tomlkit
+from anyio import Path, open_file
+from huggingface_hub import model_info
+from loguru import logger
+from pydantic import BaseModel, Field, PositiveInt, ValidationError
+from tomlkit.exceptions import TOMLKitError
+
 from exo.shared.types.common import Id
 from exo.shared.types.memory import Memory
+from exo.utils.dashboard_path import RuntimeResources, find_directory
 from exo.utils.pydantic_ext import CamelCaseModel
+from exo.worker.download.download_utils import (
+    ModelSafetensorsIndex,
+    download_file_with_retry,
+    ensure_models_dir,
+)


 class ModelId(Id):
@@ -12,6 +28,7 @@ class ModelId(Id):
    def short(self) -> str:
        return self.split("/")[-1]

+_card_cache: dict[str, "ModelCard"] = {}  # quoted: ModelCard is defined further down this module

 class ModelCard(CamelCaseModel):
    model_id: ModelId
@@ -20,249 +37,67 @@ class ModelCard(CamelCaseModel):
    hidden_size: PositiveInt
    supports_tensor: bool

+    async def save(self, path: Path) -> None:
+        """Serialize this card to TOML and write it to ``path``."""
+        data = tomlkit.dumps(self.model_dump())  # pyright: ignore[reportUnknownMemberType]
+        async with await open_file(path, "w") as f:
+            await f.write(data)
+
+    async def save_to_default_path(self) -> None:
+        resources_dir = await find_directory(RuntimeResources.Resources)
+        await self.save(resources_dir / f"{self.model_id.normalize()}.toml")  # ".toml" so get_model_cards() globs it
+
+    @staticmethod
+    async def load_from_path(path: Path) -> "ModelCard":
+        """Parse and validate the TOML model card stored at ``path``."""
+        async with await open_file(path, "r") as f:
+            return ModelCard.model_validate(tomlkit.loads(await f.read()))
+
+    @staticmethod
+    async def load_from_default_path(model_id: ModelId) -> "ModelCard":
+        resources_dir = await find_directory(RuntimeResources.Resources)  # NOTE(review): assumes normalize() gives the "org--name" stem
+        return await ModelCard.load_from_path(resources_dir / f"{model_id.normalize()}.toml")
+
+    @staticmethod
+    async def load(model_id: ModelId) -> "ModelCard":
+        try:
+            return await ModelCard.load_from_default_path(model_id)
+        except (ValidationError, TOMLKitError, FileNotFoundError):
+            return await ModelCard.from_hf(model_id)
+
+    @staticmethod
+    async def from_hf(model_id: ModelId) -> "ModelCard":
+        """Build a ModelCard from the model's config.json and safetensors size, cached per model id."""
+        if (mc := _card_cache.get(model_id, None)) is not None:
+            return mc
+        config_data = await get_config_data(model_id)
+        num_layers = config_data.layer_count
+        mem_size_bytes = await get_safetensors_size(model_id)
+
+        mc = ModelCard(
+            model_id=ModelId(model_id),
+            storage_size=mem_size_bytes,
+            n_layers=num_layers,
+            hidden_size=config_data.hidden_size or 0,  # NOTE(review): 0 is rejected by the PositiveInt field
+            # TODO: all custom models currently do not support tensor. We could add a dynamic test for this?
+            supports_tensor=False,
+        )
+        _card_cache[model_id] = mc
+        return mc
+
+# TODO: should we cache this? how do we check for changes
+async def get_model_cards() -> list[ModelCard]:
+    """Load every ``*.toml`` model card in the resources directory, skipping invalid files."""
+    resources_dir = await find_directory(RuntimeResources.Resources)
+    cards: list[ModelCard] = []
+    async for file in resources_dir.glob("*.toml"):
+        try:
+            cards.append(await ModelCard.load_from_path(file))
+        except (TOMLKitError, ValidationError) as err:
+            logger.warning(f"Skipping invalid model card {file}: {err}")  # log rather than drop silently
+    return cards

 MODEL_CARDS: dict[str, ModelCard] = {
-    # deepseek v3
-    "deepseek-v3.1-4bit": ModelCard(
-        model_id=ModelId("mlx-community/DeepSeek-V3.1-4bit"),
-        storage_size=Memory.from_gb(378),
-        n_layers=61,
-        hidden_size=7168,
-        supports_tensor=True,
-    ),
-    "deepseek-v3.1-8bit": ModelCard(
-        model_id=ModelId("mlx-community/DeepSeek-V3.1-8bit"),
-        storage_size=Memory.from_gb(713),
-        n_layers=61,
-        hidden_size=7168,
-        supports_tensor=True,
-    ),
-    # kimi k2
-    "kimi-k2-instruct-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Kimi-K2-Instruct-4bit"),
-        storage_size=Memory.from_gb(578),
-        n_layers=61,
-        hidden_size=7168,
-        supports_tensor=True,
-    ),
-    "kimi-k2-thinking": ModelCard(
-        model_id=ModelId("mlx-community/Kimi-K2-Thinking"),
-        storage_size=Memory.from_gb(658),
-        n_layers=61,
-        hidden_size=7168,
-        supports_tensor=True,
-    ),
-    # llama-3.1
-    "llama-3.1-8b": ModelCard(
-        model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"),
-        storage_size=Memory.from_mb(4423),
-        n_layers=32,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "llama-3.1-8b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"),
-        storage_size=Memory.from_mb(8540),
-        n_layers=32,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "llama-3.1-8b-bf16": ModelCard(
-        model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"),
-        storage_size=Memory.from_mb(16100),
-        n_layers=32,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "llama-3.1-70b": ModelCard(
-        model_id=ModelId("mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"),
-        storage_size=Memory.from_mb(38769),
-        n_layers=80,
-        hidden_size=8192,
-        supports_tensor=True,
-    ),
-    # llama-3.2
-    "llama-3.2-1b": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.2-1B-Instruct-4bit"),
-        storage_size=Memory.from_mb(696),
-        n_layers=16,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "llama-3.2-3b": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-4bit"),
-        storage_size=Memory.from_mb(1777),
-        n_layers=28,
-        hidden_size=3072,
-        supports_tensor=True,
-    ),
-    "llama-3.2-3b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-8bit"),
-        storage_size=Memory.from_mb(3339),
-        n_layers=28,
-        hidden_size=3072,
-        supports_tensor=True,
-    ),
-    # llama-3.3
-    "llama-3.3-70b": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-4bit"),
-        storage_size=Memory.from_mb(38769),
-        n_layers=80,
-        hidden_size=8192,
-        supports_tensor=True,
-    ),
-    "llama-3.3-70b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-8bit"),
-        storage_size=Memory.from_mb(73242),
-        n_layers=80,
-        hidden_size=8192,
-        supports_tensor=True,
-    ),
-    "llama-3.3-70b-fp16": ModelCard(
-        model_id=ModelId("mlx-community/llama-3.3-70b-instruct-fp16"),
-        storage_size=Memory.from_mb(137695),
-        n_layers=80,
-        hidden_size=8192,
-        supports_tensor=True,
-    ),
-    # qwen3
-    "qwen3-0.6b": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-0.6B-4bit"),
-        storage_size=Memory.from_mb(327),
-        n_layers=28,
-        hidden_size=1024,
-        supports_tensor=False,
-    ),
-    "qwen3-0.6b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-0.6B-8bit"),
-        storage_size=Memory.from_mb(666),
-        n_layers=28,
-        hidden_size=1024,
-        supports_tensor=False,
-    ),
-    "qwen3-30b": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-30B-A3B-4bit"),
-        storage_size=Memory.from_mb(16797),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-30b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-30B-A3B-8bit"),
-        storage_size=Memory.from_mb(31738),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-80b-a3B-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"),
-        storage_size=Memory.from_mb(44800),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-80b-a3B-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"),
-        storage_size=Memory.from_mb(84700),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-80b-a3B-thinking-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"),
-        storage_size=Memory.from_mb(84700),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-80b-a3B-thinking-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"),
-        storage_size=Memory.from_mb(84700),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-235b-a22b-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"),
-        storage_size=Memory.from_gb(132),
-        n_layers=94,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "qwen3-235b-a22b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"),
-        storage_size=Memory.from_gb(250),
-        n_layers=94,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "qwen3-coder-480b-a35b-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"),
-        storage_size=Memory.from_gb(270),
-        n_layers=62,
-        hidden_size=6144,
-        supports_tensor=True,
-    ),
-    "qwen3-coder-480b-a35b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"),
-        storage_size=Memory.from_gb(540),
-        n_layers=62,
-        hidden_size=6144,
-        supports_tensor=True,
-    ),
-    # gpt-oss
-    "gpt-oss-120b-MXFP4-Q8": ModelCard(
-        model_id=ModelId("mlx-community/gpt-oss-120b-MXFP4-Q8"),
-        storage_size=Memory.from_kb(68_996_301),
-        n_layers=36,
-        hidden_size=2880,
-        supports_tensor=True,
-    ),
-    "gpt-oss-20b-MXFP4-Q8": ModelCard(
-        model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
-        storage_size=Memory.from_kb(11_744_051),
-        n_layers=24,
-        hidden_size=2880,
-        supports_tensor=True,
-    ),
-    # glm 4.5
-    "glm-4.5-air-8bit": ModelCard(
-        # Needs to be quantized g32 or g16 to work with tensor parallel
-        model_id=ModelId("mlx-community/GLM-4.5-Air-8bit"),
-        storage_size=Memory.from_gb(114),
-        n_layers=46,
-        hidden_size=4096,
-        supports_tensor=False,
-    ),
-    "glm-4.5-air-bf16": ModelCard(
-        model_id=ModelId("mlx-community/GLM-4.5-Air-bf16"),
-        storage_size=Memory.from_gb(214),
-        n_layers=46,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    # glm 4.7
-    "glm-4.7-4bit": ModelCard(
-        model_id=ModelId("mlx-community/GLM-4.7-4bit"),
-        storage_size=Memory.from_bytes(198556925568),
-        n_layers=91,
-        hidden_size=5120,
-        supports_tensor=True,
-    ),
-    "glm-4.7-6bit": ModelCard(
-        model_id=ModelId("mlx-community/GLM-4.7-6bit"),
-        storage_size=Memory.from_bytes(286737579648),
-        n_layers=91,
-        hidden_size=5120,
-        supports_tensor=True,
-    ),
-    "glm-4.7-8bit-gs32": ModelCard(
-        model_id=ModelId("mlx-community/GLM-4.7-8bit-gs32"),
-        storage_size=Memory.from_bytes(396963397248),
-        n_layers=91,
-        hidden_size=5120,
-        supports_tensor=True,
-    ),
    # glm 4.7 flash
    "glm-4.7-flash-4bit": ModelCard(
        model_id=ModelId("mlx-community/GLM-4.7-Flash-4bit"),
@@ -292,19 +127,83 @@ MODEL_CARDS: dict[str, ModelCard] = {
        hidden_size=2048,
        supports_tensor=True,
    ),
-    # minimax-m2
-    "minimax-m2.1-8bit": ModelCard(
-        model_id=ModelId("mlx-community/MiniMax-M2.1-8bit"),
-        storage_size=Memory.from_bytes(242986745856),
-        n_layers=61,
-        hidden_size=3072,
-        supports_tensor=True,
-    ),
-    "minimax-m2.1-3bit": ModelCard(
-        model_id=ModelId("mlx-community/MiniMax-M2.1-3bit"),
-        storage_size=Memory.from_bytes(100086644736),
-        n_layers=61,
-        hidden_size=3072,
-        supports_tensor=True,
-    ),
 }
+
+
+
+
+
+class ConfigData(BaseModel):
+    model_config = {"extra": "ignore"}  # Allow unknown fields
+
+    # Common field names for number of layers across different architectures
+    num_hidden_layers: Annotated[int, Field(ge=0)] | None = None
+    num_layers: Annotated[int, Field(ge=0)] | None = None
+    n_layer: Annotated[int, Field(ge=0)] | None = None
+    n_layers: Annotated[int, Field(ge=0)] | None = None  # Sometimes used
+    num_decoder_layers: Annotated[int, Field(ge=0)] | None = None  # Transformer models
+    decoder_layers: Annotated[int, Field(ge=0)] | None = None  # Some architectures
+    hidden_size: Annotated[int, Field(ge=0)] | None = None
+
+    @property
+    def layer_count(self) -> int:
+        # Check common field names for layer count
+        layer_fields = [
+            self.num_hidden_layers,
+            self.num_layers,
+            self.n_layer,
+            self.n_layers,
+            self.num_decoder_layers,
+            self.decoder_layers,
+        ]
+
+        for layer_count in layer_fields:
+            if layer_count is not None:
+                return layer_count
+
+        raise ValueError(
+            f"No layer count found in config.json: {self.model_dump_json()}"
+        )
+
+
+async def get_config_data(model_id: ModelId) -> ConfigData:
+    """Downloads and parses config.json for a model."""
+    target_dir = (await ensure_models_dir()) / model_id.normalize()
+    await aios.makedirs(target_dir, exist_ok=True)
+    config_path = await download_file_with_retry(
+        str(model_id),
+        "main",
+        "config.json",
+        target_dir,
+        lambda curr_bytes, total_bytes, is_renamed: logger.info(
+            f"Downloading config.json for {model_id}: {curr_bytes}/{total_bytes} ({is_renamed=})"
+        ),
+    )
+    async with aiofiles.open(config_path, "r") as f:
+        return ConfigData.model_validate_json(await f.read())
+
+
+async def get_safetensors_size(model_id: ModelId) -> Memory:
+    """Gets model size from safetensors index or falls back to HF API."""
+    target_dir = (await ensure_models_dir()) / model_id.normalize()
+    await aios.makedirs(target_dir, exist_ok=True)
+    index_path = await download_file_with_retry(
+        str(model_id),
+        "main",
+        "model.safetensors.index.json",
+        target_dir,
+        lambda curr_bytes, total_bytes, is_renamed: logger.info(
+            f"Downloading model.safetensors.index.json for {model_id}: {curr_bytes}/{total_bytes} ({is_renamed=})"
+        ),
+    )
+    async with aiofiles.open(index_path, "r") as f:
+        index_data = ModelSafetensorsIndex.model_validate_json(await f.read())
+
+    metadata = index_data.metadata
+    if metadata is not None:
+        return Memory.from_bytes(metadata.total_size)
+
+    info = model_info(model_id)
+    if info.safetensors is None:
+        raise ValueError(f"No safetensors info found for {model_id}")
+    return Memory.from_bytes(info.safetensors.total)
--- a/src/exo/shared/models/model_meta.py
+++ b/src/exo/shared/models/model_meta.py
@@ -6,7 +6,7 @@ from huggingface_hub import model_info
 from loguru import logger
 from pydantic import BaseModel, Field

-from exo.shared.models.model_cards import MODEL_CARDS, ModelCard, ModelId
+from exo.shared.models.model_cards import ModelCard, ModelId
 from exo.shared.types.memory import Memory
 from exo.worker.download.download_utils import (
    ModelSafetensorsIndex,
@@ -90,33 +90,23 @@ async def get_safetensors_size(model_id: str) -> Memory:
        raise ValueError(f"No safetensors info found for {model_id}")
    return Memory.from_bytes(info.safetensors.total)

-
 _model_card_cache: dict[str, ModelCard] = {}

-
 async def get_model_card(model_id: str) -> ModelCard:
+    """Fetches storage size and layer count for a Hugging Face model and returns a cached Pydantic ModelCard."""
    if model_id in _model_card_cache:
        return _model_card_cache[model_id]
-    model_card = await _get_model_card(model_id)
-    _model_card_cache[model_id] = model_card
-    return model_card
-
-
-async def _get_model_card(model_id: str) -> ModelCard:
-    """Fetches storage size and number of layers for a Hugging Face model, returns Pydantic ModelMeta."""
    config_data = await get_config_data(model_id)
    num_layers = config_data.layer_count
    mem_size_bytes = await get_safetensors_size(model_id)
-    model_card = next(
-        (card for card in MODEL_CARDS.values() if card.model_id == ModelId(model_id)),
-        None,
-    )

-    return ModelCard(
+    mc = ModelCard(
        model_id=ModelId(model_id),
        storage_size=mem_size_bytes,
        n_layers=num_layers,
        hidden_size=config_data.hidden_size or 0,
        # TODO: all custom models currently do not support tensor. We could add a dynamic test for this?
-        supports_tensor=model_card.supports_tensor if model_card is not None else False,
+        supports_tensor=False,
    )
+    _model_card_cache[model_id] = mc
+    return mc
--- a/src/exo/utils/dashboard_path.py
+++ b/src/exo/utils/dashboard_path.py
@@ -1,45 +1,72 @@
+import enum
 import os
 import sys
-from pathlib import Path
 from typing import cast

+from anyio import Path

-def find_dashboard() -> Path:
-    dashboard = (
-        _find_dashboard_in_env()
-        or _find_dashboard_in_repo()
-        or _find_dashboard_in_bundle()
+
+class RuntimeResources(enum.Enum):
+    Dashboard = enum.auto()
+    Resources = enum.auto()
+
+_dir_cache: dict[RuntimeResources, Path] = {}
+
+async def find_directory(rr: RuntimeResources) -> Path:
+    dir = (
+        _dir_cache.get(rr, None)
+        or await _find_in_env(rr)
+        or await _find_in_repo(rr)
+        or await _find_in_bundle(rr)
    )
-    if not dashboard:
+    if not dir:
        raise FileNotFoundError(
-            "Unable to locate dashboard assets - make sure the dashboard has been built, or export DASHBOARD_DIR if you've built the dashboard elsewhere."
+            "Unable to locate directory - make sure the dashboard has been built and the runtime resources (model cards) exist."
        )
-    return dashboard
+    _dir_cache[rr] = dir
+    return dir


-def _find_dashboard_in_env() -> Path | None:
-    env = os.environ.get("DASHBOARD_DIR")
+async def _find_in_env(rr: RuntimeResources) -> Path | None:
+    match rr:
+        case RuntimeResources.Dashboard:
+            env = os.environ.get("DASHBOARD_DIR")
+        case RuntimeResources.Resources:
+            env = os.environ.get("RESOURCES_DIR")
    if not env:
        return None
-    resolved_env = Path(env).expanduser().resolve()
+    resolved_env = await (await Path(env).expanduser()).resolve()

    return resolved_env


-def _find_dashboard_in_repo() -> Path | None:
-    current_module = Path(__file__).resolve()
+async def _find_in_repo(rr: RuntimeResources) -> Path | None:
+    current_module = await Path(__file__).resolve()
    for parent in current_module.parents:
-        build = parent / "dashboard" / "build"
-        if build.is_dir() and (build / "index.html").exists():
-            return build
+        match rr:
+            case RuntimeResources.Dashboard:
+                build = parent / "dashboard" / "build"
+                if await build.is_dir() and await (build / "index.html").exists():
+                    return build
+            case RuntimeResources.Resources:
+                res = parent / "resources"
+                if await res.is_dir():
+                    return res
    return None


-def _find_dashboard_in_bundle() -> Path | None:
+async def _find_in_bundle(rr: RuntimeResources) -> Path | None:
    frozen_root = cast(str | None, getattr(sys, "_MEIPASS", None))
    if frozen_root is None:
        return None
-    candidate = Path(frozen_root) / "dashboard"
-    if candidate.is_dir():
-        return candidate
+
+    match rr:
+        case RuntimeResources.Dashboard:
+            candidate = Path(frozen_root) / "dashboard"
+            if await candidate.is_dir():
+                return candidate
+        case RuntimeResources.Resources:
+            candidate = Path(frozen_root) / "resources"
+            if await candidate.is_dir():
+                return candidate
    return None
--- a/src/exo/worker/download/download_utils.py
+++ b/src/exo/worker/download/download_utils.py
@@ -477,6 +477,53 @@ async def get_downloaded_size(path: Path) -> int:
    return 0


+async def download_progress_for_local_path(
+    repo_id: str, shard: ShardMetadata, local_path: Path
+) -> RepoDownloadProgress:
+    file_progress: dict[str, RepoFileDownloadProgress] = {}
+    total_files = 0
+    total_bytes = 0
+
+    if await aios.path.isdir(local_path):
+        for root, _, files in os.walk(local_path):
+            for f in files:
+                if f.endswith((".safetensors", ".bin", ".pt", ".gguf", ".json")):
+                    file_path = Path(root) / f
+                    size = (await aios.stat(file_path)).st_size
+                    rel_path = str(file_path.relative_to(local_path))
+                    file_progress[rel_path] = RepoFileDownloadProgress(
+                        repo_id=repo_id,
+                        repo_revision="local",
+                        file_path=rel_path,
+                        downloaded=Memory.from_bytes(size),
+                        downloaded_this_session=Memory.from_bytes(0),
+                        total=Memory.from_bytes(size),
+                        speed=0,
+                        eta=timedelta(0),
+                        status="complete",
+                        start_time=time.time(),
+                    )
+                    total_files += 1
+                    total_bytes += size
+    else:
+        raise ValueError(f"Local path {local_path} is not a directory")
+
+    return RepoDownloadProgress(
+        repo_id=repo_id,
+        repo_revision="local",
+        shard=shard,
+        completed_files=total_files,
+        total_files=total_files,
+        downloaded_bytes=Memory.from_bytes(total_bytes),
+        downloaded_bytes_this_session=Memory.from_bytes(0),
+        total_bytes=Memory.from_bytes(total_bytes),
+        overall_speed=0,
+        overall_eta=timedelta(0),
+        status="complete",
+        file_progress=file_progress,
+    )
+
+
 async def download_shard(
    shard: ShardMetadata,
    on_progress: Callable[[ShardMetadata, RepoDownloadProgress], Awaitable[None]],
@@ -487,6 +534,14 @@ async def download_shard(
    if not skip_download:
        logger.info(f"Downloading {shard.model_card.model_id=}")

+    # Handle local paths
+    if await aios.path.exists(str(shard.model_card.model_id)):
+        logger.info(f"Using local model path {shard.model_card.model_id}")
+        local_path = Path(str(shard.model_card.model_id))
+        return local_path, await download_progress_for_local_path(
+            str(shard.model_card.model_id), shard, local_path
+        )
+
    revision = "main"
    target_dir = await ensure_models_dir() / str(shard.model_card.model_id).replace(
        "/", "--"
@@ -497,8 +552,7 @@ async def download_shard(
    if not allow_patterns:
        allow_patterns = await resolve_allow_patterns(shard)

-    if not skip_download:
-        logger.info(f"Downloading {shard.model_card.model_id=} with {allow_patterns=}")
+    logger.info(f"Downloading {shard.model_card.model_id=} with {allow_patterns=}")

    all_start_time = time.time()
    # TODO: currently not recursive. Some models might require subdirectories - thus this will need to be changed.
--- a/src/exo/worker/download/impl_shard_downloader.py
+++ b/src/exo/worker/download/impl_shard_downloader.py
@@ -3,8 +3,7 @@ from collections.abc import Awaitable
 from pathlib import Path
 from typing import AsyncIterator, Callable

-from exo.shared.models.model_cards import MODEL_CARDS
-from exo.shared.models.model_meta import get_model_card
+from exo.shared.models.model_cards import ModelCard, get_model_cards
 from exo.shared.types.worker.shards import (
    PipelineShardMetadata,
    ShardMetadata,
@@ -20,7 +19,7 @@ def exo_shard_downloader(max_parallel_downloads: int = 8) -> ShardDownloader:


 async def build_base_shard(model_id: str) -> ShardMetadata:
-    model_card = await get_model_card(model_id)
+    model_card = await ModelCard.from_hf(model_id)
    return PipelineShardMetadata(
        model_card=model_card,
        device_rank=0,
@@ -159,7 +158,7 @@ class ResumableShardDownloader(ShardDownloader):
        # Kick off download status coroutines concurrently
        tasks = [
            asyncio.create_task(_status_for_model(model_card.model_id))
-            for model_card in MODEL_CARDS.values()
+            for model_card in await get_model_cards()
        ]

        for task in asyncio.as_completed(tasks):
--- a/src/exo/worker/engines/mlx/auto_parallel.py
+++ b/src/exo/worker/engines/mlx/auto_parallel.py
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
 from collections.abc import Callable
 from functools import partial
 from inspect import signature
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any, Protocol, cast

 import mlx.core as mx
 import mlx.nn as nn
@@ -67,16 +67,27 @@ def eval_with_timeout(
        completed.set()


+class _LayerCallable(Protocol):
+    """Structural type that any compatible layer must satisfy.
+
+    We require a single positional input of type ``mx.array`` and an
+    ``mx.array`` output, while permitting arbitrary *args / **kwargs so this
+    protocol matches the vast majority of `mlx.nn.Module` subclasses.
+    """
+
+    def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array: ...
+
+
 class CustomMlxLayer(nn.Module):
    """Base class for replacing an MLX layer with a custom implementation."""

-    def __init__(self, original_layer: nn.Module):
+    def __init__(self, original_layer: _LayerCallable):
        super().__init__()
        object.__setattr__(self, "_original_layer", original_layer)

    @property
-    def original_layer(self) -> nn.Module:
-        return cast(nn.Module, object.__getattribute__(self, "_original_layer"))
+    def original_layer(self) -> _LayerCallable:
+        return cast(_LayerCallable, object.__getattribute__(self, "_original_layer"))

    # Calls __getattr__ for any attributes not found on nn.Module (e.g. use_sliding)
    if not TYPE_CHECKING:
@@ -89,53 +100,52 @@ class CustomMlxLayer(nn.Module):
                return getattr(original_layer, name)


-def patch_pipeline_first_layer(
-    pipeline_layer: nn.Module, group: mx.distributed.Group
-) -> nn.Module:
-    cls = type(pipeline_layer)
-    orig_call = cast(Callable[..., mx.array], cls.__call__)
+class PipelineFirstLayer(CustomMlxLayer):
+    def __init__(
+        self,
+        original_layer: _LayerCallable,
+        r: int,
+        group: mx.distributed.Group,
+    ):
+        super().__init__(original_layer)
+        self.r: int = r
+        self.group = group

-    rank = group.rank()
-
-    class PatchedFirstLayer(cls):
-        def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array:
-            if rank != 0:
-                x = mx.distributed.recv_like(x, (rank - 1), group=group)
-            return orig_call(self, x, *args, **kwargs)
-
-    pipeline_layer.__class__ = PatchedFirstLayer
-
-    return pipeline_layer
+    def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array:
+        if self.r != 0:
+            x = mx.distributed.recv_like(x, (self.r - 1), group=self.group)
+        return self.original_layer(x, *args, **kwargs)


-def patch_pipeline_last_layer(
-    pipeline_layer: nn.Module, group: mx.distributed.Group
-) -> nn.Module:
-    cls = type(pipeline_layer)
-    orig_call = cast(Callable[..., mx.array], cls.__call__)
-    orig_call_sig = signature(orig_call)
+class PipelineLastLayer(CustomMlxLayer):
+    def __init__(
+        self,
+        original_layer: _LayerCallable,
+        r: int,
+        s: int,
+        group: mx.distributed.Group,
+    ):
+        super().__init__(original_layer)
+        self.r: int = r
+        self.s: int = s
+        self.group = group
+        self.original_layer_signature = signature(self.original_layer.__call__)

-    rank = group.rank()
-    size = group.size()
+    def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array:
+        cache = self.original_layer_signature.bind_partial(
+            x, *args, **kwargs
+        ).arguments.get("cache", None)

-    class PatchedLastLayer(cls):
-        def __call__(self, x: mx.array, *args: object, **kwargs: object) -> mx.array:
-            cache = orig_call_sig.bind_partial(x, *args, **kwargs).arguments.get(
-                "cache", None
+        output: mx.array = self.original_layer(x, *args, **kwargs)
+
+        if self.r != self.s - 1:
+            output = mx.distributed.send(
+                output, (self.r + 1) % self.s, group=self.group
            )
+            if cache is not None:
+                cache.keys = mx.depends(cache.keys, output)  # type: ignore[reportUnknownMemberType]

-            output: mx.array = orig_call(self, x, *args, **kwargs)
-
-            if rank != size - 1:
-                output = mx.distributed.send(output, (rank + 1) % size, group=group)
-                if cache is not None:
-                    cache.keys = mx.depends(cache.keys, output)  # type: ignore[reportUnknownMemberType]
-
-            return output
-
-    pipeline_layer.__class__ = PatchedLastLayer
-
-    return pipeline_layer
+        return output


 def _inner_model(model: nn.Module) -> nn.Module:
@@ -150,13 +160,13 @@ def _inner_model(model: nn.Module) -> nn.Module:
    raise ValueError("Model must either have a 'model' or 'transformer' attribute")


-def _get_layers(inner_model_instance: nn.Module) -> list[nn.Module]:
+def _get_layers(inner_model_instance: nn.Module) -> list[_LayerCallable]:
    # Handle both model.layers and model.h cases
-    layers: list[nn.Module]
+    layers: list[_LayerCallable]
    if hasattr(inner_model_instance, "layers"):
-        layers = cast(list[nn.Module], inner_model_instance.layers)
+        layers = cast(list[_LayerCallable], inner_model_instance.layers)
    elif hasattr(inner_model_instance, "h"):
-        layers = cast(list[nn.Module], inner_model_instance.h)
+        layers = cast(list[_LayerCallable], inner_model_instance.h)
    else:
        raise ValueError("Model must have either a 'layers' or 'h' attribute")

@@ -181,12 +191,15 @@ def pipeline_auto_parallel(
    layers = _get_layers(inner_model_instance)

    start_layer, end_layer = model_shard_meta.start_layer, model_shard_meta.end_layer
+    device_rank, world_size = model_shard_meta.device_rank, model_shard_meta.world_size

    layers = layers[start_layer:end_layer]
-    layers[0] = patch_pipeline_first_layer(layers[0], group)
-    layers[-1] = patch_pipeline_last_layer(
+    layers[0] = PipelineFirstLayer(layers[0], device_rank, group=group)
+    layers[-1] = PipelineLastLayer(
        layers[-1],
-        group,
+        device_rank,
+        world_size,
+        group=group,
    )

    if isinstance(inner_model_instance, GptOssMoeModel):
@@ -433,7 +446,7 @@ class LlamaShardingStrategy(TensorParallelShardingStrategy):
        return model


-def _set_layers(model: nn.Module, layers: list[nn.Module]) -> None:
+def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
    inner_model_instance = _inner_model(model)
    if hasattr(inner_model_instance, "layers"):
        inner_model_instance.layers = layers
@@ -508,17 +521,17 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):


 class ShardedDeepseekV3MoE(CustomMlxLayer):
-    def __init__(self, layer: nn.Module):
+    def __init__(self, layer: _LayerCallable):
        super().__init__(layer)
        self.sharding_group: mx.distributed.Group | None = None

    def __call__(self, x: mx.array) -> mx.array:
        if self.sharding_group is not None:
            x = sum_gradients(self.sharding_group)(x)
-        y = self.original_layer.__call__(x)  # type: ignore
+        y = self.original_layer.__call__(x)
        if self.sharding_group is not None:
-            y = mx.distributed.all_sum(y, group=self.sharding_group)  # type: ignore
-        return y  # type: ignore
+            y = mx.distributed.all_sum(y, group=self.sharding_group)
+        return y


 class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
@@ -552,7 +565,7 @@ class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
            self.all_to_sharded_linear_in_place(
                layer.block_sparse_moe.switch_mlp.up_proj
            )
-            layer.block_sparse_moe = ShardedQwenMoE(layer.block_sparse_moe)  # pyright: ignore[reportAttributeAccessIssue]
+            layer.block_sparse_moe = ShardedQwenMoE(layer.block_sparse_moe)  # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
            layer.block_sparse_moe.sharding_group = self.group  # pyright: ignore[reportAttributeAccessIssue]

        return model
@@ -586,7 +599,7 @@ class QwenShardingStrategy(TensorParallelShardingStrategy):
                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
                self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
-                layer.mlp = ShardedQwenMoE(layer.mlp)  # pyright: ignore[reportAttributeAccessIssue]
+                layer.mlp = ShardedQwenMoE(layer.mlp)  # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
                layer.mlp.sharding_group = self.group

            # Shard the MLP
@@ -599,17 +612,17 @@ class QwenShardingStrategy(TensorParallelShardingStrategy):


 class ShardedQwenMoE(CustomMlxLayer):
-    def __init__(self, layer: nn.Module):
+    def __init__(self, layer: _LayerCallable):
        super().__init__(layer)
        self.sharding_group: mx.distributed.Group | None = None

    def __call__(self, x: mx.array) -> mx.array:
        if self.sharding_group is not None:
            x = sum_gradients(self.sharding_group)(x)
-        y = self.original_layer.__call__(x)  # type: ignore
+        y = self.original_layer.__call__(x)
        if self.sharding_group is not None:
-            y = mx.distributed.all_sum(y, group=self.sharding_group)  # type: ignore
-        return y  # type: ignore
+            y = mx.distributed.all_sum(y, group=self.sharding_group)
+        return y


 class GptOssShardingStrategy(TensorParallelShardingStrategy):
@@ -661,7 +674,7 @@ class ShardedGptOssMoE(CustomMlxLayer):
    def __call__(self, x: mx.array) -> mx.array:
        if self.sharding_group is not None:
            x = sum_gradients(self.sharding_group)(x)
-        y = self.original_layer(x)  # type: ignore
+        y = self.original_layer(x)
        if self.sharding_group is not None:
-            y = mx.distributed.all_sum(y, group=self.sharding_group)  # type: ignore
-        return y  # type: ignore
+            y = mx.distributed.all_sum(y, group=self.sharding_group)
+        return y
--- a/src/exo/worker/main.py
+++ b/src/exo/worker/main.py
@@ -413,6 +413,11 @@ class Worker:
            )
            for nid in conns:
                for ip in conns[nid]:
+                    if "127.0.0.1" in ip or "localhost" in ip:
+                        logger.warning(
+                            f"Loopback connection should not happen: {ip=} for {nid=}"
+                        )
+
                    edge = SocketConnection(
                        # nonsense multiaddr
                        sink_multiaddr=Multiaddr(address=f"/ip4/{ip}/tcp/52415")
@@ -433,9 +438,6 @@ class Worker:
            for conn in self.state.topology.out_edges(self.node_id):
                if not isinstance(conn.edge, SocketConnection):
                    continue
-                # ignore mDNS discovered connections
-                if conn.edge.sink_multiaddr.port != 52415:
-                    continue
                if (
                    conn.sink not in conns
                    or conn.edge.sink_multiaddr.ip_address
@@ -449,7 +451,7 @@ class Worker:
    async def _emit_existing_download_progress(self) -> None:
        try:
            while True:
-                logger.debug("Fetching and emitting existing download progress...")
+                logger.info("Fetching and emitting existing download progress...")
                async for (
                    _,
                    progress,
@@ -480,7 +482,7 @@ class Worker:
                    await self.event_sender.send(
                        NodeDownloadProgress(download_progress=status)
                    )
-                logger.debug("Done emitting existing download progress.")
+                logger.info("Done emitting existing download progress.")
                await anyio.sleep(5 * 60)  # 5 minutes
        except Exception as e:
            logger.error(f"Error emitting existing download progress: {e}")
--- a/src/exo/worker/tests/unittests/test_mlx/conftest.py
+++ b/src/exo/worker/tests/unittests/test_mlx/conftest.py
@@ -18,7 +18,7 @@ from exo.shared.types.tasks import ChatCompletionTaskParams
 from exo.shared.types.worker.shards import PipelineShardMetadata, TensorShardMetadata
 from exo.worker.engines.mlx import Model
 from exo.worker.engines.mlx.generator.generate import mlx_generate
-from exo.worker.engines.mlx.utils_mlx import shard_and_load, apply_chat_template
+from exo.worker.engines.mlx.utils_mlx import shard_and_load


 class MockLayer(nn.Module):
@@ -116,11 +116,12 @@ def run_gpt_oss_pipeline_device(
            messages=[ChatCompletionMessage(role="user", content=prompt_text)],
            max_tokens=max_tokens,
        )
-        prompt = apply_chat_template(tokenizer, task)

        generated_text = ""
        for response in mlx_generate(
-            model=model, tokenizer=tokenizer, task=task, prompt=prompt
+            model=model,
+            tokenizer=tokenizer,
+            task=task,
        ):
            generated_text += response.text
            if response.finish_reason is not None:
@@ -182,11 +183,11 @@ def run_gpt_oss_tensor_parallel_device(
            max_tokens=max_tokens,
        )

-        prompt = apply_chat_template(tokenizer, task)
-
        generated_text = ""
        for response in mlx_generate(
-            model=model, tokenizer=tokenizer, task=task, prompt=prompt
+            model=model,
+            tokenizer=tokenizer,
+            task=task,
        ):
            generated_text += response.text
            if response.finish_reason is not None:
--- a/src/exo/worker/tests/unittests/test_mlx/test_auto_parallel.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_auto_parallel.py
@@ -10,8 +10,8 @@ import pytest

 from exo.worker.engines.mlx.auto_parallel import (
    CustomMlxLayer,
-    patch_pipeline_first_layer,
-    patch_pipeline_last_layer,
+    PipelineFirstLayer,
+    PipelineLastLayer,
    patch_pipeline_model,
 )
 from exo.worker.tests.unittests.test_mlx.conftest import MockLayer
@@ -50,8 +50,8 @@ def run_pipeline_device(
        group = mx.distributed.init(backend="ring", strict=True)

        mock = MockLayerInner()
-        first = patch_pipeline_first_layer(mock, group)
-        composed = patch_pipeline_last_layer(first, group)
+        first = PipelineFirstLayer(mock, r=rank, group=group)
+        composed = PipelineLastLayer(first, r=rank, s=world_size, group=group)

        # Wrap in a mock model, then wrap in PipelineParallelModel for all_gather
        inner_model = MockModel([composed])
@@ -78,8 +78,8 @@ def test_composed_wrappers_delegate_attributes() -> None:
    mock = MockLayer()
    group = mx.distributed.init()

-    first = patch_pipeline_first_layer(mock, group)
-    composed = patch_pipeline_last_layer(first, group)
+    first = PipelineFirstLayer(mock, r=0, group=group)
+    composed = PipelineLastLayer(first, r=0, s=1, group=group)

    assert composed.custom_attr == "test_value"  # type: ignore[attr-defined]
    assert composed.use_sliding is True  # type: ignore[attr-defined]
--- a/src/exo/worker/tests/unittests/test_mlx/test_distributed_fix.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_distributed_fix.py
@@ -18,7 +18,6 @@ def _check_model_exists() -> bool:


 pytestmark = [
-    pytest.mark.slow,
    pytest.mark.skipif(
        not _check_model_exists(),
        reason=f"GPT-OSS model not found at {DEFAULT_GPT_OSS_CONFIG.model_path}",
--- a/src/exo/worker/tests/unittests/test_mlx/test_tokenizers.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_tokenizers.py
@@ -89,8 +89,6 @@ def get_test_models() -> list[tuple[str, ModelCard]]:

 TEST_MODELS: list[tuple[str, ModelCard]] = get_test_models()

-pytestmark = pytest.mark.slow
-

@pytest.fixture(scope="module")
 def event_loop():
--- a/tests/headless_runner.py
+++ b/tests/headless_runner.py
@@ -12,7 +12,7 @@ from loguru import logger
 from pydantic import BaseModel

 from exo.shared.logging import InterceptLogger, logger_setup
-from exo.shared.models.model_cards import MODEL_CARDS, ModelId
+from exo.shared.models.model_cards import ModelId
 from exo.shared.types.api import ChatCompletionMessage, ChatCompletionTaskParams
 from exo.shared.types.commands import CommandId
 from exo.shared.types.common import Host, NodeId
@@ -89,22 +89,22 @@ async def tb_detection():

 async def assert_downloads():
    sd = exo_shard_downloader()
-    # await sd.ensure_shard(await build_full_shard(MODEL_CARDS["qwen3-0.6b"].model_id))
+    # await sd.ensure_shard(await build_full_shard(ModelId("mlx-community/Qwen3-0.6B-8bit")))
    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["llama-3.1-8b-bf16"].model_id)
+        await build_full_shard(ModelId("mlx-community/Llama-3.1-8b-bf16"))
    )
-    await sd.ensure_shard(await build_full_shard(MODEL_CARDS["qwen3-30b"].model_id))
+    await sd.ensure_shard(await build_full_shard(ModelId("mlx-community/Qwen3-30b-A3B")))
    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["gpt-oss-120b-MXFP4-Q8"].model_id)
+        await build_full_shard(ModelId("mlx-community/gpt-oss-120b-MXFP4-Q8"))
    )
    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["gpt-oss-20b-4bit"].model_id)
+        await build_full_shard(ModelId("mlx-community/gpt-oss-20b-4bit"))
    )
    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["glm-4.7-8bit-gs32"].model_id)
+        await build_full_shard(ModelId("mlx-community/GLM-4.7-8bit-gs32"))
    )
    await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["minimax-m2.1-8bit"].model_id)
+        await build_full_shard(ModelId("mlx-community/MiniMax-M2.1-8bit"))
    )
Author	SHA1	Message	Date
Evan	92b24196c3	wrrg	2026-01-20 11:14:20 +00:00
Evan	3bf7770988	add model cards	2026-01-20 10:56:29 +00:00
Evan	8392463a70	introduce resources folder	2026-01-20 10:56:29 +00:00
Evan	9c1f6224b0	Merge branch 'main' into simplify-model-cards	2026-01-20 10:56:29 +00:00
Evan	f370dbd1e0	Merge branch 'main' into simplify-model-cards merge fix	2026-01-20 10:56:17 +00:00
rltakashige	6a38f9efba	Merge branch 'main' into simplify-model-cards	2026-01-19 17:43:59 +00:00
Evan	0475de6431	wuff	2026-01-19 17:07:03 +00:00