wrrg

add model cards
introduce resources folder
2026-01-21 12:30:22 -05:00 · 2026-01-20 11:14:20 +00:00 · 2026-01-20 10:56:29 +00:00 · 2026-01-20 10:56:29 +00:00 · 2026-01-20 10:56:29 +00:00 · 2026-01-20 10:56:17 +00:00
47 changed files with 739 additions and 355 deletions
--- a/packaging/pyinstaller/exo.spec
+++ b/packaging/pyinstaller/exo.spec
@@ -10,6 +10,7 @@ PROJECT_ROOT = Path.cwd()
 SOURCE_ROOT = PROJECT_ROOT / "src"
 ENTRYPOINT = SOURCE_ROOT / "exo" / "__main__.py"
 DASHBOARD_DIR = PROJECT_ROOT / "dashboard" / "build"
+RESOURCES_DIR = PROJECT_ROOT / "resources"
 EXO_SHARED_MODELS_DIR = SOURCE_ROOT / "exo" / "shared" / "models"

 if not ENTRYPOINT.is_file():
@@ -18,6 +19,9 @@ if not ENTRYPOINT.is_file():
 if not DASHBOARD_DIR.is_dir():
    raise SystemExit(f"Dashboard assets are missing: {DASHBOARD_DIR}")

+if not RESOURCES_DIR.is_dir():
+    raise SystemExit(f"Resources are missing: {RESOURCES_DIR}")
+
 if not EXO_SHARED_MODELS_DIR.is_dir():
    raise SystemExit(f"Shared model assets are missing: {EXO_SHARED_MODELS_DIR}")

@@ -58,6 +62,7 @@ HIDDEN_IMPORTS = sorted(

 DATAS: list[tuple[str, str]] = [
    (str(DASHBOARD_DIR), "dashboard"),
+    (str(RESOURCES_DIR), "resources"),
    (str(MLX_LIB_DIR), "mlx/lib"),
    (str(EXO_SHARED_MODELS_DIR), "exo/shared/models"),
 ]
--- a/resources/mlx-community--DeepSeek-V3.1-4bit.toml
+++ b/resources/mlx-community--DeepSeek-V3.1-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/DeepSeek-V3.1-4bit"
+n_layers = 61
+hidden_size = 7168
+supports_tensor = true
+
+[storage_size]
+in_bytes = 405874409472
--- a/resources/mlx-community--DeepSeek-V3.1-8bit.toml
+++ b/resources/mlx-community--DeepSeek-V3.1-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/DeepSeek-V3.1-8bit"
+n_layers = 61
+hidden_size = 7168
+supports_tensor = true
+
+[storage_size]
+in_bytes = 765577920512
--- a/resources/mlx-community--GLM-4.5-Air-8bit.toml
+++ b/resources/mlx-community--GLM-4.5-Air-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.5-Air-8bit"
+n_layers = 46
+hidden_size = 4096
+supports_tensor = false
+
+[storage_size]
+in_bytes = 122406567936
--- a/resources/mlx-community--GLM-4.5-Air-bf16.toml
+++ b/resources/mlx-community--GLM-4.5-Air-bf16.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.5-Air-bf16"
+n_layers = 46
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 229780750336
--- a/resources/mlx-community--GLM-4.7-4bit.toml
+++ b/resources/mlx-community--GLM-4.7-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.7-4bit"
+n_layers = 91
+hidden_size = 5120
+supports_tensor = true
+
+[storage_size]
+in_bytes = 198556925568
--- a/resources/mlx-community--GLM-4.7-6bit.toml
+++ b/resources/mlx-community--GLM-4.7-6bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.7-6bit"
+n_layers = 91
+hidden_size = 5120
+supports_tensor = true
+
+[storage_size]
+in_bytes = 286737579648
--- a/resources/mlx-community--GLM-4.7-8bit-gs32.toml
+++ b/resources/mlx-community--GLM-4.7-8bit-gs32.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/GLM-4.7-8bit-gs32"
+n_layers = 91
+hidden_size = 5120
+supports_tensor = true
+
+[storage_size]
+in_bytes = 396963397248
--- a/resources/mlx-community--Kimi-K2-Instruct-4bit.toml
+++ b/resources/mlx-community--Kimi-K2-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Kimi-K2-Instruct-4bit"
+n_layers = 61
+hidden_size = 7168
+supports_tensor = true
+
+[storage_size]
+in_bytes = 620622774272
--- a/resources/mlx-community--Kimi-K2-Thinking.toml
+++ b/resources/mlx-community--Kimi-K2-Thinking.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Kimi-K2-Thinking"
+n_layers = 61
+hidden_size = 7168
+supports_tensor = true
+
+[storage_size]
+in_bytes = 706522120192
--- a/resources/mlx-community--Llama-3.2-1B-Instruct-4bit.toml
+++ b/resources/mlx-community--Llama-3.2-1B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
+n_layers = 16
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 729808896
--- a/resources/mlx-community--Llama-3.2-3B-Instruct-4bit.toml
+++ b/resources/mlx-community--Llama-3.2-3B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
+n_layers = 28
+hidden_size = 3072
+supports_tensor = true
+
+[storage_size]
+in_bytes = 1863319552
--- a/resources/mlx-community--Llama-3.2-3B-Instruct-8bit.toml
+++ b/resources/mlx-community--Llama-3.2-3B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
+n_layers = 28
+hidden_size = 3072
+supports_tensor = true
+
+[storage_size]
+in_bytes = 3501195264
--- a/resources/mlx-community--Llama-3.3-70B-Instruct-4bit.toml
+++ b/resources/mlx-community--Llama-3.3-70B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
+n_layers = 80
+hidden_size = 8192
+supports_tensor = true
+
+[storage_size]
+in_bytes = 40652242944
--- a/resources/mlx-community--Llama-3.3-70B-Instruct-8bit.toml
+++ b/resources/mlx-community--Llama-3.3-70B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
+n_layers = 80
+hidden_size = 8192
+supports_tensor = true
+
+[storage_size]
+in_bytes = 76799803392
--- a/resources/mlx-community--Meta-Llama-3.1-70B-Instruct-4bit.toml
+++ b/resources/mlx-community--Meta-Llama-3.1-70B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
+n_layers = 80
+hidden_size = 8192
+supports_tensor = true
+
+[storage_size]
+in_bytes = 40652242944
--- a/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-4bit.toml
+++ b/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
+n_layers = 32
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 4637851648
--- a/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-8bit.toml
+++ b/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
+n_layers = 32
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 8954839040
--- a/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-bf16.toml
+++ b/resources/mlx-community--Meta-Llama-3.1-8B-Instruct-bf16.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
+n_layers = 32
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 16882073600
--- a/resources/mlx-community--MiniMax-M2.1-3bit.toml
+++ b/resources/mlx-community--MiniMax-M2.1-3bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/MiniMax-M2.1-3bit"
+n_layers = 61
+hidden_size = 3072
+supports_tensor = true
+
+[storage_size]
+in_bytes = 100086644736
--- a/resources/mlx-community--MiniMax-M2.1-8bit.toml
+++ b/resources/mlx-community--MiniMax-M2.1-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/MiniMax-M2.1-8bit"
+n_layers = 61
+hidden_size = 3072
+supports_tensor = true
+
+[storage_size]
+in_bytes = 242986745856
--- a/resources/mlx-community--Qwen3-0.6B-4bit.toml
+++ b/resources/mlx-community--Qwen3-0.6B-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-0.6B-4bit"
+n_layers = 28
+hidden_size = 1024
+supports_tensor = false
+
+[storage_size]
+in_bytes = 342884352
--- a/resources/mlx-community--Qwen3-0.6B-8bit.toml
+++ b/resources/mlx-community--Qwen3-0.6B-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-0.6B-8bit"
+n_layers = 28
+hidden_size = 1024
+supports_tensor = false
+
+[storage_size]
+in_bytes = 698351616
--- a/resources/mlx-community--Qwen3-235B-A22B-Instruct-2507-4bit.toml
+++ b/resources/mlx-community--Qwen3-235B-A22B-Instruct-2507-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
+n_layers = 94
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 141733920768
--- a/resources/mlx-community--Qwen3-235B-A22B-Instruct-2507-8bit.toml
+++ b/resources/mlx-community--Qwen3-235B-A22B-Instruct-2507-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
+n_layers = 94
+hidden_size = 4096
+supports_tensor = true
+
+[storage_size]
+in_bytes = 268435456000
--- a/resources/mlx-community--Qwen3-30B-A3B-4bit.toml
+++ b/resources/mlx-community--Qwen3-30B-A3B-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-30B-A3B-4bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 17612931072
--- a/resources/mlx-community--Qwen3-30B-A3B-8bit.toml
+++ b/resources/mlx-community--Qwen3-30B-A3B-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-30B-A3B-8bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 33279705088
--- a/resources/mlx-community--Qwen3-Coder-480B-A35B-Instruct-4bit.toml
+++ b/resources/mlx-community--Qwen3-Coder-480B-A35B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
+n_layers = 62
+hidden_size = 6144
+supports_tensor = true
+
+[storage_size]
+in_bytes = 289910292480
--- a/resources/mlx-community--Qwen3-Coder-480B-A35B-Instruct-8bit.toml
+++ b/resources/mlx-community--Qwen3-Coder-480B-A35B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
+n_layers = 62
+hidden_size = 6144
+supports_tensor = true
+
+[storage_size]
+in_bytes = 579820584960
--- a/resources/mlx-community--Qwen3-Next-80B-A3B-Instruct-4bit.toml
+++ b/resources/mlx-community--Qwen3-Next-80B-A3B-Instruct-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 46976204800
--- a/resources/mlx-community--Qwen3-Next-80B-A3B-Instruct-8bit.toml
+++ b/resources/mlx-community--Qwen3-Next-80B-A3B-Instruct-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 88814387200
--- a/resources/mlx-community--Qwen3-Next-80B-A3B-Thinking-4bit.toml
+++ b/resources/mlx-community--Qwen3-Next-80B-A3B-Thinking-4bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 88814387200
--- a/resources/mlx-community--Qwen3-Next-80B-A3B-Thinking-8bit.toml
+++ b/resources/mlx-community--Qwen3-Next-80B-A3B-Thinking-8bit.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
+n_layers = 48
+hidden_size = 2048
+supports_tensor = true
+
+[storage_size]
+in_bytes = 88814387200
--- a/resources/mlx-community--gpt-oss-120b-MXFP4-Q8.toml
+++ b/resources/mlx-community--gpt-oss-120b-MXFP4-Q8.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
+n_layers = 36
+hidden_size = 2880
+supports_tensor = true
+
+[storage_size]
+in_bytes = 70652212224
--- a/resources/mlx-community--gpt-oss-20b-MXFP4-Q8.toml
+++ b/resources/mlx-community--gpt-oss-20b-MXFP4-Q8.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/gpt-oss-20b-MXFP4-Q8"
+n_layers = 24
+hidden_size = 2880
+supports_tensor = true
+
+[storage_size]
+in_bytes = 12025908224
--- a/resources/mlx-community--llama-3.3-70b-instruct-fp16.toml
+++ b/resources/mlx-community--llama-3.3-70b-instruct-fp16.toml
@@ -0,0 +1,7 @@
+model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
+n_layers = 80
+hidden_size = 8192
+supports_tensor = true
+
+[storage_size]
+in_bytes = 144383672320
--- a/src/exo/master/api.py
+++ b/src/exo/master/api.py
@@ -1,5 +1,6 @@
 import time
 from collections.abc import AsyncGenerator
+from dataclasses import dataclass, field
 from http import HTTPStatus
 from typing import cast

@@ -19,8 +20,7 @@ from exo.master.placement import place_instance as get_instance_placements
 from exo.shared.apply import apply
 from exo.shared.election import ElectionMessage
 from exo.shared.logging import InterceptLogger
-from exo.shared.models.model_cards import MODEL_CARDS, ModelCard, ModelId
-from exo.shared.models.model_meta import get_model_card
+from exo.shared.models.model_cards import ModelCard, ModelId, get_model_cards
 from exo.shared.types.api import (
    BenchChatCompletionResponse,
    BenchChatCompletionTaskParams,
@@ -65,7 +65,7 @@ from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
 from exo.shared.types.worker.shards import Sharding
 from exo.utils.banner import print_startup_banner
 from exo.utils.channels import Receiver, Sender, channel
-from exo.utils.dashboard_path import find_dashboard
+from exo.utils.dashboard_path import RuntimeResources, find_directory
 from exo.utils.event_buffer import OrderedBuffer


@@ -86,57 +86,52 @@ def chunk_to_response(
    )


-async def resolve_model_card(model_id: str) -> ModelCard:
-    if model_id in MODEL_CARDS:
-        model_card = MODEL_CARDS[model_id]
-        return model_card
-    else:
-        return await get_model_card(model_id)
-
-
+@dataclass(eq=False)
 class API:
-    def __init__(
-        self,
+    node_id: NodeId
+    session_id: SessionId
+    port: int
+    app: FastAPI
+    global_event_receiver: Receiver[ForwarderEvent]
+    command_sender: Sender[ForwarderCommand]
+    election_receiver: Receiver[ElectionMessage]
+    state: State = field(init=False, default_factory=State)  # annotation required: dataclass skips unannotated names
+    _event_log: list[Event] = field(init=False, default_factory=list)
+    event_buffer: OrderedBuffer[Event] = field(init=False, default_factory=OrderedBuffer)
+    _chat_completion_queues: dict[CommandId, Sender[TokenChunk]] = field(init=False, default_factory=dict)
+    _tg: TaskGroup = field(init=False, default_factory=create_task_group)
+    last_completed_election: int = field(init=False, default=0)
+    paused: bool = field(init=False, default=False)
+    paused_ev: anyio.Event = field(init=False, default_factory=anyio.Event)
+
+    @classmethod
+    async def create(
+        cls,
        node_id: NodeId,
        session_id: SessionId,
        *,
        port: int,
-        # Ideally this would be a MasterForwarderEvent but type system says no :(
        global_event_receiver: Receiver[ForwarderEvent],
        command_sender: Sender[ForwarderCommand],
-        # This lets us pause the API if an election is running
        election_receiver: Receiver[ElectionMessage],
-    ) -> None:
+    ) -> "API":
-        self.state = State()
-        self._event_log: list[Event] = []
-        self.command_sender = command_sender
-        self.global_event_receiver = global_event_receiver
-        self.election_receiver = election_receiver
-        self.event_buffer: OrderedBuffer[Event] = OrderedBuffer[Event]()
-        self.node_id: NodeId = node_id
-        self.session_id: SessionId = session_id
-        self.last_completed_election: int = 0
-        self.port = port
-
-        self.paused: bool = False
-        self.paused_ev: anyio.Event = anyio.Event()
-
-        self.app = FastAPI()
-        self._setup_exception_handlers()
-        self._setup_cors()
-        self._setup_routes()
-
-        self.app.mount(
+        api = cls(node_id, session_id, port, FastAPI(), global_event_receiver, command_sender, election_receiver)
+        api.app.mount(  # mount after __post_init__ registered the routes, so the "/" catch-all cannot shadow them
            "/",
            StaticFiles(
-                directory=find_dashboard(),
+                directory=await find_directory(RuntimeResources.Dashboard),
                html=True,
            ),
            name="dashboard",
        )

-        self._chat_completion_queues: dict[CommandId, Sender[TokenChunk]] = {}
-        self._tg: TaskGroup | None = None
+        return api  # the factory must hand the constructed instance back to the caller
+
+    def __post_init__(self) -> None:  # dataclass hook: runs right after the generated __init__
+        self._setup_exception_handlers()
+        self._setup_cors()
+        self._setup_routes()
+

    def reset(self, new_session_id: SessionId, result_clock: int):
        logger.info("Resetting API State")
@@ -213,7 +208,7 @@ class API:
        self, payload: CreateInstanceParams
    ) -> CreateInstanceResponse:
        instance = payload.instance
-        model_card = await resolve_model_card(instance.shard_assignments.model_id)
+        model_card = await ModelCard.from_hf(instance.shard_assignments.model_id)
        required_memory = model_card.storage_size
        available_memory = self._calculate_total_available_memory()

@@ -279,7 +274,7 @@ class API:
        if len(list(self.state.topology.list_nodes())) == 0:
            return PlacementPreviewResponse(previews=[])

-        cards = [card for card in MODEL_CARDS.values() if card.model_id == model_id]
+        cards = [card for card in await get_model_cards() if card.short_id == model_id]
        if not cards:
            raise HTTPException(status_code=404, detail=f"Model {model_id} not found")

@@ -620,7 +615,7 @@ class API:
                    storage_size_megabytes=int(card.storage_size.in_mb),
                    supports_tensor=card.supports_tensor,
                )
-                for card in MODEL_CARDS.values()
+                for card in await get_model_cards()  # the imported helper is async; NOTE(review): assumes this handler is `async def`
            ]
        )

--- a/src/exo/master/tests/test_placement.py
+++ b/src/exo/master/tests/test_placement.py
@@ -276,7 +276,9 @@ def test_placement_selects_leaf_nodes(
    # arrange
    topology = Topology()

-    model_card.storage_size = Memory.from_bytes(1000)
+    # Model requires more than any single node but fits within a 3-node cycle
+    model_card.storage_size.in_bytes = 1500
+    model_card.n_layers = 12

    node_id_a = NodeId()
    node_id_b = NodeId()
--- a/src/exo/shared/models/model_cards.py
+++ b/src/exo/shared/models/model_cards.py
@@ -1,8 +1,24 @@
-from pydantic import PositiveInt
+from typing import Annotated

+import aiofiles
+import aiofiles.os as aios
+import tomlkit
+from anyio import Path, open_file
+from huggingface_hub import model_info
+from loguru import logger
+from pydantic import BaseModel, Field, PositiveInt, ValidationError
+from tomlkit.exceptions import TOMLKitError
+
+from exo.shared.models.model_cards import ModelCard, ModelId
 from exo.shared.types.common import Id
 from exo.shared.types.memory import Memory
+from exo.utils.dashboard_path import RuntimeResources, find_directory
 from exo.utils.pydantic_ext import CamelCaseModel
+from exo.worker.download.download_utils import (
+    ModelSafetensorsIndex,
+    download_file_with_retry,
+    ensure_models_dir,
+)


 class ModelId(Id):
@@ -12,6 +28,7 @@ class ModelId(Id):
    def short(self) -> str:
        return self.split("/")[-1]

+_card_cache: dict[str, "ModelCard"] = {}  # quoted: ModelCard is only defined by the class statement below

 class ModelCard(CamelCaseModel):
    model_id: ModelId
@@ -20,249 +37,67 @@ class ModelCard(CamelCaseModel):
    hidden_size: PositiveInt
    supports_tensor: bool

+    async def save(self, path: Path) -> None:
+        async with await open_file(path, "w") as f:
+            py = self.model_dump()
+            data = tomlkit.dumps(py)  # pyright: ignore[reportUnknownMemberType]
+            await f.write(data)
+
+    async def save_to_default_path(self) -> None:
+        resources = await find_directory(RuntimeResources.Resources)
+        await self.save(resources / f"{self.model_id.normalize()}.toml")  # match the "*.toml" glob; NOTE(review): assumes normalize() has no suffix
+
+    @staticmethod
+    async def load_from_path(path: Path) -> "ModelCard":
+        async with await open_file(path, "r") as f:
+            py = tomlkit.loads(await f.read())
+            return ModelCard.model_validate(py)
+
+    @staticmethod
+    async def load_from_default_path(model_id: ModelId) -> "ModelCard":
+        return await ModelCard.load_from_path(await find_directory(RuntimeResources.Resources) / f"{model_id.normalize()}.toml")
+
+    @staticmethod
+    async def load(model_id: ModelId) -> "ModelCard":
+        try:
+            return await ModelCard.load_from_default_path(model_id)
+        except (ValidationError, TOMLKitError, FileNotFoundError):  # missing or invalid card file: fall back to from_hf
+            return await ModelCard.from_hf(model_id)
+
+
+    @staticmethod
+    async def from_hf(model_id: ModelId) -> "ModelCard":
+        """Build a ModelCard from the model's config.json and safetensors size; results are cached per model_id."""
+        if (mc := _card_cache.get(model_id, None)) is not None:
+            return mc
+        config_data = await get_config_data(model_id)
+        num_layers = config_data.layer_count
+        mem_size_bytes = await get_safetensors_size(model_id)
+
+        mc = ModelCard(
+            model_id=ModelId(model_id),
+            storage_size=mem_size_bytes,
+            n_layers=num_layers,
+            hidden_size=config_data.hidden_size or 0,  # NOTE(review): hidden_size is PositiveInt, so the 0 fallback fails validation
+            # TODO: all custom models currently do not support tensor. We could add a dynamic test for this?
+            supports_tensor=False,
+        )
+        _card_cache[model_id] = mc
+        return mc
+
+# TODO: should we cache this? how do we check for changes
+async def get_model_cards() -> list[ModelCard]:
+    """Load every "*.toml" model card in the resources directory, skipping (and logging) files that fail to parse."""
+    resources = await find_directory(RuntimeResources.Resources)
+    cards: list[ModelCard] = []
+    async for file in resources.glob("*.toml"):
+        try:
+            cards.append(await ModelCard.load_from_path(file))
+        except (TOMLKitError, ValidationError) as err:
+            logger.warning(f"Skipping invalid model card {file}: {err}")
+    return cards

 MODEL_CARDS: dict[str, ModelCard] = {
-    # deepseek v3
-    "deepseek-v3.1-4bit": ModelCard(
-        model_id=ModelId("mlx-community/DeepSeek-V3.1-4bit"),
-        storage_size=Memory.from_gb(378),
-        n_layers=61,
-        hidden_size=7168,
-        supports_tensor=True,
-    ),
-    "deepseek-v3.1-8bit": ModelCard(
-        model_id=ModelId("mlx-community/DeepSeek-V3.1-8bit"),
-        storage_size=Memory.from_gb(713),
-        n_layers=61,
-        hidden_size=7168,
-        supports_tensor=True,
-    ),
-    # kimi k2
-    "kimi-k2-instruct-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Kimi-K2-Instruct-4bit"),
-        storage_size=Memory.from_gb(578),
-        n_layers=61,
-        hidden_size=7168,
-        supports_tensor=True,
-    ),
-    "kimi-k2-thinking": ModelCard(
-        model_id=ModelId("mlx-community/Kimi-K2-Thinking"),
-        storage_size=Memory.from_gb(658),
-        n_layers=61,
-        hidden_size=7168,
-        supports_tensor=True,
-    ),
-    # llama-3.1
-    "llama-3.1-8b": ModelCard(
-        model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"),
-        storage_size=Memory.from_mb(4423),
-        n_layers=32,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "llama-3.1-8b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"),
-        storage_size=Memory.from_mb(8540),
-        n_layers=32,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "llama-3.1-8b-bf16": ModelCard(
-        model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"),
-        storage_size=Memory.from_mb(16100),
-        n_layers=32,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "llama-3.1-70b": ModelCard(
-        model_id=ModelId("mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"),
-        storage_size=Memory.from_mb(38769),
-        n_layers=80,
-        hidden_size=8192,
-        supports_tensor=True,
-    ),
-    # llama-3.2
-    "llama-3.2-1b": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.2-1B-Instruct-4bit"),
-        storage_size=Memory.from_mb(696),
-        n_layers=16,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "llama-3.2-3b": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-4bit"),
-        storage_size=Memory.from_mb(1777),
-        n_layers=28,
-        hidden_size=3072,
-        supports_tensor=True,
-    ),
-    "llama-3.2-3b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-8bit"),
-        storage_size=Memory.from_mb(3339),
-        n_layers=28,
-        hidden_size=3072,
-        supports_tensor=True,
-    ),
-    # llama-3.3
-    "llama-3.3-70b": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-4bit"),
-        storage_size=Memory.from_mb(38769),
-        n_layers=80,
-        hidden_size=8192,
-        supports_tensor=True,
-    ),
-    "llama-3.3-70b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-8bit"),
-        storage_size=Memory.from_mb(73242),
-        n_layers=80,
-        hidden_size=8192,
-        supports_tensor=True,
-    ),
-    "llama-3.3-70b-fp16": ModelCard(
-        model_id=ModelId("mlx-community/llama-3.3-70b-instruct-fp16"),
-        storage_size=Memory.from_mb(137695),
-        n_layers=80,
-        hidden_size=8192,
-        supports_tensor=True,
-    ),
-    # qwen3
-    "qwen3-0.6b": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-0.6B-4bit"),
-        storage_size=Memory.from_mb(327),
-        n_layers=28,
-        hidden_size=1024,
-        supports_tensor=False,
-    ),
-    "qwen3-0.6b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-0.6B-8bit"),
-        storage_size=Memory.from_mb(666),
-        n_layers=28,
-        hidden_size=1024,
-        supports_tensor=False,
-    ),
-    "qwen3-30b": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-30B-A3B-4bit"),
-        storage_size=Memory.from_mb(16797),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-30b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-30B-A3B-8bit"),
-        storage_size=Memory.from_mb(31738),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-80b-a3B-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"),
-        storage_size=Memory.from_mb(44800),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-80b-a3B-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"),
-        storage_size=Memory.from_mb(84700),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-80b-a3B-thinking-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"),
-        storage_size=Memory.from_mb(84700),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-80b-a3B-thinking-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"),
-        storage_size=Memory.from_mb(84700),
-        n_layers=48,
-        hidden_size=2048,
-        supports_tensor=True,
-    ),
-    "qwen3-235b-a22b-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"),
-        storage_size=Memory.from_gb(132),
-        n_layers=94,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "qwen3-235b-a22b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"),
-        storage_size=Memory.from_gb(250),
-        n_layers=94,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    "qwen3-coder-480b-a35b-4bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"),
-        storage_size=Memory.from_gb(270),
-        n_layers=62,
-        hidden_size=6144,
-        supports_tensor=True,
-    ),
-    "qwen3-coder-480b-a35b-8bit": ModelCard(
-        model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"),
-        storage_size=Memory.from_gb(540),
-        n_layers=62,
-        hidden_size=6144,
-        supports_tensor=True,
-    ),
-    # gpt-oss
-    "gpt-oss-120b-MXFP4-Q8": ModelCard(
-        model_id=ModelId("mlx-community/gpt-oss-120b-MXFP4-Q8"),
-        storage_size=Memory.from_kb(68_996_301),
-        n_layers=36,
-        hidden_size=2880,
-        supports_tensor=True,
-    ),
-    "gpt-oss-20b-MXFP4-Q8": ModelCard(
-        model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
-        storage_size=Memory.from_kb(11_744_051),
-        n_layers=24,
-        hidden_size=2880,
-        supports_tensor=True,
-    ),
-    # glm 4.5
-    "glm-4.5-air-8bit": ModelCard(
-        # Needs to be quantized g32 or g16 to work with tensor parallel
-        model_id=ModelId("mlx-community/GLM-4.5-Air-8bit"),
-        storage_size=Memory.from_gb(114),
-        n_layers=46,
-        hidden_size=4096,
-        supports_tensor=False,
-    ),
-    "glm-4.5-air-bf16": ModelCard(
-        model_id=ModelId("mlx-community/GLM-4.5-Air-bf16"),
-        storage_size=Memory.from_gb(214),
-        n_layers=46,
-        hidden_size=4096,
-        supports_tensor=True,
-    ),
-    # glm 4.7
-    "glm-4.7-4bit": ModelCard(
-        model_id=ModelId("mlx-community/GLM-4.7-4bit"),
-        storage_size=Memory.from_bytes(198556925568),
-        n_layers=91,
-        hidden_size=5120,
-        supports_tensor=True,
-    ),
-    "glm-4.7-6bit": ModelCard(
-        model_id=ModelId("mlx-community/GLM-4.7-6bit"),
-        storage_size=Memory.from_bytes(286737579648),
-        n_layers=91,
-        hidden_size=5120,
-        supports_tensor=True,
-    ),
-    "glm-4.7-8bit-gs32": ModelCard(
-        model_id=ModelId("mlx-community/GLM-4.7-8bit-gs32"),
-        storage_size=Memory.from_bytes(396963397248),
-        n_layers=91,
-        hidden_size=5120,
-        supports_tensor=True,
-    ),
    # glm 4.7 flash
    "glm-4.7-flash-4bit": ModelCard(
        model_id=ModelId("mlx-community/GLM-4.7-Flash-4bit"),
@@ -292,19 +127,83 @@ MODEL_CARDS: dict[str, ModelCard] = {
        hidden_size=2048,
        supports_tensor=True,
    ),
-    # minimax-m2
-    "minimax-m2.1-8bit": ModelCard(
-        model_id=ModelId("mlx-community/MiniMax-M2.1-8bit"),
-        storage_size=Memory.from_bytes(242986745856),
-        n_layers=61,
-        hidden_size=3072,
-        supports_tensor=True,
-    ),
-    "minimax-m2.1-3bit": ModelCard(
-        model_id=ModelId("mlx-community/MiniMax-M2.1-3bit"),
-        storage_size=Memory.from_bytes(100086644736),
-        n_layers=61,
-        hidden_size=3072,
-        supports_tensor=True,
-    ),
 }
+
+
+
+
+
+class ConfigData(BaseModel):
+    # Subset of a model's config.json: the layer-count spellings plus hidden_size.
+
+    model_config = {"extra": "ignore"}  # Allow unknown fields
+
+    # Layer-count names used by different architectures, listed in lookup order
+    num_hidden_layers: Annotated[int, Field(ge=0)] | None = None
+    num_layers: Annotated[int, Field(ge=0)] | None = None
+    n_layer: Annotated[int, Field(ge=0)] | None = None
+    n_layers: Annotated[int, Field(ge=0)] | None = None  # Sometimes used
+    num_decoder_layers: Annotated[int, Field(ge=0)] | None = None  # Transformer models
+    decoder_layers: Annotated[int, Field(ge=0)] | None = None  # Some architectures
+    hidden_size: Annotated[int, Field(ge=0)] | None = None
+
+    @property
+    def layer_count(self) -> int:
+        """Return the first populated layer-count field; raise ValueError when none is set."""
+        candidates = (
+            self.num_hidden_layers,
+            self.num_layers,
+            self.n_layer,
+            self.n_layers,
+            self.num_decoder_layers,
+            self.decoder_layers,
+        )
+        found = next((value for value in candidates if value is not None), None)
+        if found is None:
+            raise ValueError(
+                f"No layer count found in config.json: {self.model_dump_json()}"
+            )
+        return found
+
+
+async def get_config_data(model_id: ModelId) -> ConfigData:
+    """Fetch config.json for model_id via download_file_with_retry and validate it as ConfigData."""
+    target_dir = (await ensure_models_dir()) / model_id.normalize()  # per-model subdirectory of the models dir
+    await aios.makedirs(target_dir, exist_ok=True)
+    config_path = await download_file_with_retry(
+        str(model_id),
+        "main",
+        "config.json",
+        target_dir,
+        lambda curr_bytes, total_bytes, is_renamed: logger.info(  # callback that only logs the reported progress
+            f"Downloading config.json for {model_id}: {curr_bytes}/{total_bytes} ({is_renamed=})"
+        ),
+    )
+    async with aiofiles.open(config_path, "r") as f:
+        return ConfigData.model_validate_json(await f.read())
+
+
+async def get_safetensors_size(model_id: ModelId) -> Memory:
+    """Return the model size from the safetensors index metadata, falling back to model_info(); raises ValueError if neither has it."""
+    target_dir = (await ensure_models_dir()) / model_id.normalize()  # per-model subdirectory of the models dir
+    await aios.makedirs(target_dir, exist_ok=True)
+    index_path = await download_file_with_retry(
+        str(model_id),
+        "main",
+        "model.safetensors.index.json",
+        target_dir,
+        lambda curr_bytes, total_bytes, is_renamed: logger.info(  # callback that only logs the reported progress
+            f"Downloading model.safetensors.index.json for {model_id}: {curr_bytes}/{total_bytes} ({is_renamed=})"
+        ),
+    )
+    async with aiofiles.open(index_path, "r") as f:
+        index_data = ModelSafetensorsIndex.model_validate_json(await f.read())
+
+    metadata = index_data.metadata
+    if metadata is not None:  # preferred source: total_size recorded in the index file
+        return Memory.from_bytes(metadata.total_size)
+
+    info = model_info(model_id)  # NOTE(review): called without await inside a coroutine; presumably a blocking request, confirm
+    if info.safetensors is None:
+        raise ValueError(f"No safetensors info found for {model_id}")
+    return Memory.from_bytes(info.safetensors.total)
--- a/src/exo/shared/models/model_meta.py
+++ b/src/exo/shared/models/model_meta.py
@@ -6,7 +6,7 @@ from huggingface_hub import model_info
 from loguru import logger
 from pydantic import BaseModel, Field

-from exo.shared.models.model_cards import MODEL_CARDS, ModelCard, ModelId
+from exo.shared.models.model_cards import ModelCard, ModelId
 from exo.shared.types.memory import Memory
 from exo.worker.download.download_utils import (
    ModelSafetensorsIndex,
@@ -90,33 +90,23 @@ async def get_safetensors_size(model_id: str) -> Memory:
        raise ValueError(f"No safetensors info found for {model_id}")
    return Memory.from_bytes(info.safetensors.total)

-
 _model_card_cache: dict[str, ModelCard] = {}

-
 async def get_model_card(model_id: str) -> ModelCard:
+    """Fetches storage size, layer count and hidden size for a Hugging Face model and returns a ModelCard, cached per model_id."""
     if model_id in _model_card_cache:
         return _model_card_cache[model_id]
-    model_card = await _get_model_card(model_id)
-    _model_card_cache[model_id] = model_card
-    return model_card
-
-
-async def _get_model_card(model_id: str) -> ModelCard:
-    """Fetches storage size and number of layers for a Hugging Face model, returns Pydantic ModelMeta."""
     config_data = await get_config_data(model_id)
     num_layers = config_data.layer_count
     mem_size_bytes = await get_safetensors_size(model_id)
-    model_card = next(
-        (card for card in MODEL_CARDS.values() if card.model_id == ModelId(model_id)),
-        None,
-    )

-    return ModelCard(
+    mc = ModelCard(
         model_id=ModelId(model_id),
         storage_size=mem_size_bytes,
         n_layers=num_layers,
         hidden_size=config_data.hidden_size or 0,
         # TODO: all custom models currently do not support tensor. We could add a dynamic test for this?
-        supports_tensor=model_card.supports_tensor if model_card is not None else False,
+        supports_tensor=False,
     )
+    _model_card_cache[model_id] = mc
+    return mc
--- a/src/exo/utils/dashboard_path.py
+++ b/src/exo/utils/dashboard_path.py
@@ -1,45 +1,72 @@
+import enum
 import os
 import sys
-from pathlib import Path
 from typing import cast

+from anyio import Path

-def find_dashboard() -> Path:
-    dashboard = (
-        _find_dashboard_in_env()
-        or _find_dashboard_in_repo()
-        or _find_dashboard_in_bundle()
+
+class RuntimeResources(enum.Enum):  # selects which runtime asset directory find_directory should locate
+    Dashboard = enum.auto()  # built dashboard assets; auto() must be *called* or both members share one value and alias each other
+    Resources = enum.auto()  # runtime resources folder (model cards)
+
+_dir_cache: dict[RuntimeResources, Path] = {}  # resolved directories per resource kind; a bare annotation would leave the name unbound
+
+async def find_directory(rr: RuntimeResources) -> Path:  # lookup order: cache, env var, repo checkout, frozen bundle; raises FileNotFoundError if none match
+    directory = (
+        _dir_cache.get(rr)
+        or await _find_in_env(rr)
+        or await _find_in_repo(rr)
+        or await _find_in_bundle(rr)
     )
-    if not dashboard:
+    if not directory:
         raise FileNotFoundError(
-            "Unable to locate dashboard assets - make sure the dashboard has been built, or export DASHBOARD_DIR if you've built the dashboard elsewhere."
+            "Unable to locate directory - make sure the dashboard has been built and the runtime resources (model cards) exist."
         )
-    return dashboard
+    _dir_cache[rr] = directory
+    return directory


-def _find_dashboard_in_env() -> Path | None:
-    env = os.environ.get("DASHBOARD_DIR")
+async def _find_in_env(rr: RuntimeResources) -> Path | None:  # resolve the directory from an environment variable override, if set
+    match rr:
+        case RuntimeResources.Dashboard:
+            env = os.environ.get("DASHBOARD_DIR")
+        case RuntimeResources.Resources:
+            env = os.environ.get("RESOURCES_DIR")
     if not env:
         return None
-    resolved_env = Path(env).expanduser().resolve()
+    resolved_env = await (await Path(env).expanduser()).resolve()  # anyio.Path methods are awaitable, hence the nested awaits

     return resolved_env


-def _find_dashboard_in_repo() -> Path | None:
-    current_module = Path(__file__).resolve()
+async def _find_in_repo(rr: RuntimeResources) -> Path | None:  # walk up from this module's location looking for the directory
+    current_module = await Path(__file__).resolve()
     for parent in current_module.parents:
-        build = parent / "dashboard" / "build"
-        if build.is_dir() and (build / "index.html").exists():
-            return build
+        match rr:
+            case RuntimeResources.Dashboard:
+                build = parent / "dashboard" / "build"
+                if await build.is_dir() and await (build / "index.html").exists():  # only accept a build dir that contains index.html
+                    return build
+            case RuntimeResources.Resources:
+                res = parent / "resources"
+                if await res.is_dir():
+                    return res
     return None


-def _find_dashboard_in_bundle() -> Path | None:
+async def _find_in_bundle(rr: RuntimeResources) -> Path | None:  # look inside the frozen bundle root exposed as sys._MEIPASS
     frozen_root = cast(str | None, getattr(sys, "_MEIPASS", None))
     if frozen_root is None:
         return None
-    candidate = Path(frozen_root) / "dashboard"
-    if candidate.is_dir():
-        return candidate
+
+    match rr:
+        case RuntimeResources.Dashboard:
+            candidate = Path(frozen_root) / "dashboard"
+            if await candidate.is_dir():
+                return candidate
+        case RuntimeResources.Resources:
+            candidate = Path(frozen_root) / "resources"
+            if await candidate.is_dir():
+                return candidate
     return None
--- a/src/exo/worker/download/impl_shard_downloader.py
+++ b/src/exo/worker/download/impl_shard_downloader.py
@@ -3,8 +3,7 @@ from collections.abc import Awaitable
 from pathlib import Path
 from typing import AsyncIterator, Callable

-from exo.shared.models.model_cards import MODEL_CARDS
-from exo.shared.models.model_meta import get_model_card
+from exo.shared.models.model_cards import ModelCard, get_model_cards
 from exo.shared.types.worker.shards import (
    PipelineShardMetadata,
    ShardMetadata,
@@ -20,7 +19,7 @@ def exo_shard_downloader(max_parallel_downloads: int = 8) -> ShardDownloader:


 async def build_base_shard(model_id: str) -> ShardMetadata:
-    model_card = await get_model_card(model_id)
+    model_card = await ModelCard.from_hf(model_id)
    return PipelineShardMetadata(
        model_card=model_card,
        device_rank=0,
@@ -159,7 +158,7 @@ class ResumableShardDownloader(ShardDownloader):
        # Kick off download status coroutines concurrently
        tasks = [
            asyncio.create_task(_status_for_model(model_card.model_id))
-            for model_card in MODEL_CARDS.values()
+            for model_card in await get_model_cards()
        ]

        for task in asyncio.as_completed(tasks):
--- a/src/exo/worker/engines/mlx/auto_parallel.py
+++ b/src/exo/worker/engines/mlx/auto_parallel.py
@@ -13,11 +13,17 @@ from mlx.nn.layers.distributed import (
    shard_linear,
    sum_gradients,
 )
+from mlx_lm.models.deepseek_v3 import DeepseekV3MLP
 from mlx_lm.models.deepseek_v3 import Model as DeepseekV3Model
+from mlx_lm.models.deepseek_v32 import DeepseekV32MLP
 from mlx_lm.models.deepseek_v32 import Model as DeepseekV32Model
 from mlx_lm.models.glm4_moe import Model as Glm4MoeModel
 from mlx_lm.models.glm4_moe import MoE
 from mlx_lm.models.gpt_oss import GptOssMoeModel
+from mlx_lm.models.gpt_oss import Model as GptOssModel
+from mlx_lm.models.llama import Model as LlamaModel
+from mlx_lm.models.minimax import Model as MiniMaxModel
+from mlx_lm.models.ministral3 import Model as Ministral3Model
 from mlx_lm.models.qwen3_moe import Model as Qwen3MoeModel
 from mlx_lm.models.qwen3_moe import Qwen3MoeSparseMoeBlock
 from mlx_lm.models.qwen3_next import Model as Qwen3NextModel
@@ -335,7 +341,33 @@ def tensor_auto_parallel(
        except (AttributeError, TypeError, NameError):
            pass

-    if isinstance(model, (Qwen3MoeModel, Glm4MoeModel, Qwen3NextModel)):
+    if isinstance(model, (LlamaModel, Ministral3Model)):
+        logger.warning("shouldn't be hit - upstream sharding exists")
+        tensor_parallel_sharding_strategy = LlamaShardingStrategy(
+            group,
+            all_to_sharded_linear,
+            sharded_to_all_linear,
+            all_to_sharded_linear_in_place,
+            sharded_to_all_linear_in_place,
+        )
+    elif isinstance(model, (DeepseekV3Model, DeepseekV32Model)):
+        logger.warning("shouldn't be hit - upstream sharding exists")
+        tensor_parallel_sharding_strategy = DeepSeekShardingStrategy(
+            group,
+            all_to_sharded_linear,
+            sharded_to_all_linear,
+            all_to_sharded_linear_in_place,
+            sharded_to_all_linear_in_place,
+        )
+    elif isinstance(model, MiniMaxModel):
+        tensor_parallel_sharding_strategy = MiniMaxShardingStrategy(
+            group,
+            all_to_sharded_linear,
+            sharded_to_all_linear,
+            all_to_sharded_linear_in_place,
+            sharded_to_all_linear_in_place,
+        )
+    elif isinstance(model, (Qwen3MoeModel, Glm4MoeModel, Qwen3NextModel)):
        tensor_parallel_sharding_strategy = QwenShardingStrategy(
            group,
            all_to_sharded_linear,
@@ -343,6 +375,15 @@ def tensor_auto_parallel(
            all_to_sharded_linear_in_place,
            sharded_to_all_linear_in_place,
        )
+    elif isinstance(model, GptOssModel):
+        tensor_parallel_sharding_strategy = GptOssShardingStrategy(
+            group,
+            all_to_sharded_linear,
+            sharded_to_all_linear,
+            all_to_sharded_linear_in_place,
+            sharded_to_all_linear_in_place,
+        )
+
    else:
        raise ValueError(f"Unsupported model type: {type(model)}")

@@ -377,6 +418,34 @@ class TensorParallelShardingStrategy(ABC):
    ) -> nn.Module: ...


+class LlamaShardingStrategy(TensorParallelShardingStrategy):  # shards attention and MLP projections of each layer of a Llama-style model
+    def shard_model(
+        self,
+        model: nn.Module,
+        timeout_seconds: float,
+        on_timeout: TimeoutCallback | None,
+    ) -> nn.Module:
+        model = cast(LlamaModel, model)  # typing-only cast; tensor_auto_parallel also routes Ministral3Model here
+        for layer in model.layers:
+            # Force load weights before sharding to avoid FAST_SYNCH deadlock
+            eval_with_timeout(
+                layer.parameters(), timeout_seconds / len(model.layers), on_timeout  # total timeout split evenly across layers
+            )
+            layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
+            layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
+            layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
+            layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
+            layer.self_attn.n_heads //= self.N  # presumably self.N is the sharding group size - confirm in the base class
+            if layer.self_attn.n_kv_heads is not None:
+                layer.self_attn.n_kv_heads //= self.N
+
+            layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
+            layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
+            layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
+
+        return model
+
+
 def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
    inner_model_instance = _inner_model(model)
    if hasattr(inner_model_instance, "layers"):
@@ -403,6 +472,105 @@ def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
        raise ValueError("Model must have either a 'layers' or 'h' attribute")


+class DeepSeekShardingStrategy(TensorParallelShardingStrategy):  # shards attention plus dense-MLP or MoE weights of each DeepSeek layer
+    def shard_model(
+        self,
+        model: nn.Module,
+        timeout_seconds: float,
+        on_timeout: TimeoutCallback | None,
+    ) -> nn.Module:
+        model = cast(DeepseekV3Model, model)  # typing-only cast; tensor_auto_parallel also routes DeepseekV32Model here
+        for layer in model.layers:
+            eval_with_timeout(
+                layer.parameters(), timeout_seconds / len(model.layers), on_timeout  # total timeout split evenly across layers
+            )
+            # Shard the self attention
+            if layer.self_attn.q_lora_rank is None:
+                layer.self_attn.q_proj = self.all_to_sharded_linear(
+                    layer.self_attn.q_proj
+                )
+            else:
+                layer.self_attn.q_b_proj = self.all_to_sharded_linear(
+                    layer.self_attn.q_b_proj
+                )
+            layer.self_attn.kv_b_proj = self.all_to_sharded_linear(
+                layer.self_attn.kv_b_proj
+            )
+            layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
+            layer.self_attn.num_heads //= self.N  # presumably self.N is the sharding group size - confirm in the base class
+
+            # Shard the MLP
+            if isinstance(layer.mlp, (DeepseekV3MLP, DeepseekV32MLP)):
+                layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
+                layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
+                layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
+
+            # Shard the MoE. Shard in place since the MoE should be responsible
+            # for aggregating the results.
+            else:
+                self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.gate_proj)
+                self.sharded_to_all_linear_in_place(layer.mlp.shared_experts.down_proj)
+                self.all_to_sharded_linear_in_place(layer.mlp.shared_experts.up_proj)
+                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
+                self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
+                self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
+                layer.mlp = ShardedDeepseekV3MoE(layer.mlp)  # type: ignore
+                layer.mlp.sharding_group = self.group  # wrapper stays a pass-through until its group is set
+
+        return model
+
+
+class ShardedDeepseekV3MoE(CustomMlxLayer):  # wraps a MoE layer and all_sums its output across the sharding group
+    def __init__(self, layer: _LayerCallable):
+        super().__init__(layer)
+        self.sharding_group: mx.distributed.Group | None = None  # assigned by the sharding strategy after construction
+
+    def __call__(self, x: mx.array) -> mx.array:
+        if self.sharding_group is not None:
+            x = sum_gradients(self.sharding_group)(x)
+        y = self.original_layer.__call__(x)
+        if self.sharding_group is not None:
+            y = mx.distributed.all_sum(y, group=self.sharding_group)  # combine the per-rank outputs
+        return y
+
+
+class MiniMaxShardingStrategy(TensorParallelShardingStrategy):  # shards attention projections and the sparse-MoE experts of each MiniMax layer
+    def shard_model(
+        self,
+        model: nn.Module,
+        timeout_seconds: float,
+        on_timeout: TimeoutCallback | None,
+    ) -> nn.Module:
+        model = cast(MiniMaxModel, model)
+        for layer in model.layers:
+            eval_with_timeout(
+                layer.parameters(), timeout_seconds / len(model.layers), on_timeout  # total timeout split evenly across layers
+            )
+            # Shard the self attention
+            layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
+            layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
+            layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
+            layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
+            layer.self_attn.num_attention_heads //= self.N  # presumably self.N is the sharding group size - confirm in the base class
+            layer.self_attn.num_key_value_heads //= self.N
+
+            # Shard the MoE. Shard in place since the MoE should be responsible
+            # for aggregating the results.
+            self.all_to_sharded_linear_in_place(
+                layer.block_sparse_moe.switch_mlp.gate_proj
+            )
+            self.sharded_to_all_linear_in_place(
+                layer.block_sparse_moe.switch_mlp.down_proj
+            )
+            self.all_to_sharded_linear_in_place(
+                layer.block_sparse_moe.switch_mlp.up_proj
+            )
+            layer.block_sparse_moe = ShardedQwenMoE(layer.block_sparse_moe)  # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
+            layer.block_sparse_moe.sharding_group = self.group  # pyright: ignore[reportAttributeAccessIssue]
+
+        return model
+
+
 class QwenShardingStrategy(TensorParallelShardingStrategy):
    def shard_model(
        self,
@@ -455,3 +623,58 @@ class ShardedQwenMoE(CustomMlxLayer):
        if self.sharding_group is not None:
            y = mx.distributed.all_sum(y, group=self.sharding_group)
        return y
+
+
+class GptOssShardingStrategy(TensorParallelShardingStrategy):  # shards attention projections, attention sinks and MoE experts of each GPT-OSS layer
+    def shard_model(
+        self,
+        model: nn.Module,
+        timeout_seconds: float,
+        on_timeout: TimeoutCallback | None,
+    ) -> nn.Module:
+        model = cast(GptOssMoeModel, model)  # NOTE(review): tensor_auto_parallel dispatches on GptOssModel, not GptOssMoeModel - confirm the cast target
+
+        for layer in model.layers:
+            eval_with_timeout(
+                layer.parameters(), timeout_seconds / len(model.layers), on_timeout  # total timeout split evenly across layers
+            )
+            layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
+            layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
+            layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
+            layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
+
+            layer.self_attn.num_attention_heads //= self.N  # presumably self.N is the sharding group size - confirm in the base class
+            layer.self_attn.num_key_value_heads //= self.N
+            layer.self_attn.num_key_value_groups = (  # recomputed from the already-divided head counts
+                layer.self_attn.num_attention_heads
+                // layer.self_attn.num_key_value_heads
+            )
+
+            layer.self_attn.sinks = layer.self_attn.sinks[  # keep only this rank's contiguous slice, sized by the per-rank head count
+                layer.self_attn.num_attention_heads
+                * self.group.rank() : layer.self_attn.num_attention_heads
+                * (self.group.rank() + 1)
+            ]
+
+            self.all_to_sharded_linear_in_place(layer.mlp.experts.gate_proj)
+            self.sharded_to_all_linear_in_place(layer.mlp.experts.down_proj)
+            self.all_to_sharded_linear_in_place(layer.mlp.experts.up_proj)
+
+            layer.mlp = ShardedGptOssMoE(layer.mlp)  # type: ignore
+            layer.mlp.sharding_group = self.group  # pyright: ignore[reportAttributeAccessIssue]
+
+        return model
+
+
+class ShardedGptOssMoE(CustomMlxLayer):  # wraps a MoE layer and all_sums its output across the sharding group
+    def __init__(self, layer: nn.Module):
+        super().__init__(layer)
+        self.sharding_group: mx.distributed.Group | None = None  # assigned by the sharding strategy after construction
+
+    def __call__(self, x: mx.array) -> mx.array:
+        if self.sharding_group is not None:
+            x = sum_gradients(self.sharding_group)(x)
+        y = self.original_layer(x)
+        if self.sharding_group is not None:
+            y = mx.distributed.all_sum(y, group=self.sharding_group)  # combine the per-rank outputs
+        return y
--- a/src/exo/worker/main.py
+++ b/src/exo/worker/main.py
@@ -413,6 +413,11 @@ class Worker:
            )
            for nid in conns:
                for ip in conns[nid]:
+                    if "127.0.0.1" in ip or "localhost" in ip:
+                        logger.warning(
+                            f"Loopback connection should not happen: {ip=} for {nid=}"
+                        )
+
                    edge = SocketConnection(
                        # nonsense multiaddr
                        sink_multiaddr=Multiaddr(address=f"/ip4/{ip}/tcp/52415")
@@ -433,9 +438,6 @@ class Worker:
            for conn in self.state.topology.out_edges(self.node_id):
                if not isinstance(conn.edge, SocketConnection):
                    continue
-                # ignore mDNS discovered connections
-                if conn.edge.sink_multiaddr.port != 52415:
-                    continue
                if (
                    conn.sink not in conns
                    or conn.edge.sink_multiaddr.ip_address
--- a/src/exo/worker/tests/unittests/test_mlx/test_distributed_fix.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_distributed_fix.py
@@ -18,7 +18,6 @@ def _check_model_exists() -> bool:


 pytestmark = [
-    pytest.mark.slow,
    pytest.mark.skipif(
        not _check_model_exists(),
        reason=f"GPT-OSS model not found at {DEFAULT_GPT_OSS_CONFIG.model_path}",
--- a/src/exo/worker/tests/unittests/test_mlx/test_tokenizers.py
+++ b/src/exo/worker/tests/unittests/test_mlx/test_tokenizers.py
@@ -89,8 +89,6 @@ def get_test_models() -> list[tuple[str, ModelCard]]:

 TEST_MODELS: list[tuple[str, ModelCard]] = get_test_models()

-pytestmark = pytest.mark.slow
-

@pytest.fixture(scope="module")
 def event_loop():
--- a/tests/headless_runner.py
+++ b/tests/headless_runner.py
@@ -12,7 +12,7 @@ from loguru import logger
 from pydantic import BaseModel

 from exo.shared.logging import InterceptLogger, logger_setup
-from exo.shared.models.model_cards import MODEL_CARDS, ModelId
+from exo.shared.models.model_cards import ModelId
 from exo.shared.types.api import ChatCompletionMessage, ChatCompletionTaskParams
 from exo.shared.types.commands import CommandId
 from exo.shared.types.common import Host, NodeId
@@ -89,22 +89,22 @@ async def tb_detection():

 async def assert_downloads():
     sd = exo_shard_downloader()
-    # await sd.ensure_shard(await build_full_shard(MODEL_CARDS["qwen3-0.6b"].model_id))
+    # await sd.ensure_shard(await build_full_shard(ModelId("mlx-community/Qwen3-0.6B-8bit")))
     await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["llama-3.1-8b-bf16"].model_id)
+        await build_full_shard(ModelId("mlx-community/Llama-3.1-8b-bf16"))
     )
-    await sd.ensure_shard(await build_full_shard(MODEL_CARDS["qwen3-30b"].model_id))
+    await sd.ensure_shard(await build_full_shard(ModelId("mlx-community/Qwen3-30b-A3B")))
     await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["gpt-oss-120b-MXFP4-Q8"].model_id)
+        await build_full_shard(ModelId("mlx-community/gpt-oss-120b-MXFP4-Q8"))
     )
     await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["gpt-oss-20b-4bit"].model_id)
+        await build_full_shard(ModelId("mlx-community/gpt-oss-20b-4bit"))
     )
     await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["glm-4.7-8bit-gs32"].model_id)
+        await build_full_shard(ModelId("mlx-community/GLM-4.7-8bit-gs32"))
     )
     await sd.ensure_shard(
-        await build_full_shard(MODEL_CARDS["minimax-m2.1-8bit"].model_id)
+        await build_full_shard(ModelId("mlx-community/MiniMax-M2.1-8bit"))
     )
Author	SHA1	Message	Date
Evan	92b24196c3	wrrg	2026-01-20 11:14:20 +00:00
Evan	3bf7770988	add model cards	2026-01-20 10:56:29 +00:00
Evan	8392463a70	introduce resources folder	2026-01-20 10:56:29 +00:00
Evan	9c1f6224b0	Merge branch 'main' into simplify-model-cards	2026-01-20 10:56:29 +00:00
Evan	f370dbd1e0	Merge branch 'main' into simplify-model-cards merge fix	2026-01-20 10:56:17 +00:00
rltakashige	6a38f9efba	Merge branch 'main' into simplify-model-cards	2026-01-19 17:43:59 +00:00
Evan	0475de6431	wuff	2026-01-19 17:07:03 +00:00