Compare commits

...

2 Commits

Author SHA1 Message Date
Alex Cheema
8a18486565 Merge remote-tracking branch 'origin/main' into alexcheema/robust-hf-config-parsing
# Conflicts:
#	bench/exo_bench.py
#	dashboard/src/lib/components/FamilyLogos.svelte
#	dashboard/src/lib/components/FamilySidebar.svelte
#	dashboard/src/lib/components/HuggingFaceResultItem.svelte
#	dashboard/src/lib/components/ModelFilterPopover.svelte
#	dashboard/src/lib/components/ModelPickerGroup.svelte
#	dashboard/src/lib/components/ModelPickerModal.svelte
#	dashboard/src/lib/components/index.ts
#	dashboard/src/lib/stores/favorites.svelte.ts
#	dashboard/src/routes/+page.svelte
#	src/exo/master/api.py
#	src/exo/shared/models/model_cards.py
#	src/exo/shared/models/model_meta.py
#	src/exo/shared/types/api.py
#	src/exo/shared/types/models.py
#	src/exo/worker/tests/unittests/test_mlx/test_tokenizers.py
2026-02-05 06:05:31 -08:00
Alex Cheema
dd34ad1a51 Add custom model support with model_id as sole identifier
Add support for registering custom HuggingFace models in exo, with robust
config parsing. Models are identified solely by their full HuggingFace
model_id (e.g., "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit").

Key features:
- POST /models/custom endpoint to add custom models by HuggingFace repo ID (see the request sketch after this list)
- DELETE /models/custom?model_id=... to remove custom models
- PATCH /models/custom?model_id=... to update custom model settings
- GET /models/search to search HuggingFace Hub for MLX models
- Automatic detection of model architecture, layers, and tensor parallel support
- Robust config parsing with fallbacks for various HuggingFace config formats
- User models persisted to ~/.cache/exo/user_models/
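
A minimal sketch of calling these endpoints with Python's standard library. The local port, the request field names beyond model_id, the search query parameter, and the JSON response shapes are assumptions, not taken from this diff:

import json
import urllib.parse
import urllib.request

BASE_URL = "http://localhost:52415"  # assumed address of the local exo API


def call(method: str, path: str, body: dict | None = None) -> dict:
    """Send a JSON request to the exo API and decode the JSON response."""
    data = json.dumps(body).encode("utf-8") if body is not None else None
    headers = {"Content-Type": "application/json"} if body is not None else {}
    req = urllib.request.Request(BASE_URL + path, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())


model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
qs = urllib.parse.urlencode({"model_id": model_id})

call("POST", "/models/custom", {"model_id": model_id})        # add by HuggingFace repo ID
call("PATCH", f"/models/custom?{qs}", {"name": "My Llama"})   # update settings (field name assumed)
call("GET", "/models/search?" + urllib.parse.urlencode({"query": "llama"}))  # search HF Hub for MLX models (param name assumed)
call("DELETE", f"/models/custom?{qs}")                        # remove the custom model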

Registry changes:
- New registry data structure with base_models.json and variants.json
- Model registry uses model_id as dict key (removed short_id)
- Removed hugging_face_id from API responses (id field is now model_id)
- Migration logic for existing user model files

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 22:51:51 +00:00
41 changed files with 1334 additions and 0 deletions

View File

@@ -50,6 +50,9 @@ EXO_TEST_LOG = EXO_CACHE_HOME / "exo_test.log"
EXO_NODE_ID_KEYPAIR = EXO_CONFIG_HOME / "node_id.keypair"
EXO_CONFIG_FILE = EXO_CONFIG_HOME / "config.toml"
# User-added custom models (config)
EXO_USER_MODELS_DIR = EXO_CONFIG_HOME / "models"
# libp2p topics for event forwarding
LIBP2P_LOCAL_EVENTS_TOPIC = "worker_events"
LIBP2P_GLOBAL_EVENTS_TOPIC = "global_events"
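
As a sketch of where a user-added model config ends up, assuming the "/" to "--" filename convention used by _model_id_to_filename further down in this diff:

from exo.shared.constants import EXO_USER_MODELS_DIR

# One JSON file per user-added model, named after its HuggingFace repo path.
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
config_path = EXO_USER_MODELS_DIR / (model_id.replace("/", "--") + ".json")
# -> <EXO_CONFIG_HOME>/models/mlx-community--Meta-Llama-3.1-8B-Instruct-4bit.json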

View File

@@ -0,0 +1,81 @@
"""Architecture support mapping for tensor parallelism and capabilities.
This module provides a single source of truth for which model architectures
support tensor parallelism and other capabilities. The mapping is derived
from the MLX model classes in exo.worker.engines.mlx.auto_parallel.
"""
from typing import Final
# Model architectures (HuggingFace model_type values) that support tensor parallelism.
# This mapping corresponds to the model classes in auto_parallel.py:
#
# | model_type | MLX Class |
# |-----------------|---------------------|
# | llama | LlamaModel |
# | mistral | LlamaModel |
# | qwen2 | LlamaModel |
# | ministral3 | Ministral3Model |
# | deepseek_v3 | DeepseekV3Model |
# | deepseek_v32 | DeepseekV32Model |
# | minimax | MiniMaxModel |
# | qwen3_moe | Qwen3MoeModel |
# | glm4_moe | Glm4MoeModel |
# | qwen3_next | Qwen3NextModel |
# | gpt_oss | GptOssModel |
# | gpt_oss_moe | GptOssMoeModel |
#
TENSOR_PARALLEL_ARCHITECTURES: Final[frozenset[str]] = frozenset(
{
"llama",
"mistral",
"qwen2",
"ministral3",
"deepseek_v3",
"deepseek_v32",
"minimax",
"qwen3_moe",
"glm4_moe",
"qwen3_next",
"gpt_oss",
"gpt_oss_moe",
}
)
# Model architectures (HuggingFace model_type values) that support vision input.
# These architectures have native image understanding capabilities.
VISION_ARCHITECTURES: Final[frozenset[str]] = frozenset(
{
"llava", # LLaVA vision-language models
"qwen2_5_vl", # Qwen 2.5 Vision-Language
"qwen2_vl", # Qwen 2 Vision-Language
"phi4mm", # Phi-4 multimodal
"mllama", # Llama 3.2 Vision (MLlama)
"paligemma", # PaLI-GEMMA
"idefics2", # IDEFICS2
}
)
def supports_tensor_parallel(architecture: str) -> bool:
"""Check if an architecture supports tensor parallelism.
Args:
architecture: The HuggingFace model_type value (e.g., "llama", "qwen2").
Returns:
True if the architecture supports tensor parallelism, False otherwise.
"""
return architecture.lower() in TENSOR_PARALLEL_ARCHITECTURES
def supports_vision(architecture: str) -> bool:
"""Check if an architecture supports vision/image input.
Args:
architecture: The HuggingFace model_type value (e.g., "llava", "qwen2_vl").
Returns:
True if the architecture supports vision input, False otherwise.
"""
return architecture.lower() in VISION_ARCHITECTURES
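
A minimal usage sketch of the two helpers above; the example model_type strings come from the tables in this module:

from exo.shared.models.architecture_support import (
    supports_tensor_parallel,
    supports_vision,
)

# Lookups are case-insensitive against the frozensets defined above.
assert supports_tensor_parallel("llama") is True
assert supports_tensor_parallel("Qwen3_MoE") is True   # lowered before lookup
assert supports_tensor_parallel("llava") is False      # vision architecture, no TP entry
assert supports_vision("qwen2_vl") is True
assert supports_vision("mistral") is False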

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/DeepSeek-V3.1-4bit",
"name": "DeepSeek V3.1 (4-bit)",
"description": "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 405874409472,
"n_layers": 61,
"hidden_size": 7168,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/DeepSeek-V3.1-8bit",
"name": "DeepSeek V3.1 (8-bit)",
"description": "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 765577920512,
"n_layers": 61,
"hidden_size": 7168,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.5-Air-8bit",
"name": "GLM 4.5 Air 8bit",
"description": "GLM 4.5 Air 8bit",
"tags": [],
"supports_tensor": false,
"storage_size_bytes": 122406567936,
"n_layers": 46,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.5-Air-bf16",
"name": "GLM 4.5 Air bf16",
"description": "GLM 4.5 Air bf16",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 229780750336,
"n_layers": 46,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.7-4bit",
"name": "GLM 4.7 4bit",
"description": "GLM 4.7 4bit",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 198556925568,
"n_layers": 91,
"hidden_size": 5120,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.7-6bit",
"name": "GLM 4.7 6bit",
"description": "GLM 4.7 6bit",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 286737579648,
"n_layers": 91,
"hidden_size": 5120,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.7-8bit-gs32",
"name": "GLM 4.7 8bit (gs32)",
"description": "GLM 4.7 8bit (gs32)",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 396963397248,
"n_layers": 91,
"hidden_size": 5120,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/gpt-oss-120b-MXFP4-Q8",
"name": "GPT-OSS 120B (MXFP4-Q8, MLX)",
"description": "OpenAI's GPT-OSS 120B is a 117B-parameter Mixture-of-Experts model designed for high-reasoning and general-purpose use; this variant is a 4-bit MLX conversion for Apple Silicon.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 70652212224,
"n_layers": 36,
"hidden_size": 2880,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/gpt-oss-20b-MXFP4-Q8",
"name": "GPT-OSS 20B (MXFP4-Q8, MLX)",
"description": "OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this variant is a 4-bit MLX conversion for Apple Silicon.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 12025908224,
"n_layers": 24,
"hidden_size": 2880,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Kimi-K2-Instruct-4bit",
"name": "Kimi K2 Instruct (4-bit)",
"description": "Kimi K2 is a large language model trained on the Kimi K2 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 620622774272,
"n_layers": 61,
"hidden_size": 7168,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Kimi-K2-Thinking",
"name": "Kimi K2 Thinking (4-bit)",
"description": "Kimi K2 Thinking is the latest, most capable version of open-source thinking model.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 706522120192,
"n_layers": 61,
"hidden_size": 7168,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit",
"name": "Llama 3.1 70B (4-bit)",
"description": "Llama 3.1 is a large language model trained on the Llama 3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 40652242944,
"n_layers": 80,
"hidden_size": 8192,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit",
"name": "Llama 3.1 8B (8-bit)",
"description": "Llama 3.1 is a large language model trained on the Llama 3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 8954839040,
"n_layers": 32,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16",
"name": "Llama 3.1 8B (BF16)",
"description": "Llama 3.1 is a large language model trained on the Llama 3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 16882073600,
"n_layers": 32,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
"name": "Llama 3.1 8B (4-bit)",
"description": "Llama 3.1 is a large language model trained on the Llama 3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 4637851648,
"n_layers": 32,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.2-1B-Instruct-4bit",
"name": "Llama 3.2 1B (4-bit)",
"description": "Llama 3.2 is a large language model trained on the Llama 3.2 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 729808896,
"n_layers": 16,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.2-3B-Instruct-8bit",
"name": "Llama 3.2 3B (8-bit)",
"description": "Llama 3.2 is a large language model trained on the Llama 3.2 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 3501195264,
"n_layers": 28,
"hidden_size": 3072,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.2-3B-Instruct-4bit",
"name": "Llama 3.2 3B (4-bit)",
"description": "Llama 3.2 is a large language model trained on the Llama 3.2 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 1863319552,
"n_layers": 28,
"hidden_size": 3072,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.3-70B-Instruct-8bit",
"name": "Llama 3.3 70B (8-bit)",
"description": "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 76799803392,
"n_layers": 80,
"hidden_size": 8192,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/llama-3.3-70b-instruct-fp16",
"name": "Llama 3.3 70B (FP16)",
"description": "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 144383672320,
"n_layers": 80,
"hidden_size": 8192,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.3-70B-Instruct-4bit",
"name": "Llama 3.3 70B (4-bit)",
"description": "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 40652242944,
"n_layers": 80,
"hidden_size": 8192,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/MiniMax-M2.1-3bit",
"name": "MiniMax M2.1 3bit",
"description": "MiniMax M2.1 3bit",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 100086644736,
"n_layers": 61,
"hidden_size": 3072,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/MiniMax-M2.1-8bit",
"name": "MiniMax M2.1 8bit",
"description": "MiniMax M2.1 8bit",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 242986745856,
"n_layers": 61,
"hidden_size": 3072,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-0.6B-8bit",
"name": "Qwen3 0.6B (8-bit)",
"description": "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset.",
"tags": [],
"supports_tensor": false,
"storage_size_bytes": 698351616,
"n_layers": 28,
"hidden_size": 1024,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-0.6B-4bit",
"name": "Qwen3 0.6B (4-bit)",
"description": "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset.",
"tags": [],
"supports_tensor": false,
"storage_size_bytes": 342884352,
"n_layers": 28,
"hidden_size": 1024,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit",
"name": "Qwen3 235B A22B (4-bit)",
"description": "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 141733920768,
"n_layers": 94,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit",
"name": "Qwen3 235B A22B (8-bit)",
"description": "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 268435456000,
"n_layers": 94,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-30B-A3B-8bit",
"name": "Qwen3 30B A3B (8-bit)",
"description": "Qwen3 30B is a large language model trained on the Qwen3 30B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 33279705088,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-30B-A3B-4bit",
"name": "Qwen3 30B A3B (4-bit)",
"description": "Qwen3 30B is a large language model trained on the Qwen3 30B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 17612931072,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit",
"name": "Qwen3 80B A3B (4-bit)",
"description": "Qwen3 80B",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 46976204800,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit",
"name": "Qwen3 80B A3B (8-bit)",
"description": "Qwen3 80B",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 88814387200,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit",
"name": "Qwen3 80B A3B Thinking (4-bit)",
"description": "Qwen3 80B Reasoning model",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 88814387200,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit",
"name": "Qwen3 80B A3B Thinking (8-bit)",
"description": "Qwen3 80B Reasoning model",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 88814387200,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit",
"name": "Qwen3 Coder 480B A35B (4-bit)",
"description": "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 289910292480,
"n_layers": 62,
"hidden_size": 6144,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit",
"name": "Qwen3 Coder 480B A35B (8-bit)",
"description": "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 579820584960,
"n_layers": 62,
"hidden_size": 6144,
"is_user_added": false
}

View File

@@ -0,0 +1,399 @@
"""Model registry for loading and managing model configurations.
This module provides a registry that loads model configurations from:
1. New registry structure: base_models.json + variants.json (grouped models)
2. Legacy JSON files in the cards/ directory (shipped with exo)
3. User-added JSON files in ~/.exo/models/ (created via dashboard)
The registry automatically combines base model metadata with variant data
to produce complete ModelConfig objects with grouping information.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Final, cast
from loguru import logger
from exo.shared.constants import EXO_USER_MODELS_DIR
from exo.shared.models.architecture_support import (
supports_tensor_parallel,
supports_vision,
)
from exo.shared.models.registry import (
BASE_MODELS_BY_ID,
VARIANTS,
)
from exo.shared.types.models import ModelConfig
# Directory containing built-in model config JSON files (legacy format)
BUILTIN_CARDS_DIR: Final[Path] = Path(__file__).parent / "cards"
def derive_capabilities(
architecture: str,
manual_capabilities: list[str] | None = None,
) -> list[str]:
"""Derive model capabilities from architecture and manual overrides.
Priority:
1. Manual capabilities from base_models.json (if provided)
2. Architecture-based detection (for vision)
3. Default: ["text"]
Args:
architecture: The HuggingFace model_type value.
manual_capabilities: Explicit capabilities from base_models.json.
Returns:
List of capability strings (e.g., ["text", "vision", "code"]).
"""
# If manual capabilities are provided, use them as primary source
if manual_capabilities:
return manual_capabilities
capabilities = ["text"] # All models support text by default
# Architecture-based detection for vision
if supports_vision(architecture):
capabilities.append("vision")
return capabilities
def _model_id_to_filename(model_id: str) -> str:
"""Convert model_id to a valid filename by replacing / with --.
Examples:
"mlx-community/Meta-Llama-3.1-8B-Instruct-4bit" -> "mlx-community--Meta-Llama-3.1-8B-Instruct-4bit"
"""
return model_id.replace("/", "--")
def _quantization_display_name(quantization: str) -> str:
    """Format quantization for display in model name."""
    quant_lower = quantization.lower()
    if quant_lower in ("4bit", "4-bit"):
        return "4-bit"
    if quant_lower in ("8bit", "8-bit"):
        return "8-bit"
    if quant_lower in ("bf16", "bfloat16"):
        return "BF16"
    if quant_lower in ("fp16", "float16"):
        return "FP16"
    if quant_lower in ("3bit", "3-bit"):
        return "3-bit"
    if quant_lower in ("6bit", "6-bit"):
        return "6-bit"
    # Return as-is for other quantizations
    return quantization.upper()


def _load_config_from_file(path: Path) -> ModelConfig | None:
    """Load a single model config from a JSON file."""
    try:
        with path.open("r", encoding="utf-8") as f:
            data = cast(dict[str, Any], json.load(f))
        return ModelConfig.model_validate(data)
    except Exception as e:
        logger.warning(f"Failed to load model config from {path}: {e}")
        return None
def _variant_to_config(variant: dict[str, object]) -> ModelConfig | None:
    """Convert a variant + base model data to a ModelConfig."""
    base_model_id = str(variant.get("base_model", ""))
    base_model = BASE_MODELS_BY_ID.get(base_model_id)
    if base_model is None:
        logger.warning(f"Base model {base_model_id} not found for variant {variant}")
        return None

    model_id = str(variant["model_id"])
    quantization = str(variant.get("quantization", ""))
    storage_size_raw = variant.get("storage_size_bytes", 0)
    storage_size_bytes = int(str(storage_size_raw)) if storage_size_raw else 0

    base_name = str(base_model["name"])
    architecture = str(base_model.get("architecture", ""))
    n_layers_raw = base_model.get("n_layers", 0)
    n_layers = int(str(n_layers_raw)) if n_layers_raw else 1
    hidden_size_raw = base_model.get("hidden_size", 0)
    hidden_size = int(str(hidden_size_raw)) if hidden_size_raw else 1
    description = str(base_model.get("description", ""))

    # Extract new UI display fields from base model
    tagline = str(base_model.get("tagline", ""))
    family = str(base_model.get("family", ""))
    manual_capabilities_raw = base_model.get("capabilities")
    manual_capabilities = (
        cast(list[str], manual_capabilities_raw)
        if isinstance(manual_capabilities_raw, list)
        else None
    )
    capabilities = derive_capabilities(architecture, manual_capabilities)

    # Generate display name: "Base Name (Quantization)"
    quant_display = _quantization_display_name(quantization)
    display_name = f"{base_name} ({quant_display})"

    # Derive supports_tensor from architecture
    tensor_support = supports_tensor_parallel(architecture)

    return ModelConfig(
        model_id=model_id,
        name=display_name,
        description=description,
        tags=[],
        supports_tensor=tensor_support,
        storage_size_bytes=storage_size_bytes,
        n_layers=n_layers,
        hidden_size=hidden_size,
        is_user_added=False,
        architecture=architecture,
        base_model_id=base_model_id,
        base_model_name=base_name,
        quantization=quantization,
        tagline=tagline,
        capabilities=capabilities,
        family=family,
    )
class ModelRegistry:
    """Registry for model configurations.

    Loads configurations from:
    1. Registry structure (base_models.json + variants.json)
    2. Built-in cards/ directory (legacy format)
    3. User ~/.exo/models/
    """

    def __init__(self) -> None:
        self._configs: dict[str, ModelConfig] = {}
        self._reload()

    def _migrate_user_model_files(self) -> None:
        """Migrate user model files from old naming format to new format.

        Old format: {short_id}.json (e.g., "meta-llama-3.1-8b-instruct-4bit.json")
        New format: {model_id.replace("/", "--")}.json (e.g., "mlx-community--Meta-Llama-3.1-8B-Instruct-4bit.json")
        """
        if not EXO_USER_MODELS_DIR.exists():
            return

        for path in EXO_USER_MODELS_DIR.glob("*.json"):
            # New format files contain "--" (org--repo)
            if "--" in path.stem:
                continue

            # This is an old format file, load and migrate
            config = _load_config_from_file(path)
            if config is None:
                continue

            # Create new filename from model_id
            new_filename = _model_id_to_filename(config.model_id) + ".json"
            new_path = EXO_USER_MODELS_DIR / new_filename

            if new_path.exists():
                # New file already exists, just remove the old one
                logger.info(f"Removing old user model file (already migrated): {path}")
                path.unlink()
            else:
                # Rename to new format
                logger.info(f"Migrating user model file: {path} -> {new_path}")
                path.rename(new_path)

    def _reload(self) -> None:
        """Reload all model configurations from disk."""
        self._configs.clear()
        registry_count = 0
        legacy_count = 0

        # First, load from new registry structure (base_models + variants)
        for variant in VARIANTS:
            config = _variant_to_config(variant)
            if config is not None:
                # Use model_id as the key
                self._configs[config.model_id] = config
                registry_count += 1

        # Then load legacy built-in models (registry entries take precedence if same model_id)
        if BUILTIN_CARDS_DIR.exists():
            for path in BUILTIN_CARDS_DIR.glob("*.json"):
                config = _load_config_from_file(path)
                if config is None:
                    continue
                # Skip if already loaded from registry
                if config.model_id in self._configs:
                    continue
                self._configs[config.model_id] = config
                legacy_count += 1

        # Migrate old user model files before loading
        self._migrate_user_model_files()

        # Load user models (these can override built-in if same model_id)
        user_count = 0
        if EXO_USER_MODELS_DIR.exists():
            for path in EXO_USER_MODELS_DIR.glob("*.json"):
                config = _load_config_from_file(path)
                if config is not None:
                    self._configs[config.model_id] = config
                    user_count += 1

        logger.info(
            f"Loaded {len(self._configs)} model configs "
            f"(registry: {registry_count}, legacy: {legacy_count}, user: {user_count})"
        )
    def get(self, model_id: str) -> ModelConfig | None:
        """Get a model config by model_id (HuggingFace repo path)."""
        return self._configs.get(model_id)

    def list_all(self) -> dict[str, ModelConfig]:
        """Return all model configurations."""
        return dict(self._configs)

    def list_builtin(self) -> dict[str, ModelConfig]:
        """Return only built-in model configurations."""
        return {k: v for k, v in self._configs.items() if not v.is_user_added}

    def list_user_added(self) -> dict[str, ModelConfig]:
        """Return only user-added model configurations."""
        return {k: v for k, v in self._configs.items() if v.is_user_added}

    def list_grouped(self) -> dict[str, list[ModelConfig]]:
        """Return models grouped by base_model_id.

        Returns:
            Dict mapping base_model_id to list of variant configs.
            Models without a base_model_id are grouped under their model_id.
        """
        grouped: dict[str, list[ModelConfig]] = {}
        for model_id, config in self._configs.items():
            # Use base_model_id if available, otherwise use model_id
            group_key = config.base_model_id if config.base_model_id else model_id
            if group_key not in grouped:
                grouped[group_key] = []
            grouped[group_key].append(config)

        # Sort variants within each group by storage size (smallest first)
        for variants in grouped.values():
            variants.sort(key=lambda c: c.storage_size_bytes)

        return grouped

    def get_base_model_variants(self, base_model_id: str) -> list[ModelConfig]:
        """Get all variants for a given base model.

        Args:
            base_model_id: The base model identifier (e.g., "llama-3.1-8b")

        Returns:
            List of ModelConfig objects for all variants, sorted by storage size.
        """
        variants = [
            config
            for config in self._configs.values()
            if config.base_model_id == base_model_id
        ]
        variants.sort(key=lambda c: c.storage_size_bytes)
        return variants
    def add_user_model(self, config: ModelConfig) -> str:
        """Add a user model configuration and persist to disk.

        Args:
            config: The model configuration to add

        Returns:
            The model_id of the added model.
        """
        model_id = config.model_id

        # Ensure is_user_added is True
        config = config.model_copy(update={"is_user_added": True})

        # Create user models directory if needed
        EXO_USER_MODELS_DIR.mkdir(parents=True, exist_ok=True)

        # Save to JSON file using model_id as filename
        filename = _model_id_to_filename(model_id) + ".json"
        path = EXO_USER_MODELS_DIR / filename
        with path.open("w", encoding="utf-8") as f:
            json.dump(config.model_dump(), f, indent=2)

        # Add to registry
        self._configs[model_id] = config

        logger.info(f"Added user model: {model_id}")
        return model_id

    def remove_user_model(self, model_id: str) -> bool:
        """Remove a user-added model.

        Args:
            model_id: The model_id of the model to remove.

        Returns:
            True if the model was removed, False if not found or not user-added.
        """
        config = self._configs.get(model_id)
        if config is None or not config.is_user_added:
            return False

        # Remove JSON file
        filename = _model_id_to_filename(model_id) + ".json"
        path = EXO_USER_MODELS_DIR / filename
        if path.exists():
            path.unlink()

        # Remove from registry
        del self._configs[model_id]

        logger.info(f"Removed user model: {model_id}")
        return True

    def update_user_model(self, model_id: str, **updates: object) -> ModelConfig | None:
        """Update a user-added model configuration.

        Args:
            model_id: The model_id of the model to update.
            **updates: Field updates to apply.

        Returns:
            The updated config, or None if not found or not user-added.
        """
        config = self._configs.get(model_id)
        if config is None or not config.is_user_added:
            return None

        # Apply updates
        updated_config = config.model_copy(update=updates)

        # Persist
        filename = _model_id_to_filename(model_id) + ".json"
        path = EXO_USER_MODELS_DIR / filename
        with path.open("w", encoding="utf-8") as f:
            json.dump(updated_config.model_dump(), f, indent=2)

        self._configs[model_id] = updated_config

        logger.info(f"Updated user model: {model_id}")
        return updated_config


# Global registry instance (lazy-loaded)
_registry: ModelRegistry | None = None


def get_registry() -> ModelRegistry:
    """Get the global model registry instance."""
    global _registry
    if _registry is None:
        _registry = ModelRegistry()
    return _registry
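
A short usage sketch of the registry above, assuming this file is exo's model_cards module (the import path is inferred from the conflict list in the merge commit, and the exact ModelConfig constructor fields beyond those in the card JSON are assumptions):

from exo.shared.models.model_cards import get_registry  # module path assumed
from exo.shared.types.models import ModelConfig

registry = get_registry()

# Lookup is by the full HuggingFace repo path, now the sole identifier.
config = registry.get("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit")

# Variants grouped by base_model_id, each group sorted by storage size.
for base_id, variants in registry.list_grouped().items():
    print(base_id, [v.model_id for v in variants])

# Adding a user model writes a JSON file under EXO_USER_MODELS_DIR keyed by model_id.
custom = ModelConfig(
    model_id="my-org/My-Model-4bit",  # hypothetical repo, for illustration only
    name="My Model (4-bit)",
    description="",
    tags=[],
    supports_tensor=False,
    storage_size_bytes=0,
    n_layers=1,
    hidden_size=1,
    is_user_added=True,
)
registry.add_user_model(custom)
registry.remove_user_model("my-org/My-Model-4bit")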

View File

@@ -0,0 +1,50 @@
"""Model registry with base models and quantization variants.
This module provides access to the model registry data stored in JSON files.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Final, cast
_REGISTRY_DIR: Final[Path] = Path(__file__).parent
# Load base models and variants data at import time
with (_REGISTRY_DIR / "base_models.json").open("r", encoding="utf-8") as f:
_base_models_data: dict[str, Any] = cast(dict[str, Any], json.load(f))
BASE_MODELS: Final[list[dict[str, object]]] = cast(
list[dict[str, object]], _base_models_data["base_models"]
)
with (_REGISTRY_DIR / "variants.json").open("r", encoding="utf-8") as f:
_variants_data: dict[str, Any] = cast(dict[str, Any], json.load(f))
VARIANTS: Final[list[dict[str, object]]] = cast(
list[dict[str, object]], _variants_data["variants"]
)
# Create lookup dicts for fast access
BASE_MODELS_BY_ID: Final[dict[str, dict[str, object]]] = {
str(m["id"]): m for m in BASE_MODELS
}
# Variants are keyed by model_id (full HuggingFace repo path)
VARIANTS_BY_MODEL_ID: Final[dict[str, dict[str, object]]] = {
str(v["model_id"]): v for v in VARIANTS
}
VARIANTS_BY_BASE_MODEL: Final[dict[str, list[dict[str, object]]]] = {}
for _variant in VARIANTS:
_base_model_id = str(_variant["base_model"])
if _base_model_id not in VARIANTS_BY_BASE_MODEL:
VARIANTS_BY_BASE_MODEL[_base_model_id] = []
VARIANTS_BY_BASE_MODEL[_base_model_id].append(_variant)
__all__ = [
"BASE_MODELS",
"VARIANTS",
"BASE_MODELS_BY_ID",
"VARIANTS_BY_MODEL_ID",
"VARIANTS_BY_BASE_MODEL",
]
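
For example, the lookup tables above can be combined like this; the keys come from the base_models.json and variants.json files shown below:

from exo.shared.models.registry import (
    BASE_MODELS_BY_ID,
    VARIANTS_BY_BASE_MODEL,
    VARIANTS_BY_MODEL_ID,
)

# Base model metadata keyed by its short id (entries come from base_models.json).
llama_8b = BASE_MODELS_BY_ID["llama-3.1-8b"]
print(llama_8b["architecture"], llama_8b["n_layers"])  # llama 32

# Every quantization variant of that base model (entries come from variants.json).
for variant in VARIANTS_BY_BASE_MODEL["llama-3.1-8b"]:
    print(variant["model_id"], variant["quantization"])

# Or go straight from a full HuggingFace repo path to its variant record.
variant = VARIANTS_BY_MODEL_ID["mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"]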

View File

@@ -0,0 +1,202 @@
{
"base_models": [
{
"id": "llama-3.1-8b",
"family": "llama",
"name": "Llama 3.1 8B",
"description": "Meta's Llama 3.1 8B instruction-tuned model with 128K context window",
"tagline": "Fast and capable instruction-following model",
"capabilities": ["text"],
"architecture": "llama",
"n_layers": 32,
"hidden_size": 4096
},
{
"id": "llama-3.1-70b",
"family": "llama",
"name": "Llama 3.1 70B",
"description": "Meta's Llama 3.1 70B instruction-tuned model with 128K context window",
"tagline": "Powerful general-purpose model with 128K context",
"capabilities": ["text", "code"],
"architecture": "llama",
"n_layers": 80,
"hidden_size": 8192
},
{
"id": "llama-3.2-1b",
"family": "llama",
"name": "Llama 3.2 1B",
"description": "Meta's Llama 3.2 1B lightweight instruction-tuned model",
"tagline": "Ultra-lightweight for edge and mobile",
"capabilities": ["text"],
"architecture": "llama",
"n_layers": 16,
"hidden_size": 2048
},
{
"id": "llama-3.2-3b",
"family": "llama",
"name": "Llama 3.2 3B",
"description": "Meta's Llama 3.2 3B instruction-tuned model",
"tagline": "Compact model balancing size and capability",
"capabilities": ["text"],
"architecture": "llama",
"n_layers": 28,
"hidden_size": 3072
},
{
"id": "llama-3.3-70b",
"family": "llama",
"name": "Llama 3.3 70B",
"description": "Meta's Llama 3.3 70B instruction-tuned model with improved performance",
"tagline": "Meta's flagship open-weight model",
"capabilities": ["text", "code"],
"architecture": "llama",
"n_layers": 80,
"hidden_size": 8192
},
{
"id": "deepseek-v3.1",
"family": "deepseek",
"name": "DeepSeek V3.1",
"description": "DeepSeek's V3.1 model with 685B parameters using MoE architecture",
"tagline": "State-of-the-art MoE with 685B parameters",
"capabilities": ["text", "thinking", "code"],
"architecture": "deepseek_v32",
"n_layers": 61,
"hidden_size": 7168
},
{
"id": "kimi-k2",
"family": "kimi",
"name": "Kimi K2",
"description": "Moonshot AI's Kimi K2 large language model",
"tagline": "Powerful MoE from Moonshot AI",
"capabilities": ["text", "code"],
"architecture": "deepseek_v3",
"n_layers": 61,
"hidden_size": 7168
},
{
"id": "qwen3-0.6b",
"family": "qwen",
"name": "Qwen3 0.6B",
"description": "Alibaba's Qwen3 0.6B lightweight model",
"tagline": "Tiny but capable for simple tasks",
"capabilities": ["text"],
"architecture": "qwen2",
"n_layers": 28,
"hidden_size": 1024
},
{
"id": "qwen3-30b-a3b",
"family": "qwen",
"name": "Qwen3 30B A3B",
"description": "Alibaba's Qwen3 30B with A3B MoE architecture",
"tagline": "Efficient MoE for balanced performance",
"capabilities": ["text", "code"],
"architecture": "qwen3_moe",
"n_layers": 48,
"hidden_size": 2048
},
{
"id": "qwen3-80b-a3b",
"family": "qwen",
"name": "Qwen3 80B A3B",
"description": "Alibaba's Qwen3 80B with A3B MoE architecture",
"tagline": "Large-scale MoE with strong capabilities",
"capabilities": ["text", "code"],
"architecture": "qwen3_moe",
"n_layers": 128,
"hidden_size": 2048
},
{
"id": "qwen3-80b-a3b-thinking",
"family": "qwen",
"name": "Qwen3 80B A3B Thinking",
"description": "Alibaba's Qwen3 80B A3B with enhanced reasoning capabilities",
"tagline": "Reasoning-enhanced large MoE model",
"capabilities": ["text", "thinking", "code"],
"architecture": "qwen3_moe",
"n_layers": 128,
"hidden_size": 2048
},
{
"id": "qwen3-235b-a22b",
"family": "qwen",
"name": "Qwen3 235B A22B",
"description": "Alibaba's Qwen3 235B with A22B MoE architecture",
"tagline": "Massive open model for general intelligence",
"capabilities": ["text", "code"],
"architecture": "qwen3_moe",
"n_layers": 94,
"hidden_size": 4096
},
{
"id": "qwen3-coder-480b-a35b",
"family": "qwen",
"name": "Qwen3 Coder 480B A35B",
"description": "Alibaba's Qwen3 Coder 480B optimized for code generation",
"tagline": "Massive code-specialized MoE model",
"capabilities": ["text", "code"],
"architecture": "qwen3_moe",
"n_layers": 128,
"hidden_size": 5120
},
{
"id": "gpt-oss-120b",
"family": "gpt-oss",
"name": "GPT-OSS 120B",
"description": "Open source GPT model with 120B parameters",
"tagline": "Large open-source GPT with MoE",
"capabilities": ["text", "code"],
"architecture": "gpt_oss_moe",
"n_layers": 128,
"hidden_size": 4096
},
{
"id": "gpt-oss-20b",
"family": "gpt-oss",
"name": "GPT-OSS 20B",
"description": "Open source GPT model with 20B parameters",
"tagline": "Compact open-source GPT",
"capabilities": ["text"],
"architecture": "gpt_oss",
"n_layers": 48,
"hidden_size": 2560
},
{
"id": "glm-4.5-air",
"family": "glm",
"name": "GLM 4.5 Air",
"description": "Zhipu AI's GLM 4.5 Air model",
"tagline": "Lightweight Chinese-English bilingual model",
"capabilities": ["text"],
"architecture": "glm4_moe",
"n_layers": 40,
"hidden_size": 4096
},
{
"id": "glm-4.7",
"family": "glm",
"name": "GLM 4.7",
"description": "Zhipu AI's GLM 4.7 model with MoE architecture",
"tagline": "Large-scale Chinese-English MoE",
"capabilities": ["text", "code"],
"architecture": "glm4_moe",
"n_layers": 91,
"hidden_size": 5120
},
{
"id": "minimax-m2.1",
"family": "minimax",
"name": "MiniMax M2.1",
"description": "MiniMax's M2.1 large language model",
"tagline": "High-performance general-purpose model",
"capabilities": ["text", "code"],
"architecture": "minimax",
"n_layers": 80,
"hidden_size": 6144
}
]
}

View File

@@ -0,0 +1,214 @@
{
"variants": [
{
"base_model": "llama-3.1-8b",
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 4637851648
},
{
"base_model": "llama-3.1-8b",
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit",
"quantization": "8bit",
"storage_size_bytes": 8954839040
},
{
"base_model": "llama-3.1-8b",
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16",
"quantization": "bf16",
"storage_size_bytes": 16065732608
},
{
"base_model": "llama-3.1-70b",
"model_id": "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 40324489216
},
{
"base_model": "llama-3.2-1b",
"model_id": "mlx-community/Llama-3.2-1B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 874968576
},
{
"base_model": "llama-3.2-3b",
"model_id": "mlx-community/Llama-3.2-3B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 2019430400
},
{
"base_model": "llama-3.2-3b",
"model_id": "mlx-community/Llama-3.2-3B-Instruct-8bit",
"quantization": "8bit",
"storage_size_bytes": 3620454400
},
{
"base_model": "llama-3.3-70b",
"model_id": "mlx-community/Llama-3.3-70B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 40324489216
},
{
"base_model": "llama-3.3-70b",
"model_id": "mlx-community/Llama-3.3-70B-Instruct-8bit",
"quantization": "8bit",
"storage_size_bytes": 74823761920
},
{
"base_model": "llama-3.3-70b",
"model_id": "mlx-community/Llama-3.3-70B-Instruct",
"quantization": "fp16",
"storage_size_bytes": 140734881792
},
{
"base_model": "deepseek-v3.1",
"model_id": "mlx-community/DeepSeek-V3.1-4bit",
"quantization": "4bit",
"storage_size_bytes": 405874409472
},
{
"base_model": "deepseek-v3.1",
"model_id": "mlx-community/DeepSeek-V3.1-8bit",
"quantization": "8bit",
"storage_size_bytes": 737851965440
},
{
"base_model": "kimi-k2",
"model_id": "mlx-community/Kimi-K2-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 607696764928
},
{
"base_model": "kimi-k2",
"model_id": "mlx-community/Kimi-K2-Thinking-4bit",
"quantization": "4bit",
"storage_size_bytes": 607696764928
},
{
"base_model": "qwen3-0.6b",
"model_id": "mlx-community/Qwen3-0.6B-4bit",
"quantization": "4bit",
"storage_size_bytes": 460873728
},
{
"base_model": "qwen3-0.6b",
"model_id": "mlx-community/Qwen3-0.6B-8bit",
"quantization": "8bit",
"storage_size_bytes": 752879616
},
{
"base_model": "qwen3-30b-a3b",
"model_id": "mlx-community/Qwen3-30B-A3B-4bit",
"quantization": "4bit",
"storage_size_bytes": 17612931072
},
{
"base_model": "qwen3-30b-a3b",
"model_id": "mlx-community/Qwen3-30B-A3B-8bit",
"quantization": "8bit",
"storage_size_bytes": 32161292288
},
{
"base_model": "qwen3-80b-a3b",
"model_id": "mlx-community/Qwen3-80B-A3B-4bit",
"quantization": "4bit",
"storage_size_bytes": 46170324992
},
{
"base_model": "qwen3-80b-a3b",
"model_id": "mlx-community/Qwen3-80B-A3B-8bit",
"quantization": "8bit",
"storage_size_bytes": 84458057728
},
{
"base_model": "qwen3-80b-a3b-thinking",
"model_id": "mlx-community/Qwen3-80B-A3B-Thinking-4bit",
"quantization": "4bit",
"storage_size_bytes": 46170324992
},
{
"base_model": "qwen3-80b-a3b-thinking",
"model_id": "mlx-community/Qwen3-80B-A3B-Thinking-8bit",
"quantization": "8bit",
"storage_size_bytes": 84458057728
},
{
"base_model": "qwen3-235b-a22b",
"model_id": "mlx-community/Qwen3-235B-A22B-4bit",
"quantization": "4bit",
"storage_size_bytes": 131953045504
},
{
"base_model": "qwen3-235b-a22b",
"model_id": "mlx-community/Qwen3-235B-A22B-8bit",
"quantization": "8bit",
"storage_size_bytes": 247587201024
},
{
"base_model": "qwen3-coder-480b-a35b",
"model_id": "mlx-community/Qwen3-Coder-480B-A35B-4bit",
"quantization": "4bit",
"storage_size_bytes": 274063867904
},
{
"base_model": "qwen3-coder-480b-a35b",
"model_id": "mlx-community/Qwen3-Coder-480B-A35B-8bit",
"quantization": "8bit",
"storage_size_bytes": 506312851456
},
{
"base_model": "gpt-oss-120b",
"model_id": "mlx-community/GPT-OSS-120B-MXFP4-Q8",
"quantization": "mxfp4-q8",
"storage_size_bytes": 67108864000
},
{
"base_model": "gpt-oss-20b",
"model_id": "mlx-community/GPT-OSS-20B-MXFP4-Q8",
"quantization": "mxfp4-q8",
"storage_size_bytes": 11811160064
},
{
"base_model": "glm-4.5-air",
"model_id": "mlx-community/GLM-4.5-Air-8bit",
"quantization": "8bit",
"storage_size_bytes": 9663676416
},
{
"base_model": "glm-4.5-air",
"model_id": "mlx-community/GLM-4.5-Air-bf16",
"quantization": "bf16",
"storage_size_bytes": 18253611008
},
{
"base_model": "glm-4.7",
"model_id": "mlx-community/GLM-4.7-4bit",
"quantization": "4bit",
"storage_size_bytes": 198556925568
},
{
"base_model": "glm-4.7",
"model_id": "mlx-community/GLM-4.7-6bit",
"quantization": "6bit",
"storage_size_bytes": 282281095168
},
{
"base_model": "glm-4.7",
"model_id": "mlx-community/GLM-4.7-8bit-gs32",
"quantization": "8bit",
"storage_size_bytes": 374997565440
},
{
"base_model": "minimax-m2.1",
"model_id": "mlx-community/MiniMax-M2.1-8bit",
"quantization": "8bit",
"storage_size_bytes": 483183656960
},
{
"base_model": "minimax-m2.1",
"model_id": "mlx-community/MiniMax-M2.1-3bit",
"quantization": "3bit",
"storage_size_bytes": 213674106880
}
]
}