Compare commits

...

2 Commits

Author SHA1 Message Date
Alex Cheema
8a18486565 Merge remote-tracking branch 'origin/main' into alexcheema/robust-hf-config-parsing
# Conflicts:
#	bench/exo_bench.py
#	dashboard/src/lib/components/FamilyLogos.svelte
#	dashboard/src/lib/components/FamilySidebar.svelte
#	dashboard/src/lib/components/HuggingFaceResultItem.svelte
#	dashboard/src/lib/components/ModelFilterPopover.svelte
#	dashboard/src/lib/components/ModelPickerGroup.svelte
#	dashboard/src/lib/components/ModelPickerModal.svelte
#	dashboard/src/lib/components/index.ts
#	dashboard/src/lib/stores/favorites.svelte.ts
#	dashboard/src/routes/+page.svelte
#	src/exo/master/api.py
#	src/exo/shared/models/model_cards.py
#	src/exo/shared/models/model_meta.py
#	src/exo/shared/types/api.py
#	src/exo/shared/types/models.py
#	src/exo/worker/tests/unittests/test_mlx/test_tokenizers.py
2026-02-05 06:05:31 -08:00
Alex Cheema
dd34ad1a51 Add custom model support with model_id as sole identifier
Add support for registering custom HuggingFace models in exo, with robust
config parsing. Models are identified solely by their full HuggingFace
model_id (e.g., "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit").

Key features:
- POST /models/custom endpoint to add custom models by HuggingFace repo ID (see the request sketch after this list)
- DELETE /models/custom?model_id=... to remove custom models
- PATCH /models/custom?model_id=... to update custom model settings
- GET /models/search to search HuggingFace Hub for MLX models
- Automatic detection of model architecture, layers, and tensor parallel support
- Robust config parsing with fallbacks for various HuggingFace config formats
- User models persisted to ~/.cache/exo/user_models/
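
A minimal sketch of calling these endpoints with Python's standard library. The local port, the request field names beyond model_id, the search query parameter, and the JSON response shapes are assumptions, not taken from this diff:

import json
import urllib.parse
import urllib.request

BASE_URL = "http://localhost:52415"  # assumed address of the local exo API


def call(method: str, path: str, body: dict | None = None) -> dict:
    """Send a JSON request to the exo API and decode the JSON response."""
    data = json.dumps(body).encode("utf-8") if body is not None else None
    headers = {"Content-Type": "application/json"} if body is not None else {}
    req = urllib.request.Request(BASE_URL + path, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())


model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
qs = urllib.parse.urlencode({"model_id": model_id})

call("POST", "/models/custom", {"model_id": model_id})        # add by HuggingFace repo ID
call("PATCH", f"/models/custom?{qs}", {"name": "My Llama"})   # update settings (field name assumed)
call("GET", "/models/search?" + urllib.parse.urlencode({"query": "llama"}))  # search HF Hub for MLX models (param name assumed)
call("DELETE", f"/models/custom?{qs}")                        # remove the custom model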

Registry changes:
- New registry data structure with base_models.json and variants.json
- Model registry uses model_id as dict key (removed short_id)
- Removed hugging_face_id from API responses (id field is now model_id)
- Migration logic for existing user model files

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 22:51:51 +00:00
41 changed files with 1334 additions and 0 deletions

View File

@@ -50,6 +50,9 @@ EXO_TEST_LOG = EXO_CACHE_HOME / "exo_test.log"
EXO_NODE_ID_KEYPAIR = EXO_CONFIG_HOME / "node_id.keypair"
EXO_CONFIG_FILE = EXO_CONFIG_HOME / "config.toml"
# User-added custom models (config)
EXO_USER_MODELS_DIR = EXO_CONFIG_HOME / "models"
# libp2p topics for event forwarding
LIBP2P_LOCAL_EVENTS_TOPIC = "worker_events"
LIBP2P_GLOBAL_EVENTS_TOPIC = "global_events"
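
As a sketch of where a user-added model config ends up, assuming the "/" to "--" filename convention used by _model_id_to_filename further down in this diff:

from exo.shared.constants import EXO_USER_MODELS_DIR

# One JSON file per user-added model, named after its HuggingFace repo path.
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
config_path = EXO_USER_MODELS_DIR / (model_id.replace("/", "--") + ".json")
# -> <EXO_CONFIG_HOME>/models/mlx-community--Meta-Llama-3.1-8B-Instruct-4bit.json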

View File

@@ -0,0 +1,81 @@
"""Architecture support mapping for tensor parallelism and capabilities.
This module provides a single source of truth for which model architectures
support tensor parallelism and other capabilities. The mapping is derived
from the MLX model classes in exo.worker.engines.mlx.auto_parallel.
"""
from typing import Final
# Model architectures (HuggingFace model_type values) that support tensor parallelism.
# This mapping corresponds to the model classes in auto_parallel.py:
#
# | model_type | MLX Class |
# |-----------------|---------------------|
# | llama | LlamaModel |
# | mistral | LlamaModel |
# | qwen2 | LlamaModel |
# | ministral3 | Ministral3Model |
# | deepseek_v3 | DeepseekV3Model |
# | deepseek_v32 | DeepseekV32Model |
# | minimax | MiniMaxModel |
# | qwen3_moe | Qwen3MoeModel |
# | glm4_moe | Glm4MoeModel |
# | qwen3_next | Qwen3NextModel |
# | gpt_oss | GptOssModel |
# | gpt_oss_moe | GptOssMoeModel |
#
TENSOR_PARALLEL_ARCHITECTURES: Final[frozenset[str]] = frozenset(
{
"llama",
"mistral",
"qwen2",
"ministral3",
"deepseek_v3",
"deepseek_v32",
"minimax",
"qwen3_moe",
"glm4_moe",
"qwen3_next",
"gpt_oss",
"gpt_oss_moe",
}
)
# Model architectures (HuggingFace model_type values) that support vision input.
# These architectures have native image understanding capabilities.
VISION_ARCHITECTURES: Final[frozenset[str]] = frozenset(
{
"llava", # LLaVA vision-language models
"qwen2_5_vl", # Qwen 2.5 Vision-Language
"qwen2_vl", # Qwen 2 Vision-Language
"phi4mm", # Phi-4 multimodal
"mllama", # Llama 3.2 Vision (MLlama)
"paligemma", # PaLI-GEMMA
"idefics2", # IDEFICS2
}
)
def supports_tensor_parallel(architecture: str) -> bool:
"""Check if an architecture supports tensor parallelism.
Args:
architecture: The HuggingFace model_type value (e.g., "llama", "qwen2").
Returns:
True if the architecture supports tensor parallelism, False otherwise.
"""
return architecture.lower() in TENSOR_PARALLEL_ARCHITECTURES
def supports_vision(architecture: str) -> bool:
"""Check if an architecture supports vision/image input.
Args:
architecture: The HuggingFace model_type value (e.g., "llava", "qwen2_vl").
Returns:
True if the architecture supports vision input, False otherwise.
"""
return architecture.lower() in VISION_ARCHITECTURES
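
A minimal usage sketch of the two helpers above; the example model_type strings come from the tables in this module:

from exo.shared.models.architecture_support import (
    supports_tensor_parallel,
    supports_vision,
)

# Lookups are case-insensitive against the frozensets defined above.
assert supports_tensor_parallel("llama") is True
assert supports_tensor_parallel("Qwen3_MoE") is True   # lowered before lookup
assert supports_tensor_parallel("llava") is False      # vision architecture, no TP entry
assert supports_vision("qwen2_vl") is True
assert supports_vision("mistral") is False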

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/DeepSeek-V3.1-4bit",
"name": "DeepSeek V3.1 (4-bit)",
"description": "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 405874409472,
"n_layers": 61,
"hidden_size": 7168,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/DeepSeek-V3.1-8bit",
"name": "DeepSeek V3.1 (8-bit)",
"description": "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 765577920512,
"n_layers": 61,
"hidden_size": 7168,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.5-Air-8bit",
"name": "GLM 4.5 Air 8bit",
"description": "GLM 4.5 Air 8bit",
"tags": [],
"supports_tensor": false,
"storage_size_bytes": 122406567936,
"n_layers": 46,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.5-Air-bf16",
"name": "GLM 4.5 Air bf16",
"description": "GLM 4.5 Air bf16",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 229780750336,
"n_layers": 46,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.7-4bit",
"name": "GLM 4.7 4bit",
"description": "GLM 4.7 4bit",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 198556925568,
"n_layers": 91,
"hidden_size": 5120,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.7-6bit",
"name": "GLM 4.7 6bit",
"description": "GLM 4.7 6bit",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 286737579648,
"n_layers": 91,
"hidden_size": 5120,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/GLM-4.7-8bit-gs32",
"name": "GLM 4.7 8bit (gs32)",
"description": "GLM 4.7 8bit (gs32)",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 396963397248,
"n_layers": 91,
"hidden_size": 5120,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/gpt-oss-120b-MXFP4-Q8",
"name": "GPT-OSS 120B (MXFP4-Q8, MLX)",
"description": "OpenAI's GPT-OSS 120B is a 117B-parameter Mixture-of-Experts model designed for high-reasoning and general-purpose use; this variant is a 4-bit MLX conversion for Apple Silicon.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 70652212224,
"n_layers": 36,
"hidden_size": 2880,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/gpt-oss-20b-MXFP4-Q8",
"name": "GPT-OSS 20B (MXFP4-Q8, MLX)",
"description": "OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this variant is a 4-bit MLX conversion for Apple Silicon.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 12025908224,
"n_layers": 24,
"hidden_size": 2880,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Kimi-K2-Instruct-4bit",
"name": "Kimi K2 Instruct (4-bit)",
"description": "Kimi K2 is a large language model trained on the Kimi K2 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 620622774272,
"n_layers": 61,
"hidden_size": 7168,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Kimi-K2-Thinking",
"name": "Kimi K2 Thinking (4-bit)",
"description": "Kimi K2 Thinking is the latest, most capable version of open-source thinking model.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 706522120192,
"n_layers": 61,
"hidden_size": 7168,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit",
"name": "Llama 3.1 70B (4-bit)",
"description": "Llama 3.1 is a large language model trained on the Llama 3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 40652242944,
"n_layers": 80,
"hidden_size": 8192,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit",
"name": "Llama 3.1 8B (8-bit)",
"description": "Llama 3.1 is a large language model trained on the Llama 3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 8954839040,
"n_layers": 32,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16",
"name": "Llama 3.1 8B (BF16)",
"description": "Llama 3.1 is a large language model trained on the Llama 3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 16882073600,
"n_layers": 32,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
"name": "Llama 3.1 8B (4-bit)",
"description": "Llama 3.1 is a large language model trained on the Llama 3.1 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 4637851648,
"n_layers": 32,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.2-1B-Instruct-4bit",
"name": "Llama 3.2 1B (4-bit)",
"description": "Llama 3.2 is a large language model trained on the Llama 3.2 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 729808896,
"n_layers": 16,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.2-3B-Instruct-8bit",
"name": "Llama 3.2 3B (8-bit)",
"description": "Llama 3.2 is a large language model trained on the Llama 3.2 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 3501195264,
"n_layers": 28,
"hidden_size": 3072,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.2-3B-Instruct-4bit",
"name": "Llama 3.2 3B (4-bit)",
"description": "Llama 3.2 is a large language model trained on the Llama 3.2 dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 1863319552,
"n_layers": 28,
"hidden_size": 3072,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.3-70B-Instruct-8bit",
"name": "Llama 3.3 70B (8-bit)",
"description": "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 76799803392,
"n_layers": 80,
"hidden_size": 8192,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/llama-3.3-70b-instruct-fp16",
"name": "Llama 3.3 70B (FP16)",
"description": "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 144383672320,
"n_layers": 80,
"hidden_size": 8192,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Llama-3.3-70B-Instruct-4bit",
"name": "Llama 3.3 70B (4-bit)",
"description": "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 40652242944,
"n_layers": 80,
"hidden_size": 8192,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/MiniMax-M2.1-3bit",
"name": "MiniMax M2.1 3bit",
"description": "MiniMax M2.1 3bit",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 100086644736,
"n_layers": 61,
"hidden_size": 3072,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/MiniMax-M2.1-8bit",
"name": "MiniMax M2.1 8bit",
"description": "MiniMax M2.1 8bit",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 242986745856,
"n_layers": 61,
"hidden_size": 3072,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-0.6B-8bit",
"name": "Qwen3 0.6B (8-bit)",
"description": "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset.",
"tags": [],
"supports_tensor": false,
"storage_size_bytes": 698351616,
"n_layers": 28,
"hidden_size": 1024,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-0.6B-4bit",
"name": "Qwen3 0.6B (4-bit)",
"description": "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset.",
"tags": [],
"supports_tensor": false,
"storage_size_bytes": 342884352,
"n_layers": 28,
"hidden_size": 1024,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit",
"name": "Qwen3 235B A22B (4-bit)",
"description": "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 141733920768,
"n_layers": 94,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit",
"name": "Qwen3 235B A22B (8-bit)",
"description": "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 268435456000,
"n_layers": 94,
"hidden_size": 4096,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-30B-A3B-8bit",
"name": "Qwen3 30B A3B (8-bit)",
"description": "Qwen3 30B is a large language model trained on the Qwen3 30B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 33279705088,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-30B-A3B-4bit",
"name": "Qwen3 30B A3B (4-bit)",
"description": "Qwen3 30B is a large language model trained on the Qwen3 30B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 17612931072,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit",
"name": "Qwen3 80B A3B (4-bit)",
"description": "Qwen3 80B",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 46976204800,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit",
"name": "Qwen3 80B A3B (8-bit)",
"description": "Qwen3 80B",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 88814387200,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit",
"name": "Qwen3 80B A3B Thinking (4-bit)",
"description": "Qwen3 80B Reasoning model",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 88814387200,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit",
"name": "Qwen3 80B A3B Thinking (8-bit)",
"description": "Qwen3 80B Reasoning model",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 88814387200,
"n_layers": 48,
"hidden_size": 2048,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit",
"name": "Qwen3 Coder 480B A35B (4-bit)",
"description": "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 289910292480,
"n_layers": 62,
"hidden_size": 6144,
"is_user_added": false
}

View File

@@ -0,0 +1,11 @@
{
"model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit",
"name": "Qwen3 Coder 480B A35B (8-bit)",
"description": "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset.",
"tags": [],
"supports_tensor": true,
"storage_size_bytes": 579820584960,
"n_layers": 62,
"hidden_size": 6144,
"is_user_added": false
}

View File

@@ -0,0 +1,399 @@
"""Model registry for loading and managing model configurations.
This module provides a registry that loads model configurations from:
1. New registry structure: base_models.json + variants.json (grouped models)
2. Legacy JSON files in the cards/ directory (shipped with exo)
3. User-added JSON files in ~/.exo/models/ (created via dashboard)
The registry automatically combines base model metadata with variant data
to produce complete ModelConfig objects with grouping information.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Final, cast
from loguru import logger
from exo.shared.constants import EXO_USER_MODELS_DIR
from exo.shared.models.architecture_support import (
supports_tensor_parallel,
supports_vision,
)
from exo.shared.models.registry import (
BASE_MODELS_BY_ID,
VARIANTS,
)
from exo.shared.types.models import ModelConfig
# Directory containing built-in model config JSON files (legacy format)
BUILTIN_CARDS_DIR: Final[Path] = Path(__file__).parent / "cards"
def derive_capabilities(
architecture: str,
manual_capabilities: list[str] | None = None,
) -> list[str]:
"""Derive model capabilities from architecture and manual overrides.
Priority:
1. Manual capabilities from base_models.json (if provided)
2. Architecture-based detection (for vision)
3. Default: ["text"]
Args:
architecture: The HuggingFace model_type value.
manual_capabilities: Explicit capabilities from base_models.json.
Returns:
List of capability strings (e.g., ["text", "vision", "code"]).
"""
# If manual capabilities are provided, use them as primary source
if manual_capabilities:
return manual_capabilities
capabilities = ["text"] # All models support text by default
# Architecture-based detection for vision
if supports_vision(architecture):
capabilities.append("vision")
return capabilities
def _model_id_to_filename(model_id: str) -> str:
"""Convert model_id to a valid filename by replacing / with --.
Examples:
"mlx-community/Meta-Llama-3.1-8B-Instruct-4bit" -> "mlx-community--Meta-Llama-3.1-8B-Instruct-4bit"
"""
return model_id.replace("/", "--")
def _quantization_display_name(quantization: str) -> str:
    """Format quantization for display in model name."""
    quant_lower = quantization.lower()
    if quant_lower in ("4bit", "4-bit"):
        return "4-bit"
    if quant_lower in ("8bit", "8-bit"):
        return "8-bit"
    if quant_lower in ("bf16", "bfloat16"):
        return "BF16"
    if quant_lower in ("fp16", "float16"):
        return "FP16"
    if quant_lower in ("3bit", "3-bit"):
        return "3-bit"
    if quant_lower in ("6bit", "6-bit"):
        return "6-bit"
    # Return as-is for other quantizations
    return quantization.upper()


def _load_config_from_file(path: Path) -> ModelConfig | None:
    """Load a single model config from a JSON file."""
    try:
        with path.open("r", encoding="utf-8") as f:
            data = cast(dict[str, Any], json.load(f))
        return ModelConfig.model_validate(data)
    except Exception as e:
        logger.warning(f"Failed to load model config from {path}: {e}")
        return None
def _variant_to_config(variant: dict[str, object]) -> ModelConfig | None:
    """Convert a variant + base model data to a ModelConfig."""
    base_model_id = str(variant.get("base_model", ""))
    base_model = BASE_MODELS_BY_ID.get(base_model_id)
    if base_model is None:
        logger.warning(f"Base model {base_model_id} not found for variant {variant}")
        return None

    model_id = str(variant["model_id"])
    quantization = str(variant.get("quantization", ""))
    storage_size_raw = variant.get("storage_size_bytes", 0)
    storage_size_bytes = int(str(storage_size_raw)) if storage_size_raw else 0

    base_name = str(base_model["name"])
    architecture = str(base_model.get("architecture", ""))
    n_layers_raw = base_model.get("n_layers", 0)
    n_layers = int(str(n_layers_raw)) if n_layers_raw else 1
    hidden_size_raw = base_model.get("hidden_size", 0)
    hidden_size = int(str(hidden_size_raw)) if hidden_size_raw else 1
    description = str(base_model.get("description", ""))

    # Extract new UI display fields from base model
    tagline = str(base_model.get("tagline", ""))
    family = str(base_model.get("family", ""))
    manual_capabilities_raw = base_model.get("capabilities")
    manual_capabilities = (
        cast(list[str], manual_capabilities_raw)
        if isinstance(manual_capabilities_raw, list)
        else None
    )
    capabilities = derive_capabilities(architecture, manual_capabilities)

    # Generate display name: "Base Name (Quantization)"
    quant_display = _quantization_display_name(quantization)
    display_name = f"{base_name} ({quant_display})"

    # Derive supports_tensor from architecture
    tensor_support = supports_tensor_parallel(architecture)

    return ModelConfig(
        model_id=model_id,
        name=display_name,
        description=description,
        tags=[],
        supports_tensor=tensor_support,
        storage_size_bytes=storage_size_bytes,
        n_layers=n_layers,
        hidden_size=hidden_size,
        is_user_added=False,
        architecture=architecture,
        base_model_id=base_model_id,
        base_model_name=base_name,
        quantization=quantization,
        tagline=tagline,
        capabilities=capabilities,
        family=family,
    )
class ModelRegistry:
    """Registry for model configurations.

    Loads configurations from:
    1. Registry structure (base_models.json + variants.json)
    2. Built-in cards/ directory (legacy format)
    3. User ~/.exo/models/
    """

    def __init__(self) -> None:
        self._configs: dict[str, ModelConfig] = {}
        self._reload()

    def _migrate_user_model_files(self) -> None:
        """Migrate user model files from old naming format to new format.

        Old format: {short_id}.json (e.g., "meta-llama-3.1-8b-instruct-4bit.json")
        New format: {model_id.replace("/", "--")}.json (e.g., "mlx-community--Meta-Llama-3.1-8B-Instruct-4bit.json")
        """
        if not EXO_USER_MODELS_DIR.exists():
            return

        for path in EXO_USER_MODELS_DIR.glob("*.json"):
            # New format files contain "--" (org--repo)
            if "--" in path.stem:
                continue

            # This is an old format file, load and migrate
            config = _load_config_from_file(path)
            if config is None:
                continue

            # Create new filename from model_id
            new_filename = _model_id_to_filename(config.model_id) + ".json"
            new_path = EXO_USER_MODELS_DIR / new_filename

            if new_path.exists():
                # New file already exists, just remove the old one
                logger.info(f"Removing old user model file (already migrated): {path}")
                path.unlink()
            else:
                # Rename to new format
                logger.info(f"Migrating user model file: {path} -> {new_path}")
                path.rename(new_path)

    def _reload(self) -> None:
        """Reload all model configurations from disk."""
        self._configs.clear()
        registry_count = 0
        legacy_count = 0

        # First, load from new registry structure (base_models + variants)
        for variant in VARIANTS:
            config = _variant_to_config(variant)
            if config is not None:
                # Use model_id as the key
                self._configs[config.model_id] = config
                registry_count += 1

        # Then load legacy built-in models (registry entries take precedence if same model_id)
        if BUILTIN_CARDS_DIR.exists():
            for path in BUILTIN_CARDS_DIR.glob("*.json"):
                config = _load_config_from_file(path)
                if config is None:
                    continue
                # Skip if already loaded from registry
                if config.model_id in self._configs:
                    continue
                self._configs[config.model_id] = config
                legacy_count += 1

        # Migrate old user model files before loading
        self._migrate_user_model_files()

        # Load user models (these can override built-in if same model_id)
        user_count = 0
        if EXO_USER_MODELS_DIR.exists():
            for path in EXO_USER_MODELS_DIR.glob("*.json"):
                config = _load_config_from_file(path)
                if config is not None:
                    self._configs[config.model_id] = config
                    user_count += 1

        logger.info(
            f"Loaded {len(self._configs)} model configs "
            f"(registry: {registry_count}, legacy: {legacy_count}, user: {user_count})"
        )
    def get(self, model_id: str) -> ModelConfig | None:
        """Get a model config by model_id (HuggingFace repo path)."""
        return self._configs.get(model_id)

    def list_all(self) -> dict[str, ModelConfig]:
        """Return all model configurations."""
        return dict(self._configs)

    def list_builtin(self) -> dict[str, ModelConfig]:
        """Return only built-in model configurations."""
        return {k: v for k, v in self._configs.items() if not v.is_user_added}

    def list_user_added(self) -> dict[str, ModelConfig]:
        """Return only user-added model configurations."""
        return {k: v for k, v in self._configs.items() if v.is_user_added}

    def list_grouped(self) -> dict[str, list[ModelConfig]]:
        """Return models grouped by base_model_id.

        Returns:
            Dict mapping base_model_id to list of variant configs.
            Models without a base_model_id are grouped under their model_id.
        """
        grouped: dict[str, list[ModelConfig]] = {}
        for model_id, config in self._configs.items():
            # Use base_model_id if available, otherwise use model_id
            group_key = config.base_model_id if config.base_model_id else model_id
            if group_key not in grouped:
                grouped[group_key] = []
            grouped[group_key].append(config)

        # Sort variants within each group by storage size (smallest first)
        for variants in grouped.values():
            variants.sort(key=lambda c: c.storage_size_bytes)

        return grouped

    def get_base_model_variants(self, base_model_id: str) -> list[ModelConfig]:
        """Get all variants for a given base model.

        Args:
            base_model_id: The base model identifier (e.g., "llama-3.1-8b")

        Returns:
            List of ModelConfig objects for all variants, sorted by storage size.
        """
        variants = [
            config
            for config in self._configs.values()
            if config.base_model_id == base_model_id
        ]
        variants.sort(key=lambda c: c.storage_size_bytes)
        return variants
    def add_user_model(self, config: ModelConfig) -> str:
        """Add a user model configuration and persist to disk.

        Args:
            config: The model configuration to add

        Returns:
            The model_id of the added model.
        """
        model_id = config.model_id

        # Ensure is_user_added is True
        config = config.model_copy(update={"is_user_added": True})

        # Create user models directory if needed
        EXO_USER_MODELS_DIR.mkdir(parents=True, exist_ok=True)

        # Save to JSON file using model_id as filename
        filename = _model_id_to_filename(model_id) + ".json"
        path = EXO_USER_MODELS_DIR / filename
        with path.open("w", encoding="utf-8") as f:
            json.dump(config.model_dump(), f, indent=2)

        # Add to registry
        self._configs[model_id] = config

        logger.info(f"Added user model: {model_id}")
        return model_id

    def remove_user_model(self, model_id: str) -> bool:
        """Remove a user-added model.

        Args:
            model_id: The model_id of the model to remove.

        Returns:
            True if the model was removed, False if not found or not user-added.
        """
        config = self._configs.get(model_id)
        if config is None or not config.is_user_added:
            return False

        # Remove JSON file
        filename = _model_id_to_filename(model_id) + ".json"
        path = EXO_USER_MODELS_DIR / filename
        if path.exists():
            path.unlink()

        # Remove from registry
        del self._configs[model_id]

        logger.info(f"Removed user model: {model_id}")
        return True

    def update_user_model(self, model_id: str, **updates: object) -> ModelConfig | None:
        """Update a user-added model configuration.

        Args:
            model_id: The model_id of the model to update.
            **updates: Field updates to apply.

        Returns:
            The updated config, or None if not found or not user-added.
        """
        config = self._configs.get(model_id)
        if config is None or not config.is_user_added:
            return None

        # Apply updates
        updated_config = config.model_copy(update=updates)

        # Persist
        filename = _model_id_to_filename(model_id) + ".json"
        path = EXO_USER_MODELS_DIR / filename
        with path.open("w", encoding="utf-8") as f:
            json.dump(updated_config.model_dump(), f, indent=2)

        self._configs[model_id] = updated_config

        logger.info(f"Updated user model: {model_id}")
        return updated_config


# Global registry instance (lazy-loaded)
_registry: ModelRegistry | None = None


def get_registry() -> ModelRegistry:
    """Get the global model registry instance."""
    global _registry
    if _registry is None:
        _registry = ModelRegistry()
    return _registry
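
A short usage sketch of the registry above, assuming this file is exo's model_cards module (the import path is inferred from the conflict list in the merge commit, and the exact ModelConfig constructor fields beyond those in the card JSON are assumptions):

from exo.shared.models.model_cards import get_registry  # module path assumed
from exo.shared.types.models import ModelConfig

registry = get_registry()

# Lookup is by the full HuggingFace repo path, now the sole identifier.
config = registry.get("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit")

# Variants grouped by base_model_id, each group sorted by storage size.
for base_id, variants in registry.list_grouped().items():
    print(base_id, [v.model_id for v in variants])

# Adding a user model writes a JSON file under EXO_USER_MODELS_DIR keyed by model_id.
custom = ModelConfig(
    model_id="my-org/My-Model-4bit",  # hypothetical repo, for illustration only
    name="My Model (4-bit)",
    description="",
    tags=[],
    supports_tensor=False,
    storage_size_bytes=0,
    n_layers=1,
    hidden_size=1,
    is_user_added=True,
)
registry.add_user_model(custom)
registry.remove_user_model("my-org/My-Model-4bit")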

View File

@@ -0,0 +1,50 @@
"""Model registry with base models and quantization variants.
This module provides access to the model registry data stored in JSON files.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Final, cast
_REGISTRY_DIR: Final[Path] = Path(__file__).parent
# Load base models and variants data at import time
with (_REGISTRY_DIR / "base_models.json").open("r", encoding="utf-8") as f:
_base_models_data: dict[str, Any] = cast(dict[str, Any], json.load(f))
BASE_MODELS: Final[list[dict[str, object]]] = cast(
list[dict[str, object]], _base_models_data["base_models"]
)
with (_REGISTRY_DIR / "variants.json").open("r", encoding="utf-8") as f:
_variants_data: dict[str, Any] = cast(dict[str, Any], json.load(f))
VARIANTS: Final[list[dict[str, object]]] = cast(
list[dict[str, object]], _variants_data["variants"]
)
# Create lookup dicts for fast access
BASE_MODELS_BY_ID: Final[dict[str, dict[str, object]]] = {
str(m["id"]): m for m in BASE_MODELS
}
# Variants are keyed by model_id (full HuggingFace repo path)
VARIANTS_BY_MODEL_ID: Final[dict[str, dict[str, object]]] = {
str(v["model_id"]): v for v in VARIANTS
}
VARIANTS_BY_BASE_MODEL: Final[dict[str, list[dict[str, object]]]] = {}
for _variant in VARIANTS:
_base_model_id = str(_variant["base_model"])
if _base_model_id not in VARIANTS_BY_BASE_MODEL:
VARIANTS_BY_BASE_MODEL[_base_model_id] = []
VARIANTS_BY_BASE_MODEL[_base_model_id].append(_variant)
__all__ = [
"BASE_MODELS",
"VARIANTS",
"BASE_MODELS_BY_ID",
"VARIANTS_BY_MODEL_ID",
"VARIANTS_BY_BASE_MODEL",
]
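
For example, the lookup tables above can be combined like this; the keys come from the base_models.json and variants.json files shown below:

from exo.shared.models.registry import (
    BASE_MODELS_BY_ID,
    VARIANTS_BY_BASE_MODEL,
    VARIANTS_BY_MODEL_ID,
)

# Base model metadata keyed by its short id (entries come from base_models.json).
llama_8b = BASE_MODELS_BY_ID["llama-3.1-8b"]
print(llama_8b["architecture"], llama_8b["n_layers"])  # llama 32

# Every quantization variant of that base model (entries come from variants.json).
for variant in VARIANTS_BY_BASE_MODEL["llama-3.1-8b"]:
    print(variant["model_id"], variant["quantization"])

# Or go straight from a full HuggingFace repo path to its variant record.
variant = VARIANTS_BY_MODEL_ID["mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"]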

View File

@@ -0,0 +1,202 @@
{
"base_models": [
{
"id": "llama-3.1-8b",
"family": "llama",
"name": "Llama 3.1 8B",
"description": "Meta's Llama 3.1 8B instruction-tuned model with 128K context window",
"tagline": "Fast and capable instruction-following model",
"capabilities": ["text"],
"architecture": "llama",
"n_layers": 32,
"hidden_size": 4096
},
{
"id": "llama-3.1-70b",
"family": "llama",
"name": "Llama 3.1 70B",
"description": "Meta's Llama 3.1 70B instruction-tuned model with 128K context window",
"tagline": "Powerful general-purpose model with 128K context",
"capabilities": ["text", "code"],
"architecture": "llama",
"n_layers": 80,
"hidden_size": 8192
},
{
"id": "llama-3.2-1b",
"family": "llama",
"name": "Llama 3.2 1B",
"description": "Meta's Llama 3.2 1B lightweight instruction-tuned model",
"tagline": "Ultra-lightweight for edge and mobile",
"capabilities": ["text"],
"architecture": "llama",
"n_layers": 16,
"hidden_size": 2048
},
{
"id": "llama-3.2-3b",
"family": "llama",
"name": "Llama 3.2 3B",
"description": "Meta's Llama 3.2 3B instruction-tuned model",
"tagline": "Compact model balancing size and capability",
"capabilities": ["text"],
"architecture": "llama",
"n_layers": 28,
"hidden_size": 3072
},
{
"id": "llama-3.3-70b",
"family": "llama",
"name": "Llama 3.3 70B",
"description": "Meta's Llama 3.3 70B instruction-tuned model with improved performance",
"tagline": "Meta's flagship open-weight model",
"capabilities": ["text", "code"],
"architecture": "llama",
"n_layers": 80,
"hidden_size": 8192
},
{
"id": "deepseek-v3.1",
"family": "deepseek",
"name": "DeepSeek V3.1",
"description": "DeepSeek's V3.1 model with 685B parameters using MoE architecture",
"tagline": "State-of-the-art MoE with 685B parameters",
"capabilities": ["text", "thinking", "code"],
"architecture": "deepseek_v32",
"n_layers": 61,
"hidden_size": 7168
},
{
"id": "kimi-k2",
"family": "kimi",
"name": "Kimi K2",
"description": "Moonshot AI's Kimi K2 large language model",
"tagline": "Powerful MoE from Moonshot AI",
"capabilities": ["text", "code"],
"architecture": "deepseek_v3",
"n_layers": 61,
"hidden_size": 7168
},
{
"id": "qwen3-0.6b",
"family": "qwen",
"name": "Qwen3 0.6B",
"description": "Alibaba's Qwen3 0.6B lightweight model",
"tagline": "Tiny but capable for simple tasks",
"capabilities": ["text"],
"architecture": "qwen2",
"n_layers": 28,
"hidden_size": 1024
},
{
"id": "qwen3-30b-a3b",
"family": "qwen",
"name": "Qwen3 30B A3B",
"description": "Alibaba's Qwen3 30B with A3B MoE architecture",
"tagline": "Efficient MoE for balanced performance",
"capabilities": ["text", "code"],
"architecture": "qwen3_moe",
"n_layers": 48,
"hidden_size": 2048
},
{
"id": "qwen3-80b-a3b",
"family": "qwen",
"name": "Qwen3 80B A3B",
"description": "Alibaba's Qwen3 80B with A3B MoE architecture",
"tagline": "Large-scale MoE with strong capabilities",
"capabilities": ["text", "code"],
"architecture": "qwen3_moe",
"n_layers": 128,
"hidden_size": 2048
},
{
"id": "qwen3-80b-a3b-thinking",
"family": "qwen",
"name": "Qwen3 80B A3B Thinking",
"description": "Alibaba's Qwen3 80B A3B with enhanced reasoning capabilities",
"tagline": "Reasoning-enhanced large MoE model",
"capabilities": ["text", "thinking", "code"],
"architecture": "qwen3_moe",
"n_layers": 128,
"hidden_size": 2048
},
{
"id": "qwen3-235b-a22b",
"family": "qwen",
"name": "Qwen3 235B A22B",
"description": "Alibaba's Qwen3 235B with A22B MoE architecture",
"tagline": "Massive open model for general intelligence",
"capabilities": ["text", "code"],
"architecture": "qwen3_moe",
"n_layers": 94,
"hidden_size": 4096
},
{
"id": "qwen3-coder-480b-a35b",
"family": "qwen",
"name": "Qwen3 Coder 480B A35B",
"description": "Alibaba's Qwen3 Coder 480B optimized for code generation",
"tagline": "Massive code-specialized MoE model",
"capabilities": ["text", "code"],
"architecture": "qwen3_moe",
"n_layers": 128,
"hidden_size": 5120
},
{
"id": "gpt-oss-120b",
"family": "gpt-oss",
"name": "GPT-OSS 120B",
"description": "Open source GPT model with 120B parameters",
"tagline": "Large open-source GPT with MoE",
"capabilities": ["text", "code"],
"architecture": "gpt_oss_moe",
"n_layers": 128,
"hidden_size": 4096
},
{
"id": "gpt-oss-20b",
"family": "gpt-oss",
"name": "GPT-OSS 20B",
"description": "Open source GPT model with 20B parameters",
"tagline": "Compact open-source GPT",
"capabilities": ["text"],
"architecture": "gpt_oss",
"n_layers": 48,
"hidden_size": 2560
},
{
"id": "glm-4.5-air",
"family": "glm",
"name": "GLM 4.5 Air",
"description": "Zhipu AI's GLM 4.5 Air model",
"tagline": "Lightweight Chinese-English bilingual model",
"capabilities": ["text"],
"architecture": "glm4_moe",
"n_layers": 40,
"hidden_size": 4096
},
{
"id": "glm-4.7",
"family": "glm",
"name": "GLM 4.7",
"description": "Zhipu AI's GLM 4.7 model with MoE architecture",
"tagline": "Large-scale Chinese-English MoE",
"capabilities": ["text", "code"],
"architecture": "glm4_moe",
"n_layers": 91,
"hidden_size": 5120
},
{
"id": "minimax-m2.1",
"family": "minimax",
"name": "MiniMax M2.1",
"description": "MiniMax's M2.1 large language model",
"tagline": "High-performance general-purpose model",
"capabilities": ["text", "code"],
"architecture": "minimax",
"n_layers": 80,
"hidden_size": 6144
}
]
}

View File

@@ -0,0 +1,214 @@
{
"variants": [
{
"base_model": "llama-3.1-8b",
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 4637851648
},
{
"base_model": "llama-3.1-8b",
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit",
"quantization": "8bit",
"storage_size_bytes": 8954839040
},
{
"base_model": "llama-3.1-8b",
"model_id": "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16",
"quantization": "bf16",
"storage_size_bytes": 16065732608
},
{
"base_model": "llama-3.1-70b",
"model_id": "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 40324489216
},
{
"base_model": "llama-3.2-1b",
"model_id": "mlx-community/Llama-3.2-1B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 874968576
},
{
"base_model": "llama-3.2-3b",
"model_id": "mlx-community/Llama-3.2-3B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 2019430400
},
{
"base_model": "llama-3.2-3b",
"model_id": "mlx-community/Llama-3.2-3B-Instruct-8bit",
"quantization": "8bit",
"storage_size_bytes": 3620454400
},
{
"base_model": "llama-3.3-70b",
"model_id": "mlx-community/Llama-3.3-70B-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 40324489216
},
{
"base_model": "llama-3.3-70b",
"model_id": "mlx-community/Llama-3.3-70B-Instruct-8bit",
"quantization": "8bit",
"storage_size_bytes": 74823761920
},
{
"base_model": "llama-3.3-70b",
"model_id": "mlx-community/Llama-3.3-70B-Instruct",
"quantization": "fp16",
"storage_size_bytes": 140734881792
},
{
"base_model": "deepseek-v3.1",
"model_id": "mlx-community/DeepSeek-V3.1-4bit",
"quantization": "4bit",
"storage_size_bytes": 405874409472
},
{
"base_model": "deepseek-v3.1",
"model_id": "mlx-community/DeepSeek-V3.1-8bit",
"quantization": "8bit",
"storage_size_bytes": 737851965440
},
{
"base_model": "kimi-k2",
"model_id": "mlx-community/Kimi-K2-Instruct-4bit",
"quantization": "4bit",
"storage_size_bytes": 607696764928
},
{
"base_model": "kimi-k2",
"model_id": "mlx-community/Kimi-K2-Thinking-4bit",
"quantization": "4bit",
"storage_size_bytes": 607696764928
},
{
"base_model": "qwen3-0.6b",
"model_id": "mlx-community/Qwen3-0.6B-4bit",
"quantization": "4bit",
"storage_size_bytes": 460873728
},
{
"base_model": "qwen3-0.6b",
"model_id": "mlx-community/Qwen3-0.6B-8bit",
"quantization": "8bit",
"storage_size_bytes": 752879616
},
{
"base_model": "qwen3-30b-a3b",
"model_id": "mlx-community/Qwen3-30B-A3B-4bit",
"quantization": "4bit",
"storage_size_bytes": 17612931072
},
{
"base_model": "qwen3-30b-a3b",
"model_id": "mlx-community/Qwen3-30B-A3B-8bit",
"quantization": "8bit",
"storage_size_bytes": 32161292288
},
{
"base_model": "qwen3-80b-a3b",
"model_id": "mlx-community/Qwen3-80B-A3B-4bit",
"quantization": "4bit",
"storage_size_bytes": 46170324992
},
{
"base_model": "qwen3-80b-a3b",
"model_id": "mlx-community/Qwen3-80B-A3B-8bit",
"quantization": "8bit",
"storage_size_bytes": 84458057728
},
{
"base_model": "qwen3-80b-a3b-thinking",
"model_id": "mlx-community/Qwen3-80B-A3B-Thinking-4bit",
"quantization": "4bit",
"storage_size_bytes": 46170324992
},
{
"base_model": "qwen3-80b-a3b-thinking",
"model_id": "mlx-community/Qwen3-80B-A3B-Thinking-8bit",
"quantization": "8bit",
"storage_size_bytes": 84458057728
},
{
"base_model": "qwen3-235b-a22b",
"model_id": "mlx-community/Qwen3-235B-A22B-4bit",
"quantization": "4bit",
"storage_size_bytes": 131953045504
},
{
"base_model": "qwen3-235b-a22b",
"model_id": "mlx-community/Qwen3-235B-A22B-8bit",
"quantization": "8bit",
"storage_size_bytes": 247587201024
},
{
"base_model": "qwen3-coder-480b-a35b",
"model_id": "mlx-community/Qwen3-Coder-480B-A35B-4bit",
"quantization": "4bit",
"storage_size_bytes": 274063867904
},
{
"base_model": "qwen3-coder-480b-a35b",
"model_id": "mlx-community/Qwen3-Coder-480B-A35B-8bit",
"quantization": "8bit",
"storage_size_bytes": 506312851456
},
{
"base_model": "gpt-oss-120b",
"model_id": "mlx-community/GPT-OSS-120B-MXFP4-Q8",
"quantization": "mxfp4-q8",
"storage_size_bytes": 67108864000
},
{
"base_model": "gpt-oss-20b",
"model_id": "mlx-community/GPT-OSS-20B-MXFP4-Q8",
"quantization": "mxfp4-q8",
"storage_size_bytes": 11811160064
},
{
"base_model": "glm-4.5-air",
"model_id": "mlx-community/GLM-4.5-Air-8bit",
"quantization": "8bit",
"storage_size_bytes": 9663676416
},
{
"base_model": "glm-4.5-air",
"model_id": "mlx-community/GLM-4.5-Air-bf16",
"quantization": "bf16",
"storage_size_bytes": 18253611008
},
{
"base_model": "glm-4.7",
"model_id": "mlx-community/GLM-4.7-4bit",
"quantization": "4bit",
"storage_size_bytes": 198556925568
},
{
"base_model": "glm-4.7",
"model_id": "mlx-community/GLM-4.7-6bit",
"quantization": "6bit",
"storage_size_bytes": 282281095168
},
{
"base_model": "glm-4.7",
"model_id": "mlx-community/GLM-4.7-8bit-gs32",
"quantization": "8bit",
"storage_size_bytes": 374997565440
},
{
"base_model": "minimax-m2.1",
"model_id": "mlx-community/MiniMax-M2.1-8bit",
"quantization": "8bit",
"storage_size_bytes": 483183656960
},
{
"base_model": "minimax-m2.1",
"model_id": "mlx-community/MiniMax-M2.1-3bit",
"quantization": "3bit",
"storage_size_bytes": 213674106880
}
]
}