mirror of
https://github.com/exo-explore/exo.git
synced 2026-04-17 20:40:35 -04:00
## Motivation
`mlx-community` has just published the new **Qwen3.6-35B-A3B**
multimodal MoE family on HuggingFace. Without static model cards exo
doesn't surface these models in the dashboard picker or match its
placement / prefill logic, so users can't one-click launch them. This PR
adds cards for the three quants whose safetensors indexes are already
live on HF (4bit / 5bit / bf16).
## Changes
Three new TOML files in `resources/inference_model_cards/`:
- `mlx-community--Qwen3.6-35B-A3B-4bit.toml` (~19 GB)
- `mlx-community--Qwen3.6-35B-A3B-5bit.toml` (~23 GB)
- `mlx-community--Qwen3.6-35B-A3B-bf16.toml` (~65 GB)
All three share the same architectural fields (`n_layers = 40`,
`hidden_size = 2048`, `num_key_value_heads = 2`, `context_length =
262144`, capabilities `text, thinking, thinking_toggle, vision`,
`base_model = "Qwen3.6 35B A3B"`) — only `model_id`, `quantization`, and
`storage_size.in_bytes` differ between variants.
## Why It Works
- Qwen3.6-35B-A3B reuses the `qwen3_5_moe` architecture
(`Qwen3_5MoeForConditionalGeneration`) — the same one already wired into
exo's MLX runner at `src/exo/worker/engines/mlx/auto_parallel.py:47` via
`Qwen3_5MoeModel`. The architectural fields are taken verbatim from the
HF `config.json.text_config` and match the existing `Qwen3.5-35B-A3B-*`
cards.
- Storage sizes are the exact `metadata.total_size` read from each
variant's `model.safetensors.index.json` on HF, so download progress and
cluster-memory-fit checks are accurate.
- Vision support is flagged in `capabilities`; the `[vision]` block is
auto-detected by `ModelCard._autodetect_vision` from the upstream
`config.json`, so no hand-written vision config is required.
- The card loader (`_refresh_card_cache` in
`src/exo/shared/models/model_cards.py`) globs every `.toml` in
`resources/inference_model_cards/` on startup, so nothing else needs to
change — the `/models` endpoint and the dashboard picker pick them up
automatically.
The `mxfp4` / `mxfp8` / `nvfp4` variants are still uploading upstream
(index JSONs currently 404) and can be added in a follow-up PR once HF
completes.
## Test Plan
### Manual Testing
Hardware: MacBook Pro M4 Max, 48 GB unified memory.
- Built the dashboard, ran `uv run exo`, waited for the API to come up
on `http://localhost:52415`.
- `curl -s http://localhost:52415/models` returns the three new model
ids (`mlx-community/Qwen3.6-35B-A3B-{4bit,5bit,bf16}`) alongside
existing models.
- Opened the dashboard, clicked SELECT MODEL, typed "Qwen3.6" into the
search box. A single **"Qwen3.6 35B A3B"** group appears showing `3
variants (19GB-65GB)`. Expanding it lists the `4bit` / `5bit` / `bf16`
quants with sizes `19GB` / `23GB` / `65GB`, exactly as expected.

- Programmatically loaded each TOML via `ModelCard.load_from_path(...)`
and confirmed the parsed fields (layers / hidden / KV heads / context /
quant / base_model / caps / bytes) match what's written in the files.
### Automated Testing
No code paths were touched — these are pure TOML data files that plug
into the existing model-card loader. The existing pytest suite covers
TOML parsing and card serving; adding new TOMLs doesn't require new test
scaffolding. `uv run ruff check` and `nix fmt` are clean.
---------
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Ryuichi Leo Takashige <rl.takashige@gmail.com>
126 lines
3.2 KiB
Python
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec file for packaging the exo application.

import importlib.util
import shutil
from pathlib import Path

from PyInstaller.utils.hooks import collect_submodules

# All build inputs are resolved relative to the current working directory,
# so this spec must be invoked from the repository root.
PROJECT_ROOT = Path.cwd()
SOURCE_ROOT = PROJECT_ROOT / "src"
# Application entry point handed to Analysis below.
ENTRYPOINT = SOURCE_ROOT / "exo" / "__main__.py"
# Pre-built dashboard assets; must be produced before running this spec.
DASHBOARD_DIR = PROJECT_ROOT / "dashboard" / "build"
RESOURCES_DIR = PROJECT_ROOT / "resources"
# Shared model data bundled alongside the code.
EXO_SHARED_MODELS_DIR = SOURCE_ROOT / "exo" / "shared" / "models"
|
# Fail fast, in a fixed order, when any required build input is missing.
# Each entry pairs the existence check with the exact message to emit.
for _ok, _error in (
    (ENTRYPOINT.is_file(), f"Unable to locate Exo entrypoint: {ENTRYPOINT}"),
    (DASHBOARD_DIR.is_dir(), f"Dashboard assets are missing: {DASHBOARD_DIR}"),
    (RESOURCES_DIR.is_dir(), f"Resource assets are missing: {RESOURCES_DIR}"),
    (EXO_SHARED_MODELS_DIR.is_dir(), f"Shared model assets are missing: {EXO_SHARED_MODELS_DIR}"),
):
    if not _ok:
        raise SystemExit(_error)

# Legacy PyInstaller spec variable, passed to PYZ below.
block_cipher = None
|
|
|
|
|
|
def _module_directory(module_name: str) -> Path:
|
|
spec = importlib.util.find_spec(module_name)
|
|
if spec is None:
|
|
raise SystemExit(f"Module '{module_name}' is not available in the current environment.")
|
|
if spec.submodule_search_locations:
|
|
return Path(next(iter(spec.submodule_search_locations))).resolve()
|
|
if spec.origin:
|
|
return Path(spec.origin).resolve().parent
|
|
raise SystemExit(f"Unable to determine installation directory for '{module_name}'.")
|
|
|
|
|
|
# Locate the installed mlx package so its compiled Metal libraries under
# <package>/lib can be bundled into the app (see DATAS below).
MLX_PACKAGE_DIR = _module_directory("mlx")
MLX_LIB_DIR = MLX_PACKAGE_DIR / "lib"
if not MLX_LIB_DIR.is_dir():
    raise SystemExit(f"mlx Metal libraries are missing: {MLX_LIB_DIR}")
|
|
|
|
|
|
def _safe_collect(package_name: str) -> list[str]:
    """Collect a package's submodules, or an empty list if it is not installed."""
    try:
        submodules = collect_submodules(package_name)
    except ImportError:
        # Optional dependency is absent in this environment; skip it.
        return []
    return submodules
|
|
|
|
|
|
# Modules imported dynamically at runtime that PyInstaller's static analysis
# would otherwise miss. Deduplicated and sorted for a stable build.
HIDDEN_IMPORTS = sorted(
    {
        *collect_submodules("mlx"),
        *_safe_collect("mlx_lm"),
        *_safe_collect("mlx_vlm"),
        *_safe_collect("transformers"),
    }
)
|
|
|
|
# (source path, destination inside the bundle) pairs copied verbatim.
DATAS: list[tuple[str, str]] = [
    (str(DASHBOARD_DIR), "dashboard"),
    (str(RESOURCES_DIR), "resources"),
    # mlx expects its Metal libraries under <package>/lib at runtime.
    (str(MLX_LIB_DIR), "mlx/lib"),
    (str(EXO_SHARED_MODELS_DIR), "exo/shared/models"),
]
|
|
|
|
# The macmon helper binary must be on PATH at build time so it can be
# shipped inside the bundle.
if (MACMON_PATH := shutil.which("macmon")) is None:
    raise SystemExit(
        "macmon binary not found in PATH. "
        "Install the pinned fork used by exo via: "
        "cargo install --git https://github.com/vladkens/macmon "
        "--rev a1cd06b6cc0d5e61db24fd8832e74cd992097a7d macmon --force"
    )

# (source path, destination directory) pairs for bundled native binaries.
BINARIES: list[tuple[str, str]] = [
    (MACMON_PATH, "."),
]
|
|
|
|
# Analysis / PYZ / EXE / COLLECT are injected into this file's namespace by
# PyInstaller when it executes the spec; they are not importable names.
a = Analysis(
    [str(ENTRYPOINT)],
    pathex=[str(SOURCE_ROOT)],
    binaries=BINARIES,
    datas=DATAS,
    hiddenimports=HIDDEN_IMPORTS,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    # NOTE(review): the win_* options and PYZ's cipher argument were removed
    # in PyInstaller 6.x — confirm the pinned PyInstaller version still
    # accepts them.
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    noarchive=False,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,  # binaries are gathered by COLLECT below
    name="exo",
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=False,
    console=True,  # CLI application: keep stdout/stderr attached
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,  # build for the host architecture
    codesign_identity=None,  # code signing, if any, happens outside this spec
    entitlements_file=None,
)

coll = COLLECT(
    exe,
    a.binaries,
    a.zipfiles,
    a.datas,
    strip=False,
    upx=False,
    upx_exclude=[],
    name="exo",  # output lands in dist/exo
)
|