mirror of
https://github.com/exo-explore/exo.git
synced 2026-01-16 18:10:48 -05:00
Compare commits
1 Commits
v1.0.63
...
model-card
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c2f9f50f7e |
@@ -23,6 +23,7 @@ dependencies = [
|
||||
"tiktoken>=0.12.0", # required for kimi k2 tokenizer
|
||||
"hypercorn>=0.18.0",
|
||||
"openai-harmony>=0.0.8",
|
||||
"tomlkit>=0.14.0",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
15
resources/model_cards/deepseek-v3.1-4bit.toml
Normal file
15
resources/model_cards/deepseek-v3.1-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "deepseek-v3.1-4bit"
|
||||
model_id = "mlx-community/DeepSeek-V3.1-4bit"
|
||||
name = "DeepSeek V3.1 (4-bit)"
|
||||
description = "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/DeepSeek-V3.1-4bit"
|
||||
pretty_name = "DeepSeek V3.1 (4-bit)"
|
||||
n_layers = 61
|
||||
hidden_size = 7168
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 405874409472
|
||||
15
resources/model_cards/deepseek-v3.1-8bit.toml
Normal file
15
resources/model_cards/deepseek-v3.1-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "deepseek-v3.1-8bit"
|
||||
model_id = "mlx-community/DeepSeek-V3.1-8bit"
|
||||
name = "DeepSeek V3.1 (8-bit)"
|
||||
description = "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/DeepSeek-V3.1-8bit"
|
||||
pretty_name = "DeepSeek V3.1 (8-bit)"
|
||||
n_layers = 61
|
||||
hidden_size = 7168
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 765577920512
|
||||
15
resources/model_cards/glm-4.5-air-8bit.toml
Normal file
15
resources/model_cards/glm-4.5-air-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.5-air-8bit"
|
||||
model_id = "mlx-community/GLM-4.5-Air-8bit"
|
||||
name = "GLM 4.5 Air 8bit"
|
||||
description = "GLM 4.5 Air 8bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.5-Air-8bit"
|
||||
pretty_name = "GLM 4.5 Air 8bit"
|
||||
n_layers = 46
|
||||
hidden_size = 4096
|
||||
supports_tensor = false
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 122406567936
|
||||
15
resources/model_cards/glm-4.5-air-bf16.toml
Normal file
15
resources/model_cards/glm-4.5-air-bf16.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.5-air-bf16"
|
||||
model_id = "mlx-community/GLM-4.5-Air-bf16"
|
||||
name = "GLM 4.5 Air bf16"
|
||||
description = "GLM 4.5 Air bf16"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.5-Air-bf16"
|
||||
pretty_name = "GLM 4.5 Air bf16"
|
||||
n_layers = 46
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 229780750336
|
||||
15
resources/model_cards/glm-4.7-4bit.toml
Normal file
15
resources/model_cards/glm-4.7-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.7-4bit"
|
||||
model_id = "mlx-community/GLM-4.7-4bit"
|
||||
name = "GLM 4.7 4bit"
|
||||
description = "GLM 4.7 4bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.7-4bit"
|
||||
pretty_name = "GLM 4.7 4bit"
|
||||
n_layers = 91
|
||||
hidden_size = 5120
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 198556925568
|
||||
15
resources/model_cards/glm-4.7-6bit.toml
Normal file
15
resources/model_cards/glm-4.7-6bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.7-6bit"
|
||||
model_id = "mlx-community/GLM-4.7-6bit"
|
||||
name = "GLM 4.7 6bit"
|
||||
description = "GLM 4.7 6bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.7-6bit"
|
||||
pretty_name = "GLM 4.7 6bit"
|
||||
n_layers = 91
|
||||
hidden_size = 5120
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 286737579648
|
||||
15
resources/model_cards/glm-4.7-8bit-gs32.toml
Normal file
15
resources/model_cards/glm-4.7-8bit-gs32.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.7-8bit-gs32"
|
||||
model_id = "mlx-community/GLM-4.7-8bit-gs32"
|
||||
name = "GLM 4.7 8bit (gs32)"
|
||||
description = "GLM 4.7 8bit (gs32)"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.7-8bit-gs32"
|
||||
pretty_name = "GLM 4.7 8bit (gs32)"
|
||||
n_layers = 91
|
||||
hidden_size = 5120
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 396963397248
|
||||
15
resources/model_cards/gpt-oss-120b-MXFP4-Q8.toml
Normal file
15
resources/model_cards/gpt-oss-120b-MXFP4-Q8.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "gpt-oss-120b-MXFP4-Q8"
|
||||
model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
|
||||
name = "GPT-OSS 120B (MXFP4-Q8, MLX)"
|
||||
description = "OpenAI's GPT-OSS 120B is a 117B-parameter Mixture-of-Experts model designed for high-reasoning and general-purpose use; this variant is a 4-bit MLX conversion for Apple Silicon."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
|
||||
pretty_name = "GPT-OSS 120B (MXFP4-Q8, MLX)"
|
||||
n_layers = 36
|
||||
hidden_size = 2880
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 70652212224
|
||||
15
resources/model_cards/gpt-oss-20b-4bit.toml
Normal file
15
resources/model_cards/gpt-oss-20b-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "gpt-oss-20b-4bit"
|
||||
model_id = "mlx-community/gpt-oss-20b-MXFP4-Q4"
|
||||
name = "GPT-OSS 20B (MXFP4-Q4, MLX)"
|
||||
description = "OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this MLX variant uses MXFP4 4-bit quantization."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/gpt-oss-20b-MXFP4-Q4"
|
||||
pretty_name = "GPT-OSS 20B (MXFP4-Q4, MLX)"
|
||||
n_layers = 24
|
||||
hidden_size = 2880
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 12025908224
|
||||
15
resources/model_cards/kimi-k2-instruct-4bit.toml
Normal file
15
resources/model_cards/kimi-k2-instruct-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "kimi-k2-instruct-4bit"
|
||||
model_id = "mlx-community/Kimi-K2-Instruct-4bit"
|
||||
name = "Kimi K2 Instruct (4-bit)"
|
||||
description = "Kimi K2 is a large language model trained on the Kimi K2 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Kimi-K2-Instruct-4bit"
|
||||
pretty_name = "Kimi K2 Instruct (4-bit)"
|
||||
n_layers = 61
|
||||
hidden_size = 7168
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 620622774272
|
||||
15
resources/model_cards/kimi-k2-thinking.toml
Normal file
15
resources/model_cards/kimi-k2-thinking.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "kimi-k2-thinking"
|
||||
model_id = "mlx-community/Kimi-K2-Thinking"
|
||||
name = "Kimi K2 Thinking (4-bit)"
|
||||
description = "Kimi K2 Thinking is the latest, most capable version of open-source thinking model."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Kimi-K2-Thinking"
|
||||
pretty_name = "Kimi K2 Thinking (4-bit)"
|
||||
n_layers = 61
|
||||
hidden_size = 7168
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 706522120192
|
||||
15
resources/model_cards/llama-3.1-70b.toml
Normal file
15
resources/model_cards/llama-3.1-70b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.1-70b"
|
||||
model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
|
||||
name = "Llama 3.1 70B (4-bit)"
|
||||
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.1 70B (4-bit)"
|
||||
n_layers = 80
|
||||
hidden_size = 8192
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 40652242944
|
||||
15
resources/model_cards/llama-3.1-8b-8bit.toml
Normal file
15
resources/model_cards/llama-3.1-8b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.1-8b-8bit"
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
|
||||
name = "Llama 3.1 8B (8-bit)"
|
||||
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
|
||||
pretty_name = "Llama 3.1 8B (8-bit)"
|
||||
n_layers = 32
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 8954839040
|
||||
15
resources/model_cards/llama-3.1-8b-bf16.toml
Normal file
15
resources/model_cards/llama-3.1-8b-bf16.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.1-8b-bf16"
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
|
||||
name = "Llama 3.1 8B (BF16)"
|
||||
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
|
||||
pretty_name = "Llama 3.1 8B (BF16)"
|
||||
n_layers = 32
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 16882073600
|
||||
15
resources/model_cards/llama-3.1-8b.toml
Normal file
15
resources/model_cards/llama-3.1-8b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.1-8b"
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
|
||||
name = "Llama 3.1 8B (4-bit)"
|
||||
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.1 8B (4-bit)"
|
||||
n_layers = 32
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 4637851648
|
||||
15
resources/model_cards/llama-3.2-1b.toml
Normal file
15
resources/model_cards/llama-3.2-1b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.2-1b"
|
||||
model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
|
||||
name = "Llama 3.2 1B (4-bit)"
|
||||
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.2 1B (4-bit)"
|
||||
n_layers = 16
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 729808896
|
||||
15
resources/model_cards/llama-3.2-3b-8bit.toml
Normal file
15
resources/model_cards/llama-3.2-3b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.2-3b-8bit"
|
||||
model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
|
||||
name = "Llama 3.2 3B (8-bit)"
|
||||
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
|
||||
pretty_name = "Llama 3.2 3B (8-bit)"
|
||||
n_layers = 28
|
||||
hidden_size = 3072
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 3501195264
|
||||
15
resources/model_cards/llama-3.2-3b.toml
Normal file
15
resources/model_cards/llama-3.2-3b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.2-3b"
|
||||
model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
|
||||
name = "Llama 3.2 3B (4-bit)"
|
||||
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.2 3B (4-bit)"
|
||||
n_layers = 28
|
||||
hidden_size = 3072
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 1863319552
|
||||
15
resources/model_cards/llama-3.3-70b-8bit.toml
Normal file
15
resources/model_cards/llama-3.3-70b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.3-70b-8bit"
|
||||
model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
|
||||
name = "Llama 3.3 70B (8-bit)"
|
||||
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
|
||||
pretty_name = "Llama 3.3 70B (8-bit)"
|
||||
n_layers = 80
|
||||
hidden_size = 8192
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 76799803392
|
||||
15
resources/model_cards/llama-3.3-70b-fp16.toml
Normal file
15
resources/model_cards/llama-3.3-70b-fp16.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.3-70b-fp16"
|
||||
model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
|
||||
name = "Llama 3.3 70B (FP16)"
|
||||
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
|
||||
pretty_name = "Llama 3.3 70B (FP16)"
|
||||
n_layers = 80
|
||||
hidden_size = 8192
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 144383672320
|
||||
15
resources/model_cards/llama-3.3-70b.toml
Normal file
15
resources/model_cards/llama-3.3-70b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.3-70b"
|
||||
model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
|
||||
name = "Llama 3.3 70B (4-bit)"
|
||||
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.3 70B"
|
||||
n_layers = 80
|
||||
hidden_size = 8192
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 40652242944
|
||||
15
resources/model_cards/minimax-m2.1-3bit.toml
Normal file
15
resources/model_cards/minimax-m2.1-3bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "minimax-m2.1-3bit"
|
||||
model_id = "mlx-community/MiniMax-M2.1-3bit"
|
||||
name = "MiniMax M2.1 3bit"
|
||||
description = "MiniMax M2.1 3bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/MiniMax-M2.1-3bit"
|
||||
pretty_name = "MiniMax M2.1 3bit"
|
||||
n_layers = 61
|
||||
hidden_size = 3072
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 100086644736
|
||||
15
resources/model_cards/minimax-m2.1-8bit.toml
Normal file
15
resources/model_cards/minimax-m2.1-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "minimax-m2.1-8bit"
|
||||
model_id = "mlx-community/MiniMax-M2.1-8bit"
|
||||
name = "MiniMax M2.1 8bit"
|
||||
description = "MiniMax M2.1 8bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/MiniMax-M2.1-8bit"
|
||||
pretty_name = "MiniMax M2.1 8bit"
|
||||
n_layers = 61
|
||||
hidden_size = 3072
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 242986745856
|
||||
15
resources/model_cards/qwen3-0.6b-8bit.toml
Normal file
15
resources/model_cards/qwen3-0.6b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-0.6b-8bit"
|
||||
model_id = "mlx-community/Qwen3-0.6B-8bit"
|
||||
name = "Qwen3 0.6B (8-bit)"
|
||||
description = "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-0.6B-8bit"
|
||||
pretty_name = "Qwen3 0.6B (8-bit)"
|
||||
n_layers = 28
|
||||
hidden_size = 1024
|
||||
supports_tensor = false
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 698351616
|
||||
15
resources/model_cards/qwen3-0.6b.toml
Normal file
15
resources/model_cards/qwen3-0.6b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-0.6b"
|
||||
model_id = "mlx-community/Qwen3-0.6B-4bit"
|
||||
name = "Qwen3 0.6B (4-bit)"
|
||||
description = "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-0.6B-4bit"
|
||||
pretty_name = "Qwen3 0.6B (4-bit)"
|
||||
n_layers = 28
|
||||
hidden_size = 1024
|
||||
supports_tensor = false
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 342884352
|
||||
15
resources/model_cards/qwen3-235b-a22b-4bit.toml
Normal file
15
resources/model_cards/qwen3-235b-a22b-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-235b-a22b-4bit"
|
||||
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
|
||||
name = "Qwen3 235B A22B (4-bit)"
|
||||
description = "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
|
||||
pretty_name = "Qwen3 235B A22B (4-bit)"
|
||||
n_layers = 94
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 141733920768
|
||||
15
resources/model_cards/qwen3-235b-a22b-8bit.toml
Normal file
15
resources/model_cards/qwen3-235b-a22b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-235b-a22b-8bit"
|
||||
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
|
||||
name = "Qwen3 235B A22B (8-bit)"
|
||||
description = "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
|
||||
pretty_name = "Qwen3 235B A22B (8-bit)"
|
||||
n_layers = 94
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 268435456000
|
||||
15
resources/model_cards/qwen3-30b-8bit.toml
Normal file
15
resources/model_cards/qwen3-30b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-30b-8bit"
|
||||
model_id = "mlx-community/Qwen3-30B-A3B-8bit"
|
||||
name = "Qwen3 30B A3B (8-bit)"
|
||||
description = "Qwen3 30B is a large language model trained on the Qwen3 30B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-30B-A3B-8bit"
|
||||
pretty_name = "Qwen3 30B A3B (8-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 33279705088
|
||||
15
resources/model_cards/qwen3-30b.toml
Normal file
15
resources/model_cards/qwen3-30b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-30b"
|
||||
model_id = "mlx-community/Qwen3-30B-A3B-4bit"
|
||||
name = "Qwen3 30B A3B (4-bit)"
|
||||
description = "Qwen3 30B is a large language model trained on the Qwen3 30B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-30B-A3B-4bit"
|
||||
pretty_name = "Qwen3 30B A3B (4-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 17612931072
|
||||
15
resources/model_cards/qwen3-80b-a3B-4bit.toml
Normal file
15
resources/model_cards/qwen3-80b-a3B-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-80b-a3B-4bit"
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
|
||||
name = "Qwen3 80B A3B (4-bit)"
|
||||
description = "Qwen3 80B"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
|
||||
pretty_name = "Qwen3 80B A3B (4-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 46976204800
|
||||
15
resources/model_cards/qwen3-80b-a3B-8bit.toml
Normal file
15
resources/model_cards/qwen3-80b-a3B-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-80b-a3B-8bit"
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
|
||||
name = "Qwen3 80B A3B (8-bit)"
|
||||
description = "Qwen3 80B"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
|
||||
pretty_name = "Qwen3 80B A3B (8-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 88814387200
|
||||
15
resources/model_cards/qwen3-80b-a3B-thinking-4bit.toml
Normal file
15
resources/model_cards/qwen3-80b-a3B-thinking-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-80b-a3B-thinking-4bit"
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
|
||||
name = "Qwen3 80B A3B Thinking (4-bit)"
|
||||
description = "Qwen3 80B Reasoning model"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
|
||||
pretty_name = "Qwen3 80B A3B (4-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 88814387200
|
||||
15
resources/model_cards/qwen3-80b-a3B-thinking-8bit.toml
Normal file
15
resources/model_cards/qwen3-80b-a3B-thinking-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-80b-a3B-thinking-8bit"
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
|
||||
name = "Qwen3 80B A3B Thinking (8-bit)"
|
||||
description = "Qwen3 80B Reasoning model"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
|
||||
pretty_name = "Qwen3 80B A3B (8-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 88814387200
|
||||
15
resources/model_cards/qwen3-coder-480b-a35b-4bit.toml
Normal file
15
resources/model_cards/qwen3-coder-480b-a35b-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-coder-480b-a35b-4bit"
|
||||
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
|
||||
name = "Qwen3 Coder 480B A35B (4-bit)"
|
||||
description = "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
|
||||
pretty_name = "Qwen3 Coder 480B A35B (4-bit)"
|
||||
n_layers = 62
|
||||
hidden_size = 6144
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 289910292480
|
||||
15
resources/model_cards/qwen3-coder-480b-a35b-8bit.toml
Normal file
15
resources/model_cards/qwen3-coder-480b-a35b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-coder-480b-a35b-8bit"
|
||||
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
|
||||
name = "Qwen3 Coder 480B A35B (8-bit)"
|
||||
description = "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
|
||||
pretty_name = "Qwen3 Coder 480B A35B (8-bit)"
|
||||
n_layers = 62
|
||||
hidden_size = 6144
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 579820584960
|
||||
@@ -1,5 +1,8 @@
|
||||
from exo.shared.types.memory import Memory
|
||||
from anyio import Path, open_file
|
||||
import tomlkit
|
||||
|
||||
from exo.shared.types.models import ModelId, ModelMetadata
|
||||
from exo.shared.models.model_meta import get_model_meta
|
||||
from exo.utils.pydantic_ext import CamelCaseModel
|
||||
|
||||
|
||||
@@ -11,542 +14,27 @@ class ModelCard(CamelCaseModel):
|
||||
tags: list[str]
|
||||
metadata: ModelMetadata
|
||||
|
||||
@staticmethod
async def load(path: Path) -> "ModelCard":
    """Read a TOML model card file and validate it into a ``ModelCard``.

    Args:
        path: Location of the ``.toml`` model card to read.

    Returns:
        The ``ModelCard`` produced by ``ModelCard.model_validate`` from the
        parsed TOML document.

    Raises:
        Nothing is caught here: errors from opening/reading the file, from
        ``tomlkit.loads`` or from ``model_validate`` propagate to the caller.
    """
    # TOML documents are UTF-8 by specification, so decode explicitly rather
    # than relying on the platform's default text encoding.
    async with await open_file(path, encoding="utf-8") as f:
        data = await f.read()
    py = tomlkit.loads(data)
    return ModelCard.model_validate(py)
|
||||
|
||||
MODEL_CARDS: dict[str, ModelCard] = {
|
||||
# deepseek v3
|
||||
"deepseek-v3.1-4bit": ModelCard(
|
||||
short_id="deepseek-v3.1-4bit",
|
||||
model_id=ModelId("mlx-community/DeepSeek-V3.1-4bit"),
|
||||
name="DeepSeek V3.1 (4-bit)",
|
||||
description="""DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/DeepSeek-V3.1-4bit"),
|
||||
pretty_name="DeepSeek V3.1 (4-bit)",
|
||||
storage_size=Memory.from_gb(378),
|
||||
n_layers=61,
|
||||
hidden_size=7168,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"deepseek-v3.1-8bit": ModelCard(
|
||||
short_id="deepseek-v3.1-8bit",
|
||||
model_id=ModelId("mlx-community/DeepSeek-V3.1-8bit"),
|
||||
name="DeepSeek V3.1 (8-bit)",
|
||||
description="""DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/DeepSeek-V3.1-8bit"),
|
||||
pretty_name="DeepSeek V3.1 (8-bit)",
|
||||
storage_size=Memory.from_gb(713),
|
||||
n_layers=61,
|
||||
hidden_size=7168,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# kimi k2
|
||||
"kimi-k2-instruct-4bit": ModelCard(
|
||||
short_id="kimi-k2-instruct-4bit",
|
||||
model_id=ModelId("mlx-community/Kimi-K2-Instruct-4bit"),
|
||||
name="Kimi K2 Instruct (4-bit)",
|
||||
description="""Kimi K2 is a large language model trained on the Kimi K2 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Kimi-K2-Instruct-4bit"),
|
||||
pretty_name="Kimi K2 Instruct (4-bit)",
|
||||
storage_size=Memory.from_gb(578),
|
||||
n_layers=61,
|
||||
hidden_size=7168,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"kimi-k2-thinking": ModelCard(
|
||||
short_id="kimi-k2-thinking",
|
||||
model_id=ModelId("mlx-community/Kimi-K2-Thinking"),
|
||||
name="Kimi K2 Thinking (4-bit)",
|
||||
description="""Kimi K2 Thinking is the latest, most capable version of open-source thinking model.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Kimi-K2-Thinking"),
|
||||
pretty_name="Kimi K2 Thinking (4-bit)",
|
||||
storage_size=Memory.from_gb(658),
|
||||
n_layers=61,
|
||||
hidden_size=7168,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# llama-3.1
|
||||
"llama-3.1-8b": ModelCard(
|
||||
short_id="llama-3.1-8b",
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"),
|
||||
name="Llama 3.1 8B (4-bit)",
|
||||
description="""Llama 3.1 is a large language model trained on the Llama 3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.1 8B (4-bit)",
|
||||
storage_size=Memory.from_mb(4423),
|
||||
n_layers=32,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.1-8b-8bit": ModelCard(
|
||||
short_id="llama-3.1-8b-8bit",
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"),
|
||||
name="Llama 3.1 8B (8-bit)",
|
||||
description="""Llama 3.1 is a large language model trained on the Llama 3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"),
|
||||
pretty_name="Llama 3.1 8B (8-bit)",
|
||||
storage_size=Memory.from_mb(8540),
|
||||
n_layers=32,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.1-8b-bf16": ModelCard(
|
||||
short_id="llama-3.1-8b-bf16",
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"),
|
||||
name="Llama 3.1 8B (BF16)",
|
||||
description="""Llama 3.1 is a large language model trained on the Llama 3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"),
|
||||
pretty_name="Llama 3.1 8B (BF16)",
|
||||
storage_size=Memory.from_mb(16100),
|
||||
n_layers=32,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.1-70b": ModelCard(
|
||||
short_id="llama-3.1-70b",
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"),
|
||||
name="Llama 3.1 70B (4-bit)",
|
||||
description="""Llama 3.1 is a large language model trained on the Llama 3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.1 70B (4-bit)",
|
||||
storage_size=Memory.from_mb(38769),
|
||||
n_layers=80,
|
||||
hidden_size=8192,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# llama-3.2
|
||||
"llama-3.2-1b": ModelCard(
|
||||
short_id="llama-3.2-1b",
|
||||
model_id=ModelId("mlx-community/Llama-3.2-1B-Instruct-4bit"),
|
||||
name="Llama 3.2 1B (4-bit)",
|
||||
description="""Llama 3.2 is a large language model trained on the Llama 3.2 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.2-1B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.2 1B (4-bit)",
|
||||
storage_size=Memory.from_mb(696),
|
||||
n_layers=16,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.2-3b": ModelCard(
|
||||
short_id="llama-3.2-3b",
|
||||
model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-4bit"),
|
||||
name="Llama 3.2 3B (4-bit)",
|
||||
description="""Llama 3.2 is a large language model trained on the Llama 3.2 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.2 3B (4-bit)",
|
||||
storage_size=Memory.from_mb(1777),
|
||||
n_layers=28,
|
||||
hidden_size=3072,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.2-3b-8bit": ModelCard(
|
||||
short_id="llama-3.2-3b-8bit",
|
||||
model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-8bit"),
|
||||
name="Llama 3.2 3B (8-bit)",
|
||||
description="""Llama 3.2 is a large language model trained on the Llama 3.2 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-8bit"),
|
||||
pretty_name="Llama 3.2 3B (8-bit)",
|
||||
storage_size=Memory.from_mb(3339),
|
||||
n_layers=28,
|
||||
hidden_size=3072,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# llama-3.3
|
||||
"llama-3.3-70b": ModelCard(
|
||||
short_id="llama-3.3-70b",
|
||||
model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-4bit"),
|
||||
name="Llama 3.3 70B (4-bit)",
|
||||
description="""The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.3 70B",
|
||||
storage_size=Memory.from_mb(38769),
|
||||
n_layers=80,
|
||||
hidden_size=8192,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.3-70b-8bit": ModelCard(
|
||||
short_id="llama-3.3-70b-8bit",
|
||||
model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-8bit"),
|
||||
name="Llama 3.3 70B (8-bit)",
|
||||
description="""The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-8bit"),
|
||||
pretty_name="Llama 3.3 70B (8-bit)",
|
||||
storage_size=Memory.from_mb(73242),
|
||||
n_layers=80,
|
||||
hidden_size=8192,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.3-70b-fp16": ModelCard(
|
||||
short_id="llama-3.3-70b-fp16",
|
||||
model_id=ModelId("mlx-community/llama-3.3-70b-instruct-fp16"),
|
||||
name="Llama 3.3 70B (FP16)",
|
||||
description="""The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/llama-3.3-70b-instruct-fp16"),
|
||||
pretty_name="Llama 3.3 70B (FP16)",
|
||||
storage_size=Memory.from_mb(137695),
|
||||
n_layers=80,
|
||||
hidden_size=8192,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# qwen3
|
||||
"qwen3-0.6b": ModelCard(
|
||||
short_id="qwen3-0.6b",
|
||||
model_id=ModelId("mlx-community/Qwen3-0.6B-4bit"),
|
||||
name="Qwen3 0.6B (4-bit)",
|
||||
description="""Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-0.6B-4bit"),
|
||||
pretty_name="Qwen3 0.6B (4-bit)",
|
||||
storage_size=Memory.from_mb(327),
|
||||
n_layers=28,
|
||||
hidden_size=1024,
|
||||
supports_tensor=False,
|
||||
),
|
||||
),
|
||||
"qwen3-0.6b-8bit": ModelCard(
|
||||
short_id="qwen3-0.6b-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-0.6B-8bit"),
|
||||
name="Qwen3 0.6B (8-bit)",
|
||||
description="""Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-0.6B-8bit"),
|
||||
pretty_name="Qwen3 0.6B (8-bit)",
|
||||
storage_size=Memory.from_mb(666),
|
||||
n_layers=28,
|
||||
hidden_size=1024,
|
||||
supports_tensor=False,
|
||||
),
|
||||
),
|
||||
"qwen3-30b": ModelCard(
|
||||
short_id="qwen3-30b",
|
||||
model_id=ModelId("mlx-community/Qwen3-30B-A3B-4bit"),
|
||||
name="Qwen3 30B A3B (4-bit)",
|
||||
description="""Qwen3 30B is a large language model trained on the Qwen3 30B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-30B-A3B-4bit"),
|
||||
pretty_name="Qwen3 30B A3B (4-bit)",
|
||||
storage_size=Memory.from_mb(16797),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-30b-8bit": ModelCard(
|
||||
short_id="qwen3-30b-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-30B-A3B-8bit"),
|
||||
name="Qwen3 30B A3B (8-bit)",
|
||||
description="""Qwen3 30B is a large language model trained on the Qwen3 30B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-30B-A3B-8bit"),
|
||||
pretty_name="Qwen3 30B A3B (8-bit)",
|
||||
storage_size=Memory.from_mb(31738),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-80b-a3B-4bit": ModelCard(
|
||||
short_id="qwen3-80b-a3B-4bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"),
|
||||
name="Qwen3 80B A3B (4-bit)",
|
||||
description="""Qwen3 80B""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"),
|
||||
pretty_name="Qwen3 80B A3B (4-bit)",
|
||||
storage_size=Memory.from_mb(44800),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-80b-a3B-8bit": ModelCard(
|
||||
short_id="qwen3-80b-a3B-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"),
|
||||
name="Qwen3 80B A3B (8-bit)",
|
||||
description="""Qwen3 80B""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"),
|
||||
pretty_name="Qwen3 80B A3B (8-bit)",
|
||||
storage_size=Memory.from_mb(84700),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-80b-a3B-thinking-4bit": ModelCard(
|
||||
short_id="qwen3-80b-a3B-thinking-4bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"),
|
||||
name="Qwen3 80B A3B Thinking (4-bit)",
|
||||
description="""Qwen3 80B Reasoning model""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"),
|
||||
pretty_name="Qwen3 80B A3B (4-bit)",
|
||||
storage_size=Memory.from_mb(84700),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-80b-a3B-thinking-8bit": ModelCard(
|
||||
short_id="qwen3-80b-a3B-thinking-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"),
|
||||
name="Qwen3 80B A3B Thinking (8-bit)",
|
||||
description="""Qwen3 80B Reasoning model""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"),
|
||||
pretty_name="Qwen3 80B A3B (8-bit)",
|
||||
storage_size=Memory.from_mb(84700),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-235b-a22b-4bit": ModelCard(
|
||||
short_id="qwen3-235b-a22b-4bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"),
|
||||
name="Qwen3 235B A22B (4-bit)",
|
||||
description="""Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"),
|
||||
pretty_name="Qwen3 235B A22B (4-bit)",
|
||||
storage_size=Memory.from_gb(132),
|
||||
n_layers=94,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-235b-a22b-8bit": ModelCard(
|
||||
short_id="qwen3-235b-a22b-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"),
|
||||
name="Qwen3 235B A22B (8-bit)",
|
||||
description="""Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"),
|
||||
pretty_name="Qwen3 235B A22B (8-bit)",
|
||||
storage_size=Memory.from_gb(250),
|
||||
n_layers=94,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-coder-480b-a35b-4bit": ModelCard(
|
||||
short_id="qwen3-coder-480b-a35b-4bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"),
|
||||
name="Qwen3 Coder 480B A35B (4-bit)",
|
||||
description="""Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"),
|
||||
pretty_name="Qwen3 Coder 480B A35B (4-bit)",
|
||||
storage_size=Memory.from_gb(270),
|
||||
n_layers=62,
|
||||
hidden_size=6144,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-coder-480b-a35b-8bit": ModelCard(
|
||||
short_id="qwen3-coder-480b-a35b-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"),
|
||||
name="Qwen3 Coder 480B A35B (8-bit)",
|
||||
description="""Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"),
|
||||
pretty_name="Qwen3 Coder 480B A35B (8-bit)",
|
||||
storage_size=Memory.from_gb(540),
|
||||
n_layers=62,
|
||||
hidden_size=6144,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# gpt-oss
|
||||
"gpt-oss-120b-MXFP4-Q8": ModelCard(
|
||||
short_id="gpt-oss-120b-MXFP4-Q8",
|
||||
model_id=ModelId("mlx-community/gpt-oss-120b-MXFP4-Q8"),
|
||||
name="GPT-OSS 120B (MXFP4-Q8, MLX)",
|
||||
description="""OpenAI's GPT-OSS 120B is a 117B-parameter Mixture-of-Experts model designed for high-reasoning and general-purpose use; this variant is a 4-bit MLX conversion for Apple Silicon.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/gpt-oss-120b-MXFP4-Q8"),
|
||||
pretty_name="GPT-OSS 120B (MXFP4-Q8, MLX)",
|
||||
storage_size=Memory.from_kb(68_996_301),
|
||||
n_layers=36,
|
||||
hidden_size=2880,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"gpt-oss-20b-4bit": ModelCard(
|
||||
short_id="gpt-oss-20b-4bit",
|
||||
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
|
||||
name="GPT-OSS 20B (MXFP4-Q4, MLX)",
|
||||
description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this MLX variant uses MXFP4 4-bit quantization.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
|
||||
pretty_name="GPT-OSS 20B (MXFP4-Q4, MLX)",
|
||||
storage_size=Memory.from_kb(11_744_051),
|
||||
n_layers=24,
|
||||
hidden_size=2880,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# glm 4.5
|
||||
"glm-4.5-air-8bit": ModelCard(
|
||||
# Needs to be quantized g32 or g16 to work with tensor parallel
|
||||
short_id="glm-4.5-air-8bit",
|
||||
model_id=ModelId("mlx-community/GLM-4.5-Air-8bit"),
|
||||
name="GLM 4.5 Air 8bit",
|
||||
description="""GLM 4.5 Air 8bit""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.5-Air-8bit"),
|
||||
pretty_name="GLM 4.5 Air 8bit",
|
||||
storage_size=Memory.from_gb(114),
|
||||
n_layers=46,
|
||||
hidden_size=4096,
|
||||
supports_tensor=False,
|
||||
),
|
||||
),
|
||||
"glm-4.5-air-bf16": ModelCard(
|
||||
short_id="glm-4.5-air-bf16",
|
||||
model_id=ModelId("mlx-community/GLM-4.5-Air-bf16"),
|
||||
name="GLM 4.5 Air bf16",
|
||||
description="""GLM 4.5 Air bf16""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.5-Air-bf16"),
|
||||
pretty_name="GLM 4.5 Air bf16",
|
||||
storage_size=Memory.from_gb(214),
|
||||
n_layers=46,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# glm 4.7
|
||||
"glm-4.7-4bit": ModelCard(
|
||||
short_id="glm-4.7-4bit",
|
||||
model_id=ModelId("mlx-community/GLM-4.7-4bit"),
|
||||
name="GLM 4.7 4bit",
|
||||
description="GLM 4.7 4bit",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.7-4bit"),
|
||||
pretty_name="GLM 4.7 4bit",
|
||||
storage_size=Memory.from_bytes(198556925568),
|
||||
n_layers=91,
|
||||
hidden_size=5120,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"glm-4.7-6bit": ModelCard(
|
||||
short_id="glm-4.7-6bit",
|
||||
model_id=ModelId("mlx-community/GLM-4.7-6bit"),
|
||||
name="GLM 4.7 6bit",
|
||||
description="GLM 4.7 6bit",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.7-6bit"),
|
||||
pretty_name="GLM 4.7 6bit",
|
||||
storage_size=Memory.from_bytes(286737579648),
|
||||
n_layers=91,
|
||||
hidden_size=5120,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"glm-4.7-8bit-gs32": ModelCard(
|
||||
short_id="glm-4.7-8bit-gs32",
|
||||
model_id=ModelId("mlx-community/GLM-4.7-8bit-gs32"),
|
||||
name="GLM 4.7 8bit (gs32)",
|
||||
description="GLM 4.7 8bit (gs32)",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.7-8bit-gs32"),
|
||||
pretty_name="GLM 4.7 8bit (gs32)",
|
||||
storage_size=Memory.from_bytes(396963397248),
|
||||
n_layers=91,
|
||||
hidden_size=5120,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# minimax-m2
|
||||
"minimax-m2.1-8bit": ModelCard(
|
||||
short_id="minimax-m2.1-8bit",
|
||||
model_id=ModelId("mlx-community/MiniMax-M2.1-8bit"),
|
||||
name="MiniMax M2.1 8bit",
|
||||
description="MiniMax M2.1 8bit",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/MiniMax-M2.1-8bit"),
|
||||
pretty_name="MiniMax M2.1 8bit",
|
||||
storage_size=Memory.from_bytes(242986745856),
|
||||
n_layers=61,
|
||||
hidden_size=3072,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"minimax-m2.1-3bit": ModelCard(
|
||||
short_id="minimax-m2.1-3bit",
|
||||
model_id=ModelId("mlx-community/MiniMax-M2.1-3bit"),
|
||||
name="MiniMax M2.1 3bit",
|
||||
description="MiniMax M2.1 3bit",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/MiniMax-M2.1-3bit"),
|
||||
pretty_name="MiniMax M2.1 3bit",
|
||||
storage_size=Memory.from_bytes(100086644736),
|
||||
n_layers=61,
|
||||
hidden_size=3072,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
}
|
||||
async def save(self, path: Path) -> None:
    """Serialize this model card to TOML and write it to ``path``.

    The card is converted with ``self.model_dump()`` and rendered with
    ``tomlkit.dumps`` *before* the destination is opened, so that a
    serialization error cannot leave an already-truncated file behind.

    Args:
        path: Destination file; it is opened in ``"w"`` mode.
    """
    # Render first: mode "w" empties an existing file as soon as it is opened,
    # so nothing that can fail should run between the open and the write.
    data = tomlkit.dumps(self.model_dump())  # pyright: ignore[reportUnknownMemberType]
    async with await open_file(path, "w") as f:
        await f.write(data)
|
||||
|
||||
@staticmethod
async def from_hf(model_id: str) -> "ModelCard":
    """Build a ``ModelCard`` for an arbitrary model id.

    The text after the last ``/`` in ``model_id`` (or the whole string when
    it contains no ``/``) is used as both ``short_id`` and ``name``; the
    metadata comes from awaiting ``get_model_meta(model_id)``.
    """
    # Tail of the id after the final "/", e.g. "org/repo" -> "repo".
    _, _, repo_name = model_id.rpartition("/")
    typed_id = ModelId(model_id)
    summary = f"Custom model from {model_id}"
    fetched_metadata = await get_model_meta(model_id)
    return ModelCard(
        short_id=repo_name,
        model_id=typed_id,
        name=repo_name,
        description=summary,
        tags=[],
        metadata=fetched_metadata,
    )
|
||||
|
||||
@@ -6,7 +6,6 @@ from huggingface_hub import model_info
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from exo.shared.models.model_cards import MODEL_CARDS
|
||||
from exo.shared.types.memory import Memory
|
||||
from exo.shared.types.models import ModelId, ModelMetadata
|
||||
from exo.worker.download.download_utils import (
|
||||
@@ -108,19 +107,13 @@ async def _get_model_meta(model_id: str) -> ModelMetadata:
|
||||
config_data = await get_config_data(model_id)
|
||||
num_layers = config_data.layer_count
|
||||
mem_size_bytes = await get_safetensors_size(model_id)
|
||||
model_card = next(
|
||||
(card for card in MODEL_CARDS.values() if card.model_id == ModelId(model_id)),
|
||||
None,
|
||||
)
|
||||
|
||||
return ModelMetadata(
|
||||
model_id=ModelId(model_id),
|
||||
pretty_name=model_card.name if model_card is not None else model_id,
|
||||
pretty_name=model_id,
|
||||
storage_size=mem_size_bytes,
|
||||
n_layers=num_layers,
|
||||
hidden_size=config_data.hidden_size or 0,
|
||||
# TODO: all custom models currently do not support tensor. We could add a dynamic test for this?
|
||||
supports_tensor=model_card.metadata.supports_tensor
|
||||
if model_card is not None
|
||||
else False,
|
||||
supports_tensor=False,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user