Compare commits


1 Commit

Author   SHA1         Message                              Date
Evan     642b1bb1b4   migrate model cards to .toml files   2026-01-15 17:07:48 +00:00
45 changed files with 1376 additions and 898 deletions

View File

@@ -60,39 +60,12 @@
return models;
});
// Track previous model IDs to detect newly added models (plain variable to avoid reactive loop)
let previousModelIds: Set<string> = new Set();
// Auto-select the first available model if none is selected, if current selection is stale, or if a new model is added
// Auto-select the first available model if none is selected
$effect(() => {
const models = availableModels();
const currentModelIds = new Set(models.map(m => m.id));
if (models.length > 0) {
// Find newly added models (in current but not in previous)
const newModels = models.filter(m => !previousModelIds.has(m.id));
// If no model selected, select the first available
if (!currentModel) {
setSelectedChatModel(models[0].id);
}
// If current model is stale (no longer has a running instance), reset to first available
else if (!models.some(m => m.id === currentModel)) {
setSelectedChatModel(models[0].id);
}
// If a new model was just added, select it
else if (newModels.length > 0 && previousModelIds.size > 0) {
setSelectedChatModel(newModels[0].id);
}
} else {
// No instances running - clear the selected model
if (currentModel) {
setSelectedChatModel('');
}
if (models.length > 0 && !currentModel) {
setSelectedChatModel(models[0].id);
}
// Update previous model IDs for next comparison
previousModelIds = currentModelIds;
});
function getInstanceModelId(instanceWrapped: unknown): string {

View File

@@ -400,8 +400,10 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
const errorText = await response.text();
console.error('Failed to launch instance:', errorText);
} else {
// Always auto-select the newly launched model so the user chats to what they just launched
setSelectedChatModel(modelId);
// Auto-select the launched model only if no model is currently selected
if (!selectedChatModel()) {
setSelectedChatModel(modelId);
}
// Scroll to the bottom of instances container to show the new instance
// Use multiple attempts to ensure DOM has updated with the new instance
@@ -761,10 +763,6 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
async function deleteInstance(instanceId: string) {
if (!confirm(`Delete instance ${instanceId.slice(0, 8)}...?`)) return;
// Get the model ID of the instance being deleted before we delete it
const deletedInstanceModelId = getInstanceModelId(instanceData[instanceId]);
const wasSelected = selectedChatModel() === deletedInstanceModelId;
try {
const response = await fetch(`/instance/${instanceId}`, {
method: 'DELETE',
@@ -773,24 +771,6 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
if (!response.ok) {
console.error('Failed to delete instance:', response.status);
} else if (wasSelected) {
// If we deleted the currently selected model, switch to another available model
// Find another instance that isn't the one we just deleted
const remainingInstances = Object.entries(instanceData).filter(([id]) => id !== instanceId);
if (remainingInstances.length > 0) {
// Select the last instance (most recently added, since objects preserve insertion order)
const [, lastInstance] = remainingInstances[remainingInstances.length - 1];
const newModelId = getInstanceModelId(lastInstance);
if (newModelId && newModelId !== 'Unknown' && newModelId !== 'Unknown Model') {
setSelectedChatModel(newModelId);
} else {
// Clear selection if no valid model found
setSelectedChatModel('');
}
} else {
// No more instances, clear the selection
setSelectedChatModel('');
}
}
} catch (error) {
console.error('Error deleting instance:', error);

View File

@@ -1,5 +1,3 @@
export NIX_CONFIG := "extra-experimental-features = nix-command flakes"
fmt:
nix fmt

View File

@@ -23,6 +23,7 @@ dependencies = [
"tiktoken>=0.12.0", # required for kimi k2 tokenizer
"hypercorn>=0.18.0",
"openai-harmony>=0.0.8",
"tomlkit>=0.14.0",
]
[project.scripts]
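The only new dependency is tomlkit. Unlike the stdlib tomllib, tomlkit can serialize TOML as well as parse it, which the ModelCard.save() added later in this commit relies on. A minimal sketch of the distinction (the example value is made up):

import tomllib   # stdlib, parse-only (Python 3.11+)
import tomlkit   # parses *and* emits TOML, preserving comments and formatting

text = 'short_id = "llama-3.2-1b"  # example card field\n'

tomllib.loads(text)                 # plain dict; the comment is discarded
doc = tomlkit.loads(text)           # TOML document; the comment is kept
doc["short_id"] = "llama-3.2-3b"
print(tomlkit.dumps(doc), end="")   # round-trips with the comment intact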

View File

@@ -0,0 +1,15 @@
short_id = "deepseek-v3.1-4bit"
model_id = "mlx-community/DeepSeek-V3.1-4bit"
name = "DeepSeek V3.1 (4-bit)"
description = "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/DeepSeek-V3.1-4bit"
pretty_name = "DeepSeek V3.1 (4-bit)"
n_layers = 61
hidden_size = 7168
supports_tensor = true
[metadata.storage_size]
in_bytes = 405874409472
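Each of the new model-card .toml files follows this same shape: top-level short_id, model_id, name, description and tags, plus a [metadata] table with a nested [metadata.storage_size]. A hedged sketch of reading one directly (the filename and working directory are assumptions; the keys are the ones above):

import tomlkit

with open("deepseek-v3.1-4bit.toml") as f:   # path assumed
    card = tomlkit.loads(f.read())

size = card["metadata"]["storage_size"]["in_bytes"]
print(card["short_id"], f"{size / 2**30:.0f} GiB")   # 405874409472 bytes = 378 GiB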

View File

@@ -0,0 +1,15 @@
short_id = "deepseek-v3.1-8bit"
model_id = "mlx-community/DeepSeek-V3.1-8bit"
name = "DeepSeek V3.1 (8-bit)"
description = "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/DeepSeek-V3.1-8bit"
pretty_name = "DeepSeek V3.1 (8-bit)"
n_layers = 61
hidden_size = 7168
supports_tensor = true
[metadata.storage_size]
in_bytes = 765577920512

View File

@@ -0,0 +1,15 @@
short_id = "glm-4.5-air-8bit"
model_id = "mlx-community/GLM-4.5-Air-8bit"
name = "GLM 4.5 Air 8bit"
description = "GLM 4.5 Air 8bit"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.5-Air-8bit"
pretty_name = "GLM 4.5 Air 8bit"
n_layers = 46
hidden_size = 4096
supports_tensor = false
[metadata.storage_size]
in_bytes = 122406567936

View File

@@ -0,0 +1,15 @@
short_id = "glm-4.5-air-bf16"
model_id = "mlx-community/GLM-4.5-Air-bf16"
name = "GLM 4.5 Air bf16"
description = "GLM 4.5 Air bf16"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.5-Air-bf16"
pretty_name = "GLM 4.5 Air bf16"
n_layers = 46
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 229780750336

View File

@@ -0,0 +1,15 @@
short_id = "glm-4.7-4bit"
model_id = "mlx-community/GLM-4.7-4bit"
name = "GLM 4.7 4bit"
description = "GLM 4.7 4bit"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.7-4bit"
pretty_name = "GLM 4.7 4bit"
n_layers = 91
hidden_size = 5120
supports_tensor = true
[metadata.storage_size]
in_bytes = 198556925568

View File

@@ -0,0 +1,15 @@
short_id = "glm-4.7-6bit"
model_id = "mlx-community/GLM-4.7-6bit"
name = "GLM 4.7 6bit"
description = "GLM 4.7 6bit"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.7-6bit"
pretty_name = "GLM 4.7 6bit"
n_layers = 91
hidden_size = 5120
supports_tensor = true
[metadata.storage_size]
in_bytes = 286737579648

View File

@@ -0,0 +1,15 @@
short_id = "glm-4.7-8bit-gs32"
model_id = "mlx-community/GLM-4.7-8bit-gs32"
name = "GLM 4.7 8bit (gs32)"
description = "GLM 4.7 8bit (gs32)"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.7-8bit-gs32"
pretty_name = "GLM 4.7 8bit (gs32)"
n_layers = 91
hidden_size = 5120
supports_tensor = true
[metadata.storage_size]
in_bytes = 396963397248

View File

@@ -0,0 +1,15 @@
short_id = "gpt-oss-120b-MXFP4-Q8"
model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
name = "GPT-OSS 120B (MXFP4-Q8, MLX)"
description = "OpenAI's GPT-OSS 120B is a 117B-parameter Mixture-of-Experts model designed for high-reasoning and general-purpose use; this variant is a 4-bit MLX conversion for Apple Silicon."
tags = []
[metadata]
model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
pretty_name = "GPT-OSS 120B (MXFP4-Q8, MLX)"
n_layers = 36
hidden_size = 2880
supports_tensor = true
[metadata.storage_size]
in_bytes = 70652212224

View File

@@ -0,0 +1,15 @@
short_id = "gpt-oss-20b-4bit"
model_id = "mlx-community/gpt-oss-20b-MXFP4-Q4"
name = "GPT-OSS 20B (MXFP4-Q4, MLX)"
description = "OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this MLX variant uses MXFP4 4-bit quantization."
tags = []
[metadata]
model_id = "mlx-community/gpt-oss-20b-MXFP4-Q4"
pretty_name = "GPT-OSS 20B (MXFP4-Q4, MLX)"
n_layers = 24
hidden_size = 2880
supports_tensor = true
[metadata.storage_size]
in_bytes = 12025908224
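A small arithmetic cross-check on this entry: the in_bytes value above is exactly 11,744,051 KiB, which matches the Memory.from_kb(11_744_051) kept for this card in the Python model registry later in this diff (assuming from_kb means KiB, i.e. 1024-byte units):

assert 11_744_051 * 1024 == 12_025_908_224   # in_bytes for gpt-oss-20b-4bit
print(f"{12_025_908_224 / 2**30:.1f} GiB")   # ≈ 11.2 GiB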

View File

@@ -0,0 +1,15 @@
short_id = "kimi-k2-instruct-4bit"
model_id = "mlx-community/Kimi-K2-Instruct-4bit"
name = "Kimi K2 Instruct (4-bit)"
description = "Kimi K2 is a large language model trained on the Kimi K2 dataset."
tags = []
[metadata]
model_id = "mlx-community/Kimi-K2-Instruct-4bit"
pretty_name = "Kimi K2 Instruct (4-bit)"
n_layers = 61
hidden_size = 7168
supports_tensor = true
[metadata.storage_size]
in_bytes = 620622774272

View File

@@ -0,0 +1,15 @@
short_id = "kimi-k2-thinking"
model_id = "mlx-community/Kimi-K2-Thinking"
name = "Kimi K2 Thinking (4-bit)"
description = "Kimi K2 Thinking is the latest, most capable version of open-source thinking model."
tags = []
[metadata]
model_id = "mlx-community/Kimi-K2-Thinking"
pretty_name = "Kimi K2 Thinking (4-bit)"
n_layers = 61
hidden_size = 7168
supports_tensor = true
[metadata.storage_size]
in_bytes = 706522120192

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.1-70b"
model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
name = "Llama 3.1 70B (4-bit)"
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
pretty_name = "Llama 3.1 70B (4-bit)"
n_layers = 80
hidden_size = 8192
supports_tensor = true
[metadata.storage_size]
in_bytes = 40652242944

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.1-8b-8bit"
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
name = "Llama 3.1 8B (8-bit)"
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
pretty_name = "Llama 3.1 8B (8-bit)"
n_layers = 32
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 8954839040

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.1-8b-bf16"
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
name = "Llama 3.1 8B (BF16)"
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
pretty_name = "Llama 3.1 8B (BF16)"
n_layers = 32
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 16882073600

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.1-8b"
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
name = "Llama 3.1 8B (4-bit)"
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
pretty_name = "Llama 3.1 8B (4-bit)"
n_layers = 32
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 4637851648

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.2-1b"
model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
name = "Llama 3.2 1B (4-bit)"
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
tags = []
[metadata]
model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
pretty_name = "Llama 3.2 1B (4-bit)"
n_layers = 16
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 729808896

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.2-3b-8bit"
model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
name = "Llama 3.2 3B (8-bit)"
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
tags = []
[metadata]
model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
pretty_name = "Llama 3.2 3B (8-bit)"
n_layers = 28
hidden_size = 3072
supports_tensor = true
[metadata.storage_size]
in_bytes = 3501195264

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.2-3b"
model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
name = "Llama 3.2 3B (4-bit)"
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
tags = []
[metadata]
model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
pretty_name = "Llama 3.2 3B (4-bit)"
n_layers = 28
hidden_size = 3072
supports_tensor = true
[metadata.storage_size]
in_bytes = 1863319552

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.3-70b-8bit"
model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
name = "Llama 3.3 70B (8-bit)"
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
tags = []
[metadata]
model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
pretty_name = "Llama 3.3 70B (8-bit)"
n_layers = 80
hidden_size = 8192
supports_tensor = true
[metadata.storage_size]
in_bytes = 76799803392

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.3-70b-fp16"
model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
name = "Llama 3.3 70B (FP16)"
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
tags = []
[metadata]
model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
pretty_name = "Llama 3.3 70B (FP16)"
n_layers = 80
hidden_size = 8192
supports_tensor = true
[metadata.storage_size]
in_bytes = 144383672320

View File

@@ -0,0 +1,15 @@
short_id = "llama-3.3-70b"
model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
name = "Llama 3.3 70B (4-bit)"
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
tags = []
[metadata]
model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
pretty_name = "Llama 3.3 70B"
n_layers = 80
hidden_size = 8192
supports_tensor = true
[metadata.storage_size]
in_bytes = 40652242944

View File

@@ -0,0 +1,15 @@
short_id = "minimax-m2.1-3bit"
model_id = "mlx-community/MiniMax-M2.1-3bit"
name = "MiniMax M2.1 3bit"
description = "MiniMax M2.1 3bit"
tags = []
[metadata]
model_id = "mlx-community/MiniMax-M2.1-3bit"
pretty_name = "MiniMax M2.1 3bit"
n_layers = 61
hidden_size = 3072
supports_tensor = true
[metadata.storage_size]
in_bytes = 100086644736

View File

@@ -0,0 +1,15 @@
short_id = "minimax-m2.1-8bit"
model_id = "mlx-community/MiniMax-M2.1-8bit"
name = "MiniMax M2.1 8bit"
description = "MiniMax M2.1 8bit"
tags = []
[metadata]
model_id = "mlx-community/MiniMax-M2.1-8bit"
pretty_name = "MiniMax M2.1 8bit"
n_layers = 61
hidden_size = 3072
supports_tensor = true
[metadata.storage_size]
in_bytes = 242986745856

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-0.6b-8bit"
model_id = "mlx-community/Qwen3-0.6B-8bit"
name = "Qwen3 0.6B (8-bit)"
description = "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-0.6B-8bit"
pretty_name = "Qwen3 0.6B (8-bit)"
n_layers = 28
hidden_size = 1024
supports_tensor = false
[metadata.storage_size]
in_bytes = 698351616

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-0.6b"
model_id = "mlx-community/Qwen3-0.6B-4bit"
name = "Qwen3 0.6B (4-bit)"
description = "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-0.6B-4bit"
pretty_name = "Qwen3 0.6B (4-bit)"
n_layers = 28
hidden_size = 1024
supports_tensor = false
[metadata.storage_size]
in_bytes = 342884352

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-235b-a22b-4bit"
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
name = "Qwen3 235B A22B (4-bit)"
description = "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
pretty_name = "Qwen3 235B A22B (4-bit)"
n_layers = 94
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 141733920768

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-235b-a22b-8bit"
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
name = "Qwen3 235B A22B (8-bit)"
description = "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
pretty_name = "Qwen3 235B A22B (8-bit)"
n_layers = 94
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 268435456000

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-30b-8bit"
model_id = "mlx-community/Qwen3-30B-A3B-8bit"
name = "Qwen3 30B A3B (8-bit)"
description = "Qwen3 30B is a large language model trained on the Qwen3 30B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-30B-A3B-8bit"
pretty_name = "Qwen3 30B A3B (8-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 33279705088

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-30b"
model_id = "mlx-community/Qwen3-30B-A3B-4bit"
name = "Qwen3 30B A3B (4-bit)"
description = "Qwen3 30B is a large language model trained on the Qwen3 30B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-30B-A3B-4bit"
pretty_name = "Qwen3 30B A3B (4-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 17612931072

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-80b-a3B-4bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
name = "Qwen3 80B A3B (4-bit)"
description = "Qwen3 80B"
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
pretty_name = "Qwen3 80B A3B (4-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 46976204800

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-80b-a3B-8bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
name = "Qwen3 80B A3B (8-bit)"
description = "Qwen3 80B"
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
pretty_name = "Qwen3 80B A3B (8-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 88814387200

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-80b-a3B-thinking-4bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
name = "Qwen3 80B A3B Thinking (4-bit)"
description = "Qwen3 80B Reasoning model"
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
pretty_name = "Qwen3 80B A3B (4-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 88814387200

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-80b-a3B-thinking-8bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
name = "Qwen3 80B A3B Thinking (8-bit)"
description = "Qwen3 80B Reasoning model"
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
pretty_name = "Qwen3 80B A3B (8-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 88814387200

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-coder-480b-a35b-4bit"
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
name = "Qwen3 Coder 480B A35B (4-bit)"
description = "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
pretty_name = "Qwen3 Coder 480B A35B (4-bit)"
n_layers = 62
hidden_size = 6144
supports_tensor = true
[metadata.storage_size]
in_bytes = 289910292480

View File

@@ -0,0 +1,15 @@
short_id = "qwen3-coder-480b-a35b-8bit"
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
name = "Qwen3 Coder 480B A35B (8-bit)"
description = "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
pretty_name = "Qwen3 Coder 480B A35B (8-bit)"
n_layers = 62
hidden_size = 6144
supports_tensor = true
[metadata.storage_size]
in_bytes = 579820584960

View File

@@ -13,6 +13,12 @@ from hypercorn.asyncio import serve # pyright: ignore[reportUnknownVariableType
from hypercorn.config import Config
from hypercorn.typing import ASGIFramework
from loguru import logger
from openai_harmony import ( # pyright: ignore[reportMissingTypeStubs]
HarmonyEncodingName,
Role,
StreamableParser,
load_harmony_encoding,
)
from exo.master.placement import place_instance as get_instance_placements
from exo.shared.apply import apply
@@ -61,6 +67,8 @@ from exo.utils.channels import Receiver, Sender, channel
from exo.utils.dashboard_path import find_dashboard
from exo.utils.event_buffer import OrderedBuffer
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
def chunk_to_response(
chunk: TokenChunk, command_id: CommandId
@@ -373,8 +381,35 @@ class API:
instance_id=instance_id,
)
async def _process_gpt_oss(self, token_chunks: Receiver[TokenChunk]):
stream = StreamableParser(encoding, role=Role.ASSISTANT)
thinking = False
async for chunk in token_chunks:
stream.process(chunk.token_id)
delta = stream.last_content_delta
ch = stream.current_channel
if ch == "analysis" and not thinking:
thinking = True
yield chunk.model_copy(update={"text": "<think>"})
if ch != "analysis" and thinking:
thinking = False
yield chunk.model_copy(update={"text": "</think>"})
if delta:
yield chunk.model_copy(update={"text": delta})
if chunk.finish_reason is not None:
if thinking:
yield chunk.model_copy(update={"text": "</think>"})
yield chunk
break
async def _chat_chunk_stream(
self, command_id: CommandId
self, command_id: CommandId, parse_gpt_oss: bool
) -> AsyncGenerator[TokenChunk, None]:
"""Yield `TokenChunk`s for a given command until completion."""
@@ -382,10 +417,16 @@ class API:
self._chat_completion_queues[command_id], recv = channel[TokenChunk]()
with recv as token_chunks:
async for chunk in token_chunks:
yield chunk
if chunk.finish_reason is not None:
break
if parse_gpt_oss:
async for chunk in self._process_gpt_oss(token_chunks):
yield chunk
if chunk.finish_reason is not None:
break
else:
async for chunk in token_chunks:
yield chunk
if chunk.finish_reason is not None:
break
except anyio.get_cancelled_exc_class():
# TODO: TaskCancelled
@@ -401,11 +442,11 @@ class API:
del self._chat_completion_queues[command_id]
async def _generate_chat_stream(
self, command_id: CommandId
self, command_id: CommandId, parse_gpt_oss: bool
) -> AsyncGenerator[str, None]:
"""Generate chat completion stream as JSON strings."""
async for chunk in self._chat_chunk_stream(command_id):
async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
chunk_response: ChatCompletionResponse = chunk_to_response(
chunk, command_id
)
@@ -417,7 +458,7 @@ class API:
yield "data: [DONE]\n\n"
async def _collect_chat_completion(
self, command_id: CommandId
self, command_id: CommandId, parse_gpt_oss: bool
) -> ChatCompletionResponse:
"""Collect all token chunks for a chat completion and return a single response."""
@@ -425,7 +466,7 @@ class API:
model: str | None = None
finish_reason: FinishReason | None = None
async for chunk in self._chat_chunk_stream(command_id):
async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
if model is None:
model = chunk.model
@@ -454,7 +495,7 @@ class API:
)
async def _collect_chat_completion_with_stats(
self, command_id: CommandId
self, command_id: CommandId, parse_gpt_oss: bool
) -> BenchChatCompletionResponse:
text_parts: list[str] = []
model: str | None = None
@@ -462,7 +503,7 @@ class API:
stats: GenerationStats | None = None
async for chunk in self._chat_chunk_stream(command_id):
async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
if model is None:
model = chunk.model
@@ -503,6 +544,8 @@ class API:
"""Handle chat completions, supporting both streaming and non-streaming responses."""
model_meta = await resolve_model_meta(payload.model)
payload.model = model_meta.model_id
parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
logger.info(f"{parse_gpt_oss=}")
if not any(
instance.shard_assignments.model_id == payload.model
@@ -519,16 +562,17 @@ class API:
await self._send(command)
if payload.stream:
return StreamingResponse(
self._generate_chat_stream(command.command_id),
self._generate_chat_stream(command.command_id, parse_gpt_oss),
media_type="text/event-stream",
)
return await self._collect_chat_completion(command.command_id)
return await self._collect_chat_completion(command.command_id, parse_gpt_oss)
async def bench_chat_completions(
self, payload: BenchChatCompletionTaskParams
) -> BenchChatCompletionResponse:
model_meta = await resolve_model_meta(payload.model)
parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
payload.model = model_meta.model_id
if not any(
@@ -545,7 +589,10 @@ class API:
command = ChatCompletion(request_params=payload)
await self._send(command)
response = await self._collect_chat_completion_with_stats(command.command_id)
response = await self._collect_chat_completion_with_stats(
command.command_id,
parse_gpt_oss,
)
return response
def _calculate_total_available_memory(self) -> Memory:
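The Harmony-based GPT-OSS handling now lives in the API: _process_gpt_oss wraps content from the model's "analysis" channel in <think>…</think> and passes other channels' content through. A standalone, synchronous sketch of that mapping (taking token_ids as a plain list is an illustrative assumption; the openai_harmony calls are the ones used above):

from openai_harmony import (
    HarmonyEncodingName,
    Role,
    StreamableParser,
    load_harmony_encoding,
)

def render_gpt_oss(token_ids: list[int]) -> str:
    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    stream = StreamableParser(encoding, role=Role.ASSISTANT)
    out: list[str] = []
    thinking = False
    for token_id in token_ids:
        stream.process(token_id)
        if stream.current_channel == "analysis" and not thinking:
            thinking = True
            out.append("<think>")
        if stream.current_channel != "analysis" and thinking:
            thinking = False
            out.append("</think>")
        if stream.last_content_delta:
            out.append(stream.last_content_delta)
    if thinking:  # close the tag if generation ends mid-analysis
        out.append("</think>")
    return "".join(out)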

View File

@@ -1,3 +1,6 @@
from anyio import Path, open_file
import tomlkit
from exo.shared.types.memory import Memory
from exo.shared.types.models import ModelId, ModelMetadata
from exo.utils.pydantic_ext import CamelCaseModel
@@ -11,6 +14,21 @@ class ModelCard(CamelCaseModel):
tags: list[str]
metadata: ModelMetadata
@staticmethod
async def load(path: Path) -> "ModelCard":
async with await open_file(path) as f:
data = await f.read()
py = tomlkit.loads(data)
return ModelCard.model_validate(py)
async def save(self, path: Path):
async with await open_file(path, "w") as f:
py = self.model_dump()
data = tomlkit.dumps(py) # pyright: ignore[reportUnknownMemberType]
await f.write(data)
MODEL_CARDS: dict[str, ModelCard] = {
# deepseek v3
@@ -425,15 +443,15 @@ MODEL_CARDS: dict[str, ModelCard] = {
supports_tensor=True,
),
),
"gpt-oss-20b-MXFP4-Q8": ModelCard(
short_id="gpt-oss-20b-MXFP4-Q8",
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
name="GPT-OSS 20B (MXFP4-Q8, MLX)",
description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this variant is a 4-bit MLX conversion for Apple Silicon.""",
"gpt-oss-20b-4bit": ModelCard(
short_id="gpt-oss-20b-4bit",
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
name="GPT-OSS 20B (MXFP4-Q4, MLX)",
description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this MLX variant uses MXFP4 4-bit quantization.""",
tags=[],
metadata=ModelMetadata(
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
pretty_name="GPT-OSS 20B (MXFP4-Q8, MLX)",
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q4"),
pretty_name="GPT-OSS 20B (MXFP4-Q4, MLX)",
storage_size=Memory.from_kb(11_744_051),
n_layers=24,
hidden_size=2880,
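Usage sketch for the new ModelCard.load()/save() round-trip (the card path and the ModelCard import location are assumptions; the two methods themselves are added in this diff):

import anyio
from anyio import Path

from exo.shared.models import ModelCard   # import path assumed

async def tidy_description(path: Path) -> None:
    card = await ModelCard.load(path)   # tomlkit.loads + pydantic validation
    card = card.model_copy(update={"description": card.description.strip()})
    await card.save(path)               # model_dump + tomlkit.dumps

anyio.run(tidy_description, Path("model_cards/llama-3.2-1b.toml"))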

View File

@@ -20,7 +20,6 @@ except ImportError:
from mlx_lm.models.cache import KVCache, QuantizedKVCache, RotatingKVCache
from mlx_lm.models.deepseek_v3 import DeepseekV3Model
from mlx_lm.models.gpt_oss import Model as GptOssModel
from mlx_lm.tokenizer_utils import TokenizerWrapper
from exo.worker.engines.mlx.constants import (
@@ -366,8 +365,6 @@ def apply_chat_template(
tools=chat_task_data.tools,
)
logger.info(prompt)
return prompt
@@ -399,11 +396,6 @@ def make_kv_cache(
) -> list[KVCache | RotatingKVCache | QuantizedKVCache]:
assert hasattr(model, "layers")
# TODO: Do this for all models
if hasattr(model, "make_cache") and isinstance(model, GptOssModel):
logger.info("Using MLX LM's make cache")
return model.make_cache() # type: ignore
if max_kv_size is None:
if KV_CACHE_BITS is None:
logger.info("Using default KV cache")

View File

@@ -1,15 +1,6 @@
import time
from collections.abc import Generator
from functools import cache
import mlx.core as mx
from mlx_lm.models.gpt_oss import Model as GptOssModel
from openai_harmony import ( # pyright: ignore[reportMissingTypeStubs]
HarmonyEncodingName,
Role,
StreamableParser,
load_harmony_encoding,
)
from exo.shared.types.api import ChatCompletionMessageText
from exo.shared.types.chunks import TokenChunk
@@ -162,19 +153,11 @@ def main(
_check_for_debug_prompts(task_params.messages[0].content)
# Generate responses using the actual MLX generation
mlx_generator = mlx_generate(
for response in mlx_generate(
model=model,
tokenizer=tokenizer,
task=task_params,
)
# GPT-OSS specific parsing to match other model formats.
if isinstance(model, GptOssModel):
mlx_generator = parse_gpt_oss(mlx_generator)
# TODO: Add tool call parser here
for response in mlx_generator:
):
match response:
case GenerationResponse():
if shard_metadata.device_rank == 0:
@@ -224,43 +207,6 @@ def main(
break
@cache
def get_gpt_oss_encoding():
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
return encoding
def parse_gpt_oss(
responses: Generator[GenerationResponse],
) -> Generator[GenerationResponse]:
encoding = get_gpt_oss_encoding()
stream = StreamableParser(encoding, role=Role.ASSISTANT)
thinking = False
for response in responses:
stream.process(response.token)
delta = stream.last_content_delta
ch = stream.current_channel
if ch == "analysis" and not thinking:
thinking = True
yield response.model_copy(update={"text": "<think>"})
if ch != "analysis" and thinking:
thinking = False
yield response.model_copy(update={"text": "</think>"})
if delta:
yield response.model_copy(update={"text": delta})
if response.finish_reason is not None:
if thinking:
yield response.model_copy(update={"text": "</think>"})
yield response
break
EXO_RUNNER_MUST_FAIL = "EXO RUNNER MUST FAIL"
EXO_RUNNER_MUST_OOM = "EXO RUNNER MUST OOM"
EXO_RUNNER_MUST_TIMEOUT = "EXO RUNNER MUST TIMEOUT"

View File

@@ -1,5 +1,4 @@
import http.client
import time
from anyio import create_task_group, to_thread
from loguru import logger
@@ -7,8 +6,6 @@ from loguru import logger
from exo.shared.topology import Topology
from exo.shared.types.common import NodeId
BAD_STATUSLINE_ATTEMPTS = 3
async def check_reachability(
target_ip: str,
@@ -18,9 +15,8 @@ async def check_reachability(
) -> None:
"""Check if a node is reachable at the given IP and verify its identity."""
# TODO: use an async http client
def _fetch_remote_node_id(*, attempt: int = 1) -> NodeId | None:
connection = http.client.HTTPConnection(target_ip, 52415, timeout=3)
def _fetch_remote_node_id() -> NodeId | None:
connection = http.client.HTTPConnection(target_ip, 52415, timeout=1)
try:
connection.request("GET", "/node_id")
response = connection.getresponse()
@@ -36,16 +32,7 @@ async def check_reachability(
return NodeId(body) or None
except OSError:
return None
except http.client.BadStatusLine:
if attempt >= BAD_STATUSLINE_ATTEMPTS:
logger.warning(
f"BadStatusLine from {target_ip}, after {attempt} attempts, assuming connection to {expected_node_id} has dropped"
)
return None
time.sleep(1)
return _fetch_remote_node_id(attempt=attempt + 1)
except http.client.HTTPException as e:
logger.warning(f"HTTPException from {target_ip}: {type(e).__name__}: {e}")
except http.client.HTTPException:
return None
finally:
connection.close()
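With the BadStatusLine retry loop gone, the reachability probe is a single attempt with a 1-second timeout; any transport or HTTP error now reports the node as unreachable. A standalone sketch of the resulting probe (port 52415 and the /node_id route are from the diff; the rest is illustrative):

import http.client

def fetch_remote_node_id(target_ip: str) -> str | None:
    connection = http.client.HTTPConnection(target_ip, 52415, timeout=1)
    try:
        connection.request("GET", "/node_id")
        response = connection.getresponse()
        body = response.read().decode().strip()
        return body or None
    except (OSError, http.client.HTTPException):
        return None
    finally:
        connection.close()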

uv.lock (generated): 1493 changed lines
View File

File diff suppressed because it is too large.