mirror of
https://github.com/exo-explore/exo.git
synced 2026-01-21 20:39:59 -05:00
Compare commits
1 Commits
leo/add-to
...
model-card
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c2f9f50f7e |
@@ -1,378 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# pyright: reportAny=false, reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false, reportMissingTypeStubs=false
|
||||
"""
|
||||
exo-eval: Run SWE-bench evaluation against exo using OpenHands SDK (local, no Docker).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from datasets import load_dataset
|
||||
from loguru import logger
|
||||
from openhands.sdk import LLM, Agent, Conversation, Tool
|
||||
from openhands.tools.file_editor import FileEditorTool
|
||||
from openhands.tools.terminal import TerminalTool
|
||||
|
||||
|
||||
class EvalStatus(str, Enum):
    """Outcome of evaluating a single SWE-bench instance.

    The ``str`` mix-in makes every member an actual string, so a member
    compares equal to its plain-text value.
    """

    # The verification tests confirmed the agent's fix.
    Resolved = "Resolved"
    # The evaluation completed but the fix was not verified by the tests.
    Failed = "Failed"
    # An exception interrupted the evaluation.
    Error = "Error"
    # The wall-clock budget ran out while the agent was still working.
    Timeout = "Timeout"
|
||||
|
||||
|
||||
@dataclass
class EvalResult:
    """Record describing the evaluation of one SWE-bench instance."""

    # Value of the dataset row's "instance_id" field.
    instance_id: str
    # Value of the dataset row's "repo" field.
    repo: str
    # Final outcome of the evaluation.
    status: EvalStatus
    # Seconds elapsed between the start of the evaluation and its result.
    elapsed_seconds: float
    # Test identifiers whose run succeeded.
    tests_passed: list[str]
    # Test identifiers whose run did not succeed.
    tests_failed: list[str]
    # Description of the exception when the evaluation errored, else None.
    error_message: str | None = None
|
||||
|
||||
|
||||
def load_swe_bench(
    split: str = "lite",
    limit: int | None = None,
    instance_ids: list[str] | None = None,
) -> list[dict[str, Any]]:
    """Load SWE-bench instances from HuggingFace as plain dicts.

    Args:
        split: ``"lite"`` selects the ``test`` split of
            ``princeton-nlp/SWE-bench_Lite``; any other value is passed
            through as the split name of ``princeton-nlp/SWE-bench``.
        limit: Maximum number of instances to return. ``None`` and ``0``
            both disable truncation.
        instance_ids: When non-empty, only instances whose ``instance_id``
            is listed are kept.

    Returns:
        The selected rows in dataset order. The ID filter is applied before
        the limit.
    """
    is_lite = split == "lite"
    # SWE-bench Lite is a curated 300-instance subset
    dataset_name = (
        "princeton-nlp/SWE-bench_Lite" if is_lite else "princeton-nlp/SWE-bench"
    )
    actual_split = "test" if is_lite else split

    ds = load_dataset(dataset_name, split=actual_split)
    instances = [dict(row) for row in ds]

    if instance_ids:
        # Build the lookup set once instead of scanning the list per row.
        wanted = set(instance_ids)
        instances = [inst for inst in instances if inst["instance_id"] in wanted]

    # Truthiness is deliberate: a limit of 0 means "no limit".
    if limit:
        instances = instances[:limit]

    return instances
|
||||
|
||||
|
||||
def clone_repo_at_commit(repo: str, commit: str, dest: Path) -> None:
    """Clone a GitHub repository into *dest* and check out *commit*.

    Args:
        repo: Repository slug used to build ``https://github.com/{repo}.git``.
        commit: Commit to fetch and check out.
        dest: Directory the repository is cloned into.

    Raises:
        subprocess.CalledProcessError: If any git command exits non-zero.
    """
    repo_url = f"https://github.com/{repo}.git"

    # Shallow clone first; the wanted commit need not be the branch tip, so
    # it is fetched explicitly afterwards.
    subprocess.run(
        ["git", "clone", "--depth", "1", repo_url, str(dest)],
        check=True,
        capture_output=True,
    )

    follow_ups = (
        ["git", "fetch", "--depth", "1", "origin", commit],
        ["git", "checkout", commit],
    )
    for command in follow_ups:
        subprocess.run(command, cwd=dest, check=True, capture_output=True)
|
||||
|
||||
|
||||
def build_agent_prompt(instance: dict[str, Any]) -> str:
    """Build the task prompt sent to the agent for one SWE-bench instance.

    Args:
        instance: Dataset row; its ``"repo"`` and ``"problem_statement"``
            entries are interpolated into the prompt.

    Returns:
        The prompt text, ending with a newline.

    Raises:
        KeyError: If either required entry is missing from *instance*.
    """
    return f"""You are a software engineer fixing a bug in the {instance['repo']} repository.

## Problem Statement
{instance['problem_statement']}

## Instructions
1. Explore the codebase to understand the issue
2. Identify the files that need to be modified
3. Make the necessary changes to fix the issue
4. The fix should be minimal and targeted

You have access to:
- terminal: Run shell commands (git, grep, python, etc.)
- file_editor: View and edit files

Start by exploring the repository structure to understand where the relevant code is.
"""
|
||||
|
||||
|
||||
def parse_fail_to_pass(fail_to_pass_str: str | list[str]) -> list[str]:
    """Parse the FAIL_TO_PASS field into a list of test names.

    Args:
        fail_to_pass_str: Either a JSON-encoded list of test names, a
            comma-separated string of test names, or an already-decoded
            sequence of test names.

    Returns:
        The test names as a list of strings; empty when the field holds no
        names.
    """
    # An already-decoded sequence would make json.loads raise TypeError.
    if isinstance(fail_to_pass_str, (list, tuple)):
        return [str(name) for name in fail_to_pass_str]

    def split_on_commas(text: str) -> list[str]:
        return [part.strip() for part in text.split(",") if part.strip()]

    try:
        decoded = json.loads(fail_to_pass_str)
    except json.JSONDecodeError:
        return split_on_commas(fail_to_pass_str)

    if isinstance(decoded, list):
        return [str(name) for name in decoded]
    if isinstance(decoded, str):
        # A lone JSON string is one test name, not a sequence of characters.
        return [decoded] if decoded.strip() else []
    if decoded is None:
        return []
    # Any other JSON scalar/object is not a test list; treat the raw text
    # the same way as non-JSON input.
    return split_on_commas(fail_to_pass_str)
|
||||
|
||||
|
||||
def run_tests(workspace: Path, tests: list[str]) -> tuple[list[str], list[str]]:
    """Run each test with pytest and return ``(passed, failed)`` lists.

    Every test is run in its own ``pytest -xvs`` subprocess with *workspace*
    as the working directory. A test counts as passed when the subprocess
    exits with code 0; a non-zero exit or exceeding the 300-second limit
    counts as failed.

    Args:
        workspace: Directory the tests are executed in.
        tests: Test identifiers handed to pytest one at a time.

    Returns:
        Two lists of test identifiers, in input order: those that passed and
        those that failed.
    """
    import sys

    # Prefer the interpreter running this script; a bare "python" may be
    # absent from PATH or resolve to a different installation.
    python = sys.executable or "python"

    passed: list[str] = []
    failed: list[str] = []

    for test in tests:
        try:
            completed = subprocess.run(
                [python, "-m", "pytest", "-xvs", test],
                cwd=workspace,
                capture_output=True,
                timeout=300,
            )
        except subprocess.TimeoutExpired:
            failed.append(test)
            continue

        bucket = passed if completed.returncode == 0 else failed
        bucket.append(test)

    return passed, failed
|
||||
|
||||
|
||||
def run_single_eval(
    instance: dict[str, Any],
    host: str,
    port: int,
    model: str,
    max_turns: int = 30,
    timeout: float = 600.0,
) -> EvalResult:
    """Evaluate a single SWE-bench instance.

    The repository is cloned into a temporary directory at the instance's
    base commit, an OpenHands agent is run against it one turn at a time, and
    the instance's FAIL_TO_PASS tests are then executed in that checkout.

    Args:
        instance: Dataset row; ``instance_id``, ``repo``, ``base_commit``,
            ``FAIL_TO_PASS`` and ``problem_statement`` are read.
        host: Host of the OpenAI-compatible endpoint.
        port: Port of the OpenAI-compatible endpoint.
        model: Model ID; sent to the LLM client as ``openai/{model}``.
        max_turns: Maximum number of agent turns.
        timeout: Wall-clock budget in seconds, checked before each turn.

    Returns:
        An ``EvalResult``. Exceptions raised while cloning, running the agent
        or running the tests are converted into an ``Error`` result rather
        than propagated.
    """
    instance_id = instance["instance_id"]
    repo = instance["repo"]
    base_commit = instance["base_commit"]
    fail_to_pass = parse_fail_to_pass(instance["FAIL_TO_PASS"])

    start_time = time.perf_counter()

    def elapsed() -> float:
        return time.perf_counter() - start_time

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            workspace = Path(tmpdir) / "repo"

            # Clone repo at base commit
            logger.info(f"Cloning {repo} at {base_commit[:8]}...")
            clone_repo_at_commit(repo, base_commit, workspace)

            # Setup OpenHands agent
            llm = LLM(
                model=f"openai/{model}",
                base_url=f"http://{host}:{port}/v1",
                api_key="not-needed",
            )
            agent = Agent(
                llm=llm,
                tools=[
                    Tool(name=TerminalTool.name),
                    Tool(name=FileEditorTool.name),
                ],
            )

            # Run agent
            conversation = Conversation(
                agent=agent,
                workspace=str(workspace),
            )

            logger.info(f"Running agent on {instance_id}...")
            conversation.send_message(build_agent_prompt(instance))

            for _turn in range(max_turns):
                # The budget is only checked between turns, so a single slow
                # turn can overrun it.
                if elapsed() > timeout:
                    return EvalResult(
                        instance_id=instance_id,
                        repo=repo,
                        status=EvalStatus.Timeout,
                        elapsed_seconds=elapsed(),
                        tests_passed=[],
                        tests_failed=fail_to_pass,
                    )

                # NOTE(review): assumes Conversation.run accepts max_turns and
                # returns an object exposing .done - confirm against the
                # installed OpenHands SDK version.
                result = conversation.run(max_turns=1)
                if result.done:
                    break

            # Run tests to verify
            logger.info(f"Running tests for {instance_id}...")
            passed, failed = run_tests(workspace, fail_to_pass)

            # Require at least one passing test: with an empty test list
            # "nothing failed" would otherwise count as a resolved instance.
            resolved = bool(passed) and not failed
            return EvalResult(
                instance_id=instance_id,
                repo=repo,
                status=EvalStatus.Resolved if resolved else EvalStatus.Failed,
                elapsed_seconds=elapsed(),
                tests_passed=passed,
                tests_failed=failed,
            )

    except Exception as e:
        # Top-level boundary for one instance: record the failure and let the
        # caller continue with the next instance. The type name is included
        # because str(e) is empty for some exceptions.
        return EvalResult(
            instance_id=instance_id,
            repo=repo,
            status=EvalStatus.Error,
            elapsed_seconds=elapsed(),
            tests_passed=[],
            tests_failed=[],
            error_message=f"{type(e).__name__}: {e}",
        )
|
||||
|
||||
|
||||
def verify_exo_running(host: str, port: int, model: str) -> str:
    """Verify exo is running and return the full model ID.

    Issues ``GET /models`` against ``host:port`` and looks for an entry whose
    ``id`` or ``hugging_face_id`` equals *model*.

    Args:
        host: Host of the exo HTTP API.
        port: Port of the exo HTTP API.
        model: Model identifier to look up.

    Returns:
        The matching entry's ``hugging_face_id`` when it is set, otherwise
        its ``id``.

    Raises:
        RuntimeError: If the endpoint answers with a status other than 200.
        ValueError: If no listed model matches *model*.
    """
    import http.client

    conn = http.client.HTTPConnection(host, port, timeout=10)
    try:
        conn.request("GET", "/models")
        resp = conn.getresponse()

        if resp.status != 200:
            raise RuntimeError(f"exo not responding at {host}:{port}")

        data = json.loads(resp.read())
    finally:
        # Release the socket on every path, including errors.
        conn.close()

    for m in data.get("data", []):
        if m.get("id") == model or m.get("hugging_face_id") == model:
            return m.get("hugging_face_id") or m.get("id")

    raise ValueError(f"Model '{model}' not found in exo")
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point for exo-eval.

    Parses the arguments, loads the requested SWE-bench instances, and either
    lists them (``--dry-run``) or evaluates each one against the exo endpoint.
    After an evaluation run, a JSON report is written to ``--json-out`` and a
    summary is printed to stdout.

    Returns:
        Process exit code; always 0 when the function returns normally.
    """
    ap = argparse.ArgumentParser(
        prog="exo-eval",
        description="Run SWE-bench evaluation against exo (local, no Docker).",
    )

    ap.add_argument("--host", default=os.environ.get("EXO_HOST", "localhost"))
    ap.add_argument(
        "--port", type=int, default=int(os.environ.get("EXO_PORT", "52415"))
    )
    ap.add_argument("--model", required=True, help="exo model ID")
    ap.add_argument(
        "--split", default="lite", choices=["lite", "dev", "test", "train"]
    )
    ap.add_argument("--limit", type=int, default=10, help="Max instances")
    ap.add_argument("--instance-ids", nargs="+", help="Specific instance IDs")
    ap.add_argument("--max-turns", type=int, default=30)
    ap.add_argument("--timeout", type=float, default=600.0)
    ap.add_argument("--json-out", default="bench/eval_results.json")
    # NOTE: --verbose is accepted but not consulted anywhere in this function.
    ap.add_argument("-v", "--verbose", action="store_true")
    ap.add_argument("--dry-run", action="store_true")

    args = ap.parse_args()

    # Load dataset first (doesn't require exo to be running)
    logger.info(f"Loading SWE-bench {args.split} dataset...")
    instances = load_swe_bench(
        split=args.split,
        limit=args.limit,
        instance_ids=args.instance_ids,
    )
    logger.info(f"Loaded {len(instances)} instances")

    if args.dry_run:
        print(f"\nSWE-bench {args.split} instances ({len(instances)}):")
        for inst in instances:
            print(f" {inst['instance_id']} ({inst['repo']})")
        return 0

    # Verify exo is running
    model_id = verify_exo_running(args.host, args.port, args.model)
    logger.info(f"Using model: {model_id}")

    # Run evaluation
    results: list[EvalResult] = []
    for i, instance in enumerate(instances):
        logger.info(f"[{i+1}/{len(instances)}] {instance['instance_id']}")

        result = run_single_eval(
            instance=instance,
            host=args.host,
            port=args.port,
            model=model_id,
            max_turns=args.max_turns,
            timeout=args.timeout,
        )
        results.append(result)

        logger.info(f" Status: {result.status.value}")
        if result.tests_passed:
            logger.info(f" Passed: {len(result.tests_passed)} tests")
        if result.tests_failed:
            logger.info(f" Failed: {len(result.tests_failed)} tests")
        if result.error_message:
            logger.error(f" Error: {result.error_message}")

    # Compute summary
    total = len(results)
    counts = {status: 0 for status in EvalStatus}
    for r in results:
        counts[r.status] += 1
    resolved = counts[EvalStatus.Resolved]
    failed = counts[EvalStatus.Failed]
    errors = counts[EvalStatus.Error]
    timeouts = counts[EvalStatus.Timeout]

    summary = {
        "model": model_id,
        "split": args.split,
        "total": total,
        "resolved": resolved,
        "resolved_rate": resolved / total if total else 0,
        "failed": failed,
        "errors": errors,
        "timeouts": timeouts,
    }

    output = {
        "summary": summary,
        "results": [
            {
                "instance_id": r.instance_id,
                "repo": r.repo,
                "status": r.status.value,
                "elapsed_seconds": r.elapsed_seconds,
                "tests_passed": r.tests_passed,
                "tests_failed": r.tests_failed,
                "error_message": r.error_message,
            }
            for r in results
        ],
    }

    # Create the output directory first so a missing parent (the default path
    # lives under bench/) cannot discard the results of a finished run.
    out_path = Path(args.json_out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(output, indent=2))
    logger.info(f"Results written to {args.json_out}")

    # Print summary
    print("\n" + "=" * 60)
    print("SWE-bench Evaluation Results")
    print("=" * 60)
    print(f"Model: {model_id}")
    print(f"Split: {args.split}")
    print(f"Total: {total}")
    if total:
        print(f"Resolved: {resolved} ({resolved/total*100:.1f}%)")
    else:
        print("Resolved: 0")
    print(f"Failed: {failed}")
    print(f"Errors: {errors}")
    print(f"Timeouts: {timeouts}")
    print("=" * 60)

    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -60,39 +60,12 @@
|
||||
return models;
|
||||
});
|
||||
|
||||
// Track previous model IDs to detect newly added models (plain variable to avoid reactive loop)
|
||||
let previousModelIds: Set<string> = new Set();
|
||||
|
||||
// Auto-select the first available model if none is selected, if current selection is stale, or if a new model is added
|
||||
// Auto-select the first available model if none is selected
|
||||
$effect(() => {
|
||||
const models = availableModels();
|
||||
const currentModelIds = new Set(models.map(m => m.id));
|
||||
|
||||
if (models.length > 0) {
|
||||
// Find newly added models (in current but not in previous)
|
||||
const newModels = models.filter(m => !previousModelIds.has(m.id));
|
||||
|
||||
// If no model selected, select the first available
|
||||
if (!currentModel) {
|
||||
setSelectedChatModel(models[0].id);
|
||||
}
|
||||
// If current model is stale (no longer has a running instance), reset to first available
|
||||
else if (!models.some(m => m.id === currentModel)) {
|
||||
setSelectedChatModel(models[0].id);
|
||||
}
|
||||
// If a new model was just added, select it
|
||||
else if (newModels.length > 0 && previousModelIds.size > 0) {
|
||||
setSelectedChatModel(newModels[0].id);
|
||||
}
|
||||
} else {
|
||||
// No instances running - clear the selected model
|
||||
if (currentModel) {
|
||||
setSelectedChatModel('');
|
||||
}
|
||||
if (models.length > 0 && !currentModel) {
|
||||
setSelectedChatModel(models[0].id);
|
||||
}
|
||||
|
||||
// Update previous model IDs for next comparison
|
||||
previousModelIds = currentModelIds;
|
||||
});
|
||||
|
||||
function getInstanceModelId(instanceWrapped: unknown): string {
|
||||
|
||||
@@ -400,8 +400,10 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
|
||||
const errorText = await response.text();
|
||||
console.error('Failed to launch instance:', errorText);
|
||||
} else {
|
||||
// Always auto-select the newly launched model so the user chats to what they just launched
|
||||
setSelectedChatModel(modelId);
|
||||
// Auto-select the launched model only if no model is currently selected
|
||||
if (!selectedChatModel()) {
|
||||
setSelectedChatModel(modelId);
|
||||
}
|
||||
|
||||
// Scroll to the bottom of instances container to show the new instance
|
||||
// Use multiple attempts to ensure DOM has updated with the new instance
|
||||
@@ -761,10 +763,6 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
|
||||
async function deleteInstance(instanceId: string) {
|
||||
if (!confirm(`Delete instance ${instanceId.slice(0, 8)}...?`)) return;
|
||||
|
||||
// Get the model ID of the instance being deleted before we delete it
|
||||
const deletedInstanceModelId = getInstanceModelId(instanceData[instanceId]);
|
||||
const wasSelected = selectedChatModel() === deletedInstanceModelId;
|
||||
|
||||
try {
|
||||
const response = await fetch(`/instance/${instanceId}`, {
|
||||
method: 'DELETE',
|
||||
@@ -773,24 +771,6 @@ function toggleInstanceDownloadDetails(nodeId: string): void {
|
||||
|
||||
if (!response.ok) {
|
||||
console.error('Failed to delete instance:', response.status);
|
||||
} else if (wasSelected) {
|
||||
// If we deleted the currently selected model, switch to another available model
|
||||
// Find another instance that isn't the one we just deleted
|
||||
const remainingInstances = Object.entries(instanceData).filter(([id]) => id !== instanceId);
|
||||
if (remainingInstances.length > 0) {
|
||||
// Select the last instance (most recently added, since objects preserve insertion order)
|
||||
const [, lastInstance] = remainingInstances[remainingInstances.length - 1];
|
||||
const newModelId = getInstanceModelId(lastInstance);
|
||||
if (newModelId && newModelId !== 'Unknown' && newModelId !== 'Unknown Model') {
|
||||
setSelectedChatModel(newModelId);
|
||||
} else {
|
||||
// Clear selection if no valid model found
|
||||
setSelectedChatModel('');
|
||||
}
|
||||
} else {
|
||||
// No more instances, clear the selection
|
||||
setSelectedChatModel('');
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error deleting instance:', error);
|
||||
|
||||
2
justfile
2
justfile
@@ -1,5 +1,3 @@
|
||||
export NIX_CONFIG := "extra-experimental-features = nix-command flakes"
|
||||
|
||||
fmt:
|
||||
nix fmt
|
||||
|
||||
|
||||
@@ -23,9 +23,7 @@ dependencies = [
|
||||
"tiktoken>=0.12.0", # required for kimi k2 tokenizer
|
||||
"hypercorn>=0.18.0",
|
||||
"openai-harmony>=0.0.8",
|
||||
"openhands-sdk>=0.1.0", # for exo-eval SWE-bench evaluation
|
||||
"openhands-tools>=0.1.0", # tools for openhands agents
|
||||
"datasets>=3.0.0", # for loading SWE-bench from HuggingFace
|
||||
"tomlkit>=0.14.0",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
15
resources/model_cards/deepseek-v3.1-4bit.toml
Normal file
15
resources/model_cards/deepseek-v3.1-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "deepseek-v3.1-4bit"
|
||||
model_id = "mlx-community/DeepSeek-V3.1-4bit"
|
||||
name = "DeepSeek V3.1 (4-bit)"
|
||||
description = "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/DeepSeek-V3.1-4bit"
|
||||
pretty_name = "DeepSeek V3.1 (4-bit)"
|
||||
n_layers = 61
|
||||
hidden_size = 7168
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 405874409472
|
||||
15
resources/model_cards/deepseek-v3.1-8bit.toml
Normal file
15
resources/model_cards/deepseek-v3.1-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "deepseek-v3.1-8bit"
|
||||
model_id = "mlx-community/DeepSeek-V3.1-8bit"
|
||||
name = "DeepSeek V3.1 (8-bit)"
|
||||
description = "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/DeepSeek-V3.1-8bit"
|
||||
pretty_name = "DeepSeek V3.1 (8-bit)"
|
||||
n_layers = 61
|
||||
hidden_size = 7168
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 765577920512
|
||||
15
resources/model_cards/glm-4.5-air-8bit.toml
Normal file
15
resources/model_cards/glm-4.5-air-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.5-air-8bit"
|
||||
model_id = "mlx-community/GLM-4.5-Air-8bit"
|
||||
name = "GLM 4.5 Air 8bit"
|
||||
description = "GLM 4.5 Air 8bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.5-Air-8bit"
|
||||
pretty_name = "GLM 4.5 Air 8bit"
|
||||
n_layers = 46
|
||||
hidden_size = 4096
|
||||
supports_tensor = false
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 122406567936
|
||||
15
resources/model_cards/glm-4.5-air-bf16.toml
Normal file
15
resources/model_cards/glm-4.5-air-bf16.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.5-air-bf16"
|
||||
model_id = "mlx-community/GLM-4.5-Air-bf16"
|
||||
name = "GLM 4.5 Air bf16"
|
||||
description = "GLM 4.5 Air bf16"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.5-Air-bf16"
|
||||
pretty_name = "GLM 4.5 Air bf16"
|
||||
n_layers = 46
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 229780750336
|
||||
15
resources/model_cards/glm-4.7-4bit.toml
Normal file
15
resources/model_cards/glm-4.7-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.7-4bit"
|
||||
model_id = "mlx-community/GLM-4.7-4bit"
|
||||
name = "GLM 4.7 4bit"
|
||||
description = "GLM 4.7 4bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.7-4bit"
|
||||
pretty_name = "GLM 4.7 4bit"
|
||||
n_layers = 91
|
||||
hidden_size = 5120
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 198556925568
|
||||
15
resources/model_cards/glm-4.7-6bit.toml
Normal file
15
resources/model_cards/glm-4.7-6bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.7-6bit"
|
||||
model_id = "mlx-community/GLM-4.7-6bit"
|
||||
name = "GLM 4.7 6bit"
|
||||
description = "GLM 4.7 6bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.7-6bit"
|
||||
pretty_name = "GLM 4.7 6bit"
|
||||
n_layers = 91
|
||||
hidden_size = 5120
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 286737579648
|
||||
15
resources/model_cards/glm-4.7-8bit-gs32.toml
Normal file
15
resources/model_cards/glm-4.7-8bit-gs32.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "glm-4.7-8bit-gs32"
|
||||
model_id = "mlx-community/GLM-4.7-8bit-gs32"
|
||||
name = "GLM 4.7 8bit (gs32)"
|
||||
description = "GLM 4.7 8bit (gs32)"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/GLM-4.7-8bit-gs32"
|
||||
pretty_name = "GLM 4.7 8bit (gs32)"
|
||||
n_layers = 91
|
||||
hidden_size = 5120
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 396963397248
|
||||
15
resources/model_cards/gpt-oss-120b-MXFP4-Q8.toml
Normal file
15
resources/model_cards/gpt-oss-120b-MXFP4-Q8.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "gpt-oss-120b-MXFP4-Q8"
|
||||
model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
|
||||
name = "GPT-OSS 120B (MXFP4-Q8, MLX)"
|
||||
description = "OpenAI's GPT-OSS 120B is a 117B-parameter Mixture-of-Experts model designed for high-reasoning and general-purpose use; this variant is a 4-bit MLX conversion for Apple Silicon."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
|
||||
pretty_name = "GPT-OSS 120B (MXFP4-Q8, MLX)"
|
||||
n_layers = 36
|
||||
hidden_size = 2880
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 70652212224
|
||||
15
resources/model_cards/gpt-oss-20b-4bit.toml
Normal file
15
resources/model_cards/gpt-oss-20b-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "gpt-oss-20b-4bit"
|
||||
model_id = "mlx-community/gpt-oss-20b-MXFP4-Q4"
|
||||
name = "GPT-OSS 20B (MXFP4-Q4, MLX)"
|
||||
description = "OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this MLX variant uses MXFP4 4-bit quantization."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/gpt-oss-20b-MXFP4-Q4"
|
||||
pretty_name = "GPT-OSS 20B (MXFP4-Q4, MLX)"
|
||||
n_layers = 24
|
||||
hidden_size = 2880
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 12025908224
|
||||
15
resources/model_cards/kimi-k2-instruct-4bit.toml
Normal file
15
resources/model_cards/kimi-k2-instruct-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "kimi-k2-instruct-4bit"
|
||||
model_id = "mlx-community/Kimi-K2-Instruct-4bit"
|
||||
name = "Kimi K2 Instruct (4-bit)"
|
||||
description = "Kimi K2 is a large language model trained on the Kimi K2 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Kimi-K2-Instruct-4bit"
|
||||
pretty_name = "Kimi K2 Instruct (4-bit)"
|
||||
n_layers = 61
|
||||
hidden_size = 7168
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 620622774272
|
||||
15
resources/model_cards/kimi-k2-thinking.toml
Normal file
15
resources/model_cards/kimi-k2-thinking.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "kimi-k2-thinking"
|
||||
model_id = "mlx-community/Kimi-K2-Thinking"
|
||||
name = "Kimi K2 Thinking (4-bit)"
|
||||
description = "Kimi K2 Thinking is the latest, most capable version of open-source thinking model."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Kimi-K2-Thinking"
|
||||
pretty_name = "Kimi K2 Thinking (4-bit)"
|
||||
n_layers = 61
|
||||
hidden_size = 7168
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 706522120192
|
||||
15
resources/model_cards/llama-3.1-70b.toml
Normal file
15
resources/model_cards/llama-3.1-70b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.1-70b"
|
||||
model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
|
||||
name = "Llama 3.1 70B (4-bit)"
|
||||
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.1 70B (4-bit)"
|
||||
n_layers = 80
|
||||
hidden_size = 8192
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 40652242944
|
||||
15
resources/model_cards/llama-3.1-8b-8bit.toml
Normal file
15
resources/model_cards/llama-3.1-8b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.1-8b-8bit"
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
|
||||
name = "Llama 3.1 8B (8-bit)"
|
||||
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
|
||||
pretty_name = "Llama 3.1 8B (8-bit)"
|
||||
n_layers = 32
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 8954839040
|
||||
15
resources/model_cards/llama-3.1-8b-bf16.toml
Normal file
15
resources/model_cards/llama-3.1-8b-bf16.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.1-8b-bf16"
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
|
||||
name = "Llama 3.1 8B (BF16)"
|
||||
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
|
||||
pretty_name = "Llama 3.1 8B (BF16)"
|
||||
n_layers = 32
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 16882073600
|
||||
15
resources/model_cards/llama-3.1-8b.toml
Normal file
15
resources/model_cards/llama-3.1-8b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.1-8b"
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
|
||||
name = "Llama 3.1 8B (4-bit)"
|
||||
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.1 8B (4-bit)"
|
||||
n_layers = 32
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 4637851648
|
||||
15
resources/model_cards/llama-3.2-1b.toml
Normal file
15
resources/model_cards/llama-3.2-1b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.2-1b"
|
||||
model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
|
||||
name = "Llama 3.2 1B (4-bit)"
|
||||
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.2 1B (4-bit)"
|
||||
n_layers = 16
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 729808896
|
||||
15
resources/model_cards/llama-3.2-3b-8bit.toml
Normal file
15
resources/model_cards/llama-3.2-3b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.2-3b-8bit"
|
||||
model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
|
||||
name = "Llama 3.2 3B (8-bit)"
|
||||
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
|
||||
pretty_name = "Llama 3.2 3B (8-bit)"
|
||||
n_layers = 28
|
||||
hidden_size = 3072
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 3501195264
|
||||
15
resources/model_cards/llama-3.2-3b.toml
Normal file
15
resources/model_cards/llama-3.2-3b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.2-3b"
|
||||
model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
|
||||
name = "Llama 3.2 3B (4-bit)"
|
||||
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.2 3B (4-bit)"
|
||||
n_layers = 28
|
||||
hidden_size = 3072
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 1863319552
|
||||
15
resources/model_cards/llama-3.3-70b-8bit.toml
Normal file
15
resources/model_cards/llama-3.3-70b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.3-70b-8bit"
|
||||
model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
|
||||
name = "Llama 3.3 70B (8-bit)"
|
||||
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
|
||||
pretty_name = "Llama 3.3 70B (8-bit)"
|
||||
n_layers = 80
|
||||
hidden_size = 8192
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 76799803392
|
||||
15
resources/model_cards/llama-3.3-70b-fp16.toml
Normal file
15
resources/model_cards/llama-3.3-70b-fp16.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.3-70b-fp16"
|
||||
model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
|
||||
name = "Llama 3.3 70B (FP16)"
|
||||
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
|
||||
pretty_name = "Llama 3.3 70B (FP16)"
|
||||
n_layers = 80
|
||||
hidden_size = 8192
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 144383672320
|
||||
15
resources/model_cards/llama-3.3-70b.toml
Normal file
15
resources/model_cards/llama-3.3-70b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "llama-3.3-70b"
|
||||
model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
|
||||
name = "Llama 3.3 70B (4-bit)"
|
||||
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
|
||||
pretty_name = "Llama 3.3 70B"
|
||||
n_layers = 80
|
||||
hidden_size = 8192
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 40652242944
|
||||
15
resources/model_cards/minimax-m2.1-3bit.toml
Normal file
15
resources/model_cards/minimax-m2.1-3bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "minimax-m2.1-3bit"
|
||||
model_id = "mlx-community/MiniMax-M2.1-3bit"
|
||||
name = "MiniMax M2.1 3bit"
|
||||
description = "MiniMax M2.1 3bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/MiniMax-M2.1-3bit"
|
||||
pretty_name = "MiniMax M2.1 3bit"
|
||||
n_layers = 61
|
||||
hidden_size = 3072
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 100086644736
|
||||
15
resources/model_cards/minimax-m2.1-8bit.toml
Normal file
15
resources/model_cards/minimax-m2.1-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "minimax-m2.1-8bit"
|
||||
model_id = "mlx-community/MiniMax-M2.1-8bit"
|
||||
name = "MiniMax M2.1 8bit"
|
||||
description = "MiniMax M2.1 8bit"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/MiniMax-M2.1-8bit"
|
||||
pretty_name = "MiniMax M2.1 8bit"
|
||||
n_layers = 61
|
||||
hidden_size = 3072
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 242986745856
|
||||
15
resources/model_cards/qwen3-0.6b-8bit.toml
Normal file
15
resources/model_cards/qwen3-0.6b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-0.6b-8bit"
|
||||
model_id = "mlx-community/Qwen3-0.6B-8bit"
|
||||
name = "Qwen3 0.6B (8-bit)"
|
||||
description = "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-0.6B-8bit"
|
||||
pretty_name = "Qwen3 0.6B (8-bit)"
|
||||
n_layers = 28
|
||||
hidden_size = 1024
|
||||
supports_tensor = false
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 698351616
|
||||
15
resources/model_cards/qwen3-0.6b.toml
Normal file
15
resources/model_cards/qwen3-0.6b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-0.6b"
|
||||
model_id = "mlx-community/Qwen3-0.6B-4bit"
|
||||
name = "Qwen3 0.6B (4-bit)"
|
||||
description = "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-0.6B-4bit"
|
||||
pretty_name = "Qwen3 0.6B (4-bit)"
|
||||
n_layers = 28
|
||||
hidden_size = 1024
|
||||
supports_tensor = false
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 342884352
|
||||
15
resources/model_cards/qwen3-235b-a22b-4bit.toml
Normal file
15
resources/model_cards/qwen3-235b-a22b-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-235b-a22b-4bit"
|
||||
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
|
||||
name = "Qwen3 235B A22B (4-bit)"
|
||||
description = "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
|
||||
pretty_name = "Qwen3 235B A22B (4-bit)"
|
||||
n_layers = 94
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 141733920768
|
||||
15
resources/model_cards/qwen3-235b-a22b-8bit.toml
Normal file
15
resources/model_cards/qwen3-235b-a22b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-235b-a22b-8bit"
|
||||
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
|
||||
name = "Qwen3 235B A22B (8-bit)"
|
||||
description = "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
|
||||
pretty_name = "Qwen3 235B A22B (8-bit)"
|
||||
n_layers = 94
|
||||
hidden_size = 4096
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 268435456000
|
||||
15
resources/model_cards/qwen3-30b-8bit.toml
Normal file
15
resources/model_cards/qwen3-30b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-30b-8bit"
|
||||
model_id = "mlx-community/Qwen3-30B-A3B-8bit"
|
||||
name = "Qwen3 30B A3B (8-bit)"
|
||||
description = "Qwen3 30B is a large language model trained on the Qwen3 30B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-30B-A3B-8bit"
|
||||
pretty_name = "Qwen3 30B A3B (8-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 33279705088
|
||||
15
resources/model_cards/qwen3-30b.toml
Normal file
15
resources/model_cards/qwen3-30b.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-30b"
|
||||
model_id = "mlx-community/Qwen3-30B-A3B-4bit"
|
||||
name = "Qwen3 30B A3B (4-bit)"
|
||||
description = "Qwen3 30B is a large language model trained on the Qwen3 30B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-30B-A3B-4bit"
|
||||
pretty_name = "Qwen3 30B A3B (4-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 17612931072
|
||||
15
resources/model_cards/qwen3-80b-a3B-4bit.toml
Normal file
15
resources/model_cards/qwen3-80b-a3B-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-80b-a3B-4bit"
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
|
||||
name = "Qwen3 80B A3B (4-bit)"
|
||||
description = "Qwen3 80B"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
|
||||
pretty_name = "Qwen3 80B A3B (4-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 46976204800
|
||||
15
resources/model_cards/qwen3-80b-a3B-8bit.toml
Normal file
15
resources/model_cards/qwen3-80b-a3B-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-80b-a3B-8bit"
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
|
||||
name = "Qwen3 80B A3B (8-bit)"
|
||||
description = "Qwen3 80B"
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
|
||||
pretty_name = "Qwen3 80B A3B (8-bit)"
|
||||
n_layers = 48
|
||||
hidden_size = 2048
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 88814387200
|
||||
15
resources/model_cards/qwen3-80b-a3B-thinking-4bit.toml
Normal file
15
resources/model_cards/qwen3-80b-a3B-thinking-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
# Model card: Qwen3-Next 80B A3B "Thinking" (reasoning) variant, 4-bit MLX quantization.
short_id = "qwen3-80b-a3B-thinking-4bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
name = "Qwen3 80B A3B Thinking (4-bit)"
description = "Qwen3 80B Reasoning model"
tags = []

[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
# Kept in sync with `name` above. Without "Thinking" this card shares its
# pretty name with the Instruct 4-bit card ("Qwen3 80B A3B (4-bit)").
pretty_name = "Qwen3 80B A3B Thinking (4-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true

[metadata.storage_size]
# NOTE(review): previously 88814387200, which duplicated the 8-bit card's size.
# Now uses the same figure as the Instruct 4-bit card (same architecture and
# quantization) - TODO confirm against the actual repository size.
in_bytes = 46976204800
|
||||
15
resources/model_cards/qwen3-80b-a3B-thinking-8bit.toml
Normal file
15
resources/model_cards/qwen3-80b-a3B-thinking-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
# Model card: Qwen3-Next 80B A3B "Thinking" (reasoning) variant, 8-bit MLX quantization.
short_id = "qwen3-80b-a3B-thinking-8bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
name = "Qwen3 80B A3B Thinking (8-bit)"
description = "Qwen3 80B Reasoning model"
tags = []

[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
# Kept in sync with `name` above. Without "Thinking" this card shares its
# pretty name with the Instruct 8-bit card ("Qwen3 80B A3B (8-bit)").
pretty_name = "Qwen3 80B A3B Thinking (8-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true

[metadata.storage_size]
in_bytes = 88814387200
|
||||
15
resources/model_cards/qwen3-coder-480b-a35b-4bit.toml
Normal file
15
resources/model_cards/qwen3-coder-480b-a35b-4bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-coder-480b-a35b-4bit"
|
||||
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
|
||||
name = "Qwen3 Coder 480B A35B (4-bit)"
|
||||
description = "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
|
||||
pretty_name = "Qwen3 Coder 480B A35B (4-bit)"
|
||||
n_layers = 62
|
||||
hidden_size = 6144
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 289910292480
|
||||
15
resources/model_cards/qwen3-coder-480b-a35b-8bit.toml
Normal file
15
resources/model_cards/qwen3-coder-480b-a35b-8bit.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
short_id = "qwen3-coder-480b-a35b-8bit"
|
||||
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
|
||||
name = "Qwen3 Coder 480B A35B (8-bit)"
|
||||
description = "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset."
|
||||
tags = []
|
||||
|
||||
[metadata]
|
||||
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
|
||||
pretty_name = "Qwen3 Coder 480B A35B (8-bit)"
|
||||
n_layers = 62
|
||||
hidden_size = 6144
|
||||
supports_tensor = true
|
||||
|
||||
[metadata.storage_size]
|
||||
in_bytes = 579820584960
|
||||
@@ -1,6 +1,6 @@
|
||||
import time
|
||||
from collections.abc import AsyncGenerator
|
||||
from typing import Any, cast
|
||||
from typing import cast
|
||||
|
||||
import anyio
|
||||
from anyio import create_task_group
|
||||
@@ -13,6 +13,12 @@ from hypercorn.asyncio import serve # pyright: ignore[reportUnknownVariableType
|
||||
from hypercorn.config import Config
|
||||
from hypercorn.typing import ASGIFramework
|
||||
from loguru import logger
|
||||
from openai_harmony import ( # pyright: ignore[reportMissingTypeStubs]
|
||||
HarmonyEncodingName,
|
||||
Role,
|
||||
StreamableParser,
|
||||
load_harmony_encoding,
|
||||
)
|
||||
|
||||
from exo.master.placement import place_instance as get_instance_placements
|
||||
from exo.shared.apply import apply
|
||||
@@ -61,6 +67,8 @@ from exo.utils.channels import Receiver, Sender, channel
|
||||
from exo.utils.dashboard_path import find_dashboard
|
||||
from exo.utils.event_buffer import OrderedBuffer
|
||||
|
||||
# Harmony encoding for GPT-OSS models, loaded once at module import.
# `_process_gpt_oss` passes it to `StreamableParser` to decode token ids.
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
|
||||
|
||||
|
||||
def chunk_to_response(
|
||||
chunk: TokenChunk, command_id: CommandId
|
||||
@@ -72,13 +80,7 @@ def chunk_to_response(
|
||||
choices=[
|
||||
StreamingChoiceResponse(
|
||||
index=0,
|
||||
delta=ChatCompletionMessage(
|
||||
role="assistant",
|
||||
content=chunk.text if chunk.text else None,
|
||||
tool_calls=[tc.model_dump() for tc in chunk.tool_calls]
|
||||
if chunk.tool_calls
|
||||
else None,
|
||||
),
|
||||
delta=ChatCompletionMessage(role="assistant", content=chunk.text),
|
||||
finish_reason=chunk.finish_reason,
|
||||
)
|
||||
],
|
||||
@@ -379,8 +381,35 @@ class API:
|
||||
instance_id=instance_id,
|
||||
)
|
||||
|
||||
async def _process_gpt_oss(
    self, token_chunks: Receiver[TokenChunk]
) -> AsyncGenerator[TokenChunk, None]:
    """Re-render raw GPT-OSS (Harmony) token chunks as plain-text chunks.

    Every incoming chunk's ``token_id`` is fed to a Harmony
    ``StreamableParser``; the parser's content delta replaces the chunk's
    text. Content produced on the ``"analysis"`` channel is wrapped in
    ``<think>`` / ``</think>`` markers.

    Exactly one yielded chunk carries a ``finish_reason``, and it is always
    the last one. ``_chat_chunk_stream`` stops consuming at the first chunk
    whose ``finish_reason`` is set, so the synthetic marker/delta chunks
    derived from the final token must not inherit it; otherwise the closing
    ``</think>`` and the terminal chunk are never delivered.

    Args:
        token_chunks: Receiver of raw token chunks for one command.

    Yields:
        Copies of the incoming chunks whose ``text`` holds a marker or the
        parsed content delta, followed by one terminal chunk with empty text
        and the original ``finish_reason``.
    """
    stream = StreamableParser(encoding, role=Role.ASSISTANT)
    thinking = False

    async for chunk in token_chunks:
        stream.process(chunk.token_id)

        delta = stream.last_content_delta
        in_analysis = stream.current_channel == "analysis"
        is_final = chunk.finish_reason is not None

        # Collect the text pieces this token produces, in emission order.
        pieces: list[str] = []
        if in_analysis and not thinking:
            thinking = True
            pieces.append("<think>")
        elif thinking and not in_analysis:
            thinking = False
            pieces.append("</think>")

        if delta:
            pieces.append(delta)

        if is_final and thinking:
            # Generation ended mid-analysis: close the block explicitly.
            thinking = False
            pieces.append("</think>")

        for text in pieces:
            # Strip finish_reason so consumers keep reading until the
            # terminal chunk below.
            yield chunk.model_copy(update={"text": text, "finish_reason": None})

        if is_final:
            # The token's content was already emitted via the parsed delta;
            # the terminal chunk only signals completion (finish_reason and
            # any other fields of the original chunk are preserved).
            yield chunk.model_copy(update={"text": ""})
            break
|
||||
|
||||
async def _chat_chunk_stream(
|
||||
self, command_id: CommandId
|
||||
self, command_id: CommandId, parse_gpt_oss: bool
|
||||
) -> AsyncGenerator[TokenChunk, None]:
|
||||
"""Yield `TokenChunk`s for a given command until completion."""
|
||||
|
||||
@@ -388,10 +417,16 @@ class API:
|
||||
self._chat_completion_queues[command_id], recv = channel[TokenChunk]()
|
||||
|
||||
with recv as token_chunks:
|
||||
async for chunk in token_chunks:
|
||||
yield chunk
|
||||
if chunk.finish_reason is not None:
|
||||
break
|
||||
if parse_gpt_oss:
|
||||
async for chunk in self._process_gpt_oss(token_chunks):
|
||||
yield chunk
|
||||
if chunk.finish_reason is not None:
|
||||
break
|
||||
else:
|
||||
async for chunk in token_chunks:
|
||||
yield chunk
|
||||
if chunk.finish_reason is not None:
|
||||
break
|
||||
|
||||
except anyio.get_cancelled_exc_class():
|
||||
# TODO: TaskCancelled
|
||||
@@ -407,11 +442,11 @@ class API:
|
||||
del self._chat_completion_queues[command_id]
|
||||
|
||||
async def _generate_chat_stream(
|
||||
self, command_id: CommandId
|
||||
self, command_id: CommandId, parse_gpt_oss: bool
|
||||
) -> AsyncGenerator[str, None]:
|
||||
"""Generate chat completion stream as JSON strings."""
|
||||
|
||||
async for chunk in self._chat_chunk_stream(command_id):
|
||||
async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
|
||||
chunk_response: ChatCompletionResponse = chunk_to_response(
|
||||
chunk, command_id
|
||||
)
|
||||
@@ -423,35 +458,20 @@ class API:
|
||||
yield "data: [DONE]\n\n"
|
||||
|
||||
async def _collect_chat_completion(
|
||||
self, command_id: CommandId
|
||||
self, command_id: CommandId, parse_gpt_oss: bool
|
||||
) -> ChatCompletionResponse:
|
||||
"""Collect all token chunks for a chat completion and return a single response."""
|
||||
|
||||
text_parts: list[str] = []
|
||||
model: str | None = None
|
||||
finish_reason: FinishReason | None = None
|
||||
all_tool_calls: list[dict[str, Any]] = []
|
||||
|
||||
async for chunk in self._chat_chunk_stream(command_id):
|
||||
async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
|
||||
if model is None:
|
||||
model = chunk.model
|
||||
|
||||
text_parts.append(chunk.text)
|
||||
|
||||
# Collect tool calls
|
||||
if chunk.tool_calls:
|
||||
for tc in chunk.tool_calls:
|
||||
all_tool_calls.append(
|
||||
{
|
||||
"id": tc.id,
|
||||
"type": tc.type,
|
||||
"function": {
|
||||
"name": tc.function.name,
|
||||
"arguments": tc.function.arguments,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
if chunk.finish_reason is not None:
|
||||
finish_reason = chunk.finish_reason
|
||||
|
||||
@@ -467,8 +487,7 @@ class API:
|
||||
index=0,
|
||||
message=ChatCompletionMessage(
|
||||
role="assistant",
|
||||
content=combined_text if combined_text else None,
|
||||
tool_calls=all_tool_calls if all_tool_calls else None,
|
||||
content=combined_text,
|
||||
),
|
||||
finish_reason=finish_reason,
|
||||
)
|
||||
@@ -476,36 +495,21 @@ class API:
|
||||
)
|
||||
|
||||
async def _collect_chat_completion_with_stats(
|
||||
self, command_id: CommandId
|
||||
self, command_id: CommandId, parse_gpt_oss: bool
|
||||
) -> BenchChatCompletionResponse:
|
||||
text_parts: list[str] = []
|
||||
model: str | None = None
|
||||
finish_reason: FinishReason | None = None
|
||||
all_tool_calls: list[dict[str, Any]] = []
|
||||
|
||||
stats: GenerationStats | None = None
|
||||
|
||||
async for chunk in self._chat_chunk_stream(command_id):
|
||||
async for chunk in self._chat_chunk_stream(command_id, parse_gpt_oss):
|
||||
if model is None:
|
||||
model = chunk.model
|
||||
|
||||
text_parts.append(chunk.text)
|
||||
stats = chunk.stats or stats
|
||||
|
||||
# Collect tool calls
|
||||
if chunk.tool_calls:
|
||||
for tc in chunk.tool_calls:
|
||||
all_tool_calls.append(
|
||||
{
|
||||
"id": tc.id,
|
||||
"type": tc.type,
|
||||
"function": {
|
||||
"name": tc.function.name,
|
||||
"arguments": tc.function.arguments,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
if chunk.finish_reason is not None:
|
||||
finish_reason = chunk.finish_reason
|
||||
|
||||
@@ -520,9 +524,7 @@ class API:
|
||||
ChatCompletionChoice(
|
||||
index=0,
|
||||
message=ChatCompletionMessage(
|
||||
role="assistant",
|
||||
content=combined_text if combined_text else None,
|
||||
tool_calls=all_tool_calls if all_tool_calls else None,
|
||||
role="assistant", content=combined_text
|
||||
),
|
||||
finish_reason=finish_reason,
|
||||
)
|
||||
@@ -542,6 +544,8 @@ class API:
|
||||
"""Handle chat completions, supporting both streaming and non-streaming responses."""
|
||||
model_meta = await resolve_model_meta(payload.model)
|
||||
payload.model = model_meta.model_id
|
||||
parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
|
||||
logger.info(f"{parse_gpt_oss=}")
|
||||
|
||||
if not any(
|
||||
instance.shard_assignments.model_id == payload.model
|
||||
@@ -558,16 +562,17 @@ class API:
|
||||
await self._send(command)
|
||||
if payload.stream:
|
||||
return StreamingResponse(
|
||||
self._generate_chat_stream(command.command_id),
|
||||
self._generate_chat_stream(command.command_id, parse_gpt_oss),
|
||||
media_type="text/event-stream",
|
||||
)
|
||||
|
||||
return await self._collect_chat_completion(command.command_id)
|
||||
return await self._collect_chat_completion(command.command_id, parse_gpt_oss)
|
||||
|
||||
async def bench_chat_completions(
|
||||
self, payload: BenchChatCompletionTaskParams
|
||||
) -> BenchChatCompletionResponse:
|
||||
model_meta = await resolve_model_meta(payload.model)
|
||||
parse_gpt_oss = "gpt-oss" in model_meta.model_id.lower()
|
||||
payload.model = model_meta.model_id
|
||||
|
||||
if not any(
|
||||
@@ -584,7 +589,10 @@ class API:
|
||||
command = ChatCompletion(request_params=payload)
|
||||
await self._send(command)
|
||||
|
||||
response = await self._collect_chat_completion_with_stats(command.command_id)
|
||||
response = await self._collect_chat_completion_with_stats(
|
||||
command.command_id,
|
||||
parse_gpt_oss,
|
||||
)
|
||||
return response
|
||||
|
||||
def _calculate_total_available_memory(self) -> Memory:
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
from exo.shared.types.memory import Memory
|
||||
from anyio import Path, open_file
|
||||
import tomlkit
|
||||
|
||||
from exo.shared.types.models import ModelId, ModelMetadata
|
||||
from exo.shared.models.model_meta import get_model_meta
|
||||
from exo.utils.pydantic_ext import CamelCaseModel
|
||||
|
||||
|
||||
@@ -11,542 +14,27 @@ class ModelCard(CamelCaseModel):
|
||||
tags: list[str]
|
||||
metadata: ModelMetadata
|
||||
|
||||
@staticmethod
async def load(path: Path) -> "ModelCard":
    """Read a TOML model card from ``path`` and validate it as a ``ModelCard``.

    Args:
        path: Location of the ``.toml`` model card file (an ``anyio.Path``).

    Returns:
        The ``ModelCard`` produced by ``ModelCard.model_validate`` on the
        parsed TOML document.
    """
    # TOML documents are UTF-8 by specification; do not depend on the
    # platform's default locale encoding.
    async with await open_file(path, encoding="utf-8") as f:
        data = await f.read()
    py = tomlkit.loads(data)
    return ModelCard.model_validate(py)
|
||||
|
||||
MODEL_CARDS: dict[str, ModelCard] = {
|
||||
# deepseek v3
|
||||
"deepseek-v3.1-4bit": ModelCard(
|
||||
short_id="deepseek-v3.1-4bit",
|
||||
model_id=ModelId("mlx-community/DeepSeek-V3.1-4bit"),
|
||||
name="DeepSeek V3.1 (4-bit)",
|
||||
description="""DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/DeepSeek-V3.1-4bit"),
|
||||
pretty_name="DeepSeek V3.1 (4-bit)",
|
||||
storage_size=Memory.from_gb(378),
|
||||
n_layers=61,
|
||||
hidden_size=7168,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"deepseek-v3.1-8bit": ModelCard(
|
||||
short_id="deepseek-v3.1-8bit",
|
||||
model_id=ModelId("mlx-community/DeepSeek-V3.1-8bit"),
|
||||
name="DeepSeek V3.1 (8-bit)",
|
||||
description="""DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/DeepSeek-V3.1-8bit"),
|
||||
pretty_name="DeepSeek V3.1 (8-bit)",
|
||||
storage_size=Memory.from_gb(713),
|
||||
n_layers=61,
|
||||
hidden_size=7168,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# kimi k2
|
||||
"kimi-k2-instruct-4bit": ModelCard(
|
||||
short_id="kimi-k2-instruct-4bit",
|
||||
model_id=ModelId("mlx-community/Kimi-K2-Instruct-4bit"),
|
||||
name="Kimi K2 Instruct (4-bit)",
|
||||
description="""Kimi K2 is a large language model trained on the Kimi K2 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Kimi-K2-Instruct-4bit"),
|
||||
pretty_name="Kimi K2 Instruct (4-bit)",
|
||||
storage_size=Memory.from_gb(578),
|
||||
n_layers=61,
|
||||
hidden_size=7168,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"kimi-k2-thinking": ModelCard(
|
||||
short_id="kimi-k2-thinking",
|
||||
model_id=ModelId("mlx-community/Kimi-K2-Thinking"),
|
||||
name="Kimi K2 Thinking (4-bit)",
|
||||
description="""Kimi K2 Thinking is the latest, most capable version of open-source thinking model.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Kimi-K2-Thinking"),
|
||||
pretty_name="Kimi K2 Thinking (4-bit)",
|
||||
storage_size=Memory.from_gb(658),
|
||||
n_layers=61,
|
||||
hidden_size=7168,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# llama-3.1
|
||||
"llama-3.1-8b": ModelCard(
|
||||
short_id="llama-3.1-8b",
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"),
|
||||
name="Llama 3.1 8B (4-bit)",
|
||||
description="""Llama 3.1 is a large language model trained on the Llama 3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.1 8B (4-bit)",
|
||||
storage_size=Memory.from_mb(4423),
|
||||
n_layers=32,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.1-8b-8bit": ModelCard(
|
||||
short_id="llama-3.1-8b-8bit",
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"),
|
||||
name="Llama 3.1 8B (8-bit)",
|
||||
description="""Llama 3.1 is a large language model trained on the Llama 3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"),
|
||||
pretty_name="Llama 3.1 8B (8-bit)",
|
||||
storage_size=Memory.from_mb(8540),
|
||||
n_layers=32,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.1-8b-bf16": ModelCard(
|
||||
short_id="llama-3.1-8b-bf16",
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"),
|
||||
name="Llama 3.1 8B (BF16)",
|
||||
description="""Llama 3.1 is a large language model trained on the Llama 3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"),
|
||||
pretty_name="Llama 3.1 8B (BF16)",
|
||||
storage_size=Memory.from_mb(16100),
|
||||
n_layers=32,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.1-70b": ModelCard(
|
||||
short_id="llama-3.1-70b",
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"),
|
||||
name="Llama 3.1 70B (4-bit)",
|
||||
description="""Llama 3.1 is a large language model trained on the Llama 3.1 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.1 70B (4-bit)",
|
||||
storage_size=Memory.from_mb(38769),
|
||||
n_layers=80,
|
||||
hidden_size=8192,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# llama-3.2
|
||||
"llama-3.2-1b": ModelCard(
|
||||
short_id="llama-3.2-1b",
|
||||
model_id=ModelId("mlx-community/Llama-3.2-1B-Instruct-4bit"),
|
||||
name="Llama 3.2 1B (4-bit)",
|
||||
description="""Llama 3.2 is a large language model trained on the Llama 3.2 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.2-1B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.2 1B (4-bit)",
|
||||
storage_size=Memory.from_mb(696),
|
||||
n_layers=16,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.2-3b": ModelCard(
|
||||
short_id="llama-3.2-3b",
|
||||
model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-4bit"),
|
||||
name="Llama 3.2 3B (4-bit)",
|
||||
description="""Llama 3.2 is a large language model trained on the Llama 3.2 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.2 3B (4-bit)",
|
||||
storage_size=Memory.from_mb(1777),
|
||||
n_layers=28,
|
||||
hidden_size=3072,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.2-3b-8bit": ModelCard(
|
||||
short_id="llama-3.2-3b-8bit",
|
||||
model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-8bit"),
|
||||
name="Llama 3.2 3B (8-bit)",
|
||||
description="""Llama 3.2 is a large language model trained on the Llama 3.2 dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.2-3B-Instruct-8bit"),
|
||||
pretty_name="Llama 3.2 3B (8-bit)",
|
||||
storage_size=Memory.from_mb(3339),
|
||||
n_layers=28,
|
||||
hidden_size=3072,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# llama-3.3
|
||||
"llama-3.3-70b": ModelCard(
|
||||
short_id="llama-3.3-70b",
|
||||
model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-4bit"),
|
||||
name="Llama 3.3 70B (4-bit)",
|
||||
description="""The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-4bit"),
|
||||
pretty_name="Llama 3.3 70B",
|
||||
storage_size=Memory.from_mb(38769),
|
||||
n_layers=80,
|
||||
hidden_size=8192,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.3-70b-8bit": ModelCard(
|
||||
short_id="llama-3.3-70b-8bit",
|
||||
model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-8bit"),
|
||||
name="Llama 3.3 70B (8-bit)",
|
||||
description="""The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Llama-3.3-70B-Instruct-8bit"),
|
||||
pretty_name="Llama 3.3 70B (8-bit)",
|
||||
storage_size=Memory.from_mb(73242),
|
||||
n_layers=80,
|
||||
hidden_size=8192,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"llama-3.3-70b-fp16": ModelCard(
|
||||
short_id="llama-3.3-70b-fp16",
|
||||
model_id=ModelId("mlx-community/llama-3.3-70b-instruct-fp16"),
|
||||
name="Llama 3.3 70B (FP16)",
|
||||
description="""The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/llama-3.3-70b-instruct-fp16"),
|
||||
pretty_name="Llama 3.3 70B (FP16)",
|
||||
storage_size=Memory.from_mb(137695),
|
||||
n_layers=80,
|
||||
hidden_size=8192,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# qwen3
|
||||
"qwen3-0.6b": ModelCard(
|
||||
short_id="qwen3-0.6b",
|
||||
model_id=ModelId("mlx-community/Qwen3-0.6B-4bit"),
|
||||
name="Qwen3 0.6B (4-bit)",
|
||||
description="""Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-0.6B-4bit"),
|
||||
pretty_name="Qwen3 0.6B (4-bit)",
|
||||
storage_size=Memory.from_mb(327),
|
||||
n_layers=28,
|
||||
hidden_size=1024,
|
||||
supports_tensor=False,
|
||||
),
|
||||
),
|
||||
"qwen3-0.6b-8bit": ModelCard(
|
||||
short_id="qwen3-0.6b-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-0.6B-8bit"),
|
||||
name="Qwen3 0.6B (8-bit)",
|
||||
description="""Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-0.6B-8bit"),
|
||||
pretty_name="Qwen3 0.6B (8-bit)",
|
||||
storage_size=Memory.from_mb(666),
|
||||
n_layers=28,
|
||||
hidden_size=1024,
|
||||
supports_tensor=False,
|
||||
),
|
||||
),
|
||||
"qwen3-30b": ModelCard(
|
||||
short_id="qwen3-30b",
|
||||
model_id=ModelId("mlx-community/Qwen3-30B-A3B-4bit"),
|
||||
name="Qwen3 30B A3B (4-bit)",
|
||||
description="""Qwen3 30B is a large language model trained on the Qwen3 30B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-30B-A3B-4bit"),
|
||||
pretty_name="Qwen3 30B A3B (4-bit)",
|
||||
storage_size=Memory.from_mb(16797),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-30b-8bit": ModelCard(
|
||||
short_id="qwen3-30b-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-30B-A3B-8bit"),
|
||||
name="Qwen3 30B A3B (8-bit)",
|
||||
description="""Qwen3 30B is a large language model trained on the Qwen3 30B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-30B-A3B-8bit"),
|
||||
pretty_name="Qwen3 30B A3B (8-bit)",
|
||||
storage_size=Memory.from_mb(31738),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-80b-a3B-4bit": ModelCard(
|
||||
short_id="qwen3-80b-a3B-4bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"),
|
||||
name="Qwen3 80B A3B (4-bit)",
|
||||
description="""Qwen3 80B""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"),
|
||||
pretty_name="Qwen3 80B A3B (4-bit)",
|
||||
storage_size=Memory.from_mb(44800),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-80b-a3B-8bit": ModelCard(
|
||||
short_id="qwen3-80b-a3B-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"),
|
||||
name="Qwen3 80B A3B (8-bit)",
|
||||
description="""Qwen3 80B""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"),
|
||||
pretty_name="Qwen3 80B A3B (8-bit)",
|
||||
storage_size=Memory.from_mb(84700),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-80b-a3B-thinking-4bit": ModelCard(
|
||||
short_id="qwen3-80b-a3B-thinking-4bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"),
|
||||
name="Qwen3 80B A3B Thinking (4-bit)",
|
||||
description="""Qwen3 80B Reasoning model""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"),
|
||||
pretty_name="Qwen3 80B A3B (4-bit)",
|
||||
storage_size=Memory.from_mb(84700),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-80b-a3B-thinking-8bit": ModelCard(
|
||||
short_id="qwen3-80b-a3B-thinking-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"),
|
||||
name="Qwen3 80B A3B Thinking (8-bit)",
|
||||
description="""Qwen3 80B Reasoning model""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"),
|
||||
pretty_name="Qwen3 80B A3B (8-bit)",
|
||||
storage_size=Memory.from_mb(84700),
|
||||
n_layers=48,
|
||||
hidden_size=2048,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-235b-a22b-4bit": ModelCard(
|
||||
short_id="qwen3-235b-a22b-4bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"),
|
||||
name="Qwen3 235B A22B (4-bit)",
|
||||
description="""Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"),
|
||||
pretty_name="Qwen3 235B A22B (4-bit)",
|
||||
storage_size=Memory.from_gb(132),
|
||||
n_layers=94,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-235b-a22b-8bit": ModelCard(
|
||||
short_id="qwen3-235b-a22b-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"),
|
||||
name="Qwen3 235B A22B (8-bit)",
|
||||
description="""Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"),
|
||||
pretty_name="Qwen3 235B A22B (8-bit)",
|
||||
storage_size=Memory.from_gb(250),
|
||||
n_layers=94,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-coder-480b-a35b-4bit": ModelCard(
|
||||
short_id="qwen3-coder-480b-a35b-4bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"),
|
||||
name="Qwen3 Coder 480B A35B (4-bit)",
|
||||
description="""Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"),
|
||||
pretty_name="Qwen3 Coder 480B A35B (4-bit)",
|
||||
storage_size=Memory.from_gb(270),
|
||||
n_layers=62,
|
||||
hidden_size=6144,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"qwen3-coder-480b-a35b-8bit": ModelCard(
|
||||
short_id="qwen3-coder-480b-a35b-8bit",
|
||||
model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"),
|
||||
name="Qwen3 Coder 480B A35B (8-bit)",
|
||||
description="""Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"),
|
||||
pretty_name="Qwen3 Coder 480B A35B (8-bit)",
|
||||
storage_size=Memory.from_gb(540),
|
||||
n_layers=62,
|
||||
hidden_size=6144,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# gpt-oss
|
||||
"gpt-oss-120b-MXFP4-Q8": ModelCard(
|
||||
short_id="gpt-oss-120b-MXFP4-Q8",
|
||||
model_id=ModelId("mlx-community/gpt-oss-120b-MXFP4-Q8"),
|
||||
name="GPT-OSS 120B (MXFP4-Q8, MLX)",
|
||||
description="""OpenAI's GPT-OSS 120B is a 117B-parameter Mixture-of-Experts model designed for high-reasoning and general-purpose use; this variant is a 4-bit MLX conversion for Apple Silicon.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/gpt-oss-120b-MXFP4-Q8"),
|
||||
pretty_name="GPT-OSS 120B (MXFP4-Q8, MLX)",
|
||||
storage_size=Memory.from_kb(68_996_301),
|
||||
n_layers=36,
|
||||
hidden_size=2880,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"gpt-oss-20b-MXFP4-Q8": ModelCard(
|
||||
short_id="gpt-oss-20b-MXFP4-Q8",
|
||||
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
|
||||
name="GPT-OSS 20B (MXFP4-Q8, MLX)",
|
||||
description="""OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this variant is a 4-bit MLX conversion for Apple Silicon.""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/gpt-oss-20b-MXFP4-Q8"),
|
||||
pretty_name="GPT-OSS 20B (MXFP4-Q8, MLX)",
|
||||
storage_size=Memory.from_kb(11_744_051),
|
||||
n_layers=24,
|
||||
hidden_size=2880,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# glm 4.5
|
||||
"glm-4.5-air-8bit": ModelCard(
|
||||
# Needs to be quantized g32 or g16 to work with tensor parallel
|
||||
short_id="glm-4.5-air-8bit",
|
||||
model_id=ModelId("mlx-community/GLM-4.5-Air-8bit"),
|
||||
name="GLM 4.5 Air 8bit",
|
||||
description="""GLM 4.5 Air 8bit""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.5-Air-8bit"),
|
||||
pretty_name="GLM 4.5 Air 8bit",
|
||||
storage_size=Memory.from_gb(114),
|
||||
n_layers=46,
|
||||
hidden_size=4096,
|
||||
supports_tensor=False,
|
||||
),
|
||||
),
|
||||
"glm-4.5-air-bf16": ModelCard(
|
||||
short_id="glm-4.5-air-bf16",
|
||||
model_id=ModelId("mlx-community/GLM-4.5-Air-bf16"),
|
||||
name="GLM 4.5 Air bf16",
|
||||
description="""GLM 4.5 Air bf16""",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.5-Air-bf16"),
|
||||
pretty_name="GLM 4.5 Air bf16",
|
||||
storage_size=Memory.from_gb(214),
|
||||
n_layers=46,
|
||||
hidden_size=4096,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# glm 4.7
|
||||
"glm-4.7-4bit": ModelCard(
|
||||
short_id="glm-4.7-4bit",
|
||||
model_id=ModelId("mlx-community/GLM-4.7-4bit"),
|
||||
name="GLM 4.7 4bit",
|
||||
description="GLM 4.7 4bit",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.7-4bit"),
|
||||
pretty_name="GLM 4.7 4bit",
|
||||
storage_size=Memory.from_bytes(198556925568),
|
||||
n_layers=91,
|
||||
hidden_size=5120,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"glm-4.7-6bit": ModelCard(
|
||||
short_id="glm-4.7-6bit",
|
||||
model_id=ModelId("mlx-community/GLM-4.7-6bit"),
|
||||
name="GLM 4.7 6bit",
|
||||
description="GLM 4.7 6bit",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.7-6bit"),
|
||||
pretty_name="GLM 4.7 6bit",
|
||||
storage_size=Memory.from_bytes(286737579648),
|
||||
n_layers=91,
|
||||
hidden_size=5120,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"glm-4.7-8bit-gs32": ModelCard(
|
||||
short_id="glm-4.7-8bit-gs32",
|
||||
model_id=ModelId("mlx-community/GLM-4.7-8bit-gs32"),
|
||||
name="GLM 4.7 8bit (gs32)",
|
||||
description="GLM 4.7 8bit (gs32)",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/GLM-4.7-8bit-gs32"),
|
||||
pretty_name="GLM 4.7 8bit (gs32)",
|
||||
storage_size=Memory.from_bytes(396963397248),
|
||||
n_layers=91,
|
||||
hidden_size=5120,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
# minimax-m2
|
||||
"minimax-m2.1-8bit": ModelCard(
|
||||
short_id="minimax-m2.1-8bit",
|
||||
model_id=ModelId("mlx-community/MiniMax-M2.1-8bit"),
|
||||
name="MiniMax M2.1 8bit",
|
||||
description="MiniMax M2.1 8bit",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/MiniMax-M2.1-8bit"),
|
||||
pretty_name="MiniMax M2.1 8bit",
|
||||
storage_size=Memory.from_bytes(242986745856),
|
||||
n_layers=61,
|
||||
hidden_size=3072,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
"minimax-m2.1-3bit": ModelCard(
|
||||
short_id="minimax-m2.1-3bit",
|
||||
model_id=ModelId("mlx-community/MiniMax-M2.1-3bit"),
|
||||
name="MiniMax M2.1 3bit",
|
||||
description="MiniMax M2.1 3bit",
|
||||
tags=[],
|
||||
metadata=ModelMetadata(
|
||||
model_id=ModelId("mlx-community/MiniMax-M2.1-3bit"),
|
||||
pretty_name="MiniMax M2.1 3bit",
|
||||
storage_size=Memory.from_bytes(100086644736),
|
||||
n_layers=61,
|
||||
hidden_size=3072,
|
||||
supports_tensor=True,
|
||||
),
|
||||
),
|
||||
}
|
||||
async def save(self, path: Path) -> None:
    """Serialize this card to TOML and write it to ``path``.

    Args:
        path: Destination file; it is opened in ``"w"`` mode, so any
            existing content is replaced.

    Raises:
        Whatever ``self.model_dump()``, ``tomlkit.dumps`` or the file
        operations raise; nothing is caught here.
    """
    # Serialize *before* opening the file: opening with "w" truncates the
    # target, so a serialization error must not be able to leave an
    # existing card file empty.
    data = tomlkit.dumps(self.model_dump())  # pyright: ignore[reportUnknownMemberType]
    async with await open_file(path, "w") as f:
        await f.write(data)
|
||||
|
||||
@staticmethod
async def from_hf(model_id: str) -> "ModelCard":
    """Build a card for ``model_id`` using metadata from ``get_model_meta``.

    The segment after the last ``/`` of ``model_id`` is used both as the
    card's ``short_id`` and as its display ``name``; ``tags`` is left empty.
    """
    *_, repo_name = model_id.split("/")
    card_model_id = ModelId(model_id)
    metadata = await get_model_meta(model_id)
    return ModelCard(
        short_id=repo_name,
        model_id=card_model_id,
        name=repo_name,
        description=f"Custom model from {model_id}",
        tags=[],
        metadata=metadata,
    )
|
||||
|
||||
@@ -6,7 +6,6 @@ from huggingface_hub import model_info
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from exo.shared.models.model_cards import MODEL_CARDS
|
||||
from exo.shared.types.memory import Memory
|
||||
from exo.shared.types.models import ModelId, ModelMetadata
|
||||
from exo.worker.download.download_utils import (
|
||||
@@ -108,19 +107,13 @@ async def _get_model_meta(model_id: str) -> ModelMetadata:
|
||||
config_data = await get_config_data(model_id)
|
||||
num_layers = config_data.layer_count
|
||||
mem_size_bytes = await get_safetensors_size(model_id)
|
||||
model_card = next(
|
||||
(card for card in MODEL_CARDS.values() if card.model_id == ModelId(model_id)),
|
||||
None,
|
||||
)
|
||||
|
||||
return ModelMetadata(
|
||||
model_id=ModelId(model_id),
|
||||
pretty_name=model_card.name if model_card is not None else model_id,
|
||||
pretty_name=model_id,
|
||||
storage_size=mem_size_bytes,
|
||||
n_layers=num_layers,
|
||||
hidden_size=config_data.hidden_size or 0,
|
||||
# TODO: all custom models currently do not support tensor. We could add a dynamic test for this?
|
||||
supports_tensor=model_card.metadata.supports_tensor
|
||||
if model_card is not None
|
||||
else False,
|
||||
supports_tensor=False,
|
||||
)
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
from enum import Enum
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from exo.shared.types.api import GenerationStats
|
||||
from exo.utils.pydantic_ext import TaggedModel
|
||||
@@ -15,17 +12,6 @@ class ChunkType(str, Enum):
|
||||
Image = "Image"
|
||||
|
||||
|
||||
class ToolCallFunction(BaseModel, frozen=True):
    # Name of the function being called.
    name: str
    # Call arguments carried as a single string.
    arguments: str
|
||||
|
||||
|
||||
class ToolCall(BaseModel, frozen=True):
    # Identifier of this individual call.
    id: str
    # Only "function" calls are representable; it is also the default.
    type: Literal["function"] = "function"
    # The function name and its string-encoded arguments.
    function: ToolCallFunction
|
||||
|
||||
|
||||
class BaseChunk(TaggedModel):
|
||||
idx: int
|
||||
model: ModelId
|
||||
@@ -36,7 +22,6 @@ class TokenChunk(BaseChunk):
|
||||
token_id: int
|
||||
finish_reason: FinishReason | None = None
|
||||
stats: GenerationStats | None = None
|
||||
tool_calls: list[ToolCall] | None = None
|
||||
|
||||
|
||||
class ImageChunk(BaseChunk):
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from exo.shared.types.api import FinishReason, GenerationStats
|
||||
from exo.shared.types.chunks import ToolCall
|
||||
from exo.utils.pydantic_ext import TaggedModel
|
||||
|
||||
|
||||
@@ -17,7 +16,6 @@ class GenerationResponse(BaseRunnerResponse):
|
||||
# logprobs: list[float] | None = None # too big. we can change to be top-k
|
||||
finish_reason: FinishReason | None = None
|
||||
stats: GenerationStats | None = None
|
||||
tool_calls: list[ToolCall] | None = None
|
||||
|
||||
|
||||
class FinishedResponse(BaseRunnerResponse):
|
||||
|
||||
@@ -20,7 +20,6 @@ except ImportError:
|
||||
|
||||
from mlx_lm.models.cache import KVCache, QuantizedKVCache, RotatingKVCache
|
||||
from mlx_lm.models.deepseek_v3 import DeepseekV3Model
|
||||
from mlx_lm.models.gpt_oss import Model as GptOssModel
|
||||
from mlx_lm.tokenizer_utils import TokenizerWrapper
|
||||
|
||||
from exo.worker.engines.mlx.constants import (
|
||||
@@ -366,8 +365,6 @@ def apply_chat_template(
|
||||
tools=chat_task_data.tools,
|
||||
)
|
||||
|
||||
logger.info(prompt)
|
||||
|
||||
return prompt
|
||||
|
||||
|
||||
@@ -399,11 +396,6 @@ def make_kv_cache(
|
||||
) -> list[KVCache | RotatingKVCache | QuantizedKVCache]:
|
||||
assert hasattr(model, "layers")
|
||||
|
||||
# TODO: Do this for all models
|
||||
if hasattr(model, "make_cache") and isinstance(model, GptOssModel):
|
||||
logger.info("Using MLX LM's make cache")
|
||||
return model.make_cache() # type: ignore
|
||||
|
||||
if max_kv_size is None:
|
||||
if KV_CACHE_BITS is None:
|
||||
logger.info("Using default KV cache")
|
||||
|
||||
@@ -1,22 +1,9 @@
|
||||
import json
|
||||
import time
|
||||
from collections.abc import Generator
|
||||
from functools import cache
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
import mlx.core as mx
|
||||
from mlx_lm.models.gpt_oss import Model as GptOssModel
|
||||
from mlx_lm.tokenizer_utils import TokenizerWrapper
|
||||
from openai_harmony import ( # pyright: ignore[reportMissingTypeStubs]
|
||||
HarmonyEncodingName,
|
||||
Role,
|
||||
StreamableParser,
|
||||
load_harmony_encoding,
|
||||
)
|
||||
|
||||
from exo.shared.types.api import ChatCompletionMessageText
|
||||
from exo.shared.types.chunks import TokenChunk, ToolCall, ToolCallFunction
|
||||
from exo.shared.types.chunks import TokenChunk
|
||||
from exo.shared.types.events import (
|
||||
ChunkGenerated,
|
||||
Event,
|
||||
@@ -166,22 +153,11 @@ def main(
|
||||
_check_for_debug_prompts(task_params.messages[0].content)
|
||||
|
||||
# Generate responses using the actual MLX generation
|
||||
mlx_generator = mlx_generate(
|
||||
for response in mlx_generate(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
task=task_params,
|
||||
)
|
||||
|
||||
# GPT-OSS specific parsing to match other model formats.
|
||||
if isinstance(model, GptOssModel):
|
||||
mlx_generator = parse_gpt_oss(mlx_generator)
|
||||
|
||||
# Parse tool calls to place them in the tool calls section
|
||||
mlx_generator = parse_tool_calls(
|
||||
mlx_generator, tokenizer, task_params.tools
|
||||
)
|
||||
|
||||
for response in mlx_generator:
|
||||
):
|
||||
match response:
|
||||
case GenerationResponse():
|
||||
if shard_metadata.device_rank == 0:
|
||||
@@ -195,7 +171,6 @@ def main(
|
||||
token_id=response.token,
|
||||
finish_reason=response.finish_reason,
|
||||
stats=response.stats,
|
||||
tool_calls=response.tool_calls,
|
||||
),
|
||||
)
|
||||
)
|
||||
@@ -232,135 +207,6 @@ def main(
|
||||
break
|
||||
|
||||
|
||||
@cache
def get_gpt_oss_encoding():
    """Return the Harmony GPT-OSS encoding.

    Wrapped in ``functools.cache`` so ``load_harmony_encoding`` runs only on
    the first call; later calls reuse the same object.
    """
    return load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
|
||||
|
||||
|
||||
def parse_gpt_oss(
    responses: Generator[GenerationResponse],
) -> Generator[GenerationResponse]:
    """Re-emit GPT-OSS responses as text deltas with ``<think>`` markers.

    Each response's token is fed to a Harmony ``StreamableParser``. While the
    parser's current channel is ``"analysis"`` the output is bracketed by a
    synthetic ``<think>`` / ``</think>`` pair, and every non-empty content
    delta is yielded as a copy of the response whose ``text`` is that delta.

    When a response has a ``finish_reason``, an open ``<think>`` is closed,
    the original response is yielded unchanged, and iteration stops.
    """
    parser = StreamableParser(get_gpt_oss_encoding(), role=Role.ASSISTANT)
    in_analysis = False

    for item in responses:
        parser.process(item.token)

        text_delta = parser.last_content_delta
        channel_is_analysis = parser.current_channel == "analysis"

        # Entering or leaving the analysis channel emits exactly one marker.
        if channel_is_analysis != in_analysis:
            in_analysis = channel_is_analysis
            marker = "<think>" if channel_is_analysis else "</think>"
            yield item.model_copy(update={"text": marker})

        if text_delta:
            yield item.model_copy(update={"text": text_delta})

        if item.finish_reason is None:
            continue

        # Generation ended: close a dangling think block, then pass the
        # finishing response through as-is.
        if in_analysis:
            yield item.model_copy(update={"text": "</think>"})
        yield item
        return
|
||||
|
||||
|
||||
def _generate_tool_call_id() -> str:
|
||||
return f"call_{uuid4().hex[:24]}"
|
||||
|
||||
|
||||
def _parse_tool_call_content(
|
||||
content: str,
|
||||
tokenizer: TokenizerWrapper,
|
||||
tools: list[dict[str, Any]] | None,
|
||||
) -> ToolCall | None:
|
||||
content = content.strip()
|
||||
if not content:
|
||||
return None
|
||||
|
||||
tool_parser: Any = getattr(tokenizer, "tool_parser", None)
|
||||
if tool_parser is None:
|
||||
logger.warning("No tool_parser available for tokenizer")
|
||||
return None
|
||||
|
||||
try:
|
||||
parsed: dict[str, Any] = tool_parser(content, tools) # pyright: ignore[reportAny]
|
||||
if parsed and "name" in parsed:
|
||||
arguments: Any = parsed.get("arguments", {}) # pyright: ignore[reportAny]
|
||||
arguments_str: str = (
|
||||
json.dumps(arguments)
|
||||
if not isinstance(arguments, str)
|
||||
else arguments
|
||||
)
|
||||
return ToolCall(
|
||||
id=_generate_tool_call_id(),
|
||||
type="function",
|
||||
function=ToolCallFunction(
|
||||
name=str(parsed["name"]), # pyright: ignore[reportAny]
|
||||
arguments=arguments_str,
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"tool_parser failed: {e}")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def parse_tool_calls(
    responses: Generator[GenerationResponse],
    tokenizer: TokenizerWrapper,
    tools: list[dict[str, Any]] | None,
) -> Generator[GenerationResponse]:
    """Strip tool-call markup from the stream and attach parsed tool calls.

    Responses are passed through untouched when the tokenizer does not
    report ``has_tool_calling``, when ``tools`` is ``None``, or when the
    tokenizer lacks ``tool_call_start`` / ``tool_call_end``.

    Otherwise a response whose ``text`` equals the start marker opens a
    tool-call section, text up to the end marker is buffered instead of
    yielded, and the buffered text is parsed with
    ``_parse_tool_call_content``. When a response with a ``finish_reason``
    arrives and at least one call was parsed, it is yielded with
    ``finish_reason="tool_calls"`` and the parsed calls in ``tool_calls``.

    A finishing response is always yielded, even when its own text is a
    marker or part of an unterminated tool call (for example generation cut
    off mid-call). In that case its ``text`` is replaced by ``""`` so the
    markup stays hidden while the end of generation is still signalled.

    Args:
        responses: Upstream generation responses.
        tokenizer: Source of the tool-calling attributes described above.
        tools: Tool definitions from the request; ``None`` disables parsing.

    Yields:
        Responses with tool-call markup removed.
    """
    has_tool_calling = getattr(tokenizer, "has_tool_calling", False)
    if not has_tool_calling or tools is None:
        yield from responses
        return

    tool_call_start: str | None = getattr(tokenizer, "tool_call_start", None)
    tool_call_end: str | None = getattr(tokenizer, "tool_call_end", None)

    if tool_call_start is None or tool_call_end is None:
        yield from responses
        return

    in_tool_call = False
    tool_call_buffer: list[str] = []
    pending_tool_calls: list[ToolCall] = []

    for response in responses:
        # `hidden` marks responses whose text is tool-call markup and must
        # not reach the caller as visible text.
        hidden = True
        if response.text == tool_call_start:
            in_tool_call = True
            tool_call_buffer = []
        elif response.text == tool_call_end:
            in_tool_call = False
            parsed = _parse_tool_call_content(
                "".join(tool_call_buffer), tokenizer, tools
            )
            if parsed is not None:
                pending_tool_calls.append(parsed)
        elif in_tool_call:
            tool_call_buffer.append(response.text)
        else:
            hidden = False

        if response.finish_reason is None:
            if not hidden:
                yield response
            continue

        # Finishing response: never drop it, otherwise the consumer would
        # not be told that generation ended.
        update: dict[str, Any] = {}
        if hidden:
            update["text"] = ""
        if pending_tool_calls:
            update["finish_reason"] = "tool_calls"
            update["tool_calls"] = pending_tool_calls
        yield response.model_copy(update=update) if update else response
|
||||
|
||||
|
||||
# Magic strings for the runner's debug failure modes (fail / OOM / timeout).
# NOTE(review): presumably matched against prompt content by
# _check_for_debug_prompts — confirm against that function.
EXO_RUNNER_MUST_FAIL = "EXO RUNNER MUST FAIL"
|
||||
EXO_RUNNER_MUST_OOM = "EXO RUNNER MUST OOM"
|
||||
EXO_RUNNER_MUST_TIMEOUT = "EXO RUNNER MUST TIMEOUT"
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import http.client
|
||||
import time
|
||||
|
||||
from anyio import create_task_group, to_thread
|
||||
from loguru import logger
|
||||
@@ -7,8 +6,6 @@ from loguru import logger
|
||||
from exo.shared.topology import Topology
|
||||
from exo.shared.types.common import NodeId
|
||||
|
||||
BAD_STATUSLINE_ATTEMPTS = 3
|
||||
|
||||
|
||||
async def check_reachability(
|
||||
target_ip: str,
|
||||
@@ -18,9 +15,8 @@ async def check_reachability(
|
||||
) -> None:
|
||||
"""Check if a node is reachable at the given IP and verify its identity."""
|
||||
|
||||
# TODO: use an async http client
|
||||
def _fetch_remote_node_id(*, attempt: int = 1) -> NodeId | None:
|
||||
connection = http.client.HTTPConnection(target_ip, 52415, timeout=3)
|
||||
def _fetch_remote_node_id() -> NodeId | None:
|
||||
connection = http.client.HTTPConnection(target_ip, 52415, timeout=1)
|
||||
try:
|
||||
connection.request("GET", "/node_id")
|
||||
response = connection.getresponse()
|
||||
@@ -36,16 +32,7 @@ async def check_reachability(
|
||||
return NodeId(body) or None
|
||||
except OSError:
|
||||
return None
|
||||
except http.client.BadStatusLine:
|
||||
if attempt >= BAD_STATUSLINE_ATTEMPTS:
|
||||
logger.warning(
|
||||
f"BadStatusLine from {target_ip}, after {attempt} attempts, assuming connection to {expected_node_id} has dropped"
|
||||
)
|
||||
return None
|
||||
time.sleep(1)
|
||||
return _fetch_remote_node_id(attempt=attempt + 1)
|
||||
except http.client.HTTPException as e:
|
||||
logger.warning(f"HTTPException from {target_ip}: {type(e).__name__}: {e}")
|
||||
except http.client.HTTPException:
|
||||
return None
|
||||
finally:
|
||||
connection.close()
|
||||
|
||||
Reference in New Issue
Block a user