# exo/bench/eval_configs/models.toml
#
# Provenance: commit 131ad0ff36 — "Implement continuous batching (#1642)"
# by rltakashige, 2026-03-09; co-authored-by Evan Quiney <evanev7@gmail.com>.
# Follows the changes made in #1632; closes #1020.
# Model evaluation configurations for exo_eval.
#
# Each [[model]] entry uses `patterns` — a list of substrings matched
# against the model_id. First matching entry wins.
#
# Required fields:
# name, patterns, reasoning
#
# Optional per-model overrides (CLI flags take priority over these):
# temperature, top_p, max_tokens, reasoning_effort
#
# Fallback defaults (when no per-model config):
# reasoning: temperature=1.0, max_tokens=131072, reasoning_effort="high"
# non-reasoning: temperature=0.0, max_tokens=16384
#
# All per-model values below are sourced from official model cards,
# generation_config.json files, and vendor documentation.
# ─── Qwen3.5 (Feb 2026) ─────────────────────────────────────────────
# Source: HuggingFace model cards (Qwen/Qwen3.5-*)
# 35B-A3B thinking general: temp=1.0, top_p=0.95, top_k=20
# 397B thinking: temp=0.6, top_p=0.95, top_k=20
# Non-thinking: temp=0.7, top_p=0.8, top_k=20
# max_tokens: 32768 general, 81920 for complex math/code
# All entries below use the complex math/code budget (81920).

[[model]]
name = "Qwen3.5 2B"
patterns = ["Qwen3.5-2B"]
reasoning = true
temperature = 0.6
top_p = 0.95
max_tokens = 81920

[[model]]
name = "Qwen3.5 9B"
patterns = ["Qwen3.5-9B"]
reasoning = true
temperature = 0.6
top_p = 0.95
max_tokens = 81920

[[model]]
name = "Qwen3.5 27B"
patterns = ["Qwen3.5-27B"]
reasoning = true
temperature = 0.6
top_p = 0.95
max_tokens = 81920

# 35B-A3B is the one Qwen3.5 variant whose card specifies temp=1.0.
[[model]]
name = "Qwen3.5 35B A3B"
patterns = ["Qwen3.5-35B-A3B"]
reasoning = true
temperature = 1.0
top_p = 0.95
max_tokens = 81920

[[model]]
name = "Qwen3.5 122B A10B"
patterns = ["Qwen3.5-122B-A10B"]
reasoning = true
temperature = 0.6
top_p = 0.95
max_tokens = 81920

[[model]]
name = "Qwen3.5 397B A17B"
patterns = ["Qwen3.5-397B-A17B"]
reasoning = true
temperature = 0.6
top_p = 0.95
max_tokens = 81920
# ─── Qwen3 (Apr 2025) ───────────────────────────────────────────────
# Source: HuggingFace model cards (Qwen/Qwen3-*)
# Thinking: temp=0.6, top_p=0.95, top_k=20
# Non-thinking: temp=0.7, top_p=0.8, top_k=20
# max_tokens: 32768 general, 38912 for complex math/code
# Thinking entries use the complex math/code budget (38912).

[[model]]
name = "Qwen3 0.6B"
patterns = ["Qwen3-0.6B"]
reasoning = true
temperature = 0.6
top_p = 0.95
max_tokens = 38912

[[model]]
name = "Qwen3 30B A3B"
patterns = ["Qwen3-30B-A3B"]
reasoning = true
temperature = 0.6
top_p = 0.95
max_tokens = 38912

[[model]]
name = "Qwen3 235B A22B"
patterns = ["Qwen3-235B-A22B"]
reasoning = true
temperature = 0.6
top_p = 0.95
max_tokens = 38912

[[model]]
name = "Qwen3 Next 80B Thinking"
patterns = ["Qwen3-Next-80B-A3B-Thinking"]
reasoning = true
temperature = 0.6
top_p = 0.95
max_tokens = 38912

# Non-thinking variants below use the non-thinking sampling settings.
[[model]]
name = "Qwen3 Next 80B Instruct"
patterns = ["Qwen3-Next-80B-A3B-Instruct"]
reasoning = false
temperature = 0.7
top_p = 0.8
max_tokens = 16384

[[model]]
name = "Qwen3 Coder 480B"
patterns = ["Qwen3-Coder-480B"]
reasoning = false
temperature = 0.7
top_p = 0.8
max_tokens = 16384

[[model]]
name = "Qwen3 Coder Next"
patterns = ["Qwen3-Coder-Next"]
reasoning = false
temperature = 0.7
top_p = 0.8
max_tokens = 16384
# ─── GPT-OSS (OpenAI) ───────────────────────────────────────────────
# Source: OpenAI GitHub README + HuggingFace discussion #21
# temp=1.0, top_p=1.0, NO top_k, NO repetition_penalty
# reasoning_effort supported: low/medium/high
# max_tokens and reasoning_effort are intentionally unset here, so the
# reasoning fallbacks apply (max_tokens=131072, reasoning_effort="high").

[[model]]
name = "GPT-OSS 20B"
patterns = ["gpt-oss-20b"]
reasoning = true
temperature = 1.0
top_p = 1.0

[[model]]
name = "GPT-OSS 120B"
patterns = ["gpt-oss-120b"]
reasoning = true
temperature = 1.0
top_p = 1.0
# ─── DeepSeek ────────────────────────────────────────────────────────
# Source: https://api-docs.deepseek.com/quick_start/parameter_settings
# Coding/Math: temp=0.0, General: temp=1.3, Creative: temp=1.5
# NOTE: DeepSeek API applies nonlinear temp mapping. These are API values.
# When running model directly: API temp 1.0 = model temp 0.3
# We use temp=0.0 for eval (coding/math focus).
[[model]]
name = "DeepSeek V3.1"
patterns = ["DeepSeek-V3.1"]
reasoning = true
temperature = 0.0
# top_p and max_tokens are unset: the reasoning fallback (max_tokens=131072)
# applies.
# ─── GLM (ZhipuAI / THUDM) ──────────────────────────────────────────
# Source: HuggingFace model cards + generation_config.json + docs.z.ai
# GLM 4.5+: temp=1.0, top_p=0.95
# Reasoning tasks: 131072 max_tokens; coding/SWE tasks: temp=0.7

[[model]]
name = "GLM-5"
patterns = ["GLM-5"]
reasoning = true
temperature = 1.0
top_p = 0.95
max_tokens = 131072

# max_tokens unset: reasoning fallback (131072) applies.
[[model]]
name = "GLM 4.5 Air"
patterns = ["GLM-4.5-Air"]
reasoning = true
temperature = 1.0
top_p = 0.95

# Substring "GLM-4.7" (no trailing dash) matches both a bare "GLM-4.7"
# model_id and suffixed variants such as GLM-4.7-Flash. The previous
# pattern "GLM-4.7-" could never match an id ending in plain GLM-4.7,
# contradicting its stated intent. GLM-4.5 ids are caught by the earlier
# entry (first match wins), so this broader substring is safe.
[[model]]
name = "GLM 4.7"
patterns = ["GLM-4.7"]
reasoning = true
temperature = 1.0
top_p = 0.95
max_tokens = 131072
# ─── Kimi (Moonshot AI) ─────────────────────────────────────────────
# Source: HuggingFace model cards (moonshotai/Kimi-K2-*)
# K2-Instruct: temp=0.6
# K2-Thinking: temp=1.0, max_length=262144
# K2.5: thinking temp=1.0, top_p=0.95; instant temp=0.6, top_p=0.95

[[model]]
name = "Kimi K2 Thinking"
patterns = ["Kimi-K2-Thinking"]
reasoning = true
temperature = 1.0
max_tokens = 131072

[[model]]
name = "Kimi K2.5"
patterns = ["Kimi-K2.5"]
reasoning = true
temperature = 1.0
top_p = 0.95
max_tokens = 131072

# Non-reasoning: max_tokens falls back to the non-reasoning default (16384).
[[model]]
name = "Kimi K2 Instruct"
patterns = ["Kimi-K2-Instruct"]
reasoning = false
temperature = 0.6
# ─── MiniMax ─────────────────────────────────────────────────────────
# Source: HuggingFace model cards + generation_config.json
# All models: temp=1.0, top_p=0.95, top_k=40
# NOTE: top_k is not one of the per-model override fields listed in the
# header, so the top_k=40 figure above is informational only.

[[model]]
name = "MiniMax M2.5"
patterns = ["MiniMax-M2.5"]
reasoning = true
temperature = 1.0
top_p = 0.95

[[model]]
name = "MiniMax M2.1"
patterns = ["MiniMax-M2.1"]
reasoning = true
temperature = 1.0
top_p = 0.95
# ─── Step (StepFun) ─────────────────────────────────────────────────
# Source: HuggingFace model card (stepfun-ai/Step-3.5-Flash)
# Reasoning: temp=1.0, top_p=0.95
# General chat: temp=0.6, top_p=0.95
# We use reasoning settings for eval.
[[model]]
name = "Step 3.5 Flash"
patterns = ["Step-3.5-Flash"]
reasoning = true
temperature = 1.0
top_p = 0.95
# max_tokens unset: reasoning fallback (131072) applies.
# ─── Llama (Meta) ───────────────────────────────────────────────────
# Source: generation_config.json + meta-llama/llama-models generation.py
# All variants: temp=0.6, top_p=0.9
# Non-reasoning: max_tokens falls back to the non-reasoning default (16384).
# NOTE(review): the 3.3 entry lists a lowercase pattern variant, which
# suggests pattern matching is case-sensitive — confirm against the matcher.

[[model]]
name = "Llama 3.2 1B"
patterns = ["Llama-3.2-1B"]
reasoning = false
temperature = 0.6
top_p = 0.9

[[model]]
name = "Llama 3.2 3B"
patterns = ["Llama-3.2-3B"]
reasoning = false
temperature = 0.6
top_p = 0.9

[[model]]
name = "Llama 3.1 8B"
patterns = ["Llama-3.1-8B", "Meta-Llama-3.1-8B"]
reasoning = false
temperature = 0.6
top_p = 0.9

[[model]]
name = "Llama 3.1 70B"
patterns = ["Llama-3.1-70B", "Meta-Llama-3.1-70B"]
reasoning = false
temperature = 0.6
top_p = 0.9

[[model]]
name = "Llama 3.3 70B"
patterns = ["Llama-3.3-70B", "llama-3.3-70b"]
reasoning = false
temperature = 0.6
top_p = 0.9