# Mirror of https://github.com/exo-explore/exo.git
# Synced 2025-12-23 22:27:50 -05:00
# 116 lines, 3.0 KiB, YAML
# Simple single-shot benchmark
# Tests 2 instances concurrently on 2 nodes

# Hardware configuration - maps runner labels to instance counts.
# Each key is a runner label; each value is the instance count for that label.
hardware_plan:
  puffin4: 1
  puffin8: 1

# Environment variables to set on each node.
# Values are quoted so they stay strings: environment variables are text, and
# an unquoted 1 would be loaded as an integer.
environment:
  PLACEHOLDER: "placeholder"
  # OVERRIDE_MEMORY_MB: 50000
  MLX_METAL_FAST_SYNCH: "1"

# Timeout for instance and runner readiness (seconds)
timeout_seconds: 1800

# Model instances to run concurrently.
# Commented-out entries are inactive alternatives; uncomment to enable them.
# NOTE(review): the file header mentions 2 instances, but only one model is
# active here - confirm the header is still accurate.
model_ids:
  # - "mlx-community/DeepSeek-V3.1-8bit"
  # - "mlx-community/Kimi-K2-Instruct-4bit"
  - "mlx-community/Kimi-K2-Thinking"
  # - "mlx-community/Qwen3-235B-A22B-4bit"
  # - "mlx-community/Llama-3.3-70B-Instruct-4bit"
  # - "mlx-community/Llama-3.3-70B-Instruct-8bit"
  # - "mlx-community/Llama-3.2-1B-Instruct-4bit"

# Sharding strategy: "Pipeline" or "Tensor"
sharding: "Pipeline"

# Instance type: "MlxRing" or "MlxIbv"
instance_meta: "MlxIbv"

# If true, run requests sequentially (no overlap); if false, fire-and-forget
# (default: false)
no_overlap: true

# Benchmark stages
# pp: 64, 256, 1024, 2048, 4096, 8192, 16384
# g: 64, 512
#
# Each stage is named "pp<prompt_length>_g<generation_length>".
# Active stages: the g64 variant for pp 64 through 8192 (5 iterations each).
# The "simple" stage, every g512 variant and both pp16384 variants are
# commented out.
# NOTE(review): prompt_length / generation_length are presumably token counts
# and time_between_requests presumably seconds - confirm against the runner.
stages:
  # - name: "simple"
  #   prompt_length: 512
  #   generation_length: 10
  #   time_between_requests: 2.0
  #   iterations: 5
  - name: "pp64_g64"
    prompt_length: 64
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp64_g512"
  #   prompt_length: 64
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp256_g64"
    prompt_length: 256
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp256_g512"
  #   prompt_length: 256
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp1024_g64"
    prompt_length: 1024
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp1024_g512"
  #   prompt_length: 1024
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp2048_g64"
    prompt_length: 2048
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp2048_g512"
  #   prompt_length: 2048
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp4096_g64"
    prompt_length: 4096
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp4096_g512"
  #   prompt_length: 4096
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp8192_g64"
    prompt_length: 8192
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp8192_g512"
  #   prompt_length: 8192
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  # - name: "pp16384_g64"
  #   prompt_length: 16384
  #   generation_length: 64
  #   time_between_requests: 2.0
  #   iterations: 10
  # - name: "pp16384_g512"
  #   prompt_length: 16384
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10