# exo/.github/configs/bench_simple.yaml

# Simple single-shot benchmark
# Tests 2 instances concurrently on 2 nodes

# Hardware configuration: each key is a runner label and each value is the
# instance count for that label. As written, this requests one `puffin4`
# and one `puffin8`.
hardware_plan:
  puffin4: 1
  puffin8: 1

# Environment variables to set on each node.
# Values are quoted so they load as strings: environment variables are text,
# and a bare `1` would otherwise be parsed as a YAML integer.
environment:
  PLACEHOLDER: "placeholder"
  # OVERRIDE_MEMORY_MB: "50000"
  MLX_METAL_FAST_SYNCH: "1"

# Maximum time, in seconds, to wait for instances and runners to become ready
# (1800 s = 30 minutes).
timeout_seconds: 1800

# Model instances to run concurrently.
# Exactly one model is enabled below; the commented-out entries are
# alternatives that can be enabled by uncommenting them.
model_ids:
  # - "mlx-community/DeepSeek-V3.1-8bit"
  # - "mlx-community/Kimi-K2-Instruct-4bit"
  - "mlx-community/Kimi-K2-Thinking"
  # - "mlx-community/Qwen3-235B-A22B-4bit"
  # - "mlx-community/Llama-3.3-70B-Instruct-4bit"
  # - "mlx-community/Llama-3.3-70B-Instruct-8bit"
  # - "mlx-community/Llama-3.2-1B-Instruct-4bit"

# Sharding strategy for the model instances.
# Allowed values: "Pipeline" or "Tensor".
sharding: "Pipeline"

# Instance type to create.
# Allowed values: "MlxRing" or "MlxIbv".
instance_meta: "MlxIbv"

# Request scheduling mode:
#   true  - run requests sequentially, with no overlap
#   false - fire-and-forget (this is the default when the key is omitted)
no_overlap: true

# Benchmark stages.
# Each stage is named pp<prompt_length>_g<generation_length> and sets:
#   prompt_length / generation_length - sizes for the stage (presumably tokens; confirm)
#   time_between_requests             - gap between requests (presumably seconds; confirm)
#   iterations                        - repeat count for the stage (presumably requests; confirm)
# Values used across the sweep:
#   pp: 64, 256, 1024, 2048, 4096, 8192, 16384
#   g: 64, 512
# Only the g64 variants up to pp8192 are enabled; the remaining stages are
# kept commented out so they can be re-enabled.
stages:
  # - name: "simple"
  #   prompt_length: 512
  #   generation_length: 10
  #   time_between_requests: 2.0
  #   iterations: 5
  - name: "pp64_g64"
    prompt_length: 64
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp64_g512"
  #   prompt_length: 64
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp256_g64"
    prompt_length: 256
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp256_g512"
  #   prompt_length: 256
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp1024_g64"
    prompt_length: 1024
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp1024_g512"
  #   prompt_length: 1024
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp2048_g64"
    prompt_length: 2048
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp2048_g512"
  #   prompt_length: 2048
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp4096_g64"
    prompt_length: 4096
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp4096_g512"
  #   prompt_length: 4096
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  - name: "pp8192_g64"
    prompt_length: 8192
    generation_length: 64
    time_between_requests: 2.0
    iterations: 5
  # - name: "pp8192_g512"
  #   prompt_length: 8192
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10
  # - name: "pp16384_g64"
  #   prompt_length: 16384
  #   generation_length: 64
  #   time_between_requests: 2.0
  #   iterations: 10
  # - name: "pp16384_g512"
  #   prompt_length: 16384
  #   generation_length: 512
  #   time_between_requests: 2.0
  #   iterations: 10