Files
exo/bench/livecodebench_runner.py
Ryuichi Leo Takashige 7608a5e7f4 told it to test...
2026-02-02 23:38:34 +00:00

288 lines
10 KiB
Python

#!/usr/bin/env python3
# pyright: reportAny=false, reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false
"""
LiveCodeBench runner wrapper for exo.
This wrapper allows running LiveCodeBench with custom OpenAI-compatible endpoints
by dynamically registering models and configuring the OpenAI client.
Usage:
python -m bench.livecodebench_runner --model my-model --base-url http://localhost:52415/v1 [lcb args...]
The wrapper:
1. Registers the custom model in LiveCodeBench's model registry
2. Sets up environment variables for the OpenAI client
3. Runs the standard LiveCodeBench runner
Requires LiveCodeBench to be installed:
git clone https://github.com/LiveCodeBench/LiveCodeBench
cd LiveCodeBench && uv pip install -e .
"""
from __future__ import annotations
import argparse
import multiprocessing
import os
import signal
import sys
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, NoReturn
if TYPE_CHECKING:
from typing import Any
def _cleanup_and_exit(exit_code: int = 130) -> NoReturn:
    """Terminate all child processes and leave the interpreter immediately.

    Every live ``multiprocessing`` child is asked to terminate, given up to
    one second to stop, and killed if it is still alive afterwards. The
    process then ends through ``os._exit`` so that no cleanup handler can
    hang the shutdown.

    Args:
        exit_code: Process exit status (default 130).
    """
    for child in multiprocessing.active_children():
        child.terminate()
        child.join(timeout=1)
        if child.is_alive():
            child.kill()

    # os._exit() bypasses normal interpreter shutdown, including the flush of
    # stdio buffers, so flush explicitly or already-printed output is lost.
    for stream in (sys.stdout, sys.stderr):
        try:
            stream.flush()
        except (AttributeError, OSError, RuntimeError, ValueError):
            # Best effort only: a missing/closed stream, or a reentrant flush
            # when called from a signal handler, must not prevent the exit.
            pass

    # Force exit to avoid hanging on cleanup
    os._exit(exit_code)
def _signal_handler(signum: int, frame: object) -> NoReturn:
    """Handle a termination signal by stopping child processes and exiting.

    The exit status is ``128 + signum``, the usual shell convention for a
    process ended by a signal: SIGINT still yields 130, while SIGTERM now
    yields 143 instead of being reported as an interrupt.

    Args:
        signum: Number of the signal that was received.
        frame: Interrupted stack frame (unused).
    """
    _cleanup_and_exit(128 + signum)
def _is_lcb_root(candidate: Path) -> bool:
    """Return True if *candidate* holds LiveCodeBench's few-shot prompt directory."""
    return (candidate / "lcb_runner" / "prompts" / "few_shot_examples").exists()


def get_lcb_directory() -> Path | None:
    """Find the LiveCodeBench installation directory.

    LiveCodeBench uses relative paths like 'lcb_runner/prompts/few_shot_examples/...'
    which require running from the LiveCodeBench directory.

    Candidates are tried in this order, and the first one that contains
    ``lcb_runner/prompts/few_shot_examples`` wins:

    1. the ``LIVECODEBENCH_DIR`` environment variable,
    2. the parent of the installed ``lcb_runner`` package,
    3. ``LiveCodeBench`` directories next to this script's parent directory.

    Returns:
        The matching directory, or ``None`` if no candidate qualifies.
    """
    # 1. An explicit override takes precedence.
    if env_path := os.environ.get("LIVECODEBENCH_DIR"):
        lcb_path = Path(env_path)
        if _is_lcb_root(lcb_path):
            return lcb_path

    # 2. Locate the package without executing its module code; importing it
    #    here would trigger the relative-path lookups that fail outside the
    #    LiveCodeBench directory.
    import importlib.util

    try:
        spec = importlib.util.find_spec("lcb_runner")
    except (ImportError, ValueError):
        # find_spec raises ValueError when an already-imported module has
        # ``__spec__`` set to None; treat that like "not found".
        spec = None
    if spec and spec.origin:
        # spec.origin is the __init__.py path, go up two levels
        lcb_path = Path(spec.origin).parent.parent
        if _is_lcb_root(lcb_path):
            return lcb_path

    # 3. Common checkout locations relative to this script.
    script_dir = Path(__file__).parent.parent  # exo/
    common_locations = (
        script_dir / "LiveCodeBench",  # exo/LiveCodeBench
        script_dir.parent / "LiveCodeBench",  # sibling to exo
    )
    for loc in common_locations:
        if _is_lcb_root(loc):
            return loc

    return None
def setup_custom_model(model_name: str, base_url: str) -> None:
    """Make *model_name* available in LiveCodeBench's model registry.

    If the name is already present in ``LanguageModelStore`` nothing is
    changed. Otherwise a new ``LanguageModel`` entry is appended to
    ``LanguageModelList`` and stored under *model_name*.

    Args:
        model_name: Name used both as the registry key and the model repr.
        base_url: Endpoint URL, recorded as the entry's ``link``.

    Raises:
        SystemExit: With status 1 when ``lcb_runner`` cannot be imported.
    """
    try:
        from lcb_runner.lm_styles import (  # pyright: ignore[reportMissingImports]
            LanguageModel,
            LanguageModelList,
            LanguageModelStore,
            LMStyle,
        )
    except ImportError as import_error:
        install_hint = (
            "Error: LiveCodeBench not installed. Install with:\n"
            " git clone https://github.com/LiveCodeBench/LiveCodeBench\n"
            " cd LiveCodeBench && uv pip install -e ."
        )
        print(install_hint, file=sys.stderr)
        raise SystemExit(1) from import_error

    if model_name not in LanguageModelStore:
        # The OpenAIChat style is meant to route requests through oai_runner,
        # which respects OPENAI_BASE_URL.
        entry = LanguageModel(
            model_name=model_name,
            model_repr=model_name,
            model_style=LMStyle.OpenAIChat,
            release_date=datetime.now(),
            link=base_url,
        )
        # Register in both the ordered list and the name-keyed store.
        LanguageModelList.append(entry)
        LanguageModelStore[model_name] = entry
def patch_openai_client(base_url: str) -> None:
    """Patch LiveCodeBench's OpenAI client factory to use a custom base URL.

    Rebinds ``lcb_runner.runner.oai_runner.OpenAI`` to a wrapper that fills in
    ``base_url`` and, when no key is configured, a placeholder ``api_key``
    before delegating to the original callable. Only clients constructed
    through that module attribute after this call are affected.

    Args:
        base_url: OpenAI-compatible endpoint to inject when the caller does
            not supply one.

    Raises:
        SystemExit: With status 1 when ``lcb_runner`` cannot be imported.
    """
    try:
        from lcb_runner.runner import oai_runner  # noqa: I001 # pyright: ignore[reportMissingImports]
    except ImportError as e:
        print(f"Error importing required modules: {e}", file=sys.stderr)
        raise SystemExit(1) from e

    # Keep a handle on the real constructor so the wrapper can delegate to it.
    original_init = oai_runner.OpenAI

    def patched_openai(*args: Any, **kwargs: Any) -> Any:
        """Create OpenAI client with custom base_url."""
        # Treat an explicit None like a missing argument: callers that pass
        # e.g. ``api_key=os.getenv(...)`` hand over None when the variable is
        # unset, and that must not count as "already configured".
        if kwargs.get("base_url") is None:
            kwargs["base_url"] = base_url
        # Use dummy API key if not set (exo doesn't require auth)
        if kwargs.get("api_key") is None and not os.getenv("OPENAI_KEY"):
            kwargs["api_key"] = os.getenv("OPENAI_API_KEY", "exo-local")
        return original_init(*args, **kwargs)

    # Apply the patch
    oai_runner.OpenAI = patched_openai
def _exit_status(code: object) -> int:
    """Translate a ``SystemExit.code`` payload into an integer exit status."""
    if code is None:
        # A bare sys.exit() means success, not failure.
        return 0
    if isinstance(code, int):
        return code
    # Any other payload is an error message; report it and fail.
    print(code, file=sys.stderr)
    return 1


def _patch_output_path(lcb_main_module: Any, path_utils: Any, output_base: str) -> None:
    """Redirect LiveCodeBench's result paths from ``output/`` to *output_base*."""
    original_get_output_path = path_utils.get_output_path
    default_prefix = "output/"

    def patched_get_output_path(model_repr: str, runner_args: Any) -> str:
        original_path = original_get_output_path(model_repr, runner_args)
        # Drop the default 'output/' prefix, if present, before re-rooting.
        if original_path.startswith(default_prefix):
            relative = original_path[len(default_prefix):]
        else:
            relative = original_path
        new_path = str(Path(output_base) / relative)
        path_utils.ensure_dir(new_path)
        print(f"Saving results to: {new_path}")
        return new_path

    path_utils.get_output_path = patched_get_output_path
    # Also patch in main module since it may have imported directly
    if hasattr(lcb_main_module, "get_output_path"):
        lcb_main_module.get_output_path = patched_get_output_path


def _patch_benchmark_limit(lcb_main_module: Any, limit: int) -> None:
    """Wrap ``build_prompt_benchmark`` so it returns at most *limit* problems."""
    original_build = lcb_main_module.build_prompt_benchmark

    def limited_build(*a: Any, **kw: Any) -> Any:
        benchmark, format_prompt = original_build(*a, **kw)
        if len(benchmark) > limit:
            print(f"Limiting benchmark from {len(benchmark)} to {limit} problems")
            benchmark = benchmark[:limit]
        return benchmark, format_prompt

    # Must patch in the main module since it imports the function directly
    lcb_main_module.build_prompt_benchmark = limited_build


def main() -> int:
    """Parse wrapper options, prepare LiveCodeBench, and run its main entry point.

    Steps, in order: install SIGINT/SIGTERM handlers, parse the wrapper's own
    options (unknown options are forwarded to LiveCodeBench), export the
    OpenAI environment variables, change into the LiveCodeBench directory if
    one is found, register the model, patch the OpenAI client, optionally
    patch the output path and benchmark size, then call
    ``lcb_runner.runner.main.main()`` with a rewritten ``sys.argv``.

    ``--limit 0`` leaves the benchmark untruncated; a negative ``--limit`` is
    rejected as a usage error.

    Returns:
        0 on success, the integer status of a ``SystemExit`` raised by
        LiveCodeBench (0 when it carries no code), or 1 on any other error.
    """
    # Set up signal handlers for clean exit
    signal.signal(signal.SIGINT, _signal_handler)
    signal.signal(signal.SIGTERM, _signal_handler)

    parser = argparse.ArgumentParser(
        description="LiveCodeBench runner wrapper for exo",
        epilog="Additional arguments are passed to lcb_runner.runner.main",
    )
    parser.add_argument(
        "--base-url",
        default=os.environ.get("OPENAI_BASE_URL", "http://localhost:52415/v1"),
        help="OpenAI-compatible API base URL (default: OPENAI_BASE_URL or localhost:52415/v1)",
    )
    parser.add_argument(
        "--model",
        required=True,
        help="Model name to use",
    )
    parser.add_argument(
        "--output-dir",
        default=None,
        help="Output directory for results (maps to LiveCodeBench's --custom_output_save_name)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of problems to evaluate (for testing)",
    )

    # Parse known args, pass rest to LiveCodeBench
    args, remaining = parser.parse_known_args()
    if args.limit is not None and args.limit < 0:
        # A negative value would slice problems off the END of the benchmark.
        parser.error("--limit must not be negative")

    # Set up environment
    os.environ["OPENAI_BASE_URL"] = args.base_url
    if "OPENAI_API_KEY" not in os.environ and "OPENAI_KEY" not in os.environ:
        os.environ["OPENAI_API_KEY"] = "exo-local"
        os.environ["OPENAI_KEY"] = "exo-local"

    # Save original directory for output path resolution
    original_cwd = os.getcwd()

    # Change to LiveCodeBench directory before imports that use relative paths
    # LiveCodeBench uses paths like 'lcb_runner/prompts/few_shot_examples/...'
    lcb_dir = get_lcb_directory()
    if lcb_dir:
        os.chdir(lcb_dir)
    else:
        print(
            "Warning: Could not find LiveCodeBench directory. "
            "Relative path imports may fail.",
            file=sys.stderr,
        )

    # Setup custom model and patch client
    setup_custom_model(args.model, args.base_url)
    patch_openai_client(args.base_url)

    # Build arguments for LiveCodeBench runner
    lcb_args = ["--model", args.model]

    # Resolve output directory to absolute path (relative to original cwd)
    output_base: str | None = None
    if args.output_dir:
        output_base = str(Path(original_cwd) / args.output_dir)

    lcb_args.extend(remaining)

    # Run LiveCodeBench
    try:
        from lcb_runner.runner import main as lcb_main_module  # noqa: I001 # pyright: ignore[reportMissingImports]
        from lcb_runner.utils import path_utils  # noqa: I001 # pyright: ignore[reportMissingImports]

        # Patch output path to use our output directory
        if output_base:
            _patch_output_path(lcb_main_module, path_utils, output_base)

        # Patch benchmark loading to support --limit (0 means "no limit")
        if args.limit:
            _patch_benchmark_limit(lcb_main_module, args.limit)

        # Patch sys.argv for argparse in lcb_main
        sys.argv = [sys.argv[0], *lcb_args]
        lcb_main_module.main()
        return 0
    except KeyboardInterrupt:
        print("\nInterrupted by user", file=sys.stderr)
        _cleanup_and_exit(130)
    except SystemExit as e:
        return _exit_status(e.code)
    except Exception as e:
        print(f"Error running LiveCodeBench: {e}", file=sys.stderr)
        return 1
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())