# pyright: reportAny=false, reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false from __future__ import annotations import argparse import contextlib import json import os import sys import time import tomllib from dataclasses import dataclass, field from pathlib import Path from typing import Any, Literal import httpx from harness import ( ExoClient, ExoHttpError, add_common_instance_args, capture_cluster_snapshot, instance_id_from_instance, nodes_used_in_instance, resolve_model_short_id, run_planning_phase, settle_and_fetch_placements, wait_for_instance_gone, wait_for_instance_ready, ) SCENARIOS_PATH = Path(__file__).parent / "scenarios.toml" @dataclass class Scenario: name: str description: str messages: list[dict[str, Any]] tools: list[dict[str, Any]] expect_tool_call: bool expected_function: str | None = None required_arg_keys: list[str] | None = None tool_result: str | None = None nested_array_key: str | None = None required_item_keys: list[str] | None = None def load_scenarios(path: Path) -> list[Scenario]: with open(path, "rb") as f: data = tomllib.load(f) tools_data = data.get("tools", {}) all_tools: list[dict[str, Any]] = [] tool_by_name: dict[str, dict[str, Any]] = {} for name, defn in tools_data.items(): tool: dict[str, Any] = { "type": "function", "function": { "name": name, "description": defn.get("description", ""), "parameters": { "type": "object", "properties": defn.get("properties", {}), "required": defn.get("required", []), }, }, } all_tools.append(tool) tool_by_name[name] = tool scenarios: list[Scenario] = [] for s in data.get("scenarios", []): if "tools" in s: scenario_tools = [tool_by_name[t] for t in s["tools"]] else: scenario_tools = list(all_tools) messages: list[dict[str, Any]] = [] for msg in s.get("messages", []): m: dict[str, Any] = {"role": msg["role"]} if "content" in msg: m["content"] = msg["content"] if "tool_calls" in msg: m["tool_calls"] = [ { "id": tc["id"], "type": "function", "function": { "name": tc["name"], "arguments": json.dumps(tc["arguments"]), }, } for tc in msg["tool_calls"] ] if "tool_call_id" in msg: m["tool_call_id"] = msg["tool_call_id"] messages.append(m) tool_result: str | None = None if "tool_result" in s: tool_result = json.dumps(s["tool_result"]) scenarios.append( Scenario( name=s["name"], description=s["description"], messages=messages, tools=scenario_tools, expect_tool_call=s["expect_tool_call"], expected_function=s.get("expected_function"), required_arg_keys=s.get("required_arg_keys"), tool_result=tool_result, nested_array_key=s.get("nested_array_key"), required_item_keys=s.get("required_item_keys"), ) ) return scenarios ApiName = Literal["openai", "claude", "responses"] @dataclass class ParsedResponse: finish_reason: str # "tool_calls" | "stop" | ... has_tool_call: bool tool_call: dict[str, str] | None # {"id": ..., "name": ..., "arguments": ...} content: str | None @dataclass class ScenarioResult: name: str api: str phase: str # "tool_call" or "follow_up" passed: bool checks: dict[str, bool] = field(default_factory=dict) error: str | None = None latency_ms: float = 0.0 def validate_args(args_str: str, required_keys: list[str]) -> tuple[bool, str | None]: """Parse JSON arguments and check required keys exist.""" try: args = json.loads(args_str) except (json.JSONDecodeError, TypeError) as exc: return False, f"Invalid JSON: {exc}" if not isinstance(args, dict): return False, f"Expected dict, got {type(args).__name__}" missing = [k for k in required_keys if k not in args] if missing: return False, f"Missing keys: {missing}" return True, None def validate_nested_args( args_str: str, array_key: str, required_item_keys: list[str], ) -> tuple[bool, str | None]: """Check that args[array_key] is a list of objects with required keys.""" try: args = json.loads(args_str) except (json.JSONDecodeError, TypeError) as exc: return False, f"Invalid JSON: {exc}" if not isinstance(args, dict): return False, f"Expected dict, got {type(args).__name__}" arr = args.get(array_key) if not isinstance(arr, list): return False, f"'{array_key}' is not an array (got {type(arr).__name__})" if len(arr) == 0: return False, f"'{array_key}' is empty" for i, item in enumerate(arr): if not isinstance(item, dict): return ( False, f"'{array_key}[{i}]' is not an object (got {type(item).__name__})", ) missing = [k for k in required_item_keys if k not in item] if missing: return False, f"'{array_key}[{i}]' missing keys: {missing}" return True, None def call_api( client: httpx.Client, host: str, port: int, path: str, body: dict[str, Any], timeout: float, ) -> tuple[dict[str, Any], float]: """POST to http://{host}:{port}{path}, return (response_json, latency_ms).""" url = f"http://{host}:{port}{path}" t0 = time.monotonic() resp = client.post(url, json=body, timeout=timeout) latency = (time.monotonic() - t0) * 1000 resp.raise_for_status() return resp.json(), latency def _openai_build_request( model: str, messages: list[dict[str, Any]], tools: list[dict[str, Any]], ) -> tuple[str, dict[str, Any]]: """Build request for /v1/chat/completions.""" body: dict[str, Any] = { "model": model, "messages": messages, "tools": tools, "max_tokens": 16384, "temperature": 0.0, } return "/v1/chat/completions", body def _openai_parse_response(data: dict[str, Any]) -> ParsedResponse: """Parse OpenAI Chat Completions response into common format.""" choice = data["choices"][0] finish_reason = choice.get("finish_reason", "") message = choice.get("message", {}) tool_calls = message.get("tool_calls") content = message.get("content") has_tool_call = isinstance(tool_calls, list) and len(tool_calls) > 0 tool_call_info: dict[str, str] | None = None if has_tool_call: tc = tool_calls[0] fn = tc.get("function", {}) tool_call_info = { "id": tc.get("id", "call_0"), "name": fn.get("name", ""), "arguments": fn.get("arguments", "{}"), } return ParsedResponse( finish_reason=finish_reason, has_tool_call=has_tool_call, tool_call=tool_call_info, content=content, ) def _openai_build_followup( messages: list[dict[str, Any]], tools: list[dict[str, Any]], model: str, parsed: ParsedResponse, tool_result: str, ) -> tuple[str, dict[str, Any]]: """Build multi-turn follow-up for OpenAI Chat Completions.""" assert parsed.tool_call is not None tc = parsed.tool_call followup_messages: list[dict[str, Any]] = list(messages) + [ { "role": "assistant", "tool_calls": [ { "id": tc["id"], "type": "function", "function": { "name": tc["name"], "arguments": tc["arguments"], }, } ], }, { "role": "tool", "tool_call_id": tc["id"], "content": tool_result, }, ] body: dict[str, Any] = { "model": model, "messages": followup_messages, "tools": tools, "max_tokens": 16384, "temperature": 0.0, } return "/v1/chat/completions", body def _claude_translate_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]: """Translate OpenAI-format tools to Claude format.""" claude_tools: list[dict[str, Any]] = [] for tool in tools: fn = tool["function"] claude_tools.append( { "name": fn["name"], "description": fn.get("description", ""), "input_schema": fn.get("parameters", {}), } ) return claude_tools def _claude_translate_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: """Translate OpenAI-format messages to Claude Messages format.""" claude_messages: list[dict[str, Any]] = [] for msg in messages: role = msg["role"] if role == "user": claude_messages.append( { "role": "user", "content": msg["content"], } ) elif role == "assistant": content_blocks: list[dict[str, Any]] = [] text_content = msg.get("content") if text_content and isinstance(text_content, str) and text_content.strip(): content_blocks.append({"type": "text", "text": text_content}) tool_calls = msg.get("tool_calls") if tool_calls: for tc in tool_calls: fn = tc.get("function", {}) args_str = fn.get("arguments", "{}") try: args_dict = json.loads(args_str) except (json.JSONDecodeError, TypeError): args_dict = {} content_blocks.append( { "type": "tool_use", "id": tc.get("id", "call_0"), "name": fn.get("name", ""), "input": args_dict, } ) if not content_blocks: content_blocks.append({"type": "text", "text": ""}) claude_messages.append( { "role": "assistant", "content": content_blocks, } ) elif role == "tool": claude_messages.append( { "role": "user", "content": [ { "type": "tool_result", "tool_use_id": msg.get("tool_call_id", "call_0"), "content": msg.get("content", ""), } ], } ) elif role == "system": pass return claude_messages def _claude_build_request( model: str, messages: list[dict[str, Any]], tools: list[dict[str, Any]], ) -> tuple[str, dict[str, Any]]: """Build request for /v1/messages.""" claude_messages = _claude_translate_messages(messages) claude_tools = _claude_translate_tools(tools) system_content: str | None = None for msg in messages: if msg["role"] == "system": system_content = msg["content"] break body: dict[str, Any] = { "model": model, "messages": claude_messages, "tools": claude_tools, "max_tokens": 16384, "temperature": 0.0, } if system_content is not None: body["system"] = system_content return "/v1/messages", body def _claude_parse_response(data: dict[str, Any]) -> ParsedResponse: """Parse Claude Messages response into common format.""" stop_reason = data.get("stop_reason", "") content_blocks = data.get("content", []) if stop_reason == "tool_use": finish_reason = "tool_calls" elif stop_reason == "end_turn": finish_reason = "stop" else: finish_reason = stop_reason tool_call_info: dict[str, str] | None = None text_parts: list[str] = [] has_tool_call = False for block in content_blocks: block_type = block.get("type") if block_type == "tool_use": has_tool_call = True if tool_call_info is None: input_data = block.get("input", {}) tool_call_info = { "id": block.get("id", "call_0"), "name": block.get("name", ""), "arguments": json.dumps(input_data) if isinstance(input_data, dict) else str(input_data), } elif block_type == "text": text = block.get("text", "") if text.strip(): text_parts.append(text) content = "\n".join(text_parts) if text_parts else None return ParsedResponse( finish_reason=finish_reason, has_tool_call=has_tool_call, tool_call=tool_call_info, content=content, ) def _claude_build_followup( messages: list[dict[str, Any]], tools: list[dict[str, Any]], model: str, parsed: ParsedResponse, tool_result: str, ) -> tuple[str, dict[str, Any]]: """Build multi-turn follow-up for Claude Messages.""" assert parsed.tool_call is not None tc = parsed.tool_call try: args_dict = json.loads(tc["arguments"]) except (json.JSONDecodeError, TypeError): args_dict = {} claude_messages = _claude_translate_messages(messages) claude_messages.append( { "role": "assistant", "content": [ { "type": "tool_use", "id": tc["id"], "name": tc["name"], "input": args_dict, } ], } ) claude_messages.append( { "role": "user", "content": [ { "type": "tool_result", "tool_use_id": tc["id"], "content": tool_result, } ], } ) claude_tools = _claude_translate_tools(tools) system_content: str | None = None for msg in messages: if msg["role"] == "system": system_content = msg["content"] break body: dict[str, Any] = { "model": model, "messages": claude_messages, "tools": claude_tools, "max_tokens": 16384, "temperature": 0.0, } if system_content is not None: body["system"] = system_content return "/v1/messages", body def _responses_translate_input(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: """Translate OpenAI chat messages to Responses API input items.""" items: list[dict[str, Any]] = [] for msg in messages: role = msg["role"] if role in ("user", "system"): items.append( { "type": "message", "role": role, "content": msg["content"], } ) elif role == "assistant": text_content = msg.get("content") if text_content and isinstance(text_content, str) and text_content.strip(): items.append( { "type": "message", "role": "assistant", "content": text_content, } ) tool_calls = msg.get("tool_calls") if tool_calls: for tc in tool_calls: fn = tc.get("function", {}) items.append( { "type": "function_call", "call_id": tc.get("id", "call_0"), "name": fn.get("name", ""), "arguments": fn.get("arguments", "{}"), } ) elif role == "tool": items.append( { "type": "function_call_output", "call_id": msg.get("tool_call_id", "call_0"), "output": msg.get("content", ""), } ) return items def _responses_build_request( model: str, messages: list[dict[str, Any]], tools: list[dict[str, Any]], ) -> tuple[str, dict[str, Any]]: """Build request for /v1/responses.""" input_items = _responses_translate_input(messages) body: dict[str, Any] = { "model": model, "input": input_items, "tools": tools, "temperature": 0.0, "max_output_tokens": 4096, } return "/v1/responses", body def _responses_parse_response(data: dict[str, Any]) -> ParsedResponse: """Parse OpenAI Responses API response into common format.""" output = data.get("output", []) tool_call_info: dict[str, str] | None = None text_parts: list[str] = [] has_tool_call = False for item in output: item_type = item.get("type") if item_type == "function_call": has_tool_call = True if tool_call_info is None: tool_call_info = { "id": item.get("call_id", "call_0"), "name": item.get("name", ""), "arguments": item.get("arguments", "{}"), } elif item_type == "message": msg_content = item.get("content", []) if isinstance(msg_content, list): for block in msg_content: if isinstance(block, dict): text = block.get("text", "") if text and text.strip(): text_parts.append(text) elif isinstance(msg_content, str) and msg_content.strip(): text_parts.append(msg_content) content = "\n".join(text_parts) if text_parts else None if has_tool_call: finish_reason = "tool_calls" else: status = data.get("status", "completed") finish_reason = "stop" if status == "completed" else status return ParsedResponse( finish_reason=finish_reason, has_tool_call=has_tool_call, tool_call=tool_call_info, content=content, ) def _responses_build_followup( messages: list[dict[str, Any]], tools: list[dict[str, Any]], model: str, parsed: ParsedResponse, tool_result: str, ) -> tuple[str, dict[str, Any]]: """Build multi-turn follow-up for Responses API.""" assert parsed.tool_call is not None tc = parsed.tool_call input_items = _responses_translate_input(messages) input_items.append( { "type": "function_call", "call_id": tc["id"], "name": tc["name"], "arguments": tc["arguments"], } ) input_items.append( { "type": "function_call_output", "call_id": tc["id"], "output": tool_result, } ) body: dict[str, Any] = { "model": model, "input": input_items, "tools": tools, "temperature": 0.0, "max_output_tokens": 4096, } return "/v1/responses", body ADAPTERS: dict[ApiName, dict[str, Any]] = { "openai": { "build_request": _openai_build_request, "parse_response": _openai_parse_response, "build_followup": _openai_build_followup, }, "claude": { "build_request": _claude_build_request, "parse_response": _claude_parse_response, "build_followup": _claude_build_followup, }, "responses": { "build_request": _responses_build_request, "parse_response": _responses_parse_response, "build_followup": _responses_build_followup, }, } def run_scenario( client: httpx.Client, host: str, port: int, model: str, scenario: Scenario, api_name: ApiName, timeout: float, verbose: bool, ) -> list[ScenarioResult]: """Run a single scenario against one API adapter. Returns 1-2 results.""" adapter = ADAPTERS[api_name] build_request = adapter["build_request"] parse_response = adapter["parse_response"] build_followup = adapter["build_followup"] results: list[ScenarioResult] = [] # --- Phase 1: initial request --- path, body = build_request(model, scenario.messages, scenario.tools) if verbose: print( f" [{api_name}] request: {path} {json.dumps(body, indent=2)}", file=sys.stderr, ) try: data, latency = call_api(client, host, port, path, body, timeout) except Exception as exc: results.append( ScenarioResult( name=scenario.name, api=api_name, phase="tool_call", passed=False, error=f"API error: {exc}", ) ) return results if verbose: print( f" [{api_name}] response: {json.dumps(data, indent=2)}", file=sys.stderr ) parsed = parse_response(data) checks: dict[str, bool] = {} if scenario.expect_tool_call: checks["finish_reason_tool_calls"] = parsed.finish_reason == "tool_calls" checks["has_tool_call"] = parsed.has_tool_call args_err: str | None = None if parsed.has_tool_call and parsed.tool_call is not None: checks["correct_function"] = ( scenario.expected_function is None or parsed.tool_call["name"] == scenario.expected_function ) if scenario.required_arg_keys: ok, args_err = validate_args( parsed.tool_call["arguments"], scenario.required_arg_keys ) checks["valid_arguments"] = ok else: checks["valid_arguments"] = True if scenario.nested_array_key and scenario.required_item_keys: ok, nested_err = validate_nested_args( parsed.tool_call["arguments"], scenario.nested_array_key, scenario.required_item_keys, ) checks["valid_nested_structure"] = ok if not ok: args_err = nested_err else: checks["correct_function"] = False checks["valid_arguments"] = False args_err = "No tool call returned" passed = all(checks.values()) error = args_err if not passed else None else: checks["finish_reason_stop"] = parsed.finish_reason == "stop" checks["no_tool_call"] = not parsed.has_tool_call checks["has_content"] = ( parsed.content is not None and len(parsed.content.strip()) > 0 ) passed = all(checks.values()) error = ( None if passed else ( f"finish_reason={parsed.finish_reason}, " f"tool_call={'yes' if parsed.has_tool_call else 'no'}, " f"content={'yes' if parsed.content else 'no'}" ) ) results.append( ScenarioResult( name=scenario.name, api=api_name, phase="tool_call", passed=passed, checks=checks, error=error, latency_ms=latency, ) ) # --- Phase 2: multi-turn follow-up --- if ( scenario.tool_result is not None and parsed.has_tool_call and parsed.tool_call is not None ): followup_path, followup_body = build_followup( scenario.messages, scenario.tools, model, parsed, scenario.tool_result, ) if verbose: print( f" [{api_name}] follow_up request: {followup_path} {json.dumps(followup_body, indent=2)}", file=sys.stderr, ) try: data2, latency2 = call_api( client, host, port, followup_path, followup_body, timeout ) except Exception as exc: results.append( ScenarioResult( name=scenario.name, api=api_name, phase="follow_up", passed=False, error=f"API error: {exc}", ) ) return results if verbose: print( f" [{api_name}] follow_up response: {json.dumps(data2, indent=2)}", file=sys.stderr, ) parsed2 = parse_response(data2) checks2: dict[str, bool] = {} checks2["finish_reason_stop"] = parsed2.finish_reason == "stop" checks2["no_tool_call"] = not parsed2.has_tool_call checks2["has_content"] = ( parsed2.content is not None and len(parsed2.content.strip()) > 0 ) passed2 = all(checks2.values()) error2: str | None = None if not passed2: error2 = ( f"finish_reason={parsed2.finish_reason}, " f"tool_call={'yes' if parsed2.has_tool_call else 'no'}, " f"content={'yes' if parsed2.content else 'no'}" ) results.append( ScenarioResult( name=scenario.name, api=api_name, phase="follow_up", passed=passed2, checks=checks2, error=error2, latency_ms=latency2, ) ) return results def result_to_dict(result: ScenarioResult) -> dict[str, Any]: """Convert a ScenarioResult to a JSON-serializable dict.""" return { "name": result.name, "api": result.api, "phase": result.phase, "passed": result.passed, "checks": result.checks, "error": result.error, "latency_ms": round(result.latency_ms, 1), } _MULTI_NODE_PRIORITY: dict[tuple[str, str], int] = { ("tensor", "jaccl"): 0, ("pipeline", "jaccl"): 2, ("pipeline", "ring"): 3, ("tensor", "ring"): 4, } _SINGLE_NODE_PRIORITY = 1 def _placement_sort_key(p: dict[str, Any]) -> tuple[int, int]: sharding = p.get("sharding", "").lower() meta = p.get("instance_meta", "").lower() kind = ( "tensor" if "tensor" in sharding else "pipeline", "jaccl" if "jaccl" in meta else "ring", ) n_nodes = nodes_used_in_instance(p["instance"]) if n_nodes == 1: return (_SINGLE_NODE_PRIORITY, -n_nodes) priority = _MULTI_NODE_PRIORITY.get(kind, 99) return (priority, -n_nodes) def main() -> None: parser = argparse.ArgumentParser( description="Multi-API tool-calling eval for exo", formatter_class=argparse.RawDescriptionHelpFormatter, epilog="""\ Examples: %(prog)s --model mlx-community/Qwen3-30B-A3B-4bit %(prog)s --model my-model --api openai --repeat 3 %(prog)s --model my-model --api all --scenarios weather_simple calculator_multi_turn %(prog)s --model my-model --stdout """, ) add_common_instance_args(parser) parser.add_argument( "--api", choices=["openai", "claude", "responses", "all"], default="all", help="Which API adapter(s) to test (default: all)", ) parser.add_argument( "--repeat", type=int, default=1, help="Repeat each scenario N times (default: 1)", ) parser.add_argument( "--scenarios", nargs="*", help="Run only these scenarios (by name)", ) parser.add_argument( "--verbose", action="store_true", help="Print full API responses to stderr", ) parser.add_argument( "--json-out", default="bench/eval_results.json", help="Write JSON results to file (default: bench/eval_results.json)", ) parser.add_argument( "--stdout", action="store_true", help="Write JSON results to stdout instead of file", ) args = parser.parse_args() all_scenarios = load_scenarios(SCENARIOS_PATH) if args.scenarios: scenarios = [s for s in all_scenarios if s.name in args.scenarios] if not scenarios: print( f"No matching scenarios. Available: {[s.name for s in all_scenarios]}", file=sys.stderr, ) sys.exit(1) else: scenarios = all_scenarios api_names: list[ApiName] = ( ["openai", "claude", "responses"] if args.api == "all" else [args.api] ) log = sys.stderr if args.stdout else sys.stdout exo = ExoClient(args.host, args.port, timeout_s=args.timeout) _short_id, full_model_id = resolve_model_short_id(exo, args.model) selected = settle_and_fetch_placements( exo, full_model_id, args, settle_timeout=args.settle_timeout ) if not selected: print("No valid placements matched your filters.", file=sys.stderr) sys.exit(1) selected.sort(key=_placement_sort_key) preview = selected[0] settle_deadline = ( time.monotonic() + args.settle_timeout if args.settle_timeout > 0 else None ) print("Planning phase: checking downloads...", file=log) run_planning_phase( exo, full_model_id, preview, args.danger_delete_downloads, args.timeout, settle_deadline, ) instance = preview["instance"] instance_id = instance_id_from_instance(instance) sharding = str(preview["sharding"]) instance_meta = str(preview["instance_meta"]) n_nodes = nodes_used_in_instance(instance) print(f"Model: {full_model_id}", file=log) print(f"Placement: {sharding} / {instance_meta} / {n_nodes} nodes", file=log) print(f"Endpoint: http://{args.host}:{args.port}", file=log) print(f"APIs: {', '.join(api_names)}", file=log) total_runs = len(scenarios) * args.repeat * len(api_names) print( f"Scenarios: {len(scenarios)} x {args.repeat} repeats x {len(api_names)} APIs = {total_runs} runs", file=log, ) print("=" * 72, file=log) exo.request_json("POST", "/instance", body={"instance": instance}) try: wait_for_instance_ready(exo, instance_id) except (RuntimeError, TimeoutError) as e: print(f"Failed to initialize placement: {e}", file=sys.stderr) with contextlib.suppress(ExoHttpError): exo.request_json("DELETE", f"/instance/{instance_id}") sys.exit(1) time.sleep(1) cluster_snapshot = capture_cluster_snapshot(exo) all_results: list[ScenarioResult] = [] try: with httpx.Client() as http_client: for run_idx in range(args.repeat): if args.repeat > 1: print(f"\n--- Run {run_idx + 1}/{args.repeat} ---", file=log) for scenario in scenarios: for api_name in api_names: print( f"\n [{api_name:>9}] {scenario.name}: {scenario.description}", file=log, ) scenario_results = run_scenario( http_client, args.host, args.port, full_model_id, scenario, api_name, args.timeout, args.verbose, ) all_results.extend(scenario_results) for r in scenario_results: status = "PASS" if r.passed else "FAIL" print( f" [{r.phase:>10}] {status} ({r.latency_ms:.0f}ms)", file=log, ) for check_name, check_ok in r.checks.items(): mark = "+" if check_ok else "-" print(f" {mark} {check_name}", file=log) if r.error: print(f" ! {r.error}", file=log) finally: try: exo.request_json("DELETE", f"/instance/{instance_id}") except ExoHttpError as e: if e.status != 404: raise wait_for_instance_gone(exo, instance_id) # --- Summary --- print(f"\n{'=' * 72}", file=log) total = len(all_results) passed = sum(1 for r in all_results if r.passed) tool_call_results = [r for r in all_results if r.phase == "tool_call"] follow_up_results = [r for r in all_results if r.phase == "follow_up"] tc_passed = sum(1 for r in tool_call_results if r.passed) fu_passed = sum(1 for r in follow_up_results if r.passed) avg_latency = sum(r.latency_ms for r in all_results) / total if total else 0 print( f"Total: {passed}/{total} passed ({100 * passed / total:.0f}%)", file=log ) print(f"Tool call: {tc_passed}/{len(tool_call_results)} passed", file=log) if follow_up_results: print(f"Follow-up: {fu_passed}/{len(follow_up_results)} passed", file=log) print(f"Avg latency: {avg_latency:.0f}ms", file=log) for api_name in api_names: api_results = [r for r in all_results if r.api == api_name] api_passed = sum(1 for r in api_results if r.passed) print(f" {api_name:>9}: {api_passed}/{len(api_results)} passed", file=log) if passed < total: print("\nFailed:", file=log) for r in all_results: if not r.passed: print(f" - {r.name} [{r.api}/{r.phase}]: {r.error}", file=log) json_results = [result_to_dict(r) for r in all_results] output: dict[str, Any] = {"results": json_results} if cluster_snapshot: output["cluster"] = cluster_snapshot if args.stdout: print(json.dumps(output, indent=2)) else: json_path = args.json_out parent = os.path.dirname(json_path) if parent: os.makedirs(parent, exist_ok=True) with open(json_path, "w") as f: json.dump(output, f, indent=2) f.write("\n") print(f"\nJSON results written to {json_path}", file=log) sys.exit(0 if passed == total else 1) if __name__ == "__main__": main()