# LocalAI/examples/vllm-bench/ttft_streaming_tool_parser.py

#!/usr/bin/env python3
"""
TTFT benchmark for the vLLM backend's streaming + tool-parser path.

Three scenarios:
  1. tool_call        — request mentions a tool; model is expected to call it
  2. plain_text_short — request offers a tool but explicitly asks for ~3 sentences
  3. plain_text_long  — same as above but asks for ~8 paragraphs (1500 tokens)

The long scenario shows the dramatic difference between buffering and
streaming most clearly: with buffer-all, the client sees nothing for
20+ seconds; with native streaming, the first token arrives in <100 ms.

Usage:
  python ttft_streaming_tool_parser.py \\
      --url http://localhost:8080 --model my-coder --runs 3

The script is self-contained (stdlib only — urllib, json, time, argparse).
"""
import argparse
import json
import sys
import time
import urllib.request

# Argument schema of the demo tool: one required string field, "city".
_WEATHER_PARAMETERS = dict(
    type="object",
    properties={"city": {"type": "string"}},
    required=["city"],
)

# The single function-style tool definition offered to the model.
_WEATHER_TOOL = dict(
    type="function",
    function=dict(
        name="get_weather",
        description="Get current weather for a city",
        parameters=_WEATHER_PARAMETERS,
    ),
)

# Tool list that main() passes to bench_one() for every scenario.
DEFAULT_TOOLS = [_WEATHER_TOOL]

# One row per benchmark scenario: (label, user prompt, max_tokens).
_SCENARIO_SPECS = (
    (
        "tool_call",
        "What is the weather in Paris? Please use the tool.",
        80,
    ),
    (
        "plain_text_short",
        "Explain in 3 short sentences what a hash table is. "
        "Do NOT call any tool.",
        200,
    ),
    (
        "plain_text_long",
        "Write a thorough 8-paragraph explanation of how "
        "Python's GIL works, including history, current "
        "state, no-GIL build, and alternatives. Be "
        "detailed. Do NOT call any tool.",
        1500,
    ),
)

# Scenario dicts consumed by main(): each carries a label, a single
# user message, and the generation limit for that request.
SCENARIOS = [
    {
        "label": label,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": token_limit,
    }
    for label, prompt, token_limit in _SCENARIO_SPECS
]


def bench_one(url, model, messages, tools, max_tokens, timeout):
    """Run one streaming chat-completion request and time its chunks.

    Sends a JSON body (with ``"stream": True``) to
    ``{url}/v1/chat/completions`` and reads the response line by line as
    a server-sent-event stream.

    Args:
        url: Base URL of the server; trailing slashes are stripped.
        model: Value for the ``model`` field of the request body.
        messages: Value for the ``messages`` field of the request body.
        tools: Value for the ``tools`` field of the request body.
        max_tokens: Value for the ``max_tokens`` field of the request body.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        A dict with these keys (all times are seconds measured from just
        before the request is opened):
          * ``ttf_content_s`` - time of the first chunk whose delta has
            non-empty ``content``, or None if there was none.
          * ``ttf_tool_s`` - time of the first chunk whose delta has
            non-empty ``tool_calls``, or None if there was none.
          * ``n_content_chunks`` / ``n_tool_chunks`` - how many chunks
            carried content / tool calls.
          * ``total_s`` - time of the last chunk that carried choices; if
            no such chunk arrived, the time at which the stream ended.
          * ``finish_reason`` - the last non-empty ``finish_reason`` seen,
            or None.

    Raises:
        Errors raised by ``urllib.request.urlopen`` or while reading the
        response are not caught here and propagate to the caller.
    """
    body = json.dumps({
        "model": model,
        "stream": True,
        "tools": tools,
        "messages": messages,
        "max_tokens": max_tokens,
    }).encode()
    req = urllib.request.Request(
        f"{url.rstrip('/')}/v1/chat/completions",
        data=body, headers={"Content-Type": "application/json"},
    )

    t0 = time.perf_counter()
    first_content = None
    first_tool = None
    n_content = 0
    n_tool = 0
    last = None
    finish = None
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        for raw in resp:
            line = raw.decode("utf-8", "replace").strip()
            # The SSE format makes the space after "data:" optional, so
            # accept both "data: {...}" and "data:{...}".
            if not line.startswith("data:"):
                continue
            payload = line[len("data:"):].lstrip()
            if payload == "[DONE]":
                break
            try:
                chunk = json.loads(payload)
            except json.JSONDecodeError:
                # Not a JSON event (e.g. an empty data line); skip it.
                continue
            # Skip payloads that are not objects or carry no choices.
            if not isinstance(chunk, dict) or not chunk.get("choices"):
                continue
            ch = chunk["choices"][0]
            delta = ch.get("delta") or {}
            now = time.perf_counter() - t0
            if delta.get("content"):
                if first_content is None:
                    first_content = now
                n_content += 1
            if delta.get("tool_calls"):
                if first_tool is None:
                    first_tool = now
                n_tool += 1
            if ch.get("finish_reason"):
                finish = ch["finish_reason"]
            last = now
    if last is None:
        # No usable chunk arrived: report when the stream ended so that
        # callers formatting total_s as a number do not receive None.
        last = time.perf_counter() - t0
    return {
        "ttf_content_s": first_content,
        "ttf_tool_s": first_tool,
        "n_content_chunks": n_content,
        "n_tool_chunks": n_tool,
        "total_s": last,
        "finish_reason": finish,
    }


def stats(values):
    """Format the min / average / max of *values* as one summary string.

    ``None`` entries are dropped first. If nothing is left, the string
    "n/a" is returned instead of a summary.
    """
    samples = list(filter(lambda v: v is not None, values))
    if samples:
        lowest = min(samples)
        highest = max(samples)
        mean = sum(samples) / len(samples)
        return f"min={lowest:.3f}  avg={mean:.3f}  max={highest:.3f}"
    return "n/a"


def main():
    """Command-line entry point: run every scenario and report timings.

    Parses the CLI options, calls ``bench_one`` ``--runs`` times for each
    entry in ``SCENARIOS`` (always offering ``DEFAULT_TOOLS``), prints one
    line per run plus a per-scenario summary, and writes all raw results
    to ``ttft_bench_<label>.json`` (a relative path).

    Returns:
        0, which the caller uses as the process exit status.
    """
    def _fmt(value, spec):
        # Any timing can be None when the stream never produced it;
        # show a placeholder instead of failing in number formatting.
        return format(value, spec) if value is not None else "—"

    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--url", default="http://localhost:8080",
                   help="LocalAI base URL (default: %(default)s)")
    p.add_argument("--model", default="coder", help="Model name (default: %(default)s)")
    p.add_argument("--runs", type=int, default=3, help="Repetitions per scenario (default: %(default)s)")
    p.add_argument("--timeout", type=int, default=180, help="Per-request timeout in seconds")
    p.add_argument("--label", default="run",
                   help="Tag for the JSON output file (default: %(default)s)")
    args = p.parse_args()

    print(f"=== TTFT Bench — {args.url}  model={args.model}  runs={args.runs} ===")
    summary = {}
    for sc in SCENARIOS:
        print(f"\nScenario: {sc['label']}")
        rows = []
        for run in range(args.runs):
            r = bench_one(args.url, args.model,
                          sc["messages"], DEFAULT_TOOLS, sc["max_tokens"], args.timeout)
            rows.append(r)
            ttf_c = _fmt(r["ttf_content_s"], ".3f")
            ttf_t = _fmt(r["ttf_tool_s"], ".3f")
            # total_s is None when no chunk with choices was received.
            total = _fmt(r["total_s"], ".2f")
            print(f"  run {run+1}/{args.runs}: "
                  f"ttf_content={ttf_c}s  ttf_tool={ttf_t}s  "
                  f"n_content={r['n_content_chunks']}  n_tool={r['n_tool_chunks']}  "
                  f"total={total}s  finish={r['finish_reason']}")
        summary[sc["label"]] = rows

    print("\n=== Summary (per scenario) ===")
    for label, rows in summary.items():
        print(f"[{label}]")
        print(f"  ttf_content_s:    {stats(r['ttf_content_s'] for r in rows)}")
        print(f"  ttf_tool_s:       {stats(r['ttf_tool_s']    for r in rows)}")
        print(f"  n_content_chunks: {stats(r['n_content_chunks'] for r in rows)}")
        print(f"  n_tool_chunks:    {stats(r['n_tool_chunks']    for r in rows)}")
        print(f"  total_s:          {stats(r['total_s']        for r in rows)}")

    out = f"ttft_bench_{args.label}.json"
    with open(out, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(f"\nSaved to {out}")
    return 0


# Script entry point: run the benchmark and exit with main()'s return value.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)