Files
LocalAI/backend/cpp/llama-cpp-localai-paged/paged-mtp-serving-bench.sh
Ettore Di Giacinto 4d171e62bb docs(paged): reject MTP serving lever
Add the repeatable MTP serving A/B runner and record Phase 15 results showing current llama-server MTP regresses GB10 serving throughput despite passing inference gates.

Assisted-by: Codex:gpt-5
2026-07-01 02:29:28 +00:00

201 lines
5.6 KiB
Bash
Executable File

#!/usr/bin/env bash
# Direct llama-server serving A/B benchmark: one arm without speculative
# decoding, one arm started with --spec-type draft-mtp.
#
# Strict mode: abort on a failing command (errexit), on expansion of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail
#######################################
# Print the help text: what the script runs and which environment variables
# override its defaults.
# Arguments: none
# Outputs:   the help text on stdout
#######################################
usage() {
  # Single-quoted on purpose: "$SRC" and "~" must be shown literally.
  local -a help_lines=(
    'Usage: paged-mtp-serving-bench.sh'
    'Runs a direct llama-server serving A/B on DGX:'
    'baseline: no speculative decoding'
    'mtp: --spec-type draft-mtp'
    'Environment overrides:'
    'SRC llama.cpp source dir (default: ~/llama-phase6-source)'
    'BIN binary dir (default: $SRC/build-cuda/bin)'
    'MODEL MoE GGUF path (default: ~/bench/q36-35b-a3b-nvfp4.gguf)'
    'ART artifact dir (default: ~/bench/phase15_mtp_serving/<timestamp>)'
    'PORT server port (default: 8097)'
    'NPL comma/space list of concurrency values (default: "8 32 128")'
    'PTOK prompt filler words for h2h_cli3.py (default: 128)'
    'GEN max generated tokens (default: 128)'
    'CTX server context (default: 131072)'
    'PARALLEL server parallel slots (default: 128)'
    'BATCH server logical batch size (default: 2048)'
    'UBATCH server physical batch size (default: 512)'
    'SKIP_GATES=1 to skip pre/post paged inference gates'
  )
  printf '%s\n' "${help_lines[@]}"
}
# -h/--help is the only argument inspected here; every other setting is taken
# from the environment.
case "${1:-}" in
  -h | --help)
    usage
    exit 0
    ;;
esac
# Configuration. Each setting keeps a non-empty value inherited from the
# environment and otherwise falls back to the default shown here.
: "${SRC:=$HOME/llama-phase6-source}"
: "${BIN:=$SRC/build-cuda/bin}"
: "${MODEL:=$HOME/bench/q36-35b-a3b-nvfp4.gguf}"
# Plain assignment (not ':=') so that a failing `date` still aborts the script
# under errexit.
ART=${ART:-"$HOME/bench/phase15_mtp_serving/$(date +%Y%m%d_%H%M%S)"}
: "${PORT:=8097}"
: "${NPL:=8 32 128}"
: "${PTOK:=128}"
: "${GEN:=128}"
: "${CTX:=131072}"
: "${PARALLEL:=128}"
: "${BATCH:=2048}"
: "${UBATCH:=512}"
: "${SKIP_GATES:=0}"

# Lock directory and the owner file inside it; these are not overridable.
LOCK_DIR="$HOME/gpu_bench_lock"
OWNER="$LOCK_DIR/owner"

# PID of the llama-server currently running in the background; empty when none.
SERVER_PID=""
#######################################
# Emit one timestamped line on stdout and append the same line to
# $ART/run.log.
# Globals:   ART (read)
# Arguments: the message words; they are joined the way "$*" joins them
#######################################
log() {
  local entry
  printf -v entry '[%s] %s' "$(date -Is)" "$*"
  printf '%s\n' "$entry" | tee -a "$ART/run.log"
}
#######################################
# Verify that the machine is idle before benchmarking and record what was
# observed.
# Globals:  ART (read; the directory is created), OWNER (read)
# Outputs:  the observed counts and the lock owner on stdout and in
#           $ART/preflight.txt; one diagnostic per failed check on stderr
# Exits:    1 if any docker container, local-ai-worker container or GPU
#           compute process is present; 2 if the lock owner file exists and
#           its content does not start with "FREE"
#######################################
preflight() {
  mkdir -p "$ART"
  local docker_count local_ai compute owner
  docker_count=$(docker ps -q | wc -l)
  # grep -c prints 0 but exits 1 when nothing matches; '|| true' keeps that
  # from tripping errexit/pipefail.
  local_ai=$(docker ps --format "{{.Names}}" | grep -c local-ai-worker || true)
  compute=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader | sed '/^$/d' | wc -l)
  owner="FREE-no-lock-file"
  if [[ -f "$OWNER" ]]; then
    owner=$(cat "$OWNER")
  fi
  {
    echo "docker=$docker_count"
    echo "local_ai_worker=$local_ai"
    echo "compute=$compute"
    echo "$owner"
  } | tee "$ART/preflight.txt"

  # Report every violated precondition instead of dying silently on the first
  # one. Numeric comparison also tolerates a whitespace-padded `wc -l` result.
  local busy=0
  if ((docker_count != 0)); then
    echo "preflight: $docker_count docker container(s) running; expected none" >&2
    busy=1
  fi
  if ((local_ai != 0)); then
    echo "preflight: $local_ai local-ai-worker container(s) running; expected none" >&2
    busy=1
  fi
  if ((compute != 0)); then
    echo "preflight: $compute GPU compute process(es) running; expected none" >&2
    busy=1
  fi
  if ((busy)); then
    exit 1
  fi

  case "$owner" in
    FREE*) ;;
    *)
      echo "GPU lock is busy: $owner" >&2
      exit 2
      ;;
  esac
}
#######################################
# Claim the lock by overwriting the owner file with this script's tag and the
# current epoch seconds.
# Globals: LOCK_DIR (read; created if missing), OWNER (read; file overwritten)
#######################################
acquire_lock() {
  mkdir -p "$LOCK_DIR"
  printf '%s %s\n' 'codex-phase15-mtp-serving-bench' "$(date +%s)" > "$OWNER"
}
#######################################
# Stop a still-running server, if any, then mark the lock as free by
# overwriting the owner file with a "FREE ..." line carrying the current epoch
# seconds. Installed as the EXIT trap while the benchmark arms run.
# Globals: SERVER_PID (read, then cleared), LOCK_DIR (read; created if
#          missing), OWNER (read; file overwritten)
#######################################
release_lock() {
  local pid="$SERVER_PID"
  if [[ -n "$pid" ]]; then
    # Best effort: the server may already be gone.
    kill "$pid" >/dev/null 2>&1 || true
    wait "$pid" >/dev/null 2>&1 || true
    SERVER_PID=""
  fi
  mkdir -p "$LOCK_DIR"
  printf 'FREE released-by-codex-phase15-mtp-serving-bench %s\n' "$(date +%s)" > "$OWNER"
}
#######################################
# Poll the server's /health endpoint until it answers successfully.
# Globals:   PORT (read), SERVER_PID (read)
# Arguments: $1 - file that receives the response body; curl's stderr goes to
#                 "$1.err"
# Returns:   0 as soon as a request succeeds; 1 if the server process is no
#            longer alive, or after 180 attempts spaced one second apart
#######################################
wait_server() {
  local health="$1"
  local attempt
  for ((attempt = 1; attempt <= 180; attempt++)); do
    curl -fsS "http://127.0.0.1:$PORT/health" > "$health" 2>"$health.err" && return 0
    # No point in waiting for a process that has already exited.
    kill -0 "$SERVER_PID" 2>/dev/null || return 1
    sleep 1
  done
  return 1
}
#######################################
# Terminate the background server, wait for it to exit, and clear SERVER_PID.
# Does nothing when no server is recorded.
# Globals: SERVER_PID (read, then cleared)
#######################################
stop_server() {
  [[ -n "$SERVER_PID" ]] || return 0
  # Best effort: ignore failures from a process that is already gone.
  kill "$SERVER_PID" >/dev/null 2>&1 || true
  wait "$SERVER_PID" >/dev/null 2>&1 || true
  SERVER_PID=""
}
#######################################
# Run $HOME/paged-inference-gates.sh once, unless SKIP_GATES is "1".
# The gate script runs with ART pointing at $ART/gate_<name>; its combined
# stdout/stderr is captured in $ART/gate_<name>.log and then echoed to stdout
# and appended to $ART/run.log.
# Globals:   SKIP_GATES, ART, HOME (read)
# Arguments: $1 - label for this run (the script uses "pre" and "post")
# Returns:   0 when skipped or when the gate succeeds; otherwise the gate
#            script's exit status
#######################################
run_gate() {
  local name="$1"
  if [[ "$SKIP_GATES" == "1" ]]; then
    log "skipping $name inference gate"
    return 0
  fi
  log "running $name inference gate"
  local gate_log="$ART/gate_$name.log"
  local rc=0
  # Capture the status rather than letting errexit abort here, so that a
  # failing gate's output still reaches run.log and the terminal.
  ART="$ART/gate_$name" "$HOME/paged-inference-gates.sh" > "$gate_log" 2>&1 || rc=$?
  tee -a "$ART/run.log" < "$gate_log"
  if ((rc != 0)); then
    log "$name inference gate failed with status $rc (see $gate_log)" >&2
    return "$rc"
  fi
}
#######################################
# Run one benchmark arm: start llama-server, wait until it reports healthy,
# run h2h_cli3.py once per concurrency level in NPL, collect the
# speculative-decoding lines from the server log, and stop the server.
# Globals:   ART, BIN, MODEL, CTX, BATCH, UBATCH, PARALLEL, PORT, NPL, PTOK,
#            GEN, HOME (read); SERVER_PID (written)
# Arguments: $1  - arm name, used as the artifact sub-directory
#            $2… - extra arguments appended to the llama-server command line
# Outputs:   $ART/<arm>/{server.log,health.json,n<N>.json,spec_lines.txt};
#            each benchmark result is also echoed and appended to
#            $ART/run.log
# Exits:     3 if the server does not become healthy
#######################################
run_arm() {
  local arm="$1"
  shift
  local arm_dir="$ART/$arm"
  mkdir -p "$arm_dir"
  log "starting $arm server"
  # Start the server from inside $BIN, but in a subshell: the caller's working
  # directory stays untouched, so a relative ART keeps resolving correctly for
  # the rest of the script. The redirection is opened before the cd, and
  # `exec` makes $! the server's own PID.
  (
    cd "$BIN" || exit 1
    exec env LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1 GGML_NO_BACKTRACE=1 \
      ./llama-server \
      -m "$MODEL" -ngl 99 -fa on -c "$CTX" -b "$BATCH" -ub "$UBATCH" \
      --parallel "$PARALLEL" --host 127.0.0.1 --port "$PORT" --no-webui "$@"
  ) > "$arm_dir/server.log" 2>&1 &
  SERVER_PID=$!
  if ! wait_server "$arm_dir/health.json"; then
    tail -120 "$arm_dir/server.log" >&2 || true
    exit 3
  fi
  # NPL is documented as a comma- or space-separated list; turn commas into
  # spaces so the deliberate word splitting below accepts both forms.
  local levels=${NPL//,/ }
  local n
  # shellcheck disable=SC2086 # word splitting of $levels is intended
  for n in $levels; do
    log "running $arm n=$n"
    python3 "$HOME/bench/h2h_cli3.py" \
      --url "http://127.0.0.1:$PORT/v1/completions" \
      --model m -n "$n" --ptok "$PTOK" --gen "$GEN" \
      --nonce "${arm}_${n}_$(date +%s)" --no-cache \
      > "$arm_dir/n${n}.json"
    tee -a "$ART/run.log" < "$arm_dir/n${n}.json"
  done
  # grep exits 1 when nothing matches (e.g. the baseline arm); that is not an
  # error here.
  grep -E "draft acceptance|statistics[[:space:]]+draft-mtp|speculative decoding context|bounded partial|backend sampling|common_speculative_impl_draft_mtp" \
    "$arm_dir/server.log" > "$arm_dir/spec_lines.txt" || true
  stop_server
}
# --- Main sequence ----------------------------------------------------------
preflight

# Fail fast, before the long build, when the gate script is needed but absent.
# With SKIP_GATES=1 run_gate never invokes it, so it is not required then.
if [[ "$SKIP_GATES" != "1" && ! -x "$HOME/paged-inference-gates.sh" ]]; then
  echo "missing $HOME/paged-inference-gates.sh; copy paged-inference-gates.sh there first" >&2
  exit 4
fi

log "building llama-server, test-backend-ops and llama-completion"
cmake --build "$SRC/build-cuda" --target llama-server test-backend-ops llama-completion -j 8 \
  > "$ART/build.log" 2>&1

# The lock is held only around the two benchmark arms; both gate runs happen
# while it is not held.
# NOTE(review): presumably the gate script manages the lock itself — confirm.
run_gate pre
acquire_lock
# From here on, any exit path stops the server and frees the lock.
trap release_lock EXIT
run_arm baseline
run_arm mtp --spec-type draft-mtp --spec-draft-n-max 3 --no-spec-draft-backend-sampling
release_lock
trap - EXIT
run_gate post

# Collapse the per-level JSON results of both arms into one TSV table, printed
# to stdout and saved as $ART/summary.tsv.
python3 - "$ART" <<'PY' | tee "$ART/summary.tsv"
import json
import sys
from pathlib import Path


def concurrency_key(path):
    """Order n8 < n32 < n128 rather than the lexicographic n128 < n32 < n8."""
    suffix = path.stem[1:]
    if suffix.isdecimal():
        return (0, int(suffix), "")
    return (1, 0, suffix)


art = Path(sys.argv[1])
rows = []
for arm in ("baseline", "mtp"):
    for path in sorted((art / arm).glob("n*.json"), key=concurrency_key):
        data = json.loads(path.read_text())
        rows.append((arm, data["n"], data["gen_total"], data["agg_tps"],
                     data["decode_agg_tps"], data["decode_perseq_tps"],
                     data["ttft_mean_ms"], data["wall_s"]))
print("arm\tn\tgen_total\tagg_tps\tdecode_agg_tps\tdecode_perseq_tps\tttft_mean_ms\twall_s")
for row in rows:
    print("\t".join(str(x) for x in row))
PY
log "artifacts: $ART"