Files
LocalAI/backend/cpp/llama-cpp-localai-paged/paged-current-serving-snapshot.sh
Ettore Di Giacinto ff3f0620de chore(paged): add current serving snapshot harness
Add a reusable current-stack paged-vs-vLLM serving snapshot harness that targets the clean DGX mirror, enforces idle/lock preflight, runs pre/post inference gates, and records ratio summaries.

Assisted-by: Codex:gpt-5
2026-07-01 03:19:36 +00:00

269 lines
8.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Serving snapshot harness: runs a paged llama.cpp server and a vLLM server
# one after the other and records per-concurrency results under $ART.
# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
#######################################
# Print the help text to stdout: purpose of the harness followed by the
# environment variables that override its defaults.
# Arguments: none
# Outputs:   help text on stdout (printed literally, nothing is expanded)
#######################################
usage() {
  printf '%s\n' \
    'Usage: paged-current-serving-snapshot.sh' \
    'Run a current-stack paged llama.cpp vs vLLM MoE serving snapshot on DGX.' \
    'This harness uses the clean llama.cpp mirror by default, not stale development' \
    'trees. It runs pre/post paged inference gates, then a same-session serving' \
    'comparison with the h2h client.' \
    'Environment overrides:' \
    'SRC llama.cpp source dir (default: ~/llama-phase6-source)' \
    'BIN llama.cpp build bin dir (default: $SRC/build-cuda/bin)' \
    'MODEL paged GGUF path (default: ~/bench/q36-35b-a3b-nvfp4.gguf)' \
    'VLLM_MODEL vLLM model dir (default: ~/bench/q36-35b-a3b-nvfp4-vllm)' \
    'H2H h2h client (default: ~/bench/h2h_cli3.py)' \
    'ART artifact dir (default: ~/bench/phase_current_serving_snapshot/<timestamp>)' \
    'NPL concurrency list (default: "8 32 128")' \
    'PTOK prompt filler words (default: 128)' \
    'GEN generated tokens (default: 64)' \
    'CTX llama-server context (default: 131072)' \
    'PARALLEL llama-server parallel slots (default: 128)' \
    'BATCH llama-server logical batch (default: 2048)' \
    'UBATCH llama-server physical batch (default: 512)' \
    'LLAMA_PORT llama-server port (default: 8098)' \
    'VLLM_PORT vLLM port (default: 8000)' \
    'VLLM_BIN vLLM executable (default: ~/vllm-bench/bin/vllm)' \
    'SKIP_GATES=1 to skip pre/post paged inference gates' \
    'DRY_RUN=1 validate inputs/preflight and print commands without running servers'
}
# -h / --help is the only command-line argument that is inspected; it is
# handled before any configuration or filesystem work.
case "${1:-}" in
  -h | --help)
    usage
    exit 0
    ;;
esac

# Configuration. Every setting below keeps a non-empty value already present
# in the environment and otherwise falls back to the default shown.
: "${SRC:=$HOME/llama-phase6-source}"
: "${BIN:=$SRC/build-cuda/bin}"
: "${MODEL:=$HOME/bench/q36-35b-a3b-nvfp4.gguf}"
: "${VLLM_MODEL:=$HOME/bench/q36-35b-a3b-nvfp4-vllm}"
: "${H2H:=$HOME/bench/h2h_cli3.py}"
# The timestamp is only computed when ART was not supplied by the caller.
: "${ART:=$HOME/bench/phase_current_serving_snapshot/$(date +%Y%m%d_%H%M%S)}"
: "${NPL:=8 32 128}"
: "${PTOK:=128}"
: "${GEN:=64}"
: "${CTX:=131072}"
: "${PARALLEL:=128}"
: "${BATCH:=2048}"
: "${UBATCH:=512}"
: "${LLAMA_PORT:=8098}"
: "${VLLM_PORT:=8000}"
: "${VLLM_BIN:=$HOME/vllm-bench/bin/vllm}"
: "${SKIP_GATES:=0}"
: "${DRY_RUN:=0}"

# Fixed (non-overridable) state: the GPU lock location and the PID of the
# server currently started by this script (empty when none is running).
LOCK_DIR="$HOME/gpu_bench_lock"
OWNER="$LOCK_DIR/owner"
SERVER_PID=""
#######################################
# Write one timestamped line to stdout and append the same line to
# $ART/run.log.
# Globals:   ART (read) - the directory must already exist
# Arguments: message words, joined into a single line
# Outputs:   "[<ISO-8601 timestamp>] <message>" on stdout and in run.log
#######################################
log() {
  local stamp line
  # A failing date leaves the stamp empty instead of aborting the script.
  stamp=$(date -Is) || stamp=""
  line="[$stamp] $*"
  printf '%s\n' "$line" | tee -a "$ART/run.log"
}
#######################################
# Terminate the script with status 2 unless the given path exists.
# Arguments: $1 - path to check (any file type counts as existing)
# Outputs:   an error line on stderr when the path is missing
#######################################
require_path() {
  local candidate=$1
  if [[ -e "$candidate" ]]; then
    return 0
  fi
  printf 'missing required path: %s\n' "$candidate" >&2
  exit 2
}
#######################################
# Verify that the host is idle and the GPU lock is free before any run.
# Creates $ART, records the observations in $ART/preflight.txt, then exits
# the script when a check fails.
# Globals:   ART (read), OWNER (read)
# Arguments: none
# Outputs:   the recorded observations on stdout; diagnostics on stderr
# Exits:     1 when docker containers, a local-ai-worker container, or GPU
#            compute processes are present; 3 when the lock owner file does
#            not start with "FREE"
#######################################
preflight() {
  mkdir -p "$ART"
  local docker_count local_ai compute owner
  local busy=0
  docker_count=$(docker ps -q | wc -l)
  # grep -c prints 0 but exits non-zero when nothing matches; keep going.
  local_ai=$(docker ps --format "{{.Names}}" | grep -c local-ai-worker || true)
  compute=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader | sed '/^$/d' | wc -l)
  # A missing owner file is treated the same as a free lock.
  owner="FREE-no-lock-file"
  if [[ -f "$OWNER" ]]; then
    owner=$(<"$OWNER")
  fi
  {
    echo "docker=$docker_count"
    echo "local_ai_worker=$local_ai"
    echo "compute=$compute"
    echo "$owner"
  } | tee "$ART/preflight.txt"

  # Report every failing idle check explicitly instead of relying on errexit
  # to abort silently on a bare test.
  if [[ "$docker_count" != "0" ]]; then
    echo "preflight: $docker_count docker container(s) running; expected 0" >&2
    busy=1
  fi
  if [[ "$local_ai" != "0" ]]; then
    echo "preflight: $local_ai local-ai-worker container(s) running; expected 0" >&2
    busy=1
  fi
  if [[ "$compute" != "0" ]]; then
    echo "preflight: $compute GPU compute process(es) running; expected 0" >&2
    busy=1
  fi
  if [[ "$busy" != "0" ]]; then
    exit 1
  fi

  if [[ "$owner" != FREE* ]]; then
    echo "GPU lock is busy: $owner" >&2
    exit 3
  fi
}
#######################################
# Mark the GPU lock as held by this harness by overwriting the owner file
# with an identifying tag and the current epoch time.
# Globals:   LOCK_DIR (read), OWNER (read)
# Arguments: none
# NOTE(review): the write is unconditional; freedom of the lock is only
# checked earlier by preflight.
#######################################
acquire_lock() {
  mkdir -p "$LOCK_DIR"
  printf 'codex-current-serving-snapshot %s\n' "$(date +%s)" > "$OWNER"
}
#######################################
# Stop any server this script started, force-kill leftover server processes,
# and mark the GPU lock as free. Used both directly and as the EXIT trap, so
# every step is best-effort and must not fail.
# Globals:   SERVER_PID (read, cleared), LLAMA_PORT (read),
#            LOCK_DIR (read), OWNER (read)
# Arguments: none
#######################################
release_lock() {
  if [[ -n "$SERVER_PID" ]]; then
    kill "$SERVER_PID" >/dev/null 2>&1 || true
    wait "$SERVER_PID" >/dev/null 2>&1 || true
    SERVER_PID=""
  fi
  # Both sweeps are limited to the current user's processes. The llama-server
  # pattern ends the port on a space or end-of-line so that e.g. 8098 does not
  # also match 80980. pkill failing (nothing matched) is expected and ignored.
  pkill -9 -u "$(id -u)" -f "[l]lama-server.*--port ${LLAMA_PORT}( |\$)" >/dev/null 2>&1 || true
  pkill -9 -u "$(id -u)" -f "[v]llm serve" >/dev/null 2>&1 || true
  mkdir -p "$LOCK_DIR"
  echo "FREE released-by-codex-current-serving-snapshot $(date +%s)" > "$OWNER"
}
#######################################
# Poll an HTTP endpoint once per second (at most 240 attempts) until its
# response body contains a pattern.
# Globals:   SERVER_PID (read) - when set and no longer alive, polling stops
# Arguments: $1 - URL to fetch
#            $2 - grep pattern expected in the response body
#            $3 - server log file, tailed to stderr on failure
#            $4 - file receiving the response body (curl errors: "$4.err")
# Returns:   0 once the pattern is seen; 1 on timeout or dead server
#######################################
wait_http() {
  local url=$1 pattern=$2 log_file=$3 health=$4
  local attempt
  for ((attempt = 1; attempt <= 240; attempt++)); do
    if curl -fsS "$url" > "$health" 2>"$health.err" && grep -q "$pattern" "$health"; then
      return 0
    fi
    # No point in waiting further once the tracked server has exited.
    if [[ -n "$SERVER_PID" ]] && ! kill -0 "$SERVER_PID" >/dev/null 2>&1; then
      break
    fi
    sleep 1
  done
  # Shared failure path for both the timeout and the dead-server case.
  tail -120 "$log_file" >&2 || true
  return 1
}
#######################################
# Run the paged inference gate script once, unless SKIP_GATES is "1".
# The gate's combined output is captured in $ART/gate_<name>.log and is
# always copied to stdout and $ART/run.log, including when the gate fails.
# Globals:   SKIP_GATES (read), ART (read), HOME (read)
# Arguments: $1 - gate label, used in file names and log messages
# Returns:   0 when skipped or passed; otherwise the gate script's status
#######################################
run_gate() {
  local name="$1"
  if [[ "$SKIP_GATES" == "1" ]]; then
    log "skipping $name inference gate"
    return 0
  fi
  # Resolve both paths up front so it is explicit that the log sits next to
  # (not inside) the directory handed to the gate script as its own ART.
  local gate_dir="$ART/gate_$name"
  local gate_log="$ART/gate_$name.log"
  local status=0
  log "running $name inference gate"
  # Capture the status instead of letting errexit abort here, so the gate
  # output below is still surfaced when the gate fails.
  ART="$gate_dir" "$HOME/paged-inference-gates.sh" > "$gate_log" 2>&1 || status=$?
  tee -a "$ART/run.log" < "$gate_log"
  if ((status != 0)); then
    log "$name inference gate failed with status $status; see $gate_log"
    return "$status"
  fi
  return 0
}
#######################################
# Paged llama.cpp arm: start llama-server from $BIN, wait for /health, send
# one warm-up request, run the h2h client once per concurrency in NPL, then
# stop the server.
# Globals:   ART, BIN, MODEL, CTX, BATCH, UBATCH, PARALLEL, LLAMA_PORT, H2H,
#            PTOK, GEN, NPL (read); SERVER_PID (written, cleared at the end)
# Arguments: none
# Outputs:   $ART/paged/server.log, health.json and n<N>.json; each result is
#            also copied to stdout and $ART/run.log
#######################################
run_paged() {
  local arm_dir="$ART/paged"
  local endpoint="http://127.0.0.1:$LLAMA_PORT"
  local n
  mkdir -p "$arm_dir"
  log "starting paged current-stack server"
  # The server is launched from inside $BIN, but in a subshell so the
  # script's own working directory is untouched and relative paths keep
  # resolving as before. exec keeps $! pointing at the server process; the
  # log redirection is opened before the cd takes effect.
  (
    cd "$BIN" || exit 1
    exec env LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1 GGML_NO_BACKTRACE=1 \
      ./llama-server \
      -m "$MODEL" -ngl 99 -fa on -c "$CTX" -b "$BATCH" -ub "$UBATCH" \
      --parallel "$PARALLEL" --host 127.0.0.1 --port "$LLAMA_PORT" --no-webui
  ) > "$arm_dir/server.log" 2>&1 &
  SERVER_PID=$!
  wait_http "$endpoint/health" "ok" "$arm_dir/server.log" "$arm_dir/health.json"
  # Warm-up request; its output is discarded.
  python3 "$H2H" --url "$endpoint/v1/completions" \
    --model q36 -n 8 --ptok "$PTOK" --gen 16 --nonce "warm_paged_$(date +%s)" --no-cache >/dev/null
  # NPL is a whitespace-separated list; word splitting is intentional.
  for n in $NPL; do
    log "paged n=$n"
    python3 "$H2H" --url "$endpoint/v1/completions" \
      --model q36 -n "$n" --ptok "$PTOK" --gen "$GEN" \
      --nonce "paged_${n}_$(date +%s)" --no-cache > "$arm_dir/n${n}.json"
    tee -a "$ART/run.log" < "$arm_dir/n${n}.json"
  done
  kill "$SERVER_PID" >/dev/null 2>&1 || true
  wait "$SERVER_PID" >/dev/null 2>&1 || true
  SERVER_PID=""
  # Short pause after shutdown before the caller moves on.
  sleep 3
}
#######################################
# vLLM arm: start "vllm serve", wait until /v1/models lists the served model
# name, send one warm-up request, run the h2h client once per concurrency in
# NPL, then stop the server.
# Globals:   ART, VLLM_BIN, VLLM_MODEL, VLLM_PORT, H2H, PTOK, GEN, NPL (read);
#            PATH, VLLM_LOGGING_LEVEL, HF_HUB_OFFLINE (exported);
#            SERVER_PID (written, cleared at the end)
# Arguments: none
# Outputs:   $ART/vllm/server.log, models.json and n<N>.json; each result is
#            also copied to stdout and $ART/run.log
#######################################
run_vllm() {
  local out_dir="$ART/vllm"
  local api="http://127.0.0.1:$VLLM_PORT"
  local vllm_dir concurrency
  mkdir -p "$out_dir"

  # Environment for the server: the vLLM executable's directory goes first on
  # PATH; the other two settings only get defaults when unset or empty.
  vllm_dir=$(dirname "$VLLM_BIN")
  export PATH="$vllm_dir:$PATH"
  : "${VLLM_LOGGING_LEVEL:=INFO}"
  : "${HF_HUB_OFFLINE:=1}"
  export VLLM_LOGGING_LEVEL HF_HUB_OFFLINE

  log "starting vLLM server"
  nohup "$VLLM_BIN" serve "$VLLM_MODEL" \
    --served-model-name q36 --gpu-memory-utilization 0.85 --max-model-len 4096 \
    --max-num-seqs 256 --host 127.0.0.1 --port "$VLLM_PORT" --tensor-parallel-size 1 \
    > "$out_dir/server.log" 2>&1 &
  SERVER_PID=$!
  wait_http "$api/v1/models" "q36" "$out_dir/server.log" "$out_dir/models.json"

  # Warm-up request; its output is discarded.
  python3 "$H2H" --url "$api/v1/completions" \
    --model q36 -n 8 --ptok "$PTOK" --gen 16 --nonce "warm_vllm_$(date +%s)" --no-cache >/dev/null

  # NPL is a whitespace-separated list; word splitting is intentional.
  for concurrency in $NPL; do
    log "vllm n=$concurrency"
    python3 "$H2H" --url "$api/v1/completions" \
      --model q36 -n "$concurrency" --ptok "$PTOK" --gen "$GEN" \
      --nonce "vllm_${concurrency}_$(date +%s)" --no-cache > "$out_dir/n${concurrency}.json"
    tee -a "$ART/run.log" < "$out_dir/n${concurrency}.json"
  done

  # Signal the tracked process, then force-kill every "vllm serve" process of
  # the current user before reaping the tracked one.
  kill "$SERVER_PID" >/dev/null 2>&1 || true
  pkill -9 -u "$(id -u)" -f "[v]llm serve" >/dev/null 2>&1 || true
  wait "$SERVER_PID" >/dev/null 2>&1 || true
  SERVER_PID=""
  sleep 5
}
#######################################
# Build the TSV summary from the per-concurrency result files of both arms.
# Prints one row per result (per arm, ordered by concurrency), then one
# paged/vLLM ratio row for every concurrency present in both arms.
# A ratio whose vLLM value is zero is printed as "nan" instead of aborting
# the whole summary.
# Globals:   ART (read)
# Arguments: none
# Outputs:   the table on stdout and in $ART/summary.tsv
#######################################
write_summary() {
  # The here-document is quoted, so the Python source is passed verbatim;
  # its lines must stay at column 0 because Python indentation is significant.
  python3 - "$ART" <<'PY' | tee "$ART/summary.tsv"
import json
import sys
from pathlib import Path

art = Path(sys.argv[1])
FIELDS = ("n", "agg_tps", "decode_agg_tps", "decode_perseq_tps",
          "prefill_tps", "ttft_mean_ms", "wall_s")


def ratio(num, den):
    """Return num/den with four decimals, or 'nan' for a zero denominator."""
    return f"{num / den:.4f}" if den else "nan"


rows = []
for arm in ("paged", "vllm"):
    arm_rows = []
    for path in (art / arm).glob("n*.json"):
        data = json.loads(path.read_text())
        arm_rows.append((arm,) + tuple(data[key] for key in FIELDS))
    # Order by the recorded concurrency, not by file name (n128 < n32 < n8).
    arm_rows.sort(key=lambda row: row[1])
    rows.extend(arm_rows)

print("arm\tn\tagg_tps\tdecode_agg_tps\tdecode_perseq_tps\tprefill_tps\tttft_mean_ms\twall_s")
for row in rows:
    print("\t".join(str(x) for x in row))

by_key = {(row[0], row[1]): row for row in rows}
print("\nratio\tn\tpaged_decode_over_vllm\tpaged_perseq_over_vllm\tpaged_agg_over_vllm\tpaged_ttft_over_vllm")
for n in sorted({row[1] for row in rows}):
    paged = by_key.get(("paged", n))
    vllm = by_key.get(("vllm", n))
    # Ratios need both arms at this concurrency.
    if not paged or not vllm:
        continue
    print("\t".join([
        "ratio",
        str(n),
        ratio(paged[3], vllm[3]),
        ratio(paged[4], vllm[4]),
        ratio(paged[2], vllm[2]),
        ratio(paged[6], vllm[6]),
    ]))
PY
}
# --- Main flow ---------------------------------------------------------------

# Validate every input before touching the GPU.
for required in \
  "$SRC" \
  "$BIN/llama-server" \
  "$BIN/llama-completion" \
  "$BIN/test-backend-ops" \
  "$MODEL" \
  "$VLLM_MODEL" \
  "$H2H" \
  "$VLLM_BIN"; do
  require_path "$required"
done
# The gate script is only invoked by run_gate, which does nothing when
# SKIP_GATES=1, so it is only required when gates will actually run.
if [[ "$SKIP_GATES" != "1" ]]; then
  require_path "$HOME/paged-inference-gates.sh"
fi

preflight
log "artifact=$ART"
log "source=$(git -C "$SRC" log --oneline -1)"

if [[ "$DRY_RUN" == "1" ]]; then
  log "dry run only; commands validated"
  log "would build: cmake --build $SRC/build-cuda --target llama-server llama-completion test-backend-ops -j8"
  log "would run paged NPL=[$NPL] PTOK=$PTOK GEN=$GEN"
  log "would run vLLM NPL=[$NPL] PTOK=$PTOK GEN=$GEN"
  exit 0
fi

log "building llama-server, llama-completion, and test-backend-ops"
# Build output goes to build.log only; on failure show its tail so the abort
# is not silent, then exit with cmake's own status.
build_status=0
cmake --build "$SRC/build-cuda" --target llama-server llama-completion test-backend-ops -j 8 \
  > "$ART/build.log" 2>&1 || build_status=$?
if ((build_status != 0)); then
  tail -n 40 "$ART/build.log" >&2 || true
  log "build failed with status $build_status; full output in $ART/build.log"
  exit "$build_status"
fi

run_gate pre

# The lock is held only around the two serving arms. The EXIT trap releases it
# (and stops servers) if either arm aborts the script.
# NOTE(review): both gates run outside the lock; presumably the gate script
# manages the lock itself - confirm.
acquire_lock
trap release_lock EXIT
run_paged
run_vllm
release_lock
trap - EXIT

run_gate post
write_summary
log "artifacts: $ART"