LocalAI/backend/cpp/llama-cpp-localai-paged/paged-current-serving-snapshot.sh

#!/usr/bin/env bash
set -euo pipefail

# Print the command-line help text to stdout.
# The heredoc delimiter is quoted, so "$SRC" / "$BUILD_DIR" below are printed
# literally as documentation and are not expanded.
usage() {
  cat <<'EOF'
Usage: paged-current-serving-snapshot.sh [-h|--help] [--summarize-gates ART]

Run a current-stack paged llama.cpp vs vLLM MoE serving snapshot on DGX.

This harness uses the clean llama.cpp mirror by default, not stale development
trees. It runs pre/post paged inference gates, then a same-session serving
comparison with the h2h client.

Environment overrides:
  SRC          llama.cpp source dir (default: ~/llama-phase6-source)
  BUILD_DIR    llama.cpp CMake build dir (default: $SRC/build-cuda)
  BIN          llama.cpp build bin dir (default: $BUILD_DIR/bin)
  MODEL        paged GGUF path (default: ~/bench/q36-35b-a3b-nvfp4.gguf)
  VLLM_MODEL   vLLM model dir (default: ~/bench/q36-35b-a3b-nvfp4-vllm)
  SERVED_MODEL_NAME OpenAI model name used by llama-server, vLLM, and h2h (default: q36)
  H2H          h2h client (default: ~/bench/h2h_cli3.py)
  ART          artifact dir (default: ~/bench/phase_current_serving_snapshot/<timestamp>)
  NPL          concurrency list (default: "8 32 128")
  PTOK         prompt filler words (default: 128)
  GEN          generated tokens (default: 64)
  CTX          llama-server context (default: 131072)
  PARALLEL     llama-server parallel slots (default: 128)
  BATCH        llama-server logical batch (default: 2048)
  UBATCH       llama-server physical batch (default: 512)
  LLAMA_PORT   llama-server port (default: 8098)
  LLAMA_READY_ATTEMPTS llama-server readiness attempts, one per second (default: 240)
  VLLM_PORT    vLLM port (default: 8000)
  VLLM_BIN     vLLM executable (default: ~/vllm-bench/bin/vllm)
  VLLM_READY_ATTEMPTS  vLLM readiness attempts, one per second (default: 600)
  VLLM_GPU_MEMORY_UTILIZATION vLLM --gpu-memory-utilization (default: 0.85)
  VLLM_MAX_MODEL_LEN          vLLM --max-model-len (default: 4096)
  VLLM_MAX_NUM_SEQS           vLLM --max-num-seqs (default: 256)
  VLLM_TENSOR_PARALLEL_SIZE   vLLM --tensor-parallel-size (default: 1)
  VLLM_EXTRA_ARGS             whitespace-split extra args appended to vLLM serve (default: empty)
  SKIP_GATES=1 to skip pre/post paged inference gates
  DRY_RUN=1    validate inputs/preflight, write hardware.txt, and print commands without running servers

Options:
  -h, --help             show this help and exit
  --summarize-gates ART  write ART/gate_summary.tsv from existing gate_pre/gate_post artifacts
EOF
}

# Command-line handling: only the first argument selects a mode.
#   -h / --help            print usage to stdout, exit 0
#   --summarize-gates ART  remember ART; a missing/empty ART is a usage error
#   (no argument)          normal benchmark run
#   anything else          usage on stderr, exit 2
SUMMARY_GATES_ART=""
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
  usage
  exit 0
elif [[ "${1:-}" == "--summarize-gates" ]]; then
  if [[ -z "${2:-}" ]]; then
    usage >&2
    exit 2
  fi
  SUMMARY_GATES_ART="$2"
elif [[ -n "${1:-}" ]]; then
  usage >&2
  exit 2
fi

# Configuration: every knob keeps a non-empty value supplied by the caller's
# environment and otherwise falls back to the default shown here.
: "${SRC:=$HOME/llama-phase6-source}"
: "${BUILD_DIR:=$SRC/build-cuda}"
: "${BIN:=$BUILD_DIR/bin}"
: "${MODEL:=$HOME/bench/q36-35b-a3b-nvfp4.gguf}"
: "${VLLM_MODEL:=$HOME/bench/q36-35b-a3b-nvfp4-vllm}"
: "${SERVED_MODEL_NAME:=q36}"
: "${H2H:=$HOME/bench/h2h_cli3.py}"
: "${ART:=$HOME/bench/phase_current_serving_snapshot/$(date +%Y%m%d_%H%M%S)}"
: "${NPL:=8 32 128}"
: "${PTOK:=128}"
: "${GEN:=64}"
: "${CTX:=131072}"
: "${PARALLEL:=128}"
: "${BATCH:=2048}"
: "${UBATCH:=512}"
: "${LLAMA_PORT:=8098}"
: "${LLAMA_READY_ATTEMPTS:=240}"
: "${VLLM_PORT:=8000}"
: "${VLLM_BIN:=$HOME/vllm-bench/bin/vllm}"
: "${VLLM_READY_ATTEMPTS:=600}"
: "${VLLM_GPU_MEMORY_UTILIZATION:=0.85}"
: "${VLLM_MAX_MODEL_LEN:=4096}"
: "${VLLM_MAX_NUM_SEQS:=256}"
: "${VLLM_TENSOR_PARALLEL_SIZE:=1}"
: "${VLLM_EXTRA_ARGS:=}"
: "${SKIP_GATES:=0}"
: "${DRY_RUN:=0}"

# Reference md5 values handed to write_gate_summary for the moe/dense checks.
MOE_MD5_EXPECTED="8cb0ce23777bf55f92f63d0292c756b0"
DENSE_MD5_EXPECTED="5951a5b4d624ce891e22ab5fca9bc439"

# GPU lock bookkeeping and the PID of whichever server is currently running.
LOCK_DIR="$HOME/gpu_bench_lock"
OWNER="$LOCK_DIR/owner"
SERVER_PID=""

# Emit one timestamped line built from all arguments, both on stdout and
# appended to $ART/run.log.
log() {
  tee -a "$ART/run.log" <<< "[$(date -Is)] $*"
}

# Abort the whole script with status 2 unless the given path exists.
# Arguments: $1 - file or directory that must be present
require_path() {
  local candidate="$1"
  [[ -e "$candidate" ]] && return 0
  echo "missing required path: $candidate" >&2
  exit 2
}

# Verify the machine is idle before benchmarking and record the evidence.
# Globals:   ART (read; the directory is created), OWNER (read)
# Outputs:   container/worker/compute counts plus the lock owner line, on
#            stdout and in $ART/preflight.txt; failure reasons on stderr
# Exits:     1 when a container or GPU compute process is running,
#            3 when the lock file names a non-FREE owner
preflight() {
  mkdir -p "$ART"
  local docker_count local_ai compute owner
  docker_count=$(docker ps -q | wc -l)
  # grep -c prints 0 but exits 1 when nothing matches; that is the good case.
  local_ai=$(docker ps --format "{{.Names}}" | grep -c local-ai-worker || true)
  compute=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader | sed '/^$/d' | wc -l)
  # Some wc implementations pad the count with blanks; strip them so the
  # comparisons against "0" below are reliable.
  docker_count=${docker_count//[[:space:]]/}
  local_ai=${local_ai//[[:space:]]/}
  compute=${compute//[[:space:]]/}
  owner="FREE-no-lock-file"
  if [[ -f "$OWNER" ]]; then
    owner=$(cat "$OWNER")
  fi
  {
    echo "docker=$docker_count"
    echo "local_ai_worker=$local_ai"
    echo "compute=$compute"
    echo "$owner"
  } | tee "$ART/preflight.txt"
  # Fail explicitly with a reason instead of relying on `set -e` to abort
  # silently on a bare test.
  if [[ "$docker_count" != "0" ]]; then
    echo "preflight failed: $docker_count docker container(s) running" >&2
    exit 1
  fi
  if [[ "$local_ai" != "0" ]]; then
    echo "preflight failed: $local_ai local-ai-worker container(s) running" >&2
    exit 1
  fi
  if [[ "$compute" != "0" ]]; then
    echo "preflight failed: $compute GPU compute process(es) running" >&2
    exit 1
  fi
  case "$owner" in
    FREE*) ;;
    *) echo "GPU lock is busy: $owner" >&2; exit 3 ;;
  esac
}

# Write $ART/hardware.txt: raw nvidia-smi listings followed by the first GPU's
# name, a coarse hardware class derived from that name, and a matching
# parity note. Ends by logging where the report was written.
write_hardware_report() {
  local report="$ART/hardware.txt"
  local gpu_name
  local hardware_class="unknown"
  local parity_note="unknown hardware: classify before making parity claims"

  # A failing or missing nvidia-smi simply leaves the name empty.
  gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 || true)

  # First matching family wins, in this order.
  if [[ "$gpu_name" == *B200* || "$gpu_name" == *B100* || "$gpu_name" == *GB200* ]]; then
    hardware_class="datacenter_blackwell"
    parity_note="datacenter Blackwell hardware: full parity methodology can choose new levers"
  elif [[ "$gpu_name" == *H200* || "$gpu_name" == *H100* ]]; then
    hardware_class="datacenter_other"
    parity_note="datacenter non-Blackwell hardware: do not generalize GB10 parity decisions"
  elif [[ "$gpu_name" == *GB10* || "$gpu_name" == *"DGX Spark"* \
    || "$gpu_name" == *RTX* || "$gpu_name" == *"PRO 6000"* ]]; then
    hardware_class="gb10_or_workstation_blackwell"
    parity_note="GB10/workstation Blackwell hardware: GB10 shortcut closures apply unless new evidence says otherwise"
  fi

  {
    printf '%s\n' "nvidia_smi_L:"
    nvidia-smi -L || true
    printf '\n'
    printf '%s\n' "nvidia_smi_query:"
    # Retry without compute_cap when the full query is rejected.
    nvidia-smi --query-gpu=name,driver_version,memory.total,compute_cap --format=csv,noheader \
      || nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader \
      || true
    printf '\n'
    printf 'gpu_name=%s\n' "$gpu_name"
    printf 'hardware_class=%s\n' "$hardware_class"
    printf 'parity_note=%s\n' "$parity_note"
  } > "$report"
  log "hardware report: $report"
}

# Claim the GPU benchmark lock by writing this harness's marker to $OWNER.
# Globals: LOCK_DIR (read; created if missing), OWNER (read and written)
# Exits:   3 if the owner file currently names a non-FREE holder
#
# The owner file is re-checked here because a long time (build, pre gate) can
# pass between preflight and this call; blindly overwriting would steal a lock
# another job took in the meantime. The check-then-write is still not atomic,
# so this narrows the race window rather than eliminating it.
acquire_lock() {
  mkdir -p "$LOCK_DIR"
  local current
  if [[ -f "$OWNER" ]]; then
    current=$(cat "$OWNER")
    case "$current" in
      FREE*) ;;
      *) echo "GPU lock is busy: $current" >&2; exit 3 ;;
    esac
  fi
  echo "codex-current-serving-snapshot $(date +%s)" > "$OWNER"
}

# Stop any server this harness started, sweep up stragglers, and mark the GPU
# lock as FREE. Also installed as the EXIT trap during the benchmark phase.
# Globals: LLAMA_PORT, LOCK_DIR (read), OWNER (written)
release_lock() {
  stop_server_pid
  # Best-effort sweeps; pkill exits non-zero when nothing matches, hence
  # `|| true`. Both are limited to the current user, and the port is anchored
  # so that e.g. 8098 does not also match a server listening on 80981.
  pkill -9 -u "$(id -u)" -f "[l]lama-server.*--port ${LLAMA_PORT}( |\$)" >/dev/null 2>&1 || true
  pkill -9 -u "$(id -u)" -f "[v]llm serve" >/dev/null 2>&1 || true
  mkdir -p "$LOCK_DIR"
  echo "FREE released-by-codex-current-serving-snapshot $(date +%s)" > "$OWNER"
}

# Shut down the background server recorded in SERVER_PID, if any: send TERM,
# poll once per second for up to 30 checks, escalate to KILL if it is still
# alive, reap it, and clear SERVER_PID. A no-op when SERVER_PID is empty.
stop_server_pid() {
  [[ -n "$SERVER_PID" ]] || return 0

  kill "$SERVER_PID" >/dev/null 2>&1 || true

  local polls=0
  while (( polls < 30 )) && kill -0 "$SERVER_PID" >/dev/null 2>&1; do
    sleep 1
    polls=$(( polls + 1 ))
  done

  # Still running after the grace period: force it.
  kill -0 "$SERVER_PID" >/dev/null 2>&1 \
    && { kill -9 "$SERVER_PID" >/dev/null 2>&1 || true; }

  wait "$SERVER_PID" >/dev/null 2>&1 || true
  SERVER_PID=""
}

# Poll an HTTP endpoint once per second until its body matches a pattern.
# Arguments: $1 - URL to fetch
#            $2 - grep pattern that must appear in the response body
#            $3 - server log; its last 120 lines go to stderr on failure
#            $4 - file receiving the latest body (curl errors go to $4.err)
#            $5 - maximum number of attempts
# Globals:   SERVER_PID (read) - when set and no longer alive, give up early
# Returns:   0 once the pattern is seen, 1 on server death or exhaustion
wait_http() {
  local endpoint="$1"
  local needle="$2"
  local server_log="$3"
  local body_file="$4"
  local max_tries="$5"
  local attempt

  for (( attempt = 1; attempt <= max_tries; attempt++ )); do
    if curl --max-time 2 -fsS "$endpoint" > "$body_file" 2>"$body_file.err"; then
      if grep -q "$needle" "$body_file"; then
        return 0
      fi
    fi
    if [[ -n "$SERVER_PID" ]]; then
      if ! kill -0 "$SERVER_PID" >/dev/null 2>&1; then
        # The server process is gone; waiting longer cannot help.
        tail -120 "$server_log" >&2 || true
        return 1
      fi
    fi
    sleep 1
  done

  tail -120 "$server_log" >&2 || true
  return 1
}

# Run the paged inference gate script for one phase and keep its output.
# Arguments: $1 - phase name (the caller uses "pre" and "post")
# Globals:   SKIP_GATES, ART, HOME (read)
# Outputs:   gate output is stored in $ART/gate_<name>.log, then copied to
#            stdout and appended to $ART/run.log
# Returns:   0 on success or when SKIP_GATES=1; otherwise the gate script's
#            exit status
run_gate() {
  local name="$1"
  local gate_log="$ART/gate_$name.log"
  local rc=0
  if [[ "$SKIP_GATES" == "1" ]]; then
    log "skipping $name inference gate"
    return
  fi
  log "running $name inference gate"
  # The gate script receives its own artifact dir through ART. Capture the
  # status instead of letting `set -e` abort here, so that a failing gate's
  # output still reaches the console and run.log before we bail out.
  ART="$ART/gate_$name" "$HOME/paged-inference-gates.sh" > "$gate_log" 2>&1 || rc=$?
  tee -a "$ART/run.log" < "$gate_log"
  if (( rc != 0 )); then
    log "$name inference gate failed with status $rc; see $gate_log"
    return "$rc"
  fi
}

# Benchmark the paged llama-server arm.
# Starts llama-server from $BIN, waits for /health, runs one warm-up request
# batch, then one h2h measurement per concurrency level in $NPL, and finally
# stops the server.
# Globals: ART, BIN, MODEL, CTX, BATCH, UBATCH, PARALLEL, LLAMA_PORT,
#          LLAMA_READY_ATTEMPTS, H2H, SERVED_MODEL_NAME, PTOK, GEN, NPL (read);
#          SERVER_PID (written)
# Outputs: $ART/paged/{server.log,health.json,n<N>.json}; each result is also
#          echoed to stdout and appended to $ART/run.log
run_paged() {
  local arm_dir="$ART/paged"
  local n
  mkdir -p "$arm_dir"
  log "starting paged current-stack server"
  # Launch from inside $BIN, but in a subshell: changing the directory of the
  # main script would silently re-root every relative path (ART, H2H, the
  # vLLM settings) for all later steps. `exec` makes the subshell become the
  # server process, so $! is the server's own PID.
  (
    cd "$BIN" || exit 1
    exec env LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1 GGML_NO_BACKTRACE=1 \
      ./llama-server \
        -m "$MODEL" -ngl 99 -fa on -c "$CTX" -b "$BATCH" -ub "$UBATCH" \
        --parallel "$PARALLEL" --host 127.0.0.1 --port "$LLAMA_PORT" --no-webui
  ) > "$arm_dir/server.log" 2>&1 &
  SERVER_PID=$!
  wait_http "http://127.0.0.1:$LLAMA_PORT/health" "ok" "$arm_dir/server.log" "$arm_dir/health.json" "$LLAMA_READY_ATTEMPTS"
  # Warm-up pass; its output is discarded.
  python3 "$H2H" --url "http://127.0.0.1:$LLAMA_PORT/v1/completions" \
    --model "$SERVED_MODEL_NAME" -n 8 --ptok "$PTOK" --gen 16 --nonce "warm_paged_$(date +%s)" --no-cache >/dev/null
  # NPL is a whitespace-separated list, so it is split on purpose.
  for n in $NPL; do
    log "paged n=$n"
    python3 "$H2H" --url "http://127.0.0.1:$LLAMA_PORT/v1/completions" \
      --model "$SERVED_MODEL_NAME" -n "$n" --ptok "$PTOK" --gen "$GEN" \
      --nonce "paged_${n}_$(date +%s)" --no-cache > "$arm_dir/n${n}.json"
    tee -a "$ART/run.log" < "$arm_dir/n${n}.json"
  done
  stop_server_pid
  sleep 3
}

# Benchmark the vLLM arm.
# Starts `vllm serve`, waits until /v1/models lists the served model name,
# runs one warm-up request batch, then one h2h measurement per concurrency
# level in $NPL, and finally stops the server.
# Globals: ART, VLLM_BIN, VLLM_MODEL, VLLM_* tuning knobs, VLLM_PORT,
#          VLLM_READY_ATTEMPTS, H2H, SERVED_MODEL_NAME, PTOK, GEN, NPL (read);
#          SERVER_PID (written); PATH, VLLM_LOGGING_LEVEL, HF_HUB_OFFLINE
#          (exported for the rest of the script)
# Outputs: $ART/vllm/{server.log,models.json,n<N>.json}; each result is also
#          echoed to stdout and appended to $ART/run.log
run_vllm() {
  local arm_dir="$ART/vllm"
  local extra_args=()
  local vllm_dir n
  mkdir -p "$arm_dir"
  vllm_dir=$(dirname "$VLLM_BIN")
  export PATH="$vllm_dir:$PATH"
  export VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
  export HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-1}
  if [[ -n "$VLLM_EXTRA_ARGS" ]]; then
    # -d '' reads the whole value so newlines count as separators too, not
    # just the first line; read reports EOF as failure, hence `|| true`.
    read -r -d '' -a extra_args <<< "$VLLM_EXTRA_ARGS" || true
  fi
  log "starting vLLM server"
  # The harness's own VLLM_* settings are removed from the server's
  # environment with `env -u`; the values are passed as explicit flags.
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # bash releases older than 4.4.
  nohup env \
    -u VLLM_MODEL -u VLLM_BIN -u VLLM_READY_ATTEMPTS \
    -u VLLM_GPU_MEMORY_UTILIZATION -u VLLM_MAX_MODEL_LEN -u VLLM_MAX_NUM_SEQS \
    -u VLLM_TENSOR_PARALLEL_SIZE -u VLLM_EXTRA_ARGS \
    "$VLLM_BIN" serve "$VLLM_MODEL" \
    --served-model-name "$SERVED_MODEL_NAME" --gpu-memory-utilization "$VLLM_GPU_MEMORY_UTILIZATION" --max-model-len "$VLLM_MAX_MODEL_LEN" \
    --max-num-seqs "$VLLM_MAX_NUM_SEQS" --host 127.0.0.1 --port "$VLLM_PORT" --tensor-parallel-size "$VLLM_TENSOR_PARALLEL_SIZE" \
    ${extra_args[@]+"${extra_args[@]}"} \
    > "$arm_dir/server.log" 2>&1 &
  SERVER_PID=$!
  wait_http "http://127.0.0.1:$VLLM_PORT/v1/models" "$SERVED_MODEL_NAME" "$arm_dir/server.log" "$arm_dir/models.json" "$VLLM_READY_ATTEMPTS"
  # Warm-up pass; its output is discarded.
  python3 "$H2H" --url "http://127.0.0.1:$VLLM_PORT/v1/completions" \
    --model "$SERVED_MODEL_NAME" -n 8 --ptok "$PTOK" --gen 16 --nonce "warm_vllm_$(date +%s)" --no-cache >/dev/null
  # NPL is a whitespace-separated list, so it is split on purpose.
  for n in $NPL; do
    log "vllm n=$n"
    python3 "$H2H" --url "http://127.0.0.1:$VLLM_PORT/v1/completions" \
      --model "$SERVED_MODEL_NAME" -n "$n" --ptok "$PTOK" --gen "$GEN" \
      --nonce "vllm_${n}_$(date +%s)" --no-cache > "$arm_dir/n${n}.json"
    tee -a "$ART/run.log" < "$arm_dir/n${n}.json"
  done
  stop_server_pid
  # Sweep up any `vllm serve` process of this user that is still around.
  pkill -9 -u "$(id -u)" -f "[v]llm serve" >/dev/null 2>&1 || true
  sleep 5
}

# Build the throughput table from $ART/{paged,vllm}/n*.json.
# Prints one row per arm and concurrency level (ordered by concurrency), then
# paged/vLLM ratio rows for every concurrency measured on both arms. The same
# text is written to $ART/summary.tsv.
# Globals: ART (read)
write_summary() {
  python3 - "$ART" <<'PY' | tee "$ART/summary.tsv"
import json
import sys
from pathlib import Path

art = Path(sys.argv[1])


def ratio(num, den):
    # A zero vLLM baseline cannot be divided by; report "nan" for that cell
    # rather than crashing and losing the whole summary.
    return f"{num / den:.4f}" if den else "nan"


rows = []
for arm in ("paged", "vllm"):
    arm_rows = []
    for path in sorted((art / arm).glob("n*.json")):
        data = json.loads(path.read_text())
        arm_rows.append((arm, data["n"], data["agg_tps"], data["decode_agg_tps"],
                         data["decode_perseq_tps"], data["prefill_tps"],
                         data["ttft_mean_ms"], data["wall_s"]))
    # File names sort lexically (n128 < n32 < n8); order by the numeric
    # concurrency so the table reads the same way as the ratio section.
    arm_rows.sort(key=lambda row: row[1])
    rows.extend(arm_rows)

print("arm\tn\tagg_tps\tdecode_agg_tps\tdecode_perseq_tps\tprefill_tps\tttft_mean_ms\twall_s")
for row in rows:
    print("\t".join(str(x) for x in row))

by_key = {(row[0], row[1]): row for row in rows}
print("\nratio\tn\tpaged_decode_over_vllm\tpaged_perseq_over_vllm\tpaged_agg_over_vllm\tpaged_ttft_over_vllm")
for n in sorted({row[1] for row in rows}):
    paged = by_key.get(("paged", n))
    vllm = by_key.get(("vllm", n))
    # A ratio needs both arms at this concurrency.
    if not paged or not vllm:
        continue
    print("\t".join((
        "ratio",
        str(n),
        ratio(paged[3], vllm[3]),
        ratio(paged[4], vllm[4]),
        ratio(paged[2], vllm[2]),
        ratio(paged[6], vllm[6]),
    )))
PY
}

# Summarise the gate artifacts under $ART/gate_pre and $ART/gate_post.
# For each phase: compare moe.md5 / dense.md5 with the expected digests and
# check every op_*.txt for an "N/N tests passed" line plus a CUDA0 OK marker.
# The table is printed and written to $ART/gate_summary.tsv.
# Globals: ART, MOE_MD5_EXPECTED, DENSE_MD5_EXPECTED (read)
# Returns: the Python helper exits 6 when any check is not "ok"; a phase whose
#          directory is absent is reported as skipped and is not a failure
write_gate_summary() {
  python3 - "$ART" "$MOE_MD5_EXPECTED" "$DENSE_MD5_EXPECTED" <<'PY' | tee "$ART/gate_summary.tsv"
import re
import sys
from pathlib import Path

art = Path(sys.argv[1])
expected = {
    "moe": sys.argv[2],
    "dense": sys.argv[3],
}
# Strip colour escape sequences before matching the test output.
ansi = re.compile(r"\x1b\[[0-9;]*m")
bad = False

print("phase\tcheck\tstatus\tactual\texpected\tdetails")

for phase in ("pre", "post"):
    gate_dir = art / f"gate_{phase}"
    if not gate_dir.exists():
        print(f"{phase}\tall\tskipped\t\t\t{gate_dir} missing")
        continue

    for name, want in expected.items():
        md5_path = gate_dir / f"{name}.md5"
        if not md5_path.exists():
            print(f"{phase}\t{name}_md5\tmissing\t\t{want}\t{md5_path} missing")
            bad = True
            continue
        # An empty file has no first token; report it as a mismatch with an
        # empty actual value instead of raising IndexError.
        tokens = md5_path.read_text().split()
        got = tokens[0] if tokens else ""
        status = "ok" if got == want else "mismatch"
        if status != "ok":
            bad = True
        print(f"{phase}\t{name}_md5\t{status}\t{got}\t{want}\t{md5_path}")

    op_paths = sorted(gate_dir.glob("op_*.txt"))
    if not op_paths:
        print(f"{phase}\top\tmissing\t\t\tno op_*.txt files")
        bad = True
        continue

    for path in op_paths:
        # The glob guarantees the "op_" prefix; slicing avoids
        # str.removeprefix, which needs Python 3.9+.
        op = path.stem[len("op_"):]
        text = ansi.sub("", path.read_text(errors="replace"))
        passed = re.search(r"(\d+)/(\d+) tests passed", text)
        backend_ok = re.search(r"Backend CUDA0:\s+OK", text)
        if passed:
            actual = f"{passed.group(1)}/{passed.group(2)}"
            status = "ok" if passed.group(1) == passed.group(2) and backend_ok else "fail"
        else:
            actual = ""
            status = "missing"
        if status != "ok":
            bad = True
        print(f"{phase}\top_{op}\t{status}\t{actual}\tall\t{path}")

if bad:
    sys.exit(6)
PY
}

# --- Entry point -------------------------------------------------------------

# Summary-only mode: regenerate gate_summary.tsv for an existing artifact
# directory and stop. No inputs are validated and no servers are started.
# With `set -e` and pipefail active, a failing write_gate_summary ends the
# script before the `exit 0` below is reached.
if [[ -n "$SUMMARY_GATES_ART" ]]; then
  ART="$SUMMARY_GATES_ART"
  require_path "$ART"
  write_gate_summary
  exit 0
fi

# Every input the run needs must exist up front; the first missing path ends
# the script with status 2.
require_path "$SRC"
require_path "$BIN/llama-server"
require_path "$BIN/llama-completion"
require_path "$BIN/test-backend-ops"
require_path "$MODEL"
require_path "$VLLM_MODEL"
require_path "$H2H"
require_path "$VLLM_BIN"
require_path "$HOME/paged-inference-gates.sh"

# preflight creates $ART, so log (which appends to $ART/run.log) is only used
# after it.
preflight
write_hardware_report
log "artifact=$ART"
log "source=$(git -C "$SRC" log --oneline -1)"

# Dry run: report what would be executed, then stop before building or
# starting any server.
if [[ "$DRY_RUN" == "1" ]]; then
  log "dry run only; commands validated"
  log "would build: cmake --build $BUILD_DIR --target llama-server llama-completion test-backend-ops -j8"
  log "served model: SERVED_MODEL_NAME=$SERVED_MODEL_NAME"
  log "readiness: LLAMA_READY_ATTEMPTS=$LLAMA_READY_ATTEMPTS VLLM_READY_ATTEMPTS=$VLLM_READY_ATTEMPTS"
  log "would run paged NPL=[$NPL] PTOK=$PTOK GEN=$GEN"
  log "would run vLLM NPL=[$NPL] PTOK=$PTOK GEN=$GEN"
  log "vLLM config: VLLM_GPU_MEMORY_UTILIZATION=$VLLM_GPU_MEMORY_UTILIZATION VLLM_MAX_MODEL_LEN=$VLLM_MAX_MODEL_LEN VLLM_MAX_NUM_SEQS=$VLLM_MAX_NUM_SEQS VLLM_TENSOR_PARALLEL_SIZE=$VLLM_TENSOR_PARALLEL_SIZE VLLM_EXTRA_ARGS=[$VLLM_EXTRA_ARGS]"
  exit 0
fi

# Build output goes to $ART/build.log only.
log "building llama-server, llama-completion, and test-backend-ops"
cmake --build "$BUILD_DIR" --target llama-server llama-completion test-backend-ops -j 8 \
  > "$ART/build.log" 2>&1

# Order matters below: the pre gate runs before the lock is taken and the post
# gate after it is released. The EXIT trap covers only the window in which the
# lock is held, so an abort during either serving arm still stops the server
# and frees the lock; once release_lock has run normally the trap is removed.
run_gate pre
acquire_lock
trap release_lock EXIT
run_paged
run_vllm
release_lock
trap - EXIT
run_gate post
# Under `set -e`, a non-ok gate summary stops the script here, before
# write_summary runs.
write_gate_summary
write_summary
log "artifacts: $ART"