chore(paged): add DGX inference gate runner

Add a reusable paged llama.cpp gate script for DGX work. It checks docker/local-ai-worker/GPU lock state, runs the canonical MoE and dense transcript md5 gates, and runs selected test-backend-ops filters. Verified on dgx.casa: MoE 8cb0ce23777bf55f92f63d0292c756b0, dense 5951a5b4d624ce891e22ab5fca9bc439, MUL_MAT_ID 806/806. Artifact: /home/mudler/bench/paged_inference_gates/20260701_040048. Assisted-by: Codex:gpt-5
2026-07-03 04:46:54 -04:00 · 2026-07-01 02:01:55 +00:00
parent ede23df333
commit e169058e73
2 changed files with 140 additions and 0 deletions
--- a/backend/cpp/llama-cpp-localai-paged/README.md
+++ b/backend/cpp/llama-cpp-localai-paged/README.md
@@ -339,6 +339,10 @@ is" -n 48 --temp 0 --seed 1 | md5sum`, paged paths prefixed with
 `LLAMA_KV_PAGED=1` (+ `LLAMA_MOE_FORCE_GRAPHS=1` for paged MoE), on the default
 chat-template path; and (2) `test-backend-ops` (CUDA0 vs CPU oracle) for every
 touched op (`SSM_CONV*`, `GATED_DELTA_NET`, `MUL_MAT`, `MUL_MAT_ID`).
+For DGX work, `paged-inference-gates.sh` runs the canonical MoE/dense transcript
+md5 checks and selected `test-backend-ops` filters. It refuses to start while
+any Docker container (including `local-ai-worker`) is running, a GPU compute
+process is active, or the GPU lock (`~/gpu_bench_lock/owner`) is not free.

 **The gate is per-path** (see [`PAGED_BITEXACT_NOTE.md`](docs/PAGED_BITEXACT_NOTE.md)).
 Dense is bit-exact across paged/non-paged (`5951a5b4`). The **paged MoE** md5
--- a/backend/cpp/llama-cpp-localai-paged/paged-inference-gates.sh
+++ b/backend/cpp/llama-cpp-localai-paged/paged-inference-gates.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+  cat <<'EOF'
+Usage: paged-inference-gates.sh
+
+Run the LocalAI paged llama.cpp inference safety gates on a DGX checkout.
+
+Environment:
+  BIN        llama.cpp build bin dir (default: ~/llama-phase6-source/build-cuda/bin)
+  MOE        MoE GGUF path (default: ~/bench/q36-35b-a3b-nvfp4.gguf)
+  DENSE      Dense GGUF path (default: ~/bench/q36-27b-nvfp4.gguf)
+  ART        artifact dir (default: ~/bench/paged_inference_gates/<timestamp>)
+  OPS        comma-separated test-backend-ops filters (default: MUL_MAT_ID)
+  EXTRA_ENV  extra env assignments for completion gates, e.g. "GDN_TC=5"
+
+Expected md5:
+  MoE paged:   8cb0ce23777bf55f92f63d0292c756b0
+  Dense paged: 5951a5b4d624ce891e22ab5fca9bc439
+EOF
+  exit 0
+fi
+
+# Reference md5 digests that the paged MoE and dense transcripts must match.
+MOE_MD5_EXPECTED=8cb0ce23777bf55f92f63d0292c756b0
+DENSE_MD5_EXPECTED=5951a5b4d624ce891e22ab5fca9bc439
+
+# Environment knobs: a non-empty value supplied by the caller wins, otherwise
+# the default below is assigned. The ART timestamp is only computed when ART
+# was not supplied.
+: "${BIN:=$HOME/llama-phase6-source/build-cuda/bin}"
+: "${MOE:=$HOME/bench/q36-35b-a3b-nvfp4.gguf}"
+: "${DENSE:=$HOME/bench/q36-27b-nvfp4.gguf}"
+: "${OPS:=MUL_MAT_ID}"
+: "${ART:=$HOME/bench/paged_inference_gates/$(date +%Y%m%d_%H%M%S)}"
+: "${EXTRA_ENV:=}"
+
+require_file() {
+  if [[ ! -e "$1" ]]; then
+    echo "missing required path: $1" >&2
+    exit 2
+  fi
+}
+
+check_idle() {
+  if command -v docker >/dev/null 2>&1; then
+    local docker_count
+    docker_count=$(docker ps -q | wc -l)
+    if [[ "$docker_count" != "0" ]]; then
+      echo "docker containers are running: $docker_count" >&2
+      docker ps >&2
+      exit 3
+    fi
+
+    local local_ai_worker
+    local_ai_worker=$(docker ps --format "{{.Names}}" | grep -c local-ai-worker || true)
+    if [[ "$local_ai_worker" != "0" ]]; then
+      echo "local-ai-worker container is running" >&2
+      exit 3
+    fi
+  fi
+
+  if command -v nvidia-smi >/dev/null 2>&1; then
+    local compute_count
+    compute_count=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader | sed "/^$/d" | wc -l)
+    if [[ "$compute_count" != "0" ]]; then
+      echo "GPU compute processes are already running: $compute_count" >&2
+      nvidia-smi >&2
+      exit 3
+    fi
+  fi
+
+  local owner_file="$HOME/gpu_bench_lock/owner"
+  if [[ -f "$owner_file" ]]; then
+    local owner
+    owner=$(cat "$owner_file")
+    if [[ -n "$owner" && "$owner" != FREE* ]]; then
+      echo "GPU lock is owned: $owner" >&2
+      exit 3
+    fi
+  fi
+}
+
+run_completion_gate() {
+  local name=$1
+  local model=$2
+  local expected=$3
+  local out="$ART/${name}.txt"
+  local err="$ART/${name}.err"
+  local md5_file="$ART/${name}.md5"
+
+  env LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1 GGML_NO_BACKTRACE=1 $EXTRA_ENV \
+    "$BIN/llama-completion" -m "$model" -ngl 99 -fa on -c 4096 \
+      --temp 0 --seed 1 -n 48 -p "The capital of France is" \
+      </dev/null >"$out" 2>"$err"
+
+  md5sum "$out" >"$md5_file"
+  local actual
+  actual=$(awk '{print $1}' "$md5_file")
+  if [[ "$actual" != "$expected" ]]; then
+    echo "$name md5 mismatch: got $actual expected $expected" >&2
+    echo "artifacts: $ART" >&2
+    exit 4
+  fi
+  echo "$name md5 OK: $actual"
+}
+
+run_op_gate() {
+  local op=$1
+  local out="$ART/op_${op}.txt"
+  "$BIN/test-backend-ops" test -b CUDA0 -o "$op" -j 1 >"$out" 2>&1
+  if ! grep -q "Backend CUDA0: .*OK" "$out"; then
+    echo "$op gate failed" >&2
+    tail -80 "$out" >&2
+    echo "artifacts: $ART" >&2
+    exit 5
+  fi
+  grep -E "[0-9]+/[0-9]+ tests passed|Backend CUDA0" "$out" | tail -2
+}
+
+# Main flow: prepare the artifact dir, validate inputs, confirm the host is
+# idle, then run the transcript gates followed by the per-op gates.
+mkdir -p "$ART"
+for required_path in \
+  "$BIN/llama-completion" \
+  "$BIN/test-backend-ops" \
+  "$MOE" \
+  "$DENSE"; do
+  require_file "$required_path"
+done
+check_idle
+
+run_completion_gate moe "$MOE" "$MOE_MD5_EXPECTED"
+run_completion_gate dense "$DENSE" "$DENSE_MD5_EXPECTED"
+
+# OPS is a comma-separated list; whitespace inside an entry is dropped and
+# entries that end up empty are skipped.
+IFS=',' read -r -a op_filters <<<"$OPS"
+for op_filter in "${op_filters[@]}"; do
+  op_filter=${op_filter//[[:space:]]/}
+  if [[ -n "$op_filter" ]]; then
+    run_op_gate "$op_filter"
+  fi
+done
+
+echo "paged inference gates OK"
+echo "artifacts: $ART"