Files
LocalAI/backend/cpp/llama-cpp-localai-paged/paged-inference-gates.sh
Ettore Di Giacinto e169058e73 chore(paged): add DGX inference gate runner
Add a reusable paged llama.cpp gate script for DGX work. It checks docker/local-ai-worker/GPU lock state, runs the canonical MoE and dense transcript md5 gates, and runs selected test-backend-ops filters.

Verified on dgx.casa: MoE 8cb0ce23777bf55f92f63d0292c756b0, dense 5951a5b4d624ce891e22ab5fca9bc439, MUL_MAT_ID 806/806. Artifact: /home/mudler/bench/paged_inference_gates/20260701_040048.

Assisted-by: Codex:gpt-5
2026-07-01 02:01:55 +00:00

137 lines
3.8 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
cat <<'EOF'
Usage: paged-inference-gates.sh
Run the LocalAI paged llama.cpp inference safety gates on a DGX checkout.
Environment:
BIN llama.cpp build bin dir (default: ~/llama-phase6-source/build-cuda/bin)
MOE MoE GGUF path (default: ~/bench/q36-35b-a3b-nvfp4.gguf)
DENSE Dense GGUF path (default: ~/bench/q36-27b-nvfp4.gguf)
ART artifact dir (default: ~/bench/paged_inference_gates/<timestamp>)
OPS comma-separated test-backend-ops filters (default: MUL_MAT_ID)
EXTRA_ENV extra env assignments for completion gates, e.g. "GDN_TC=5"
Expected md5:
MoE paged: 8cb0ce23777bf55f92f63d0292c756b0
Dense paged: 5951a5b4d624ce891e22ab5fca9bc439
EOF
exit 0
fi
MOE_MD5_EXPECTED=8cb0ce23777bf55f92f63d0292c756b0
DENSE_MD5_EXPECTED=5951a5b4d624ce891e22ab5fca9bc439
BIN=${BIN:-"$HOME/llama-phase6-source/build-cuda/bin"}
MOE=${MOE:-"$HOME/bench/q36-35b-a3b-nvfp4.gguf"}
DENSE=${DENSE:-"$HOME/bench/q36-27b-nvfp4.gguf"}
OPS=${OPS:-MUL_MAT_ID}
ART=${ART:-"$HOME/bench/paged_inference_gates/$(date +%Y%m%d_%H%M%S)"}
EXTRA_ENV=${EXTRA_ENV:-}
require_file() {
if [[ ! -e "$1" ]]; then
echo "missing required path: $1" >&2
exit 2
fi
}
check_idle() {
if command -v docker >/dev/null 2>&1; then
local docker_count
docker_count=$(docker ps -q | wc -l)
if [[ "$docker_count" != "0" ]]; then
echo "docker containers are running: $docker_count" >&2
docker ps >&2
exit 3
fi
local local_ai_worker
local_ai_worker=$(docker ps --format "{{.Names}}" | grep -c local-ai-worker || true)
if [[ "$local_ai_worker" != "0" ]]; then
echo "local-ai-worker container is running" >&2
exit 3
fi
fi
if command -v nvidia-smi >/dev/null 2>&1; then
local compute_count
compute_count=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader | sed "/^$/d" | wc -l)
if [[ "$compute_count" != "0" ]]; then
echo "GPU compute processes are already running: $compute_count" >&2
nvidia-smi >&2
exit 3
fi
fi
local owner_file="$HOME/gpu_bench_lock/owner"
if [[ -f "$owner_file" ]]; then
local owner
owner=$(cat "$owner_file")
if [[ -n "$owner" && "$owner" != FREE* ]]; then
echo "GPU lock is owned: $owner" >&2
exit 3
fi
fi
}
run_completion_gate() {
local name=$1
local model=$2
local expected=$3
local out="$ART/${name}.txt"
local err="$ART/${name}.err"
local md5_file="$ART/${name}.md5"
env LLAMA_KV_PAGED=1 LLAMA_MOE_FORCE_GRAPHS=1 GGML_NO_BACKTRACE=1 $EXTRA_ENV \
"$BIN/llama-completion" -m "$model" -ngl 99 -fa on -c 4096 \
--temp 0 --seed 1 -n 48 -p "The capital of France is" \
</dev/null >"$out" 2>"$err"
md5sum "$out" >"$md5_file"
local actual
actual=$(awk '{print $1}' "$md5_file")
if [[ "$actual" != "$expected" ]]; then
echo "$name md5 mismatch: got $actual expected $expected" >&2
echo "artifacts: $ART" >&2
exit 4
fi
echo "$name md5 OK: $actual"
}
run_op_gate() {
local op=$1
local out="$ART/op_${op}.txt"
"$BIN/test-backend-ops" test -b CUDA0 -o "$op" -j 1 >"$out" 2>&1
if ! grep -q "Backend CUDA0: .*OK" "$out"; then
echo "$op gate failed" >&2
tail -80 "$out" >&2
echo "artifacts: $ART" >&2
exit 5
fi
grep -E "[0-9]+/[0-9]+ tests passed|Backend CUDA0" "$out" | tail -2
}
mkdir -p "$ART"
require_file "$BIN/llama-completion"
require_file "$BIN/test-backend-ops"
require_file "$MOE"
require_file "$DENSE"
check_idle
run_completion_gate moe "$MOE" "$MOE_MD5_EXPECTED"
run_completion_gate dense "$DENSE" "$DENSE_MD5_EXPECTED"
IFS=',' read -r -a op_list <<<"$OPS"
for op in "${op_list[@]}"; do
op=${op//[[:space:]]/}
[[ -n "$op" ]] || continue
run_op_gate "$op"
done
echo "paged inference gates OK"
echo "artifacts: $ART"