From 16b2d4c807bde46cd86e16836c84ec61cf396079 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Tue, 5 May 2026 23:28:01 +0100 Subject: [PATCH] fix(python-backend): make JIT subprocesses work on hosts of any size (#9679) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related runtime fixes for Python backends that JIT-compile CUDA kernels at first model load (FlashInfer, PyTorch inductor, triton): 1. libbackend.sh: replace `source ${EDIR}/venv/bin/activate` with a minimal manual setup (_activateVenv: export VIRTUAL_ENV, prepend PATH, unset PYTHONHOME) computed from $EDIR at runtime. `uv venv` and `python -m venv` both bake the create-time absolute path into bin/activate (e.g. VIRTUAL_ENV='/vllm/venv' from the Docker build stage), so sourcing activate on a relocated venv — copied out of the build container and unpacked at an arbitrary backend dir — prepends a stale, non-existent path to $PATH. Pip-installed CLI tools (e.g. ninja, used by FlashInfer's NVFP4 GEMM JIT) are then never found and the load aborts with FileNotFoundError. Doing the env setup ourselves matches what `uv run` does internally and sidesteps the relocation problem entirely. Generic — every Python backend benefits. 2. vllm/run.sh: replace ninja's default -j$(nproc)+2 with an adaptive MAX_JOBS = min(nproc, (MemAvailable-4)/4). Each concurrent nvcc/cudafe++ peaks at multiple GiB; the default OOM-kills on memory-tight hosts (e.g. a 16 GiB desktop loading a 27B NVFP4 model) but underutilises 100-core / 1 TB boxes. User-set MAX_JOBS still wins. Also pin NVCC_THREADS=2 unless overridden. 
Refs: https://github.com/vllm-project/vllm/issues/20079 Assisted-by: Claude:claude-opus-4-7 --- backend/python/common/libbackend.sh | 19 +++++++++++++++++-- backend/python/vllm/run.sh | 24 ++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index 982dafab3..dc7517d06 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -318,6 +318,21 @@ _makeVenvPortable() { } +# Apply the venv to the current process: VIRTUAL_ENV, PATH, PYTHONHOME hygiene. +# Equivalent to the runtime portion of `source bin/activate`, but computed from +# $EDIR (resolved at runtime via realpath) instead of the path baked into +# bin/activate at venv-create time. `uv venv` (and `python -m venv`) both bake +# the create-time absolute path in, so sourcing activate on a relocated venv — +# e.g. one built at /vllm/venv inside a Docker stage and unpacked under +# /backends/cuda13-vllm-development/venv at runtime — silently prepends a +# stale, non-existent path to $PATH. Doing the setup ourselves sidesteps that; +# this is the same approach `uv run` takes internally. +_activateVenv() { + export VIRTUAL_ENV="${EDIR}/venv" + export PATH="${EDIR}/venv/bin:${PATH}" + unset PYTHONHOME +} + # ensureVenv makes sure that the venv for the backend both exists, and is activated. 
# # This function is idempotent, so you can call it as many times as you want and it will @@ -354,7 +369,7 @@ function ensureVenv() { venv_args="--copies" fi "${interpreter}" -m venv ${venv_args} "${EDIR}/venv" - source "${EDIR}/venv/bin/activate" + _activateVenv "${interpreter}" -m pip install --upgrade pip else if [ "x${PORTABLE_PYTHON}" == "xtrue" ]; then @@ -375,7 +390,7 @@ function ensureVenv() { fi if [ "x${VIRTUAL_ENV:-}" != "x${EDIR}/venv" ]; then - source "${EDIR}/venv/bin/activate" + _activateVenv fi } diff --git a/backend/python/vllm/run.sh b/backend/python/vllm/run.sh index 59efc33ff..83607fb8d 100755 --- a/backend/python/vllm/run.sh +++ b/backend/python/vllm/run.sh @@ -3,6 +3,30 @@ set -x backend_dir=$(dirname $0) +# FlashInfer / PyTorch JIT-compile CUDA kernels at first model load (e.g. +# the NVFP4 GEMM kernel for Blackwell SM120). Each concurrent nvcc / +# cudafe++ peaks at multiple GiB during compilation; ninja's default +# (-j$(nproc)+2) OOM-kills on memory-tight hosts but underutilises +# 100-core / 1 TB boxes. Default MAX_JOBS to the smaller of the CPU count +# and an available-memory budget at ~4 GiB per job. User-set MAX_JOBS in +# the environment wins. +# https://github.com/vllm-project/vllm/issues/20079 +if [ -z "${MAX_JOBS:-}" ]; then + _ncpus=$(nproc 2>/dev/null || echo 1) + _mem_avail_kb=$(awk '/^MemAvailable:/ {print $2; exit}' /proc/meminfo 2>/dev/null || echo 0) + _mem_avail_gb=$(( _mem_avail_kb / 1024 / 1024 )) + # Reserve ~4 GiB for the rest of the system; budget ~4 GiB per job. + if [ "${_mem_avail_gb}" -gt 8 ]; then + _mem_jobs=$(( (_mem_avail_gb - 4) / 4 )) + else + _mem_jobs=1 + fi + [ "${_mem_jobs}" -lt 1 ] && _mem_jobs=1 + [ "${_mem_jobs}" -gt "${_ncpus}" ] && _mem_jobs=${_ncpus} + export MAX_JOBS="${_mem_jobs}" +fi +export NVCC_THREADS="${NVCC_THREADS:-2}" + if [ -d $backend_dir/common ]; then source $backend_dir/common/libbackend.sh else