diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh
index 982dafab3..dc7517d06 100644
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -318,6 +318,21 @@ _makeVenvPortable() {
 }
 
+# Apply the venv to the current process: VIRTUAL_ENV, PATH, PYTHONHOME hygiene.
+# Equivalent to the runtime portion of `source bin/activate`, but computed from
+# $EDIR (resolved at runtime via realpath) instead of the path baked into
+# bin/activate at venv-create time. `uv venv` (and `python -m venv`) both bake
+# the create-time absolute path in, so sourcing activate on a relocated venv —
+# e.g. one built at /vllm/venv inside a Docker stage and unpacked under
+# /backends/cuda13-vllm-development/venv at runtime — silently prepends a
+# stale, non-existent path to $PATH. Doing the setup ourselves sidesteps that;
+# this is the same approach `uv run` takes internally.
+_activateVenv() {
+    export VIRTUAL_ENV="${EDIR}/venv"
+    export PATH="${EDIR}/venv/bin:${PATH}"
+    unset PYTHONHOME
+}
+
 # ensureVenv makes sure that the venv for the backend both exists, and is activated.
 #
 # This function is idempotent, so you can call it as many times as you want and it will
@@ -354,7 +369,7 @@ function ensureVenv() {
             venv_args="--copies"
         fi
         "${interpreter}" -m venv ${venv_args} "${EDIR}/venv"
-        source "${EDIR}/venv/bin/activate"
+        _activateVenv
         "${interpreter}" -m pip install --upgrade pip
     else
         if [ "x${PORTABLE_PYTHON}" == "xtrue" ]; then
@@ -375,7 +390,7 @@ function ensureVenv() {
     fi
 
     if [ "x${VIRTUAL_ENV:-}" != "x${EDIR}/venv" ]; then
-        source "${EDIR}/venv/bin/activate"
+        _activateVenv
     fi
 }
 
diff --git a/backend/python/vllm/run.sh b/backend/python/vllm/run.sh
index 59efc33ff..83607fb8d 100755
--- a/backend/python/vllm/run.sh
+++ b/backend/python/vllm/run.sh
@@ -3,6 +3,30 @@ set -x
 
 backend_dir=$(dirname $0)
 
+# FlashInfer / PyTorch JIT-compile CUDA kernels at first model load (e.g.
+# the NVFP4 GEMM kernel for Blackwell SM120). Each concurrent nvcc /
+# cudafe++ peaks at multiple GiB during compilation; ninja's default
+# (-j$(nproc)+2) OOM-kills on memory-tight hosts but underutilises
+# 100-core / 1 TB boxes. Default MAX_JOBS to the smaller of the CPU count
+# and an available-memory budget at ~4 GiB per job. User-set MAX_JOBS in
+# the environment wins.
+# https://github.com/vllm-project/vllm/issues/20079
+if [ -z "${MAX_JOBS:-}" ]; then
+    _ncpus=$(nproc 2>/dev/null || echo 1)
+    _mem_avail_kb=$(awk '/^MemAvailable:/ {print $2; exit}' /proc/meminfo 2>/dev/null || echo 0)
+    _mem_avail_gb=$(( _mem_avail_kb / 1024 / 1024 ))
+    # Reserve ~4 GiB for the rest of the system; budget ~4 GiB per job.
+    if [ "${_mem_avail_gb}" -gt 8 ]; then
+        _mem_jobs=$(( (_mem_avail_gb - 4) / 4 ))
+    else
+        _mem_jobs=1
+    fi
+    [ "${_mem_jobs}" -lt 1 ] && _mem_jobs=1
+    [ "${_mem_jobs}" -gt "${_ncpus}" ] && _mem_jobs=${_ncpus}
+    export MAX_JOBS="${_mem_jobs}"
+fi
+export NVCC_THREADS="${NVCC_THREADS:-2}"
+
 if [ -d $backend_dir/common ]; then
     source $backend_dir/common/libbackend.sh
 else
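
For illustration, a minimal sketch of the relocation problem _activateVenv works around, runnable in bash with python3 on PATH; the /tmp/build and /tmp/runtime paths are made up for the example and are not part of the change:

    # Build a venv at one path, relocate it, then compare what `source
    # bin/activate` vs. the runtime-computed setup put at the front of $PATH.
    mkdir -p /tmp/build /tmp/runtime
    python3 -m venv /tmp/build/venv
    mv /tmp/build/venv /tmp/runtime/venv

    # bin/activate still carries the create-time path, so a stale directory
    # is prepended:
    source /tmp/runtime/venv/bin/activate
    echo "$PATH" | cut -d: -f1    # /tmp/build/venv/bin (no longer exists)
    deactivate

    # Deriving the paths from the runtime location instead, as _activateVenv does:
    EDIR=/tmp/runtime
    export VIRTUAL_ENV="${EDIR}/venv"
    export PATH="${EDIR}/venv/bin:${PATH}"
    unset PYTHONHOME
    echo "$PATH" | cut -d: -f1    # /tmp/runtime/venv/bin

For the run.sh heuristic, the arithmetic works out as: a host reporting 16 CPUs and 64 GiB of MemAvailable gets MAX_JOBS=(64-4)/4=15, under the 16-CPU cap, while a 128-core box with only 40 GiB available is held to (40-4)/4=9 jobs; in both cases an explicit MAX_JOBS already set in the environment is left untouched.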