mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-17 04:56:52 -04:00
fix(python-backend): make JIT subprocesses work on hosts of any size (#9679)
Two related runtime fixes for Python backends that JIT-compile CUDA
kernels at first model load (FlashInfer, PyTorch inductor, triton):
1. libbackend.sh: replace `source ${EDIR}/venv/bin/activate` with a
minimal manual setup (_activateVenv: export VIRTUAL_ENV, prepend
PATH, unset PYTHONHOME) computed from $EDIR at runtime. `uv venv`
and `python -m venv` both bake the create-time absolute path into
bin/activate (e.g. VIRTUAL_ENV='/vllm/venv' from the Docker build
stage), so sourcing activate on a relocated venv — copied out of
the build container and unpacked at an arbitrary backend dir —
prepends a stale, non-existent path to $PATH. Pip-installed CLI
tools (e.g. ninja, used by FlashInfer's NVFP4 GEMM JIT) are then
never found and the load aborts with FileNotFoundError. Doing the
env setup ourselves matches what `uv run` does internally and
sidesteps the relocation problem entirely. Generic — every Python
backend benefits.
2. vllm/run.sh: replace ninja's default -j$(nproc)+2 with an adaptive
MAX_JOBS = min(nproc, (MemAvailable-4)/4). Each concurrent
nvcc/cudafe++ peaks at multiple GiB; the default OOM-kills on
memory-tight hosts (e.g. a 16 GiB desktop loading a 27B NVFP4
model) but underutilises 100-core / 1 TB boxes. User-set MAX_JOBS
still wins. Also pin NVCC_THREADS=2 unless overridden.
Refs: https://github.com/vllm-project/vllm/issues/20079
Assisted-by: Claude:claude-opus-4-7 [Edit] [Bash]
This commit is contained in:
committed by
GitHub
parent
8e43842175
commit
16b2d4c807
@@ -318,6 +318,21 @@ _makeVenvPortable() {
|
||||
}
|
||||
|
||||
|
||||
# Apply the venv to the current process: VIRTUAL_ENV, PATH, PYTHONHOME hygiene.
# Equivalent to the runtime portion of `source bin/activate`, but computed from
# $EDIR (resolved at runtime) instead of the path baked into bin/activate at
# venv-create time. `uv venv` (and `python -m venv`) both bake the create-time
# absolute path in, so sourcing activate on a relocated venv — e.g. one built
# at /vllm/venv inside a Docker stage and unpacked under
# /backends/cuda13-vllm-development/venv at runtime — silently prepends a
# stale, non-existent path to $PATH. Doing the setup ourselves sidesteps that;
# this is the same approach `uv run` takes internally.
#
# Globals:  EDIR (read) — backend directory containing the venv
#           VIRTUAL_ENV, PATH (written), PYTHONHOME (unset)
_activateVenv() {
    export VIRTUAL_ENV="${EDIR}/venv"
    # Prepend so the venv's python/pip and pip-installed CLI tools (ninja, …)
    # shadow any system-wide copies.
    export PATH="${EDIR}/venv/bin:${PATH}"
    # A stray PYTHONHOME would override the venv's site-packages resolution.
    unset PYTHONHOME
}
|
||||
|
||||
# ensureVenv makes sure that the venv for the backend both exists, and is activated.
|
||||
#
|
||||
# This function is idempotent, so you can call it as many times as you want and it will
|
||||
@@ -354,7 +369,7 @@ function ensureVenv() {
|
||||
venv_args="--copies"
|
||||
fi
|
||||
"${interpreter}" -m venv ${venv_args} "${EDIR}/venv"
|
||||
source "${EDIR}/venv/bin/activate"
|
||||
_activateVenv
|
||||
"${interpreter}" -m pip install --upgrade pip
|
||||
else
|
||||
if [ "x${PORTABLE_PYTHON}" == "xtrue" ]; then
|
||||
@@ -375,7 +390,7 @@ function ensureVenv() {
|
||||
fi
|
||||
|
||||
if [ "x${VIRTUAL_ENV:-}" != "x${EDIR}/venv" ]; then
|
||||
source "${EDIR}/venv/bin/activate"
|
||||
_activateVenv
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,30 @@ set -x
|
||||
|
||||
backend_dir=$(dirname $0)
|
||||
|
||||
# FlashInfer / PyTorch JIT-compile CUDA kernels at first model load (e.g.
|
||||
# the NVFP4 GEMM kernel for Blackwell SM120). Each concurrent nvcc /
|
||||
# cudafe++ peaks at multiple GiB during compilation; ninja's default
|
||||
# (-j$(nproc)+2) OOM-kills on memory-tight hosts but underutilises
|
||||
# 100-core / 1 TB boxes. Default MAX_JOBS to the smaller of the CPU count
|
||||
# and an available-memory budget at ~4 GiB per job. User-set MAX_JOBS in
|
||||
# the environment wins.
|
||||
# https://github.com/vllm-project/vllm/issues/20079
|
||||
# FlashInfer / PyTorch JIT-compile CUDA kernels at first model load (e.g.
# the NVFP4 GEMM kernel for Blackwell SM120). Each concurrent nvcc /
# cudafe++ peaks at multiple GiB during compilation; ninja's default
# (-j$(nproc)+2) OOM-kills on memory-tight hosts but underutilises
# 100-core / 1 TB boxes. Default MAX_JOBS to the smaller of the CPU count
# and an available-memory budget at ~4 GiB per job. User-set MAX_JOBS in
# the environment wins.
# https://github.com/vllm-project/vllm/issues/20079
if [ -z "${MAX_JOBS:-}" ]; then
    # Fall back to 1 CPU / 0 kB when nproc or /proc/meminfo is unavailable
    # (e.g. non-Linux build hosts); the clamps below keep the result sane.
    _ncpus=$(nproc 2>/dev/null || echo 1)
    _mem_avail_kb=$(awk '/^MemAvailable:/ {print $2; exit}' /proc/meminfo 2>/dev/null || echo 0)
    _mem_avail_gb=$(( _mem_avail_kb / 1024 / 1024 ))
    # Reserve ~4 GiB for the rest of the system; budget ~4 GiB per job.
    if [ "${_mem_avail_gb}" -gt 8 ]; then
        _mem_jobs=$(( (_mem_avail_gb - 4) / 4 ))
    else
        _mem_jobs=1
    fi
    # Clamp to the range [1, nproc].
    [ "${_mem_jobs}" -lt 1 ] && _mem_jobs=1
    [ "${_mem_jobs}" -gt "${_ncpus}" ] && _mem_jobs=${_ncpus}
    export MAX_JOBS="${_mem_jobs}"
fi
# Intra-nvcc parallelism; modest default so it composes with MAX_JOBS
# without blowing the memory budget. User-set value wins.
export NVCC_THREADS="${NVCC_THREADS:-2}"
|
||||
|
||||
if [ -d $backend_dir/common ]; then
|
||||
source $backend_dir/common/libbackend.sh
|
||||
else
|
||||
|
||||
Reference in New Issue
Block a user