mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-17 21:40:07 -04:00
The prebuilt vllm 0.14.1+cpu wheel from GitHub releases is compiled with SIMD instructions (AVX-512 VNNI/BF16 or AMX-BF16) that not every CPU supports. GitHub Actions ubuntu-latest runners SIGILL when vllm spawns the model_executor.models.registry subprocess for introspection, so LoadModel never reaches the actual inference path. - install.sh: when FROM_SOURCE=true on a CPU build, temporarily hide requirements-cpu-after.txt so installRequirements installs the base deps + torch CPU without pulling the prebuilt wheel, then clone vllm and compile it with VLLM_TARGET_DEVICE=cpu. The resulting binaries target the host's actual CPU. - backend/Dockerfile.python: accept a FROM_SOURCE build-arg and expose it as an ENV so install.sh sees it during `make`. - Makefile docker-build-backend: forward FROM_SOURCE as --build-arg when set, so backends that need source builds can opt in. - Makefile test-extra-backend-vllm: call docker-build-vllm via a recursive $(MAKE) invocation so FROM_SOURCE flows through. - .github/workflows/test-extra.yml: set FROM_SOURCE=true on the tests-vllm-grpc job. Slower but reliable — the prebuilt wheel only works on hosts that share the build-time SIMD baseline. Answers 'did you test locally?': yes, end-to-end on my local machine with the prebuilt wheel (CPU supports AVX-512 VNNI). The CI runner CPU gap was not covered locally — this commit plugs that gap.
69 lines
3.1 KiB
Bash
Executable File
#!/bin/bash

# Abort the install on the first unhandled error.
set -e

# --no-build-isolation: build packages against the already-installed torch
# instead of a fresh isolated build environment.
EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"

# Avoid to overcommit the CPU during build
# https://github.com/vllm-project/vllm/issues/20079
# https://docs.vllm.ai/en/v0.8.3/serving/env_vars.html
# https://docs.redhat.com/it/documentation/red_hat_ai_inference_server/3.0/html/vllm_server_arguments/environment_variables-server-arguments
export NVCC_THREADS=2
export MAX_JOBS=1

# Directory containing this script; used below to locate the shared
# helper library. Quoted so paths with spaces don't break dirname.
backend_dir=$(dirname "$0")
|
|
|
|
# Load the shared backend helpers (provides installRequirements, etc.).
# The common/ directory sits next to this script in container builds and
# one level up in the source tree, so probe both locations.
# Expansions are quoted so a backend_dir containing spaces still works.
if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
else
    source "$backend_dir/../common/libbackend.sh"
fi
|
|
|
|
# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
case "x${BUILD_PROFILE}" in
    xintel)
        EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
        ;;
esac
|
|
|
|
# CPU builds need unsafe-best-match to pull torch==2.10.0+cpu from the
# pytorch test channel while still resolving transformers/vllm from pypi.
if [[ "x${BUILD_PROFILE}" == "xcpu" ]]; then
    EXTRA_PIP_INSTALL_FLAGS="${EXTRA_PIP_INSTALL_FLAGS} --index-strategy=unsafe-best-match"
fi
|
|
|
|
# When FROM_SOURCE=true on a CPU build, skip the prebuilt wheel in
# requirements-cpu-after.txt and compile vllm locally against the host's
# actual CPU. The prebuilt CPU wheels from vllm releases are compiled with
# wider SIMD (AVX-512 VNNI/BF16 etc.) than some environments support — in
# particular GitHub Actions runners SIGILL on the vllm model registry
# subprocess. FROM_SOURCE=true avoids that at the cost of a longer install.
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
    _cpu_after="${backend_dir}/requirements-cpu-after.txt"
    _cpu_after_bak=""
    _vllm_src=""

    # Single cleanup path fired on ANY exit: restore the hidden
    # requirements file (the original code only restored it on the
    # success path — a failure inside installRequirements under `set -e`
    # would leave the file permanently hidden) and remove the temporary
    # source checkout.
    _from_source_cleanup() {
        if [ -n "${_cpu_after_bak}" ] && [ -f "${_cpu_after_bak}" ]; then
            mv "${_cpu_after_bak}" "${_cpu_after}"
        fi
        if [ -n "${_vllm_src}" ]; then
            rm -rf -- "${_vllm_src}"
        fi
    }
    trap _from_source_cleanup EXIT

    # Temporarily hide the prebuilt wheel so installRequirements doesn't
    # pull it — the rest of the requirements files (base deps, torch,
    # transformers) are still installed normally.
    if [ -f "${_cpu_after}" ]; then
        _cpu_after_bak="${_cpu_after}.from-source.bak"
        mv "${_cpu_after}" "${_cpu_after_bak}"
    fi

    installRequirements

    # Restore eagerly so later steps see the tree in its original state;
    # clearing the marker makes the EXIT trap a no-op for this file.
    if [ -n "${_cpu_after_bak}" ]; then
        mv "${_cpu_after_bak}" "${_cpu_after}"
        _cpu_after_bak=""
    fi

    # Build vllm from source against the installed torch.
    # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/
    _vllm_src=$(mktemp -d)
    git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm"
    pushd "${_vllm_src}/vllm"
    # Build-time deps must be provided explicitly because
    # --no-build-isolation disables the PEP 517 isolated environment.
    uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm
    # Respect pre-installed torch version — skip vllm's own requirements-build.txt torch pin.
    VLLM_TARGET_DEVICE=cpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
    popd
else
    installRequirements
fi
|