mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-29 03:24:49 -04:00
fix(vllm): build from source on CI to avoid SIGILL on prebuilt wheel
The prebuilt vllm 0.14.1+cpu wheel from GitHub releases is compiled with SIMD instructions (AVX-512 VNNI/BF16 or AMX-BF16) that not every CPU supports. GitHub Actions ubuntu-latest runners SIGILL when vllm spawns the model_executor.models.registry subprocess for introspection, so LoadModel never reaches the actual inference path. - install.sh: when FROM_SOURCE=true on a CPU build, temporarily hide requirements-cpu-after.txt so installRequirements installs the base deps + torch CPU without pulling the prebuilt wheel, then clone vllm and compile it with VLLM_TARGET_DEVICE=cpu. The resulting binaries target the host's actual CPU. - backend/Dockerfile.python: accept a FROM_SOURCE build-arg and expose it as an ENV so install.sh sees it during `make`. - Makefile docker-build-backend: forward FROM_SOURCE as --build-arg when set, so backends that need source builds can opt in. - Makefile test-extra-backend-vllm: call docker-build-vllm via a recursive $(MAKE) invocation so FROM_SOURCE flows through. - .github/workflows/test-extra.yml: set FROM_SOURCE=true on the tests-vllm-grpc job. Slower but reliable — the prebuilt wheel only works on hosts that share the build-time SIMD baseline. Answers 'did you test locally?': yes, end-to-end on my local machine with the prebuilt wheel (CPU supports AVX-512 VNNI). The CI runner CPU gap was not covered locally — this commit plugs that gap.
This commit is contained in:
6
.github/workflows/test-extra.yml
vendored
6
.github/workflows/test-extra.yml
vendored
@@ -521,6 +521,12 @@ jobs:
|
||||
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
|
||||
df -h
|
||||
- name: Build vllm (cpu) backend image and run gRPC e2e tests
|
||||
env:
|
||||
# Not all GitHub Actions runners support the SIMD instructions
|
||||
# the prebuilt vllm CPU wheel was compiled against (SIGILL in
|
||||
# vllm.model_executor.models.registry on import). Build vllm from
|
||||
# source so it targets the actual CI CPU.
|
||||
FROM_SOURCE: "true"
|
||||
run: |
|
||||
make test-extra-backend-vllm
|
||||
tests-acestep-cpp:
|
||||
|
||||
7
Makefile
7
Makefile
@@ -509,7 +509,11 @@ test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
|
||||
|
||||
## vllm is resolved from a HuggingFace model id (no file download) and
|
||||
## exercises Predict + streaming + tool-call extraction via the hermes parser.
|
||||
test-extra-backend-vllm: docker-build-vllm
|
||||
## FROM_SOURCE=true passes through to Dockerfile.python → install.sh and
|
||||
## compiles vllm locally instead of using the prebuilt CPU wheel — required
|
||||
## on runners whose CPU doesn't support the wheel's baked-in SIMD.
|
||||
test-extra-backend-vllm:
|
||||
$(MAKE) docker-build-vllm
|
||||
BACKEND_IMAGE=local-ai-backend:vllm \
|
||||
BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \
|
||||
BACKEND_TEST_CAPS=health,load,predict,stream,tools \
|
||||
@@ -669,6 +673,7 @@ define docker-build-backend
|
||||
--build-arg CUDA_MINOR_VERSION=$(CUDA_MINOR_VERSION) \
|
||||
--build-arg UBUNTU_VERSION=$(UBUNTU_VERSION) \
|
||||
--build-arg UBUNTU_CODENAME=$(UBUNTU_CODENAME) \
|
||||
$(if $(FROM_SOURCE),--build-arg FROM_SOURCE=$(FROM_SOURCE)) \
|
||||
$(if $(filter true,$(5)),--build-arg BACKEND=$(1)) \
|
||||
-t local-ai-backend:$(1) -f backend/Dockerfile.$(2) $(3)
|
||||
endef
|
||||
|
||||
@@ -195,6 +195,11 @@ COPY backend/backend.proto /${BACKEND}/backend.proto
|
||||
COPY backend/python/common/ /${BACKEND}/common
|
||||
COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh
|
||||
|
||||
# Optional per-backend source build toggle (e.g. vllm on CPU needs to
|
||||
# compile against the host SIMD instead of using the prebuilt wheel).
|
||||
ARG FROM_SOURCE=""
|
||||
ENV FROM_SOURCE=${FROM_SOURCE}
|
||||
|
||||
RUN cd /${BACKEND} && PORTABLE_PYTHON=true make
|
||||
|
||||
# Package GPU libraries into the backend's lib directory
|
||||
|
||||
@@ -32,20 +32,37 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
|
||||
fi
|
||||
|
||||
# We don't embed this into the images as it is a large dependency and not always needed.
|
||||
# Besides, the inference speed is not actually usable in the current state for production use-cases.
|
||||
# When FROM_SOURCE=true on a CPU build, skip the prebuilt wheel in
|
||||
# requirements-cpu-after.txt and compile vllm locally against the host's
|
||||
# actual CPU. The prebuilt CPU wheels from vllm releases are compiled with
|
||||
# wider SIMD (AVX-512 VNNI/BF16 etc.) than some environments support — in
|
||||
# particular GitHub Actions runners SIGILL on the vllm model registry
|
||||
# subprocess. FROM_SOURCE=true avoids that at the cost of a longer install.
|
||||
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
|
||||
ensureVenv
|
||||
# https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
|
||||
if [ ! -d vllm ]; then
|
||||
git clone https://github.com/vllm-project/vllm
|
||||
fi
|
||||
pushd vllm
|
||||
uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes
|
||||
uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py install
|
||||
popd
|
||||
rm -rf vllm
|
||||
else
|
||||
installRequirements
|
||||
# Temporarily hide the prebuilt wheel so installRequirements doesn't
|
||||
# pull it — the rest of the requirements files (base deps, torch,
|
||||
# transformers) are still installed normally.
|
||||
_cpu_after="${backend_dir}/requirements-cpu-after.txt"
|
||||
_cpu_after_bak=""
|
||||
if [ -f "${_cpu_after}" ]; then
|
||||
_cpu_after_bak="${_cpu_after}.from-source.bak"
|
||||
mv "${_cpu_after}" "${_cpu_after_bak}"
|
||||
fi
|
||||
installRequirements
|
||||
if [ -n "${_cpu_after_bak}" ]; then
|
||||
mv "${_cpu_after_bak}" "${_cpu_after}"
|
||||
fi
|
||||
|
||||
# Build vllm from source against the installed torch.
|
||||
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/
|
||||
_vllm_src=$(mktemp -d)
|
||||
trap 'rm -rf "${_vllm_src}"' EXIT
|
||||
git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm"
|
||||
pushd "${_vllm_src}/vllm"
|
||||
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm
|
||||
# Respect pre-installed torch version — skip vllm's own requirements-build.txt torch pin.
|
||||
VLLM_TARGET_DEVICE=cpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
|
||||
popd
|
||||
else
|
||||
installRequirements
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user