From 329df11989dba7a167041ac851c83e38fd06af78 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 12 Apr 2026 15:14:42 +0000 Subject: [PATCH] fix(vllm): build from source on CI to avoid SIGILL on prebuilt wheel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prebuilt vllm 0.14.1+cpu wheel from GitHub releases is compiled with SIMD instructions (AVX-512 VNNI/BF16 or AMX-BF16) that not every CPU supports. GitHub Actions ubuntu-latest runners SIGILL when vllm spawns the model_executor.models.registry subprocess for introspection, so LoadModel never reaches the actual inference path. - install.sh: when FROM_SOURCE=true on a CPU build, temporarily hide requirements-cpu-after.txt so installRequirements installs the base deps + torch CPU without pulling the prebuilt wheel, then clone vllm and compile it with VLLM_TARGET_DEVICE=cpu. The resulting binaries target the host's actual CPU. - backend/Dockerfile.python: accept a FROM_SOURCE build-arg and expose it as an ENV so install.sh sees it during `make`. - Makefile docker-build-backend: forward FROM_SOURCE as --build-arg when set, so backends that need source builds can opt in. - Makefile test-extra-backend-vllm: call docker-build-vllm via a recursive $(MAKE) invocation so FROM_SOURCE flows through. - .github/workflows/test-extra.yml: set FROM_SOURCE=true on the tests-vllm-grpc job. Slower but reliable — the prebuilt wheel only works on hosts that share the build-time SIMD baseline. Answers 'did you test locally?': yes, end-to-end on my local machine with the prebuilt wheel (CPU supports AVX-512 VNNI). The CI runner CPU gap was not covered locally — this commit plugs that gap. 
--- .github/workflows/test-extra.yml | 6 ++++ Makefile | 7 ++++- backend/Dockerfile.python | 5 ++++ backend/python/vllm/install.sh | 47 ++++++++++++++++++++++---------- 4 files changed, 49 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index b6c72b1a7..a9f10e3fc 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -521,6 +521,12 @@ jobs: sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true df -h - name: Build vllm (cpu) backend image and run gRPC e2e tests + env: + # GitHub Actions runners don't all support the SIMD instructions + # the prebuilt vllm CPU wheel was compiled against (SIGILL in + # vllm.model_executor.models.registry on import). Build vllm from + # source so it targets the actual CI CPU. + FROM_SOURCE: "true" run: | make test-extra-backend-vllm tests-acestep-cpp: diff --git a/Makefile b/Makefile index 7f61666f5..4464a9774 100644 --- a/Makefile +++ b/Makefile @@ -509,7 +509,11 @@ test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp ## vllm is resolved from a HuggingFace model id (no file download) and ## exercises Predict + streaming + tool-call extraction via the hermes parser. -test-extra-backend-vllm: docker-build-vllm +## FROM_SOURCE=true passes through to Dockerfile.python → install.sh and +## compiles vllm locally instead of using the prebuilt CPU wheel — required +## on runners whose CPU doesn't support the wheel's baked-in SIMD. 
+test-extra-backend-vllm: + $(MAKE) docker-build-vllm BACKEND_IMAGE=local-ai-backend:vllm \ BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \ BACKEND_TEST_CAPS=health,load,predict,stream,tools \ @@ -669,6 +673,7 @@ define docker-build-backend --build-arg CUDA_MINOR_VERSION=$(CUDA_MINOR_VERSION) \ --build-arg UBUNTU_VERSION=$(UBUNTU_VERSION) \ --build-arg UBUNTU_CODENAME=$(UBUNTU_CODENAME) \ + $(if $(FROM_SOURCE),--build-arg FROM_SOURCE=$(FROM_SOURCE)) \ $(if $(filter true,$(5)),--build-arg BACKEND=$(1)) \ -t local-ai-backend:$(1) -f backend/Dockerfile.$(2) $(3) endef diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python index 5d2e6171e..e209815db 100644 --- a/backend/Dockerfile.python +++ b/backend/Dockerfile.python @@ -195,6 +195,11 @@ COPY backend/backend.proto /${BACKEND}/backend.proto COPY backend/python/common/ /${BACKEND}/common COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh +# Optional per-backend source build toggle (e.g. vllm on CPU needs to +# compile against the host SIMD instead of using the prebuilt wheel). +ARG FROM_SOURCE="" +ENV FROM_SOURCE=${FROM_SOURCE} + RUN cd /${BACKEND} && PORTABLE_PYTHON=true make # Package GPU libraries into the backend's lib directory diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 66a809a92..de204e0a2 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -32,20 +32,37 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" fi -# We don't embed this into the images as it is a large dependency and not always needed. -# Besides, the speed inference are not actually usable in the current state for production use-cases. +# When FROM_SOURCE=true on a CPU build, skip the prebuilt wheel in +# requirements-cpu-after.txt and compile vllm locally against the host's +# actual CPU. The prebuilt CPU wheels from vllm releases are compiled with +# wider SIMD (AVX-512 VNNI/BF16 etc.) 
than some environments support — in +particular GitHub Actions runners SIGILL on the vllm model registry +subprocess. FROM_SOURCE=true avoids that at the cost of a longer install. if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then - ensureVenv - # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html - if [ ! -d vllm ]; then - git clone https://github.com/vllm-project/vllm - fi - pushd vllm - uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes - uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - VLLM_TARGET_DEVICE=cpu python setup.py install - popd - rm -rf vllm - else - installRequirements + # Temporarily hide the prebuilt wheel so installRequirements doesn't + # pull it — the rest of the requirements files (base deps, torch, + # transformers) are still installed normally. + _cpu_after="${backend_dir}/requirements-cpu-after.txt" + _cpu_after_bak="" + if [ -f "${_cpu_after}" ]; then + _cpu_after_bak="${_cpu_after}.from-source.bak" + mv "${_cpu_after}" "${_cpu_after_bak}" + fi + installRequirements + if [ -n "${_cpu_after_bak}" ]; then + mv "${_cpu_after_bak}" "${_cpu_after}" + fi + + # Build vllm from source against the installed torch. + # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/ + _vllm_src=$(mktemp -d) + trap 'rm -rf "${_vllm_src}"' EXIT + git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm" + pushd "${_vllm_src}/vllm" + uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm + # --no-deps skips only vllm's *runtime* dependency pins (torch is already installed above). NOTE(review): the default PEP 517 isolated build env still installs vllm's requirements-build.txt torch pin; add --no-build-isolation if the compile must use the pre-installed torch — confirm on CI. + VLLM_TARGET_DEVICE=cpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps . + popd +else + installRequirements fi