From ea2bbabffd4a037cb1851a2be56dae577f058069 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 12 Apr 2026 16:02:49 +0000 Subject: [PATCH] ci(vllm): use bigger-runner instead of source build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prebuilt vllm 0.14.1+cpu wheel requires SIMD instructions (AVX-512 VNNI/BF16) that stock ubuntu-latest GitHub runners don't support — vllm.model_executor.models.registry SIGILLs on import during LoadModel. Source compilation works but takes 30-40 minutes per CI run, which is too slow for an e2e smoke test. Instead, switch tests-vllm-grpc to the bigger-runner self-hosted label (already used by backend.yml for the llama-cpp CUDA build) — that hardware has the required SIMD baseline and the prebuilt wheel runs cleanly. FROM_SOURCE=true is kept as an opt-in escape hatch: - install.sh still has the CPU source-build path for hosts that need it - backend/Dockerfile.python still declares the ARG + ENV - Makefile docker-build-backend still forwards the build-arg when set Default CI path uses the fast prebuilt wheel; source build can be re-enabled by exporting FROM_SOURCE=true in the environment. --- .github/workflows/test-extra.yml | 14 ++++++-------- Makefile | 9 ++++----- backend/Dockerfile.python | 5 +++-- backend/python/vllm/install.sh | 12 ++++++------ 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index a9f10e3fc..592caffc7 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -505,8 +505,12 @@ jobs: tests-vllm-grpc: needs: detect-changes if: needs.detect-changes.outputs.vllm == 'true' || needs.detect-changes.outputs.run-all == 'true' - runs-on: ubuntu-latest - timeout-minutes: 120 + # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16 + # instructions; stock ubuntu-latest runners SIGILL on import of + # vllm.model_executor.models.registry. bigger-runner has newer + # hardware that supports the required SIMD. + runs-on: bigger-runner + timeout-minutes: 90 steps: - name: Clone uses: actions/checkout@v6 @@ -521,12 +525,6 @@ jobs: sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true df -h - name: Build vllm (cpu) backend image and run gRPC e2e tests - env: - # GitHub Actions runners don't all support the SIMD instructions - # the prebuilt vllm CPU wheel was compiled against (SIGILL in - # vllm.model_executor.models.registry on import). Build vllm from - # source so it targets the actual CI CPU. - FROM_SOURCE: "true" run: | make test-extra-backend-vllm tests-acestep-cpp: diff --git a/Makefile b/Makefile index 4464a9774..7e2e35052 100644 --- a/Makefile +++ b/Makefile @@ -509,11 +509,10 @@ test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp ## vllm is resolved from a HuggingFace model id (no file download) and ## exercises Predict + streaming + tool-call extraction via the hermes parser. -## FROM_SOURCE=true passes through to Dockerfile.python → install.sh and -## compiles vllm locally instead of using the prebuilt CPU wheel — required -## on runners whose CPU doesn't support the wheel's baked-in SIMD. -test-extra-backend-vllm: - $(MAKE) docker-build-vllm +## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU +## wheel was compiled against (AVX-512 VNNI/BF16); older CPUs will SIGILL +## on import — on CI this means using the bigger-runner label. +test-extra-backend-vllm: docker-build-vllm BACKEND_IMAGE=local-ai-backend:vllm \ BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \ BACKEND_TEST_CAPS=health,load,predict,stream,tools \ diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python index e209815db..16159c67b 100644 --- a/backend/Dockerfile.python +++ b/backend/Dockerfile.python @@ -195,8 +195,9 @@ COPY backend/backend.proto /${BACKEND}/backend.proto COPY backend/python/common/ /${BACKEND}/common COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh -# Optional per-backend source build toggle (e.g. vllm on CPU needs to -# compile against the host SIMD instead of using the prebuilt wheel). +# Optional per-backend source build toggle (e.g. vllm on CPU can set +# FROM_SOURCE=true to compile against the build host SIMD instead of +# pulling a prebuilt wheel). Default empty — most backends ignore it. ARG FROM_SOURCE="" ENV FROM_SOURCE=${FROM_SOURCE} diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index de204e0a2..cf6fa7efe 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -32,12 +32,12 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" fi -# When FROM_SOURCE=true on a CPU build, skip the prebuilt wheel in -# requirements-cpu-after.txt and compile vllm locally against the host's -# actual CPU. The prebuilt CPU wheels from vllm releases are compiled with -# wider SIMD (AVX-512 VNNI/BF16 etc.) than some environments support — in -# particular GitHub Actions runners SIGILL on the vllm model registry -# subprocess. FROM_SOURCE=true avoids that at the cost of a longer install. +# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in +# requirements-cpu-after.txt and compiles vllm locally against the host's +# actual CPU. Not used by default because it takes ~30-40 minutes, but +# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the +# required SIMD baseline, e.g. AVX-512 VNNI/BF16). Default CI uses a +# bigger-runner with compatible hardware instead. if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then # Temporarily hide the prebuilt wheel so installRequirements doesn't # pull it — the rest of the requirements files (base deps, torch,