# Summary: run the vllm (cpu) gRPC e2e job against the prebuilt vllm CPU wheel
# instead of building vllm from source.
#
#   * .github/workflows/test-extra.yml: the tests-vllm-grpc job moves from
#     ubuntu-latest to the bigger-runner label, its timeout drops from 120 to
#     90 minutes, and the FROM_SOURCE="true" env entry is removed.
#   * Makefile: test-extra-backend-vllm lists docker-build-vllm as a
#     prerequisite instead of invoking $(MAKE) docker-build-vllm in its recipe.
#   * backend/Dockerfile.python, backend/python/vllm/install.sh: comment-only
#     changes describing FROM_SOURCE=true as an optional fallback.
#
# NOTE(review): the line breaks below match every hunk header count, but the
# leading whitespace inside the hunks (YAML nesting, Makefile recipe tabs,
# shell indentation) was reconstructed from a whitespace-collapsed copy —
# confirm it against the target files before applying.
diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
index a9f10e3fc..592caffc7 100644
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -505,8 +505,12 @@ jobs:
   tests-vllm-grpc:
     needs: detect-changes
     if: needs.detect-changes.outputs.vllm == 'true' || needs.detect-changes.outputs.run-all == 'true'
-    runs-on: ubuntu-latest
-    timeout-minutes: 120
+    # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16
+    # instructions; stock ubuntu-latest runners SIGILL on import of
+    # vllm.model_executor.models.registry. bigger-runner has newer
+    # hardware that supports the required SIMD.
+    runs-on: bigger-runner
+    timeout-minutes: 90
     steps:
       - name: Clone
         uses: actions/checkout@v6
@@ -521,12 +525,6 @@ jobs:
           sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
           df -h
       - name: Build vllm (cpu) backend image and run gRPC e2e tests
-        env:
-          # GitHub Actions runners don't all support the SIMD instructions
-          # the prebuilt vllm CPU wheel was compiled against (SIGILL in
-          # vllm.model_executor.models.registry on import). Build vllm from
-          # source so it targets the actual CI CPU.
-          FROM_SOURCE: "true"
         run: |
           make test-extra-backend-vllm
   tests-acestep-cpp:
diff --git a/Makefile b/Makefile
index 4464a9774..7e2e35052 100644
--- a/Makefile
+++ b/Makefile
@@ -509,11 +509,10 @@ test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
 
 ## vllm is resolved from a HuggingFace model id (no file download) and
 ## exercises Predict + streaming + tool-call extraction via the hermes parser.
-## FROM_SOURCE=true passes through to Dockerfile.python → install.sh and
-## compiles vllm locally instead of using the prebuilt CPU wheel — required
-## on runners whose CPU doesn't support the wheel's baked-in SIMD.
-test-extra-backend-vllm:
-	$(MAKE) docker-build-vllm
+## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
+## wheel was compiled against (AVX-512 VNNI/BF16); older CPUs will SIGILL
+## on import — on CI this means using the bigger-runner label.
+test-extra-backend-vllm: docker-build-vllm
 	BACKEND_IMAGE=local-ai-backend:vllm \
 	BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \
 	BACKEND_TEST_CAPS=health,load,predict,stream,tools \
diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python
index e209815db..16159c67b 100644
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -195,8 +195,9 @@ COPY backend/backend.proto /${BACKEND}/backend.proto
 COPY backend/python/common/ /${BACKEND}/common
 COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh
 
-# Optional per-backend source build toggle (e.g. vllm on CPU needs to
-# compile against the host SIMD instead of using the prebuilt wheel).
+# Optional per-backend source build toggle (e.g. vllm on CPU can set
+# FROM_SOURCE=true to compile against the build host SIMD instead of
+# pulling a prebuilt wheel). Default empty — most backends ignore it.
 ARG FROM_SOURCE=""
 ENV FROM_SOURCE=${FROM_SOURCE}
 
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index de204e0a2..cf6fa7efe 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -32,12 +32,12 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi
 
-# When FROM_SOURCE=true on a CPU build, skip the prebuilt wheel in
-# requirements-cpu-after.txt and compile vllm locally against the host's
-# actual CPU. The prebuilt CPU wheels from vllm releases are compiled with
-# wider SIMD (AVX-512 VNNI/BF16 etc.) than some environments support — in
-# particular GitHub Actions runners SIGILL on the vllm model registry
-# subprocess. FROM_SOURCE=true avoids that at the cost of a longer install.
+# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
+# requirements-cpu-after.txt and compiles vllm locally against the host's
+# actual CPU. Not used by default because it takes ~30-40 minutes, but
+# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the
+# required SIMD baseline, e.g. AVX-512 VNNI/BF16). Default CI uses a
+# bigger-runner with compatible hardware instead.
 if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
     # Temporarily hide the prebuilt wheel so installRequirements doesn't
     # pull it — the rest of the requirements files (base deps, torch,