From ea2bbabffd4a037cb1851a2be56dae577f058069 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Sun, 12 Apr 2026 16:02:49 +0000
Subject: [PATCH] ci(vllm): use bigger-runner instead of source build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The prebuilt vllm 0.14.1+cpu wheel requires SIMD instructions (AVX-512
VNNI/BF16) that stock ubuntu-latest GitHub runners don't support —
vllm.model_executor.models.registry SIGILLs on import during LoadModel.

Source compilation works but takes 30-40 minutes per CI run, which is
too slow for an e2e smoke test. Instead, switch tests-vllm-grpc to the
bigger-runner self-hosted label (already used by backend.yml for the
llama-cpp CUDA build) — that hardware has the required SIMD baseline
and the prebuilt wheel runs cleanly. The job's timeout-minutes is also
lowered from 120 to 90.

FROM_SOURCE=true is kept as an opt-in escape hatch:
- install.sh still has the CPU source-build path for hosts that need it
- backend/Dockerfile.python still declares the ARG + ENV
- Makefile docker-build-backend still forwards the build-arg when set

The default CI path uses the fast prebuilt wheel; the source build can
be re-enabled by exporting FROM_SOURCE=true in the environment.
---
 .github/workflows/test-extra.yml | 14 ++++++--------
 Makefile                         |  9 ++++-----
 backend/Dockerfile.python        |  5 +++--
 backend/python/vllm/install.sh   | 12 ++++++------
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
index a9f10e3fc..592caffc7 100644
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -505,8 +505,12 @@ jobs:
   tests-vllm-grpc:
     needs: detect-changes
     if: needs.detect-changes.outputs.vllm == 'true' || needs.detect-changes.outputs.run-all == 'true'
-    runs-on: ubuntu-latest
-    timeout-minutes: 120
+    # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16
+    # instructions; stock ubuntu-latest runners SIGILL on import of
+    # vllm.model_executor.models.registry. bigger-runner has newer
+    # hardware that supports the required SIMD.
+    runs-on: bigger-runner
+    timeout-minutes: 90
     steps:
       - name: Clone
         uses: actions/checkout@v6
@@ -521,12 +525,6 @@ jobs:
           sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
           df -h
       - name: Build vllm (cpu) backend image and run gRPC e2e tests
-        env:
-          # GitHub Actions runners don't all support the SIMD instructions
-          # the prebuilt vllm CPU wheel was compiled against (SIGILL in
-          # vllm.model_executor.models.registry on import). Build vllm from
-          # source so it targets the actual CI CPU.
-          FROM_SOURCE: "true"
         run: |
           make test-extra-backend-vllm
   tests-acestep-cpp:
diff --git a/Makefile b/Makefile
index 4464a9774..7e2e35052 100644
--- a/Makefile
+++ b/Makefile
@@ -509,11 +509,10 @@ test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
 
 ## vllm is resolved from a HuggingFace model id (no file download) and
 ## exercises Predict + streaming + tool-call extraction via the hermes parser.
-## FROM_SOURCE=true passes through to Dockerfile.python → install.sh and
-## compiles vllm locally instead of using the prebuilt CPU wheel — required
-## on runners whose CPU doesn't support the wheel's baked-in SIMD.
-test-extra-backend-vllm:
-	$(MAKE) docker-build-vllm
+## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
+## wheel was compiled against (AVX-512 VNNI/BF16); older CPUs will SIGILL
+## on import — on CI this means using the bigger-runner label.
+test-extra-backend-vllm: docker-build-vllm
 	BACKEND_IMAGE=local-ai-backend:vllm \
 	BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \
 	BACKEND_TEST_CAPS=health,load,predict,stream,tools \
diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python
index e209815db..16159c67b 100644
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -195,8 +195,9 @@ COPY backend/backend.proto /${BACKEND}/backend.proto
 COPY backend/python/common/ /${BACKEND}/common
 COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh
 
-# Optional per-backend source build toggle (e.g. vllm on CPU needs to
-# compile against the host SIMD instead of using the prebuilt wheel).
+# Optional per-backend source build toggle (e.g. vllm on CPU can set
+# FROM_SOURCE=true to compile against the build host SIMD instead of
+# pulling a prebuilt wheel). Default empty — most backends ignore it.
 ARG FROM_SOURCE=""
 ENV FROM_SOURCE=${FROM_SOURCE}
 
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index de204e0a2..cf6fa7efe 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -32,12 +32,12 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi
 
-# When FROM_SOURCE=true on a CPU build, skip the prebuilt wheel in
-# requirements-cpu-after.txt and compile vllm locally against the host's
-# actual CPU. The prebuilt CPU wheels from vllm releases are compiled with
-# wider SIMD (AVX-512 VNNI/BF16 etc.) than some environments support — in
-# particular GitHub Actions runners SIGILL on the vllm model registry
-# subprocess. FROM_SOURCE=true avoids that at the cost of a longer install.
+# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
+# requirements-cpu-after.txt and compiles vllm locally against the host's
+# actual CPU. Not used by default because it takes ~30-40 minutes, but
+# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the
+# required SIMD baseline, e.g. AVX-512 VNNI/BF16). Default CI uses a
+# bigger-runner with compatible hardware instead.
 if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
     # Temporarily hide the prebuilt wheel so installRequirements doesn't
     # pull it — the rest of the requirements files (base deps, torch,