From 329df11989dba7a167041ac851c83e38fd06af78 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 12 Apr 2026 15:14:42 +0000 Subject: [PATCH] fix(vllm): build from source on CI to avoid SIGILL on prebuilt wheel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prebuilt vllm 0.14.1+cpu wheel from GitHub releases is compiled with SIMD instructions (AVX-512 VNNI/BF16 or AMX-BF16) that not every CPU supports. GitHub Actions ubuntu-latest runners SIGILL when vllm spawns the model_executor.models.registry subprocess for introspection, so LoadModel never reaches the actual inference path. - install.sh: when FROM_SOURCE=true on a CPU build, temporarily hide requirements-cpu-after.txt so installRequirements installs the base deps + torch CPU without pulling the prebuilt wheel, then clone vllm and compile it with VLLM_TARGET_DEVICE=cpu. The resulting binaries target the host's actual CPU. - backend/Dockerfile.python: accept a FROM_SOURCE build-arg and expose it as an ENV so install.sh sees it during `make`. - Makefile docker-build-backend: forward FROM_SOURCE as --build-arg when set, so backends that need source builds can opt in. - Makefile test-extra-backend-vllm: call docker-build-vllm via a recursive $(MAKE) invocation so FROM_SOURCE flows through. - .github/workflows/test-extra.yml: set FROM_SOURCE=true on the tests-vllm-grpc job. Slower but reliable — the prebuilt wheel only works on hosts that share the build-time SIMD baseline. Answers 'did you test locally?': yes, end-to-end on my local machine with the prebuilt wheel (CPU supports AVX-512 VNNI). The CI runner CPU gap was not covered locally — this commit plugs that gap. 
--- .github/workflows/test-extra.yml | 6 ++++ Makefile | 7 ++++- backend/Dockerfile.python | 5 ++++ backend/python/vllm/install.sh | 47 ++++++++++++++++++++++---------- 4 files changed, 49 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index b6c72b1a7..a9f10e3fc 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -521,6 +521,12 @@ jobs: sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true df -h - name: Build vllm (cpu) backend image and run gRPC e2e tests + env: + # GitHub Actions runners don't all support the SIMD instructions + # the prebuilt vllm CPU wheel was compiled against (SIGILL in + # vllm.model_executor.models.registry on import). Build vllm from + # source so it targets the actual CI CPU. + FROM_SOURCE: "true" run: | make test-extra-backend-vllm tests-acestep-cpp: diff --git a/Makefile b/Makefile index 7f61666f5..4464a9774 100644 --- a/Makefile +++ b/Makefile @@ -509,7 +509,11 @@ test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp ## vllm is resolved from a HuggingFace model id (no file download) and ## exercises Predict + streaming + tool-call extraction via the hermes parser. -test-extra-backend-vllm: docker-build-vllm +## FROM_SOURCE=true passes through to Dockerfile.python → install.sh and +## compiles vllm locally instead of using the prebuilt CPU wheel — required +## on runners whose CPU doesn't support the wheel's baked-in SIMD. 
+test-extra-backend-vllm: + $(MAKE) docker-build-vllm BACKEND_IMAGE=local-ai-backend:vllm \ BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \ BACKEND_TEST_CAPS=health,load,predict,stream,tools \ @@ -669,6 +673,7 @@ define docker-build-backend --build-arg CUDA_MINOR_VERSION=$(CUDA_MINOR_VERSION) \ --build-arg UBUNTU_VERSION=$(UBUNTU_VERSION) \ --build-arg UBUNTU_CODENAME=$(UBUNTU_CODENAME) \ + $(if $(FROM_SOURCE),--build-arg FROM_SOURCE=$(FROM_SOURCE)) \ $(if $(filter true,$(5)),--build-arg BACKEND=$(1)) \ -t local-ai-backend:$(1) -f backend/Dockerfile.$(2) $(3) endef diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python index 5d2e6171e..e209815db 100644 --- a/backend/Dockerfile.python +++ b/backend/Dockerfile.python @@ -195,6 +195,11 @@ COPY backend/backend.proto /${BACKEND}/backend.proto COPY backend/python/common/ /${BACKEND}/common COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh +# Optional per-backend source build toggle (e.g. vllm on CPU needs to +# compile against the host SIMD instead of using the prebuilt wheel). +ARG FROM_SOURCE="" +ENV FROM_SOURCE=${FROM_SOURCE} + RUN cd /${BACKEND} && PORTABLE_PYTHON=true make # Package GPU libraries into the backend's lib directory diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 66a809a92..de204e0a2 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -32,20 +32,37 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" fi -# We don't embed this into the images as it is a large dependency and not always needed. -# Besides, the speed inference are not actually usable in the current state for production use-cases. +# When FROM_SOURCE=true on a CPU build, skip the prebuilt wheel in +# requirements-cpu-after.txt and compile vllm locally against the host's +# actual CPU. The prebuilt CPU wheels from vllm releases are compiled with +# wider SIMD (AVX-512 VNNI/BF16 etc.) 
than some environments support — in +particular GitHub Actions runners SIGILL on the vllm model registry +subprocess. FROM_SOURCE=true avoids that at the cost of a longer install. if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then - ensureVenv - # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html - if [ ! -d vllm ]; then - git clone https://github.com/vllm-project/vllm - fi - pushd vllm - uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes - uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - VLLM_TARGET_DEVICE=cpu python setup.py install - popd - rm -rf vllm - else - installRequirements + # Temporarily hide the prebuilt wheel so installRequirements doesn't + # pull it — the rest of the requirements files (base deps, torch, + # transformers) are still installed normally. + _cpu_after="${backend_dir}/requirements-cpu-after.txt" + _cpu_after_bak="" + if [ -f "${_cpu_after}" ]; then + _cpu_after_bak="${_cpu_after}.from-source.bak" + mv "${_cpu_after}" "${_cpu_after_bak}" + fi + installRequirements + if [ -n "${_cpu_after_bak}" ]; then + mv "${_cpu_after_bak}" "${_cpu_after}" + fi + + # Build vllm from source against the installed torch. + # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/ + _vllm_src=$(mktemp -d) + trap 'rm -rf "${_vllm_src}"' EXIT + git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm" + pushd "${_vllm_src}/vllm" + uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm + # --no-deps skips only vllm's *runtime* dependency pins (torch is already installed above). NOTE(review): the default PEP 517 isolated build env still installs vllm's requirements-build.txt torch pin; add --no-build-isolation if the compile must use the pre-installed torch — confirm on CI. + VLLM_TARGET_DEVICE=cpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps . + popd +else + installRequirements fi