From c2f73a987eab338bbeb22e72f6eab09732510bf1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 12 Apr 2026 08:58:57 +0000 Subject: [PATCH] fix(vllm): CPU build compatibility with vllm 0.14.1 Validated end-to-end on CPU with Qwen2.5-0.5B-Instruct (LoadModel, Predict, TokenizeString, Free all working). - requirements-cpu-after.txt: pin vllm to 0.14.1+cpu (pre-built wheel from GitHub releases) for x86_64 and aarch64. vllm 0.14.1 is the newest CPU wheel whose torch dependency resolves against published PyTorch builds (torch==2.9.1+cpu). Later vllm CPU wheels currently require torch==2.10.0+cpu which is only available on the PyTorch test channel with incompatible torchvision. - requirements-cpu.txt: bump torch to 2.9.1+cpu, add torchvision/torchaudio so uv resolves them consistently from the PyTorch CPU index. - install.sh: add --index-strategy=unsafe-best-match for CPU builds so uv can mix the PyTorch index and PyPI for transitive deps (matches the existing intel profile behaviour). - backend.py LoadModel: vllm >= 0.14 removed AsyncLLMEngine.get_model_config so the old code path errored out with AttributeError on model load. Switch to the new get_tokenizer()/tokenizer accessor with a fallback to building the tokenizer directly from request.Model. 
--- backend/python/vllm/backend.py | 27 ++++++++++++++----- backend/python/vllm/install.sh | 6 +++++ .../python/vllm/requirements-cpu-after.txt | 3 ++- backend/python/vllm/requirements-cpu.txt | 6 +++-- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 14d12af1e..cfb69a684 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -183,13 +183,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") try: - engine_model_config = await self.llm.get_model_config() - self.tokenizer = get_tokenizer( - engine_model_config.tokenizer, - tokenizer_mode=engine_model_config.tokenizer_mode, - trust_remote_code=engine_model_config.trust_remote_code, - truncation_side="left", - ) + # vLLM >= 0.14 removed get_model_config() on AsyncLLM; the tokenizer + # is either already loaded on the engine or can be built from the + # Model name directly. 
+ tokenizer = None + if hasattr(self.llm, "get_tokenizer"): + try: + tokenizer = await self.llm.get_tokenizer() + except TypeError: + tokenizer = self.llm.get_tokenizer() + except Exception: + tokenizer = None + if tokenizer is None and hasattr(self.llm, "tokenizer"): + tokenizer = self.llm.tokenizer + if tokenizer is None: + tokenizer = get_tokenizer( + request.Model, + trust_remote_code=bool(request.TrustRemoteCode), + truncation_side="left", + ) + self.tokenizer = tokenizer except Exception as err: return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 7dcd29db4..66a809a92 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -26,6 +26,12 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" fi +# CPU builds need unsafe-best-match so uv can mix the PyTorch CPU index +# (torch==2.9.1+cpu) with PyPI when resolving transformers/vllm. +if [ "x${BUILD_PROFILE}" == "xcpu" ]; then + EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" +fi + # We don't embed this into the images as it is a large dependency and not always needed. # Besides, the speed inference are not actually usable in the current state for production use-cases.
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then diff --git a/backend/python/vllm/requirements-cpu-after.txt b/backend/python/vllm/requirements-cpu-after.txt index 20cf3d395..e5e4908f7 100644 --- a/backend/python/vllm/requirements-cpu-after.txt +++ b/backend/python/vllm/requirements-cpu-after.txt @@ -1 +1,2 @@ -https://github.com/vllm-project/vllm/releases/download/v0.8.5/vllm-0.8.5+cpu-cp38-abi3-manylinux_2_35_x86_64.whl +vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_x86_64.whl ; platform_machine == "x86_64" +vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_aarch64.whl ; platform_machine == "aarch64" diff --git a/backend/python/vllm/requirements-cpu.txt b/backend/python/vllm/requirements-cpu.txt index d1e882245..5eeb8a708 100644 --- a/backend/python/vllm/requirements-cpu.txt +++ b/backend/python/vllm/requirements-cpu.txt @@ -1,4 +1,6 @@ -accelerate --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.7.0+cpu +accelerate +torch==2.9.1+cpu +torchvision +torchaudio transformers