From c2f73a987eab338bbeb22e72f6eab09732510bf1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 12 Apr 2026 08:58:57 +0000 Subject: [PATCH] fix(vllm): CPU build compatibility with vllm 0.14.1 Validated end-to-end on CPU with Qwen2.5-0.5B-Instruct (LoadModel, Predict, TokenizeString, Free all working). - requirements-cpu-after.txt: pin vllm to 0.14.1+cpu (pre-built wheel from GitHub releases) for x86_64 and aarch64. vllm 0.14.1 is the newest CPU wheel whose torch dependency resolves against published PyTorch builds (torch==2.9.1+cpu). Later vllm CPU wheels currently require torch==2.10.0+cpu which is only available on the PyTorch test channel with incompatible torchvision. - requirements-cpu.txt: bump torch to 2.9.1+cpu, add torchvision/torchaudio so uv resolves them consistently from the PyTorch CPU index. - install.sh: add --index-strategy=unsafe-best-match for CPU builds so uv can mix the PyTorch index and PyPI for transitive deps (matches the existing intel profile behaviour). - backend.py LoadModel: vllm >= 0.14 removed AsyncLLMEngine.get_model_config so the old code path errored out with AttributeError on model load. Switch to the new get_tokenizer()/tokenizer accessor with a fallback to building the tokenizer directly from request.Model. 
--- backend/python/vllm/backend.py | 27 ++++++++++++++----- backend/python/vllm/install.sh | 6 +++++ .../python/vllm/requirements-cpu-after.txt | 3 ++- backend/python/vllm/requirements-cpu.txt | 6 +++-- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 14d12af1e..cfb69a684 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -183,13 +183,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") try: - engine_model_config = await self.llm.get_model_config() - self.tokenizer = get_tokenizer( - engine_model_config.tokenizer, - tokenizer_mode=engine_model_config.tokenizer_mode, - trust_remote_code=engine_model_config.trust_remote_code, - truncation_side="left", - ) + # vLLM >= 0.14 removed get_model_config() on AsyncLLM; the tokenizer + # is either already loaded on the engine or can be built from the + # Model name directly. 
+ tokenizer = None + if hasattr(self.llm, "get_tokenizer"): + try: + tokenizer = await self.llm.get_tokenizer() + except TypeError: + tokenizer = self.llm.get_tokenizer() + except Exception: + tokenizer = None + if tokenizer is None and hasattr(self.llm, "tokenizer"): + tokenizer = self.llm.tokenizer + if tokenizer is None: + tokenizer = get_tokenizer( + request.Model, + trust_remote_code=bool(request.TrustRemoteCode), + truncation_side="left", + ) + self.tokenizer = tokenizer except Exception as err: return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 7dcd29db4..66a809a92 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -26,6 +26,12 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" fi +# CPU builds need unsafe-best-match so uv can mix the PyTorch CPU index +# (torch==2.9.1+cpu) with PyPI when resolving transformers/vllm. +if [ "x${BUILD_PROFILE}" == "xcpu" ]; then + EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match" +fi + # We don't embed this into the images as it is a large dependency and not always needed. # Besides, the speed inference are not actually usable in the current state for production use-cases.
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then diff --git a/backend/python/vllm/requirements-cpu-after.txt b/backend/python/vllm/requirements-cpu-after.txt index 20cf3d395..e5e4908f7 100644 --- a/backend/python/vllm/requirements-cpu-after.txt +++ b/backend/python/vllm/requirements-cpu-after.txt @@ -1 +1,2 @@ -https://github.com/vllm-project/vllm/releases/download/v0.8.5/vllm-0.8.5+cpu-cp38-abi3-manylinux_2_35_x86_64.whl +vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_x86_64.whl ; platform_machine == "x86_64" +vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_aarch64.whl ; platform_machine == "aarch64" diff --git a/backend/python/vllm/requirements-cpu.txt b/backend/python/vllm/requirements-cpu.txt index d1e882245..5eeb8a708 100644 --- a/backend/python/vllm/requirements-cpu.txt +++ b/backend/python/vllm/requirements-cpu.txt @@ -1,4 +1,6 @@ -accelerate --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.7.0+cpu +accelerate +torch==2.9.1+cpu +torchvision +torchaudio transformers