diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 14d12af1e..cfb69a684 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -183,13 +183,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
 
         try:
-            engine_model_config = await self.llm.get_model_config()
-            self.tokenizer = get_tokenizer(
-                engine_model_config.tokenizer,
-                tokenizer_mode=engine_model_config.tokenizer_mode,
-                trust_remote_code=engine_model_config.trust_remote_code,
-                truncation_side="left",
-            )
+            # vLLM >= 0.14 removed get_model_config() on AsyncLLM; the tokenizer
+            # is either already loaded on the engine or can be built from the
+            # Model name directly.
+            tokenizer = None
+            if hasattr(self.llm, "get_tokenizer"):
+                try:
+                    tokenizer = await self.llm.get_tokenizer()
+                except TypeError:
+                    tokenizer = self.llm.get_tokenizer()
+                except Exception:
+                    tokenizer = None
+            if tokenizer is None and hasattr(self.llm, "tokenizer"):
+                tokenizer = self.llm.tokenizer
+            if tokenizer is None:
+                tokenizer = get_tokenizer(
+                    request.Model,
+                    trust_remote_code=bool(request.TrustRemoteCode),
+                    truncation_side="left",
+                )
+            self.tokenizer = tokenizer
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
 
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 7dcd29db4..66a809a92 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -26,6 +26,12 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 
+# CPU builds need unsafe-best-match to pull torch==2.9.1+cpu from the
+# pytorch cpu index while still resolving transformers/vllm from pypi.
+if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
+fi
+
 # We don't embed this into the images as it is a large dependency and not always needed.
 # Besides, the speed inference are not actually usable in the current state for production use-cases.
 if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
diff --git a/backend/python/vllm/requirements-cpu-after.txt b/backend/python/vllm/requirements-cpu-after.txt
index 20cf3d395..e5e4908f7 100644
--- a/backend/python/vllm/requirements-cpu-after.txt
+++ b/backend/python/vllm/requirements-cpu-after.txt
@@ -1 +1,2 @@
-https://github.com/vllm-project/vllm/releases/download/v0.8.5/vllm-0.8.5+cpu-cp38-abi3-manylinux_2_35_x86_64.whl
+vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_x86_64.whl ; platform_machine == "x86_64"
+vllm @ https://github.com/vllm-project/vllm/releases/download/v0.14.1/vllm-0.14.1+cpu-cp38-abi3-manylinux_2_35_aarch64.whl ; platform_machine == "aarch64"
diff --git a/backend/python/vllm/requirements-cpu.txt b/backend/python/vllm/requirements-cpu.txt
index d1e882245..5eeb8a708 100644
--- a/backend/python/vllm/requirements-cpu.txt
+++ b/backend/python/vllm/requirements-cpu.txt
@@ -1,4 +1,6 @@
-accelerate
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.7.0+cpu
+accelerate
+torch==2.9.1+cpu
+torchvision
+torchaudio
 transformers