From 88d7ba7ca8ac44a2ef082e3382aaccd408adfbbc Mon Sep 17 00:00:00 2001
From: Alan Poulain
Date: Tue, 12 Sep 2023 18:45:00 +0200
Subject: [PATCH] fix(vllm): Make sure to use max number of GPUs available (#326)

* fix(serving): vllm bad num_gpus

Signed-off-by: Alan Poulain

* ci: auto fixes from pre-commit.ci

For more information, see https://pre-commit.ci

---------

Signed-off-by: Alan Poulain
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 openllm-python/src/openllm/_assign.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openllm-python/src/openllm/_assign.py b/openllm-python/src/openllm/_assign.py
index 195d2bb2..db696ac6 100644
--- a/openllm-python/src/openllm/_assign.py
+++ b/openllm-python/src/openllm/_assign.py
@@ -1,7 +1,6 @@
 '''LLM assignment magik.'''
 from __future__ import annotations
 import functools
-import math
 import traceback
 import typing as t
 
@@ -52,7 +51,7 @@ def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vll
   def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
     if self.__llm_backend__ == 'vllm':
       num_gpus, dev = 1, device_count()
-      if dev >= 2: num_gpus = dev if dev // 2 == 0 else math.ceil(dev / 2)
+      if dev >= 2: num_gpus = min(dev // 2 * 2, dev)
       # TODO: Do some more processing with token_id once we support token streaming
       try:
         return vllm.LLMEngine.from_engine_args(