mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-03-02 21:56:10 -05:00
fix(vllm): Make sure to use max number of GPUs available (#326)
* fix(serving): vllm bad num_gpus

  Signed-off-by: Alan Poulain <contact@alanpoulain.eu>

* ci: auto fixes from pre-commit.ci

  For more information, see https://pre-commit.ci

---------

Signed-off-by: Alan Poulain <contact@alanpoulain.eu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
'''LLM assignment magik.'''
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import math
|
||||
import traceback
|
||||
import typing as t
|
||||
|
||||
@@ -52,7 +51,7 @@ def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vll
|
||||
def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
|
||||
if self.__llm_backend__ == 'vllm':
|
||||
num_gpus, dev = 1, device_count()
|
||||
if dev >= 2: num_gpus = dev if dev // 2 == 0 else math.ceil(dev / 2)
|
||||
if dev >= 2: num_gpus = min(dev // 2 * 2, dev)
|
||||
# TODO: Do some more processing with token_id once we support token streaming
|
||||
try:
|
||||
return vllm.LLMEngine.from_engine_args(
|
||||
|
||||
Reference in New Issue
Block a user