Mirror of https://github.com/bentoml/OpenLLM.git, synced 2026-01-26 00:07:51 -05:00
fix: device imports using strategies (#584)
* fix: device imports using strategies

  Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

* chore: support trust_remote_code for vLLM runners

  Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
@@ -15,7 +15,6 @@ from openllm_core._schemas import GenerationOutput
 from openllm_core._typing_compat import LiteralBackend
 from openllm_core._typing_compat import M
 from openllm_core._typing_compat import T
-from openllm_core.utils import device_count
 from openllm_core.utils import first_not_none
 from openllm_core.utils import get_debug_mode
 from openllm_core.utils import is_vllm_available
@@ -41,7 +40,7 @@ class vLLMRunnable(bentoml.Runnable):

   def __init__(self, llm: openllm.LLM[M, T]) -> None:
     self.config = llm.config
-    num_gpus, dev = 1, device_count()
+    num_gpus, dev = 1, openllm.utils.device_count()
     if dev >= 2: num_gpus = min(dev // 2 * 2, dev)
     quantization = None
     if llm._quantise and llm._quantise == 'awq': quantization = llm._quantise
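For context, the sizing logic in this hunk rounds the visible GPU count down to an even tensor-parallel world size. A minimal, self-contained sketch of that arithmetic (the helper name is ours, not the codebase's):

    def tensor_parallel_size(dev: int) -> int:
      # One GPU (or none) means no tensor parallelism; with two or more,
      # round down to an even count. Since dev // 2 * 2 <= dev always,
      # the min() mirrors the source but never changes the result.
      num_gpus = 1
      if dev >= 2:
        num_gpus = min(dev // 2 * 2, dev)
      return num_gpus

    assert [tensor_parallel_size(n) for n in range(5)] == [1, 1, 2, 2, 4]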
@@ -49,6 +48,7 @@ class vLLMRunnable(bentoml.Runnable):
     self.model = vllm.AsyncLLMEngine.from_engine_args(
       vllm.AsyncEngineArgs(model=llm.bentomodel.path,
                            tokenizer=llm.bentomodel.path,
+                           trust_remote_code=llm.trust_remote_code,
                            tokenizer_mode='auto',
                            tensor_parallel_size=num_gpus,
                            dtype='auto',
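The added trust_remote_code flag is forwarded into vLLM's engine arguments so checkpoints that ship custom modeling code can load. A sketch of the same construction, assuming vllm is installed and model_path is a hypothetical local checkpoint (OpenLLM passes llm.bentomodel.path):

    import vllm

    model_path = '/path/to/checkpoint'  # hypothetical location
    engine = vllm.AsyncLLMEngine.from_engine_args(
      vllm.AsyncEngineArgs(
        model=model_path,
        tokenizer=model_path,
        tokenizer_mode='auto',
        trust_remote_code=True,  # allow the checkpoint's custom modeling code to run
        tensor_parallel_size=1,
        dtype='auto',
      )
    )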
@@ -202,7 +202,7 @@ def create_bento(bento_tag: bentoml.Tag,
   if isinstance(workers_per_resource, str):
     if workers_per_resource == 'round_robin': workers_per_resource = 1.0
     elif workers_per_resource == 'conserved':
-      workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count())
+      workers_per_resource = 1.0 if openllm.utils.device_count() == 0 else float(1 / openllm.utils.device_count())
   else:
     try:
       workers_per_resource = float(workers_per_resource)
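Here 'round_robin' maps to one worker per resource, while 'conserved' spreads a single worker across every visible GPU (1/N per device). A small illustration of that normalisation, using a hypothetical stand-in rather than the create_bento code path:

    def normalise_workers(value: str | float, gpus: int) -> float:
      # 'round_robin' -> one worker per resource; 'conserved' -> one worker
      # shared across all visible GPUs; anything else parses as a float.
      if value == 'round_robin':
        return 1.0
      if value == 'conserved':
        return 1.0 if gpus == 0 else 1 / gpus
      return float(value)

    assert normalise_workers('conserved', 4) == 0.25
    assert normalise_workers('round_robin', 4) == 1.0
    assert normalise_workers('0.5', 8) == 0.5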
@@ -137,8 +137,6 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
|
||||
if not _BUILDER.health(): raise openllm.exceptions.Error
|
||||
except (openllm.exceptions.Error, subprocess.CalledProcessError):
|
||||
raise RuntimeError('Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.') from None
|
||||
if openllm_core.utils.device_count() == 0:
|
||||
raise RuntimeError('Building base container requires GPUs (None available)')
|
||||
if not shutil.which('nvidia-container-runtime'):
|
||||
raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
|
||||
if not _module_location:
|
||||
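Outside the CLI, the same preflight can be approximated by probing Buildx and the NVIDIA runtime directly. This is a sketch under the assumption that docker is on PATH; it is not the project's _BUILDER client:

    import shutil
    import subprocess

    def preflight() -> None:
      # BuildKit must respond (probed here via `docker buildx version`) and
      # the NVIDIA runtime must be installed for CUDA builds to succeed.
      try:
        subprocess.run(['docker', 'buildx', 'version'], check=True, capture_output=True)
      except (OSError, subprocess.CalledProcessError):
        raise RuntimeError('BuildKit (via Buildx) is required. See https://docs.docker.com/build/buildx/install/') from None
      if not shutil.which('nvidia-container-runtime'):
        raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernels in the container.')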
@@ -147,9 +145,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
   if not pyproject_path.exists():
     raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
   if not registries:
-    tags: dict[str | LiteralContainerRegistry, str] = {
-      alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()
-    } # default to all registries with latest tag strategy
+    tags: dict[str | LiteralContainerRegistry, str] = {alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()}
   else:
     registries = [registries] if isinstance(registries, str) else list(registries)
     tags = {name: f'{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}' for name in registries}
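Collapsing the dict literal does not change behaviour: every known registry is mapped to the same strategy-derived tag. With a made-up registry map and tag helper (illustrative values only, not the project's _CONTAINER_REGISTRY):

    _CONTAINER_REGISTRY = {'docker': 'docker.io/bentoml/openllm', 'ghcr': 'ghcr.io/bentoml/openllm'}

    def get_base_container_tag(strategy: str) -> str:
      return strategy  # stand-in for the real tag derivation

    tags = {alias: f'{value}:{get_base_container_tag("latest")}' for alias, value in _CONTAINER_REGISTRY.items()}
    assert tags == {'docker': 'docker.io/bentoml/openllm:latest', 'ghcr': 'ghcr.io/bentoml/openllm:latest'}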
@@ -6,6 +6,7 @@ we won't ensure backward compatibility for these functions. So use with caution.
 from __future__ import annotations
 import typing as t

+import functools
 import openllm_core

 if t.TYPE_CHECKING:
@@ -14,7 +15,16 @@ if t.TYPE_CHECKING:
 def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
   return {'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation}

-__all__ = ['generate_labels']
+def available_devices() -> tuple[str, ...]:
+  """Return available GPU under system. Currently only supports NVIDIA GPUs."""
+  from .._strategies import NvidiaGpuResource
+  return tuple(NvidiaGpuResource.from_system())
+
+@functools.lru_cache(maxsize=1)
+def device_count() -> int:
+  return len(available_devices())
+
+__all__ = ['generate_labels', 'available_devices', 'device_count']

 def __dir__() -> t.Sequence[str]:
   return sorted(__all__)
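Because device_count is wrapped in functools.lru_cache(maxsize=1), the GPU probe runs once per process and the first answer sticks until the cache is cleared. A runnable illustration with a stand-in probe (the real helper queries NvidiaGpuResource.from_system()):

    import functools
    import os

    def available_devices() -> tuple[str, ...]:
      # Stand-in probe for illustration only.
      ids = os.environ.get('CUDA_VISIBLE_DEVICES', '')
      return tuple(i for i in ids.split(',') if i)

    @functools.lru_cache(maxsize=1)
    def device_count() -> int:
      return len(available_devices())

    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    assert device_count() == 2
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2'
    assert device_count() == 2  # cached result survives the environment change
    device_count.cache_clear()
    assert device_count() == 3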