diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py
index 2f59d63f..02ee1001 100644
--- a/openllm-core/src/openllm_core/utils/__init__.py
+++ b/openllm-core/src/openllm_core/utils/__init__.py
@@ -138,11 +138,6 @@ def ensure_exec_coro(coro: t.Coroutine[t.Any, t.Any, t.Any]) -> t.Any:
   if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result()
   else: return loop.run_until_complete(coro)
 
-def available_devices() -> tuple[str, ...]:
-  """Return available GPU under system. Currently only supports NVIDIA GPUs."""
-  from openllm_core._strategies import NvidiaGpuResource
-  return tuple(NvidiaGpuResource.from_system())
-
 @functools.lru_cache(maxsize=128)
 def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1') -> str:
   """Generate a hash from given file's modification time.
@@ -156,10 +151,6 @@ def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1'
   """
   return getattr(hashlib, algorithm)(str(os.path.getmtime(resolve_filepath(f))).encode()).hexdigest()
 
-@functools.lru_cache(maxsize=1)
-def device_count() -> int:
-  return len(available_devices())
-
 def check_bool_env(env: str, default: bool = True) -> bool:
   v = os.environ.get(env, str(default)).upper()
   if v.isdigit(): return bool(int(v))  # special check for digits
diff --git a/openllm-core/src/openllm_core/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py
index fb6be61b..66c32c98 100644
--- a/openllm-core/src/openllm_core/utils/dantic.py
+++ b/openllm-core/src/openllm_core/utils/dantic.py
@@ -399,7 +399,7 @@ class CudaValueType(ParamType):
       param: The parameter that is requesting completion.
       incomplete: Value being completed. May be empty.
     """
-    from openllm_core.utils import available_devices
+    from openllm.utils import available_devices
     mapping = incomplete.split(self.envvar_list_splitter) if incomplete else available_devices()
     return [sc.CompletionItem(str(i), help=f'CUDA device index {i}') for i in mapping]
 
diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py
index c12e6807..be0aa405 100644
--- a/openllm-python/src/openllm/_runners.py
+++ b/openllm-python/src/openllm/_runners.py
@@ -15,7 +15,6 @@ from openllm_core._schemas import GenerationOutput
 from openllm_core._typing_compat import LiteralBackend
 from openllm_core._typing_compat import M
 from openllm_core._typing_compat import T
-from openllm_core.utils import device_count
 from openllm_core.utils import first_not_none
 from openllm_core.utils import get_debug_mode
 from openllm_core.utils import is_vllm_available
@@ -41,7 +40,7 @@ class vLLMRunnable(bentoml.Runnable):
 
   def __init__(self, llm: openllm.LLM[M, T]) -> None:
     self.config = llm.config
-    num_gpus, dev = 1, device_count()
+    num_gpus, dev = 1, openllm.utils.device_count()
     if dev >= 2: num_gpus = min(dev // 2 * 2, dev)
     quantization = None
     if llm._quantise and llm._quantise == 'awq': quantization = llm._quantise
@@ -49,6 +48,7 @@ class vLLMRunnable(bentoml.Runnable):
       self.model = vllm.AsyncLLMEngine.from_engine_args(
         vllm.AsyncEngineArgs(model=llm.bentomodel.path,
                              tokenizer=llm.bentomodel.path,
+                             trust_remote_code=llm.trust_remote_code,
                              tokenizer_mode='auto',
                              tensor_parallel_size=num_gpus,
                              dtype='auto',
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index 0bfe1e75..5b6ba480 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -202,7 +202,7 @@ def create_bento(bento_tag: bentoml.Tag,
   if isinstance(workers_per_resource, str):
     if workers_per_resource == 'round_robin': workers_per_resource = 1.0
     elif workers_per_resource == 'conserved':
-      workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count())
+      workers_per_resource = 1.0 if openllm.utils.device_count() == 0 else float(1 / openllm.utils.device_count())
     else:
       try:
         workers_per_resource = float(workers_per_resource)
diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py
index b144652e..e10e051a 100644
--- a/openllm-python/src/openllm/bundle/oci/__init__.py
+++ b/openllm-python/src/openllm/bundle/oci/__init__.py
@@ -137,8 +137,6 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
     if not _BUILDER.health(): raise openllm.exceptions.Error
   except (openllm.exceptions.Error, subprocess.CalledProcessError):
     raise RuntimeError('Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.') from None
-  if openllm_core.utils.device_count() == 0:
-    raise RuntimeError('Building base container requires GPUs (None available)')
   if not shutil.which('nvidia-container-runtime'):
     raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
   if not _module_location:
@@ -147,9 +145,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
   if not pyproject_path.exists():
     raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
   if not registries:
-    tags: dict[str | LiteralContainerRegistry, str] = {
-      alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()
-    }  # default to all registries with latest tag strategy
+    tags: dict[str | LiteralContainerRegistry, str] = {alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()}
   else:
     registries = [registries] if isinstance(registries, str) else list(registries)
     tags = {name: f'{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}' for name in registries}
diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py
index 59169725..89f28a18 100644
--- a/openllm-python/src/openllm/utils/__init__.py
+++ b/openllm-python/src/openllm/utils/__init__.py
@@ -6,6 +6,7 @@ we won't ensure backward compatibility for these functions. So use with caution.
 from __future__ import annotations
 
 import typing as t
+import functools
 import openllm_core
 
 if t.TYPE_CHECKING:
@@ -14,7 +15,16 @@ if t.TYPE_CHECKING:
 def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
   return {'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation}
 
-__all__ = ['generate_labels']
+def available_devices() -> tuple[str, ...]:
+  """Return available GPU under system. Currently only supports NVIDIA GPUs."""
+  from .._strategies import NvidiaGpuResource
+  return tuple(NvidiaGpuResource.from_system())
+
+@functools.lru_cache(maxsize=1)
+def device_count() -> int:
+  return len(available_devices())
+
+__all__ = ['generate_labels', 'available_devices', 'device_count']
 
 def __dir__() -> t.Sequence[str]:
   return sorted(__all__)
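
Usage note, not part of the patch: a minimal sketch of the relocated helpers at their new home, assuming openllm is installed and importable. It mirrors the tensor-parallel sizing in vLLMRunnable.__init__ above; on a host without NVIDIA GPUs, available_devices() returns an empty tuple and device_count() returns 0.

# Sketch only: exercises available_devices()/device_count() from openllm.utils.
import openllm

devices = openllm.utils.available_devices()  # e.g. ('0', '1') on a two-GPU host
dev = openllm.utils.device_count()           # cached via functools.lru_cache

# Same sizing rule as vLLMRunnable.__init__: with two or more GPUs,
# use the largest even number of devices for tensor parallelism.
num_gpus = 1
if dev >= 2:
  num_gpus = min(dev // 2 * 2, dev)
print(f'devices={devices}, tensor_parallel_size={num_gpus}')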