diff --git a/mypy.ini b/mypy.ini
index d8b92f8e..78a6b681 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -8,4 +8,4 @@ warn_unused_configs = true
 ignore_missing_imports = true
 check_untyped_defs = true
 warn_unreachable = true
-files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi, openllm-python/src/openllm/_service_vars.pyi, openllm-python/src/openllm/serialisation/_helpers.pyi, openllm-python/src/openllm/utils.pyi
+files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi, openllm-python/src/openllm/_service_vars.pyi, openllm-python/src/openllm/serialisation/_helpers.pyi, openllm-python/src/openllm/utils.pyi, openllm-python/src/openllm/_strategies.pyi
diff --git a/openllm-python/src/openllm/_service_vars.py b/openllm-python/src/openllm/_service_vars.py
index 6cd5df40..9ea0e1c8 100644
--- a/openllm-python/src/openllm/_service_vars.py
+++ b/openllm-python/src/openllm/_service_vars.py
@@ -1,11 +1,2 @@
-import os
-
-import orjson
-
-from openllm_core.utils import ENV_VARS_TRUE_VALUES
-
-model_id = os.environ['OPENLLM_MODEL_ID']
-model_tag = None
-adapter_map = orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None)))
-serialization = os.getenv('OPENLLM_SERIALIZATION', default='safetensors')
-trust_remote_code = str(os.getenv('TRUST_REMOTE_CODE', default=str(False))).upper() in ENV_VARS_TRUE_VALUES
+# fmt: off
+import os,orjson,openllm_core.utils as coreutils;model_id,model_tag,adapter_map,serialization,trust_remote_code=os.environ['OPENLLM_MODEL_ID'],None,orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP',orjson.dumps(None))),os.getenv('OPENLLM_SERIALIZATION',default='safetensors'),coreutils.check_bool_env('TRUST_REMOTE_CODE',False)
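The one-liner now defers the TRUST_REMOTE_CODE check to `coreutils.check_bool_env`. A sketch of what that helper is assumed to do, reconstructed from the `ENV_VARS_TRUE_VALUES` test this diff removes (the real implementation lives in `openllm_core.utils` and may differ in detail):

```python
import os

# Assumed truthy set, mirroring the ENV_VARS_TRUE_VALUES membership test removed above.
ENV_VARS_TRUE_VALUES = {'1', 'ON', 'YES', 'TRUE'}

def check_bool_env(env: str, default: bool) -> bool:
  # Unset variable -> fall back to the default; otherwise compare case-insensitively.
  return str(os.getenv(env, str(default))).upper() in ENV_VARS_TRUE_VALUES

os.environ['TRUST_REMOTE_CODE'] = 'yes'
assert check_bool_env('TRUST_REMOTE_CODE', False) is True
```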
diff --git a/openllm-python/src/openllm/_service_vars_pkg.py b/openllm-python/src/openllm/_service_vars_pkg.py
deleted file mode 100644
index 7cf5c203..00000000
--- a/openllm-python/src/openllm/_service_vars_pkg.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import orjson
-
-model_id = '{__model_id__}'  # openllm: model id
-model_tag = '{__model_tag__}'  # openllm: model tag
-adapter_map = orjson.loads("""{__model_adapter_map__}""")  # openllm: model adapter map
-serialization = '{__model_serialization__}'  # openllm: model serialization
-trust_remote_code = {__model_trust_remote_code__}  # openllm: model trust remote code
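For context on the deletion: each line of `_service_vars_pkg.py` carries a `# openllm: ...` marker that the (also-removed) `_ServiceVarsFormatter` machinery in `bundle/_package.py` used to substitute values and strip markers at build time. A condensed, runnable sketch of that old mechanism, collapsed into a single class and exercised with an illustrative model id:

```python
import string

# Condensed from the removed _ServiceVarsFormatter/ModelIdFormatter classes in
# bundle/_package.py: format a template line, then strip the trailing marker
# (two spaces + identifier + newline, i.e. len(identifier) + 3 characters).
class ModelIdFormatter(string.Formatter):
  keyword = '__model_id__'
  identifier = '# openllm: model id'

  def __init__(self, target):
    super().__init__()
    self.target = target

  def vformat(self, format_string, *args, **attrs):
    return super().vformat(format_string, (), {self.keyword: self.target})

  def parse_line(self, line, nl=True):
    if self.identifier not in line:
      return line
    return self.vformat(line)[: -(len(self.identifier) + 3)] + ('\n' if nl else '')

line = "model_id = '{__model_id__}'  # openllm: model id\n"
print(ModelIdFormatter('facebook/opt-125m').parse_line(line, nl=False))
# -> model_id = 'facebook/opt-125m'
```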
diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py
index 7a731a52..4f2d8950 100644
--- a/openllm-python/src/openllm/_strategies.py
+++ b/openllm-python/src/openllm/_strategies.py
@@ -1,20 +1,8 @@
-# mypy: disable-error-code="no-redef"
 from __future__ import annotations
-import inspect
-import logging
-import math
-import os
-import sys
-import types
-import typing as t
-import warnings
-
-import psutil
-
-import bentoml
+import inspect, logging, math, os, sys, types, warnings, typing as t
+import psutil, bentoml, openllm_core.utils as coreutils
 from bentoml._internal.resource import get_resource, system_resources
 from bentoml._internal.runner.strategy import THREAD_ENVS
-from openllm_core.utils import DEBUG, ReprMixin

 logger = logging.getLogger(__name__)

@@ -46,8 +34,7 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
   return rcs


-def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
-  """CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
+def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
   if respect_env:
     spec = os.environ.get('CUDA_VISIBLE_DEVICES', default_var)
     if not spec:
@@ -116,11 +103,11 @@ def _raw_device_uuid_nvml() -> list[str] | None:
 class _ResourceMixin:
   @staticmethod
   def from_system(cls) -> list[str]:
-    visible_devices = _parse_visible_devices()
+    visible_devices = _parse_cuda_visible_devices()
     if visible_devices is None:
       if cls.resource_id == 'amd.com/gpu':
         if not psutil.LINUX:
-          if DEBUG:
+          if coreutils.DEBUG:
             logger.debug('AMD GPUs is currently only supported on Linux.')
           return []
       # ROCm does not currently have the rocm_smi wheel.
@@ -167,7 +154,7 @@ class _ResourceMixin:
       return []
     if spec.isdigit():
       spec = ','.join([str(i) for i in range(_strtoul(spec))])
-    return _parse_visible_devices(spec, respect_env=False)
+    return _parse_cuda_visible_devices(spec, respect_env=False)
   elif isinstance(spec, list):
     return [str(x) for x in spec]
   else:
@@ -209,7 +196,7 @@ class _ResourceMixin:
 def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]:
   return types.new_class(
     name,
-    (bentoml.Resource[t.List[str]], ReprMixin),
+    (bentoml.Resource[t.List[str]], coreutils.ReprMixin),
     {'resource_id': resource_kind},
     lambda ns: ns.update(
       {
@@ -243,24 +230,9 @@ AmdGpuResource = _make_resource_class(
 )


-class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
-  """This is extends the default BentoML strategy where we check for NVIDIA GPU resource -> AMD GPU resource -> CPU resource.
-
-  It also respect CUDA_VISIBLE_DEVICES for both AMD and NVIDIA GPU.
-  See https://rocm.docs.amd.com/en/develop/understand/gpu_isolation.html#cuda-visible-devices
-  for ROCm's support for CUDA_VISIBLE_DEVICES.
-
-  TODO: Support CloudTPUResource
-  """
-
+class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
   @classmethod
-  def get_worker_count(
-    cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: float
-  ) -> int:
-    """Return the number of workers to be used for the given runnable class.
-
-    Note that for all available GPU, the number of workers will always be 1.
-    """
+  def get_worker_count(cls, runnable_class, resource_request, workers_per_resource):
     if resource_request is None:
       resource_request = system_resources()
     # use NVIDIA
@@ -291,21 +263,7 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
     )

   @classmethod
-  def get_worker_env(
-    cls,
-    runnable_class: type[bentoml.Runnable],
-    resource_request: dict[str, t.Any] | None,
-    workers_per_resource: int | float,
-    worker_index: int,
-  ) -> dict[str, t.Any]:
-    """Get worker env for this given worker_index.
-
-    Args:
-      runnable_class: The runnable class to be run.
-      resource_request: The resource request of the runnable.
-      workers_per_resource: # of workers per resource.
-      worker_index: The index of the worker, start from 0.
-    """
+  def get_worker_env(cls, runnable_class, resource_request, workers_per_resource, worker_index):
     cuda_env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
     disabled = cuda_env in ('', '-1')
     environ: dict[str, t.Any] = {}
@@ -350,7 +308,7 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
     return environ

   @staticmethod
-  def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str:
+  def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index):
     # Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
     if isinstance(workers_per_resource, float):
       # NOTE: We hit this branch when workers_per_resource is set to
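`transpile_workers_to_cuda_envvar` keeps its behaviour; only the annotations move into the stub below. As a worked example of the float branch under the assumed semantics (a `workers_per_resource` below 1 means one worker spans several GPUs), with an illustrative helper that is not the PR's code:

```python
# Illustrative re-implementation of the assumed float-branch arithmetic, where
# workers_per_resource=0.5 assigns each worker a contiguous block of two GPUs.
def assign_gpus(workers_per_resource: float, gpus: list[str], worker_index: int) -> str:
  gpus_per_worker = int(1 / workers_per_resource)
  assigned = gpus[worker_index * gpus_per_worker : (worker_index + 1) * gpus_per_worker]
  if len(assigned) < gpus_per_worker:
    raise IndexError(f'worker {worker_index} needs {gpus_per_worker} GPUs, none left')
  return ','.join(assigned)

assert assign_gpus(0.5, ['0', '1', '2', '3'], worker_index=1) == '2,3'
assert assign_gpus(0.25, ['0', '1', '2', '3'], worker_index=0) == '0,1,2,3'
```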
+ """ + @staticmethod + def transpile_workers_to_cuda_envvar( + workers_per_resource: Union[float, int], gpus: List[str], worker_index: int + ) -> str: + """Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.""" diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 81edce30..7a9518be 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -3,8 +3,6 @@ from __future__ import annotations import importlib.metadata import logging import os -import string -import typing as t from pathlib import Path import orjson @@ -18,9 +16,6 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg from . import oci -if t.TYPE_CHECKING: - from openllm_core._typing_compat import LiteralString - logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD' @@ -96,92 +91,25 @@ def construct_docker_options( ) -OPENLLM_MODEL_ID = '# openllm: model id' -OPENLLM_MODEL_TAG = '# openllm: model tag' -OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map' -OPENLLM_MODEL_SERIALIZATION = '# openllm: model serialization' -OPENLLM_MODEL_TRUST_REMOTE_CODE = '# openllm: model trust remote code' - - -class _ServiceVarsFormatter(string.Formatter): - keyword: LiteralString = '__model_name__' - identifier: LiteralString = '# openllm: model name' - - def __init__(self, target): - super().__init__() - self.target = target - - def vformat(self, format_string, *args, **attrs) -> str: - return super().vformat(format_string, (), {self.keyword: self.target}) - - def parse_line(self, line: str, nl: bool = True) -> str: - if self.identifier not in line: - return line - gen = self.vformat(line)[: -(len(self.identifier) + 3)] + ('\n' if nl else '') - return gen - - -class ModelIdFormatter(_ServiceVarsFormatter): - keyword = '__model_id__' - identifier = OPENLLM_MODEL_ID - - -class ModelTagFormatter(_ServiceVarsFormatter): - keyword = '__model_tag__' - identifier = OPENLLM_MODEL_TAG - - -class ModelAdapterMapFormatter(_ServiceVarsFormatter): - keyword = '__model_adapter_map__' - identifier = OPENLLM_MODEL_ADAPTER_MAP - - -class ModelSerializationFormatter(_ServiceVarsFormatter): - keyword = '__model_serialization__' - identifier = OPENLLM_MODEL_SERIALIZATION - - -class ModelTrustRemoteCodeFormatter(_ServiceVarsFormatter): - keyword = '__model_trust_remote_code__' - identifier = OPENLLM_MODEL_TRUST_REMOTE_CODE - - _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py' -_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py' + +_SERVICE_VARS = '''\ +import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__} +''' def write_service(llm, llm_fs, adapter_map): - model_id_formatter = ModelIdFormatter(llm.model_id) - model_tag_formatter = ModelTagFormatter(str(llm.tag)) - adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()) - serialization_formatter = ModelSerializationFormatter(llm.config['serialisation']) - trust_remote_code_formatter = ModelTrustRemoteCodeFormatter(str(llm.trust_remote_code)) - - logger.debug( - 'Generating service vars file for %s at %s (dir=%s)', llm.model_id, '_service_vars.py', llm_fs.getsyspath('/') + logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/')) + script = f"# GENERATED BY 
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index 81edce30..7a9518be 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -3,8 +3,6 @@ from __future__ import annotations
 import importlib.metadata
 import logging
 import os
-import string
-import typing as t
 from pathlib import Path

 import orjson
@@ -18,9 +16,6 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg

 from . import oci

-if t.TYPE_CHECKING:
-  from openllm_core._typing_compat import LiteralString
-
 logger = logging.getLogger(__name__)

 OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
@@ -96,92 +91,25 @@ def construct_docker_options(
 )


-OPENLLM_MODEL_ID = '# openllm: model id'
-OPENLLM_MODEL_TAG = '# openllm: model tag'
-OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
-OPENLLM_MODEL_SERIALIZATION = '# openllm: model serialization'
-OPENLLM_MODEL_TRUST_REMOTE_CODE = '# openllm: model trust remote code'
-
-
-class _ServiceVarsFormatter(string.Formatter):
-  keyword: LiteralString = '__model_name__'
-  identifier: LiteralString = '# openllm: model name'
-
-  def __init__(self, target):
-    super().__init__()
-    self.target = target
-
-  def vformat(self, format_string, *args, **attrs) -> str:
-    return super().vformat(format_string, (), {self.keyword: self.target})
-
-  def parse_line(self, line: str, nl: bool = True) -> str:
-    if self.identifier not in line:
-      return line
-    gen = self.vformat(line)[: -(len(self.identifier) + 3)] + ('\n' if nl else '')
-    return gen
-
-
-class ModelIdFormatter(_ServiceVarsFormatter):
-  keyword = '__model_id__'
-  identifier = OPENLLM_MODEL_ID
-
-
-class ModelTagFormatter(_ServiceVarsFormatter):
-  keyword = '__model_tag__'
-  identifier = OPENLLM_MODEL_TAG
-
-
-class ModelAdapterMapFormatter(_ServiceVarsFormatter):
-  keyword = '__model_adapter_map__'
-  identifier = OPENLLM_MODEL_ADAPTER_MAP
-
-
-class ModelSerializationFormatter(_ServiceVarsFormatter):
-  keyword = '__model_serialization__'
-  identifier = OPENLLM_MODEL_SERIALIZATION
-
-
-class ModelTrustRemoteCodeFormatter(_ServiceVarsFormatter):
-  keyword = '__model_trust_remote_code__'
-  identifier = OPENLLM_MODEL_TRUST_REMOTE_CODE
-
-
 _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
-_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py'
+
+_SERVICE_VARS = '''\
+import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}
+'''


 def write_service(llm, llm_fs, adapter_map):
-  model_id_formatter = ModelIdFormatter(llm.model_id)
-  model_tag_formatter = ModelTagFormatter(str(llm.tag))
-  adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode())
-  serialization_formatter = ModelSerializationFormatter(llm.config['serialisation'])
-  trust_remote_code_formatter = ModelTrustRemoteCodeFormatter(str(llm.trust_remote_code))
-
-  logger.debug(
-    'Generating service vars file for %s at %s (dir=%s)', llm.model_id, '_service_vars.py', llm_fs.getsyspath('/')
+  logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
+  script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n# fmt: off\n" + _SERVICE_VARS.format(
+    __model_id__=llm.model_id,
+    __model_tag__=str(llm.tag),
+    __model_adapter_map__=orjson.dumps(adapter_map).decode(),
+    __model_serialization__=llm.config['serialisation'],
+    __model_trust_remote_code__=str(llm.trust_remote_code),
   )
-  with open(_service_vars_file.__fspath__(), 'r') as f:
-    src_contents = f.readlines()
-  for i, it in enumerate(src_contents):
-    if model_id_formatter.identifier in it:
-      src_contents[i] = model_id_formatter.parse_line(it)
-    elif model_tag_formatter.identifier in it:
-      src_contents[i] = model_tag_formatter.parse_line(it)
-    elif adapter_map_formatter.identifier in it:
-      src_contents[i] = adapter_map_formatter.parse_line(it)
-    elif serialization_formatter.identifier in it:
-      src_contents[i] = serialization_formatter.parse_line(it)
-    elif trust_remote_code_formatter.identifier in it:
-      src_contents[i] = trust_remote_code_formatter.parse_line(it)
-
-  script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents)
   if SHOW_CODEGEN:
     logger.info('Generated _service_vars.py:\n%s', script)
   llm_fs.writetext('_service_vars.py', script)
-
-  logger.debug(
-    'Generating service file for %s at %s (dir=%s)', llm.model_id, llm.config['service_name'], llm_fs.getsyspath('/')
-  )
   with open(_service_file.__fspath__(), 'r') as f:
     service_src = f.read()
   llm_fs.writetext(llm.config['service_name'], service_src)
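With the formatter classes gone, `write_service` renders `_SERVICE_VARS` with plain `str.format`. For illustration, a hypothetical `openllm build facebook/opt-125m` with no adapter map (the model tag below is invented) would emit a `_service_vars.py` roughly like:

```python
# GENERATED BY 'openllm build facebook/opt-125m'. DO NOT EDIT

# fmt: off
import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='facebook/opt-125m','pt-facebook-opt-125m:0123abcd',orjson.loads("""null"""),'safetensors',False
```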