From b2dba6143fa614d4cd7bc00bc6272b071fb26a7d Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sat, 15 Jul 2023 07:19:35 -0400 Subject: [PATCH] fix(resource): correctly parse CUDA_VISIBLE_DEVICES (#114) --- .github/workflows/binary-releases.yml | 1 + .github/workflows/ci.yml | 4 +- .github/workflows/create-releases.yml | 2 + .github/workflows/release-notes.yml | 2 + README.md | 2 +- changelog.d/114.fix.md | 7 + hatch.toml | 3 +- nightly-requirements-gpu.txt | 2 +- nightly-requirements.txt | 2 +- pyproject.toml | 35 ++- src/openllm/_llm.py | 58 +++- src/openllm/_quantisation.py | 48 +++- src/openllm/_strategies.py | 390 +++++++++++++++++++------- src/openllm/_types.py | 3 +- src/openllm/cli.py | 33 ++- src/openllm/utils/__init__.py | 9 +- src/openllm/utils/codegen.py | 3 +- src/openllm/utils/import_utils.py | 26 +- tests/strategies_test.py | 118 +++++++- tools/dependencies.py | 285 +++++++++++++++++++ tools/update-optional-dependencies.py | 160 ----------- typings/cuda/__init__.pyi | 2 + typings/cuda/cuda.pyi | 26 ++ 23 files changed, 903 insertions(+), 318 deletions(-) create mode 100644 changelog.d/114.fix.md create mode 100755 tools/dependencies.py delete mode 100755 tools/update-optional-dependencies.py create mode 100644 typings/cuda/__init__.pyi create mode 100644 typings/cuda/cuda.pyi diff --git a/.github/workflows/binary-releases.yml b/.github/workflows/binary-releases.yml index 62dcd133..9785b2e2 100644 --- a/.github/workflows/binary-releases.yml +++ b/.github/workflows/binary-releases.yml @@ -13,6 +13,7 @@ env: APP_NAME: openllm PYTHON_VERSION: '3.11' PYOXIDIZER_VERSION: '0.24.0' + HATCH_VERBOSE: 10 jobs: python-artifacts: name: Build wheel and source distribution diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3017dfc1..366ff7ba 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,6 +24,7 @@ env: OPENLLM_DO_NOT_TRACK: True PYTHONUNBUFFERED: '1' STABLE_PYTHON_VERSION: '3.11' + HATCH_VERBOSE: 10 # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun defaults: run: @@ -31,7 +32,6 @@ defaults: jobs: quality: runs-on: ubuntu-latest - if: github.event_name == 'pull_request' name: quality-check steps: - uses: actions/checkout@v3 @@ -43,6 +43,8 @@ jobs: python-version: ${{ env.STABLE_PYTHON_VERSION }} - name: Run type check run: hatch run typing + - if: failure() + run: echo "Not failing quality workflow." tests: runs-on: ubuntu-latest if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }} diff --git a/.github/workflows/create-releases.yml b/.github/workflows/create-releases.yml index 93d37a90..447b01e3 100644 --- a/.github/workflows/create-releases.yml +++ b/.github/workflows/create-releases.yml @@ -28,6 +28,8 @@ on: defaults: run: shell: bash --noprofile --norc -exo pipefail {0} +env: + HATCH_VERBOSE: 10 jobs: release: if: github.repository_owner == 'bentoml' diff --git a/.github/workflows/release-notes.yml b/.github/workflows/release-notes.yml index 8b572454..f451c4a4 100644 --- a/.github/workflows/release-notes.yml +++ b/.github/workflows/release-notes.yml @@ -25,6 +25,8 @@ on: tags: required: true type: string +env: + HATCH_VERBOSE: 10 defaults: run: shell: bash --noprofile --norc -exo pipefail {0} diff --git a/README.md b/README.md index 991ea2c4..b65bf043 100644 --- a/README.md +++ b/README.md @@ -266,7 +266,7 @@ pip install "openllm[mpt]" ```bash -pip install openllm +pip install "openllm[opt]" ``` diff --git a/changelog.d/114.fix.md b/changelog.d/114.fix.md new file mode 100644 index 00000000..346670f3 --- /dev/null +++ b/changelog.d/114.fix.md @@ -0,0 +1,7 @@ +Fixes resources to correctly follows CUDA_VISIBLE_DEVICES spec + +OpenLLM now contains a standalone parser that mimic `torch.cuda` parser for set +GPU devices. This parser will be used to parse both AMD and NVIDIA GPUs. + +`openllm` should now be able to parse `GPU-` and `MIG-` UUID from both +configuration or spec. diff --git a/hatch.toml b/hatch.toml index 286109b7..9e9c06fa 100644 --- a/hatch.toml +++ b/hatch.toml @@ -26,8 +26,8 @@ features = ['flan-t5'] [envs.default.scripts] changelog = "towncrier build --version main --draft" quality = [ + "./tools/dependencies.py", "./tools/update-readme.py", - "./tools/update-optional-dependencies.py", "./tools/update-config-stubs.py", "./tools/update-models-import.py", "- ./tools/add-license-headers .", @@ -42,6 +42,7 @@ extra-dependencies = [ ] [envs.tests.scripts] _run_script = "pytest --cov --cov-report={env:COVERAGE_REPORT:term-missing} --cov-config=pyproject.toml" +distributed = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -n 3 -r aR {args:tests}" models = "_run_script -r aR {args:tests/models}" python = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -r aR {args:tests}" [envs.tests.overrides] diff --git a/nightly-requirements-gpu.txt b/nightly-requirements-gpu.txt index a7d80a06..ac6eac73 100644 --- a/nightly-requirements-gpu.txt +++ b/nightly-requirements-gpu.txt @@ -1,4 +1,4 @@ -# This file is generated by `./tools/update-optional-dependencies.py`. # DO NOT EDIT +# This file is generated by `tools/dependencies.py`. # DO NOT EDIT # For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup. -r nightly-requirements.txt -e .[all] diff --git a/nightly-requirements.txt b/nightly-requirements.txt index 9d6780f2..7ec37722 100644 --- a/nightly-requirements.txt +++ b/nightly-requirements.txt @@ -1,4 +1,4 @@ -# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT +# This file is generated by `tools/dependencies.py`. DO NOT EDIT -e .[playground,flan-t5] bentoml[grpc,io] @ git+https://github.com/bentoml/bentoml.git@main peft @ git+https://github.com/huggingface/peft.git@main diff --git a/pyproject.toml b/pyproject.toml index 7ab907b0..3e1ede0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,5 @@ +# NOTE: The following are managed by ./tools/dependencies.py +# project.classifiers, project.dependencies, project.optional-dependencies [build-system] build-backend = "hatchling.build" requires = ["hatchling"] @@ -29,18 +31,18 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -# NOTE: The below is managed by ./tools/update-optional-dependencies.py dependencies = [ - "bentoml[grpc,io]>=1.0.22", - "transformers[torch,tokenizers,accelerate]>=4.29.0", - "optimum", - "attrs>=23.1.0", - "cattrs>=23.1.0", - "orjson", - "inflection", - "tabulate[widechars]>=0.9.0", - "httpx", - "typing_extensions", + 'bentoml[grpc,io]>=1.0.22', + 'transformers[torch,tokenizers,accelerate]>=4.29.0', + 'optimum', + 'attrs>=23.1.0', + 'cattrs>=23.1.0', + 'orjson', + 'inflection', + 'tabulate[widechars]>=0.9.0', + 'httpx', + 'typing_extensions', + 'cuda-python;platform_system!="Darwin"', ] description = 'OpenLLM: Operating LLMs in production' dynamic = ["version"] @@ -62,9 +64,6 @@ license = "Apache-2.0" name = "openllm" readme = "README.md" requires-python = ">=3.8" - -# NOTE: Don't modify project.optional-dependencies -# as it is managed by ./tools/update-optional-dependencies.py [project.optional-dependencies] agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"] all = [ @@ -72,22 +71,28 @@ all = [ "openllm[falcon]", "openllm[mpt]", "openllm[starcoder]", + "openllm[opt]", "openllm[flan-t5]", "openllm[fine-tune]", + "openllm[vllm]", "openllm[agents]", - "openllm[playground]", "openllm[ggml]", + "openllm[playground]", "openllm[openai]", + "openllm[gptq]", ] chatglm = ["cpm-kernels", "sentencepiece"] falcon = ["einops", "xformers", "safetensors"] fine-tune = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"] flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"] ggml = ["ctransformers"] +gptq = ["auto-gptq", "triton"] mpt = ["triton", "einops"] openai = ["openai", "tiktoken"] +opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"] playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"] starcoder = ["bitsandbytes"] +vllm = ["vllm"] [project.urls] Documentation = "https://github.com/bentoml/openllm#readme" diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index feca2f80..6574d1bf 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -71,6 +71,7 @@ else: from typing_extensions import overload if t.TYPE_CHECKING: + import auto_gptq as autogptq import peft import torch @@ -96,6 +97,8 @@ else: UserDictAny = collections.UserDict LLMRunnable = bentoml.Runnable LLMRunner = bentoml.Runner + + autogptq = LazyLoader("autogptq", globals(), "auto_gptq") transformers = LazyLoader("transformers", globals(), "transformers") torch = LazyLoader("torch", globals(), "torch") peft = LazyLoader("peft", globals(), "peft") @@ -445,7 +448,7 @@ class LLM(LLMInterface[M, T], ReprMixin): """The config instance to use for this LLM. This will be created based on config_class and available when initialising the LLM.""" - quantization_config: transformers.BitsAndBytesConfig | None + quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None """Quantisation config for quantised model on the fly.""" _model_id: str @@ -548,6 +551,44 @@ class LLM(LLMInterface[M, T], ReprMixin): openllm.serialisation.save_pretrained(self, save_directory, **attrs) + @classmethod + @overload + def from_pretrained( + cls, + model_id: str | None = ..., + model_version: str | None = ..., + llm_config: openllm.LLMConfig | None = ..., + *args: t.Any, + runtime: t.Literal["ggml", "transformers"] | None = ..., + quantize: t.Literal["int8", "int4"] = ..., + bettertransformer: str | bool | None = ..., + adapter_id: str | None = ..., + adapter_name: str | None = ..., + adapter_map: dict[str, str | None] | None = ..., + quantization_config: transformers.BitsAndBytesConfig | None = ..., + **attrs: t.Any, + ) -> LLM[M, T]: + ... + + @classmethod + @overload + def from_pretrained( + cls, + model_id: str | None = ..., + model_version: str | None = ..., + llm_config: openllm.LLMConfig | None = ..., + *args: t.Any, + runtime: t.Literal["ggml", "transformers"] | None = ..., + quantize: t.Literal["gptq"] = ..., + bettertransformer: str | bool | None = ..., + adapter_id: str | None = ..., + adapter_name: str | None = ..., + adapter_map: dict[str, str | None] | None = ..., + quantization_config: autogptq.BaseQuantizeConfig | None = ..., + **attrs: t.Any, + ) -> LLM[M, T]: + ... + @classmethod def from_pretrained( cls, @@ -561,7 +602,7 @@ class LLM(LLMInterface[M, T], ReprMixin): adapter_id: str | None = None, adapter_name: str | None = None, adapter_map: dict[str, str | None] | None = None, - quantization_config: transformers.BitsAndBytesConfig | None = None, + quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, **attrs: t.Any, ) -> LLM[M, T]: """Instantiate a pretrained LLM. @@ -577,6 +618,17 @@ class LLM(LLMInterface[M, T], ReprMixin): > Currently, the above two options are mutually exclusive. + #### Quantisation options + + For customising options for quantisation config, ``openllm.LLM`` accepts all arbitrary arguments that is passed to ``transformers.BitsAndBytesConfig`` + plus ``quantize`` value. For example, for ``int8`` quantisation, specify the following: + ```python + model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False) + ``` + + For all GPTQ-related options, it accepts all value prefixed with `gptq_*`. The parsed value then could be parsed + to ``auto_gptq.BaseQuantizeConfig``. + ### Adapter options: > This is used in conjunction with the fine-tuning features @@ -689,7 +741,7 @@ class LLM(LLMInterface[M, T], ReprMixin): model_id: str, llm_config: openllm.LLMConfig, bettertransformer: bool | None, - quantization_config: transformers.BitsAndBytesConfig | None, + quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, _quantize_method: t.Literal["int8", "int4", "gptq"] | None, diff --git a/src/openllm/_quantisation.py b/src/openllm/_quantisation.py index 31c17de5..26c9ab4f 100644 --- a/src/openllm/_quantisation.py +++ b/src/openllm/_quantisation.py @@ -13,15 +13,26 @@ # limitations under the License. from __future__ import annotations import logging +import sys import typing as t from .utils import LazyLoader +from .utils import is_autogptq_available from .utils import is_bitsandbytes_available from .utils import is_transformers_supports_kbit from .utils import pkg +# NOTE: We need to do this so that overload can register +# correct overloads to typing registry +if sys.version_info[:2] >= (3, 11): + from typing import overload +else: + from typing_extensions import overload + + if t.TYPE_CHECKING: + import auto_gptq as autogptq import torch import openllm @@ -29,6 +40,7 @@ if t.TYPE_CHECKING: from ._types import DictStrAny else: + autogptq = LazyLoader("autogptq", globals(), "auto_gptq") torch = LazyLoader("torch", globals(), "torch") transformers = LazyLoader("transformers", globals(), "transformers") @@ -37,15 +49,38 @@ logger = logging.getLogger(__name__) QuantiseMode = t.Literal["int8", "int4", "gptq"] +@overload +def infer_quantisation_config( + cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["int8", "int4"], **attrs: t.Any +) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: + ... + + +@overload +def infer_quantisation_config( + cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["gptq"], **attrs: t.Any +) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: + ... + + def infer_quantisation_config( cls: type[openllm.LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any -) -> tuple[transformers.BitsAndBytesConfig | t.Any, DictStrAny]: +) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]: # 8 bit configuration int8_threshold = attrs.pop("llm_int8_threshhold", 6.0) int8_enable_fp32_cpu_offload = attrs.pop("llm_int8_enable_fp32_cpu_offload", False) int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None) int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False) + autogptq_attrs: DictStrAny = { + "bits": attrs.pop("gptq_bits", 4), + "group_size": attrs.pop("gptq_group_size", -1), + "damp_percent": attrs.pop("gptq_damp_percent", 0.01), + "desc_act": attrs.pop("gptq_desc_act", True), + "sym": attrs.pop("gptq_sym", True), + "true_sequential": attrs.pop("gptq_true_sequential", True), + } + def create_int8_config(int8_skip_modules: list[str] | None): if int8_skip_modules is None: int8_skip_modules = [] @@ -94,8 +129,15 @@ def infer_quantisation_config( logger.warning("OpenLLM will fallback to 8-bit quantization.") quantisation_config = create_int8_config(int8_skip_modules) elif quantise == "gptq": - # TODO: support GPTQ loading quantization - raise NotImplementedError("GPTQ is not supported yet.") + if not is_autogptq_available(): + logger.warning( + "'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment)." + " Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback " + "to int8 with bitsandbytes." + ) + quantisation_config = create_int8_config(int8_skip_modules) + else: + quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs) else: raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.") diff --git a/src/openllm/_strategies.py b/src/openllm/_strategies.py index 9ec9e270..c3c2198c 100644 --- a/src/openllm/_strategies.py +++ b/src/openllm/_strategies.py @@ -13,11 +13,15 @@ # limitations under the License. from __future__ import annotations +import functools +import inspect import logging import math import os import sys +import types import typing as t +import warnings import psutil @@ -27,62 +31,113 @@ from bentoml._internal.resource import system_resources from bentoml._internal.runner.strategy import THREAD_ENVS from bentoml._internal.runner.strategy import Strategy -from .exceptions import OpenLLMException +from .utils import LazyLoader +from .utils import LazyType from .utils import ReprMixin if t.TYPE_CHECKING: + import torch + import bentoml ListIntStr = list[int | str] + + class DynResource(Resource[t.List[str]], resource_id=""): + resource_id: t.ClassVar[str] + else: + DynResource = Resource[t.List[str]] + torch = LazyLoader("torch", globals(), "torch") ListIntStr = list +# NOTE: We need to do this so that overload can register +# correct overloads to typing registry +if sys.version_info[:2] >= (3, 11): + from typing import overload +else: + from typing_extensions import overload + logger = logging.getLogger(__name__) -class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"): - @classmethod - def from_spec(cls, spec: t.Any) -> list[str]: - if not isinstance(spec, (int, str, list)): - raise TypeError("AMD GPU device IDs must be int, str or a list specifing the exact GPUs to use.") +def _strtoul(s: str) -> int: + """Return -1 or positive integer sequence string starts with,.""" + if not s: + return -1 + for idx, c in enumerate(s): + if not (c.isdigit() or (idx == 0 and c in "+-")): + break + if idx + 1 == len(s): + idx += 1 # noqa: PLW2901 + return int(s[:idx]) if idx > 0 else -1 # type: ignore (idx will be set via enumerate) - try: - if isinstance(spec, int): - if spec == -1: - return [] - if spec < -1: - raise ValueError - return [str(i) for i in range(spec)] - elif isinstance(spec, str): - try: - return cls.from_spec(int(spec)) - except ValueError: - if spec.startswith("GPU"): - return [spec] - raise ValueError - else: - return [str(x) for x in spec] - except ValueError: - raise OpenLLMException(f"Invalid AMD GPU resource limit '{spec}'.") - @classmethod - def from_system(cls) -> list[str]: - """Retrieve AMD GPU from system, currently only supports on Linux. - - This assumes that ROCm is setup correctly. - """ - cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_visible_devices in ("", "-1"): +def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]: + rcs: list[str] = [] + for elem in lst.split(","): + # Repeated id results in empty set + if elem in rcs: return [] - if cuda_visible_devices is not None: - cuda_visible_devices = cuda_visible_devices.split(",") - if "-1" in cuda_visible_devices: - cuda_visible_devices = cuda_visible_devices[: cuda_visible_devices.index("-1")] - return cuda_visible_devices + # Anything other but prefix is ignored + if not elem.startswith(prefix): + break + rcs.append(elem) + return rcs + +_STACK_LEVEL = 3 + + +@overload +def _parse_visible_devices(default_var: str | None = ..., respect_env: t.Literal[True] = True) -> list[str] | None: + ... + + +@overload +def _parse_visible_devices(default_var: str = ..., respect_env: t.Literal[False] = False) -> list[str]: + ... + + +def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None: + """CUDA_VISIBLE_DEVICES aware with default var for parsing spec.""" + if respect_env: + spec = os.getenv("CUDA_VISIBLE_DEVICES", default_var) + if not spec: + return + else: + assert default_var is not None, "spec is required to be not None when parsing spec." # noqa: S101 + spec = default_var + + if spec.startswith("GPU-"): + return _parse_list_with_prefix(spec, "GPU-") + if spec.startswith("MIG-"): + return _parse_list_with_prefix(spec, "MIG-") + + # XXX: We to somehow handle cases such as '100m' + # CUDA_VISIBLE_DEVICES uses something like strtoul + # which makes `1gpu2,2ampere` is equivalent to `1,2` + rc: list[int] = [] + for el in spec.split(","): + x = _strtoul(el.strip()) + # Repeated ordinal results in empty set + if x in rc: + return [] + # Negative value aborts the sequence + if x < 0: + break + rc.append(x) + return [str(i) for i in rc] + + +def _from_system(cls: type[DynResource]) -> list[str]: + """Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation. + + It relies on torch.cuda implementation and in turns respect CUDA_VISIBLE_DEVICES. + """ + if cls.resource_id == "amd.com/gpu": if not psutil.LINUX: - logger.debug("AMD GPU resource is only supported on Linux.") + warnings.warn("AMD GPUs is currently only supported on Linux.", stacklevel=_STACK_LEVEL) return [] # ROCm does not currently have the rocm_smi wheel. @@ -90,37 +145,169 @@ class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"): # we don't want to use CLI because parsing is a pain. sys.path.append("/opt/rocm/libexec/rocm_smi") try: - from ctypes import byref - from ctypes import c_uint32 - # refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py - from rsmiBindings import rocmsmi - from rsmiBindings import rsmi_status_t - - num = c_uint32(0) - ret = rocmsmi.rsmi_num_monitor_devices(byref(num)) - if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: - return [str(i) for i in range(num.value)] - return [] - except Exception as err: - logger.debug("Failed to setup AMD GPU resource: %s", err) + from rsmiBindings import rocmsmi as rocmsmi + except (ModuleNotFoundError, ImportError): + # In this case the binary is not found, returning empty list return [] finally: sys.path.remove("/opt/rocm/libexec/rocm_smi") + visible_devices = _parse_visible_devices() + if visible_devices is None: + return [str(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else [] + return visible_devices - @classmethod - def validate(cls, val: list[str]): - for gpu_index_or_literal in val: - try: - idx = int(gpu_index_or_literal) - except ValueError: - raise OpenLLMException(f"Invalid AMD GPU device index: {val}") - if int(idx) < 0: - raise OpenLLMException(f"Negative GPU device in {val}.") - if int(idx) >= len(cls.from_system()): - raise OpenLLMException( - f"GPU device index in {val} is greater than the system available: {cls.from_system()}" - ) + +@overload +def _from_spec(cls: type[DynResource], spec: int) -> list[str]: + ... + + +@overload +def _from_spec(cls: type[DynResource], spec: ListIntStr) -> list[str]: + ... + + +@overload +def _from_spec(cls: type[DynResource], spec: str) -> list[str]: + ... + + +def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]: + """Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation. + + The parser behaves similar to how PyTorch handles CUDA_VISIBLE_DEVICES. This means within + BentoML's resource configuration, its behaviour is similar to CUDA_VISIBLE_DEVICES. + """ + if isinstance(spec, int): + if spec in (-1, 0): + return [] + if spec < -1: + raise ValueError("Spec cannot be < -1.") + return [str(i) for i in range(spec)] + elif isinstance(spec, str): + if not spec: + return [] + if spec.isdigit(): + spec = ",".join([str(i) for i in range(_strtoul(spec))]) + return _parse_visible_devices(spec, respect_env=False) + elif LazyType(ListIntStr).isinstance(spec): + return [str(x) for x in spec] + else: + raise TypeError( + f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead." + ) + + +@functools.lru_cache +def _raw_uuid_nvml() -> list[str] | None: + """Return list of device UUID as reported by NVML or None if NVML discovery/initialization failed.""" + try: + from cuda import cuda + except ImportError: + if sys.platform == "darwin": + raise RuntimeError("GPU is not available on Darwin system.") from None + raise RuntimeError( + "Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly." + ) from None + + from ctypes import CDLL + from ctypes import byref + from ctypes import c_void_p + from ctypes import create_string_buffer + + nvml_h = CDLL("libnvidia-ml.so.1") + rc = nvml_h.nvmlInit() + if rc != 0: + warnings.warn("Can't initialize NVML", stacklevel=_STACK_LEVEL) + return + err, dev_count = cuda.cuDeviceGetCount() + if err != cuda.CUresult.CUDA_SUCCESS: + warnings.warn("Failed to get available device from system.", stacklevel=_STACK_LEVEL) + return + uuids: list[str] = [] + for idx in range(dev_count): + dev_id = c_void_p() + rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) + if rc != 0: + warnings.warn(f"Failed to get device handle for {idx}", stacklevel=_STACK_LEVEL) + return + buf_len = 96 + buf = create_string_buffer(buf_len) + rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) + if rc != 0: + warnings.warn(f"Failed to get device UUID for {idx}", stacklevel=_STACK_LEVEL) + return + uuids.append(buf.raw.decode("ascii").strip("\0")) + del nvml_h + return uuids + + +def _validate(cls: type[DynResource], val: list[t.Any]): + if cls.resource_id == "amd.com/gpu": + raise RuntimeError( + "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'" + ) + if not all(isinstance(i, str) for i in val): + raise ValueError("Input list should be all string type.") + + try: + from cuda import cuda + except ImportError: + if sys.platform == "darwin": + raise RuntimeError("GPU is not available on Darwin system.") from None + raise RuntimeError( + "Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly." + ) from None + # correctly parse handle + for el in val: + if el.startswith("GPU-") or el.startswith("MIG-"): + uuids = _raw_uuid_nvml() + if uuids is None: + raise ValueError("Failed to parse available GPUs UUID") + if el not in uuids: + raise ValueError(f"Given UUID {el} is not found with available UUID (available: {uuids})") + elif el.isdigit(): + err, _ = cuda.cuDeviceGet(int(el)) + if err != cuda.CUresult.CUDA_SUCCESS: + raise ValueError(f"Failed to get device {el}") + + +def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: + return types.new_class( + name, + (DynResource, ReprMixin), + {"resource_id": resource_kind}, + lambda ns: ns.update( + { + "resource_id": resource_kind, + "from_spec": classmethod(_from_spec), + "from_system": classmethod(_from_system), + "validate": classmethod(_validate), + "__repr_keys__": property(lambda _: {"resource_id"}), + "__doc__": inspect.cleandoc(docstring), + "__module__": "openllm._strategies", + } + ), + ) + + +NvidiaGpuResource = _make_resource_class( + "NvidiaGpuResource", + "nvidia.com/gpu", + """NVIDIA GPU resource. + + This is a modified version of internal's BentoML's NvidiaGpuResource + where it respects and parse CUDA_VISIBLE_DEVICES correctly.""", +) +AmdGpuResource = _make_resource_class( + "AmdGpuResource", + "amd.com/gpu", + """AMD GPU resource. + + Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to + ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""", +) class CascadingResourceStrategy(Strategy, ReprMixin): @@ -147,15 +334,21 @@ class CascadingResourceStrategy(Strategy, ReprMixin): if resource_request is None: resource_request = system_resources() - # use nvidia gpu - nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu") - if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES: - return math.ceil(len(nvidia_gpus) * workers_per_resource) + def _get_gpu_count(typ: list[str] | None, kind: str): + if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: + return math.ceil(len(typ) * workers_per_resource) - # use amd gpu - amd_gpus = get_resource(resource_request, "amd.com/gpu") - if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES: - return math.ceil(len(amd_gpus) * workers_per_resource) + # use NVIDIA + kind = "nvidia.com/gpu" + count = _get_gpu_count(get_resource(resource_request, kind), kind) + if count: + return count + + # use AMD + kind = "amd.com/gpu" + count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind) + if count: + return count # use CPU cpus = get_resource(resource_request, "cpu") @@ -203,36 +396,32 @@ class CascadingResourceStrategy(Strategy, ReprMixin): if resource_request is None: resource_request = system_resources() - # use nvidia gpu - nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu") - if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES: - dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, nvidia_gpus, worker_index) + # use NVIDIA + kind = "nvidia.com/gpu" + typ = get_resource(resource_request, kind) + if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: if disabled: logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index) environ["CUDA_VISIBLE_DEVICES"] = cuda_env return environ - environ["CUDA_VISIBLE_DEVICES"] = dev - logger.info( - "Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s", - worker_index, - dev, + environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar( + workers_per_resource, typ, worker_index ) + logger.debug("Environ for worker %s: %s", worker_index, environ) return environ - # use amd gpu - amd_gpus = get_resource(resource_request, "amd.com/gpu") - if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES: - dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, amd_gpus, worker_index) + # use AMD + kind = "amd.com/gpu" + typ = get_resource(resource_request, kind, validate=False) + if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: if disabled: logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index) environ["CUDA_VISIBLE_DEVICES"] = cuda_env return environ - environ["CUDA_VISIBLE_DEVICES"] = dev - logger.info( - "Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s", - worker_index, - dev, + environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar( + workers_per_resource, typ, worker_index ) + logger.debug("Environ for worker %s: %s", worker_index, environ) return environ # use CPU @@ -243,23 +432,16 @@ class CascadingResourceStrategy(Strategy, ReprMixin): thread_count = math.ceil(cpus) for thread_env in THREAD_ENVS: environ[thread_env] = os.getenv(thread_env, str(thread_count)) - logger.info( - "Environ for worker %d: set CPU thread count to %d", - worker_index, - thread_count, - ) - return environ - else: - for thread_env in THREAD_ENVS: - environ[thread_env] = os.getenv(thread_env, "1") + logger.debug("Environ for worker %s: %s", worker_index, environ) return environ + for thread_env in THREAD_ENVS: + environ[thread_env] = os.getenv(thread_env, "1") + return environ return environ @staticmethod - def transpile_workers_to_cuda_visible_devices( - workers_per_resource: float | int, gpus: list[str], worker_index: int - ) -> str: + def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str: # Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string. if isinstance(workers_per_resource, float): # NOTE: We hit this branch when workers_per_resource is set to @@ -287,9 +469,9 @@ class CascadingResourceStrategy(Strategy, ReprMixin): dev = ",".join(assigned_gpu) else: idx = worker_index // workers_per_resource - if len(gpus) == idx: + if idx >= len(gpus): raise ValueError( f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}" ) - dev = gpus[idx] + dev = str(gpus[idx]) return dev diff --git a/src/openllm/_types.py b/src/openllm/_types.py index a67d1876..0947c69d 100644 --- a/src/openllm/_types.py +++ b/src/openllm/_types.py @@ -30,6 +30,7 @@ from ._configuration import AdapterType if t.TYPE_CHECKING: + import auto_gptq as autogptq import click import peft @@ -155,7 +156,7 @@ class LLMRunner(bentoml.Runner): class LLMInitAttrs(t.TypedDict): config: openllm.LLMConfig - quantization_config: transformers.BitsAndBytesConfig | None + quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None model_id: str runtime: t.Literal["ggml", "transformers"] model_decls: TupleAny diff --git a/src/openllm/cli.py b/src/openllm/cli.py index 712fca69..3d4b5925 100644 --- a/src/openllm/cli.py +++ b/src/openllm/cli.py @@ -773,7 +773,6 @@ def noop_command( def prerequisite_check( ctx: click.Context, llm_config: openllm.LLMConfig, - env: EnvVarMixin, gpu_available: tuple[str, ...], quantize: t.LiteralString | None, adapter_map: dict[str, str | None] | None, @@ -785,9 +784,6 @@ def prerequisite_check( if len(gpu_available) < 1: _echo(f"Quantization requires at least 1 GPU (got {len(gpu_available)})", fg="red") ctx.exit(1) - if env.framework_value != "pt": - _echo("Quantization is currently only available for PyTorch models.", fg="red") - ctx.exit(1) if adapter_map and not is_peft_available(): _echo( @@ -905,7 +901,7 @@ def start_bento( config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime ) - prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers) + prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers) # NOTE: This is to set current configuration start_env = os.environ.copy() @@ -1037,7 +1033,7 @@ def start_model( config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime ) - prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers) + prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers) # NOTE: This is to set current configuration start_env = os.environ.copy() @@ -1151,7 +1147,7 @@ def start_model( @output_option @quantize_option(click) @click.option("--machine", is_flag=True, default=False, hidden=True) -@click.option("--implementation", type=click.Choice(["pt", "tf", "flax"]), default=None, hidden=True) +@click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, hidden=True) def download_models_command( model: str, model_id: str | None, @@ -1193,7 +1189,7 @@ def download_models_command( > only use this option if you want the weight to be quantized by default. Note that OpenLLM also > support on-demand quantisation during initial startup. """ - impl: t.Literal["pt", "tf", "flax"] = first_not_none(implementation, default=EnvVarMixin(model).framework_value) + impl: LiteralRuntime = first_not_none(implementation, default=EnvVarMixin(model).framework_value) llm = openllm.infer_auto_class(impl).for_model( model, model_id=model_id, @@ -1263,7 +1259,7 @@ def _start( runtime: t.Literal["ggml", "transformers"] = ..., fast: bool = ..., adapter_map: dict[t.LiteralString, str | None] | None = ..., - framework: t.Literal["flax", "tf", "pt"] | None = ..., + framework: LiteralRuntime | None = ..., additional_args: ListStr | None = ..., _serve_grpc: bool = ..., __test__: t.Literal[False] = ..., @@ -1284,7 +1280,7 @@ def _start( runtime: t.Literal["ggml", "transformers"] = ..., fast: bool = ..., adapter_map: dict[t.LiteralString, str | None] | None = ..., - framework: t.Literal["flax", "tf", "pt"] | None = ..., + framework: LiteralRuntime | None = ..., additional_args: ListStr | None = ..., _serve_grpc: bool = ..., __test__: t.Literal[True] = ..., @@ -1304,7 +1300,7 @@ def _start( runtime: t.Literal["ggml", "transformers"] = "transformers", fast: bool = False, adapter_map: dict[t.LiteralString, str | None] | None = None, - framework: t.Literal["flax", "tf", "pt"] | None = None, + framework: LiteralRuntime | None = None, additional_args: ListStr | None = None, _serve_grpc: bool = False, __test__: bool = False, @@ -1615,6 +1611,13 @@ start, start_grpc, build, import_model, list_models = ( help="The output format for 'openllm build'. By default this will build a BentoLLM. 'container' is the shortcut of 'openllm build && bentoml containerize'.", hidden=not get_debug_mode(), ) +@click.option( + "--push", + default=False, + is_flag=True, + type=click.BOOL, + help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.", +) @click.pass_context def build_command( ctx: click.Context, @@ -1632,6 +1635,7 @@ def build_command( model_version: str | None, dockerfile_template: t.TextIO | None, format: t.Literal["bento", "container"], + push: bool, **attrs: t.Any, ): """Package a given models into a Bento. @@ -1788,7 +1792,12 @@ def build_command( else: _echo(bento.tag) - if format == "container": + if format == "container" and push: + ctx.fail("'--format=container' and '--push' are mutually exclusive.") + if push: + client = BentoMLContainer.bentocloud_client.get() + client.push_bento(bento) + elif format == "container": backend = os.getenv("BENTOML_CONTAINERIZE_BACKEND", "docker") _echo(f"Building {bento} into a LLMContainer using backend '{backend}'", fg="magenta") if not bentoml.container.health(backend): diff --git a/src/openllm/utils/__init__.py b/src/openllm/utils/__init__.py index 2a1f3a45..69ba80ca 100644 --- a/src/openllm/utils/__init__.py +++ b/src/openllm/utils/__init__.py @@ -99,11 +99,8 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An def gpu_count() -> tuple[str, ...]: - from bentoml._internal.resource import NvidiaGpuResource - - cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None) - if cuda_visible_devices is not None: - return tuple(i for i in cuda_visible_devices.split(",")) + """Return available GPU under system. Currently only supports NVIDIA GPUs.""" + from .._strategies import NvidiaGpuResource return tuple(NvidiaGpuResource.from_system()) @@ -417,6 +414,7 @@ _import_structure = { "is_jupytext_available", "is_notebook_available", "is_triton_available", + "is_autogptq_available", "require_backends", ], } @@ -443,6 +441,7 @@ if t.TYPE_CHECKING: from .import_utils import OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES from .import_utils import DummyMetaclass as DummyMetaclass from .import_utils import EnvVarMixin as EnvVarMixin + from .import_utils import is_autogptq_available as is_autogptq_available from .import_utils import is_bitsandbytes_available as is_bitsandbytes_available from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available from .import_utils import is_datasets_available as is_datasets_available diff --git a/src/openllm/utils/codegen.py b/src/openllm/utils/codegen.py index 56d0ee75..ffee6a0d 100644 --- a/src/openllm/utils/codegen.py +++ b/src/openllm/utils/codegen.py @@ -252,7 +252,7 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]): def generate_unique_filename(cls: type[t.Any], func_name: str): - return f"<{cls.__name__} generated {func_name} {cls.__module__}." f"{getattr(cls, '__qualname__', cls.__name__)}>" + return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>" def generate_function( @@ -332,6 +332,7 @@ def make_env_transformer( def gen_sdk(func: t.Callable[P, t.Any], name: str | None = None, **attrs: t.Any): + """Enhance function with nicer Repr.""" from .representation import ReprMixin if name is None: diff --git a/src/openllm/utils/import_utils.py b/src/openllm/utils/import_utils.py index b17b6409..cfbc4dfd 100644 --- a/src/openllm/utils/import_utils.py +++ b/src/openllm/utils/import_utils.py @@ -56,16 +56,17 @@ else: logger = logging.getLogger(__name__) OPTIONAL_DEPENDENCIES = { + "chatglm", + "falcon", + "mpt", + "starcoder", "fine-tune", "flan-t5", - "mpt", - "falcon", - "starcoder", - "chatglm", - "openai", - "agents", - "playground", "ggml", + "agents", + "openai", + "playground", + "gptq", } ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"}) @@ -100,6 +101,7 @@ _triton_available = _is_package_available("triton") _jupyter_available = _is_package_available("jupyter") _jupytext_available = _is_package_available("jupytext") _notebook_available = _is_package_available("notebook") +_autogptq_available = _is_package_available("auto-gptq") def is_transformers_supports_kbit() -> bool: @@ -146,6 +148,10 @@ def is_bitsandbytes_available(): return _bitsandbytes_available +def is_autogptq_available(): + return _autogptq_available + + def is_torch_available(): global _torch_available if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES: @@ -309,6 +315,11 @@ You can install it with pip: `pip install bitsandbytes`. Please note that you ma your runtime after installation. """ +AUTOGPTQ_IMPORT_ERROR = """{0} requires the auto-gptq library but it was not found in your environment. +You can install it with pip: `pip install auto-gptq`. Please note that you may need to restart +your runtime after installation. +""" + BACKENDS_MAPPING = BackendOrderredDict( [ ("flax", (is_flax_available, FLAX_IMPORT_ERROR)), @@ -320,6 +331,7 @@ BACKENDS_MAPPING = BackendOrderredDict( ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ("peft", (is_peft_available, PEFT_IMPORT_ERROR)), ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), + ("auto-gptq", (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)), ] ) diff --git a/tests/strategies_test.py b/tests/strategies_test.py index 8ae288a0..0a3f4252 100644 --- a/tests/strategies_test.py +++ b/tests/strategies_test.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import annotations +import sys import typing as t import pytest @@ -25,14 +26,127 @@ import bentoml from bentoml._internal.resource import get_resource from openllm import _strategies as strategy from openllm._strategies import CascadingResourceStrategy +from openllm._strategies import NvidiaGpuResource + + +def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as mcls: + mcls.setenv("CUDA_VISIBLE_DEVICES", "0,1") + resource = NvidiaGpuResource.from_system() + assert len(resource) == 2 + assert resource == ["0", "1"] + mcls.delenv("CUDA_VISIBLE_DEVICES") + + +def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as mcls: + mcls.setenv("CUDA_VISIBLE_DEVICES", "0,2,-1,1") + resource = NvidiaGpuResource.from_system() + assert len(resource) == 2 + assert resource == ["0", "2"] + mcls.delenv("CUDA_VISIBLE_DEVICES") + + +def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as mcls: + mcls.setenv("CUDA_VISIBLE_DEVICES", "-1") + resource = NvidiaGpuResource.from_system() + assert len(resource) == 0 + assert resource == [] + mcls.delenv("CUDA_VISIBLE_DEVICES") + + +def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as mcls: + mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43-ac33420d4628") + resource = NvidiaGpuResource.from_system() + assert len(resource) == 1 + assert resource == ["GPU-5ebe9f43-ac33420d4628"] + mcls.delenv("CUDA_VISIBLE_DEVICES") + with monkeypatch.context() as mcls: + mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43,GPU-ac33420d4628") + resource = NvidiaGpuResource.from_system() + assert len(resource) == 2 + assert resource == ["GPU-5ebe9f43", "GPU-ac33420d4628"] + mcls.delenv("CUDA_VISIBLE_DEVICES") + with monkeypatch.context() as mcls: + mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43,-1,GPU-ac33420d4628") + resource = NvidiaGpuResource.from_system() + assert len(resource) == 1 + assert resource == ["GPU-5ebe9f43"] + mcls.delenv("CUDA_VISIBLE_DEVICES") + with monkeypatch.context() as mcls: + mcls.setenv("CUDA_VISIBLE_DEVICES", "MIG-GPU-5ebe9f43-ac33420d4628") + resource = NvidiaGpuResource.from_system() + assert len(resource) == 1 + assert resource == ["MIG-GPU-5ebe9f43-ac33420d4628"] + mcls.delenv("CUDA_VISIBLE_DEVICES") + + +def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as mcls: + # to make this tests works with system that has GPU + mcls.setenv("CUDA_VISIBLE_DEVICES", "") + assert len(NvidiaGpuResource.from_system()) >= 0 # TODO: real from_system tests + + assert pytest.raises( + ValueError, + NvidiaGpuResource.validate, + [*NvidiaGpuResource.from_system(), 1], + ).match("Input list should be all string type.") + assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match( + "Input list should be all string type." + ) + assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["GPU-5ebe9f43", "GPU-ac33420d4628"]).match( + "Failed to parse available GPUs UUID" + ) + + +def test_nvidia_gpu_validate_no_gpu_available(): + assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["0", "1"]).match("Failed to get device *") + + +@pytest.mark.skipif(sys.platform != "darwin", reason="Test NVIDIA validation on Darwin only") +def test_nvidia_gpu_validation_on_darwin(): + assert pytest.raises(RuntimeError, NvidiaGpuResource.validate, ["0"]).match( + "GPU is not available on Darwin system." + ) + + +def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as mcls: + # to make this tests works with system that has GPU + mcls.setenv("CUDA_VISIBLE_DEVICES", "") + assert NvidiaGpuResource.from_spec(1) == ["0"] + assert NvidiaGpuResource.from_spec("5") == ["0", "1", "2", "3", "4"] + assert NvidiaGpuResource.from_spec(1) == ["0"] + assert NvidiaGpuResource.from_spec(2) == ["0", "1"] + assert NvidiaGpuResource.from_spec("3") == ["0", "1", "2"] + assert NvidiaGpuResource.from_spec([1, 3]) == ["1", "3"] + assert NvidiaGpuResource.from_spec(["1", "3"]) == ["1", "3"] + assert NvidiaGpuResource.from_spec(-1) == [] + assert NvidiaGpuResource.from_spec("-1") == [] + assert NvidiaGpuResource.from_spec("") == [] + assert NvidiaGpuResource.from_spec("-2") == [] + assert NvidiaGpuResource.from_spec("GPU-288347ab") == ["GPU-288347ab"] + assert NvidiaGpuResource.from_spec("GPU-288347ab,-1,GPU-ac33420d4628") == ["GPU-288347ab"] + assert NvidiaGpuResource.from_spec("GPU-288347ab,GPU-ac33420d4628") == ["GPU-288347ab", "GPU-ac33420d4628"] + assert NvidiaGpuResource.from_spec("MIG-GPU-288347ab") == ["MIG-GPU-288347ab"] + + with pytest.raises(TypeError): + NvidiaGpuResource.from_spec((1, 2, 3)) + with pytest.raises(TypeError): + NvidiaGpuResource.from_spec(1.5) + with pytest.raises(ValueError): + assert NvidiaGpuResource.from_spec(-2) class GPURunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu") -def unvalidated_get_resource(x: dict[str, t.Any], y: str): - return get_resource(x, y, validate=False) +def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False): + return get_resource(x, y, validate=validate) @pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"]) diff --git a/tools/dependencies.py b/tools/dependencies.py new file mode 100755 index 00000000..13aa25d2 --- /dev/null +++ b/tools/dependencies.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import dataclasses +import os +import shutil +import subprocess +import typing as t + +import inflection +import tomlkit + +import openllm + +if t.TYPE_CHECKING: + from tomlkit.items import Array + from tomlkit.items import Table + + +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + +@dataclasses.dataclass(frozen=True) +class Classifier: + identifier: t.Dict[str, str] = dataclasses.field( + default_factory=lambda: { + "status": "Development Status", + "environment": "Environment", + "license": "License", + "topic": "Topic", + "os": "Operating System", + "audience": "Intended Audience", + "typing": "Typing", + "language": "Programming Language", + } + ) + + joiner: str = " :: " + + @staticmethod + def status() -> dict[int, str]: + return { + v: status + for v, status in zip( + range(1, 8), + [ + "1 - Planning", + "2 - Pre-Alpha", + "3 - Alpha", + "4 - Beta", + "5 - Production/Stable", + "6 - Mature", + "7 - Inactive", + ], + ) + } + + @staticmethod + def apache() -> str: + return Classifier.create_classifier("license", "OSI Approved", "Apache Software License") + + @staticmethod + def create_classifier(identifier: str, *decls: t.Any) -> str: + cls_ = Classifier() + if identifier not in cls_.identifier: + raise ValueError(f"{identifier} is not yet supported (supported alias: {Classifier.identifier})") + return cls_.joiner.join([cls_.identifier[identifier], *decls]) + + @staticmethod + def create_python_classifier( + implementation: list[str] | None = None, supported_version: list[str] | None = None + ) -> list[str]: + if supported_version is None: + supported_version = ["3.8", "3.9", "3.10", "3.11", "3.12"] + if implementation is None: + implementation = ["CPython", "PyPy"] + base = [ + Classifier.create_classifier("language", "Python"), + Classifier.create_classifier("language", "Python", "3"), + ] + base.append(Classifier.create_classifier("language", "Python", "3", "Only")) + base.extend([Classifier.create_classifier("language", "Python", version) for version in supported_version]) + base.extend( + [Classifier.create_classifier("language", "Python", "Implementation", impl) for impl in implementation] + ) + return base + + @staticmethod + def create_status_classifier(level: int) -> str: + return Classifier.create_classifier("status", Classifier.status()[level]) + + +@dataclasses.dataclass(frozen=True) +class Dependencies: + name: str + git_repo_url: t.Optional[str] = None + branch: t.Optional[str] = None + extensions: t.Optional[t.List[str]] = None + subdirectory: t.Optional[str] = None + requires_gpu: bool = False + lower_constraint: t.Optional[str] = None + platform: t.Optional[t.Tuple[t.Literal["Linux", "Windows", "Darwin"], t.Literal["eq", "ne"]]] = None + + def with_options(self, **kwargs: t.Any) -> Dependencies: + return dataclasses.replace(self, **kwargs) + + @property + def has_constraint(self) -> bool: + return self.lower_constraint is not None + + @property + def pypi_extensions(self) -> str: + return "" if self.extensions is None else f"[{','.join(self.extensions)}]" + + @staticmethod + def platform_restriction(platform: t.LiteralString, op: t.Literal["eq", "ne"] = "eq") -> str: + return f'platform_system{"==" if op == "eq" else "!="}"{platform}"' + + def to_str(self) -> str: + deps: list[str] = [] + if self.lower_constraint is not None: + deps.append(f"{self.name}{self.pypi_extensions}>={self.lower_constraint}") + elif self.subdirectory is not None: + deps.append( + f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}" + ) + elif self.branch is not None: + deps.append( + f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}" + ) + else: + deps.append(f"{self.name}{self.pypi_extensions}") + + if self.platform: + deps.append(self.platform_restriction(*self.platform)) + + return ";".join(deps) + + @classmethod + def from_tuple(cls, *decls: t.Any) -> Dependencies: + return cls(*decls) + + +_BENTOML_EXT = ["grpc", "io"] +_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"] + +_BASE_DEPENDENCIES = [ + Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"), + Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"), + Dependencies(name="optimum"), + Dependencies(name="attrs", lower_constraint="23.1.0"), + Dependencies(name="cattrs", lower_constraint="23.1.0"), + Dependencies(name="orjson"), + Dependencies(name="inflection"), + Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"), + Dependencies(name="httpx"), + Dependencies(name="typing_extensions"), + Dependencies(name="cuda-python", platform=("Darwin", "ne")), +] + +_NIGHTLY_MAPPING: dict[str, Dependencies] = { + "bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT), + "peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None), + "transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT), + "optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None), + "accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None), + "bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None), + "trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None), + "triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True), +} + +_ALL_RUNTIME_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"] +FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"] +FLAN_T5_DEPS = _ALL_RUNTIME_DEPS +OPT_DEPS = _ALL_RUNTIME_DEPS +MPT_DEPS = ["triton", "einops"] +OPENAI_DEPS = ["openai", "tiktoken"] +AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"] +FALCON_DEPS = ["einops", "xformers", "safetensors"] +STARCODER_DEPS = ["bitsandbytes"] +CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"] +PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"] +GGML_DEPS = ["ctransformers"] +GPTQ_DEPS = ["auto-gptq", "triton"] +VLLM_DEPS = ["vllm"] + +_base_requirements = { + inflection.dasherize(name): config_cls.__openllm_requirements__ + for name, config_cls in openllm.CONFIG_MAPPING.items() + if config_cls.__openllm_requirements__ +} + +# shallow copy from locals() +_locals = locals().copy() + +# NOTE: update this table when adding new external dependencies +# sync with openllm.utils.OPTIONAL_DEPENDENCIES +_base_requirements.update( + {v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES} +) + +fname = f"{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}" + + +def create_classifiers() -> Array: + arr = tomlkit.array() + arr.extend( + [ + Classifier.create_status_classifier(5), + Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA"), + Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "12"), + Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.8"), + Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.7"), + Classifier.apache(), + Classifier.create_classifier("topic", "Scientific/Engineering", "Artificial Intelligence"), + Classifier.create_classifier("topic", "Software Development", "Libraries"), + Classifier.create_classifier("os", "OS Independent"), + Classifier.create_classifier("audience", "Developers"), + Classifier.create_classifier("audience", "Science/Research"), + Classifier.create_classifier("audience", "System Administrators"), + Classifier.create_classifier("typing", "Typed"), + *Classifier.create_python_classifier(), + ] + ) + return arr.multiline(True) + + +def create_optional_table() -> Table: + table = tomlkit.table() + table.update(_base_requirements) + + all_array = tomlkit.array() + all_array.extend([f"openllm[{k}]" for k in table.keys()]) + table.add("all", all_array.multiline(True)) + return table + + +def main() -> int: + with open(os.path.join(ROOT, "pyproject.toml"), "r") as f: + pyproject = tomlkit.parse(f.read()) + + t.cast("Table", pyproject["project"]).update( + { + "classifiers": create_classifiers(), + "optional-dependencies": create_optional_table(), + "dependencies": tomlkit.array(f"{[v.to_str() for v in _BASE_DEPENDENCIES]}").multiline(True), + } + ) + with open(os.path.join(ROOT, "pyproject.toml"), "w") as f: + f.write(tomlkit.dumps(pyproject)) + + with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f: + f.write(f"# This file is generated by `{fname}`. DO NOT EDIT\n-e .[playground,flan-t5]\n") + f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu]) + with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f: + f.write(f"# This file is generated by `{fname}`. # DO NOT EDIT\n") + f.write( + "# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n" + ) + f.write("-r nightly-requirements.txt\n-e .[all]\n") + f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu]) + + if shutil.which("taplo"): + return subprocess.check_call(["taplo", "format", os.path.join(ROOT, "pyproject.toml")]) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/update-optional-dependencies.py b/tools/update-optional-dependencies.py deleted file mode 100755 index 82fff857..00000000 --- a/tools/update-optional-dependencies.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2023 BentoML Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from __future__ import annotations - -import dataclasses -import os -import shutil -import typing as t - -import inflection -import tomlkit - -import openllm - - -ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - - -@dataclasses.dataclass(frozen=True) -class Dependencies: - name: str - git_repo_url: t.Optional[str] = None - branch: t.Optional[str] = None - extensions: t.Optional[t.List[str]] = None - subdirectory: t.Optional[str] = None - requires_gpu: bool = False - lower_constraint: t.Optional[str] = None - - def with_options(self, **kwargs: t.Any) -> Dependencies: - return dataclasses.replace(self, **kwargs) - - @property - def has_constraint(self) -> bool: - return self.lower_constraint is not None - - @property - def pypi_extensions(self) -> str: - return "" if self.extensions is None else f"[{','.join(self.extensions)}]" - - def to_str(self) -> str: - if self.lower_constraint is not None: - return f"{self.name}{self.pypi_extensions}>={self.lower_constraint}" - elif self.subdirectory is not None: - return f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}" - elif self.branch is not None: - return f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}" - else: - return f"{self.name}{self.pypi_extensions}" - - @classmethod - def from_tuple(cls, *decls: t.Any) -> Dependencies: - return cls(*decls) - - -_BENTOML_EXT = ["grpc", "io"] -_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"] - -_BASE_DEPENDENCIES = [ - Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"), - Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"), - Dependencies(name="optimum"), - Dependencies(name="attrs", lower_constraint="23.1.0"), - Dependencies(name="cattrs", lower_constraint="23.1.0"), - Dependencies(name="orjson"), - Dependencies(name="inflection"), - Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"), - Dependencies(name="httpx"), - Dependencies(name="typing_extensions"), -] - -_NIGHTLY_MAPPING: dict[str, Dependencies] = { - "bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT), - "peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None), - "transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT), - "optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None), - "accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None), - "bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None), - "trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None), - "triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True), -} - -FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"] -FLAN_T5_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"] -MPT_DEPS = ["triton", "einops"] -OPENAI_DEPS = ["openai", "tiktoken"] -AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"] -FALCON_DEPS = ["einops", "xformers", "safetensors"] -STARCODER_DEPS = ["bitsandbytes"] -CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"] -PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"] -GGML_DEPS = ["ctransformers"] - -_base_requirements = { - inflection.dasherize(name): config_cls.__openllm_requirements__ - for name, config_cls in openllm.CONFIG_MAPPING.items() - if config_cls.__openllm_requirements__ -} - -# shallow copy from locals() -_locals = locals().copy() - -# NOTE: update this table when adding new external dependencies -# sync with openllm.utils.OPTIONAL_DEPENDENCIES -_base_requirements.update( - {v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES} -) - - -def main() -> int: - with open(os.path.join(ROOT, "pyproject.toml"), "r") as f: - pyproject = tomlkit.parse(f.read()) - - table = tomlkit.table() - for name, config in _base_requirements.items(): - table.add(name, config) - - table.add("all", [f"openllm[{k}]" for k in table.keys()]) - - pyproject["project"]["optional-dependencies"] = table - - # write project dependencies - pyproject["project"]["dependencies"] = [v.to_str() for v in _BASE_DEPENDENCIES] - with open(os.path.join(ROOT, "pyproject.toml"), "w") as f: - f.write(tomlkit.dumps(pyproject)) - - with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f: - f.write( - "# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT\n-e .[playground,flan-t5]\n" - ) - f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu]) - with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f: - f.write("# This file is generated by `./tools/update-optional-dependencies.py`. # DO NOT EDIT\n") - f.write( - "# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n" - ) - f.write("-r nightly-requirements.txt\n-e .[all]\n") - f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu]) - - if shutil.which("taplo"): - return os.system(f"taplo fmt {os.path.join(ROOT, 'pyproject.toml')}") - - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/typings/cuda/__init__.pyi b/typings/cuda/__init__.pyi new file mode 100644 index 00000000..e76cccb8 --- /dev/null +++ b/typings/cuda/__init__.pyi @@ -0,0 +1,2 @@ +from . import cuda as cuda +from . import cudart as cudart diff --git a/typings/cuda/cuda.pyi b/typings/cuda/cuda.pyi new file mode 100644 index 00000000..982d3e5f --- /dev/null +++ b/typings/cuda/cuda.pyi @@ -0,0 +1,26 @@ +# Copyright 2023 BentoML Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum + +class CUresult(Enum): + CUDA_SUCCESS = 0 + +class _CUMixin: + def getPtr(self) -> int: ... + +class CUdevice(_CUMixin): ... + +def cuDeviceGetCount() -> tuple[CUresult, int]: ... +def cuDeviceGet(dev: int) -> tuple[CUresult, CUdevice]: ...