From b2dba6143fa614d4cd7bc00bc6272b071fb26a7d Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Sat, 15 Jul 2023 07:19:35 -0400
Subject: [PATCH] fix(resource): correctly parse CUDA_VISIBLE_DEVICES (#114)
---
.github/workflows/binary-releases.yml | 1 +
.github/workflows/ci.yml | 4 +-
.github/workflows/create-releases.yml | 2 +
.github/workflows/release-notes.yml | 2 +
README.md | 2 +-
changelog.d/114.fix.md | 7 +
hatch.toml | 3 +-
nightly-requirements-gpu.txt | 2 +-
nightly-requirements.txt | 2 +-
pyproject.toml | 35 ++-
src/openllm/_llm.py | 58 +++-
src/openllm/_quantisation.py | 48 +++-
src/openllm/_strategies.py | 390 +++++++++++++++++++-------
src/openllm/_types.py | 3 +-
src/openllm/cli.py | 33 ++-
src/openllm/utils/__init__.py | 9 +-
src/openllm/utils/codegen.py | 3 +-
src/openllm/utils/import_utils.py | 26 +-
tests/strategies_test.py | 118 +++++++-
tools/dependencies.py | 285 +++++++++++++++++++
tools/update-optional-dependencies.py | 160 -----------
typings/cuda/__init__.pyi | 2 +
typings/cuda/cuda.pyi | 26 ++
23 files changed, 903 insertions(+), 318 deletions(-)
create mode 100644 changelog.d/114.fix.md
create mode 100755 tools/dependencies.py
delete mode 100755 tools/update-optional-dependencies.py
create mode 100644 typings/cuda/__init__.pyi
create mode 100644 typings/cuda/cuda.pyi
diff --git a/.github/workflows/binary-releases.yml b/.github/workflows/binary-releases.yml
index 62dcd133..9785b2e2 100644
--- a/.github/workflows/binary-releases.yml
+++ b/.github/workflows/binary-releases.yml
@@ -13,6 +13,7 @@ env:
APP_NAME: openllm
PYTHON_VERSION: '3.11'
PYOXIDIZER_VERSION: '0.24.0'
+ HATCH_VERBOSE: 10
jobs:
python-artifacts:
name: Build wheel and source distribution
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3017dfc1..366ff7ba 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,6 +24,7 @@ env:
OPENLLM_DO_NOT_TRACK: True
PYTHONUNBUFFERED: '1'
STABLE_PYTHON_VERSION: '3.11'
+ HATCH_VERBOSE: 10
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
defaults:
run:
@@ -31,7 +32,6 @@ defaults:
jobs:
quality:
runs-on: ubuntu-latest
- if: github.event_name == 'pull_request'
name: quality-check
steps:
- uses: actions/checkout@v3
@@ -43,6 +43,8 @@ jobs:
python-version: ${{ env.STABLE_PYTHON_VERSION }}
- name: Run type check
run: hatch run typing
+ - if: failure()
+ run: echo "Not failing quality workflow."
tests:
runs-on: ubuntu-latest
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
diff --git a/.github/workflows/create-releases.yml b/.github/workflows/create-releases.yml
index 93d37a90..447b01e3 100644
--- a/.github/workflows/create-releases.yml
+++ b/.github/workflows/create-releases.yml
@@ -28,6 +28,8 @@ on:
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
+env:
+ HATCH_VERBOSE: 10
jobs:
release:
if: github.repository_owner == 'bentoml'
diff --git a/.github/workflows/release-notes.yml b/.github/workflows/release-notes.yml
index 8b572454..f451c4a4 100644
--- a/.github/workflows/release-notes.yml
+++ b/.github/workflows/release-notes.yml
@@ -25,6 +25,8 @@ on:
tags:
required: true
type: string
+env:
+ HATCH_VERBOSE: 10
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
diff --git a/README.md b/README.md
index 991ea2c4..b65bf043 100644
--- a/README.md
+++ b/README.md
@@ -266,7 +266,7 @@ pip install "openllm[mpt]"
```bash
-pip install openllm
+pip install "openllm[opt]"
```
|
diff --git a/changelog.d/114.fix.md b/changelog.d/114.fix.md
new file mode 100644
index 00000000..346670f3
--- /dev/null
+++ b/changelog.d/114.fix.md
@@ -0,0 +1,7 @@
+Fixes resources to correctly follow the CUDA_VISIBLE_DEVICES spec
+
+OpenLLM now contains a standalone parser that mimics the `torch.cuda` parser for setting
+GPU devices. This parser will be used to parse both AMD and NVIDIA GPUs.
+
+`openllm` should now be able to parse `GPU-` and `MIG-` UUIDs from both
+configuration and spec.
diff --git a/hatch.toml b/hatch.toml
index 286109b7..9e9c06fa 100644
--- a/hatch.toml
+++ b/hatch.toml
@@ -26,8 +26,8 @@ features = ['flan-t5']
[envs.default.scripts]
changelog = "towncrier build --version main --draft"
quality = [
+ "./tools/dependencies.py",
"./tools/update-readme.py",
- "./tools/update-optional-dependencies.py",
"./tools/update-config-stubs.py",
"./tools/update-models-import.py",
"- ./tools/add-license-headers .",
@@ -42,6 +42,7 @@ extra-dependencies = [
]
[envs.tests.scripts]
_run_script = "pytest --cov --cov-report={env:COVERAGE_REPORT:term-missing} --cov-config=pyproject.toml"
+distributed = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -n 3 -r aR {args:tests}"
models = "_run_script -r aR {args:tests/models}"
python = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -r aR {args:tests}"
[envs.tests.overrides]
diff --git a/nightly-requirements-gpu.txt b/nightly-requirements-gpu.txt
index a7d80a06..ac6eac73 100644
--- a/nightly-requirements-gpu.txt
+++ b/nightly-requirements-gpu.txt
@@ -1,4 +1,4 @@
-# This file is generated by `./tools/update-optional-dependencies.py`. # DO NOT EDIT
+# This file is generated by `tools/dependencies.py`. # DO NOT EDIT
# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.
-r nightly-requirements.txt
-e .[all]
diff --git a/nightly-requirements.txt b/nightly-requirements.txt
index 9d6780f2..7ec37722 100644
--- a/nightly-requirements.txt
+++ b/nightly-requirements.txt
@@ -1,4 +1,4 @@
-# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT
+# This file is generated by `tools/dependencies.py`. DO NOT EDIT
-e .[playground,flan-t5]
bentoml[grpc,io] @ git+https://github.com/bentoml/bentoml.git@main
peft @ git+https://github.com/huggingface/peft.git@main
diff --git a/pyproject.toml b/pyproject.toml
index 7ab907b0..3e1ede0f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,5 @@
+# NOTE: The following are managed by ./tools/dependencies.py
+# project.classifiers, project.dependencies, project.optional-dependencies
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
@@ -29,18 +31,18 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
-# NOTE: The below is managed by ./tools/update-optional-dependencies.py
dependencies = [
- "bentoml[grpc,io]>=1.0.22",
- "transformers[torch,tokenizers,accelerate]>=4.29.0",
- "optimum",
- "attrs>=23.1.0",
- "cattrs>=23.1.0",
- "orjson",
- "inflection",
- "tabulate[widechars]>=0.9.0",
- "httpx",
- "typing_extensions",
+ 'bentoml[grpc,io]>=1.0.22',
+ 'transformers[torch,tokenizers,accelerate]>=4.29.0',
+ 'optimum',
+ 'attrs>=23.1.0',
+ 'cattrs>=23.1.0',
+ 'orjson',
+ 'inflection',
+ 'tabulate[widechars]>=0.9.0',
+ 'httpx',
+ 'typing_extensions',
+ 'cuda-python;platform_system!="Darwin"',
]
description = 'OpenLLM: Operating LLMs in production'
dynamic = ["version"]
@@ -62,9 +64,6 @@ license = "Apache-2.0"
name = "openllm"
readme = "README.md"
requires-python = ">=3.8"
-
-# NOTE: Don't modify project.optional-dependencies
-# as it is managed by ./tools/update-optional-dependencies.py
[project.optional-dependencies]
agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
all = [
@@ -72,22 +71,28 @@ all = [
"openllm[falcon]",
"openllm[mpt]",
"openllm[starcoder]",
+ "openllm[opt]",
"openllm[flan-t5]",
"openllm[fine-tune]",
+ "openllm[vllm]",
"openllm[agents]",
- "openllm[playground]",
"openllm[ggml]",
+ "openllm[playground]",
"openllm[openai]",
+ "openllm[gptq]",
]
chatglm = ["cpm-kernels", "sentencepiece"]
falcon = ["einops", "xformers", "safetensors"]
fine-tune = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
ggml = ["ctransformers"]
+gptq = ["auto-gptq", "triton"]
mpt = ["triton", "einops"]
openai = ["openai", "tiktoken"]
+opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
starcoder = ["bitsandbytes"]
+vllm = ["vllm"]
[project.urls]
Documentation = "https://github.com/bentoml/openllm#readme"
diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index feca2f80..6574d1bf 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -71,6 +71,7 @@ else:
from typing_extensions import overload
if t.TYPE_CHECKING:
+ import auto_gptq as autogptq
import peft
import torch
@@ -96,6 +97,8 @@ else:
UserDictAny = collections.UserDict
LLMRunnable = bentoml.Runnable
LLMRunner = bentoml.Runner
+
+ autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
transformers = LazyLoader("transformers", globals(), "transformers")
torch = LazyLoader("torch", globals(), "torch")
peft = LazyLoader("peft", globals(), "peft")
@@ -445,7 +448,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
"""The config instance to use for this LLM. This will be created based on config_class and available
when initialising the LLM."""
- quantization_config: transformers.BitsAndBytesConfig | None
+ quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
"""Quantisation config for quantised model on the fly."""
_model_id: str
@@ -548,6 +551,44 @@ class LLM(LLMInterface[M, T], ReprMixin):
openllm.serialisation.save_pretrained(self, save_directory, **attrs)
+ @classmethod
+ @overload
+ def from_pretrained(
+ cls,
+ model_id: str | None = ...,
+ model_version: str | None = ...,
+ llm_config: openllm.LLMConfig | None = ...,
+ *args: t.Any,
+ runtime: t.Literal["ggml", "transformers"] | None = ...,
+ quantize: t.Literal["int8", "int4"] = ...,
+ bettertransformer: str | bool | None = ...,
+ adapter_id: str | None = ...,
+ adapter_name: str | None = ...,
+ adapter_map: dict[str, str | None] | None = ...,
+ quantization_config: transformers.BitsAndBytesConfig | None = ...,
+ **attrs: t.Any,
+ ) -> LLM[M, T]:
+ ...
+
+ @classmethod
+ @overload
+ def from_pretrained(
+ cls,
+ model_id: str | None = ...,
+ model_version: str | None = ...,
+ llm_config: openllm.LLMConfig | None = ...,
+ *args: t.Any,
+ runtime: t.Literal["ggml", "transformers"] | None = ...,
+ quantize: t.Literal["gptq"] = ...,
+ bettertransformer: str | bool | None = ...,
+ adapter_id: str | None = ...,
+ adapter_name: str | None = ...,
+ adapter_map: dict[str, str | None] | None = ...,
+ quantization_config: autogptq.BaseQuantizeConfig | None = ...,
+ **attrs: t.Any,
+ ) -> LLM[M, T]:
+ ...
+
@classmethod
def from_pretrained(
cls,
@@ -561,7 +602,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
adapter_id: str | None = None,
adapter_name: str | None = None,
adapter_map: dict[str, str | None] | None = None,
- quantization_config: transformers.BitsAndBytesConfig | None = None,
+ quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
**attrs: t.Any,
) -> LLM[M, T]:
"""Instantiate a pretrained LLM.
@@ -577,6 +618,17 @@ class LLM(LLMInterface[M, T], ReprMixin):
> Currently, the above two options are mutually exclusive.
+ #### Quantisation options
+
+ For customising options for quantisation config, ``openllm.LLM`` accepts all arbitrary arguments that is passed to ``transformers.BitsAndBytesConfig``
+ plus ``quantize`` value. For example, for ``int8`` quantisation, specify the following:
+ ```python
+ model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False)
+ ```
+
+ For all GPTQ-related options, it accepts all values prefixed with `gptq_*`. The parsed values are then passed
+ to ``auto_gptq.BaseQuantizeConfig``.
+
### Adapter options:
> This is used in conjunction with the fine-tuning features
@@ -689,7 +741,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
model_id: str,
llm_config: openllm.LLMConfig,
bettertransformer: bool | None,
- quantization_config: transformers.BitsAndBytesConfig | None,
+ quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
_adapters_mapping: AdaptersMapping | None,
_tag: bentoml.Tag,
_quantize_method: t.Literal["int8", "int4", "gptq"] | None,
diff --git a/src/openllm/_quantisation.py b/src/openllm/_quantisation.py
index 31c17de5..26c9ab4f 100644
--- a/src/openllm/_quantisation.py
+++ b/src/openllm/_quantisation.py
@@ -13,15 +13,26 @@
# limitations under the License.
from __future__ import annotations
import logging
+import sys
import typing as t
from .utils import LazyLoader
+from .utils import is_autogptq_available
from .utils import is_bitsandbytes_available
from .utils import is_transformers_supports_kbit
from .utils import pkg
+# NOTE: We need to do this so that overload can register
+# correct overloads to typing registry
+if sys.version_info[:2] >= (3, 11):
+ from typing import overload
+else:
+ from typing_extensions import overload
+
+
if t.TYPE_CHECKING:
+ import auto_gptq as autogptq
import torch
import openllm
@@ -29,6 +40,7 @@ if t.TYPE_CHECKING:
from ._types import DictStrAny
else:
+ autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
torch = LazyLoader("torch", globals(), "torch")
transformers = LazyLoader("transformers", globals(), "transformers")
@@ -37,15 +49,38 @@ logger = logging.getLogger(__name__)
QuantiseMode = t.Literal["int8", "int4", "gptq"]
+@overload
+def infer_quantisation_config(
+ cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["int8", "int4"], **attrs: t.Any
+) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
+ ...
+
+
+@overload
+def infer_quantisation_config(
+ cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["gptq"], **attrs: t.Any
+) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
+ ...
+
+
def infer_quantisation_config(
cls: type[openllm.LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any
-) -> tuple[transformers.BitsAndBytesConfig | t.Any, DictStrAny]:
+) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop("llm_int8_threshhold", 6.0)
int8_enable_fp32_cpu_offload = attrs.pop("llm_int8_enable_fp32_cpu_offload", False)
int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)
+ autogptq_attrs: DictStrAny = {
+ "bits": attrs.pop("gptq_bits", 4),
+ "group_size": attrs.pop("gptq_group_size", -1),
+ "damp_percent": attrs.pop("gptq_damp_percent", 0.01),
+ "desc_act": attrs.pop("gptq_desc_act", True),
+ "sym": attrs.pop("gptq_sym", True),
+ "true_sequential": attrs.pop("gptq_true_sequential", True),
+ }
+
def create_int8_config(int8_skip_modules: list[str] | None):
if int8_skip_modules is None:
int8_skip_modules = []
@@ -94,8 +129,15 @@ def infer_quantisation_config(
logger.warning("OpenLLM will fallback to 8-bit quantization.")
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == "gptq":
- # TODO: support GPTQ loading quantization
- raise NotImplementedError("GPTQ is not supported yet.")
+ if not is_autogptq_available():
+ logger.warning(
+ "'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment)."
+ " Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback "
+ "to int8 with bitsandbytes."
+ )
+ quantisation_config = create_int8_config(int8_skip_modules)
+ else:
+ quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
else:
raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")
diff --git a/src/openllm/_strategies.py b/src/openllm/_strategies.py
index 9ec9e270..c3c2198c 100644
--- a/src/openllm/_strategies.py
+++ b/src/openllm/_strategies.py
@@ -13,11 +13,15 @@
# limitations under the License.
from __future__ import annotations
+import functools
+import inspect
import logging
import math
import os
import sys
+import types
import typing as t
+import warnings
import psutil
@@ -27,62 +31,113 @@ from bentoml._internal.resource import system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS
from bentoml._internal.runner.strategy import Strategy
-from .exceptions import OpenLLMException
+from .utils import LazyLoader
+from .utils import LazyType
from .utils import ReprMixin
if t.TYPE_CHECKING:
+ import torch
+
import bentoml
ListIntStr = list[int | str]
+
+ class DynResource(Resource[t.List[str]], resource_id=""):
+ resource_id: t.ClassVar[str]
+
else:
+ DynResource = Resource[t.List[str]]
+ torch = LazyLoader("torch", globals(), "torch")
ListIntStr = list
+# NOTE: We need to do this so that overload can register
+# correct overloads to typing registry
+if sys.version_info[:2] >= (3, 11):
+ from typing import overload
+else:
+ from typing_extensions import overload
+
logger = logging.getLogger(__name__)
-class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"):
- @classmethod
- def from_spec(cls, spec: t.Any) -> list[str]:
- if not isinstance(spec, (int, str, list)):
- raise TypeError("AMD GPU device IDs must be int, str or a list specifing the exact GPUs to use.")
+def _strtoul(s: str) -> int:
+ """Return -1 or the positive integer that the sequence string starts with."""
+ if not s:
+ return -1
+ for idx, c in enumerate(s):
+ if not (c.isdigit() or (idx == 0 and c in "+-")):
+ break
+ if idx + 1 == len(s):
+ idx += 1 # noqa: PLW2901
+ return int(s[:idx]) if idx > 0 else -1 # type: ignore (idx will be set via enumerate)
- try:
- if isinstance(spec, int):
- if spec == -1:
- return []
- if spec < -1:
- raise ValueError
- return [str(i) for i in range(spec)]
- elif isinstance(spec, str):
- try:
- return cls.from_spec(int(spec))
- except ValueError:
- if spec.startswith("GPU"):
- return [spec]
- raise ValueError
- else:
- return [str(x) for x in spec]
- except ValueError:
- raise OpenLLMException(f"Invalid AMD GPU resource limit '{spec}'.")
- @classmethod
- def from_system(cls) -> list[str]:
- """Retrieve AMD GPU from system, currently only supports on Linux.
-
- This assumes that ROCm is setup correctly.
- """
- cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
- if cuda_visible_devices in ("", "-1"):
+def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
+ rcs: list[str] = []
+ for elem in lst.split(","):
+ # Repeated id results in empty set
+ if elem in rcs:
return []
- if cuda_visible_devices is not None:
- cuda_visible_devices = cuda_visible_devices.split(",")
- if "-1" in cuda_visible_devices:
- cuda_visible_devices = cuda_visible_devices[: cuda_visible_devices.index("-1")]
- return cuda_visible_devices
+ # Anything other but prefix is ignored
+ if not elem.startswith(prefix):
+ break
+ rcs.append(elem)
+ return rcs
+
+_STACK_LEVEL = 3
+
+
+@overload
+def _parse_visible_devices(default_var: str | None = ..., respect_env: t.Literal[True] = True) -> list[str] | None:
+ ...
+
+
+@overload
+def _parse_visible_devices(default_var: str = ..., respect_env: t.Literal[False] = False) -> list[str]:
+ ...
+
+
+def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
+ """Parse a visible-devices spec, respecting CUDA_VISIBLE_DEVICES (falling back to *default_var*) when *respect_env* is True."""
+ if respect_env:
+ spec = os.getenv("CUDA_VISIBLE_DEVICES", default_var)
+ if not spec:
+ return
+ else:
+ assert default_var is not None, "spec is required to be not None when parsing spec." # noqa: S101
+ spec = default_var
+
+ if spec.startswith("GPU-"):
+ return _parse_list_with_prefix(spec, "GPU-")
+ if spec.startswith("MIG-"):
+ return _parse_list_with_prefix(spec, "MIG-")
+
+ # XXX: We need to somehow handle cases such as '100m'
+ # CUDA_VISIBLE_DEVICES uses something like strtoul
+ # which makes `1gpu2,2ampere` equivalent to `1,2`
+ rc: list[int] = []
+ for el in spec.split(","):
+ x = _strtoul(el.strip())
+ # Repeated ordinal results in empty set
+ if x in rc:
+ return []
+ # Negative value aborts the sequence
+ if x < 0:
+ break
+ rc.append(x)
+ return [str(i) for i in rc]
+
+
+def _from_system(cls: type[DynResource]) -> list[str]:
+ """Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation.
+
+ It relies on the torch.cuda implementation and in turn respects CUDA_VISIBLE_DEVICES.
+ """
+ if cls.resource_id == "amd.com/gpu":
if not psutil.LINUX:
- logger.debug("AMD GPU resource is only supported on Linux.")
+ warnings.warn("AMD GPUs is currently only supported on Linux.", stacklevel=_STACK_LEVEL)
return []
# ROCm does not currently have the rocm_smi wheel.
@@ -90,37 +145,169 @@ class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"):
# we don't want to use CLI because parsing is a pain.
sys.path.append("/opt/rocm/libexec/rocm_smi")
try:
- from ctypes import byref
- from ctypes import c_uint32
-
# refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py
- from rsmiBindings import rocmsmi
- from rsmiBindings import rsmi_status_t
-
- num = c_uint32(0)
- ret = rocmsmi.rsmi_num_monitor_devices(byref(num))
- if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
- return [str(i) for i in range(num.value)]
- return []
- except Exception as err:
- logger.debug("Failed to setup AMD GPU resource: %s", err)
+ from rsmiBindings import rocmsmi as rocmsmi
+ except (ModuleNotFoundError, ImportError):
+ # In this case the binary is not found, returning empty list
return []
finally:
sys.path.remove("/opt/rocm/libexec/rocm_smi")
+ visible_devices = _parse_visible_devices()
+ if visible_devices is None:
+ return [str(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else []
+ return visible_devices
- @classmethod
- def validate(cls, val: list[str]):
- for gpu_index_or_literal in val:
- try:
- idx = int(gpu_index_or_literal)
- except ValueError:
- raise OpenLLMException(f"Invalid AMD GPU device index: {val}")
- if int(idx) < 0:
- raise OpenLLMException(f"Negative GPU device in {val}.")
- if int(idx) >= len(cls.from_system()):
- raise OpenLLMException(
- f"GPU device index in {val} is greater than the system available: {cls.from_system()}"
- )
+
+@overload
+def _from_spec(cls: type[DynResource], spec: int) -> list[str]:
+ ...
+
+
+@overload
+def _from_spec(cls: type[DynResource], spec: ListIntStr) -> list[str]:
+ ...
+
+
+@overload
+def _from_spec(cls: type[DynResource], spec: str) -> list[str]:
+ ...
+
+
+def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
+ """Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation.
+
+ The parser behaves similarly to how PyTorch handles CUDA_VISIBLE_DEVICES. This means within
+ BentoML's resource configuration, its behaviour is similar to CUDA_VISIBLE_DEVICES.
+ """
+ if isinstance(spec, int):
+ if spec in (-1, 0):
+ return []
+ if spec < -1:
+ raise ValueError("Spec cannot be < -1.")
+ return [str(i) for i in range(spec)]
+ elif isinstance(spec, str):
+ if not spec:
+ return []
+ if spec.isdigit():
+ spec = ",".join([str(i) for i in range(_strtoul(spec))])
+ return _parse_visible_devices(spec, respect_env=False)
+ elif LazyType(ListIntStr).isinstance(spec):
+ return [str(x) for x in spec]
+ else:
+ raise TypeError(
+ f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
+ )
+
+
+@functools.lru_cache
+def _raw_uuid_nvml() -> list[str] | None:
+ """Return list of device UUID as reported by NVML or None if NVML discovery/initialization failed."""
+ try:
+ from cuda import cuda
+ except ImportError:
+ if sys.platform == "darwin":
+ raise RuntimeError("GPU is not available on Darwin system.") from None
+ raise RuntimeError(
+ "Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly."
+ ) from None
+
+ from ctypes import CDLL
+ from ctypes import byref
+ from ctypes import c_void_p
+ from ctypes import create_string_buffer
+
+ nvml_h = CDLL("libnvidia-ml.so.1")
+ rc = nvml_h.nvmlInit()
+ if rc != 0:
+ warnings.warn("Can't initialize NVML", stacklevel=_STACK_LEVEL)
+ return
+ err, dev_count = cuda.cuDeviceGetCount()
+ if err != cuda.CUresult.CUDA_SUCCESS:
+ warnings.warn("Failed to get available device from system.", stacklevel=_STACK_LEVEL)
+ return
+ uuids: list[str] = []
+ for idx in range(dev_count):
+ dev_id = c_void_p()
+ rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
+ if rc != 0:
+ warnings.warn(f"Failed to get device handle for {idx}", stacklevel=_STACK_LEVEL)
+ return
+ buf_len = 96
+ buf = create_string_buffer(buf_len)
+ rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
+ if rc != 0:
+ warnings.warn(f"Failed to get device UUID for {idx}", stacklevel=_STACK_LEVEL)
+ return
+ uuids.append(buf.raw.decode("ascii").strip("\0"))
+ del nvml_h
+ return uuids
+
+
+def _validate(cls: type[DynResource], val: list[t.Any]):
+ if cls.resource_id == "amd.com/gpu":
+ raise RuntimeError(
+ "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
+ )
+ if not all(isinstance(i, str) for i in val):
+ raise ValueError("Input list should be all string type.")
+
+ try:
+ from cuda import cuda
+ except ImportError:
+ if sys.platform == "darwin":
+ raise RuntimeError("GPU is not available on Darwin system.") from None
+ raise RuntimeError(
+ "Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly."
+ ) from None
+ # correctly parse handle
+ for el in val:
+ if el.startswith("GPU-") or el.startswith("MIG-"):
+ uuids = _raw_uuid_nvml()
+ if uuids is None:
+ raise ValueError("Failed to parse available GPUs UUID")
+ if el not in uuids:
+ raise ValueError(f"Given UUID {el} is not found with available UUID (available: {uuids})")
+ elif el.isdigit():
+ err, _ = cuda.cuDeviceGet(int(el))
+ if err != cuda.CUresult.CUDA_SUCCESS:
+ raise ValueError(f"Failed to get device {el}")
+
+
+def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
+ return types.new_class(
+ name,
+ (DynResource, ReprMixin),
+ {"resource_id": resource_kind},
+ lambda ns: ns.update(
+ {
+ "resource_id": resource_kind,
+ "from_spec": classmethod(_from_spec),
+ "from_system": classmethod(_from_system),
+ "validate": classmethod(_validate),
+ "__repr_keys__": property(lambda _: {"resource_id"}),
+ "__doc__": inspect.cleandoc(docstring),
+ "__module__": "openllm._strategies",
+ }
+ ),
+ )
+
+
+NvidiaGpuResource = _make_resource_class(
+ "NvidiaGpuResource",
+ "nvidia.com/gpu",
+ """NVIDIA GPU resource.
+
+ This is a modified version of internal's BentoML's NvidiaGpuResource
+ where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",
+)
+AmdGpuResource = _make_resource_class(
+ "AmdGpuResource",
+ "amd.com/gpu",
+ """AMD GPU resource.
+
+ Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
+ ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",
+)
class CascadingResourceStrategy(Strategy, ReprMixin):
@@ -147,15 +334,21 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
if resource_request is None:
resource_request = system_resources()
- # use nvidia gpu
- nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu")
- if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
- return math.ceil(len(nvidia_gpus) * workers_per_resource)
+ def _get_gpu_count(typ: list[str] | None, kind: str):
+ if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
+ return math.ceil(len(typ) * workers_per_resource)
- # use amd gpu
- amd_gpus = get_resource(resource_request, "amd.com/gpu")
- if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
- return math.ceil(len(amd_gpus) * workers_per_resource)
+ # use NVIDIA
+ kind = "nvidia.com/gpu"
+ count = _get_gpu_count(get_resource(resource_request, kind), kind)
+ if count:
+ return count
+
+ # use AMD
+ kind = "amd.com/gpu"
+ count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind)
+ if count:
+ return count
# use CPU
cpus = get_resource(resource_request, "cpu")
@@ -203,36 +396,32 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
if resource_request is None:
resource_request = system_resources()
- # use nvidia gpu
- nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu")
- if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
- dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, nvidia_gpus, worker_index)
+ # use NVIDIA
+ kind = "nvidia.com/gpu"
+ typ = get_resource(resource_request, kind)
+ if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
if disabled:
logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
environ["CUDA_VISIBLE_DEVICES"] = cuda_env
return environ
- environ["CUDA_VISIBLE_DEVICES"] = dev
- logger.info(
- "Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s",
- worker_index,
- dev,
+ environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(
+ workers_per_resource, typ, worker_index
)
+ logger.debug("Environ for worker %s: %s", worker_index, environ)
return environ
- # use amd gpu
- amd_gpus = get_resource(resource_request, "amd.com/gpu")
- if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
- dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, amd_gpus, worker_index)
+ # use AMD
+ kind = "amd.com/gpu"
+ typ = get_resource(resource_request, kind, validate=False)
+ if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
if disabled:
logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
environ["CUDA_VISIBLE_DEVICES"] = cuda_env
return environ
- environ["CUDA_VISIBLE_DEVICES"] = dev
- logger.info(
- "Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s",
- worker_index,
- dev,
+ environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(
+ workers_per_resource, typ, worker_index
)
+ logger.debug("Environ for worker %s: %s", worker_index, environ)
return environ
# use CPU
@@ -243,23 +432,16 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
thread_count = math.ceil(cpus)
for thread_env in THREAD_ENVS:
environ[thread_env] = os.getenv(thread_env, str(thread_count))
- logger.info(
- "Environ for worker %d: set CPU thread count to %d",
- worker_index,
- thread_count,
- )
- return environ
- else:
- for thread_env in THREAD_ENVS:
- environ[thread_env] = os.getenv(thread_env, "1")
+ logger.debug("Environ for worker %s: %s", worker_index, environ)
return environ
+ for thread_env in THREAD_ENVS:
+ environ[thread_env] = os.getenv(thread_env, "1")
+ return environ
return environ
@staticmethod
- def transpile_workers_to_cuda_visible_devices(
- workers_per_resource: float | int, gpus: list[str], worker_index: int
- ) -> str:
+ def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str:
# Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
if isinstance(workers_per_resource, float):
# NOTE: We hit this branch when workers_per_resource is set to
@@ -287,9 +469,9 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
dev = ",".join(assigned_gpu)
else:
idx = worker_index // workers_per_resource
- if len(gpus) == idx:
+ if idx >= len(gpus):
raise ValueError(
f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}"
)
- dev = gpus[idx]
+ dev = str(gpus[idx])
return dev
diff --git a/src/openllm/_types.py b/src/openllm/_types.py
index a67d1876..0947c69d 100644
--- a/src/openllm/_types.py
+++ b/src/openllm/_types.py
@@ -30,6 +30,7 @@ from ._configuration import AdapterType
if t.TYPE_CHECKING:
+ import auto_gptq as autogptq
import click
import peft
@@ -155,7 +156,7 @@ class LLMRunner(bentoml.Runner):
class LLMInitAttrs(t.TypedDict):
config: openllm.LLMConfig
- quantization_config: transformers.BitsAndBytesConfig | None
+ quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
model_id: str
runtime: t.Literal["ggml", "transformers"]
model_decls: TupleAny
diff --git a/src/openllm/cli.py b/src/openllm/cli.py
index 712fca69..3d4b5925 100644
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -773,7 +773,6 @@ def noop_command(
def prerequisite_check(
ctx: click.Context,
llm_config: openllm.LLMConfig,
- env: EnvVarMixin,
gpu_available: tuple[str, ...],
quantize: t.LiteralString | None,
adapter_map: dict[str, str | None] | None,
@@ -785,9 +784,6 @@ def prerequisite_check(
if len(gpu_available) < 1:
_echo(f"Quantization requires at least 1 GPU (got {len(gpu_available)})", fg="red")
ctx.exit(1)
- if env.framework_value != "pt":
- _echo("Quantization is currently only available for PyTorch models.", fg="red")
- ctx.exit(1)
if adapter_map and not is_peft_available():
_echo(
@@ -905,7 +901,7 @@ def start_bento(
config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
)
- prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers)
+ prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers)
# NOTE: This is to set current configuration
start_env = os.environ.copy()
@@ -1037,7 +1033,7 @@ def start_model(
config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
)
- prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers)
+ prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers)
# NOTE: This is to set current configuration
start_env = os.environ.copy()
@@ -1151,7 +1147,7 @@ def start_model(
@output_option
@quantize_option(click)
@click.option("--machine", is_flag=True, default=False, hidden=True)
-@click.option("--implementation", type=click.Choice(["pt", "tf", "flax"]), default=None, hidden=True)
+@click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, hidden=True)
def download_models_command(
model: str,
model_id: str | None,
@@ -1193,7 +1189,7 @@ def download_models_command(
> only use this option if you want the weight to be quantized by default. Note that OpenLLM also
> support on-demand quantisation during initial startup.
"""
- impl: t.Literal["pt", "tf", "flax"] = first_not_none(implementation, default=EnvVarMixin(model).framework_value)
+ impl: LiteralRuntime = first_not_none(implementation, default=EnvVarMixin(model).framework_value)
llm = openllm.infer_auto_class(impl).for_model(
model,
model_id=model_id,
@@ -1263,7 +1259,7 @@ def _start(
runtime: t.Literal["ggml", "transformers"] = ...,
fast: bool = ...,
adapter_map: dict[t.LiteralString, str | None] | None = ...,
- framework: t.Literal["flax", "tf", "pt"] | None = ...,
+ framework: LiteralRuntime | None = ...,
additional_args: ListStr | None = ...,
_serve_grpc: bool = ...,
__test__: t.Literal[False] = ...,
@@ -1284,7 +1280,7 @@ def _start(
runtime: t.Literal["ggml", "transformers"] = ...,
fast: bool = ...,
adapter_map: dict[t.LiteralString, str | None] | None = ...,
- framework: t.Literal["flax", "tf", "pt"] | None = ...,
+ framework: LiteralRuntime | None = ...,
additional_args: ListStr | None = ...,
_serve_grpc: bool = ...,
__test__: t.Literal[True] = ...,
@@ -1304,7 +1300,7 @@ def _start(
runtime: t.Literal["ggml", "transformers"] = "transformers",
fast: bool = False,
adapter_map: dict[t.LiteralString, str | None] | None = None,
- framework: t.Literal["flax", "tf", "pt"] | None = None,
+ framework: LiteralRuntime | None = None,
additional_args: ListStr | None = None,
_serve_grpc: bool = False,
__test__: bool = False,
@@ -1615,6 +1611,13 @@ start, start_grpc, build, import_model, list_models = (
help="The output format for 'openllm build'. By default this will build a BentoLLM. 'container' is the shortcut of 'openllm build && bentoml containerize'.",
hidden=not get_debug_mode(),
)
+@click.option(
+ "--push",
+ default=False,
+ is_flag=True,
+ type=click.BOOL,
+    help="Whether to push the resulting Bento to BentoCloud. Make sure to log in with 'bentoml cloud login' first.",
+)
@click.pass_context
def build_command(
ctx: click.Context,
@@ -1632,6 +1635,7 @@ def build_command(
model_version: str | None,
dockerfile_template: t.TextIO | None,
format: t.Literal["bento", "container"],
+ push: bool,
**attrs: t.Any,
):
"""Package a given models into a Bento.
@@ -1788,7 +1792,12 @@ def build_command(
else:
_echo(bento.tag)
- if format == "container":
+ if format == "container" and push:
+ ctx.fail("'--format=container' and '--push' are mutually exclusive.")
+ if push:
+ client = BentoMLContainer.bentocloud_client.get()
+ client.push_bento(bento)
+ elif format == "container":
backend = os.getenv("BENTOML_CONTAINERIZE_BACKEND", "docker")
_echo(f"Building {bento} into a LLMContainer using backend '{backend}'", fg="magenta")
if not bentoml.container.health(backend):
diff --git a/src/openllm/utils/__init__.py b/src/openllm/utils/__init__.py
index 2a1f3a45..69ba80ca 100644
--- a/src/openllm/utils/__init__.py
+++ b/src/openllm/utils/__init__.py
@@ -99,11 +99,8 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An
def gpu_count() -> tuple[str, ...]:
- from bentoml._internal.resource import NvidiaGpuResource
-
- cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
- if cuda_visible_devices is not None:
- return tuple(i for i in cuda_visible_devices.split(","))
+ """Return available GPU under system. Currently only supports NVIDIA GPUs."""
+ from .._strategies import NvidiaGpuResource
return tuple(NvidiaGpuResource.from_system())
@@ -417,6 +414,7 @@ _import_structure = {
"is_jupytext_available",
"is_notebook_available",
"is_triton_available",
+ "is_autogptq_available",
"require_backends",
],
}
@@ -443,6 +441,7 @@ if t.TYPE_CHECKING:
from .import_utils import OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES
from .import_utils import DummyMetaclass as DummyMetaclass
from .import_utils import EnvVarMixin as EnvVarMixin
+ from .import_utils import is_autogptq_available as is_autogptq_available
from .import_utils import is_bitsandbytes_available as is_bitsandbytes_available
from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available
from .import_utils import is_datasets_available as is_datasets_available
diff --git a/src/openllm/utils/codegen.py b/src/openllm/utils/codegen.py
index 56d0ee75..ffee6a0d 100644
--- a/src/openllm/utils/codegen.py
+++ b/src/openllm/utils/codegen.py
@@ -252,7 +252,7 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]):
def generate_unique_filename(cls: type[t.Any], func_name: str):
- return f"<{cls.__name__} generated {func_name} {cls.__module__}." f"{getattr(cls, '__qualname__', cls.__name__)}>"
+ return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>"
def generate_function(
@@ -332,6 +332,7 @@ def make_env_transformer(
def gen_sdk(func: t.Callable[P, t.Any], name: str | None = None, **attrs: t.Any):
+ """Enhance function with nicer Repr."""
from .representation import ReprMixin
if name is None:
diff --git a/src/openllm/utils/import_utils.py b/src/openllm/utils/import_utils.py
index b17b6409..cfbc4dfd 100644
--- a/src/openllm/utils/import_utils.py
+++ b/src/openllm/utils/import_utils.py
@@ -56,16 +56,17 @@ else:
logger = logging.getLogger(__name__)
OPTIONAL_DEPENDENCIES = {
+ "chatglm",
+ "falcon",
+ "mpt",
+ "starcoder",
"fine-tune",
"flan-t5",
- "mpt",
- "falcon",
- "starcoder",
- "chatglm",
- "openai",
- "agents",
- "playground",
"ggml",
+ "agents",
+ "openai",
+ "playground",
+ "gptq",
}
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
@@ -100,6 +101,7 @@ _triton_available = _is_package_available("triton")
_jupyter_available = _is_package_available("jupyter")
_jupytext_available = _is_package_available("jupytext")
_notebook_available = _is_package_available("notebook")
+_autogptq_available = _is_package_available("auto-gptq")
def is_transformers_supports_kbit() -> bool:
@@ -146,6 +148,10 @@ def is_bitsandbytes_available():
return _bitsandbytes_available
+def is_autogptq_available():
+ return _autogptq_available
+
+
def is_torch_available():
global _torch_available
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
@@ -309,6 +315,11 @@ You can install it with pip: `pip install bitsandbytes`. Please note that you ma
your runtime after installation.
"""
+AUTOGPTQ_IMPORT_ERROR = """{0} requires the auto-gptq library but it was not found in your environment.
+You can install it with pip: `pip install auto-gptq`. Please note that you may need to restart
+your runtime after installation.
+"""
+
BACKENDS_MAPPING = BackendOrderredDict(
[
("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
@@ -320,6 +331,7 @@ BACKENDS_MAPPING = BackendOrderredDict(
("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
+ ("auto-gptq", (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)),
]
)
diff --git a/tests/strategies_test.py b/tests/strategies_test.py
index 8ae288a0..0a3f4252 100644
--- a/tests/strategies_test.py
+++ b/tests/strategies_test.py
@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import annotations
+import sys
import typing as t
import pytest
@@ -25,14 +26,127 @@ import bentoml
from bentoml._internal.resource import get_resource
from openllm import _strategies as strategy
from openllm._strategies import CascadingResourceStrategy
+from openllm._strategies import NvidiaGpuResource
+
+
+def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as mcls:
+ mcls.setenv("CUDA_VISIBLE_DEVICES", "0,1")
+ resource = NvidiaGpuResource.from_system()
+ assert len(resource) == 2
+ assert resource == ["0", "1"]
+ mcls.delenv("CUDA_VISIBLE_DEVICES")
+
+
+def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as mcls:
+ mcls.setenv("CUDA_VISIBLE_DEVICES", "0,2,-1,1")
+ resource = NvidiaGpuResource.from_system()
+ assert len(resource) == 2
+ assert resource == ["0", "2"]
+ mcls.delenv("CUDA_VISIBLE_DEVICES")
+
+
+def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as mcls:
+ mcls.setenv("CUDA_VISIBLE_DEVICES", "-1")
+ resource = NvidiaGpuResource.from_system()
+ assert len(resource) == 0
+ assert resource == []
+ mcls.delenv("CUDA_VISIBLE_DEVICES")
+
+
+def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as mcls:
+ mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43-ac33420d4628")
+ resource = NvidiaGpuResource.from_system()
+ assert len(resource) == 1
+ assert resource == ["GPU-5ebe9f43-ac33420d4628"]
+ mcls.delenv("CUDA_VISIBLE_DEVICES")
+ with monkeypatch.context() as mcls:
+ mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43,GPU-ac33420d4628")
+ resource = NvidiaGpuResource.from_system()
+ assert len(resource) == 2
+ assert resource == ["GPU-5ebe9f43", "GPU-ac33420d4628"]
+ mcls.delenv("CUDA_VISIBLE_DEVICES")
+ with monkeypatch.context() as mcls:
+ mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43,-1,GPU-ac33420d4628")
+ resource = NvidiaGpuResource.from_system()
+ assert len(resource) == 1
+ assert resource == ["GPU-5ebe9f43"]
+ mcls.delenv("CUDA_VISIBLE_DEVICES")
+ with monkeypatch.context() as mcls:
+ mcls.setenv("CUDA_VISIBLE_DEVICES", "MIG-GPU-5ebe9f43-ac33420d4628")
+ resource = NvidiaGpuResource.from_system()
+ assert len(resource) == 1
+ assert resource == ["MIG-GPU-5ebe9f43-ac33420d4628"]
+ mcls.delenv("CUDA_VISIBLE_DEVICES")
+
+
+def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as mcls:
+        # To make this test work on systems that have a GPU
+ mcls.setenv("CUDA_VISIBLE_DEVICES", "")
+ assert len(NvidiaGpuResource.from_system()) >= 0 # TODO: real from_system tests
+
+ assert pytest.raises(
+ ValueError,
+ NvidiaGpuResource.validate,
+ [*NvidiaGpuResource.from_system(), 1],
+ ).match("Input list should be all string type.")
+ assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match(
+ "Input list should be all string type."
+ )
+ assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["GPU-5ebe9f43", "GPU-ac33420d4628"]).match(
+ "Failed to parse available GPUs UUID"
+ )
+
+
+def test_nvidia_gpu_validate_no_gpu_available():
+ assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["0", "1"]).match("Failed to get device *")
+
+
+@pytest.mark.skipif(sys.platform != "darwin", reason="Test NVIDIA validation on Darwin only")
+def test_nvidia_gpu_validation_on_darwin():
+ assert pytest.raises(RuntimeError, NvidiaGpuResource.validate, ["0"]).match(
+ "GPU is not available on Darwin system."
+ )
+
+
+def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as mcls:
+        # To make this test work on systems that have a GPU
+ mcls.setenv("CUDA_VISIBLE_DEVICES", "")
+ assert NvidiaGpuResource.from_spec(1) == ["0"]
+ assert NvidiaGpuResource.from_spec("5") == ["0", "1", "2", "3", "4"]
+ assert NvidiaGpuResource.from_spec(1) == ["0"]
+ assert NvidiaGpuResource.from_spec(2) == ["0", "1"]
+ assert NvidiaGpuResource.from_spec("3") == ["0", "1", "2"]
+ assert NvidiaGpuResource.from_spec([1, 3]) == ["1", "3"]
+ assert NvidiaGpuResource.from_spec(["1", "3"]) == ["1", "3"]
+ assert NvidiaGpuResource.from_spec(-1) == []
+ assert NvidiaGpuResource.from_spec("-1") == []
+ assert NvidiaGpuResource.from_spec("") == []
+ assert NvidiaGpuResource.from_spec("-2") == []
+ assert NvidiaGpuResource.from_spec("GPU-288347ab") == ["GPU-288347ab"]
+ assert NvidiaGpuResource.from_spec("GPU-288347ab,-1,GPU-ac33420d4628") == ["GPU-288347ab"]
+ assert NvidiaGpuResource.from_spec("GPU-288347ab,GPU-ac33420d4628") == ["GPU-288347ab", "GPU-ac33420d4628"]
+ assert NvidiaGpuResource.from_spec("MIG-GPU-288347ab") == ["MIG-GPU-288347ab"]
+
+ with pytest.raises(TypeError):
+ NvidiaGpuResource.from_spec((1, 2, 3))
+ with pytest.raises(TypeError):
+ NvidiaGpuResource.from_spec(1.5)
+ with pytest.raises(ValueError):
+ assert NvidiaGpuResource.from_spec(-2)
class GPURunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu")
-def unvalidated_get_resource(x: dict[str, t.Any], y: str):
- return get_resource(x, y, validate=False)
+def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False):
+ return get_resource(x, y, validate=validate)
@pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
diff --git a/tools/dependencies.py b/tools/dependencies.py
new file mode 100755
index 00000000..13aa25d2
--- /dev/null
+++ b/tools/dependencies.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from __future__ import annotations
+
+import dataclasses
+import os
+import shutil
+import subprocess
+import typing as t
+
+import inflection
+import tomlkit
+
+import openllm
+
+if t.TYPE_CHECKING:
+ from tomlkit.items import Array
+ from tomlkit.items import Table
+
+
+ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+@dataclasses.dataclass(frozen=True)
+class Classifier:
+ identifier: t.Dict[str, str] = dataclasses.field(
+ default_factory=lambda: {
+ "status": "Development Status",
+ "environment": "Environment",
+ "license": "License",
+ "topic": "Topic",
+ "os": "Operating System",
+ "audience": "Intended Audience",
+ "typing": "Typing",
+ "language": "Programming Language",
+ }
+ )
+
+ joiner: str = " :: "
+
+ @staticmethod
+ def status() -> dict[int, str]:
+ return {
+ v: status
+ for v, status in zip(
+ range(1, 8),
+ [
+ "1 - Planning",
+ "2 - Pre-Alpha",
+ "3 - Alpha",
+ "4 - Beta",
+ "5 - Production/Stable",
+ "6 - Mature",
+ "7 - Inactive",
+ ],
+ )
+ }
+
+ @staticmethod
+ def apache() -> str:
+ return Classifier.create_classifier("license", "OSI Approved", "Apache Software License")
+
+ @staticmethod
+ def create_classifier(identifier: str, *decls: t.Any) -> str:
+ cls_ = Classifier()
+ if identifier not in cls_.identifier:
+ raise ValueError(f"{identifier} is not yet supported (supported alias: {Classifier.identifier})")
+ return cls_.joiner.join([cls_.identifier[identifier], *decls])
+
+ @staticmethod
+ def create_python_classifier(
+ implementation: list[str] | None = None, supported_version: list[str] | None = None
+ ) -> list[str]:
+ if supported_version is None:
+ supported_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
+ if implementation is None:
+ implementation = ["CPython", "PyPy"]
+ base = [
+ Classifier.create_classifier("language", "Python"),
+ Classifier.create_classifier("language", "Python", "3"),
+ ]
+ base.append(Classifier.create_classifier("language", "Python", "3", "Only"))
+ base.extend([Classifier.create_classifier("language", "Python", version) for version in supported_version])
+ base.extend(
+ [Classifier.create_classifier("language", "Python", "Implementation", impl) for impl in implementation]
+ )
+ return base
+
+ @staticmethod
+ def create_status_classifier(level: int) -> str:
+ return Classifier.create_classifier("status", Classifier.status()[level])
+
+
+@dataclasses.dataclass(frozen=True)
+class Dependencies:
+ name: str
+ git_repo_url: t.Optional[str] = None
+ branch: t.Optional[str] = None
+ extensions: t.Optional[t.List[str]] = None
+ subdirectory: t.Optional[str] = None
+ requires_gpu: bool = False
+ lower_constraint: t.Optional[str] = None
+ platform: t.Optional[t.Tuple[t.Literal["Linux", "Windows", "Darwin"], t.Literal["eq", "ne"]]] = None
+
+ def with_options(self, **kwargs: t.Any) -> Dependencies:
+ return dataclasses.replace(self, **kwargs)
+
+ @property
+ def has_constraint(self) -> bool:
+ return self.lower_constraint is not None
+
+ @property
+ def pypi_extensions(self) -> str:
+ return "" if self.extensions is None else f"[{','.join(self.extensions)}]"
+
+ @staticmethod
+ def platform_restriction(platform: t.LiteralString, op: t.Literal["eq", "ne"] = "eq") -> str:
+ return f'platform_system{"==" if op == "eq" else "!="}"{platform}"'
+
+ def to_str(self) -> str:
+ deps: list[str] = []
+ if self.lower_constraint is not None:
+ deps.append(f"{self.name}{self.pypi_extensions}>={self.lower_constraint}")
+ elif self.subdirectory is not None:
+ deps.append(
+ f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}"
+ )
+ elif self.branch is not None:
+ deps.append(
+ f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}"
+ )
+ else:
+ deps.append(f"{self.name}{self.pypi_extensions}")
+
+ if self.platform:
+ deps.append(self.platform_restriction(*self.platform))
+
+ return ";".join(deps)
+
+ @classmethod
+ def from_tuple(cls, *decls: t.Any) -> Dependencies:
+ return cls(*decls)
+
+
+_BENTOML_EXT = ["grpc", "io"]
+_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"]
+
+_BASE_DEPENDENCIES = [
+ Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"),
+ Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"),
+ Dependencies(name="optimum"),
+ Dependencies(name="attrs", lower_constraint="23.1.0"),
+ Dependencies(name="cattrs", lower_constraint="23.1.0"),
+ Dependencies(name="orjson"),
+ Dependencies(name="inflection"),
+ Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"),
+ Dependencies(name="httpx"),
+ Dependencies(name="typing_extensions"),
+ Dependencies(name="cuda-python", platform=("Darwin", "ne")),
+]
+
+_NIGHTLY_MAPPING: dict[str, Dependencies] = {
+ "bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT),
+ "peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None),
+ "transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT),
+ "optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None),
+ "accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None),
+ "bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None),
+ "trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None),
+ "triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True),
+}
+
+_ALL_RUNTIME_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
+FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
+FLAN_T5_DEPS = _ALL_RUNTIME_DEPS
+OPT_DEPS = _ALL_RUNTIME_DEPS
+MPT_DEPS = ["triton", "einops"]
+OPENAI_DEPS = ["openai", "tiktoken"]
+AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
+FALCON_DEPS = ["einops", "xformers", "safetensors"]
+STARCODER_DEPS = ["bitsandbytes"]
+CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"]
+PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
+GGML_DEPS = ["ctransformers"]
+GPTQ_DEPS = ["auto-gptq", "triton"]
+VLLM_DEPS = ["vllm"]
+
+_base_requirements = {
+ inflection.dasherize(name): config_cls.__openllm_requirements__
+ for name, config_cls in openllm.CONFIG_MAPPING.items()
+ if config_cls.__openllm_requirements__
+}
+
+# shallow copy from locals()
+_locals = locals().copy()
+
+# NOTE: update this table when adding new external dependencies
+# sync with openllm.utils.OPTIONAL_DEPENDENCIES
+_base_requirements.update(
+ {v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES}
+)
+
+fname = f"{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}"
+
+
+def create_classifiers() -> Array:
+ arr = tomlkit.array()
+ arr.extend(
+ [
+ Classifier.create_status_classifier(5),
+ Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA"),
+ Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "12"),
+ Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.8"),
+ Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.7"),
+ Classifier.apache(),
+ Classifier.create_classifier("topic", "Scientific/Engineering", "Artificial Intelligence"),
+ Classifier.create_classifier("topic", "Software Development", "Libraries"),
+ Classifier.create_classifier("os", "OS Independent"),
+ Classifier.create_classifier("audience", "Developers"),
+ Classifier.create_classifier("audience", "Science/Research"),
+ Classifier.create_classifier("audience", "System Administrators"),
+ Classifier.create_classifier("typing", "Typed"),
+ *Classifier.create_python_classifier(),
+ ]
+ )
+ return arr.multiline(True)
+
+
+def create_optional_table() -> Table:
+ table = tomlkit.table()
+ table.update(_base_requirements)
+
+ all_array = tomlkit.array()
+ all_array.extend([f"openllm[{k}]" for k in table.keys()])
+ table.add("all", all_array.multiline(True))
+ return table
+
+
+def main() -> int:
+ with open(os.path.join(ROOT, "pyproject.toml"), "r") as f:
+ pyproject = tomlkit.parse(f.read())
+
+ t.cast("Table", pyproject["project"]).update(
+ {
+ "classifiers": create_classifiers(),
+ "optional-dependencies": create_optional_table(),
+ "dependencies": tomlkit.array(f"{[v.to_str() for v in _BASE_DEPENDENCIES]}").multiline(True),
+ }
+ )
+ with open(os.path.join(ROOT, "pyproject.toml"), "w") as f:
+ f.write(tomlkit.dumps(pyproject))
+
+ with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f:
+ f.write(f"# This file is generated by `{fname}`. DO NOT EDIT\n-e .[playground,flan-t5]\n")
+ f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu])
+ with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f:
+        f.write(f"# This file is generated by `{fname}`. DO NOT EDIT\n")
+ f.write(
+            "# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refer to their official installation for your specific setup.\n"
+ )
+ f.write("-r nightly-requirements.txt\n-e .[all]\n")
+ f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu])
+
+ if shutil.which("taplo"):
+ return subprocess.check_call(["taplo", "format", os.path.join(ROOT, "pyproject.toml")])
+
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/tools/update-optional-dependencies.py b/tools/update-optional-dependencies.py
deleted file mode 100755
index 82fff857..00000000
--- a/tools/update-optional-dependencies.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2023 BentoML Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import dataclasses
-import os
-import shutil
-import typing as t
-
-import inflection
-import tomlkit
-
-import openllm
-
-
-ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-
-
-@dataclasses.dataclass(frozen=True)
-class Dependencies:
- name: str
- git_repo_url: t.Optional[str] = None
- branch: t.Optional[str] = None
- extensions: t.Optional[t.List[str]] = None
- subdirectory: t.Optional[str] = None
- requires_gpu: bool = False
- lower_constraint: t.Optional[str] = None
-
- def with_options(self, **kwargs: t.Any) -> Dependencies:
- return dataclasses.replace(self, **kwargs)
-
- @property
- def has_constraint(self) -> bool:
- return self.lower_constraint is not None
-
- @property
- def pypi_extensions(self) -> str:
- return "" if self.extensions is None else f"[{','.join(self.extensions)}]"
-
- def to_str(self) -> str:
- if self.lower_constraint is not None:
- return f"{self.name}{self.pypi_extensions}>={self.lower_constraint}"
- elif self.subdirectory is not None:
- return f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}"
- elif self.branch is not None:
- return f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}"
- else:
- return f"{self.name}{self.pypi_extensions}"
-
- @classmethod
- def from_tuple(cls, *decls: t.Any) -> Dependencies:
- return cls(*decls)
-
-
-_BENTOML_EXT = ["grpc", "io"]
-_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"]
-
-_BASE_DEPENDENCIES = [
- Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"),
- Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"),
- Dependencies(name="optimum"),
- Dependencies(name="attrs", lower_constraint="23.1.0"),
- Dependencies(name="cattrs", lower_constraint="23.1.0"),
- Dependencies(name="orjson"),
- Dependencies(name="inflection"),
- Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"),
- Dependencies(name="httpx"),
- Dependencies(name="typing_extensions"),
-]
-
-_NIGHTLY_MAPPING: dict[str, Dependencies] = {
- "bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT),
- "peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None),
- "transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT),
- "optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None),
- "accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None),
- "bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None),
- "trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None),
- "triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True),
-}
-
-FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
-FLAN_T5_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
-MPT_DEPS = ["triton", "einops"]
-OPENAI_DEPS = ["openai", "tiktoken"]
-AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
-FALCON_DEPS = ["einops", "xformers", "safetensors"]
-STARCODER_DEPS = ["bitsandbytes"]
-CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"]
-PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
-GGML_DEPS = ["ctransformers"]
-
-_base_requirements = {
- inflection.dasherize(name): config_cls.__openllm_requirements__
- for name, config_cls in openllm.CONFIG_MAPPING.items()
- if config_cls.__openllm_requirements__
-}
-
-# shallow copy from locals()
-_locals = locals().copy()
-
-# NOTE: update this table when adding new external dependencies
-# sync with openllm.utils.OPTIONAL_DEPENDENCIES
-_base_requirements.update(
- {v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES}
-)
-
-
-def main() -> int:
- with open(os.path.join(ROOT, "pyproject.toml"), "r") as f:
- pyproject = tomlkit.parse(f.read())
-
- table = tomlkit.table()
- for name, config in _base_requirements.items():
- table.add(name, config)
-
- table.add("all", [f"openllm[{k}]" for k in table.keys()])
-
- pyproject["project"]["optional-dependencies"] = table
-
- # write project dependencies
- pyproject["project"]["dependencies"] = [v.to_str() for v in _BASE_DEPENDENCIES]
- with open(os.path.join(ROOT, "pyproject.toml"), "w") as f:
- f.write(tomlkit.dumps(pyproject))
-
- with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f:
- f.write(
- "# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT\n-e .[playground,flan-t5]\n"
- )
- f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu])
- with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f:
- f.write("# This file is generated by `./tools/update-optional-dependencies.py`. # DO NOT EDIT\n")
- f.write(
- "# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n"
- )
- f.write("-r nightly-requirements.txt\n-e .[all]\n")
- f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu])
-
- if shutil.which("taplo"):
- return os.system(f"taplo fmt {os.path.join(ROOT, 'pyproject.toml')}")
-
- return 0
-
-
-if __name__ == "__main__":
- raise SystemExit(main())
diff --git a/typings/cuda/__init__.pyi b/typings/cuda/__init__.pyi
new file mode 100644
index 00000000..e76cccb8
--- /dev/null
+++ b/typings/cuda/__init__.pyi
@@ -0,0 +1,2 @@
+from . import cuda as cuda
+from . import cudart as cudart
diff --git a/typings/cuda/cuda.pyi b/typings/cuda/cuda.pyi
new file mode 100644
index 00000000..982d3e5f
--- /dev/null
+++ b/typings/cuda/cuda.pyi
@@ -0,0 +1,26 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+
+class CUresult(Enum):
+ CUDA_SUCCESS = 0
+
+class _CUMixin:
+ def getPtr(self) -> int: ...
+
+class CUdevice(_CUMixin): ...
+
+def cuDeviceGetCount() -> tuple[CUresult, int]: ...
+def cuDeviceGet(dev: int) -> tuple[CUresult, CUdevice]: ...