fix(resource): correctly parse CUDA_VISIBLE_DEVICES (#114)

This commit is contained in:
Aaron Pham
2023-07-15 07:19:35 -04:00
committed by GitHub
parent b291526248
commit b2dba6143f
23 changed files with 903 additions and 318 deletions

View File

@@ -13,6 +13,7 @@ env:
APP_NAME: openllm
PYTHON_VERSION: '3.11'
PYOXIDIZER_VERSION: '0.24.0'
HATCH_VERBOSE: 10
jobs:
python-artifacts:
name: Build wheel and source distribution

View File

@@ -24,6 +24,7 @@ env:
OPENLLM_DO_NOT_TRACK: True
PYTHONUNBUFFERED: '1'
STABLE_PYTHON_VERSION: '3.11'
HATCH_VERBOSE: 10
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
defaults:
run:
@@ -31,7 +32,6 @@ defaults:
jobs:
quality:
runs-on: ubuntu-latest
if: github.event_name == 'pull_request'
name: quality-check
steps:
- uses: actions/checkout@v3
@@ -43,6 +43,8 @@ jobs:
python-version: ${{ env.STABLE_PYTHON_VERSION }}
- name: Run type check
run: hatch run typing
- if: failure()
run: echo "Not failing quality workflow."
tests:
runs-on: ubuntu-latest
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}

View File

@@ -28,6 +28,8 @@ on:
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
env:
HATCH_VERBOSE: 10
jobs:
release:
if: github.repository_owner == 'bentoml'

View File

@@ -25,6 +25,8 @@ on:
tags:
required: true
type: string
env:
HATCH_VERBOSE: 10
defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}

View File

@@ -266,7 +266,7 @@ pip install "openllm[mpt]"
<td>
```bash
pip install openllm
pip install "openllm[opt]"
```
</td>

7
changelog.d/114.fix.md Normal file
View File

@@ -0,0 +1,7 @@
Fixes resources to correctly follow the CUDA_VISIBLE_DEVICES spec
OpenLLM now contains a standalone parser that mimics the `torch.cuda` parser for setting
GPU devices. This parser will be used to parse both AMD and NVIDIA GPUs.
`openllm` should now be able to parse `GPU-` and `MIG-` UUIDs from both
configuration and spec.

View File

@@ -26,8 +26,8 @@ features = ['flan-t5']
[envs.default.scripts]
changelog = "towncrier build --version main --draft"
quality = [
"./tools/dependencies.py",
"./tools/update-readme.py",
"./tools/update-optional-dependencies.py",
"./tools/update-config-stubs.py",
"./tools/update-models-import.py",
"- ./tools/add-license-headers .",
@@ -42,6 +42,7 @@ extra-dependencies = [
]
[envs.tests.scripts]
_run_script = "pytest --cov --cov-report={env:COVERAGE_REPORT:term-missing} --cov-config=pyproject.toml"
distributed = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -n 3 -r aR {args:tests}"
models = "_run_script -r aR {args:tests/models}"
python = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -r aR {args:tests}"
[envs.tests.overrides]

View File

@@ -1,4 +1,4 @@
# This file is generated by `./tools/update-optional-dependencies.py`. # DO NOT EDIT
# This file is generated by `tools/dependencies.py`. # DO NOT EDIT
# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.
-r nightly-requirements.txt
-e .[all]

View File

@@ -1,4 +1,4 @@
# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT
# This file is generated by `tools/dependencies.py`. DO NOT EDIT
-e .[playground,flan-t5]
bentoml[grpc,io] @ git+https://github.com/bentoml/bentoml.git@main
peft @ git+https://github.com/huggingface/peft.git@main

View File

@@ -1,3 +1,5 @@
# NOTE: The following are managed by ./tools/dependencies.py
# project.classifiers, project.dependencies, project.optional-dependencies
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
@@ -29,18 +31,18 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
# NOTE: The below is managed by ./tools/update-optional-dependencies.py
dependencies = [
"bentoml[grpc,io]>=1.0.22",
"transformers[torch,tokenizers,accelerate]>=4.29.0",
"optimum",
"attrs>=23.1.0",
"cattrs>=23.1.0",
"orjson",
"inflection",
"tabulate[widechars]>=0.9.0",
"httpx",
"typing_extensions",
'bentoml[grpc,io]>=1.0.22',
'transformers[torch,tokenizers,accelerate]>=4.29.0',
'optimum',
'attrs>=23.1.0',
'cattrs>=23.1.0',
'orjson',
'inflection',
'tabulate[widechars]>=0.9.0',
'httpx',
'typing_extensions',
'cuda-python;platform_system!="Darwin"',
]
description = 'OpenLLM: Operating LLMs in production'
dynamic = ["version"]
@@ -62,9 +64,6 @@ license = "Apache-2.0"
name = "openllm"
readme = "README.md"
requires-python = ">=3.8"
# NOTE: Don't modify project.optional-dependencies
# as it is managed by ./tools/update-optional-dependencies.py
[project.optional-dependencies]
agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
all = [
@@ -72,22 +71,28 @@ all = [
"openllm[falcon]",
"openllm[mpt]",
"openllm[starcoder]",
"openllm[opt]",
"openllm[flan-t5]",
"openllm[fine-tune]",
"openllm[vllm]",
"openllm[agents]",
"openllm[playground]",
"openllm[ggml]",
"openllm[playground]",
"openllm[openai]",
"openllm[gptq]",
]
chatglm = ["cpm-kernels", "sentencepiece"]
falcon = ["einops", "xformers", "safetensors"]
fine-tune = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
ggml = ["ctransformers"]
gptq = ["auto-gptq", "triton"]
mpt = ["triton", "einops"]
openai = ["openai", "tiktoken"]
opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
starcoder = ["bitsandbytes"]
vllm = ["vllm"]
[project.urls]
Documentation = "https://github.com/bentoml/openllm#readme"

View File

@@ -71,6 +71,7 @@ else:
from typing_extensions import overload
if t.TYPE_CHECKING:
import auto_gptq as autogptq
import peft
import torch
@@ -96,6 +97,8 @@ else:
UserDictAny = collections.UserDict
LLMRunnable = bentoml.Runnable
LLMRunner = bentoml.Runner
autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
transformers = LazyLoader("transformers", globals(), "transformers")
torch = LazyLoader("torch", globals(), "torch")
peft = LazyLoader("peft", globals(), "peft")
@@ -445,7 +448,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
"""The config instance to use for this LLM. This will be created based on config_class and available
when initialising the LLM."""
quantization_config: transformers.BitsAndBytesConfig | None
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
"""Quantisation config for quantised model on the fly."""
_model_id: str
@@ -548,6 +551,44 @@ class LLM(LLMInterface[M, T], ReprMixin):
openllm.serialisation.save_pretrained(self, save_directory, **attrs)
@classmethod
@overload
def from_pretrained(
cls,
model_id: str | None = ...,
model_version: str | None = ...,
llm_config: openllm.LLMConfig | None = ...,
*args: t.Any,
runtime: t.Literal["ggml", "transformers"] | None = ...,
quantize: t.Literal["int8", "int4"] = ...,
bettertransformer: str | bool | None = ...,
adapter_id: str | None = ...,
adapter_name: str | None = ...,
adapter_map: dict[str, str | None] | None = ...,
quantization_config: transformers.BitsAndBytesConfig | None = ...,
**attrs: t.Any,
) -> LLM[M, T]:
...
@classmethod
@overload
def from_pretrained(
cls,
model_id: str | None = ...,
model_version: str | None = ...,
llm_config: openllm.LLMConfig | None = ...,
*args: t.Any,
runtime: t.Literal["ggml", "transformers"] | None = ...,
quantize: t.Literal["gptq"] = ...,
bettertransformer: str | bool | None = ...,
adapter_id: str | None = ...,
adapter_name: str | None = ...,
adapter_map: dict[str, str | None] | None = ...,
quantization_config: autogptq.BaseQuantizeConfig | None = ...,
**attrs: t.Any,
) -> LLM[M, T]:
...
@classmethod
def from_pretrained(
cls,
@@ -561,7 +602,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
adapter_id: str | None = None,
adapter_name: str | None = None,
adapter_map: dict[str, str | None] | None = None,
quantization_config: transformers.BitsAndBytesConfig | None = None,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
**attrs: t.Any,
) -> LLM[M, T]:
"""Instantiate a pretrained LLM.
@@ -577,6 +618,17 @@ class LLM(LLMInterface[M, T], ReprMixin):
> Currently, the above two options are mutually exclusive.
#### Quantisation options
For customising options for quantisation config, ``openllm.LLM`` accepts all arbitrary arguments that is passed to ``transformers.BitsAndBytesConfig``
plus ``quantize`` value. For example, for ``int8`` quantisation, specify the following:
```python
model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False)
```
For all GPTQ-related options, it accepts all value prefixed with `gptq_*`. The parsed value then could be parsed
to ``auto_gptq.BaseQuantizeConfig``.
### Adapter options:
> This is used in conjunction with the fine-tuning features
@@ -689,7 +741,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
model_id: str,
llm_config: openllm.LLMConfig,
bettertransformer: bool | None,
quantization_config: transformers.BitsAndBytesConfig | None,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
_adapters_mapping: AdaptersMapping | None,
_tag: bentoml.Tag,
_quantize_method: t.Literal["int8", "int4", "gptq"] | None,

View File

@@ -13,15 +13,26 @@
# limitations under the License.
from __future__ import annotations
import logging
import sys
import typing as t
from .utils import LazyLoader
from .utils import is_autogptq_available
from .utils import is_bitsandbytes_available
from .utils import is_transformers_supports_kbit
from .utils import pkg
# NOTE: We need to do this so that overload can register
# correct overloads to typing registry
if sys.version_info[:2] >= (3, 11):
from typing import overload
else:
from typing_extensions import overload
if t.TYPE_CHECKING:
import auto_gptq as autogptq
import torch
import openllm
@@ -29,6 +40,7 @@ if t.TYPE_CHECKING:
from ._types import DictStrAny
else:
autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
torch = LazyLoader("torch", globals(), "torch")
transformers = LazyLoader("transformers", globals(), "transformers")
@@ -37,15 +49,38 @@ logger = logging.getLogger(__name__)
QuantiseMode = t.Literal["int8", "int4", "gptq"]
@overload
def infer_quantisation_config(
cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["int8", "int4"], **attrs: t.Any
) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
...
@overload
def infer_quantisation_config(
cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["gptq"], **attrs: t.Any
) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
...
def infer_quantisation_config(
cls: type[openllm.LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any
) -> tuple[transformers.BitsAndBytesConfig | t.Any, DictStrAny]:
) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop("llm_int8_threshhold", 6.0)
int8_enable_fp32_cpu_offload = attrs.pop("llm_int8_enable_fp32_cpu_offload", False)
int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)
autogptq_attrs: DictStrAny = {
"bits": attrs.pop("gptq_bits", 4),
"group_size": attrs.pop("gptq_group_size", -1),
"damp_percent": attrs.pop("gptq_damp_percent", 0.01),
"desc_act": attrs.pop("gptq_desc_act", True),
"sym": attrs.pop("gptq_sym", True),
"true_sequential": attrs.pop("gptq_true_sequential", True),
}
def create_int8_config(int8_skip_modules: list[str] | None):
if int8_skip_modules is None:
int8_skip_modules = []
@@ -94,8 +129,15 @@ def infer_quantisation_config(
logger.warning("OpenLLM will fallback to 8-bit quantization.")
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == "gptq":
# TODO: support GPTQ loading quantization
raise NotImplementedError("GPTQ is not supported yet.")
if not is_autogptq_available():
logger.warning(
"'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment)."
" Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback "
"to int8 with bitsandbytes."
)
quantisation_config = create_int8_config(int8_skip_modules)
else:
quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
else:
raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")

View File

@@ -13,11 +13,15 @@
# limitations under the License.
from __future__ import annotations
import functools
import inspect
import logging
import math
import os
import sys
import types
import typing as t
import warnings
import psutil
@@ -27,62 +31,113 @@ from bentoml._internal.resource import system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS
from bentoml._internal.runner.strategy import Strategy
from .exceptions import OpenLLMException
from .utils import LazyLoader
from .utils import LazyType
from .utils import ReprMixin
if t.TYPE_CHECKING:
import torch
import bentoml
ListIntStr = list[int | str]
class DynResource(Resource[t.List[str]], resource_id=""):
resource_id: t.ClassVar[str]
else:
DynResource = Resource[t.List[str]]
torch = LazyLoader("torch", globals(), "torch")
ListIntStr = list
# NOTE: We need to do this so that overload can register
# correct overloads to typing registry
if sys.version_info[:2] >= (3, 11):
from typing import overload
else:
from typing_extensions import overload
logger = logging.getLogger(__name__)
class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"):
@classmethod
def from_spec(cls, spec: t.Any) -> list[str]:
if not isinstance(spec, (int, str, list)):
raise TypeError("AMD GPU device IDs must be int, str or a list specifing the exact GPUs to use.")
def _strtoul(s: str) -> int:
"""Return -1 or positive integer sequence string starts with,."""
if not s:
return -1
for idx, c in enumerate(s):
if not (c.isdigit() or (idx == 0 and c in "+-")):
break
if idx + 1 == len(s):
idx += 1 # noqa: PLW2901
return int(s[:idx]) if idx > 0 else -1 # type: ignore (idx will be set via enumerate)
try:
if isinstance(spec, int):
if spec == -1:
return []
if spec < -1:
raise ValueError
return [str(i) for i in range(spec)]
elif isinstance(spec, str):
try:
return cls.from_spec(int(spec))
except ValueError:
if spec.startswith("GPU"):
return [spec]
raise ValueError
else:
return [str(x) for x in spec]
except ValueError:
raise OpenLLMException(f"Invalid AMD GPU resource limit '{spec}'.")
@classmethod
def from_system(cls) -> list[str]:
"""Retrieve AMD GPU from system, currently only supports on Linux.
This assumes that ROCm is setup correctly.
"""
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
if cuda_visible_devices in ("", "-1"):
def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
rcs: list[str] = []
for elem in lst.split(","):
# Repeated id results in empty set
if elem in rcs:
return []
if cuda_visible_devices is not None:
cuda_visible_devices = cuda_visible_devices.split(",")
if "-1" in cuda_visible_devices:
cuda_visible_devices = cuda_visible_devices[: cuda_visible_devices.index("-1")]
return cuda_visible_devices
# Anything other but prefix is ignored
if not elem.startswith(prefix):
break
rcs.append(elem)
return rcs
_STACK_LEVEL = 3
@overload
def _parse_visible_devices(default_var: str | None = ..., respect_env: t.Literal[True] = True) -> list[str] | None:
    ...

@overload
def _parse_visible_devices(default_var: str = ..., respect_env: t.Literal[False] = False) -> list[str]:
    ...

def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
    """CUDA_VISIBLE_DEVICES aware with default var for parsing spec.

    When ``respect_env`` is True, the CUDA_VISIBLE_DEVICES environment variable
    takes precedence over ``default_var``; an unset/empty value yields ``None``
    ("no restriction"). When False, ``default_var`` itself is parsed as the spec
    and must not be None.
    """
    if respect_env:
        spec = os.getenv("CUDA_VISIBLE_DEVICES", default_var)
        if not spec:
            # No restriction in effect: caller should fall back to system discovery.
            return
    else:
        assert default_var is not None, "spec is required to be not None when parsing spec."  # noqa: S101
        spec = default_var
    # UUID-style entries ("GPU-<uuid>" / "MIG-<uuid>") are parsed as opaque prefixed tokens.
    if spec.startswith("GPU-"):
        return _parse_list_with_prefix(spec, "GPU-")
    if spec.startswith("MIG-"):
        return _parse_list_with_prefix(spec, "MIG-")
    # XXX: We need to somehow handle cases such as '100m'
    # CUDA_VISIBLE_DEVICES uses something like strtoul
    # which makes `1gpu2,2ampere` is equivalent to `1,2`
    rc: list[int] = []
    for el in spec.split(","):
        x = _strtoul(el.strip())
        # Repeated ordinal results in empty set
        if x in rc:
            return []
        # Negative value aborts the sequence
        if x < 0:
            break
        rc.append(x)
    return [str(i) for i in rc]
def _from_system(cls: type[DynResource]) -> list[str]:
"""Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation.
It relies on torch.cuda implementation and in turns respect CUDA_VISIBLE_DEVICES.
"""
if cls.resource_id == "amd.com/gpu":
if not psutil.LINUX:
logger.debug("AMD GPU resource is only supported on Linux.")
warnings.warn("AMD GPUs is currently only supported on Linux.", stacklevel=_STACK_LEVEL)
return []
# ROCm does not currently have the rocm_smi wheel.
@@ -90,37 +145,169 @@ class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"):
# we don't want to use CLI because parsing is a pain.
sys.path.append("/opt/rocm/libexec/rocm_smi")
try:
from ctypes import byref
from ctypes import c_uint32
# refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py
from rsmiBindings import rocmsmi
from rsmiBindings import rsmi_status_t
num = c_uint32(0)
ret = rocmsmi.rsmi_num_monitor_devices(byref(num))
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
return [str(i) for i in range(num.value)]
return []
except Exception as err:
logger.debug("Failed to setup AMD GPU resource: %s", err)
from rsmiBindings import rocmsmi as rocmsmi
except (ModuleNotFoundError, ImportError):
# In this case the binary is not found, returning empty list
return []
finally:
sys.path.remove("/opt/rocm/libexec/rocm_smi")
visible_devices = _parse_visible_devices()
if visible_devices is None:
return [str(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else []
return visible_devices
@classmethod
def validate(cls, val: list[str]):
for gpu_index_or_literal in val:
try:
idx = int(gpu_index_or_literal)
except ValueError:
raise OpenLLMException(f"Invalid AMD GPU device index: {val}")
if int(idx) < 0:
raise OpenLLMException(f"Negative GPU device in {val}.")
if int(idx) >= len(cls.from_system()):
raise OpenLLMException(
f"GPU device index in {val} is greater than the system available: {cls.from_system()}"
)
@overload
def _from_spec(cls: type[DynResource], spec: int) -> list[str]:
    ...

@overload
def _from_spec(cls: type[DynResource], spec: ListIntStr) -> list[str]:
    ...

@overload
def _from_spec(cls: type[DynResource], spec: str) -> list[str]:
    ...

def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
    """Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation.

    The parser behaves similar to how PyTorch handles CUDA_VISIBLE_DEVICES. This means within
    BentoML's resource configuration, its behaviour is similar to CUDA_VISIBLE_DEVICES.

    Accepts an int count, a CUDA_VISIBLE_DEVICES-style string, or a list of
    device ids; always returns a list of device-id strings. Raises ValueError
    for counts below -1 and TypeError for unsupported spec types.
    """
    if isinstance(spec, int):
        # -1 ("disable GPUs") and 0 both mean no devices.
        if spec in (-1, 0):
            return []
        if spec < -1:
            raise ValueError("Spec cannot be < -1.")
        # A count N expands to ordinals 0..N-1.
        return [str(i) for i in range(spec)]
    elif isinstance(spec, str):
        if not spec:
            return []
        if spec.isdigit():
            # Bare digits are a device count: expand to "0,1,...,N-1" before parsing.
            spec = ",".join([str(i) for i in range(_strtoul(spec))])
        return _parse_visible_devices(spec, respect_env=False)
    elif LazyType(ListIntStr).isinstance(spec):
        return [str(x) for x in spec]
    else:
        raise TypeError(
            f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
        )
@functools.lru_cache
def _raw_uuid_nvml() -> list[str] | None:
    """Return list of device UUID as reported by NVML or None if NVML discovery/initialization failed.

    Result is cached for the process lifetime via ``lru_cache``. Device count is
    taken from the CUDA driver API ('cuda-python') while UUIDs come from NVML
    (``libnvidia-ml.so.1``) via ctypes.
    """
    try:
        from cuda import cuda
    except ImportError:
        if sys.platform == "darwin":
            raise RuntimeError("GPU is not available on Darwin system.") from None
        raise RuntimeError(
            "Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly."
        ) from None
    from ctypes import CDLL
    from ctypes import byref
    from ctypes import c_void_p
    from ctypes import create_string_buffer

    nvml_h = CDLL("libnvidia-ml.so.1")
    rc = nvml_h.nvmlInit()
    if rc != 0:
        warnings.warn("Can't initialize NVML", stacklevel=_STACK_LEVEL)
        return
    err, dev_count = cuda.cuDeviceGetCount()
    if err != cuda.CUresult.CUDA_SUCCESS:
        warnings.warn("Failed to get available device from system.", stacklevel=_STACK_LEVEL)
        return
    uuids: list[str] = []
    for idx in range(dev_count):
        dev_id = c_void_p()
        rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
        if rc != 0:
            warnings.warn(f"Failed to get device handle for {idx}", stacklevel=_STACK_LEVEL)
            return
        # 96 bytes is large enough for NVML's UUID string (incl. "GPU-"/"MIG-" prefix).
        buf_len = 96
        buf = create_string_buffer(buf_len)
        rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
        if rc != 0:
            warnings.warn(f"Failed to get device UUID for {idx}", stacklevel=_STACK_LEVEL)
            return
        uuids.append(buf.raw.decode("ascii").strip("\0"))
    # NOTE(review): the NVML handle is dropped without calling nvmlShutdown — presumably
    # acceptable for process lifetime, but worth confirming.
    del nvml_h
    return uuids
def _validate(cls: type[DynResource], val: list[t.Any]):
    """Validate a parsed device-id list against the devices actually present.

    Only supported for NVIDIA resources: UUID entries ("GPU-"/"MIG-") are checked
    against NVML-reported UUIDs, ordinal entries against the CUDA driver API.
    Raises RuntimeError for AMD resources or when CUDA bindings are unavailable,
    ValueError for a non-string or unknown device entry.
    """
    if cls.resource_id == "amd.com/gpu":
        raise RuntimeError(
            "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
        )
    if not all(isinstance(i, str) for i in val):
        raise ValueError("Input list should be all string type.")
    try:
        from cuda import cuda
    except ImportError:
        if sys.platform == "darwin":
            raise RuntimeError("GPU is not available on Darwin system.") from None
        raise RuntimeError(
            "Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly."
        ) from None
    # correctly parse handle
    for el in val:
        if el.startswith("GPU-") or el.startswith("MIG-"):
            # UUID entry: must match one of the UUIDs NVML reports.
            uuids = _raw_uuid_nvml()
            if uuids is None:
                raise ValueError("Failed to parse available GPUs UUID")
            if el not in uuids:
                raise ValueError(f"Given UUID {el} is not found with available UUID (available: {uuids})")
        elif el.isdigit():
            # Ordinal entry: probe the device via the CUDA driver API.
            err, _ = cuda.cuDeviceGet(int(el))
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise ValueError(f"Failed to get device {el}")
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
    """Create a concrete GPU resource class sharing the mixin implementations.

    Builds a subclass of ``DynResource`` + ``ReprMixin`` named ``name`` for the
    given BentoML ``resource_kind`` (e.g. "nvidia.com/gpu"), wiring in the shared
    ``_from_spec``/``_from_system``/``_validate`` functions as classmethods.
    """
    return types.new_class(
        name,
        (DynResource, ReprMixin),
        # resource_id is passed both as a class keyword (for Resource.__init_subclass__)
        # and set on the namespace below.
        {"resource_id": resource_kind},
        lambda ns: ns.update(
            {
                "resource_id": resource_kind,
                "from_spec": classmethod(_from_spec),
                "from_system": classmethod(_from_system),
                "validate": classmethod(_validate),
                "__repr_keys__": property(lambda _: {"resource_id"}),
                "__doc__": inspect.cleandoc(docstring),
                "__module__": "openllm._strategies",
            }
        ),
    )
NvidiaGpuResource = _make_resource_class(
"NvidiaGpuResource",
"nvidia.com/gpu",
"""NVIDIA GPU resource.
This is a modified version of internal's BentoML's NvidiaGpuResource
where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",
)
AmdGpuResource = _make_resource_class(
"AmdGpuResource",
"amd.com/gpu",
"""AMD GPU resource.
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",
)
class CascadingResourceStrategy(Strategy, ReprMixin):
@@ -147,15 +334,21 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
if resource_request is None:
resource_request = system_resources()
# use nvidia gpu
nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu")
if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
return math.ceil(len(nvidia_gpus) * workers_per_resource)
def _get_gpu_count(typ: list[str] | None, kind: str):
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
return math.ceil(len(typ) * workers_per_resource)
# use amd gpu
amd_gpus = get_resource(resource_request, "amd.com/gpu")
if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
return math.ceil(len(amd_gpus) * workers_per_resource)
# use NVIDIA
kind = "nvidia.com/gpu"
count = _get_gpu_count(get_resource(resource_request, kind), kind)
if count:
return count
# use AMD
kind = "amd.com/gpu"
count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind)
if count:
return count
# use CPU
cpus = get_resource(resource_request, "cpu")
@@ -203,36 +396,32 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
if resource_request is None:
resource_request = system_resources()
# use nvidia gpu
nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu")
if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, nvidia_gpus, worker_index)
# use NVIDIA
kind = "nvidia.com/gpu"
typ = get_resource(resource_request, kind)
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
if disabled:
logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
environ["CUDA_VISIBLE_DEVICES"] = cuda_env
return environ
environ["CUDA_VISIBLE_DEVICES"] = dev
logger.info(
"Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s",
worker_index,
dev,
environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(
workers_per_resource, typ, worker_index
)
logger.debug("Environ for worker %s: %s", worker_index, environ)
return environ
# use amd gpu
amd_gpus = get_resource(resource_request, "amd.com/gpu")
if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, amd_gpus, worker_index)
# use AMD
kind = "amd.com/gpu"
typ = get_resource(resource_request, kind, validate=False)
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
if disabled:
logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
environ["CUDA_VISIBLE_DEVICES"] = cuda_env
return environ
environ["CUDA_VISIBLE_DEVICES"] = dev
logger.info(
"Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s",
worker_index,
dev,
environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(
workers_per_resource, typ, worker_index
)
logger.debug("Environ for worker %s: %s", worker_index, environ)
return environ
# use CPU
@@ -243,23 +432,16 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
thread_count = math.ceil(cpus)
for thread_env in THREAD_ENVS:
environ[thread_env] = os.getenv(thread_env, str(thread_count))
logger.info(
"Environ for worker %d: set CPU thread count to %d",
worker_index,
thread_count,
)
return environ
else:
for thread_env in THREAD_ENVS:
environ[thread_env] = os.getenv(thread_env, "1")
logger.debug("Environ for worker %s: %s", worker_index, environ)
return environ
for thread_env in THREAD_ENVS:
environ[thread_env] = os.getenv(thread_env, "1")
return environ
return environ
@staticmethod
def transpile_workers_to_cuda_visible_devices(
workers_per_resource: float | int, gpus: list[str], worker_index: int
) -> str:
def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str:
# Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
if isinstance(workers_per_resource, float):
# NOTE: We hit this branch when workers_per_resource is set to
@@ -287,9 +469,9 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
dev = ",".join(assigned_gpu)
else:
idx = worker_index // workers_per_resource
if len(gpus) == idx:
if idx >= len(gpus):
raise ValueError(
f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}"
)
dev = gpus[idx]
dev = str(gpus[idx])
return dev

View File

@@ -30,6 +30,7 @@ from ._configuration import AdapterType
if t.TYPE_CHECKING:
import auto_gptq as autogptq
import click
import peft
@@ -155,7 +156,7 @@ class LLMRunner(bentoml.Runner):
class LLMInitAttrs(t.TypedDict):
config: openllm.LLMConfig
quantization_config: transformers.BitsAndBytesConfig | None
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
model_id: str
runtime: t.Literal["ggml", "transformers"]
model_decls: TupleAny

View File

@@ -773,7 +773,6 @@ def noop_command(
def prerequisite_check(
ctx: click.Context,
llm_config: openllm.LLMConfig,
env: EnvVarMixin,
gpu_available: tuple[str, ...],
quantize: t.LiteralString | None,
adapter_map: dict[str, str | None] | None,
@@ -785,9 +784,6 @@ def prerequisite_check(
if len(gpu_available) < 1:
_echo(f"Quantization requires at least 1 GPU (got {len(gpu_available)})", fg="red")
ctx.exit(1)
if env.framework_value != "pt":
_echo("Quantization is currently only available for PyTorch models.", fg="red")
ctx.exit(1)
if adapter_map and not is_peft_available():
_echo(
@@ -905,7 +901,7 @@ def start_bento(
config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
)
prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers)
prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers)
# NOTE: This is to set current configuration
start_env = os.environ.copy()
@@ -1037,7 +1033,7 @@ def start_model(
config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
)
prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers)
prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers)
# NOTE: This is to set current configuration
start_env = os.environ.copy()
@@ -1151,7 +1147,7 @@ def start_model(
@output_option
@quantize_option(click)
@click.option("--machine", is_flag=True, default=False, hidden=True)
@click.option("--implementation", type=click.Choice(["pt", "tf", "flax"]), default=None, hidden=True)
@click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, hidden=True)
def download_models_command(
model: str,
model_id: str | None,
@@ -1193,7 +1189,7 @@ def download_models_command(
> only use this option if you want the weight to be quantized by default. Note that OpenLLM also
> support on-demand quantisation during initial startup.
"""
impl: t.Literal["pt", "tf", "flax"] = first_not_none(implementation, default=EnvVarMixin(model).framework_value)
impl: LiteralRuntime = first_not_none(implementation, default=EnvVarMixin(model).framework_value)
llm = openllm.infer_auto_class(impl).for_model(
model,
model_id=model_id,
@@ -1263,7 +1259,7 @@ def _start(
runtime: t.Literal["ggml", "transformers"] = ...,
fast: bool = ...,
adapter_map: dict[t.LiteralString, str | None] | None = ...,
framework: t.Literal["flax", "tf", "pt"] | None = ...,
framework: LiteralRuntime | None = ...,
additional_args: ListStr | None = ...,
_serve_grpc: bool = ...,
__test__: t.Literal[False] = ...,
@@ -1284,7 +1280,7 @@ def _start(
runtime: t.Literal["ggml", "transformers"] = ...,
fast: bool = ...,
adapter_map: dict[t.LiteralString, str | None] | None = ...,
framework: t.Literal["flax", "tf", "pt"] | None = ...,
framework: LiteralRuntime | None = ...,
additional_args: ListStr | None = ...,
_serve_grpc: bool = ...,
__test__: t.Literal[True] = ...,
@@ -1304,7 +1300,7 @@ def _start(
runtime: t.Literal["ggml", "transformers"] = "transformers",
fast: bool = False,
adapter_map: dict[t.LiteralString, str | None] | None = None,
framework: t.Literal["flax", "tf", "pt"] | None = None,
framework: LiteralRuntime | None = None,
additional_args: ListStr | None = None,
_serve_grpc: bool = False,
__test__: bool = False,
@@ -1615,6 +1611,13 @@ start, start_grpc, build, import_model, list_models = (
help="The output format for 'openllm build'. By default this will build a BentoLLM. 'container' is the shortcut of 'openllm build && bentoml containerize'.",
hidden=not get_debug_mode(),
)
@click.option(
"--push",
default=False,
is_flag=True,
type=click.BOOL,
help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.",
)
@click.pass_context
def build_command(
ctx: click.Context,
@@ -1632,6 +1635,7 @@ def build_command(
model_version: str | None,
dockerfile_template: t.TextIO | None,
format: t.Literal["bento", "container"],
push: bool,
**attrs: t.Any,
):
"""Package a given models into a Bento.
@@ -1788,7 +1792,12 @@ def build_command(
else:
_echo(bento.tag)
if format == "container":
if format == "container" and push:
ctx.fail("'--format=container' and '--push' are mutually exclusive.")
if push:
client = BentoMLContainer.bentocloud_client.get()
client.push_bento(bento)
elif format == "container":
backend = os.getenv("BENTOML_CONTAINERIZE_BACKEND", "docker")
_echo(f"Building {bento} into a LLMContainer using backend '{backend}'", fg="magenta")
if not bentoml.container.health(backend):

View File

@@ -99,11 +99,8 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An
def gpu_count() -> tuple[str, ...]:
from bentoml._internal.resource import NvidiaGpuResource
cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if cuda_visible_devices is not None:
return tuple(i for i in cuda_visible_devices.split(","))
"""Return available GPU under system. Currently only supports NVIDIA GPUs."""
from .._strategies import NvidiaGpuResource
return tuple(NvidiaGpuResource.from_system())
@@ -417,6 +414,7 @@ _import_structure = {
"is_jupytext_available",
"is_notebook_available",
"is_triton_available",
"is_autogptq_available",
"require_backends",
],
}
@@ -443,6 +441,7 @@ if t.TYPE_CHECKING:
from .import_utils import OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES
from .import_utils import DummyMetaclass as DummyMetaclass
from .import_utils import EnvVarMixin as EnvVarMixin
from .import_utils import is_autogptq_available as is_autogptq_available
from .import_utils import is_bitsandbytes_available as is_bitsandbytes_available
from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available
from .import_utils import is_datasets_available as is_datasets_available

View File

@@ -252,7 +252,7 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]):
def generate_unique_filename(cls: type[t.Any], func_name: str):
return f"<{cls.__name__} generated {func_name} {cls.__module__}." f"{getattr(cls, '__qualname__', cls.__name__)}>"
return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>"
def generate_function(
@@ -332,6 +332,7 @@ def make_env_transformer(
def gen_sdk(func: t.Callable[P, t.Any], name: str | None = None, **attrs: t.Any):
"""Enhance function with nicer Repr."""
from .representation import ReprMixin
if name is None:

View File

@@ -56,16 +56,17 @@ else:
logger = logging.getLogger(__name__)
OPTIONAL_DEPENDENCIES = {
"chatglm",
"falcon",
"mpt",
"starcoder",
"fine-tune",
"flan-t5",
"mpt",
"falcon",
"starcoder",
"chatglm",
"openai",
"agents",
"playground",
"ggml",
"agents",
"openai",
"playground",
"gptq",
}
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
@@ -100,6 +101,7 @@ _triton_available = _is_package_available("triton")
_jupyter_available = _is_package_available("jupyter")
_jupytext_available = _is_package_available("jupytext")
_notebook_available = _is_package_available("notebook")
_autogptq_available = _is_package_available("auto-gptq")
def is_transformers_supports_kbit() -> bool:
@@ -146,6 +148,10 @@ def is_bitsandbytes_available():
return _bitsandbytes_available
def is_autogptq_available():
return _autogptq_available
def is_torch_available():
global _torch_available
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
@@ -309,6 +315,11 @@ You can install it with pip: `pip install bitsandbytes`. Please note that you ma
your runtime after installation.
"""
AUTOGPTQ_IMPORT_ERROR = """{0} requires the auto-gptq library but it was not found in your environment.
You can install it with pip: `pip install auto-gptq`. Please note that you may need to restart
your runtime after installation.
"""
BACKENDS_MAPPING = BackendOrderredDict(
[
("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
@@ -320,6 +331,7 @@ BACKENDS_MAPPING = BackendOrderredDict(
("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
("auto-gptq", (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)),
]
)

View File

@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import annotations
import sys
import typing as t
import pytest
@@ -25,14 +26,127 @@ import bentoml
from bentoml._internal.resource import get_resource
from openllm import _strategies as strategy
from openllm._strategies import CascadingResourceStrategy
from openllm._strategies import NvidiaGpuResource
def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch):
    """A plain comma-separated CUDA_VISIBLE_DEVICES yields one entry per index."""
    with monkeypatch.context() as ctx:
        ctx.setenv("CUDA_VISIBLE_DEVICES", "0,1")
        detected = NvidiaGpuResource.from_system()
        assert detected == ["0", "1"]
        assert len(detected) == 2
        ctx.delenv("CUDA_VISIBLE_DEVICES")
def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch):
    """A -1 entry truncates the device list, mirroring torch.cuda's parsing."""
    with monkeypatch.context() as ctx:
        ctx.setenv("CUDA_VISIBLE_DEVICES", "0,2,-1,1")
        detected = NvidiaGpuResource.from_system()
        # everything after the -1 sentinel ("1" here) must be dropped
        assert detected == ["0", "2"]
        assert len(detected) == 2
        ctx.delenv("CUDA_VISIBLE_DEVICES")
def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch):
    """CUDA_VISIBLE_DEVICES=-1 disables every GPU."""
    with monkeypatch.context() as ctx:
        ctx.setenv("CUDA_VISIBLE_DEVICES", "-1")
        detected = NvidiaGpuResource.from_system()
        assert detected == []
        assert len(detected) == 0
        ctx.delenv("CUDA_VISIBLE_DEVICES")
def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch):
    """UUID-style entries (GPU-/MIG- prefixed) are parsed; -1 truncates the list."""
    # (env value, expected parsed devices) — exercised in order, one context each,
    # exactly as the original hand-unrolled version did.
    cases = [
        ("GPU-5ebe9f43-ac33420d4628", ["GPU-5ebe9f43-ac33420d4628"]),
        ("GPU-5ebe9f43,GPU-ac33420d4628", ["GPU-5ebe9f43", "GPU-ac33420d4628"]),
        ("GPU-5ebe9f43,-1,GPU-ac33420d4628", ["GPU-5ebe9f43"]),
        ("MIG-GPU-5ebe9f43-ac33420d4628", ["MIG-GPU-5ebe9f43-ac33420d4628"]),
    ]
    for spec, expected in cases:
        with monkeypatch.context() as ctx:
            ctx.setenv("CUDA_VISIBLE_DEVICES", spec)
            detected = NvidiaGpuResource.from_system()
            assert detected == expected
            assert len(detected) == len(expected)
            ctx.delenv("CUDA_VISIBLE_DEVICES")
def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
    """validate() must reject non-string entries and unknown GPU UUIDs."""
    with monkeypatch.context() as mcls:
        # Mask out any real GPUs so the test behaves the same on GPU hosts.
        mcls.setenv("CUDA_VISIBLE_DEVICES", "")
        assert len(NvidiaGpuResource.from_system()) >= 0  # TODO: real from_system tests
        # An int mixed into an otherwise valid device list is a type error.
        assert pytest.raises(
            ValueError,
            NvidiaGpuResource.validate,
            [*NvidiaGpuResource.from_system(), 1],
        ).match("Input list should be all string type.")
        assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match(
            "Input list should be all string type."
        )
        # Well-formed UUID strings still fail when no such device exists.
        assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["GPU-5ebe9f43", "GPU-ac33420d4628"]).match(
            "Failed to parse available GPUs UUID"
        )
def test_nvidia_gpu_validate_no_gpu_available():
    """Plain device indices must fail validation when no physical GPU exists."""
    # NOTE(review): the pattern is a regex, so the trailing " *" means
    # "optional spaces", not a glob wildcard — presumably ".*" was intended.
    # It still matches the "Failed to get device" prefix, so the test passes.
    assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["0", "1"]).match("Failed to get device *")
@pytest.mark.skipif(sys.platform != "darwin", reason="Test NVIDIA validation on Darwin only")
def test_nvidia_gpu_validation_on_darwin():
    """macOS never exposes NVIDIA GPUs, so validation must raise RuntimeError."""
    excinfo = pytest.raises(RuntimeError, NvidiaGpuResource.validate, ["0"])
    assert excinfo.match("GPU is not available on Darwin system.")
def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
    """from_spec() accepts counts (int/str), index lists, UUIDs, and sentinels."""
    with monkeypatch.context() as mcls:
        # Mask out any real GPUs so from_spec is exercised deterministically.
        mcls.setenv("CUDA_VISIBLE_DEVICES", "")
        # NOTE(review): the next two asserts duplicate the ones below — they
        # look like diff leftovers; harmless but worth deduplicating.
        assert NvidiaGpuResource.from_spec(1) == ["0"]
        assert NvidiaGpuResource.from_spec("5") == ["0", "1", "2", "3", "4"]
        # An integer or numeric-string count expands to indices 0..n-1.
        assert NvidiaGpuResource.from_spec(1) == ["0"]
        assert NvidiaGpuResource.from_spec(2) == ["0", "1"]
        assert NvidiaGpuResource.from_spec("3") == ["0", "1", "2"]
        # Explicit index lists pass through, normalized to strings.
        assert NvidiaGpuResource.from_spec([1, 3]) == ["1", "3"]
        assert NvidiaGpuResource.from_spec(["1", "3"]) == ["1", "3"]
        # -1 (int or string) and the empty string mean "no GPUs".
        assert NvidiaGpuResource.from_spec(-1) == []
        assert NvidiaGpuResource.from_spec("-1") == []
        assert NvidiaGpuResource.from_spec("") == []
        assert NvidiaGpuResource.from_spec("-2") == []
        # UUID specs: -1 truncates, MIG- prefix is preserved.
        assert NvidiaGpuResource.from_spec("GPU-288347ab") == ["GPU-288347ab"]
        assert NvidiaGpuResource.from_spec("GPU-288347ab,-1,GPU-ac33420d4628") == ["GPU-288347ab"]
        assert NvidiaGpuResource.from_spec("GPU-288347ab,GPU-ac33420d4628") == ["GPU-288347ab", "GPU-ac33420d4628"]
        assert NvidiaGpuResource.from_spec("MIG-GPU-288347ab") == ["MIG-GPU-288347ab"]
        # Unsupported container / numeric types are rejected outright.
        with pytest.raises(TypeError):
            NvidiaGpuResource.from_spec((1, 2, 3))
        with pytest.raises(TypeError):
            NvidiaGpuResource.from_spec(1.5)
        # Negative counts other than the -1 sentinel are invalid.
        with pytest.raises(ValueError):
            assert NvidiaGpuResource.from_spec(-2)
class GPURunnable(bentoml.Runnable):
    # Minimal Runnable fixture: declares GPU-only resources so the strategy
    # tests can exercise nvidia.com/gpu and amd.com/gpu scheduling paths.
    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu")
def unvalidated_get_resource(x: dict[str, t.Any], y: str):
return get_resource(x, y, validate=False)
def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False):
    # Thin wrapper over bentoml's get_resource; validation defaults to off so
    # tests can probe raw resource parsing, but can be re-enabled per call.
    return get_resource(x, y, validate=validate)
@pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])

285
tools/dependencies.py Executable file
View File

@@ -0,0 +1,285 @@
#!/usr/bin/env python3
# Copyright 2023 BentoML Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import dataclasses
import os
import shutil
import subprocess
import typing as t
import inflection
import tomlkit
import openllm
if t.TYPE_CHECKING:
from tomlkit.items import Array
from tomlkit.items import Table
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@dataclasses.dataclass(frozen=True)
class Classifier:
    """Helper for assembling PyPI trove classifier strings."""

    # Short alias -> top-level trove classifier section name.
    identifier: t.Dict[str, str] = dataclasses.field(
        default_factory=lambda: {
            "status": "Development Status",
            "environment": "Environment",
            "license": "License",
            "topic": "Topic",
            "os": "Operating System",
            "audience": "Intended Audience",
            "typing": "Typing",
            "language": "Programming Language",
        }
    )
    # Separator placed between classifier components.
    joiner: str = " :: "

    @staticmethod
    def status() -> dict[int, str]:
        """Map maturity level (1-7) to its 'Development Status' label."""
        return dict(
            enumerate(
                [
                    "1 - Planning",
                    "2 - Pre-Alpha",
                    "3 - Alpha",
                    "4 - Beta",
                    "5 - Production/Stable",
                    "6 - Mature",
                    "7 - Inactive",
                ],
                start=1,
            )
        )

    @staticmethod
    def apache() -> str:
        """Return the Apache-2.0 license classifier."""
        return Classifier.create_classifier("license", "OSI Approved", "Apache Software License)".rstrip(")"))

    @staticmethod
    def create_classifier(identifier: str, *decls: t.Any) -> str:
        """Join *decls* under the trove section aliased by *identifier*."""
        instance = Classifier()
        if identifier not in instance.identifier:
            # NOTE(review): evaluating this f-string touches Classifier.identifier
            # at class level, which a dataclass field with default_factory does
            # not expose — this error path likely raises AttributeError rather
            # than ValueError; confirm upstream.
            raise ValueError(f"{identifier} is not yet supported (supported alias: {Classifier.identifier})")
        return instance.joiner.join([instance.identifier[identifier], *decls])

    @staticmethod
    def create_python_classifier(
        implementation: list[str] | None = None, supported_version: list[str] | None = None
    ) -> list[str]:
        """Build the standard set of Python language/implementation classifiers."""
        versions = ["3.8", "3.9", "3.10", "3.11", "3.12"] if supported_version is None else supported_version
        impls = ["CPython", "PyPy"] if implementation is None else implementation
        result = [
            Classifier.create_classifier("language", "Python"),
            Classifier.create_classifier("language", "Python", "3"),
            Classifier.create_classifier("language", "Python", "3", "Only"),
        ]
        result += [Classifier.create_classifier("language", "Python", version) for version in versions]
        result += [Classifier.create_classifier("language", "Python", "Implementation", impl) for impl in impls]
        return result

    @staticmethod
    def create_status_classifier(level: int) -> str:
        """Return the 'Development Status' classifier for maturity *level*."""
        return Classifier.create_classifier("status", Classifier.status()[level])
@dataclasses.dataclass(frozen=True)
class Dependencies:
    """One external dependency and how to render it as a requirement string."""

    name: str
    git_repo_url: t.Optional[str] = None
    branch: t.Optional[str] = None
    extensions: t.Optional[t.List[str]] = None
    subdirectory: t.Optional[str] = None
    requires_gpu: bool = False
    lower_constraint: t.Optional[str] = None
    platform: t.Optional[t.Tuple[t.Literal["Linux", "Windows", "Darwin"], t.Literal["eq", "ne"]]] = None

    def with_options(self, **kwargs: t.Any) -> Dependencies:
        """Return a copy of this dependency with the given fields replaced."""
        return dataclasses.replace(self, **kwargs)

    @property
    def has_constraint(self) -> bool:
        """Whether a lower version bound was declared."""
        return self.lower_constraint is not None

    @property
    def pypi_extensions(self) -> str:
        """Render the extras list as '[a,b]', or '' when there are none."""
        if self.extensions is None:
            return ""
        return f"[{','.join(self.extensions)}]"

    @staticmethod
    def platform_restriction(platform: t.LiteralString, op: t.Literal["eq", "ne"] = "eq") -> str:
        """Build a PEP 508 platform_system environment marker."""
        comparator = "==" if op == "eq" else "!="
        return f'platform_system{comparator}"{platform}"'

    def to_str(self) -> str:
        """Render as a requirement string (constraint > subdirectory > branch)."""
        base = f"{self.name}{self.pypi_extensions}"
        if self.lower_constraint is not None:
            requirement = f"{base}>={self.lower_constraint}"
        elif self.subdirectory is not None:
            requirement = f"{base} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}"
        elif self.branch is not None:
            requirement = f"{base} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}"
        else:
            requirement = base
        if self.platform:
            # Marker is appended after ';' exactly as the list-join did before.
            return ";".join([requirement, self.platform_restriction(*self.platform)])
        return requirement

    @classmethod
    def from_tuple(cls, *decls: t.Any) -> Dependencies:
        """Alternate constructor from positional field values."""
        return cls(*decls)
# Extras pulled in for the core bentoml / transformers requirements.
_BENTOML_EXT = ["grpc", "io"]
_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"]
# Hard requirements shared by every install of openllm.
_BASE_DEPENDENCIES = [
    Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"),
    Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"),
    Dependencies(name="optimum"),
    Dependencies(name="attrs", lower_constraint="23.1.0"),
    Dependencies(name="cattrs", lower_constraint="23.1.0"),
    Dependencies(name="orjson"),
    Dependencies(name="inflection"),
    Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"),
    Dependencies(name="httpx"),
    Dependencies(name="typing_extensions"),
    # cuda-python has no macOS wheels, hence the platform marker.
    Dependencies(name="cuda-python", platform=("Darwin", "ne")),
]
# Packages tracked from their git main branches for the nightly requirement files.
_NIGHTLY_MAPPING: dict[str, Dependencies] = {
    "bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT),
    "peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None),
    "transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT),
    "optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None),
    "accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None),
    "bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None),
    "trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None),
    # triton builds from the 'python' subdirectory and needs a GPU at runtime.
    "triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True),
}
# Per-extra dependency lists; names must match openllm.utils.OPTIONAL_DEPENDENCIES
# (see the _base_requirements.update(...) lookup below).
_ALL_RUNTIME_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
FLAN_T5_DEPS = _ALL_RUNTIME_DEPS
OPT_DEPS = _ALL_RUNTIME_DEPS
MPT_DEPS = ["triton", "einops"]
OPENAI_DEPS = ["openai", "tiktoken"]
AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
FALCON_DEPS = ["einops", "xformers", "safetensors"]
STARCODER_DEPS = ["bitsandbytes"]
CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"]
PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
GGML_DEPS = ["ctransformers"]
GPTQ_DEPS = ["auto-gptq", "triton"]
VLLM_DEPS = ["vllm"]
# Model-specific requirements declared on each LLM config class.
_base_requirements = {
    inflection.dasherize(name): config_cls.__openllm_requirements__
    for name, config_cls in openllm.CONFIG_MAPPING.items()
    if config_cls.__openllm_requirements__
}
# shallow copy from locals()
_locals = locals().copy()
# NOTE: update this table when adding new external dependencies
# sync with openllm.utils.OPTIONAL_DEPENDENCIES
# Resolves e.g. "flan-t5" -> FLAN_T5_DEPS via the locals snapshot above.
_base_requirements.update(
    {v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES}
)
# e.g. "tools/dependencies.py" — embedded in generated file headers below.
fname = f"{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}"
def create_classifiers() -> Array:
    """Assemble the project's trove classifiers as a multiline TOML array."""
    classifiers = [
        Classifier.create_status_classifier(5),
        Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA"),
        Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "12"),
        Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.8"),
        Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.7"),
        Classifier.apache(),
        Classifier.create_classifier("topic", "Scientific/Engineering", "Artificial Intelligence"),
        Classifier.create_classifier("topic", "Software Development", "Libraries"),
        Classifier.create_classifier("os", "OS Independent"),
        Classifier.create_classifier("audience", "Developers"),
        Classifier.create_classifier("audience", "Science/Research"),
        Classifier.create_classifier("audience", "System Administrators"),
        Classifier.create_classifier("typing", "Typed"),
    ]
    # Python version/implementation classifiers come last, as before.
    classifiers.extend(Classifier.create_python_classifier())
    arr = tomlkit.array()
    arr.extend(classifiers)
    return arr.multiline(True)
def create_optional_table() -> Table:
    """Build [project.optional-dependencies], plus an 'all' extra covering every key."""
    table = tomlkit.table()
    table.update(_base_requirements)
    everything = tomlkit.array()
    # 'all' simply references each extra so pip resolves them transitively.
    everything.extend([f"openllm[{extra}]" for extra in table.keys()])
    table.add("all", everything.multiline(True))
    return table
def main() -> int:
    """Regenerate pyproject.toml metadata and the nightly requirement files.

    Returns a process exit code (taplo's when it runs, otherwise 0).
    """
    with open(os.path.join(ROOT, "pyproject.toml"), "r") as f:
        pyproject = tomlkit.parse(f.read())
    # Overwrite only the generated sections; the rest of the file is preserved.
    t.cast("Table", pyproject["project"]).update(
        {
            "classifiers": create_classifiers(),
            "optional-dependencies": create_optional_table(),
            "dependencies": tomlkit.array(f"{[v.to_str() for v in _BASE_DEPENDENCIES]}").multiline(True),
        }
    )
    with open(os.path.join(ROOT, "pyproject.toml"), "w") as f:
        f.write(tomlkit.dumps(pyproject))
    # CPU nightly requirements: editable install plus git-main packages.
    with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f:
        f.write(f"# This file is generated by `{fname}`. DO NOT EDIT\n-e .[playground,flan-t5]\n")
        f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu])
    with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f:
        # NOTE(review): this header reads "`. # DO NOT EDIT" — the stray "# "
        # is inconsistent with the CPU file above; presumably unintended.
        f.write(f"# This file is generated by `{fname}`. # DO NOT EDIT\n")
        f.write(
            "# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n"
        )
        f.write("-r nightly-requirements.txt\n-e .[all]\n")
        f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu])
    if shutil.which("taplo"):
        # Keep pyproject.toml formatted when the taplo TOML formatter is installed.
        return subprocess.check_call(["taplo", "format", os.path.join(ROOT, "pyproject.toml")])
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -1,160 +0,0 @@
#!/usr/bin/env python3
# Copyright 2023 BentoML Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import dataclasses
import os
import shutil
import typing as t
import inflection
import tomlkit
import openllm
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@dataclasses.dataclass(frozen=True)
class Dependencies:
name: str
git_repo_url: t.Optional[str] = None
branch: t.Optional[str] = None
extensions: t.Optional[t.List[str]] = None
subdirectory: t.Optional[str] = None
requires_gpu: bool = False
lower_constraint: t.Optional[str] = None
def with_options(self, **kwargs: t.Any) -> Dependencies:
return dataclasses.replace(self, **kwargs)
@property
def has_constraint(self) -> bool:
return self.lower_constraint is not None
@property
def pypi_extensions(self) -> str:
return "" if self.extensions is None else f"[{','.join(self.extensions)}]"
def to_str(self) -> str:
if self.lower_constraint is not None:
return f"{self.name}{self.pypi_extensions}>={self.lower_constraint}"
elif self.subdirectory is not None:
return f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}"
elif self.branch is not None:
return f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}"
else:
return f"{self.name}{self.pypi_extensions}"
@classmethod
def from_tuple(cls, *decls: t.Any) -> Dependencies:
return cls(*decls)
_BENTOML_EXT = ["grpc", "io"]
_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"]
_BASE_DEPENDENCIES = [
Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"),
Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"),
Dependencies(name="optimum"),
Dependencies(name="attrs", lower_constraint="23.1.0"),
Dependencies(name="cattrs", lower_constraint="23.1.0"),
Dependencies(name="orjson"),
Dependencies(name="inflection"),
Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"),
Dependencies(name="httpx"),
Dependencies(name="typing_extensions"),
]
_NIGHTLY_MAPPING: dict[str, Dependencies] = {
"bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT),
"peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None),
"transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT),
"optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None),
"accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None),
"bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None),
"trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None),
"triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True),
}
FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
FLAN_T5_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
MPT_DEPS = ["triton", "einops"]
OPENAI_DEPS = ["openai", "tiktoken"]
AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
FALCON_DEPS = ["einops", "xformers", "safetensors"]
STARCODER_DEPS = ["bitsandbytes"]
CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"]
PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
GGML_DEPS = ["ctransformers"]
_base_requirements = {
inflection.dasherize(name): config_cls.__openllm_requirements__
for name, config_cls in openllm.CONFIG_MAPPING.items()
if config_cls.__openllm_requirements__
}
# shallow copy from locals()
_locals = locals().copy()
# NOTE: update this table when adding new external dependencies
# sync with openllm.utils.OPTIONAL_DEPENDENCIES
_base_requirements.update(
{v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES}
)
def main() -> int:
with open(os.path.join(ROOT, "pyproject.toml"), "r") as f:
pyproject = tomlkit.parse(f.read())
table = tomlkit.table()
for name, config in _base_requirements.items():
table.add(name, config)
table.add("all", [f"openllm[{k}]" for k in table.keys()])
pyproject["project"]["optional-dependencies"] = table
# write project dependencies
pyproject["project"]["dependencies"] = [v.to_str() for v in _BASE_DEPENDENCIES]
with open(os.path.join(ROOT, "pyproject.toml"), "w") as f:
f.write(tomlkit.dumps(pyproject))
with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f:
f.write(
"# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT\n-e .[playground,flan-t5]\n"
)
f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu])
with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f:
f.write("# This file is generated by `./tools/update-optional-dependencies.py`. # DO NOT EDIT\n")
f.write(
"# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n"
)
f.write("-r nightly-requirements.txt\n-e .[all]\n")
f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu])
if shutil.which("taplo"):
return os.system(f"taplo fmt {os.path.join(ROOT, 'pyproject.toml')}")
return 0
if __name__ == "__main__":
raise SystemExit(main())

2
typings/cuda/__init__.pyi generated Normal file
View File

@@ -0,0 +1,2 @@
from . import cuda as cuda
from . import cudart as cudart

26
typings/cuda/cuda.pyi generated Normal file
View File

@@ -0,0 +1,26 @@
# Copyright 2023 BentoML Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
# Minimal type stubs mirroring the subset of cuda-python's `cuda.cuda` module
# that this project's resource strategies call into.
class CUresult(Enum):
    # Only the success code is modeled; callers compare results against it.
    CUDA_SUCCESS = 0


class _CUMixin:
    # CUDA handle objects expose their underlying pointer as an int.
    def getPtr(self) -> int: ...


class CUdevice(_CUMixin): ...


# Driver API: returns (status, number of CUDA-capable devices).
def cuDeviceGetCount() -> tuple[CUresult, int]: ...

# Driver API: returns (status, device handle) for ordinal `dev`.
def cuDeviceGet(dev: int) -> tuple[CUresult, CUdevice]: ...