mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-02-23 10:16:06 -05:00
fix(resource): correctly parse CUDA_VISIBLE_DEVICES (#114)
This commit is contained in:
1
.github/workflows/binary-releases.yml
vendored
1
.github/workflows/binary-releases.yml
vendored
@@ -13,6 +13,7 @@ env:
|
||||
APP_NAME: openllm
|
||||
PYTHON_VERSION: '3.11'
|
||||
PYOXIDIZER_VERSION: '0.24.0'
|
||||
HATCH_VERBOSE: 10
|
||||
jobs:
|
||||
python-artifacts:
|
||||
name: Build wheel and source distribution
|
||||
|
||||
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
@@ -24,6 +24,7 @@ env:
|
||||
OPENLLM_DO_NOT_TRACK: True
|
||||
PYTHONUNBUFFERED: '1'
|
||||
STABLE_PYTHON_VERSION: '3.11'
|
||||
HATCH_VERBOSE: 10
|
||||
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
|
||||
defaults:
|
||||
run:
|
||||
@@ -31,7 +32,6 @@ defaults:
|
||||
jobs:
|
||||
quality:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'pull_request'
|
||||
name: quality-check
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
@@ -43,6 +43,8 @@ jobs:
|
||||
python-version: ${{ env.STABLE_PYTHON_VERSION }}
|
||||
- name: Run type check
|
||||
run: hatch run typing
|
||||
- if: failure()
|
||||
run: echo "Not failing quality workflow."
|
||||
tests:
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
|
||||
|
||||
2
.github/workflows/create-releases.yml
vendored
2
.github/workflows/create-releases.yml
vendored
@@ -28,6 +28,8 @@ on:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash --noprofile --norc -exo pipefail {0}
|
||||
env:
|
||||
HATCH_VERBOSE: 10
|
||||
jobs:
|
||||
release:
|
||||
if: github.repository_owner == 'bentoml'
|
||||
|
||||
2
.github/workflows/release-notes.yml
vendored
2
.github/workflows/release-notes.yml
vendored
@@ -25,6 +25,8 @@ on:
|
||||
tags:
|
||||
required: true
|
||||
type: string
|
||||
env:
|
||||
HATCH_VERBOSE: 10
|
||||
defaults:
|
||||
run:
|
||||
shell: bash --noprofile --norc -exo pipefail {0}
|
||||
|
||||
@@ -266,7 +266,7 @@ pip install "openllm[mpt]"
|
||||
<td>
|
||||
|
||||
```bash
|
||||
pip install openllm
|
||||
pip install "openllm[opt]"
|
||||
```
|
||||
|
||||
</td>
|
||||
|
||||
7
changelog.d/114.fix.md
Normal file
7
changelog.d/114.fix.md
Normal file
@@ -0,0 +1,7 @@
|
||||
Fixes resources to correctly follows CUDA_VISIBLE_DEVICES spec
|
||||
|
||||
OpenLLM now contains a standalone parser that mimic `torch.cuda` parser for set
|
||||
GPU devices. This parser will be used to parse both AMD and NVIDIA GPUs.
|
||||
|
||||
`openllm` should now be able to parse `GPU-` and `MIG-` UUID from both
|
||||
configuration or spec.
|
||||
@@ -26,8 +26,8 @@ features = ['flan-t5']
|
||||
[envs.default.scripts]
|
||||
changelog = "towncrier build --version main --draft"
|
||||
quality = [
|
||||
"./tools/dependencies.py",
|
||||
"./tools/update-readme.py",
|
||||
"./tools/update-optional-dependencies.py",
|
||||
"./tools/update-config-stubs.py",
|
||||
"./tools/update-models-import.py",
|
||||
"- ./tools/add-license-headers .",
|
||||
@@ -42,6 +42,7 @@ extra-dependencies = [
|
||||
]
|
||||
[envs.tests.scripts]
|
||||
_run_script = "pytest --cov --cov-report={env:COVERAGE_REPORT:term-missing} --cov-config=pyproject.toml"
|
||||
distributed = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -n 3 -r aR {args:tests}"
|
||||
models = "_run_script -r aR {args:tests/models}"
|
||||
python = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -r aR {args:tests}"
|
||||
[envs.tests.overrides]
|
||||
|
||||
2
nightly-requirements-gpu.txt
generated
2
nightly-requirements-gpu.txt
generated
@@ -1,4 +1,4 @@
|
||||
# This file is generated by `./tools/update-optional-dependencies.py`. # DO NOT EDIT
|
||||
# This file is generated by `tools/dependencies.py`. # DO NOT EDIT
|
||||
# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.
|
||||
-r nightly-requirements.txt
|
||||
-e .[all]
|
||||
|
||||
2
nightly-requirements.txt
generated
2
nightly-requirements.txt
generated
@@ -1,4 +1,4 @@
|
||||
# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT
|
||||
# This file is generated by `tools/dependencies.py`. DO NOT EDIT
|
||||
-e .[playground,flan-t5]
|
||||
bentoml[grpc,io] @ git+https://github.com/bentoml/bentoml.git@main
|
||||
peft @ git+https://github.com/huggingface/peft.git@main
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
# NOTE: The following are managed by ./tools/dependencies.py
|
||||
# project.classifiers, project.dependencies, project.optional-dependencies
|
||||
[build-system]
|
||||
build-backend = "hatchling.build"
|
||||
requires = ["hatchling"]
|
||||
@@ -29,18 +31,18 @@ classifiers = [
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
]
|
||||
# NOTE: The below is managed by ./tools/update-optional-dependencies.py
|
||||
dependencies = [
|
||||
"bentoml[grpc,io]>=1.0.22",
|
||||
"transformers[torch,tokenizers,accelerate]>=4.29.0",
|
||||
"optimum",
|
||||
"attrs>=23.1.0",
|
||||
"cattrs>=23.1.0",
|
||||
"orjson",
|
||||
"inflection",
|
||||
"tabulate[widechars]>=0.9.0",
|
||||
"httpx",
|
||||
"typing_extensions",
|
||||
'bentoml[grpc,io]>=1.0.22',
|
||||
'transformers[torch,tokenizers,accelerate]>=4.29.0',
|
||||
'optimum',
|
||||
'attrs>=23.1.0',
|
||||
'cattrs>=23.1.0',
|
||||
'orjson',
|
||||
'inflection',
|
||||
'tabulate[widechars]>=0.9.0',
|
||||
'httpx',
|
||||
'typing_extensions',
|
||||
'cuda-python;platform_system!="Darwin"',
|
||||
]
|
||||
description = 'OpenLLM: Operating LLMs in production'
|
||||
dynamic = ["version"]
|
||||
@@ -62,9 +64,6 @@ license = "Apache-2.0"
|
||||
name = "openllm"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
|
||||
# NOTE: Don't modify project.optional-dependencies
|
||||
# as it is managed by ./tools/update-optional-dependencies.py
|
||||
[project.optional-dependencies]
|
||||
agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
|
||||
all = [
|
||||
@@ -72,22 +71,28 @@ all = [
|
||||
"openllm[falcon]",
|
||||
"openllm[mpt]",
|
||||
"openllm[starcoder]",
|
||||
"openllm[opt]",
|
||||
"openllm[flan-t5]",
|
||||
"openllm[fine-tune]",
|
||||
"openllm[vllm]",
|
||||
"openllm[agents]",
|
||||
"openllm[playground]",
|
||||
"openllm[ggml]",
|
||||
"openllm[playground]",
|
||||
"openllm[openai]",
|
||||
"openllm[gptq]",
|
||||
]
|
||||
chatglm = ["cpm-kernels", "sentencepiece"]
|
||||
falcon = ["einops", "xformers", "safetensors"]
|
||||
fine-tune = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
|
||||
flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
|
||||
ggml = ["ctransformers"]
|
||||
gptq = ["auto-gptq", "triton"]
|
||||
mpt = ["triton", "einops"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
|
||||
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
|
||||
starcoder = ["bitsandbytes"]
|
||||
vllm = ["vllm"]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/bentoml/openllm#readme"
|
||||
|
||||
@@ -71,6 +71,7 @@ else:
|
||||
from typing_extensions import overload
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import auto_gptq as autogptq
|
||||
import peft
|
||||
import torch
|
||||
|
||||
@@ -96,6 +97,8 @@ else:
|
||||
UserDictAny = collections.UserDict
|
||||
LLMRunnable = bentoml.Runnable
|
||||
LLMRunner = bentoml.Runner
|
||||
|
||||
autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
|
||||
transformers = LazyLoader("transformers", globals(), "transformers")
|
||||
torch = LazyLoader("torch", globals(), "torch")
|
||||
peft = LazyLoader("peft", globals(), "peft")
|
||||
@@ -445,7 +448,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
"""The config instance to use for this LLM. This will be created based on config_class and available
|
||||
when initialising the LLM."""
|
||||
|
||||
quantization_config: transformers.BitsAndBytesConfig | None
|
||||
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
|
||||
"""Quantisation config for quantised model on the fly."""
|
||||
|
||||
_model_id: str
|
||||
@@ -548,6 +551,44 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
|
||||
openllm.serialisation.save_pretrained(self, save_directory, **attrs)
|
||||
|
||||
@classmethod
|
||||
@overload
|
||||
def from_pretrained(
|
||||
cls,
|
||||
model_id: str | None = ...,
|
||||
model_version: str | None = ...,
|
||||
llm_config: openllm.LLMConfig | None = ...,
|
||||
*args: t.Any,
|
||||
runtime: t.Literal["ggml", "transformers"] | None = ...,
|
||||
quantize: t.Literal["int8", "int4"] = ...,
|
||||
bettertransformer: str | bool | None = ...,
|
||||
adapter_id: str | None = ...,
|
||||
adapter_name: str | None = ...,
|
||||
adapter_map: dict[str, str | None] | None = ...,
|
||||
quantization_config: transformers.BitsAndBytesConfig | None = ...,
|
||||
**attrs: t.Any,
|
||||
) -> LLM[M, T]:
|
||||
...
|
||||
|
||||
@classmethod
|
||||
@overload
|
||||
def from_pretrained(
|
||||
cls,
|
||||
model_id: str | None = ...,
|
||||
model_version: str | None = ...,
|
||||
llm_config: openllm.LLMConfig | None = ...,
|
||||
*args: t.Any,
|
||||
runtime: t.Literal["ggml", "transformers"] | None = ...,
|
||||
quantize: t.Literal["gptq"] = ...,
|
||||
bettertransformer: str | bool | None = ...,
|
||||
adapter_id: str | None = ...,
|
||||
adapter_name: str | None = ...,
|
||||
adapter_map: dict[str, str | None] | None = ...,
|
||||
quantization_config: autogptq.BaseQuantizeConfig | None = ...,
|
||||
**attrs: t.Any,
|
||||
) -> LLM[M, T]:
|
||||
...
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
@@ -561,7 +602,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
adapter_id: str | None = None,
|
||||
adapter_name: str | None = None,
|
||||
adapter_map: dict[str, str | None] | None = None,
|
||||
quantization_config: transformers.BitsAndBytesConfig | None = None,
|
||||
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
|
||||
**attrs: t.Any,
|
||||
) -> LLM[M, T]:
|
||||
"""Instantiate a pretrained LLM.
|
||||
@@ -577,6 +618,17 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
|
||||
> Currently, the above two options are mutually exclusive.
|
||||
|
||||
#### Quantisation options
|
||||
|
||||
For customising options for quantisation config, ``openllm.LLM`` accepts all arbitrary arguments that is passed to ``transformers.BitsAndBytesConfig``
|
||||
plus ``quantize`` value. For example, for ``int8`` quantisation, specify the following:
|
||||
```python
|
||||
model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False)
|
||||
```
|
||||
|
||||
For all GPTQ-related options, it accepts all value prefixed with `gptq_*`. The parsed value then could be parsed
|
||||
to ``auto_gptq.BaseQuantizeConfig``.
|
||||
|
||||
### Adapter options:
|
||||
|
||||
> This is used in conjunction with the fine-tuning features
|
||||
@@ -689,7 +741,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
model_id: str,
|
||||
llm_config: openllm.LLMConfig,
|
||||
bettertransformer: bool | None,
|
||||
quantization_config: transformers.BitsAndBytesConfig | None,
|
||||
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
|
||||
_adapters_mapping: AdaptersMapping | None,
|
||||
_tag: bentoml.Tag,
|
||||
_quantize_method: t.Literal["int8", "int4", "gptq"] | None,
|
||||
|
||||
@@ -13,15 +13,26 @@
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
from .utils import LazyLoader
|
||||
from .utils import is_autogptq_available
|
||||
from .utils import is_bitsandbytes_available
|
||||
from .utils import is_transformers_supports_kbit
|
||||
from .utils import pkg
|
||||
|
||||
|
||||
# NOTE: We need to do this so that overload can register
|
||||
# correct overloads to typing registry
|
||||
if sys.version_info[:2] >= (3, 11):
|
||||
from typing import overload
|
||||
else:
|
||||
from typing_extensions import overload
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import auto_gptq as autogptq
|
||||
import torch
|
||||
|
||||
import openllm
|
||||
@@ -29,6 +40,7 @@ if t.TYPE_CHECKING:
|
||||
|
||||
from ._types import DictStrAny
|
||||
else:
|
||||
autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
|
||||
torch = LazyLoader("torch", globals(), "torch")
|
||||
transformers = LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
@@ -37,15 +49,38 @@ logger = logging.getLogger(__name__)
|
||||
QuantiseMode = t.Literal["int8", "int4", "gptq"]
|
||||
|
||||
|
||||
@overload
|
||||
def infer_quantisation_config(
|
||||
cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["int8", "int4"], **attrs: t.Any
|
||||
) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def infer_quantisation_config(
|
||||
cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["gptq"], **attrs: t.Any
|
||||
) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
|
||||
...
|
||||
|
||||
|
||||
def infer_quantisation_config(
|
||||
cls: type[openllm.LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any
|
||||
) -> tuple[transformers.BitsAndBytesConfig | t.Any, DictStrAny]:
|
||||
) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
|
||||
# 8 bit configuration
|
||||
int8_threshold = attrs.pop("llm_int8_threshhold", 6.0)
|
||||
int8_enable_fp32_cpu_offload = attrs.pop("llm_int8_enable_fp32_cpu_offload", False)
|
||||
int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
|
||||
int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)
|
||||
|
||||
autogptq_attrs: DictStrAny = {
|
||||
"bits": attrs.pop("gptq_bits", 4),
|
||||
"group_size": attrs.pop("gptq_group_size", -1),
|
||||
"damp_percent": attrs.pop("gptq_damp_percent", 0.01),
|
||||
"desc_act": attrs.pop("gptq_desc_act", True),
|
||||
"sym": attrs.pop("gptq_sym", True),
|
||||
"true_sequential": attrs.pop("gptq_true_sequential", True),
|
||||
}
|
||||
|
||||
def create_int8_config(int8_skip_modules: list[str] | None):
|
||||
if int8_skip_modules is None:
|
||||
int8_skip_modules = []
|
||||
@@ -94,8 +129,15 @@ def infer_quantisation_config(
|
||||
logger.warning("OpenLLM will fallback to 8-bit quantization.")
|
||||
quantisation_config = create_int8_config(int8_skip_modules)
|
||||
elif quantise == "gptq":
|
||||
# TODO: support GPTQ loading quantization
|
||||
raise NotImplementedError("GPTQ is not supported yet.")
|
||||
if not is_autogptq_available():
|
||||
logger.warning(
|
||||
"'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment)."
|
||||
" Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback "
|
||||
"to int8 with bitsandbytes."
|
||||
)
|
||||
quantisation_config = create_int8_config(int8_skip_modules)
|
||||
else:
|
||||
quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
|
||||
else:
|
||||
raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")
|
||||
|
||||
|
||||
@@ -13,11 +13,15 @@
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import inspect
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
import typing as t
|
||||
import warnings
|
||||
|
||||
import psutil
|
||||
|
||||
@@ -27,62 +31,113 @@ from bentoml._internal.resource import system_resources
|
||||
from bentoml._internal.runner.strategy import THREAD_ENVS
|
||||
from bentoml._internal.runner.strategy import Strategy
|
||||
|
||||
from .exceptions import OpenLLMException
|
||||
from .utils import LazyLoader
|
||||
from .utils import LazyType
|
||||
from .utils import ReprMixin
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import bentoml
|
||||
|
||||
ListIntStr = list[int | str]
|
||||
|
||||
class DynResource(Resource[t.List[str]], resource_id=""):
|
||||
resource_id: t.ClassVar[str]
|
||||
|
||||
else:
|
||||
DynResource = Resource[t.List[str]]
|
||||
torch = LazyLoader("torch", globals(), "torch")
|
||||
ListIntStr = list
|
||||
|
||||
# NOTE: We need to do this so that overload can register
|
||||
# correct overloads to typing registry
|
||||
if sys.version_info[:2] >= (3, 11):
|
||||
from typing import overload
|
||||
else:
|
||||
from typing_extensions import overload
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"):
|
||||
@classmethod
|
||||
def from_spec(cls, spec: t.Any) -> list[str]:
|
||||
if not isinstance(spec, (int, str, list)):
|
||||
raise TypeError("AMD GPU device IDs must be int, str or a list specifing the exact GPUs to use.")
|
||||
def _strtoul(s: str) -> int:
|
||||
"""Return -1 or positive integer sequence string starts with,."""
|
||||
if not s:
|
||||
return -1
|
||||
for idx, c in enumerate(s):
|
||||
if not (c.isdigit() or (idx == 0 and c in "+-")):
|
||||
break
|
||||
if idx + 1 == len(s):
|
||||
idx += 1 # noqa: PLW2901
|
||||
return int(s[:idx]) if idx > 0 else -1 # type: ignore (idx will be set via enumerate)
|
||||
|
||||
try:
|
||||
if isinstance(spec, int):
|
||||
if spec == -1:
|
||||
return []
|
||||
if spec < -1:
|
||||
raise ValueError
|
||||
return [str(i) for i in range(spec)]
|
||||
elif isinstance(spec, str):
|
||||
try:
|
||||
return cls.from_spec(int(spec))
|
||||
except ValueError:
|
||||
if spec.startswith("GPU"):
|
||||
return [spec]
|
||||
raise ValueError
|
||||
else:
|
||||
return [str(x) for x in spec]
|
||||
except ValueError:
|
||||
raise OpenLLMException(f"Invalid AMD GPU resource limit '{spec}'.")
|
||||
|
||||
@classmethod
|
||||
def from_system(cls) -> list[str]:
|
||||
"""Retrieve AMD GPU from system, currently only supports on Linux.
|
||||
|
||||
This assumes that ROCm is setup correctly.
|
||||
"""
|
||||
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
|
||||
if cuda_visible_devices in ("", "-1"):
|
||||
def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
|
||||
rcs: list[str] = []
|
||||
for elem in lst.split(","):
|
||||
# Repeated id results in empty set
|
||||
if elem in rcs:
|
||||
return []
|
||||
if cuda_visible_devices is not None:
|
||||
cuda_visible_devices = cuda_visible_devices.split(",")
|
||||
if "-1" in cuda_visible_devices:
|
||||
cuda_visible_devices = cuda_visible_devices[: cuda_visible_devices.index("-1")]
|
||||
return cuda_visible_devices
|
||||
# Anything other but prefix is ignored
|
||||
if not elem.startswith(prefix):
|
||||
break
|
||||
rcs.append(elem)
|
||||
return rcs
|
||||
|
||||
|
||||
_STACK_LEVEL = 3
|
||||
|
||||
|
||||
@overload
|
||||
def _parse_visible_devices(default_var: str | None = ..., respect_env: t.Literal[True] = True) -> list[str] | None:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def _parse_visible_devices(default_var: str = ..., respect_env: t.Literal[False] = False) -> list[str]:
|
||||
...
|
||||
|
||||
|
||||
def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
|
||||
"""CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
|
||||
if respect_env:
|
||||
spec = os.getenv("CUDA_VISIBLE_DEVICES", default_var)
|
||||
if not spec:
|
||||
return
|
||||
else:
|
||||
assert default_var is not None, "spec is required to be not None when parsing spec." # noqa: S101
|
||||
spec = default_var
|
||||
|
||||
if spec.startswith("GPU-"):
|
||||
return _parse_list_with_prefix(spec, "GPU-")
|
||||
if spec.startswith("MIG-"):
|
||||
return _parse_list_with_prefix(spec, "MIG-")
|
||||
|
||||
# XXX: We to somehow handle cases such as '100m'
|
||||
# CUDA_VISIBLE_DEVICES uses something like strtoul
|
||||
# which makes `1gpu2,2ampere` is equivalent to `1,2`
|
||||
rc: list[int] = []
|
||||
for el in spec.split(","):
|
||||
x = _strtoul(el.strip())
|
||||
# Repeated ordinal results in empty set
|
||||
if x in rc:
|
||||
return []
|
||||
# Negative value aborts the sequence
|
||||
if x < 0:
|
||||
break
|
||||
rc.append(x)
|
||||
return [str(i) for i in rc]
|
||||
|
||||
|
||||
def _from_system(cls: type[DynResource]) -> list[str]:
|
||||
"""Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation.
|
||||
|
||||
It relies on torch.cuda implementation and in turns respect CUDA_VISIBLE_DEVICES.
|
||||
"""
|
||||
if cls.resource_id == "amd.com/gpu":
|
||||
if not psutil.LINUX:
|
||||
logger.debug("AMD GPU resource is only supported on Linux.")
|
||||
warnings.warn("AMD GPUs is currently only supported on Linux.", stacklevel=_STACK_LEVEL)
|
||||
return []
|
||||
|
||||
# ROCm does not currently have the rocm_smi wheel.
|
||||
@@ -90,37 +145,169 @@ class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"):
|
||||
# we don't want to use CLI because parsing is a pain.
|
||||
sys.path.append("/opt/rocm/libexec/rocm_smi")
|
||||
try:
|
||||
from ctypes import byref
|
||||
from ctypes import c_uint32
|
||||
|
||||
# refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py
|
||||
from rsmiBindings import rocmsmi
|
||||
from rsmiBindings import rsmi_status_t
|
||||
|
||||
num = c_uint32(0)
|
||||
ret = rocmsmi.rsmi_num_monitor_devices(byref(num))
|
||||
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
|
||||
return [str(i) for i in range(num.value)]
|
||||
return []
|
||||
except Exception as err:
|
||||
logger.debug("Failed to setup AMD GPU resource: %s", err)
|
||||
from rsmiBindings import rocmsmi as rocmsmi
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
# In this case the binary is not found, returning empty list
|
||||
return []
|
||||
finally:
|
||||
sys.path.remove("/opt/rocm/libexec/rocm_smi")
|
||||
visible_devices = _parse_visible_devices()
|
||||
if visible_devices is None:
|
||||
return [str(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else []
|
||||
return visible_devices
|
||||
|
||||
@classmethod
|
||||
def validate(cls, val: list[str]):
|
||||
for gpu_index_or_literal in val:
|
||||
try:
|
||||
idx = int(gpu_index_or_literal)
|
||||
except ValueError:
|
||||
raise OpenLLMException(f"Invalid AMD GPU device index: {val}")
|
||||
if int(idx) < 0:
|
||||
raise OpenLLMException(f"Negative GPU device in {val}.")
|
||||
if int(idx) >= len(cls.from_system()):
|
||||
raise OpenLLMException(
|
||||
f"GPU device index in {val} is greater than the system available: {cls.from_system()}"
|
||||
)
|
||||
|
||||
@overload
|
||||
def _from_spec(cls: type[DynResource], spec: int) -> list[str]:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def _from_spec(cls: type[DynResource], spec: ListIntStr) -> list[str]:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def _from_spec(cls: type[DynResource], spec: str) -> list[str]:
|
||||
...
|
||||
|
||||
|
||||
def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
|
||||
"""Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation.
|
||||
|
||||
The parser behaves similar to how PyTorch handles CUDA_VISIBLE_DEVICES. This means within
|
||||
BentoML's resource configuration, its behaviour is similar to CUDA_VISIBLE_DEVICES.
|
||||
"""
|
||||
if isinstance(spec, int):
|
||||
if spec in (-1, 0):
|
||||
return []
|
||||
if spec < -1:
|
||||
raise ValueError("Spec cannot be < -1.")
|
||||
return [str(i) for i in range(spec)]
|
||||
elif isinstance(spec, str):
|
||||
if not spec:
|
||||
return []
|
||||
if spec.isdigit():
|
||||
spec = ",".join([str(i) for i in range(_strtoul(spec))])
|
||||
return _parse_visible_devices(spec, respect_env=False)
|
||||
elif LazyType(ListIntStr).isinstance(spec):
|
||||
return [str(x) for x in spec]
|
||||
else:
|
||||
raise TypeError(
|
||||
f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
|
||||
)
|
||||
|
||||
|
||||
@functools.lru_cache
|
||||
def _raw_uuid_nvml() -> list[str] | None:
|
||||
"""Return list of device UUID as reported by NVML or None if NVML discovery/initialization failed."""
|
||||
try:
|
||||
from cuda import cuda
|
||||
except ImportError:
|
||||
if sys.platform == "darwin":
|
||||
raise RuntimeError("GPU is not available on Darwin system.") from None
|
||||
raise RuntimeError(
|
||||
"Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly."
|
||||
) from None
|
||||
|
||||
from ctypes import CDLL
|
||||
from ctypes import byref
|
||||
from ctypes import c_void_p
|
||||
from ctypes import create_string_buffer
|
||||
|
||||
nvml_h = CDLL("libnvidia-ml.so.1")
|
||||
rc = nvml_h.nvmlInit()
|
||||
if rc != 0:
|
||||
warnings.warn("Can't initialize NVML", stacklevel=_STACK_LEVEL)
|
||||
return
|
||||
err, dev_count = cuda.cuDeviceGetCount()
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
warnings.warn("Failed to get available device from system.", stacklevel=_STACK_LEVEL)
|
||||
return
|
||||
uuids: list[str] = []
|
||||
for idx in range(dev_count):
|
||||
dev_id = c_void_p()
|
||||
rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
|
||||
if rc != 0:
|
||||
warnings.warn(f"Failed to get device handle for {idx}", stacklevel=_STACK_LEVEL)
|
||||
return
|
||||
buf_len = 96
|
||||
buf = create_string_buffer(buf_len)
|
||||
rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
|
||||
if rc != 0:
|
||||
warnings.warn(f"Failed to get device UUID for {idx}", stacklevel=_STACK_LEVEL)
|
||||
return
|
||||
uuids.append(buf.raw.decode("ascii").strip("\0"))
|
||||
del nvml_h
|
||||
return uuids
|
||||
|
||||
|
||||
def _validate(cls: type[DynResource], val: list[t.Any]):
|
||||
if cls.resource_id == "amd.com/gpu":
|
||||
raise RuntimeError(
|
||||
"AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
|
||||
)
|
||||
if not all(isinstance(i, str) for i in val):
|
||||
raise ValueError("Input list should be all string type.")
|
||||
|
||||
try:
|
||||
from cuda import cuda
|
||||
except ImportError:
|
||||
if sys.platform == "darwin":
|
||||
raise RuntimeError("GPU is not available on Darwin system.") from None
|
||||
raise RuntimeError(
|
||||
"Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly."
|
||||
) from None
|
||||
# correctly parse handle
|
||||
for el in val:
|
||||
if el.startswith("GPU-") or el.startswith("MIG-"):
|
||||
uuids = _raw_uuid_nvml()
|
||||
if uuids is None:
|
||||
raise ValueError("Failed to parse available GPUs UUID")
|
||||
if el not in uuids:
|
||||
raise ValueError(f"Given UUID {el} is not found with available UUID (available: {uuids})")
|
||||
elif el.isdigit():
|
||||
err, _ = cuda.cuDeviceGet(int(el))
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise ValueError(f"Failed to get device {el}")
|
||||
|
||||
|
||||
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
|
||||
return types.new_class(
|
||||
name,
|
||||
(DynResource, ReprMixin),
|
||||
{"resource_id": resource_kind},
|
||||
lambda ns: ns.update(
|
||||
{
|
||||
"resource_id": resource_kind,
|
||||
"from_spec": classmethod(_from_spec),
|
||||
"from_system": classmethod(_from_system),
|
||||
"validate": classmethod(_validate),
|
||||
"__repr_keys__": property(lambda _: {"resource_id"}),
|
||||
"__doc__": inspect.cleandoc(docstring),
|
||||
"__module__": "openllm._strategies",
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
NvidiaGpuResource = _make_resource_class(
|
||||
"NvidiaGpuResource",
|
||||
"nvidia.com/gpu",
|
||||
"""NVIDIA GPU resource.
|
||||
|
||||
This is a modified version of internal's BentoML's NvidiaGpuResource
|
||||
where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",
|
||||
)
|
||||
AmdGpuResource = _make_resource_class(
|
||||
"AmdGpuResource",
|
||||
"amd.com/gpu",
|
||||
"""AMD GPU resource.
|
||||
|
||||
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
|
||||
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",
|
||||
)
|
||||
|
||||
|
||||
class CascadingResourceStrategy(Strategy, ReprMixin):
|
||||
@@ -147,15 +334,21 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
|
||||
if resource_request is None:
|
||||
resource_request = system_resources()
|
||||
|
||||
# use nvidia gpu
|
||||
nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu")
|
||||
if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
|
||||
return math.ceil(len(nvidia_gpus) * workers_per_resource)
|
||||
def _get_gpu_count(typ: list[str] | None, kind: str):
|
||||
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
|
||||
return math.ceil(len(typ) * workers_per_resource)
|
||||
|
||||
# use amd gpu
|
||||
amd_gpus = get_resource(resource_request, "amd.com/gpu")
|
||||
if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
|
||||
return math.ceil(len(amd_gpus) * workers_per_resource)
|
||||
# use NVIDIA
|
||||
kind = "nvidia.com/gpu"
|
||||
count = _get_gpu_count(get_resource(resource_request, kind), kind)
|
||||
if count:
|
||||
return count
|
||||
|
||||
# use AMD
|
||||
kind = "amd.com/gpu"
|
||||
count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind)
|
||||
if count:
|
||||
return count
|
||||
|
||||
# use CPU
|
||||
cpus = get_resource(resource_request, "cpu")
|
||||
@@ -203,36 +396,32 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
|
||||
if resource_request is None:
|
||||
resource_request = system_resources()
|
||||
|
||||
# use nvidia gpu
|
||||
nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu")
|
||||
if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
|
||||
dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, nvidia_gpus, worker_index)
|
||||
# use NVIDIA
|
||||
kind = "nvidia.com/gpu"
|
||||
typ = get_resource(resource_request, kind)
|
||||
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
|
||||
if disabled:
|
||||
logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
|
||||
environ["CUDA_VISIBLE_DEVICES"] = cuda_env
|
||||
return environ
|
||||
environ["CUDA_VISIBLE_DEVICES"] = dev
|
||||
logger.info(
|
||||
"Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s",
|
||||
worker_index,
|
||||
dev,
|
||||
environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(
|
||||
workers_per_resource, typ, worker_index
|
||||
)
|
||||
logger.debug("Environ for worker %s: %s", worker_index, environ)
|
||||
return environ
|
||||
|
||||
# use amd gpu
|
||||
amd_gpus = get_resource(resource_request, "amd.com/gpu")
|
||||
if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
|
||||
dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, amd_gpus, worker_index)
|
||||
# use AMD
|
||||
kind = "amd.com/gpu"
|
||||
typ = get_resource(resource_request, kind, validate=False)
|
||||
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
|
||||
if disabled:
|
||||
logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
|
||||
environ["CUDA_VISIBLE_DEVICES"] = cuda_env
|
||||
return environ
|
||||
environ["CUDA_VISIBLE_DEVICES"] = dev
|
||||
logger.info(
|
||||
"Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s",
|
||||
worker_index,
|
||||
dev,
|
||||
environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(
|
||||
workers_per_resource, typ, worker_index
|
||||
)
|
||||
logger.debug("Environ for worker %s: %s", worker_index, environ)
|
||||
return environ
|
||||
|
||||
# use CPU
|
||||
@@ -243,23 +432,16 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
|
||||
thread_count = math.ceil(cpus)
|
||||
for thread_env in THREAD_ENVS:
|
||||
environ[thread_env] = os.getenv(thread_env, str(thread_count))
|
||||
logger.info(
|
||||
"Environ for worker %d: set CPU thread count to %d",
|
||||
worker_index,
|
||||
thread_count,
|
||||
)
|
||||
return environ
|
||||
else:
|
||||
for thread_env in THREAD_ENVS:
|
||||
environ[thread_env] = os.getenv(thread_env, "1")
|
||||
logger.debug("Environ for worker %s: %s", worker_index, environ)
|
||||
return environ
|
||||
for thread_env in THREAD_ENVS:
|
||||
environ[thread_env] = os.getenv(thread_env, "1")
|
||||
return environ
|
||||
|
||||
return environ
|
||||
|
||||
@staticmethod
|
||||
def transpile_workers_to_cuda_visible_devices(
|
||||
workers_per_resource: float | int, gpus: list[str], worker_index: int
|
||||
) -> str:
|
||||
def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str:
|
||||
# Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
|
||||
if isinstance(workers_per_resource, float):
|
||||
# NOTE: We hit this branch when workers_per_resource is set to
|
||||
@@ -287,9 +469,9 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
|
||||
dev = ",".join(assigned_gpu)
|
||||
else:
|
||||
idx = worker_index // workers_per_resource
|
||||
if len(gpus) == idx:
|
||||
if idx >= len(gpus):
|
||||
raise ValueError(
|
||||
f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}"
|
||||
)
|
||||
dev = gpus[idx]
|
||||
dev = str(gpus[idx])
|
||||
return dev
|
||||
|
||||
@@ -30,6 +30,7 @@ from ._configuration import AdapterType
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import auto_gptq as autogptq
|
||||
import click
|
||||
import peft
|
||||
|
||||
@@ -155,7 +156,7 @@ class LLMRunner(bentoml.Runner):
|
||||
|
||||
class LLMInitAttrs(t.TypedDict):
|
||||
config: openllm.LLMConfig
|
||||
quantization_config: transformers.BitsAndBytesConfig | None
|
||||
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
|
||||
model_id: str
|
||||
runtime: t.Literal["ggml", "transformers"]
|
||||
model_decls: TupleAny
|
||||
|
||||
@@ -773,7 +773,6 @@ def noop_command(
|
||||
def prerequisite_check(
|
||||
ctx: click.Context,
|
||||
llm_config: openllm.LLMConfig,
|
||||
env: EnvVarMixin,
|
||||
gpu_available: tuple[str, ...],
|
||||
quantize: t.LiteralString | None,
|
||||
adapter_map: dict[str, str | None] | None,
|
||||
@@ -785,9 +784,6 @@ def prerequisite_check(
|
||||
if len(gpu_available) < 1:
|
||||
_echo(f"Quantization requires at least 1 GPU (got {len(gpu_available)})", fg="red")
|
||||
ctx.exit(1)
|
||||
if env.framework_value != "pt":
|
||||
_echo("Quantization is currently only available for PyTorch models.", fg="red")
|
||||
ctx.exit(1)
|
||||
|
||||
if adapter_map and not is_peft_available():
|
||||
_echo(
|
||||
@@ -905,7 +901,7 @@ def start_bento(
|
||||
config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
|
||||
)
|
||||
|
||||
prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers)
|
||||
prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers)
|
||||
|
||||
# NOTE: This is to set current configuration
|
||||
start_env = os.environ.copy()
|
||||
@@ -1037,7 +1033,7 @@ def start_model(
|
||||
config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
|
||||
)
|
||||
|
||||
prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers)
|
||||
prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers)
|
||||
|
||||
# NOTE: This is to set current configuration
|
||||
start_env = os.environ.copy()
|
||||
@@ -1151,7 +1147,7 @@ def start_model(
|
||||
@output_option
|
||||
@quantize_option(click)
|
||||
@click.option("--machine", is_flag=True, default=False, hidden=True)
|
||||
@click.option("--implementation", type=click.Choice(["pt", "tf", "flax"]), default=None, hidden=True)
|
||||
@click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, hidden=True)
|
||||
def download_models_command(
|
||||
model: str,
|
||||
model_id: str | None,
|
||||
@@ -1193,7 +1189,7 @@ def download_models_command(
|
||||
> only use this option if you want the weight to be quantized by default. Note that OpenLLM also
|
||||
> support on-demand quantisation during initial startup.
|
||||
"""
|
||||
impl: t.Literal["pt", "tf", "flax"] = first_not_none(implementation, default=EnvVarMixin(model).framework_value)
|
||||
impl: LiteralRuntime = first_not_none(implementation, default=EnvVarMixin(model).framework_value)
|
||||
llm = openllm.infer_auto_class(impl).for_model(
|
||||
model,
|
||||
model_id=model_id,
|
||||
@@ -1263,7 +1259,7 @@ def _start(
|
||||
runtime: t.Literal["ggml", "transformers"] = ...,
|
||||
fast: bool = ...,
|
||||
adapter_map: dict[t.LiteralString, str | None] | None = ...,
|
||||
framework: t.Literal["flax", "tf", "pt"] | None = ...,
|
||||
framework: LiteralRuntime | None = ...,
|
||||
additional_args: ListStr | None = ...,
|
||||
_serve_grpc: bool = ...,
|
||||
__test__: t.Literal[False] = ...,
|
||||
@@ -1284,7 +1280,7 @@ def _start(
|
||||
runtime: t.Literal["ggml", "transformers"] = ...,
|
||||
fast: bool = ...,
|
||||
adapter_map: dict[t.LiteralString, str | None] | None = ...,
|
||||
framework: t.Literal["flax", "tf", "pt"] | None = ...,
|
||||
framework: LiteralRuntime | None = ...,
|
||||
additional_args: ListStr | None = ...,
|
||||
_serve_grpc: bool = ...,
|
||||
__test__: t.Literal[True] = ...,
|
||||
@@ -1304,7 +1300,7 @@ def _start(
|
||||
runtime: t.Literal["ggml", "transformers"] = "transformers",
|
||||
fast: bool = False,
|
||||
adapter_map: dict[t.LiteralString, str | None] | None = None,
|
||||
framework: t.Literal["flax", "tf", "pt"] | None = None,
|
||||
framework: LiteralRuntime | None = None,
|
||||
additional_args: ListStr | None = None,
|
||||
_serve_grpc: bool = False,
|
||||
__test__: bool = False,
|
||||
@@ -1615,6 +1611,13 @@ start, start_grpc, build, import_model, list_models = (
|
||||
help="The output format for 'openllm build'. By default this will build a BentoLLM. 'container' is the shortcut of 'openllm build && bentoml containerize'.",
|
||||
hidden=not get_debug_mode(),
|
||||
)
|
||||
@click.option(
|
||||
"--push",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
type=click.BOOL,
|
||||
help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.",
|
||||
)
|
||||
@click.pass_context
|
||||
def build_command(
|
||||
ctx: click.Context,
|
||||
@@ -1632,6 +1635,7 @@ def build_command(
|
||||
model_version: str | None,
|
||||
dockerfile_template: t.TextIO | None,
|
||||
format: t.Literal["bento", "container"],
|
||||
push: bool,
|
||||
**attrs: t.Any,
|
||||
):
|
||||
"""Package a given models into a Bento.
|
||||
@@ -1788,7 +1792,12 @@ def build_command(
|
||||
else:
|
||||
_echo(bento.tag)
|
||||
|
||||
if format == "container":
|
||||
if format == "container" and push:
|
||||
ctx.fail("'--format=container' and '--push' are mutually exclusive.")
|
||||
if push:
|
||||
client = BentoMLContainer.bentocloud_client.get()
|
||||
client.push_bento(bento)
|
||||
elif format == "container":
|
||||
backend = os.getenv("BENTOML_CONTAINERIZE_BACKEND", "docker")
|
||||
_echo(f"Building {bento} into a LLMContainer using backend '{backend}'", fg="magenta")
|
||||
if not bentoml.container.health(backend):
|
||||
|
||||
@@ -99,11 +99,8 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An
|
||||
|
||||
|
||||
def gpu_count() -> tuple[str, ...]:
|
||||
from bentoml._internal.resource import NvidiaGpuResource
|
||||
|
||||
cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
|
||||
if cuda_visible_devices is not None:
|
||||
return tuple(i for i in cuda_visible_devices.split(","))
|
||||
"""Return available GPU under system. Currently only supports NVIDIA GPUs."""
|
||||
from .._strategies import NvidiaGpuResource
|
||||
|
||||
return tuple(NvidiaGpuResource.from_system())
|
||||
|
||||
@@ -417,6 +414,7 @@ _import_structure = {
|
||||
"is_jupytext_available",
|
||||
"is_notebook_available",
|
||||
"is_triton_available",
|
||||
"is_autogptq_available",
|
||||
"require_backends",
|
||||
],
|
||||
}
|
||||
@@ -443,6 +441,7 @@ if t.TYPE_CHECKING:
|
||||
from .import_utils import OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES
|
||||
from .import_utils import DummyMetaclass as DummyMetaclass
|
||||
from .import_utils import EnvVarMixin as EnvVarMixin
|
||||
from .import_utils import is_autogptq_available as is_autogptq_available
|
||||
from .import_utils import is_bitsandbytes_available as is_bitsandbytes_available
|
||||
from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available
|
||||
from .import_utils import is_datasets_available as is_datasets_available
|
||||
|
||||
@@ -252,7 +252,7 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]):
|
||||
|
||||
|
||||
def generate_unique_filename(cls: type[t.Any], func_name: str):
|
||||
return f"<{cls.__name__} generated {func_name} {cls.__module__}." f"{getattr(cls, '__qualname__', cls.__name__)}>"
|
||||
return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>"
|
||||
|
||||
|
||||
def generate_function(
|
||||
@@ -332,6 +332,7 @@ def make_env_transformer(
|
||||
|
||||
|
||||
def gen_sdk(func: t.Callable[P, t.Any], name: str | None = None, **attrs: t.Any):
|
||||
"""Enhance function with nicer Repr."""
|
||||
from .representation import ReprMixin
|
||||
|
||||
if name is None:
|
||||
|
||||
@@ -56,16 +56,17 @@ else:
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
OPTIONAL_DEPENDENCIES = {
|
||||
"chatglm",
|
||||
"falcon",
|
||||
"mpt",
|
||||
"starcoder",
|
||||
"fine-tune",
|
||||
"flan-t5",
|
||||
"mpt",
|
||||
"falcon",
|
||||
"starcoder",
|
||||
"chatglm",
|
||||
"openai",
|
||||
"agents",
|
||||
"playground",
|
||||
"ggml",
|
||||
"agents",
|
||||
"openai",
|
||||
"playground",
|
||||
"gptq",
|
||||
}
|
||||
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
|
||||
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
|
||||
@@ -100,6 +101,7 @@ _triton_available = _is_package_available("triton")
|
||||
_jupyter_available = _is_package_available("jupyter")
|
||||
_jupytext_available = _is_package_available("jupytext")
|
||||
_notebook_available = _is_package_available("notebook")
|
||||
_autogptq_available = _is_package_available("auto-gptq")
|
||||
|
||||
|
||||
def is_transformers_supports_kbit() -> bool:
|
||||
@@ -146,6 +148,10 @@ def is_bitsandbytes_available():
|
||||
return _bitsandbytes_available
|
||||
|
||||
|
||||
def is_autogptq_available():
|
||||
return _autogptq_available
|
||||
|
||||
|
||||
def is_torch_available():
|
||||
global _torch_available
|
||||
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
|
||||
@@ -309,6 +315,11 @@ You can install it with pip: `pip install bitsandbytes`. Please note that you ma
|
||||
your runtime after installation.
|
||||
"""
|
||||
|
||||
AUTOGPTQ_IMPORT_ERROR = """{0} requires the auto-gptq library but it was not found in your environment.
|
||||
You can install it with pip: `pip install auto-gptq`. Please note that you may need to restart
|
||||
your runtime after installation.
|
||||
"""
|
||||
|
||||
BACKENDS_MAPPING = BackendOrderredDict(
|
||||
[
|
||||
("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
|
||||
@@ -320,6 +331,7 @@ BACKENDS_MAPPING = BackendOrderredDict(
|
||||
("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
|
||||
("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
|
||||
("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
|
||||
("auto-gptq", (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
import pytest
|
||||
@@ -25,14 +26,127 @@ import bentoml
|
||||
from bentoml._internal.resource import get_resource
|
||||
from openllm import _strategies as strategy
|
||||
from openllm._strategies import CascadingResourceStrategy
|
||||
from openllm._strategies import NvidiaGpuResource
|
||||
|
||||
|
||||
def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as mcls:
|
||||
mcls.setenv("CUDA_VISIBLE_DEVICES", "0,1")
|
||||
resource = NvidiaGpuResource.from_system()
|
||||
assert len(resource) == 2
|
||||
assert resource == ["0", "1"]
|
||||
mcls.delenv("CUDA_VISIBLE_DEVICES")
|
||||
|
||||
|
||||
def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as mcls:
|
||||
mcls.setenv("CUDA_VISIBLE_DEVICES", "0,2,-1,1")
|
||||
resource = NvidiaGpuResource.from_system()
|
||||
assert len(resource) == 2
|
||||
assert resource == ["0", "2"]
|
||||
mcls.delenv("CUDA_VISIBLE_DEVICES")
|
||||
|
||||
|
||||
def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as mcls:
|
||||
mcls.setenv("CUDA_VISIBLE_DEVICES", "-1")
|
||||
resource = NvidiaGpuResource.from_system()
|
||||
assert len(resource) == 0
|
||||
assert resource == []
|
||||
mcls.delenv("CUDA_VISIBLE_DEVICES")
|
||||
|
||||
|
||||
def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as mcls:
|
||||
mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43-ac33420d4628")
|
||||
resource = NvidiaGpuResource.from_system()
|
||||
assert len(resource) == 1
|
||||
assert resource == ["GPU-5ebe9f43-ac33420d4628"]
|
||||
mcls.delenv("CUDA_VISIBLE_DEVICES")
|
||||
with monkeypatch.context() as mcls:
|
||||
mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43,GPU-ac33420d4628")
|
||||
resource = NvidiaGpuResource.from_system()
|
||||
assert len(resource) == 2
|
||||
assert resource == ["GPU-5ebe9f43", "GPU-ac33420d4628"]
|
||||
mcls.delenv("CUDA_VISIBLE_DEVICES")
|
||||
with monkeypatch.context() as mcls:
|
||||
mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43,-1,GPU-ac33420d4628")
|
||||
resource = NvidiaGpuResource.from_system()
|
||||
assert len(resource) == 1
|
||||
assert resource == ["GPU-5ebe9f43"]
|
||||
mcls.delenv("CUDA_VISIBLE_DEVICES")
|
||||
with monkeypatch.context() as mcls:
|
||||
mcls.setenv("CUDA_VISIBLE_DEVICES", "MIG-GPU-5ebe9f43-ac33420d4628")
|
||||
resource = NvidiaGpuResource.from_system()
|
||||
assert len(resource) == 1
|
||||
assert resource == ["MIG-GPU-5ebe9f43-ac33420d4628"]
|
||||
mcls.delenv("CUDA_VISIBLE_DEVICES")
|
||||
|
||||
|
||||
def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as mcls:
|
||||
# to make this tests works with system that has GPU
|
||||
mcls.setenv("CUDA_VISIBLE_DEVICES", "")
|
||||
assert len(NvidiaGpuResource.from_system()) >= 0 # TODO: real from_system tests
|
||||
|
||||
assert pytest.raises(
|
||||
ValueError,
|
||||
NvidiaGpuResource.validate,
|
||||
[*NvidiaGpuResource.from_system(), 1],
|
||||
).match("Input list should be all string type.")
|
||||
assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match(
|
||||
"Input list should be all string type."
|
||||
)
|
||||
assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["GPU-5ebe9f43", "GPU-ac33420d4628"]).match(
|
||||
"Failed to parse available GPUs UUID"
|
||||
)
|
||||
|
||||
|
||||
def test_nvidia_gpu_validate_no_gpu_available():
|
||||
assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["0", "1"]).match("Failed to get device *")
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform != "darwin", reason="Test NVIDIA validation on Darwin only")
|
||||
def test_nvidia_gpu_validation_on_darwin():
|
||||
assert pytest.raises(RuntimeError, NvidiaGpuResource.validate, ["0"]).match(
|
||||
"GPU is not available on Darwin system."
|
||||
)
|
||||
|
||||
|
||||
def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as mcls:
|
||||
# to make this tests works with system that has GPU
|
||||
mcls.setenv("CUDA_VISIBLE_DEVICES", "")
|
||||
assert NvidiaGpuResource.from_spec(1) == ["0"]
|
||||
assert NvidiaGpuResource.from_spec("5") == ["0", "1", "2", "3", "4"]
|
||||
assert NvidiaGpuResource.from_spec(1) == ["0"]
|
||||
assert NvidiaGpuResource.from_spec(2) == ["0", "1"]
|
||||
assert NvidiaGpuResource.from_spec("3") == ["0", "1", "2"]
|
||||
assert NvidiaGpuResource.from_spec([1, 3]) == ["1", "3"]
|
||||
assert NvidiaGpuResource.from_spec(["1", "3"]) == ["1", "3"]
|
||||
assert NvidiaGpuResource.from_spec(-1) == []
|
||||
assert NvidiaGpuResource.from_spec("-1") == []
|
||||
assert NvidiaGpuResource.from_spec("") == []
|
||||
assert NvidiaGpuResource.from_spec("-2") == []
|
||||
assert NvidiaGpuResource.from_spec("GPU-288347ab") == ["GPU-288347ab"]
|
||||
assert NvidiaGpuResource.from_spec("GPU-288347ab,-1,GPU-ac33420d4628") == ["GPU-288347ab"]
|
||||
assert NvidiaGpuResource.from_spec("GPU-288347ab,GPU-ac33420d4628") == ["GPU-288347ab", "GPU-ac33420d4628"]
|
||||
assert NvidiaGpuResource.from_spec("MIG-GPU-288347ab") == ["MIG-GPU-288347ab"]
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
NvidiaGpuResource.from_spec((1, 2, 3))
|
||||
with pytest.raises(TypeError):
|
||||
NvidiaGpuResource.from_spec(1.5)
|
||||
with pytest.raises(ValueError):
|
||||
assert NvidiaGpuResource.from_spec(-2)
|
||||
|
||||
|
||||
class GPURunnable(bentoml.Runnable):
|
||||
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu")
|
||||
|
||||
|
||||
def unvalidated_get_resource(x: dict[str, t.Any], y: str):
|
||||
return get_resource(x, y, validate=False)
|
||||
def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False):
|
||||
return get_resource(x, y, validate=validate)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
|
||||
|
||||
285
tools/dependencies.py
Executable file
285
tools/dependencies.py
Executable file
@@ -0,0 +1,285 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2023 BentoML Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import typing as t
|
||||
|
||||
import inflection
|
||||
import tomlkit
|
||||
|
||||
import openllm
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from tomlkit.items import Array
|
||||
from tomlkit.items import Table
|
||||
|
||||
|
||||
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class Classifier:
|
||||
identifier: t.Dict[str, str] = dataclasses.field(
|
||||
default_factory=lambda: {
|
||||
"status": "Development Status",
|
||||
"environment": "Environment",
|
||||
"license": "License",
|
||||
"topic": "Topic",
|
||||
"os": "Operating System",
|
||||
"audience": "Intended Audience",
|
||||
"typing": "Typing",
|
||||
"language": "Programming Language",
|
||||
}
|
||||
)
|
||||
|
||||
joiner: str = " :: "
|
||||
|
||||
@staticmethod
|
||||
def status() -> dict[int, str]:
|
||||
return {
|
||||
v: status
|
||||
for v, status in zip(
|
||||
range(1, 8),
|
||||
[
|
||||
"1 - Planning",
|
||||
"2 - Pre-Alpha",
|
||||
"3 - Alpha",
|
||||
"4 - Beta",
|
||||
"5 - Production/Stable",
|
||||
"6 - Mature",
|
||||
"7 - Inactive",
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def apache() -> str:
|
||||
return Classifier.create_classifier("license", "OSI Approved", "Apache Software License")
|
||||
|
||||
@staticmethod
|
||||
def create_classifier(identifier: str, *decls: t.Any) -> str:
|
||||
cls_ = Classifier()
|
||||
if identifier not in cls_.identifier:
|
||||
raise ValueError(f"{identifier} is not yet supported (supported alias: {Classifier.identifier})")
|
||||
return cls_.joiner.join([cls_.identifier[identifier], *decls])
|
||||
|
||||
@staticmethod
|
||||
def create_python_classifier(
|
||||
implementation: list[str] | None = None, supported_version: list[str] | None = None
|
||||
) -> list[str]:
|
||||
if supported_version is None:
|
||||
supported_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
|
||||
if implementation is None:
|
||||
implementation = ["CPython", "PyPy"]
|
||||
base = [
|
||||
Classifier.create_classifier("language", "Python"),
|
||||
Classifier.create_classifier("language", "Python", "3"),
|
||||
]
|
||||
base.append(Classifier.create_classifier("language", "Python", "3", "Only"))
|
||||
base.extend([Classifier.create_classifier("language", "Python", version) for version in supported_version])
|
||||
base.extend(
|
||||
[Classifier.create_classifier("language", "Python", "Implementation", impl) for impl in implementation]
|
||||
)
|
||||
return base
|
||||
|
||||
@staticmethod
|
||||
def create_status_classifier(level: int) -> str:
|
||||
return Classifier.create_classifier("status", Classifier.status()[level])
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class Dependencies:
|
||||
name: str
|
||||
git_repo_url: t.Optional[str] = None
|
||||
branch: t.Optional[str] = None
|
||||
extensions: t.Optional[t.List[str]] = None
|
||||
subdirectory: t.Optional[str] = None
|
||||
requires_gpu: bool = False
|
||||
lower_constraint: t.Optional[str] = None
|
||||
platform: t.Optional[t.Tuple[t.Literal["Linux", "Windows", "Darwin"], t.Literal["eq", "ne"]]] = None
|
||||
|
||||
def with_options(self, **kwargs: t.Any) -> Dependencies:
|
||||
return dataclasses.replace(self, **kwargs)
|
||||
|
||||
@property
|
||||
def has_constraint(self) -> bool:
|
||||
return self.lower_constraint is not None
|
||||
|
||||
@property
|
||||
def pypi_extensions(self) -> str:
|
||||
return "" if self.extensions is None else f"[{','.join(self.extensions)}]"
|
||||
|
||||
@staticmethod
|
||||
def platform_restriction(platform: t.LiteralString, op: t.Literal["eq", "ne"] = "eq") -> str:
|
||||
return f'platform_system{"==" if op == "eq" else "!="}"{platform}"'
|
||||
|
||||
def to_str(self) -> str:
|
||||
deps: list[str] = []
|
||||
if self.lower_constraint is not None:
|
||||
deps.append(f"{self.name}{self.pypi_extensions}>={self.lower_constraint}")
|
||||
elif self.subdirectory is not None:
|
||||
deps.append(
|
||||
f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}"
|
||||
)
|
||||
elif self.branch is not None:
|
||||
deps.append(
|
||||
f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}"
|
||||
)
|
||||
else:
|
||||
deps.append(f"{self.name}{self.pypi_extensions}")
|
||||
|
||||
if self.platform:
|
||||
deps.append(self.platform_restriction(*self.platform))
|
||||
|
||||
return ";".join(deps)
|
||||
|
||||
@classmethod
|
||||
def from_tuple(cls, *decls: t.Any) -> Dependencies:
|
||||
return cls(*decls)
|
||||
|
||||
|
||||
_BENTOML_EXT = ["grpc", "io"]
|
||||
_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"]
|
||||
|
||||
_BASE_DEPENDENCIES = [
|
||||
Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"),
|
||||
Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"),
|
||||
Dependencies(name="optimum"),
|
||||
Dependencies(name="attrs", lower_constraint="23.1.0"),
|
||||
Dependencies(name="cattrs", lower_constraint="23.1.0"),
|
||||
Dependencies(name="orjson"),
|
||||
Dependencies(name="inflection"),
|
||||
Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"),
|
||||
Dependencies(name="httpx"),
|
||||
Dependencies(name="typing_extensions"),
|
||||
Dependencies(name="cuda-python", platform=("Darwin", "ne")),
|
||||
]
|
||||
|
||||
_NIGHTLY_MAPPING: dict[str, Dependencies] = {
|
||||
"bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT),
|
||||
"peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None),
|
||||
"transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT),
|
||||
"optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None),
|
||||
"accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None),
|
||||
"bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None),
|
||||
"trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None),
|
||||
"triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True),
|
||||
}
|
||||
|
||||
_ALL_RUNTIME_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
|
||||
FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
|
||||
FLAN_T5_DEPS = _ALL_RUNTIME_DEPS
|
||||
OPT_DEPS = _ALL_RUNTIME_DEPS
|
||||
MPT_DEPS = ["triton", "einops"]
|
||||
OPENAI_DEPS = ["openai", "tiktoken"]
|
||||
AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
|
||||
FALCON_DEPS = ["einops", "xformers", "safetensors"]
|
||||
STARCODER_DEPS = ["bitsandbytes"]
|
||||
CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"]
|
||||
PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
|
||||
GGML_DEPS = ["ctransformers"]
|
||||
GPTQ_DEPS = ["auto-gptq", "triton"]
|
||||
VLLM_DEPS = ["vllm"]
|
||||
|
||||
_base_requirements = {
|
||||
inflection.dasherize(name): config_cls.__openllm_requirements__
|
||||
for name, config_cls in openllm.CONFIG_MAPPING.items()
|
||||
if config_cls.__openllm_requirements__
|
||||
}
|
||||
|
||||
# shallow copy from locals()
|
||||
_locals = locals().copy()
|
||||
|
||||
# NOTE: update this table when adding new external dependencies
|
||||
# sync with openllm.utils.OPTIONAL_DEPENDENCIES
|
||||
_base_requirements.update(
|
||||
{v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES}
|
||||
)
|
||||
|
||||
fname = f"{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}"
|
||||
|
||||
|
||||
def create_classifiers() -> Array:
|
||||
arr = tomlkit.array()
|
||||
arr.extend(
|
||||
[
|
||||
Classifier.create_status_classifier(5),
|
||||
Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA"),
|
||||
Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "12"),
|
||||
Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.8"),
|
||||
Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.7"),
|
||||
Classifier.apache(),
|
||||
Classifier.create_classifier("topic", "Scientific/Engineering", "Artificial Intelligence"),
|
||||
Classifier.create_classifier("topic", "Software Development", "Libraries"),
|
||||
Classifier.create_classifier("os", "OS Independent"),
|
||||
Classifier.create_classifier("audience", "Developers"),
|
||||
Classifier.create_classifier("audience", "Science/Research"),
|
||||
Classifier.create_classifier("audience", "System Administrators"),
|
||||
Classifier.create_classifier("typing", "Typed"),
|
||||
*Classifier.create_python_classifier(),
|
||||
]
|
||||
)
|
||||
return arr.multiline(True)
|
||||
|
||||
|
||||
def create_optional_table() -> Table:
|
||||
table = tomlkit.table()
|
||||
table.update(_base_requirements)
|
||||
|
||||
all_array = tomlkit.array()
|
||||
all_array.extend([f"openllm[{k}]" for k in table.keys()])
|
||||
table.add("all", all_array.multiline(True))
|
||||
return table
|
||||
|
||||
|
||||
def main() -> int:
|
||||
with open(os.path.join(ROOT, "pyproject.toml"), "r") as f:
|
||||
pyproject = tomlkit.parse(f.read())
|
||||
|
||||
t.cast("Table", pyproject["project"]).update(
|
||||
{
|
||||
"classifiers": create_classifiers(),
|
||||
"optional-dependencies": create_optional_table(),
|
||||
"dependencies": tomlkit.array(f"{[v.to_str() for v in _BASE_DEPENDENCIES]}").multiline(True),
|
||||
}
|
||||
)
|
||||
with open(os.path.join(ROOT, "pyproject.toml"), "w") as f:
|
||||
f.write(tomlkit.dumps(pyproject))
|
||||
|
||||
with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f:
|
||||
f.write(f"# This file is generated by `{fname}`. DO NOT EDIT\n-e .[playground,flan-t5]\n")
|
||||
f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu])
|
||||
with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f:
|
||||
f.write(f"# This file is generated by `{fname}`. # DO NOT EDIT\n")
|
||||
f.write(
|
||||
"# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n"
|
||||
)
|
||||
f.write("-r nightly-requirements.txt\n-e .[all]\n")
|
||||
f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu])
|
||||
|
||||
if shutil.which("taplo"):
|
||||
return subprocess.check_call(["taplo", "format", os.path.join(ROOT, "pyproject.toml")])
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -1,160 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2023 BentoML Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import os
|
||||
import shutil
|
||||
import typing as t
|
||||
|
||||
import inflection
|
||||
import tomlkit
|
||||
|
||||
import openllm
|
||||
|
||||
|
||||
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class Dependencies:
    """Declarative description of one dependency, renderable as a PEP 508 string.

    A dependency is either a released package (``lower_constraint`` set) or a
    git checkout from GitHub (``git_repo_url`` as an ``owner/repo`` slug, with
    an optional ``branch`` and/or ``subdirectory``).
    """

    name: str
    git_repo_url: t.Optional[str] = None  # GitHub "owner/repo" slug, not a full URL
    branch: t.Optional[str] = None
    extensions: t.Optional[t.List[str]] = None  # pip extras, e.g. ["grpc", "io"]
    subdirectory: t.Optional[str] = None  # for repos whose package lives in a subfolder
    requires_gpu: bool = False  # routed to the GPU-only nightly requirements file
    lower_constraint: t.Optional[str] = None  # minimum released version, e.g. "1.0.22"

    def with_options(self, **kwargs: t.Any) -> Dependencies:
        """Return a copy of this dependency with the given fields replaced."""
        return dataclasses.replace(self, **kwargs)

    @property
    def has_constraint(self) -> bool:
        return self.lower_constraint is not None

    @property
    def pypi_extensions(self) -> str:
        """Extras in pip bracket syntax (``[a,b]``), or ``""`` when there are none."""
        return "" if self.extensions is None else f"[{','.join(self.extensions)}]"

    def to_str(self) -> str:
        """Render this dependency as a PEP 508 requirement string."""
        if self.lower_constraint is not None:
            return f"{self.name}{self.pypi_extensions}>={self.lower_constraint}"
        elif self.subdirectory is not None:
            # Fix: the declared branch used to be silently dropped when a
            # subdirectory was also set (e.g. triton pinned to "main" still
            # tracked the repo's default branch). Pin the branch when given.
            ref = f"@{self.branch}" if self.branch is not None else ""
            return (
                f"{self.name}{self.pypi_extensions} @ git+https://github.com/"
                f"{self.git_repo_url}.git{ref}#subdirectory={self.subdirectory}"
            )
        elif self.branch is not None:
            return f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}"
        else:
            return f"{self.name}{self.pypi_extensions}"

    @classmethod
    def from_tuple(cls, *decls: t.Any) -> Dependencies:
        """Positional constructor matching field declaration order."""
        return cls(*decls)
|
||||
|
||||
|
||||
# Extras requested for the bentoml / transformers base requirements.
_BENTOML_EXT = ["grpc", "io"]
_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"]

# Core runtime dependencies, written verbatim into [project.dependencies]
# in pyproject.toml by main().
_BASE_DEPENDENCIES = [
    Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"),
    Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"),
    Dependencies(name="optimum"),
    Dependencies(name="attrs", lower_constraint="23.1.0"),
    Dependencies(name="cattrs", lower_constraint="23.1.0"),
    Dependencies(name="orjson"),
    Dependencies(name="inflection"),
    Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"),
    Dependencies(name="httpx"),
    Dependencies(name="typing_extensions"),
]

# Nightly (git HEAD) pins written to nightly-requirements*.txt; entries with
# requires_gpu=True (positional arg 6, e.g. triton) go to the GPU-only file.
_NIGHTLY_MAPPING: dict[str, Dependencies] = {
    "bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT),
    "peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None),
    "transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT),
    "optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None),
    "accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None),
    "bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None),
    "trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None),
    "triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True),
}
|
||||
|
||||
# Per-feature optional dependency lists.
# NOTE: the variable names must keep the exact <FEATURE>_DEPS shape — they are
# resolved dynamically from locals() below via
# inflection.underscore(<feature>).upper() + "_DEPS".
FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
FLAN_T5_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
MPT_DEPS = ["triton", "einops"]
OPENAI_DEPS = ["openai", "tiktoken"]
AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
FALCON_DEPS = ["einops", "xformers", "safetensors"]
STARCODER_DEPS = ["bitsandbytes"]
CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"]
PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
GGML_DEPS = ["ctransformers"]
|
||||
|
||||
# Extras derived from each model config's declared requirements; configs with
# no (empty) requirements are skipped. Keys are dasherized model names as used
# in pip extras, e.g. "flan-t5".
_base_requirements = {
    inflection.dasherize(name): config_cls.__openllm_requirements__
    for name, config_cls in openllm.CONFIG_MAPPING.items()
    if config_cls.__openllm_requirements__
}

# shallow copy from locals()
# Snapshot module-level names so the comprehension below can look up the
# <FEATURE>_DEPS lists by constructed name without mutating locals() mid-iteration.
_locals = locals().copy()

# NOTE: update this table when adding new external dependencies
# sync with openllm.utils.OPTIONAL_DEPENDENCIES
_base_requirements.update(
    {v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES}
)
|
||||
|
||||
|
||||
def main() -> int:
    """Regenerate pyproject.toml dependency tables and nightly requirement files.

    Writes [project.dependencies] and [project.optional-dependencies] into
    pyproject.toml, regenerates nightly-requirements.txt (CPU) and
    nightly-requirements-gpu.txt (GPU-only pins), then formats pyproject.toml
    with taplo when available.

    Returns a process exit code (taplo's return code, or 0).
    """
    pyproject_path = os.path.join(ROOT, "pyproject.toml")
    with open(pyproject_path, "r") as f:
        pyproject = tomlkit.parse(f.read())

    # One extra per model/feature, plus an aggregate "all" extra covering them.
    table = tomlkit.table()
    for name, config in _base_requirements.items():
        table.add(name, config)

    table.add("all", [f"openllm[{k}]" for k in table.keys()])

    pyproject["project"]["optional-dependencies"] = table

    # write project dependencies
    pyproject["project"]["dependencies"] = [v.to_str() for v in _BASE_DEPENDENCIES]
    with open(pyproject_path, "w") as f:
        f.write(tomlkit.dumps(pyproject))

    with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f:
        f.write(
            "# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT\n-e .[playground,flan-t5]\n"
        )
        f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu])
    with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f:
        # Header normalized to match nightly-requirements.txt (previously it
        # contained a stray "# " before "DO NOT EDIT").
        f.write("# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT\n")
        f.write(
            "# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n"
        )
        f.write("-r nightly-requirements.txt\n-e .[all]\n")
        f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu])

    if shutil.which("taplo"):
        # subprocess.run with an argument list avoids shell interpretation of
        # the path (os.system would break on a ROOT containing spaces) and
        # returns the real exit status rather than os.system's encoded value.
        return subprocess.run(["taplo", "fmt", pyproject_path]).returncode

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
2
typings/cuda/__init__.pyi
generated
Normal file
2
typings/cuda/__init__.pyi
generated
Normal file
@@ -0,0 +1,2 @@
|
||||
from . import cuda as cuda
|
||||
from . import cudart as cudart
|
||||
26
typings/cuda/cuda.pyi
generated
Normal file
26
typings/cuda/cuda.pyi
generated
Normal file
@@ -0,0 +1,26 @@
|
||||
# Copyright 2023 BentoML Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from enum import Enum
|
||||
|
||||
class CUresult(Enum):
    """Minimal stub of cuda-python's CUresult driver status codes.

    Only the success value is needed for type-checking openllm's CUDA
    resource detection.
    """

    CUDA_SUCCESS = 0
|
||||
|
||||
class _CUMixin:
    """Common base for CUDA handle stubs exposing the raw pointer accessor."""

    def getPtr(self) -> int: ...
|
||||
|
||||
# Opaque CUDA device handle stub.
class CUdevice(_CUMixin): ...


# Driver API stubs: each call returns a (status, value) pair, following
# cuda-python's binding convention of returning CUresult alongside the result.
def cuDeviceGetCount() -> tuple[CUresult, int]: ...
def cuDeviceGet(dev: int) -> tuple[CUresult, CUdevice]: ...
|
||||
Reference in New Issue
Block a user