From b2dba6143fa614d4cd7bc00bc6272b071fb26a7d Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Sat, 15 Jul 2023 07:19:35 -0400
Subject: [PATCH] fix(resource): correctly parse CUDA_VISIBLE_DEVICES (#114)

---
 .github/workflows/binary-releases.yml |   1 +
 .github/workflows/ci.yml              |   4 +-
 .github/workflows/create-releases.yml |   2 +
 .github/workflows/release-notes.yml   |   2 +
 README.md                             |   2 +-
 changelog.d/114.fix.md                |   7 +
 hatch.toml                            |   3 +-
 nightly-requirements-gpu.txt          |   2 +-
 nightly-requirements.txt              |   2 +-
 pyproject.toml                        |  35 ++-
 src/openllm/_llm.py                   |  58 +++-
 src/openllm/_quantisation.py          |  48 +++-
 src/openllm/_strategies.py            | 390 +++++++++++++++++++-------
 src/openllm/_types.py                 |   3 +-
 src/openllm/cli.py                    |  33 ++-
 src/openllm/utils/__init__.py         |   9 +-
 src/openllm/utils/codegen.py          |   3 +-
 src/openllm/utils/import_utils.py     |  26 +-
 tests/strategies_test.py              | 118 +++++++-
 tools/dependencies.py                 | 285 +++++++++++++++++++
 tools/update-optional-dependencies.py | 160 -----------
 typings/cuda/__init__.pyi             |   2 +
 typings/cuda/cuda.pyi                 |  26 ++
 23 files changed, 903 insertions(+), 318 deletions(-)
 create mode 100644 changelog.d/114.fix.md
 create mode 100755 tools/dependencies.py
 delete mode 100755 tools/update-optional-dependencies.py
 create mode 100644 typings/cuda/__init__.pyi
 create mode 100644 typings/cuda/cuda.pyi
diff --git a/.github/workflows/binary-releases.yml b/.github/workflows/binary-releases.yml
index 62dcd133..9785b2e2 100644
--- a/.github/workflows/binary-releases.yml
+++ b/.github/workflows/binary-releases.yml
@@ -13,6 +13,7 @@ env:
   APP_NAME: openllm
   PYTHON_VERSION: '3.11'
   PYOXIDIZER_VERSION: '0.24.0'
+  HATCH_VERBOSE: 10
 jobs:
   python-artifacts:
     name: Build wheel and source distribution
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3017dfc1..366ff7ba 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,6 +24,7 @@ env:
   OPENLLM_DO_NOT_TRACK: True
   PYTHONUNBUFFERED: '1'
   STABLE_PYTHON_VERSION: '3.11'
+  HATCH_VERBOSE: 10
 # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
 defaults:
   run:
@@ -31,7 +32,6 @@ defaults:
 jobs:
   quality:
     runs-on: ubuntu-latest
-    if: github.event_name == 'pull_request'
     name: quality-check
     steps:
       - uses: actions/checkout@v3
@@ -43,6 +43,8 @@ jobs:
           python-version: ${{ env.STABLE_PYTHON_VERSION }}
       - name: Run type check
         run: hatch run typing
+      - if: failure()
+        run: echo "Not failing quality workflow."
   tests:
     runs-on: ubuntu-latest
     if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
diff --git a/.github/workflows/create-releases.yml b/.github/workflows/create-releases.yml
index 93d37a90..447b01e3 100644
--- a/.github/workflows/create-releases.yml
+++ b/.github/workflows/create-releases.yml
@@ -28,6 +28,8 @@ on:
 defaults:
   run:
     shell: bash --noprofile --norc -exo pipefail {0}
+env:
+  HATCH_VERBOSE: 10
 jobs:
   release:
     if: github.repository_owner == 'bentoml'
diff --git a/.github/workflows/release-notes.yml b/.github/workflows/release-notes.yml
index 8b572454..f451c4a4 100644
--- a/.github/workflows/release-notes.yml
+++ b/.github/workflows/release-notes.yml
@@ -25,6 +25,8 @@ on:
       tags:
         required: true
         type: string
+env:
+  HATCH_VERBOSE: 10
 defaults:
   run:
     shell: bash --noprofile --norc -exo pipefail {0}
diff --git a/README.md b/README.md
index 991ea2c4..b65bf043 100644
--- a/README.md
+++ b/README.md
@@ -266,7 +266,7 @@ pip install "openllm[mpt]"
 <td>
 
 ```bash
-pip install openllm
+pip install "openllm[opt]"
 ```
 
 </td>
diff --git a/changelog.d/114.fix.md b/changelog.d/114.fix.md
new file mode 100644
index 00000000..346670f3
--- /dev/null
+++ b/changelog.d/114.fix.md
@@ -0,0 +1,7 @@
+Fixes resources to correctly follows CUDA_VISIBLE_DEVICES spec
+
+OpenLLM now contains a standalone parser that mimic `torch.cuda` parser for set
+GPU devices. This parser will be used to parse both AMD and NVIDIA GPUs.
+
+`openllm` should now be able to parse `GPU-` and `MIG-` UUID from both
+configuration or spec.
diff --git a/hatch.toml b/hatch.toml
index 286109b7..9e9c06fa 100644
--- a/hatch.toml
+++ b/hatch.toml
@@ -26,8 +26,8 @@ features = ['flan-t5']
 [envs.default.scripts]
 changelog = "towncrier build --version main --draft"
 quality = [
+    "./tools/dependencies.py",
     "./tools/update-readme.py",
-    "./tools/update-optional-dependencies.py",
     "./tools/update-config-stubs.py",
     "./tools/update-models-import.py",
     "- ./tools/add-license-headers .",
@@ -42,6 +42,7 @@ extra-dependencies = [
 ]
 [envs.tests.scripts]
 _run_script = "pytest --cov --cov-report={env:COVERAGE_REPORT:term-missing} --cov-config=pyproject.toml"
+distributed = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -n 3 -r aR {args:tests}"
 models = "_run_script -r aR {args:tests/models}"
 python = "_run_script --reruns 5 --reruns-delay 3 --ignore tests/models -r aR {args:tests}"
 [envs.tests.overrides]
diff --git a/nightly-requirements-gpu.txt b/nightly-requirements-gpu.txt
index a7d80a06..ac6eac73 100644
--- a/nightly-requirements-gpu.txt
+++ b/nightly-requirements-gpu.txt
@@ -1,4 +1,4 @@
-# This file is generated by `./tools/update-optional-dependencies.py`. # DO NOT EDIT
+# This file is generated by `tools/dependencies.py`. # DO NOT EDIT
 # For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.
 -r nightly-requirements.txt
 -e .[all]
diff --git a/nightly-requirements.txt b/nightly-requirements.txt
index 9d6780f2..7ec37722 100644
--- a/nightly-requirements.txt
+++ b/nightly-requirements.txt
@@ -1,4 +1,4 @@
-# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT
+# This file is generated by `tools/dependencies.py`. DO NOT EDIT
 -e .[playground,flan-t5]
 bentoml[grpc,io] @ git+https://github.com/bentoml/bentoml.git@main
 peft @ git+https://github.com/huggingface/peft.git@main
diff --git a/pyproject.toml b/pyproject.toml
index 7ab907b0..3e1ede0f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,5 @@
+# NOTE: The following are managed by ./tools/dependencies.py
+# project.classifiers, project.dependencies, project.optional-dependencies
 [build-system]
 build-backend = "hatchling.build"
 requires = ["hatchling"]
@@ -29,18 +31,18 @@ classifiers = [
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
 ]
-# NOTE: The below is managed by ./tools/update-optional-dependencies.py
 dependencies = [
-    "bentoml[grpc,io]>=1.0.22",
-    "transformers[torch,tokenizers,accelerate]>=4.29.0",
-    "optimum",
-    "attrs>=23.1.0",
-    "cattrs>=23.1.0",
-    "orjson",
-    "inflection",
-    "tabulate[widechars]>=0.9.0",
-    "httpx",
-    "typing_extensions",
+    'bentoml[grpc,io]>=1.0.22',
+    'transformers[torch,tokenizers,accelerate]>=4.29.0',
+    'optimum',
+    'attrs>=23.1.0',
+    'cattrs>=23.1.0',
+    'orjson',
+    'inflection',
+    'tabulate[widechars]>=0.9.0',
+    'httpx',
+    'typing_extensions',
+    'cuda-python;platform_system!="Darwin"',
 ]
 description = 'OpenLLM: Operating LLMs in production'
 dynamic = ["version"]
@@ -62,9 +64,6 @@ license = "Apache-2.0"
 name = "openllm"
 readme = "README.md"
 requires-python = ">=3.8"
-
-# NOTE: Don't modify project.optional-dependencies
-# as it is managed by ./tools/update-optional-dependencies.py
 [project.optional-dependencies]
 agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
 all = [
@@ -72,22 +71,28 @@ all = [
     "openllm[falcon]",
     "openllm[mpt]",
     "openllm[starcoder]",
+    "openllm[opt]",
     "openllm[flan-t5]",
     "openllm[fine-tune]",
+    "openllm[vllm]",
     "openllm[agents]",
-    "openllm[playground]",
     "openllm[ggml]",
+    "openllm[playground]",
     "openllm[openai]",
+    "openllm[gptq]",
 ]
 chatglm = ["cpm-kernels", "sentencepiece"]
 falcon = ["einops", "xformers", "safetensors"]
 fine-tune = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
 flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
 ggml = ["ctransformers"]
+gptq = ["auto-gptq", "triton"]
 mpt = ["triton", "einops"]
 openai = ["openai", "tiktoken"]
+opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
 playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
 starcoder = ["bitsandbytes"]
+vllm = ["vllm"]
 
 [project.urls]
 Documentation = "https://github.com/bentoml/openllm#readme"
diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index feca2f80..6574d1bf 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -71,6 +71,7 @@ else:
     from typing_extensions import overload
 
 if t.TYPE_CHECKING:
+    import auto_gptq as autogptq
     import peft
     import torch
 
@@ -96,6 +97,8 @@ else:
     UserDictAny = collections.UserDict
     LLMRunnable = bentoml.Runnable
     LLMRunner = bentoml.Runner
+
+    autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
     transformers = LazyLoader("transformers", globals(), "transformers")
     torch = LazyLoader("torch", globals(), "torch")
     peft = LazyLoader("peft", globals(), "peft")
@@ -445,7 +448,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
     """The config instance to use for this LLM. This will be created based on config_class and available
     when initialising the LLM."""
 
-    quantization_config: transformers.BitsAndBytesConfig | None
+    quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
     """Quantisation config for quantised model on the fly."""
 
     _model_id: str
@@ -548,6 +551,44 @@ class LLM(LLMInterface[M, T], ReprMixin):
 
         openllm.serialisation.save_pretrained(self, save_directory, **attrs)
 
+    @classmethod
+    @overload
+    def from_pretrained(
+        cls,
+        model_id: str | None = ...,
+        model_version: str | None = ...,
+        llm_config: openllm.LLMConfig | None = ...,
+        *args: t.Any,
+        runtime: t.Literal["ggml", "transformers"] | None = ...,
+        quantize: t.Literal["int8", "int4"] = ...,
+        bettertransformer: str | bool | None = ...,
+        adapter_id: str | None = ...,
+        adapter_name: str | None = ...,
+        adapter_map: dict[str, str | None] | None = ...,
+        quantization_config: transformers.BitsAndBytesConfig | None = ...,
+        **attrs: t.Any,
+    ) -> LLM[M, T]:
+        ...
+
+    @classmethod
+    @overload
+    def from_pretrained(
+        cls,
+        model_id: str | None = ...,
+        model_version: str | None = ...,
+        llm_config: openllm.LLMConfig | None = ...,
+        *args: t.Any,
+        runtime: t.Literal["ggml", "transformers"] | None = ...,
+        quantize: t.Literal["gptq"] = ...,
+        bettertransformer: str | bool | None = ...,
+        adapter_id: str | None = ...,
+        adapter_name: str | None = ...,
+        adapter_map: dict[str, str | None] | None = ...,
+        quantization_config: autogptq.BaseQuantizeConfig | None = ...,
+        **attrs: t.Any,
+    ) -> LLM[M, T]:
+        ...
+
     @classmethod
     def from_pretrained(
         cls,
@@ -561,7 +602,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
         adapter_id: str | None = None,
         adapter_name: str | None = None,
         adapter_map: dict[str, str | None] | None = None,
-        quantization_config: transformers.BitsAndBytesConfig | None = None,
+        quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
         **attrs: t.Any,
     ) -> LLM[M, T]:
         """Instantiate a pretrained LLM.
@@ -577,6 +618,17 @@ class LLM(LLMInterface[M, T], ReprMixin):
 
         > Currently, the above two options are mutually exclusive.
 
+        #### Quantisation options
+
+        For customising options for quantisation config, ``openllm.LLM`` accepts all arbitrary arguments that is passed to ``transformers.BitsAndBytesConfig``
+        plus ``quantize`` value. For example, for ``int8`` quantisation, specify the following:
+        ```python
+        model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False)
+        ```
+
+        For all GPTQ-related options, it accepts all value prefixed with `gptq_*`. The parsed value then could be parsed
+        to ``auto_gptq.BaseQuantizeConfig``.
+
         ### Adapter options:
 
         > This is used in conjunction with the fine-tuning features
@@ -689,7 +741,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
         model_id: str,
         llm_config: openllm.LLMConfig,
         bettertransformer: bool | None,
-        quantization_config: transformers.BitsAndBytesConfig | None,
+        quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
         _adapters_mapping: AdaptersMapping | None,
         _tag: bentoml.Tag,
         _quantize_method: t.Literal["int8", "int4", "gptq"] | None,
diff --git a/src/openllm/_quantisation.py b/src/openllm/_quantisation.py
index 31c17de5..26c9ab4f 100644
--- a/src/openllm/_quantisation.py
+++ b/src/openllm/_quantisation.py
@@ -13,15 +13,26 @@
 # limitations under the License.
 from __future__ import annotations
 import logging
+import sys
 import typing as t
 
 from .utils import LazyLoader
+from .utils import is_autogptq_available
 from .utils import is_bitsandbytes_available
 from .utils import is_transformers_supports_kbit
 from .utils import pkg
 
 
+# NOTE: We need to do this so that overload can register
+# correct overloads to typing registry
+if sys.version_info[:2] >= (3, 11):
+    from typing import overload
+else:
+    from typing_extensions import overload
+
+
 if t.TYPE_CHECKING:
+    import auto_gptq as autogptq
     import torch
 
     import openllm
@@ -29,6 +40,7 @@ if t.TYPE_CHECKING:
 
     from ._types import DictStrAny
 else:
+    autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
     torch = LazyLoader("torch", globals(), "torch")
     transformers = LazyLoader("transformers", globals(), "transformers")
 
@@ -37,15 +49,38 @@ logger = logging.getLogger(__name__)
 QuantiseMode = t.Literal["int8", "int4", "gptq"]
 
 
+@overload
+def infer_quantisation_config(
+    cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["int8", "int4"], **attrs: t.Any
+) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
+    ...
+
+
+@overload
+def infer_quantisation_config(
+    cls: type[openllm.LLM[t.Any, t.Any]], quantise: t.Literal["gptq"], **attrs: t.Any
+) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
+    ...
+
+
 def infer_quantisation_config(
     cls: type[openllm.LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any
-) -> tuple[transformers.BitsAndBytesConfig | t.Any, DictStrAny]:
+) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
     # 8 bit configuration
     int8_threshold = attrs.pop("llm_int8_threshhold", 6.0)
     int8_enable_fp32_cpu_offload = attrs.pop("llm_int8_enable_fp32_cpu_offload", False)
     int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
     int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)
 
+    autogptq_attrs: DictStrAny = {
+        "bits": attrs.pop("gptq_bits", 4),
+        "group_size": attrs.pop("gptq_group_size", -1),
+        "damp_percent": attrs.pop("gptq_damp_percent", 0.01),
+        "desc_act": attrs.pop("gptq_desc_act", True),
+        "sym": attrs.pop("gptq_sym", True),
+        "true_sequential": attrs.pop("gptq_true_sequential", True),
+    }
+
     def create_int8_config(int8_skip_modules: list[str] | None):
         if int8_skip_modules is None:
             int8_skip_modules = []
@@ -94,8 +129,15 @@ def infer_quantisation_config(
             logger.warning("OpenLLM will fallback to 8-bit quantization.")
             quantisation_config = create_int8_config(int8_skip_modules)
     elif quantise == "gptq":
-        # TODO: support GPTQ loading quantization
-        raise NotImplementedError("GPTQ is not supported yet.")
+        if not is_autogptq_available():
+            logger.warning(
+                "'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment)."
+                " Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback "
+                "to int8 with bitsandbytes."
+            )
+            quantisation_config = create_int8_config(int8_skip_modules)
+        else:
+            quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
     else:
         raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")
 
diff --git a/src/openllm/_strategies.py b/src/openllm/_strategies.py
index 9ec9e270..c3c2198c 100644
--- a/src/openllm/_strategies.py
+++ b/src/openllm/_strategies.py
@@ -13,11 +13,15 @@
 # limitations under the License.
 
 from __future__ import annotations
+import functools
+import inspect
 import logging
 import math
 import os
 import sys
+import types
 import typing as t
+import warnings
 
 import psutil
 
@@ -27,62 +31,113 @@ from bentoml._internal.resource import system_resources
 from bentoml._internal.runner.strategy import THREAD_ENVS
 from bentoml._internal.runner.strategy import Strategy
 
-from .exceptions import OpenLLMException
+from .utils import LazyLoader
+from .utils import LazyType
 from .utils import ReprMixin
 
 
 if t.TYPE_CHECKING:
+    import torch
+
     import bentoml
 
     ListIntStr = list[int | str]
+
+    class DynResource(Resource[t.List[str]], resource_id=""):
+        resource_id: t.ClassVar[str]
+
 else:
+    DynResource = Resource[t.List[str]]
+    torch = LazyLoader("torch", globals(), "torch")
     ListIntStr = list
 
+# NOTE: We need to do this so that overload can register
+# correct overloads to typing registry
+if sys.version_info[:2] >= (3, 11):
+    from typing import overload
+else:
+    from typing_extensions import overload
+
 logger = logging.getLogger(__name__)
 
 
-class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"):
-    @classmethod
-    def from_spec(cls, spec: t.Any) -> list[str]:
-        if not isinstance(spec, (int, str, list)):
-            raise TypeError("AMD GPU device IDs must be int, str or a list specifing the exact GPUs to use.")
+def _strtoul(s: str) -> int:
+    """Return -1 or positive integer sequence string starts with,."""
+    if not s:
+        return -1
+    for idx, c in enumerate(s):
+        if not (c.isdigit() or (idx == 0 and c in "+-")):
+            break
+        if idx + 1 == len(s):
+            idx += 1  # noqa: PLW2901
+    return int(s[:idx]) if idx > 0 else -1  # type: ignore (idx will be set via enumerate)
 
-        try:
-            if isinstance(spec, int):
-                if spec == -1:
-                    return []
-                if spec < -1:
-                    raise ValueError
-                return [str(i) for i in range(spec)]
-            elif isinstance(spec, str):
-                try:
-                    return cls.from_spec(int(spec))
-                except ValueError:
-                    if spec.startswith("GPU"):
-                        return [spec]
-                    raise ValueError
-            else:
-                return [str(x) for x in spec]
-        except ValueError:
-            raise OpenLLMException(f"Invalid AMD GPU resource limit '{spec}'.")
 
-    @classmethod
-    def from_system(cls) -> list[str]:
-        """Retrieve AMD GPU from system, currently only supports on Linux.
-
-        This assumes that ROCm is setup correctly.
-        """
-        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
-        if cuda_visible_devices in ("", "-1"):
+def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
+    rcs: list[str] = []
+    for elem in lst.split(","):
+        # Repeated id results in empty set
+        if elem in rcs:
             return []
-        if cuda_visible_devices is not None:
-            cuda_visible_devices = cuda_visible_devices.split(",")
-            if "-1" in cuda_visible_devices:
-                cuda_visible_devices = cuda_visible_devices[: cuda_visible_devices.index("-1")]
-            return cuda_visible_devices
+        # Anything other but prefix is ignored
+        if not elem.startswith(prefix):
+            break
+        rcs.append(elem)
+    return rcs
 
+
+_STACK_LEVEL = 3
+
+
+@overload
+def _parse_visible_devices(default_var: str | None = ..., respect_env: t.Literal[True] = True) -> list[str] | None:
+    ...
+
+
+@overload
+def _parse_visible_devices(default_var: str = ..., respect_env: t.Literal[False] = False) -> list[str]:
+    ...
+
+
+def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
+    """CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
+    if respect_env:
+        spec = os.getenv("CUDA_VISIBLE_DEVICES", default_var)
+        if not spec:
+            return
+    else:
+        assert default_var is not None, "spec is required to be not None when parsing spec."  # noqa: S101
+        spec = default_var
+
+    if spec.startswith("GPU-"):
+        return _parse_list_with_prefix(spec, "GPU-")
+    if spec.startswith("MIG-"):
+        return _parse_list_with_prefix(spec, "MIG-")
+
+    # XXX: We to somehow handle cases such as '100m'
+    # CUDA_VISIBLE_DEVICES uses something like strtoul
+    # which makes `1gpu2,2ampere` is equivalent to `1,2`
+    rc: list[int] = []
+    for el in spec.split(","):
+        x = _strtoul(el.strip())
+        # Repeated ordinal results in empty set
+        if x in rc:
+            return []
+        # Negative value aborts the sequence
+        if x < 0:
+            break
+        rc.append(x)
+    return [str(i) for i in rc]
+
+
+def _from_system(cls: type[DynResource]) -> list[str]:
+    """Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation.
+
+    It relies on torch.cuda implementation and in turns respect CUDA_VISIBLE_DEVICES.
+    """
+    if cls.resource_id == "amd.com/gpu":
         if not psutil.LINUX:
-            logger.debug("AMD GPU resource is only supported on Linux.")
+            warnings.warn("AMD GPUs is currently only supported on Linux.", stacklevel=_STACK_LEVEL)
             return []
 
         # ROCm does not currently have the rocm_smi wheel.
@@ -90,37 +145,169 @@ class AmdGpuResource(Resource[t.List[str]], resource_id="amd.com/gpu"):
         # we don't want to use CLI because parsing is a pain.
         sys.path.append("/opt/rocm/libexec/rocm_smi")
         try:
-            from ctypes import byref
-            from ctypes import c_uint32
-
             # refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py
-            from rsmiBindings import rocmsmi
-            from rsmiBindings import rsmi_status_t
-
-            num = c_uint32(0)
-            ret = rocmsmi.rsmi_num_monitor_devices(byref(num))
-            if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
-                return [str(i) for i in range(num.value)]
-            return []
-        except Exception as err:
-            logger.debug("Failed to setup AMD GPU resource: %s", err)
+            from rsmiBindings import rocmsmi as rocmsmi
+        except (ModuleNotFoundError, ImportError):
+            # In this case the binary is not found, returning empty list
             return []
         finally:
             sys.path.remove("/opt/rocm/libexec/rocm_smi")
+    visible_devices = _parse_visible_devices()
+    if visible_devices is None:
+        return [str(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else []
+    return visible_devices
 
-    @classmethod
-    def validate(cls, val: list[str]):
-        for gpu_index_or_literal in val:
-            try:
-                idx = int(gpu_index_or_literal)
-            except ValueError:
-                raise OpenLLMException(f"Invalid AMD GPU device index: {val}")
-            if int(idx) < 0:
-                raise OpenLLMException(f"Negative GPU device in {val}.")
-            if int(idx) >= len(cls.from_system()):
-                raise OpenLLMException(
-                    f"GPU device index in {val} is greater than the system available: {cls.from_system()}"
-                )
+
+@overload
+def _from_spec(cls: type[DynResource], spec: int) -> list[str]:
+    ...
+
+
+@overload
+def _from_spec(cls: type[DynResource], spec: ListIntStr) -> list[str]:
+    ...
+
+
+@overload
+def _from_spec(cls: type[DynResource], spec: str) -> list[str]:
+    ...
+
+
+def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
+    """Shared mixin implementation for OpenLLM's NVIDIA and AMD resource implementation.
+
+    The parser behaves similar to how PyTorch handles CUDA_VISIBLE_DEVICES. This means within
+    BentoML's resource configuration, its behaviour is similar to CUDA_VISIBLE_DEVICES.
+    """
+    if isinstance(spec, int):
+        if spec in (-1, 0):
+            return []
+        if spec < -1:
+            raise ValueError("Spec cannot be < -1.")
+        return [str(i) for i in range(spec)]
+    elif isinstance(spec, str):
+        if not spec:
+            return []
+        if spec.isdigit():
+            spec = ",".join([str(i) for i in range(_strtoul(spec))])
+        return _parse_visible_devices(spec, respect_env=False)
+    elif LazyType(ListIntStr).isinstance(spec):
+        return [str(x) for x in spec]
+    else:
+        raise TypeError(
+            f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
+        )
+
+
+@functools.lru_cache
+def _raw_uuid_nvml() -> list[str] | None:
+    """Return list of device UUID as reported by NVML or None if NVML discovery/initialization failed."""
+    try:
+        from cuda import cuda
+    except ImportError:
+        if sys.platform == "darwin":
+            raise RuntimeError("GPU is not available on Darwin system.") from None
+        raise RuntimeError(
+            "Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly."
+        ) from None
+
+    from ctypes import CDLL
+    from ctypes import byref
+    from ctypes import c_void_p
+    from ctypes import create_string_buffer
+
+    nvml_h = CDLL("libnvidia-ml.so.1")
+    rc = nvml_h.nvmlInit()
+    if rc != 0:
+        warnings.warn("Can't initialize NVML", stacklevel=_STACK_LEVEL)
+        return
+    err, dev_count = cuda.cuDeviceGetCount()
+    if err != cuda.CUresult.CUDA_SUCCESS:
+        warnings.warn("Failed to get available device from system.", stacklevel=_STACK_LEVEL)
+        return
+    uuids: list[str] = []
+    for idx in range(dev_count):
+        dev_id = c_void_p()
+        rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
+        if rc != 0:
+            warnings.warn(f"Failed to get device handle for {idx}", stacklevel=_STACK_LEVEL)
+            return
+        buf_len = 96
+        buf = create_string_buffer(buf_len)
+        rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
+        if rc != 0:
+            warnings.warn(f"Failed to get device UUID for {idx}", stacklevel=_STACK_LEVEL)
+            return
+        uuids.append(buf.raw.decode("ascii").strip("\0"))
+    del nvml_h
+    return uuids
+
+
+def _validate(cls: type[DynResource], val: list[t.Any]):
+    if cls.resource_id == "amd.com/gpu":
+        raise RuntimeError(
+            "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
+        )
+    if not all(isinstance(i, str) for i in val):
+        raise ValueError("Input list should be all string type.")
+
+    try:
+        from cuda import cuda
+    except ImportError:
+        if sys.platform == "darwin":
+            raise RuntimeError("GPU is not available on Darwin system.") from None
+        raise RuntimeError(
+            "Failed to initialise CUDA runtime binding. Make sure that 'cuda-python' is setup correctly."
+        ) from None
+    # correctly parse handle
+    for el in val:
+        if el.startswith("GPU-") or el.startswith("MIG-"):
+            uuids = _raw_uuid_nvml()
+            if uuids is None:
+                raise ValueError("Failed to parse available GPUs UUID")
+            if el not in uuids:
+                raise ValueError(f"Given UUID {el} is not found with available UUID (available: {uuids})")
+        elif el.isdigit():
+            err, _ = cuda.cuDeviceGet(int(el))
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise ValueError(f"Failed to get device {el}")
+
+
+def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
+    return types.new_class(
+        name,
+        (DynResource, ReprMixin),
+        {"resource_id": resource_kind},
+        lambda ns: ns.update(
+            {
+                "resource_id": resource_kind,
+                "from_spec": classmethod(_from_spec),
+                "from_system": classmethod(_from_system),
+                "validate": classmethod(_validate),
+                "__repr_keys__": property(lambda _: {"resource_id"}),
+                "__doc__": inspect.cleandoc(docstring),
+                "__module__": "openllm._strategies",
+            }
+        ),
+    )
+
+
+NvidiaGpuResource = _make_resource_class(
+    "NvidiaGpuResource",
+    "nvidia.com/gpu",
+    """NVIDIA GPU resource.
+
+    This is a modified version of internal's BentoML's NvidiaGpuResource
+    where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",
+)
+AmdGpuResource = _make_resource_class(
+    "AmdGpuResource",
+    "amd.com/gpu",
+    """AMD GPU resource.
+
+    Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
+    ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",
+)
 
 
 class CascadingResourceStrategy(Strategy, ReprMixin):
@@ -147,15 +334,21 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
         if resource_request is None:
             resource_request = system_resources()
 
-        # use nvidia gpu
-        nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu")
-        if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
-            return math.ceil(len(nvidia_gpus) * workers_per_resource)
+        def _get_gpu_count(typ: list[str] | None, kind: str):
+            if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
+                return math.ceil(len(typ) * workers_per_resource)
 
-        # use amd gpu
-        amd_gpus = get_resource(resource_request, "amd.com/gpu")
-        if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
-            return math.ceil(len(amd_gpus) * workers_per_resource)
+        # use NVIDIA
+        kind = "nvidia.com/gpu"
+        count = _get_gpu_count(get_resource(resource_request, kind), kind)
+        if count:
+            return count
+
+        # use AMD
+        kind = "amd.com/gpu"
+        count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind)
+        if count:
+            return count
 
         # use CPU
         cpus = get_resource(resource_request, "cpu")
@@ -203,36 +396,32 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
         if resource_request is None:
             resource_request = system_resources()
 
-        # use nvidia gpu
-        nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu")
-        if nvidia_gpus is not None and len(nvidia_gpus) > 0 and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
-            dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, nvidia_gpus, worker_index)
+        # use NVIDIA
+        kind = "nvidia.com/gpu"
+        typ = get_resource(resource_request, kind)
+        if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
             if disabled:
                 logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
                 environ["CUDA_VISIBLE_DEVICES"] = cuda_env
                 return environ
-            environ["CUDA_VISIBLE_DEVICES"] = dev
-            logger.info(
-                "Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s",
-                worker_index,
-                dev,
+            environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(
+                workers_per_resource, typ, worker_index
             )
+            logger.debug("Environ for worker %s: %s", worker_index, environ)
             return environ
 
-        # use amd gpu
-        amd_gpus = get_resource(resource_request, "amd.com/gpu")
-        if amd_gpus is not None and len(amd_gpus) > 0 and "amd.com/gpu" in runnable_class.SUPPORTED_RESOURCES:
-            dev = cls.transpile_workers_to_cuda_visible_devices(workers_per_resource, amd_gpus, worker_index)
+        # use AMD
+        kind = "amd.com/gpu"
+        typ = get_resource(resource_request, kind, validate=False)
+        if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
             if disabled:
                 logger.debug("CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.", worker_index)
                 environ["CUDA_VISIBLE_DEVICES"] = cuda_env
                 return environ
-            environ["CUDA_VISIBLE_DEVICES"] = dev
-            logger.info(
-                "Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s",
-                worker_index,
-                dev,
+            environ["CUDA_VISIBLE_DEVICES"] = cls.transpile_workers_to_cuda_envvar(
+                workers_per_resource, typ, worker_index
             )
+            logger.debug("Environ for worker %s: %s", worker_index, environ)
             return environ
 
         # use CPU
@@ -243,23 +432,16 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
                 thread_count = math.ceil(cpus)
                 for thread_env in THREAD_ENVS:
                     environ[thread_env] = os.getenv(thread_env, str(thread_count))
-                logger.info(
-                    "Environ for worker %d: set CPU thread count to %d",
-                    worker_index,
-                    thread_count,
-                )
-                return environ
-            else:
-                for thread_env in THREAD_ENVS:
-                    environ[thread_env] = os.getenv(thread_env, "1")
+                logger.debug("Environ for worker %s: %s", worker_index, environ)
                 return environ
+            for thread_env in THREAD_ENVS:
+                environ[thread_env] = os.getenv(thread_env, "1")
+            return environ
 
         return environ
 
     @staticmethod
-    def transpile_workers_to_cuda_visible_devices(
-        workers_per_resource: float | int, gpus: list[str], worker_index: int
-    ) -> str:
+    def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str:
         # Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
         if isinstance(workers_per_resource, float):
             # NOTE: We hit this branch when workers_per_resource is set to
@@ -287,9 +469,9 @@ class CascadingResourceStrategy(Strategy, ReprMixin):
             dev = ",".join(assigned_gpu)
         else:
             idx = worker_index // workers_per_resource
-            if len(gpus) == idx:
+            if idx >= len(gpus):
                 raise ValueError(
                     f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}"
                 )
-            dev = gpus[idx]
+            dev = str(gpus[idx])
         return dev
diff --git a/src/openllm/_types.py b/src/openllm/_types.py
index a67d1876..0947c69d 100644
--- a/src/openllm/_types.py
+++ b/src/openllm/_types.py
@@ -30,6 +30,7 @@ from ._configuration import AdapterType
 
 
 if t.TYPE_CHECKING:
+    import auto_gptq as autogptq
     import click
     import peft
 
@@ -155,7 +156,7 @@ class LLMRunner(bentoml.Runner):
 
 class LLMInitAttrs(t.TypedDict):
     config: openllm.LLMConfig
-    quantization_config: transformers.BitsAndBytesConfig | None
+    quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
     model_id: str
     runtime: t.Literal["ggml", "transformers"]
     model_decls: TupleAny
diff --git a/src/openllm/cli.py b/src/openllm/cli.py
index 712fca69..3d4b5925 100644
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -773,7 +773,6 @@ def noop_command(
 def prerequisite_check(
     ctx: click.Context,
     llm_config: openllm.LLMConfig,
-    env: EnvVarMixin,
     gpu_available: tuple[str, ...],
     quantize: t.LiteralString | None,
     adapter_map: dict[str, str | None] | None,
@@ -785,9 +784,6 @@ def prerequisite_check(
         if len(gpu_available) < 1:
             _echo(f"Quantization requires at least 1 GPU (got {len(gpu_available)})", fg="red")
             ctx.exit(1)
-        if env.framework_value != "pt":
-            _echo("Quantization is currently only available for PyTorch models.", fg="red")
-            ctx.exit(1)
 
     if adapter_map and not is_peft_available():
         _echo(
@@ -905,7 +901,7 @@ def start_bento(
             config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
         )
 
-        prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers)
+        prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers)
 
         # NOTE: This is to set current configuration
         start_env = os.environ.copy()
@@ -1037,7 +1033,7 @@ def start_model(
             config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
         )
 
-        prerequisite_check(ctx, config, env, gpu_available, quantize, adapter_map, num_workers)
+        prerequisite_check(ctx, config, gpu_available, quantize, adapter_map, num_workers)
 
         # NOTE: This is to set current configuration
         start_env = os.environ.copy()
@@ -1151,7 +1147,7 @@ def start_model(
 @output_option
 @quantize_option(click)
 @click.option("--machine", is_flag=True, default=False, hidden=True)
-@click.option("--implementation", type=click.Choice(["pt", "tf", "flax"]), default=None, hidden=True)
+@click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, hidden=True)
 def download_models_command(
     model: str,
     model_id: str | None,
@@ -1193,7 +1189,7 @@ def download_models_command(
     > only use this option if you want the weight to be quantized by default. Note that OpenLLM also
     > support on-demand quantisation during initial startup.
     """
-    impl: t.Literal["pt", "tf", "flax"] = first_not_none(implementation, default=EnvVarMixin(model).framework_value)
+    impl: LiteralRuntime = first_not_none(implementation, default=EnvVarMixin(model).framework_value)
     llm = openllm.infer_auto_class(impl).for_model(
         model,
         model_id=model_id,
@@ -1263,7 +1259,7 @@ def _start(
     runtime: t.Literal["ggml", "transformers"] = ...,
     fast: bool = ...,
     adapter_map: dict[t.LiteralString, str | None] | None = ...,
-    framework: t.Literal["flax", "tf", "pt"] | None = ...,
+    framework: LiteralRuntime | None = ...,
     additional_args: ListStr | None = ...,
     _serve_grpc: bool = ...,
     __test__: t.Literal[False] = ...,
@@ -1284,7 +1280,7 @@ def _start(
     runtime: t.Literal["ggml", "transformers"] = ...,
     fast: bool = ...,
     adapter_map: dict[t.LiteralString, str | None] | None = ...,
-    framework: t.Literal["flax", "tf", "pt"] | None = ...,
+    framework: LiteralRuntime | None = ...,
     additional_args: ListStr | None = ...,
     _serve_grpc: bool = ...,
     __test__: t.Literal[True] = ...,
@@ -1304,7 +1300,7 @@ def _start(
     runtime: t.Literal["ggml", "transformers"] = "transformers",
     fast: bool = False,
     adapter_map: dict[t.LiteralString, str | None] | None = None,
-    framework: t.Literal["flax", "tf", "pt"] | None = None,
+    framework: LiteralRuntime | None = None,
     additional_args: ListStr | None = None,
     _serve_grpc: bool = False,
     __test__: bool = False,
@@ -1615,6 +1611,13 @@ start, start_grpc, build, import_model, list_models = (
     help="The output format for 'openllm build'. By default this will build a BentoLLM. 'container' is the shortcut of 'openllm build && bentoml containerize'.",
     hidden=not get_debug_mode(),
 )
+@click.option(
+    "--push",
+    default=False,
+    is_flag=True,
+    type=click.BOOL,
+    help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.",
+)
 @click.pass_context
 def build_command(
     ctx: click.Context,
@@ -1632,6 +1635,7 @@ def build_command(
     model_version: str | None,
     dockerfile_template: t.TextIO | None,
     format: t.Literal["bento", "container"],
+    push: bool,
     **attrs: t.Any,
 ):
     """Package a given models into a Bento.
@@ -1788,7 +1792,12 @@ def build_command(
     else:
         _echo(bento.tag)
 
-    if format == "container":
+    if format == "container" and push:
+        ctx.fail("'--format=container' and '--push' are mutually exclusive.")
+    if push:
+        client = BentoMLContainer.bentocloud_client.get()
+        client.push_bento(bento)
+    elif format == "container":
         backend = os.getenv("BENTOML_CONTAINERIZE_BACKEND", "docker")
         _echo(f"Building {bento} into a LLMContainer using backend '{backend}'", fg="magenta")
         if not bentoml.container.health(backend):
diff --git a/src/openllm/utils/__init__.py b/src/openllm/utils/__init__.py
index 2a1f3a45..69ba80ca 100644
--- a/src/openllm/utils/__init__.py
+++ b/src/openllm/utils/__init__.py
@@ -99,11 +99,8 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An
 
 
 def gpu_count() -> tuple[str, ...]:
-    from bentoml._internal.resource import NvidiaGpuResource
-
-    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
-    if cuda_visible_devices is not None:
-        return tuple(i for i in cuda_visible_devices.split(","))
+    """Return available GPU under system. Currently only supports NVIDIA GPUs."""
+    from .._strategies import NvidiaGpuResource
 
     return tuple(NvidiaGpuResource.from_system())
 
@@ -417,6 +414,7 @@ _import_structure = {
         "is_jupytext_available",
         "is_notebook_available",
         "is_triton_available",
+        "is_autogptq_available",
         "require_backends",
     ],
 }
@@ -443,6 +441,7 @@ if t.TYPE_CHECKING:
     from .import_utils import OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES
     from .import_utils import DummyMetaclass as DummyMetaclass
     from .import_utils import EnvVarMixin as EnvVarMixin
+    from .import_utils import is_autogptq_available as is_autogptq_available
     from .import_utils import is_bitsandbytes_available as is_bitsandbytes_available
     from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available
     from .import_utils import is_datasets_available as is_datasets_available
diff --git a/src/openllm/utils/codegen.py b/src/openllm/utils/codegen.py
index 56d0ee75..ffee6a0d 100644
--- a/src/openllm/utils/codegen.py
+++ b/src/openllm/utils/codegen.py
@@ -252,7 +252,7 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]):
 
 
 def generate_unique_filename(cls: type[t.Any], func_name: str):
-    return f"<{cls.__name__} generated {func_name} {cls.__module__}." f"{getattr(cls, '__qualname__', cls.__name__)}>"
+    return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>"
 
 
 def generate_function(
@@ -332,6 +332,7 @@ def make_env_transformer(
 
 
 def gen_sdk(func: t.Callable[P, t.Any], name: str | None = None, **attrs: t.Any):
+    """Enhance function with nicer Repr."""
     from .representation import ReprMixin
 
     if name is None:
diff --git a/src/openllm/utils/import_utils.py b/src/openllm/utils/import_utils.py
index b17b6409..cfbc4dfd 100644
--- a/src/openllm/utils/import_utils.py
+++ b/src/openllm/utils/import_utils.py
@@ -56,16 +56,17 @@ else:
 logger = logging.getLogger(__name__)
 
 OPTIONAL_DEPENDENCIES = {
+    "chatglm",
+    "falcon",
+    "mpt",
+    "starcoder",
     "fine-tune",
     "flan-t5",
-    "mpt",
-    "falcon",
-    "starcoder",
-    "chatglm",
-    "openai",
-    "agents",
-    "playground",
     "ggml",
+    "agents",
+    "openai",
+    "playground",
+    "gptq",
 }
 ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
 ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
@@ -100,6 +101,7 @@ _triton_available = _is_package_available("triton")
 _jupyter_available = _is_package_available("jupyter")
 _jupytext_available = _is_package_available("jupytext")
 _notebook_available = _is_package_available("notebook")
+_autogptq_available = _is_package_available("auto-gptq")
 
 
 def is_transformers_supports_kbit() -> bool:
@@ -146,6 +148,10 @@ def is_bitsandbytes_available():
     return _bitsandbytes_available
 
 
+def is_autogptq_available():
+    return _autogptq_available
+
+
 def is_torch_available():
     global _torch_available
     if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
@@ -309,6 +315,11 @@ You can install it with pip: `pip install bitsandbytes`. Please note that you ma
 your runtime after installation.
 """
 
+AUTOGPTQ_IMPORT_ERROR = """{0} requires the auto-gptq library but it was not found in your environment.
+You can install it with pip: `pip install auto-gptq`. Please note that you may need to restart
+your runtime after installation.
+"""
+
 BACKENDS_MAPPING = BackendOrderredDict(
     [
         ("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
@@ -320,6 +331,7 @@ BACKENDS_MAPPING = BackendOrderredDict(
         ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
         ("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
         ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
+        ("auto-gptq", (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)),
     ]
 )
 
diff --git a/tests/strategies_test.py b/tests/strategies_test.py
index 8ae288a0..0a3f4252 100644
--- a/tests/strategies_test.py
+++ b/tests/strategies_test.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import annotations
+import sys
 import typing as t
 
 import pytest
@@ -25,14 +26,127 @@ import bentoml
 from bentoml._internal.resource import get_resource
 from openllm import _strategies as strategy
 from openllm._strategies import CascadingResourceStrategy
+from openllm._strategies import NvidiaGpuResource
+
+
+def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as mcls:
+        mcls.setenv("CUDA_VISIBLE_DEVICES", "0,1")
+        resource = NvidiaGpuResource.from_system()
+        assert len(resource) == 2
+        assert resource == ["0", "1"]
+        mcls.delenv("CUDA_VISIBLE_DEVICES")
+
+
+def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as mcls:
+        mcls.setenv("CUDA_VISIBLE_DEVICES", "0,2,-1,1")
+        resource = NvidiaGpuResource.from_system()
+        assert len(resource) == 2
+        assert resource == ["0", "2"]
+        mcls.delenv("CUDA_VISIBLE_DEVICES")
+
+
+def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as mcls:
+        mcls.setenv("CUDA_VISIBLE_DEVICES", "-1")
+        resource = NvidiaGpuResource.from_system()
+        assert len(resource) == 0
+        assert resource == []
+        mcls.delenv("CUDA_VISIBLE_DEVICES")
+
+
+def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as mcls:
+        mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43-ac33420d4628")
+        resource = NvidiaGpuResource.from_system()
+        assert len(resource) == 1
+        assert resource == ["GPU-5ebe9f43-ac33420d4628"]
+        mcls.delenv("CUDA_VISIBLE_DEVICES")
+    with monkeypatch.context() as mcls:
+        mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43,GPU-ac33420d4628")
+        resource = NvidiaGpuResource.from_system()
+        assert len(resource) == 2
+        assert resource == ["GPU-5ebe9f43", "GPU-ac33420d4628"]
+        mcls.delenv("CUDA_VISIBLE_DEVICES")
+    with monkeypatch.context() as mcls:
+        mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43,-1,GPU-ac33420d4628")
+        resource = NvidiaGpuResource.from_system()
+        assert len(resource) == 1
+        assert resource == ["GPU-5ebe9f43"]
+        mcls.delenv("CUDA_VISIBLE_DEVICES")
+    with monkeypatch.context() as mcls:
+        mcls.setenv("CUDA_VISIBLE_DEVICES", "MIG-GPU-5ebe9f43-ac33420d4628")
+        resource = NvidiaGpuResource.from_system()
+        assert len(resource) == 1
+        assert resource == ["MIG-GPU-5ebe9f43-ac33420d4628"]
+        mcls.delenv("CUDA_VISIBLE_DEVICES")
+
+
+def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as mcls:
+        # to make this tests works with system that has GPU
+        mcls.setenv("CUDA_VISIBLE_DEVICES", "")
+        assert len(NvidiaGpuResource.from_system()) >= 0  # TODO: real from_system tests
+
+        assert pytest.raises(
+            ValueError,
+            NvidiaGpuResource.validate,
+            [*NvidiaGpuResource.from_system(), 1],
+        ).match("Input list should be all string type.")
+        assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match(
+            "Input list should be all string type."
+        )
+        assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["GPU-5ebe9f43", "GPU-ac33420d4628"]).match(
+            "Failed to parse available GPUs UUID"
+        )
+
+
+def test_nvidia_gpu_validate_no_gpu_available():
+    assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["0", "1"]).match("Failed to get device *")
+
+
+@pytest.mark.skipif(sys.platform != "darwin", reason="Test NVIDIA validation on Darwin only")
+def test_nvidia_gpu_validation_on_darwin():
+    assert pytest.raises(RuntimeError, NvidiaGpuResource.validate, ["0"]).match(
+        "GPU is not available on Darwin system."
+    )
+
+
+def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as mcls:
+        # to make this tests works with system that has GPU
+        mcls.setenv("CUDA_VISIBLE_DEVICES", "")
+        assert NvidiaGpuResource.from_spec(1) == ["0"]
+        assert NvidiaGpuResource.from_spec("5") == ["0", "1", "2", "3", "4"]
+        assert NvidiaGpuResource.from_spec(1) == ["0"]
+        assert NvidiaGpuResource.from_spec(2) == ["0", "1"]
+        assert NvidiaGpuResource.from_spec("3") == ["0", "1", "2"]
+        assert NvidiaGpuResource.from_spec([1, 3]) == ["1", "3"]
+        assert NvidiaGpuResource.from_spec(["1", "3"]) == ["1", "3"]
+        assert NvidiaGpuResource.from_spec(-1) == []
+        assert NvidiaGpuResource.from_spec("-1") == []
+        assert NvidiaGpuResource.from_spec("") == []
+        assert NvidiaGpuResource.from_spec("-2") == []
+        assert NvidiaGpuResource.from_spec("GPU-288347ab") == ["GPU-288347ab"]
+        assert NvidiaGpuResource.from_spec("GPU-288347ab,-1,GPU-ac33420d4628") == ["GPU-288347ab"]
+        assert NvidiaGpuResource.from_spec("GPU-288347ab,GPU-ac33420d4628") == ["GPU-288347ab", "GPU-ac33420d4628"]
+        assert NvidiaGpuResource.from_spec("MIG-GPU-288347ab") == ["MIG-GPU-288347ab"]
+
+    with pytest.raises(TypeError):
+        NvidiaGpuResource.from_spec((1, 2, 3))
+    with pytest.raises(TypeError):
+        NvidiaGpuResource.from_spec(1.5)
+    with pytest.raises(ValueError):
+        assert NvidiaGpuResource.from_spec(-2)
 
 
 class GPURunnable(bentoml.Runnable):
     SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu")
 
 
-def unvalidated_get_resource(x: dict[str, t.Any], y: str):
-    return get_resource(x, y, validate=False)
+def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False):
+    return get_resource(x, y, validate=validate)
 
 
 @pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
diff --git a/tools/dependencies.py b/tools/dependencies.py
new file mode 100755
index 00000000..13aa25d2
--- /dev/null
+++ b/tools/dependencies.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from __future__ import annotations
+
+import dataclasses
+import os
+import shutil
+import subprocess
+import typing as t
+
+import inflection
+import tomlkit
+
+import openllm
+
+if t.TYPE_CHECKING:
+    from tomlkit.items import Array
+    from tomlkit.items import Table
+
+
+ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+@dataclasses.dataclass(frozen=True)
+class Classifier:
+    identifier: t.Dict[str, str] = dataclasses.field(
+        default_factory=lambda: {
+            "status": "Development Status",
+            "environment": "Environment",
+            "license": "License",
+            "topic": "Topic",
+            "os": "Operating System",
+            "audience": "Intended Audience",
+            "typing": "Typing",
+            "language": "Programming Language",
+        }
+    )
+
+    joiner: str = " :: "
+
+    @staticmethod
+    def status() -> dict[int, str]:
+        return {
+            v: status
+            for v, status in zip(
+                range(1, 8),
+                [
+                    "1 - Planning",
+                    "2 - Pre-Alpha",
+                    "3 - Alpha",
+                    "4 - Beta",
+                    "5 - Production/Stable",
+                    "6 - Mature",
+                    "7 - Inactive",
+                ],
+            )
+        }
+
+    @staticmethod
+    def apache() -> str:
+        return Classifier.create_classifier("license", "OSI Approved", "Apache Software License")
+
+    @staticmethod
+    def create_classifier(identifier: str, *decls: t.Any) -> str:
+        cls_ = Classifier()
+        if identifier not in cls_.identifier:
+            raise ValueError(f"{identifier} is not yet supported (supported alias: {Classifier.identifier})")
+        return cls_.joiner.join([cls_.identifier[identifier], *decls])
+
+    @staticmethod
+    def create_python_classifier(
+        implementation: list[str] | None = None, supported_version: list[str] | None = None
+    ) -> list[str]:
+        if supported_version is None:
+            supported_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        if implementation is None:
+            implementation = ["CPython", "PyPy"]
+        base = [
+            Classifier.create_classifier("language", "Python"),
+            Classifier.create_classifier("language", "Python", "3"),
+        ]
+        base.append(Classifier.create_classifier("language", "Python", "3", "Only"))
+        base.extend([Classifier.create_classifier("language", "Python", version) for version in supported_version])
+        base.extend(
+            [Classifier.create_classifier("language", "Python", "Implementation", impl) for impl in implementation]
+        )
+        return base
+
+    @staticmethod
+    def create_status_classifier(level: int) -> str:
+        return Classifier.create_classifier("status", Classifier.status()[level])
+
+
+@dataclasses.dataclass(frozen=True)
+class Dependencies:
+    name: str
+    git_repo_url: t.Optional[str] = None
+    branch: t.Optional[str] = None
+    extensions: t.Optional[t.List[str]] = None
+    subdirectory: t.Optional[str] = None
+    requires_gpu: bool = False
+    lower_constraint: t.Optional[str] = None
+    platform: t.Optional[t.Tuple[t.Literal["Linux", "Windows", "Darwin"], t.Literal["eq", "ne"]]] = None
+
+    def with_options(self, **kwargs: t.Any) -> Dependencies:
+        return dataclasses.replace(self, **kwargs)
+
+    @property
+    def has_constraint(self) -> bool:
+        return self.lower_constraint is not None
+
+    @property
+    def pypi_extensions(self) -> str:
+        return "" if self.extensions is None else f"[{','.join(self.extensions)}]"
+
+    @staticmethod
+    def platform_restriction(platform: t.LiteralString, op: t.Literal["eq", "ne"] = "eq") -> str:
+        return f'platform_system{"==" if op == "eq" else "!="}"{platform}"'
+
+    def to_str(self) -> str:
+        deps: list[str] = []
+        if self.lower_constraint is not None:
+            deps.append(f"{self.name}{self.pypi_extensions}>={self.lower_constraint}")
+        elif self.subdirectory is not None:
+            deps.append(
+                f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}"
+            )
+        elif self.branch is not None:
+            deps.append(
+                f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}"
+            )
+        else:
+            deps.append(f"{self.name}{self.pypi_extensions}")
+
+        if self.platform:
+            deps.append(self.platform_restriction(*self.platform))
+
+        return ";".join(deps)
+
+    @classmethod
+    def from_tuple(cls, *decls: t.Any) -> Dependencies:
+        return cls(*decls)
+
+
+_BENTOML_EXT = ["grpc", "io"]
+_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"]
+
+_BASE_DEPENDENCIES = [
+    Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"),
+    Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"),
+    Dependencies(name="optimum"),
+    Dependencies(name="attrs", lower_constraint="23.1.0"),
+    Dependencies(name="cattrs", lower_constraint="23.1.0"),
+    Dependencies(name="orjson"),
+    Dependencies(name="inflection"),
+    Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"),
+    Dependencies(name="httpx"),
+    Dependencies(name="typing_extensions"),
+    Dependencies(name="cuda-python", platform=("Darwin", "ne")),
+]
+
+_NIGHTLY_MAPPING: dict[str, Dependencies] = {
+    "bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT),
+    "peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None),
+    "transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT),
+    "optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None),
+    "accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None),
+    "bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None),
+    "trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None),
+    "triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True),
+}
+
+_ALL_RUNTIME_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
+FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
+FLAN_T5_DEPS = _ALL_RUNTIME_DEPS
+OPT_DEPS = _ALL_RUNTIME_DEPS
+MPT_DEPS = ["triton", "einops"]
+OPENAI_DEPS = ["openai", "tiktoken"]
+AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
+FALCON_DEPS = ["einops", "xformers", "safetensors"]
+STARCODER_DEPS = ["bitsandbytes"]
+CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"]
+PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
+GGML_DEPS = ["ctransformers"]
+GPTQ_DEPS = ["auto-gptq", "triton"]
+VLLM_DEPS = ["vllm"]
+
+_base_requirements = {
+    inflection.dasherize(name): config_cls.__openllm_requirements__
+    for name, config_cls in openllm.CONFIG_MAPPING.items()
+    if config_cls.__openllm_requirements__
+}
+
+# shallow copy from locals()
+_locals = locals().copy()
+
+# NOTE: update this table when adding new external dependencies
+# sync with openllm.utils.OPTIONAL_DEPENDENCIES
+_base_requirements.update(
+    {v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES}
+)
+
+fname = f"{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}"
+
+
+def create_classifiers() -> Array:
+    arr = tomlkit.array()
+    arr.extend(
+        [
+            Classifier.create_status_classifier(5),
+            Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA"),
+            Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "12"),
+            Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.8"),
+            Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.7"),
+            Classifier.apache(),
+            Classifier.create_classifier("topic", "Scientific/Engineering", "Artificial Intelligence"),
+            Classifier.create_classifier("topic", "Software Development", "Libraries"),
+            Classifier.create_classifier("os", "OS Independent"),
+            Classifier.create_classifier("audience", "Developers"),
+            Classifier.create_classifier("audience", "Science/Research"),
+            Classifier.create_classifier("audience", "System Administrators"),
+            Classifier.create_classifier("typing", "Typed"),
+            *Classifier.create_python_classifier(),
+        ]
+    )
+    return arr.multiline(True)
+
+
+def create_optional_table() -> Table:
+    table = tomlkit.table()
+    table.update(_base_requirements)
+
+    all_array = tomlkit.array()
+    all_array.extend([f"openllm[{k}]" for k in table.keys()])
+    table.add("all", all_array.multiline(True))
+    return table
+
+
+def main() -> int:
+    with open(os.path.join(ROOT, "pyproject.toml"), "r") as f:
+        pyproject = tomlkit.parse(f.read())
+
+    t.cast("Table", pyproject["project"]).update(
+        {
+            "classifiers": create_classifiers(),
+            "optional-dependencies": create_optional_table(),
+            "dependencies": tomlkit.array(f"{[v.to_str() for v in _BASE_DEPENDENCIES]}").multiline(True),
+        }
+    )
+    with open(os.path.join(ROOT, "pyproject.toml"), "w") as f:
+        f.write(tomlkit.dumps(pyproject))
+
+    with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f:
+        f.write(f"# This file is generated by `{fname}`. DO NOT EDIT\n-e .[playground,flan-t5]\n")
+        f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu])
+    with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f:
+        f.write(f"# This file is generated by `{fname}`. # DO NOT EDIT\n")
+        f.write(
+            "# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n"
+        )
+        f.write("-r nightly-requirements.txt\n-e .[all]\n")
+        f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu])
+
+    if shutil.which("taplo"):
+        return subprocess.check_call(["taplo", "format", os.path.join(ROOT, "pyproject.toml")])
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/update-optional-dependencies.py b/tools/update-optional-dependencies.py
deleted file mode 100755
index 82fff857..00000000
--- a/tools/update-optional-dependencies.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2023 BentoML Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import dataclasses
-import os
-import shutil
-import typing as t
-
-import inflection
-import tomlkit
-
-import openllm
-
-
-ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-
-
-@dataclasses.dataclass(frozen=True)
-class Dependencies:
-    name: str
-    git_repo_url: t.Optional[str] = None
-    branch: t.Optional[str] = None
-    extensions: t.Optional[t.List[str]] = None
-    subdirectory: t.Optional[str] = None
-    requires_gpu: bool = False
-    lower_constraint: t.Optional[str] = None
-
-    def with_options(self, **kwargs: t.Any) -> Dependencies:
-        return dataclasses.replace(self, **kwargs)
-
-    @property
-    def has_constraint(self) -> bool:
-        return self.lower_constraint is not None
-
-    @property
-    def pypi_extensions(self) -> str:
-        return "" if self.extensions is None else f"[{','.join(self.extensions)}]"
-
-    def to_str(self) -> str:
-        if self.lower_constraint is not None:
-            return f"{self.name}{self.pypi_extensions}>={self.lower_constraint}"
-        elif self.subdirectory is not None:
-            return f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}"
-        elif self.branch is not None:
-            return f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}"
-        else:
-            return f"{self.name}{self.pypi_extensions}"
-
-    @classmethod
-    def from_tuple(cls, *decls: t.Any) -> Dependencies:
-        return cls(*decls)
-
-
-_BENTOML_EXT = ["grpc", "io"]
-_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"]
-
-_BASE_DEPENDENCIES = [
-    Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.0.22"),
-    Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"),
-    Dependencies(name="optimum"),
-    Dependencies(name="attrs", lower_constraint="23.1.0"),
-    Dependencies(name="cattrs", lower_constraint="23.1.0"),
-    Dependencies(name="orjson"),
-    Dependencies(name="inflection"),
-    Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"),
-    Dependencies(name="httpx"),
-    Dependencies(name="typing_extensions"),
-]
-
-_NIGHTLY_MAPPING: dict[str, Dependencies] = {
-    "bentoml": Dependencies.from_tuple("bentoml", "bentoml/bentoml", "main", _BENTOML_EXT),
-    "peft": Dependencies.from_tuple("peft", "huggingface/peft", "main", None),
-    "transformers": Dependencies.from_tuple("transformers", "huggingface/transformers", "main", _TRANSFORMERS_EXT),
-    "optimum": Dependencies.from_tuple("optimum", "huggingface/optimum", "main", None),
-    "accelerate": Dependencies.from_tuple("accelerate", "huggingface/accelerate", "main", None),
-    "bitsandbytes": Dependencies.from_tuple("bitsandbytes", "TimDettmers/bitsandbytes", "main", None),
-    "trl": Dependencies.from_tuple("trl", "lvwerra/trl", "main", None),
-    "triton": Dependencies.from_tuple("triton", "openai/triton", "main", None, "python", True),
-}
-
-FINE_TUNE_DEPS = ["peft", "bitsandbytes", "datasets", "accelerate", "deepspeed", "trl"]
-FLAN_T5_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
-MPT_DEPS = ["triton", "einops"]
-OPENAI_DEPS = ["openai", "tiktoken"]
-AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
-FALCON_DEPS = ["einops", "xformers", "safetensors"]
-STARCODER_DEPS = ["bitsandbytes"]
-CHATGLM_DEPS = ["cpm-kernels", "sentencepiece"]
-PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
-GGML_DEPS = ["ctransformers"]
-
-_base_requirements = {
-    inflection.dasherize(name): config_cls.__openllm_requirements__
-    for name, config_cls in openllm.CONFIG_MAPPING.items()
-    if config_cls.__openllm_requirements__
-}
-
-# shallow copy from locals()
-_locals = locals().copy()
-
-# NOTE: update this table when adding new external dependencies
-# sync with openllm.utils.OPTIONAL_DEPENDENCIES
-_base_requirements.update(
-    {v: _locals[f"{inflection.underscore(v).upper()}_DEPS"] for v in openllm.utils.OPTIONAL_DEPENDENCIES}
-)
-
-
-def main() -> int:
-    with open(os.path.join(ROOT, "pyproject.toml"), "r") as f:
-        pyproject = tomlkit.parse(f.read())
-
-    table = tomlkit.table()
-    for name, config in _base_requirements.items():
-        table.add(name, config)
-
-    table.add("all", [f"openllm[{k}]" for k in table.keys()])
-
-    pyproject["project"]["optional-dependencies"] = table
-
-    # write project dependencies
-    pyproject["project"]["dependencies"] = [v.to_str() for v in _BASE_DEPENDENCIES]
-    with open(os.path.join(ROOT, "pyproject.toml"), "w") as f:
-        f.write(tomlkit.dumps(pyproject))
-
-    with open(os.path.join(ROOT, "nightly-requirements.txt"), "w") as f:
-        f.write(
-            "# This file is generated by `./tools/update-optional-dependencies.py`. DO NOT EDIT\n-e .[playground,flan-t5]\n"
-        )
-        f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if not v.requires_gpu])
-    with open(os.path.join(ROOT, "nightly-requirements-gpu.txt"), "w") as f:
-        f.write("# This file is generated by `./tools/update-optional-dependencies.py`. # DO NOT EDIT\n")
-        f.write(
-            "# For Jax, Flax, Tensorflow, PyTorch CUDA support, please refers to their official installation for your specific setup.\n"
-        )
-        f.write("-r nightly-requirements.txt\n-e .[all]\n")
-        f.writelines([f"{v.to_str()}\n" for v in _NIGHTLY_MAPPING.values() if v.requires_gpu])
-
-    if shutil.which("taplo"):
-        return os.system(f"taplo fmt {os.path.join(ROOT, 'pyproject.toml')}")
-
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/typings/cuda/__init__.pyi b/typings/cuda/__init__.pyi
new file mode 100644
index 00000000..e76cccb8
--- /dev/null
+++ b/typings/cuda/__init__.pyi
@@ -0,0 +1,2 @@
+from . import cuda as cuda
+from . import cudart as cudart
diff --git a/typings/cuda/cuda.pyi b/typings/cuda/cuda.pyi
new file mode 100644
index 00000000..982d3e5f
--- /dev/null
+++ b/typings/cuda/cuda.pyi
@@ -0,0 +1,26 @@
+# Copyright 2023 BentoML Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+
+class CUresult(Enum):
+    CUDA_SUCCESS = 0
+
+class _CUMixin:
+    def getPtr(self) -> int: ...
+
+class CUdevice(_CUMixin): ...
+
+def cuDeviceGetCount() -> tuple[CUresult, int]: ...
+def cuDeviceGet(dev: int) -> tuple[CUresult, CUdevice]: ...