chore(release): update base container restriction (#173)

Prepare for 0.2.12 release

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Authored by pre-commit-ci[bot] on 2023-08-01 15:25:17 -04:00; committed by GitHub.
parent 6ba8899743, commit c2ed1d56da
20 changed files with 319 additions and 256 deletions

View File

@@ -8,3 +8,6 @@ charset = utf-8
[*.py]
indent_style = space
indent_size = 2
[src/openllm/cli/entrypoint.py]
indent_size = unset

View File

@@ -21,10 +21,14 @@ ci:
exclude: '.*\.(css|js|svg)$'
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: 'v0.0.280'
rev: 'v0.0.281'
hooks:
- id: ruff
args: [--exit-non-zero-on-fix, --show-fixes]
- repo: https://github.com/editorconfig-checker/editorconfig-checker.python
rev: '2.7.2'
hooks:
- id: editorconfig-checker
- repo: https://github.com/econchick/interrogate
rev: 1.5.0
hooks:
@@ -33,7 +37,7 @@ repos:
exclude: ^(docs|tools|tests)
args: [--config=pyproject.toml]
- repo: https://github.com/google/yapf
rev: v0.40.1
rev: v0.40.0
hooks:
- id: yapf
types: [python]

View File

@@ -414,7 +414,7 @@ class GenerationConfig(ReprMixin):
bentoml_cattr.register_unstructure_hook_factory(
lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig),
lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True,
**{k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)}))
**{k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)}))
@attr.frozen(slots=True, repr=False, init=False)
class SamplingParams(ReprMixin):
@@ -450,7 +450,7 @@ class SamplingParams(ReprMixin):
ignore_eos: bool = dantic.Field(False, description="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.")
logprobs: int = dantic.Field(None, description="Number of log probabilities to return per output token.")
if t.TYPE_CHECKING and not MYPY:
if t.TYPE_CHECKING:
max_tokens: int
temperature: float
top_k: int
@@ -490,7 +490,7 @@ class SamplingParams(ReprMixin):
bentoml_cattr.register_unstructure_hook_factory(
lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams),
lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True,
**{k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)}))
**{k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)}))
bentoml_cattr.register_structure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename="max_tokens")))
# cached it here to save one lookup per assignment
@@ -758,7 +758,7 @@ class _ConfigAttr:
For example:
For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
"google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
"google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
This field is required when defining under '__config__'.
"""
@@ -1028,7 +1028,7 @@ class LLMConfig(_ConfigAttr):
klass = attr.make_class(
f"{camel_name}{class_attr}", [], bases=(base,), slots=True, weakref_slot=True, frozen=True, repr=False, init=False, collect_by_mro=True,
field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__, suffix=suffix_env, globs=globs,
default_callback=lambda field_name, field_default: getattr(getattr(cls, class_attr), field_name, field_default) if codegen.has_own_attribute(cls, class_attr) else field_default))
default_callback=lambda field_name, field_default: getattr(getattr(cls, class_attr), field_name, field_default) if codegen.has_own_attribute(cls, class_attr) else field_default))
# For pickling to work, the __module__ variable needs to be set to the
# frame where the class is created. This respects the module that cls is created from.
try: klass.__module__ = cls.__module__
@@ -1338,7 +1338,7 @@ class LLMConfig(_ConfigAttr):
Args:
name: The name of the new class.
**attrs: The attributes to be added to the new class. This will override
any existing attributes with the same name.
any existing attributes with the same name.
"""
if not hasattr(cls, "__config__"):
raise ValueError("Cannot derivate a LLMConfig without __config__")

View File

@@ -344,7 +344,8 @@ _object_setattr = object.__setattr__
def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
@functools.wraps(f)
def wrapper(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
trust_remote_code: bool = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__)
trust_remote_code = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__)
if t.TYPE_CHECKING: assert trust_remote_code is not None # NOTE: Mypy is too stupid to understand that the default type of trust_remote_code is bool in L347
(model_decls, model_attrs), _ = self.llm_parameters
decls = (*model_decls, *decls)
attrs = {**model_attrs, **attrs}
@@ -567,8 +568,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
model_name: Optional model name to be saved with this LLM. Defaults to None. It will be inferred automatically from model_id.
If model_id is a custom path, it will be the basename of the given path.
model_version: Optional version for this given model id. Defaults to None. This is useful for saving from a custom path.
If set to None, the version will either be the git hash of the given pretrained model, or a hash inferred
from the last modified time of the given directory.
If set to None, the version will either be the git hash of the given pretrained model, or a hash inferred
from the last modified time of the given directory.
llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
will use `config_class` to construct default configuration.
quantize: The quantization to use for this LLM. Defaults to None. Possible values
@@ -576,7 +577,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
runtime: Optional runtime to run this LLM. Defaults to 'transformers'. 'ggml' support is a work in progress.
quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogtpq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize`
serialisation: Type of model format to save to the local store. If set to 'safetensors', OpenLLM will save the model using safetensors.
Default behaviour is similar to ``safe_serialization=False``.
Default behaviour is similar to ``safe_serialization=False``.
bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None.
adapter_name: The adapter name to use for this LLM. Defaults to None.
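
The explicit bool annotation on trust_remote_code could be dropped because first_not_none already guarantees a non-None result here. A hypothetical re-implementation of that helper, only to show the semantics the mypy assert works around (the real one lives in openllm.utils and may differ):

import typing as t

T = t.TypeVar("T")

def first_not_none(*args: t.Optional[T], default: T) -> T:
  """Return the first argument that is not None, otherwise ``default``."""
  return next((arg for arg in args if arg is not None), default)

assert first_not_none(None, default=True) is True    # falls back to the instance default
assert first_not_none(False, default=True) is False  # an explicit falsy value still wins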

View File

@@ -76,11 +76,11 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
_STACK_LEVEL = 3
@overload
def _parse_visible_devices(default_var: str | None = ..., respect_env: t.Literal[True] = True) -> list[str] | None:
def _parse_visible_devices(default_var: str | None = ..., *, respect_env: t.Literal[True]) -> list[str] | None:
...
@overload
def _parse_visible_devices(default_var: str = ..., respect_env: t.Literal[False] = ...) -> list[str]:
def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]:
...
def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
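
Making respect_env keyword-only in these overloads lets type checkers pick the return type from the literal flag at each call site. A self-contained sketch of the same pattern with illustrative names (not the actual openllm internals):

from __future__ import annotations
import os
import typing as t

@t.overload
def parse_devices(default_var: str | None = ..., *, respect_env: t.Literal[True]) -> list[str] | None: ...
@t.overload
def parse_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]: ...
def parse_devices(default_var: str | None = None, *, respect_env: bool = True) -> list[str] | None:
  value = os.environ.get("CUDA_VISIBLE_DEVICES", default_var) if respect_env else default_var
  return None if value is None else value.split(",")

devices = parse_devices("0,1", respect_env=False)  # checkers infer list[str] here, not Optional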

View File

@@ -33,8 +33,10 @@ if t.TYPE_CHECKING:
import peft
import openllm
from openllm._llm import M as _M
from openllm._llm import T as _T
from .utils.lazy import VersionInfo
from .bundle.oci import LiteralContainerVersionStrategy
from ._llm import M as _M
from ._llm import T as _T
from bentoml._internal.runner.runnable import RunnableMethod
from bentoml._internal.runner.runner import RunnerMethod
from bentoml._internal.runner.strategy import Strategy
@@ -64,6 +66,11 @@ class AdaptersTuple(TupleAny):
name: str | None
config: DictStrAny
class RefTuple(TupleAny):
git_hash: str
version: VersionInfo
strategy: LiteralContainerVersionStrategy
AdaptersMapping = dict[AdapterType, tuple[AdaptersTuple, ...]]
class LLMRunnable(bentoml.Runnable, t.Generic[_M, _T]):

View File

@@ -131,6 +131,14 @@ COPY --from=flash-attn-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x8
COPY src src
COPY hatch.toml README.md CHANGELOG.md pyproject.toml ./
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
ccache \
curl \
git && \
rm -rf /var/lib/apt/lists/*
# Install all required dependencies
RUN pip install "ray==2.6.0" "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,fine-tune,llama,gptq,falcon,chatglm]" -v --no-cache-dir

View File

@@ -15,28 +15,42 @@
from __future__ import annotations
import functools
import importlib
import logging
import pathlib
import shutil
import subprocess
import typing as t
import git.cmd
import attr
import bentoml
from ...exceptions import Error
from ...exceptions import OpenLLMException
from ...utils import LazyLoader
from ...utils import VersionInfo
from ...utils import apply
from ...utils import device_count
from ...utils import get_debug_mode
from ...utils import pkg
from ...utils.codegen import make_attr_tuple_class
if t.TYPE_CHECKING:
import git.cmd
from ..._types import RefTuple
else:
git = LazyLoader("git", globals(), "git")
git.cmd = LazyLoader("git.cmd", globals(), "git.cmd")
logger = logging.getLogger(__name__)
_BUILDER = bentoml.container.get_backend("buildx")
ROOT_DIR = pathlib.Path(__file__).parent.parent.parent
# TODO: support quay
LiteralContainerRegistry = t.Literal["docker", "gh", "ecr"]
LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest"]
LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest", "custom"]
# XXX: This registry will be hard-coded for now for easier maintenance
# but in the future, we can infer it from the git repo to give users more options
@@ -44,32 +58,68 @@ LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest"]
# NOTE: The ECR registry is the public one and currently only the @bentoml team has access to push to it.
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {"docker": "docker.io/bentoml/openllm", "gh": "ghcr.io/bentoml/openllm", "ecr": "public.ecr.aws/y5w8i4y6/bentoml/openllm"}
# TODO: support custom fork. Currently it only supports the openllm main repository.
_URI = "https://github.com/bentoml/openllm.git"
_module_location = pkg.source_locations("openllm")
@functools.lru_cache
@apply(str.lower)
def get_base_container_name(reg: LiteralContainerRegistry) -> str:
return _CONTAINER_REGISTRY[reg]
def get_base_container_name(reg: LiteralContainerRegistry) -> str: return _CONTAINER_REGISTRY[reg]
@functools.lru_cache(maxsize=1)
def _git() -> git.cmd.Git:
return git.cmd.Git(_URI)
def _convert_version_from_string(s: str) -> VersionInfo: return VersionInfo.from_version_string(s)
@functools.lru_cache
def _nightly_ref() -> tuple[str, str]:
return _git().ls_remote(_URI, "main", heads=True).split()
class VersionNotSupported(OpenLLMException):
"""Raised when the stable release is too low that it doesn't include OpenLLM base container."""
@functools.lru_cache
def _stable_ref() -> tuple[str, str]:
return max([item.split() for item in _git().ls_remote(_URI, refs=True, tags=True).split("\n")], key=lambda tag: tuple(int(k) for k in tag[-1].replace("refs/tags/v", "").split(".")))
_RefTuple: type[RefTuple] = make_attr_tuple_class("_RefTuple", ["git_hash", "version", "strategy"])
def get_base_container_tag(strategy: LiteralContainerVersionStrategy) -> str:
if strategy == "release": return _stable_ref()[-1].replace("refs/tags/v", "") # for stable, we can also use latest, but discouraged
elif strategy == "latest": return "latest"
elif strategy == "nightly": return f"sha-{_nightly_ref()[0][:7]}" # we prefixed with sha-<git_rev_short> (giv_rev[:7])
else: raise ValueError(f"Unknown strategy '{strategy}'. Valid strategies are 'release', 'nightly', and 'latest'")
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class Ref:
"""TODO: Support offline mode.
Maybe we need to save git hash when building the Bento.
"""
git_hash: str = attr.field()
version: VersionInfo = attr.field(converter=_convert_version_from_string)
strategy: LiteralContainerVersionStrategy = attr.field()
_git: git.cmd.Git = git.cmd.Git(_URI) # TODO: support offline mode
@classmethod
def _nightly_ref(cls) -> RefTuple: return _RefTuple((*cls._git.ls_remote(_URI, "main", heads=True).split(), "nightly"))
@classmethod
def _release_ref(cls, version_str: str | None = None) -> RefTuple:
_use_base_strategy = version_str is None
if version_str is None:
# NOTE: This strategy will only support openllm>=0.2.12
version: tuple[str, str] = tuple(max([item.split() for item in cls._git.ls_remote(_URI, refs=True, tags=True).split("\n")], key=lambda tag: tuple(int(k) for k in tag[-1].replace("refs/tags/v", "").split("."))))
version_str = version[-1].replace("refs/tags/v", "")
version = (version[0], version_str)
else:
version = ("", version_str)
if t.TYPE_CHECKING: assert version_str # NOTE: Mypy cannot infer the correct type here. We have handled the case where version_str is None in L86
if VersionInfo.from_version_string(version_str) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't have an OpenLLM base container. Consider using 'nightly' or upgrading to 'openllm>=0.2.12'")
return _RefTuple((*version, "release" if not _use_base_strategy else "custom"))
@classmethod
def from_strategy(cls, strategy_or_version: t.Literal["release", "nightly"] | str | None = None) -> Ref:
if strategy_or_version is None or strategy_or_version == "release":
logger.debug("Using default strategy 'release' for resolving base image version.")
return cls(*cls._release_ref())
elif strategy_or_version == "latest": return cls("latest", "0.0.0", "latest")
elif strategy_or_version == "nightly":
_ref = cls._nightly_ref()
return cls(_ref[0], "0.0.0", _ref[-1])
else:
logger.warning("Using custom %s. Make sure that it is at lease 0.2.12 for base container support.", strategy_or_version)
return cls(*cls._release_ref(version_str=strategy_or_version))
@property
def tag(self) -> str:
if self.strategy == "latest": return "latest"
elif self.strategy == "nightly": return f"sha-{self.git_hash[:7]}"
else: return repr(self.version)
@functools.lru_cache(maxsize=256)
def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str: return Ref.from_strategy(strategy).tag
def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None, version_strategy: LiteralContainerVersionStrategy = "release", push: bool = False, machine: bool = False) -> dict[str | LiteralContainerRegistry, str]:
"""This is a utility function for building base container for OpenLLM. It will build the base container for all registries if ``None`` is passed.

View File

@@ -346,8 +346,7 @@ def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
"--fast/--no-fast", show_default=True, default=False, envvar="OPENLLM_USE_LOCAL_LATEST", show_envvar=True, help="""Whether to skip checking if models is already in store.
This is useful if you already downloaded or setup the model beforehand.
""", **attrs
)(f)
""", **attrs)(f)
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f)
@@ -379,8 +378,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model
"""
**Note** that this will set the mode for serving within deployment.""" if build else ""
) + """
**Note** that quantization is currently only available for *PyTorch* models.""", **attrs
)(f)
**Note** that quantization is currently only available for *PyTorch* models.""", **attrs)(f)
def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
@@ -399,8 +397,7 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
**Note**: The workers value passed into 'build' will determine how the LLM can
be provisioned in Kubernetes as well as in standalone container. This will
ensure it has the same effect as 'openllm start --workers ...'""" if build else ""
), **attrs
)(f)
), **attrs)(f)
def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
@@ -411,23 +408,22 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
return cli_option(
"--serialisation", "--serialization", "serialisation_format", type=click.Choice(["safetensors", "legacy"]), default="safetensors", show_default=True, show_envvar=True, envvar="OPENLLM_SERIALIZATION", help="""Serialisation format for save/load LLM.
Currently the following strategies are supported:
Currently the following strategies are supported:
- ``safetensors``: This will use safetensors format, which is synonymous to
- ``safetensors``: This will use safetensors format, which is synonymous to
\b
``safe_serialization=True``.
\b
``safe_serialization=True``.
\b
**Note** that this format might not work for every case, and
you can always fall back to ``legacy`` if needed.
\b
**Note** that this format might not work for every case, and
you can always fall back to ``legacy`` if needed.
- ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files.
This should be used if the model doesn't yet support safetensors.
- ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files.
This should be used if the model doesn't yet support safetensors.
**Note** that GGML format support is a work in progress.
""", **attrs
)(f)
**Note** that GGML format support is a work in progress.
""", **attrs)(f)
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
@@ -437,8 +433,7 @@ def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) ->
\b
**Note** that in order to build the base image, you will need a GPU to compile the custom kernels. See ``openllm ext build-base-container`` for more information.
"""
)(f)
""")(f)
_wpr_strategies = {"round_robin", "conserved"}
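
All of these helpers follow the same reusable-option pattern: a factory returns a Click decorator so one flag can be attached to several commands, and calling it with a function applies it immediately. A minimal approximation in plain Click (cli_option itself is an OpenLLM wrapper, so details differ):

import typing as t
import click

FC = t.TypeVar("FC", bound=t.Callable[..., t.Any])

def fast_option(f: t.Optional[FC] = None, **attrs: t.Any) -> t.Any:
  decorator = click.option("--fast/--no-fast", default=False, show_default=True, envvar="OPENLLM_USE_LOCAL_LATEST", show_envvar=True,
                           help="Whether to skip checking if the model is already in the store.", **attrs)
  return decorator if f is None else decorator(f)

@click.command()
@fast_option
def start(fast: bool) -> None:
  click.echo(f"fast={fast}")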

View File

@@ -162,7 +162,7 @@ ServeCommand = t.Literal["serve", "serve-grpc"]
@attr.define
class GlobalOptions:
cloud_context: str | None = attr.field(default=None, converter=attr.converters.default_if_none("default"))
cloud_context: str | None = attr.field(default=None)
def with_options(self, **attrs: t.Any) -> t.Self:
return attr.evolve(self, **attrs)
@@ -223,7 +223,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
analytics.track(event)
raise
return wrapper
return t.cast("t.Callable[t.Concatenate[bool, P], t.Any]", wrapper)
@staticmethod
def exception_handling(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[P, t.Any]:

View File

@@ -28,7 +28,6 @@ from .constants import HUB_ATTRS
from ..exceptions import OpenLLMException
from ..utils import LazyLoader
from ..utils import LazyType
from ..utils import device_count
from ..utils import first_not_none
from ..utils import generate_context
from ..utils import generate_labels
@@ -40,7 +39,7 @@ from ..utils import normalize_attrs_to_model_tokenizer_pair
if t.TYPE_CHECKING:
import auto_gptq as autogptq
import torch
import torch.cuda
import torch.nn
import vllm
import openllm
@@ -55,7 +54,6 @@ else:
autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
_transformers = LazyLoader("_transformers", globals(), "transformers")
torch = LazyLoader("torch", globals(), "torch")
torch.cuda = LazyLoader("torch.cuda", globals(), "torch.cuda")
_object_setattr = object.__setattr__
@@ -91,6 +89,10 @@ def infer_autoclass_from_llm_config(llm: openllm.LLM[M, T], config: _transformer
else: raise OpenLLMException(f"Model type {type(config)} is not supported yet.")
return getattr(_transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_implementation__][idx])
def check_initialized(model: torch.nn.Module) -> None:
uninitialized = [n for n, param in model.named_parameters() if param.data.device == torch.device("meta")]
if len(uninitialized) > 0: raise RuntimeError(f"Found the following uninitialized parameters in {model}: {uninitialized}")
def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> bentoml.Model:
"""Auto detect model type from given model_id and import it to bentoml's model store.
@@ -184,22 +186,15 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
if "_quantize" in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata["_quantize"] == "gptq":
if not is_autogptq_available(): raise OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config["model_type"] != "causal_lm": raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path, *decls, quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), trust_remote_code=llm.__llm_trust_remote_code__, use_safetensors=safe_serialization, **hub_attrs, **attrs,)
return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path, *decls, quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), trust_remote_code=llm.__llm_trust_remote_code__, use_safetensors=safe_serialization, **hub_attrs, **attrs)
model = infer_autoclass_from_llm_config(llm, config).from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.__llm_trust_remote_code__, **hub_attrs, **attrs,)
# NOTE: we only cast and load the model if it is not already quantized and setup correctly
loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_quantized", False)
if torch.cuda.is_available() and device_count() == 1 and not loaded_in_kbit:
try:
model = model.to("cuda")
except torch.cuda.OutOfMemoryError as err:
raise RuntimeError(f"Failed to convert {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization.") from err
model = infer_autoclass_from_llm_config(llm, config).from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.__llm_trust_remote_code__, **hub_attrs, **attrs).eval()
if llm.__llm_implementation__ in {"pt", "vllm"}: check_initialized(model)
# BetterTransformer is currently only supported on PyTorch.
if llm.bettertransformer and isinstance(model, _transformers.PreTrainedModel): model = model.to_bettertransformer()
return t.cast("M", model)
def save_pretrained(llm: openllm.LLM[M, T], save_directory: str, is_main_process: bool = True, state_dict: DictStrAny | None = None, save_function: t.Callable[..., None] | None = None, push_to_hub: bool = False, max_shard_size: int | str = "10GB", safe_serialization: bool = False, variant: str | None = None, **attrs: t.Any,) -> None:
"""Light wrapper around ``transformers.PreTrainedTokenizer.save_pretrained`` and ``transformers.PreTrainedModel.save_pretrained``."""
save_function = first_not_none(save_function, default=torch.save)
model_save_attrs, tokenizer_save_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs)
safe_serialization = safe_serialization or llm._serialisation_format == "safetensors"
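
The new check_initialized guards against weights left on PyTorch's meta device, which can happen with low-memory loading paths. A small illustration of what the check detects (plain PyTorch, not OpenLLM code):

import torch

# A module created on the meta device has parameter shells but no real weights.
linear = torch.nn.Linear(4, 4, device="meta")
meta_params = [n for n, p in linear.named_parameters() if p.data.device == torch.device("meta")]
print(meta_params)  # ['weight', 'bias'] -- check_initialized would raise RuntimeError for these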

View File

@@ -46,6 +46,7 @@ from bentoml._internal.utils import reserve_free_port as reserve_free_port
from bentoml._internal.utils import resolve_user_filepath as resolve_user_filepath
from .lazy import LazyModule
from .lazy import VersionInfo as VersionInfo
logger = logging.getLogger(__name__)

View File

@@ -376,7 +376,7 @@ class EnvVarMixin(ReprMixin):
if hasattr(self, item): return getattr(self, item)
raise KeyError(f"Key {item} not found in {self}")
def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None,
runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
"""EnvVarMixin is a mixin class that returns the value extracted from environment variables."""
from .._configuration import field_env_key
self.model_name = inflection.underscore(model_name)

View File

@@ -36,7 +36,7 @@ class MissingAttributesError(OpenLLMException):
"""Raised when given keys is not available in LazyModule special mapping."""
@functools.total_ordering
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
@attr.attrs(eq=False, order=False, slots=True, frozen=True, repr=False)
class VersionInfo:
"""A version object that can be compared to tuple of length 1--4.
@@ -96,6 +96,8 @@ class VersionInfo:
# have to do anything special with releaselevel for now.
return us < them
def __repr__(self) -> str: return "{0}.{1}.{2}".format(*attr.astuple(self)[:3])
_sentinel, _reserved_namespace = object(), {"__openllm_special__", "__openllm_migration__"}
class LazyModule(types.ModuleType):
@@ -120,7 +122,7 @@ class LazyModule(types.ModuleType):
module_spec: __spec__ of the lazily loaded module
doc: Optional docstring for this module.
extra_objects: Any additional objects that can also be accessed through this module. Useful for additional metadata as well
as any locals() functions
as any locals() functions
"""
super().__init__(name)
self._modules = set(import_structure.keys())
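
VersionInfo's new repr is the bare version string, which is what Ref.tag returns for release builds, and its tuple comparison is what enforces the 0.2.12 floor. Assumed usage, with the re-export coming from the utils/__init__ change above:

from openllm.utils import VersionInfo

v = VersionInfo.from_version_string("0.2.12")
assert not v < (0, 2, 12)                                      # 0.2.12 itself satisfies the new floor
assert VersionInfo.from_version_string("0.2.11") < (0, 2, 12)  # older releases are rejected
assert repr(v) == "0.2.12"                                     # repr is now the bare version string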

View File

@@ -53,7 +53,7 @@ _value_docstring = {
For example:
For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
"google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
"google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
This field is required when defining under '__config__'.
""", "architecture": """The model architecture that is supported by this LLM.

View File

@@ -25,7 +25,7 @@ PRECISION = Decimal(".01")
ROOT = Path(__file__).resolve().parent.parent
def main():
def main() -> int:
coverage_summary = ROOT / "coverage-summary.json"
coverage_data = orjson.loads(coverage_summary.read_text(encoding="utf-8"))
@@ -39,8 +39,7 @@ def main():
rate = Decimal(statements_covered) / Decimal(statements) * 100
rate = rate.quantize(PRECISION, rounding=ROUND_DOWN)
lines.append(f"{package} | {100 if rate == 100 else rate}% ({statements_covered} / {statements})\n" # noqa: PLR2004
)
lines.append(f"{package} | {100 if rate == 100 else rate}% ({statements_covered} / {statements})\n")
total_statements_covered = total_data["statements_covered"]
total_statements = total_data["statements"]
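
Because the script quantizes with ROUND_DOWN, coverage rates are truncated rather than rounded up, and only an exact 100 is reported as 100%. For instance (made-up numbers):

from decimal import ROUND_DOWN, Decimal

PRECISION = Decimal(".01")
rate = Decimal(2) / Decimal(3) * 100
print(rate.quantize(PRECISION, rounding=ROUND_DOWN))  # 66.66 -- truncated, never 66.67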

View File

@@ -1,30 +1,30 @@
from ._core import AllOptionGroup
from ._core import GroupedOption
from ._core import MutuallyExclusiveOptionGroup
from ._core import OptionGroup
from ._core import RequiredAllOptionGroup
from ._core import RequiredAnyOptionGroup
from ._core import RequiredMutuallyExclusiveOptionGroup
from ._decorators import optgroup
from ._version import __version__
"""
click-option-group
~~~~~~~~~~~~~~~~~~
Option groups missing in Click
:copyright: © 2019-2020 by Eugene Prilepin
:license: BSD, see LICENSE for more details.
"""
__all__ = [
"__version__",
"optgroup",
"GroupedOption",
"OptionGroup",
"RequiredAnyOptionGroup",
"AllOptionGroup",
"RequiredAllOptionGroup",
"MutuallyExclusiveOptionGroup",
"RequiredMutuallyExclusiveOptionGroup",
]
from ._core import AllOptionGroup
from ._core import GroupedOption
from ._core import MutuallyExclusiveOptionGroup
from ._core import OptionGroup
from ._core import RequiredAllOptionGroup
from ._core import RequiredAnyOptionGroup
from ._core import RequiredMutuallyExclusiveOptionGroup
from ._decorators import optgroup
from ._version import __version__
"""
click-option-group
~~~~~~~~~~~~~~~~~~
Option groups missing in Click
:copyright: © 2019-2020 by Eugene Prilepin
:license: BSD, see LICENSE for more details.
"""
__all__ = [
"__version__",
"optgroup",
"GroupedOption",
"OptionGroup",
"RequiredAnyOptionGroup",
"AllOptionGroup",
"RequiredAllOptionGroup",
"MutuallyExclusiveOptionGroup",
"RequiredMutuallyExclusiveOptionGroup",
]

View File

@@ -1,82 +1,80 @@
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
from typing import Mapping
from typing import Optional
from typing import Sequence
from typing import Set
from typing import Tuple
from typing import TypeAlias
from typing import TypeVar
from typing import Union
import click
_R = TypeVar("_R")
_T = TypeVar("_T")
AnyCallable: TypeAlias = Callable[..., Any]
_FC = TypeVar("_FC", bound=Union[AnyCallable, click.Command])
class GroupedOption(click.Option):
def __init__(self, param_decls: Optional[Sequence[str]] = ..., *, group: OptionGroup, **attrs: Any) -> None: ...
@property
def group(self) -> OptionGroup: ...
def handle_parse_result(
self, ctx: click.Context, opts: Mapping[str, Any], args: List[str]
) -> Tuple[Any, List[str]]: ...
def get_help_record(self, ctx: click.Context) -> Optional[Tuple[str, str]]: ...
class _GroupTitleFakeOption(click.Option):
def __init__(self, param_decls: Optional[Sequence[str]] = ..., *, group: OptionGroup, **attrs: Any) -> None: ...
def get_help_record(self, ctx: click.Context) -> Optional[Tuple[str, str]]: ...
class OptionGroup:
def __init__(self, name: Optional[str] = ..., *, hidden: bool = ..., help: Optional[str] = ...) -> None: ...
@property
def name(self) -> str: ...
@property
def help(self) -> str: ...
@property
def name_extra(self) -> List[str]: ...
@property
def forbidden_option_attrs(self) -> List[str]: ...
def get_help_record(self, ctx: click.Context) -> Optional[Tuple[str, str]]: ...
def option(self, *param_decls: Any, **attrs: Any) -> Callable[[_FC], _FC]: ...
def get_options(self, ctx: click.Context) -> Dict[str, GroupedOption]: ...
def get_option_names(self, ctx: click.Context) -> List[str]: ...
def get_error_hint(self, ctx: click.Context, option_names: Optional[Set[str]] = ...) -> str: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class RequiredAnyOptionGroup(OptionGroup):
@property
def forbidden_option_attrs(self) -> List[str]: ...
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class RequiredAllOptionGroup(OptionGroup):
@property
def forbidden_option_attrs(self) -> List[str]: ...
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class MutuallyExclusiveOptionGroup(OptionGroup):
@property
def forbidden_option_attrs(self) -> List[str]: ...
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class RequiredMutuallyExclusiveOptionGroup(MutuallyExclusiveOptionGroup):
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class AllOptionGroup(OptionGroup):
@property
def forbidden_option_attrs(self) -> List[str]: ...
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
from typing import Mapping
from typing import Optional
from typing import Sequence
from typing import Set
from typing import Tuple
from typing import TypeAlias
from typing import TypeVar
from typing import Union
import click
AnyCallable: TypeAlias = Callable[..., Any]
_FC = TypeVar("_FC", bound=Union[AnyCallable, click.Command])
class GroupedOption(click.Option):
def __init__(self, param_decls: Optional[Sequence[str]] = ..., *, group: OptionGroup, **attrs: Any) -> None: ...
@property
def group(self) -> OptionGroup: ...
def handle_parse_result(
self, ctx: click.Context, opts: Mapping[str, Any], args: List[str]
) -> Tuple[Any, List[str]]: ...
def get_help_record(self, ctx: click.Context) -> Optional[Tuple[str, str]]: ...
class _GroupTitleFakeOption(click.Option):
def __init__(self, param_decls: Optional[Sequence[str]] = ..., *, group: OptionGroup, **attrs: Any) -> None: ...
def get_help_record(self, ctx: click.Context) -> Optional[Tuple[str, str]]: ...
class OptionGroup:
def __init__(self, name: Optional[str] = ..., *, hidden: bool = ..., help: Optional[str] = ...) -> None: ...
@property
def name(self) -> str: ...
@property
def help(self) -> str: ...
@property
def name_extra(self) -> List[str]: ...
@property
def forbidden_option_attrs(self) -> List[str]: ...
def get_help_record(self, ctx: click.Context) -> Optional[Tuple[str, str]]: ...
def option(self, *param_decls: Any, **attrs: Any) -> Callable[[_FC], _FC]: ...
def get_options(self, ctx: click.Context) -> Dict[str, GroupedOption]: ...
def get_option_names(self, ctx: click.Context) -> List[str]: ...
def get_error_hint(self, ctx: click.Context, option_names: Optional[Set[str]] = ...) -> str: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class RequiredAnyOptionGroup(OptionGroup):
@property
def forbidden_option_attrs(self) -> List[str]: ...
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class RequiredAllOptionGroup(OptionGroup):
@property
def forbidden_option_attrs(self) -> List[str]: ...
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class MutuallyExclusiveOptionGroup(OptionGroup):
@property
def forbidden_option_attrs(self) -> List[str]: ...
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class RequiredMutuallyExclusiveOptionGroup(MutuallyExclusiveOptionGroup):
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...
class AllOptionGroup(OptionGroup):
@property
def forbidden_option_attrs(self) -> List[str]: ...
@property
def name_extra(self) -> List[str]: ...
def handle_parse_result(self, option: GroupedOption, ctx: click.Context, opts: Mapping[str, Any]) -> None: ...

View File

@@ -1,65 +1,65 @@
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Optional
from typing import Tuple
from typing import Type
from typing import TypeVar
from typing import Union
from typing import overload
import click
from ._core import _FC
from ._core import AnyCallable
from ._core import OptionGroup
class OptionStackItem(NamedTuple):
param_decls: Tuple[str, ...]
attrs: Dict[str, Any]
param_count: int
class _NotAttachedOption(click.Option):
def __init__(self, param_decls: Any = ..., *, all_not_attached_options: Any, **attrs: Any) -> None: ...
def handle_parse_result(self, ctx: click.Context, opts: Any, args: List[str]) -> Any: ...
_GrpType = TypeVar("_GrpType", bound=OptionGroup)
class _OptGroup:
def __init__(self) -> None: ...
def __call__(
self,
name: Optional[str] = ...,
*,
help: Optional[str] = None,
cls: Optional[Type[_GrpType]] = None,
**attrs: Any,
) -> Union[click.Command, Callable[[AnyCallable], click.Command]]: ...
@overload
def group(
self,
name: Optional[str],
cls: type[_GrpType],
**attrs: Any,
) -> Callable[[AnyCallable], click.Command]: ...
@overload
def group(
self,
name: str = ...,
cls: None = None,
**attrs: Any,
) -> Callable[[AnyCallable], click.Command]: ...
@overload
def group(
self,
name: Optional[str] = ...,
*,
help: Optional[str] = ...,
cls: Optional[Type[_GrpType]] = None,
**attrs: Any,
) -> Union[click.Command, Callable[[AnyCallable], click.Command]]: ...
def option(self, *param_decls: Any, **attrs: Any) -> Callable[[_FC], _FC]: ...
optgroup: _OptGroup = ...
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Optional
from typing import Tuple
from typing import Type
from typing import TypeVar
from typing import Union
from typing import overload
import click
from ._core import _FC
from ._core import AnyCallable
from ._core import OptionGroup
class OptionStackItem(NamedTuple):
param_decls: Tuple[str, ...]
attrs: Dict[str, Any]
param_count: int
class _NotAttachedOption(click.Option):
def __init__(self, param_decls: Any = ..., *, all_not_attached_options: Any, **attrs: Any) -> None: ...
def handle_parse_result(self, ctx: click.Context, opts: Any, args: List[str]) -> Any: ...
_GrpType = TypeVar("_GrpType", bound=OptionGroup)
class _OptGroup:
def __init__(self) -> None: ...
def __call__(
self,
name: Optional[str] = ...,
*,
help: Optional[str] = None,
cls: Optional[Type[_GrpType]] = None,
**attrs: Any,
) -> Union[click.Command, Callable[[AnyCallable], click.Command]]: ...
@overload
def group(
self,
name: Optional[str],
cls: type[_GrpType],
**attrs: Any,
) -> Callable[[AnyCallable], click.Command]: ...
@overload
def group(
self,
name: str = ...,
cls: None = None,
**attrs: Any,
) -> Callable[[AnyCallable], click.Command]: ...
@overload
def group(
self,
name: Optional[str] = ...,
*,
help: Optional[str] = ...,
cls: Optional[Type[_GrpType]] = None,
**attrs: Any,
) -> Union[click.Command, Callable[[AnyCallable], click.Command]]: ...
def option(self, *param_decls: Any, **attrs: Any) -> Callable[[_FC], _FC]: ...
optgroup: _OptGroup = ...

View File

@@ -1,3 +1,3 @@
"""This type stub file was generated by pyright."""
__version__ = ...
"""This type stub file was generated by pyright."""
__version__ = ...