From 2e453fb005afff4a18560ae35fcda98d36a0dca8 Mon Sep 17 00:00:00 2001 From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> Date: Sun, 11 Jun 2023 12:53:15 +0000 Subject: [PATCH] refactor(configuration): __config__ and perf move model_ids and default_id to config class declaration, cleanup dependencies between config and LLM implementation lazy load module during LLM creation to llm_post_init fix post_init hooks to run load_in_mha. Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> --- src/openllm/_configuration.py | 445 ++++++++++++------ src/openllm/_llm.py | 166 +++---- src/openllm/_package.py | 18 +- src/openllm/cli.py | 44 +- src/openllm/models/auto/configuration_auto.py | 13 + .../models/chatglm/configuration_chatglm.py | 31 +- .../models/chatglm/modeling_chatglm.py | 7 +- .../models/dolly_v2/configuration_dolly_v2.py | 15 +- .../models/dolly_v2/modeling_dolly_v2.py | 19 +- .../models/falcon/configuration_falcon.py | 26 +- src/openllm/models/falcon/modeling_falcon.py | 14 +- .../models/flan_t5/configuration_flan_t5.py | 14 +- .../models/flan_t5/modeling_flan_t5.py | 13 +- .../models/flan_t5/modeling_flax_flan_t5.py | 10 - .../models/flan_t5/modeling_tf_flan_t5.py | 10 - .../models/stablelm/configuration_stablelm.py | 14 +- .../models/stablelm/modeling_stablelm.py | 27 +- .../starcoder/configuration_starcoder.py | 19 +- .../models/starcoder/modeling_starcoder.py | 21 +- src/openllm/utils/dantic.py | 7 +- typings/attr/__init__.pyi | 4 + typings/attr/_compat.pyi | 4 + 22 files changed, 565 insertions(+), 376 deletions(-) create mode 100644 typings/attr/_compat.pyi diff --git a/src/openllm/_configuration.py b/src/openllm/_configuration.py index a4e230d1..48444d97 100644 --- a/src/openllm/_configuration.py +++ b/src/openllm/_configuration.py @@ -12,13 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Configuration utilities for OpenLLM. All model configuration will inherit from openllm.configuration_utils.LLMConfig. +Configuration utilities for OpenLLM. All model configuration will inherit from ``openllm.LLMConfig``. -Note that ``openllm.LLMConfig`` is a subclass of ``pydantic.BaseModel``. It also -has a ``to_click_options`` that returns a list of Click-compatible options for the model. -Such options will then be parsed to ``openllm.__main__.cli``. - -Each fields in ``openllm.LLMConfig`` will also automatically generate a environment +Highlight feature: Each fields in ``openllm.LLMConfig`` will also automatically generate a environment variable based on its name field. For example, the following config class: @@ -34,10 +30,14 @@ class FlanT5Config(openllm.LLMConfig): repetition_penalty = 1.0 ``` +which generates the environment OPENLLM_FLAN_T5_GENERATION_TEMPERATURE for users to configure temperature +dynamically during serve, ahead-of-serve or per requests. +Refer to ``openllm.LLMConfig`` docstring for more information. 
""" from __future__ import annotations +import inspect import logging import os import typing as t @@ -53,13 +53,13 @@ from deepmerge.merger import Merger import openllm from .exceptions import GpuNotAvailableError, OpenLLMException -from .utils import LazyType, ModelEnv, bentoml_cattr, dantic, lenient_issubclass +from .utils import LazyType, ModelEnv, bentoml_cattr, dantic, first_not_none, lenient_issubclass if t.TYPE_CHECKING: import tensorflow as tf import torch import transformers - from attr import _CountingAttr, _make_init + from attr import _CountingAttr, _make_init, _make_method, _make_repr from transformers.generation.beam_constraints import Constraint from ._types import ClickFunctionWrapper, F, O_co, P @@ -67,14 +67,16 @@ if t.TYPE_CHECKING: ReprArgs: t.TypeAlias = t.Iterable[tuple[str | None, t.Any]] DictStrAny = dict[str, t.Any] + ListStr = list[str] ItemgetterAny = itemgetter[t.Any] else: Constraint = t.Any + ListStr = list DictStrAny = dict ItemgetterAny = itemgetter # NOTE: Using internal API from attr here, since we are actually # allowing subclass of openllm.LLMConfig to become 'attrs'-ish - from attr._make import _CountingAttr, _make_init + from attr._make import _CountingAttr, _make_init, _make_method, _make_repr transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") torch = openllm.utils.LazyLoader("torch", globals(), "torch") @@ -93,6 +95,8 @@ config_merger = Merger( type_conflict_strategies=["override"], ) +_T = t.TypeVar("_T") + @t.overload def attrs_to_options( @@ -158,7 +162,9 @@ class GenerationConfig: """Generation config provides the configuration to then be parsed to ``transformers.GenerationConfig``, with some additional validation and environment constructor. - Note that we always set `do_sample=True` + Note that we always set `do_sample=True`. This class is not designed to be used directly, rather + to be used conjunction with LLMConfig. The instance of the generation config can then be accessed + via ``LLMConfig.generation_config``. """ # NOTE: parameters for controlling the length of the output @@ -417,20 +423,6 @@ def _populate_value_from_env_var( return os.environ.get(key, fallback) -def env_transformers(cls: type[GenerationConfig], fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]: - transformed: list[attr.Attribute[t.Any]] = [] - for f in fields: - if "env" not in f.metadata: - raise ValueError( - "Make sure to setup the field with 'cls.Field' or 'attr.field(..., metadata={\"env\": \"...\"})'" - ) - _from_env = _populate_value_from_env_var(f.metadata["env"]) - if _from_env is not None: - f = f.evolve(default=_from_env) - transformed.append(f) - return transformed - - # sentinel object for unequivocal object() getattr _sentinel = object() @@ -590,7 +582,7 @@ def _make_internal_generation_class(cls: type[LLMConfig]) -> type[GenerationConf ) return transformed - generated_cls = attr.make_class( + _cl = attr.make_class( cls.__name__.replace("Config", "GenerationConfig"), [], bases=(GenerationConfig,), @@ -599,28 +591,205 @@ def _make_internal_generation_class(cls: type[LLMConfig]) -> type[GenerationConf repr=True, field_transformer=_evolve_with_base_default, ) + _cl.__doc__ = GenerationConfig.__doc__ - return generated_cls + if _has_gen_class: + delattr(cls, "GenerationConfig") + + return _cl -# NOTE: This DEFAULT_LLMCONFIG_ATTRS is a way to dynamically generate attr.field -# and will be saved for future use in LLMConfig if we have some shared config. 
-DEFAULT_LLMCONFIG_ATTRS: tuple[tuple[str, t.Any, str, type[t.Any]], ...] = () +# NOTE: This is the ModelConfig where we can control the behaviour of the LLM. +# refers to the __openllm_*__ docstring inside LLMConfig for more information. +class ModelConfig(t.TypedDict, total=False): + # NOTE: meta + url: str + requires_gpu: bool + trust_remote_code: bool + requirements: t.Optional[t.List[str]] + + # NOTE: naming convention, only name_type is needed + # as the three below it can be determined automatically + name_type: t.Literal["dasherize", "lowercase"] + model_name: str + start_name: str + env: openllm.utils.ModelEnv + + # NOTE: serving configuration + timeout: int + workers_per_resource: t.Union[int, float] + + # NOTE: use t.Required once we drop 3.8 support + default_id: str + model_ids: list[str] + + +def _gen_default_model_config(cls: type[LLMConfig]) -> ModelConfig: + """Generate the default ModelConfig and delete __config__ in LLMConfig + if defined inplace.""" + + _internal_config = t.cast(ModelConfig, getattr(cls, "__config__", {})) + default_id = _internal_config.get("default_id", None) + if default_id is None: + raise RuntimeError("'default_id' is required under '__config__'.") + model_ids = _internal_config.get("model_ids", None) + if model_ids is None: + raise RuntimeError("'model_ids' is required under '__config__'.") + + def _first_not_null(key: str, default: _T) -> _T: + return first_not_none(_internal_config.get(key), default=default) + + llm_config_striped = cls.__name__.replace("Config", "") + + name_type: t.Literal["dasherize", "lowercase"] = _first_not_null("name_type", "dasherize") + + if name_type == "dasherize": + default_model_name = inflection.underscore(llm_config_striped) + default_start_name = inflection.dasherize(default_model_name) + else: + default_model_name = llm_config_striped.lower() + default_start_name = default_model_name + + model_name = _first_not_null("model_name", default_model_name) + + _config = ModelConfig( + name_type=name_type, + model_name=model_name, + default_id=default_id, + model_ids=model_ids, + start_name=_first_not_null("start_name", default_start_name), + url=_first_not_null("url", "(not provided)"), + requires_gpu=_first_not_null("requires_gpu", False), + trust_remote_code=_first_not_null("trust_remote_code", False), + requirements=_first_not_null("requirements", ListStr()), + env=_first_not_null("env", openllm.utils.ModelEnv(model_name)), + timeout=_first_not_null("timeout", 3600), + workers_per_resource=_first_not_null("workers_per_resource", 1), + ) + + if hasattr(cls, "__config__"): + delattr(cls, "__config__") + + return _config + + +def _generate_unique_filename(cls: type[t.Any], func_name: str): + return f"" + + +def _setattr_class(attr_name: str, value_var: t.Any): + """ + Use the builtin setattr to set *attr_name* to *value_var*. + We can't use the cached object.__setattr__ since we are setting + attributes to a class. + """ + return f"setattr(cls, '{attr_name}', {value_var})" + + +@t.overload +def _make_assignment_with_prefix_script(cls: type[LLMConfig], attributes: ModelConfig) -> t.Callable[..., None]: + ... + + +@t.overload +def _make_assignment_with_prefix_script(cls: type[LLMConfig], attributes: dict[str, t.Any]) -> t.Callable[..., None]: + ... 
+ + +def _make_assignment_with_prefix_script(cls: type[LLMConfig], attributes: t.Any) -> t.Callable[..., None]: + """Generate the assignment script with prefix attributes __openllm___""" + args: list[str] = [] + globs: dict[str, t.Any] = {"cls": cls, "attr_dict": attributes} + annotations: dict[str, t.Any] = {"return": None} + + # Circumvent __setattr__ descriptor to save one lookup per assigment + lines: list[str] = [] + for attr_name in attributes: + arg_name = f"__openllm_{inflection.underscore(attr_name)}__" + args.append(f"{attr_name}=attr_dict['{attr_name}']") + lines.append(_setattr_class(arg_name, attr_name)) + annotations[attr_name] = type(attributes[attr_name]) + + script = "def __assign_attr(cls, %s):\n %s\n" % (", ".join(args), "\n ".join(lines) if lines else "pass") + assign_method = _make_method( + "__assign_attr", + script, + _generate_unique_filename(cls, "__assign_attr"), + globs, + ) + assign_method.__annotations__ = annotations + + return assign_method @attr.define class LLMConfig: - Field = dantic.Field - """Field is a alias to the internal dantic utilities to easily create - attrs.fields with pydantic-compatible interface. + """ + ``openllm.LLMConfig`` is somewhat a hybrid combination between the performance of `attrs` with the + easy-to-use interface that pydantic offer. It lives in between where it allows users to quickly formulate + a LLMConfig for any LLM without worrying too much about performance. It does a few things: + + - Automatic environment conversion: Each fields will automatically be provisioned with an environment + variable, make it easy to work with ahead-of-time or during serving time + - Familiar API: It is compatible with cattrs as well as providing a few Pydantic-2 like API, + i.e: ``model_construct_env``, ``to_generation_config``, ``to_click_options`` + - Automatic CLI generation: It can identify each fields and convert it to compatible Click options. + This means developers can use any of the LLMConfig to create CLI with compatible-Python + CLI library (click, typer, ...) + + > Internally, LLMConfig is an attrs class. All subclass of LLMConfig contains "attrs-like" features, + > which means LLMConfig will actually generate subclass to have attrs-compatible API, so that the subclass + > can be written as any normal Python class. + + To directly configure GenerationConfig for any given LLM, create a GenerationConfig under the subclass: + + ```python + class FlanT5Config(openllm.LLMConfig): + + class GenerationConfig: + temperature: float = 0.75 + max_new_tokens: int = 3000 + top_k: int = 50 + top_p: float = 0.4 + repetition_penalty = 1.0 + ``` + By doing so, openllm.LLMConfig will create a compatible GenerationConfig attrs class that can be converted + to ``transformers.GenerationConfig``. These attribute can be accessed via ``LLMConfig.generation_config``. + + By default, all LLMConfig has a __config__ that contains a default value. If any LLM requires customization, + provide a ``__config__`` under the class declaration: + + ```python + class FalconConfig(openllm.LLMConfig): + __config__ = {"trust_remote_code": True, "default_timeout": 3600000} + ``` + + Note that ``model_name``, ``start_name``, and ``env`` is optional under ``__config__``. If set, then OpenLLM + will respect that option for start and other components within the library. 
""" - if t.TYPE_CHECKING: - # The following is handled via __init_subclass__, and is only used for TYPE_CHECKING + Field = dantic.Field + """Field is a alias to the internal dantic utilities to easily create + attrs.fields with pydantic-compatible interface. For example: + ```python + class MyModelConfig(openllm.LLMConfig): + + field1 = openllm.LLMConfig.Field(...) + ``` + """ + + # NOTE: The following is handled via __init_subclass__, and is only used for TYPE_CHECKING + if t.TYPE_CHECKING: + # NOTE: Internal attributes that should only be used by OpenLLM. Users usually shouldn't + # concern any of these. def __attrs_init__(self, **attrs: t.Any): """Generated __attrs_init__ for LLMConfig subclass that follows the attrs contract.""" + __config__: ModelConfig | None = None + """Internal configuration for this LLM model. Each of the field in here will be populated + and prefixed with __openllm___""" + __attrs_attrs__: tuple[attr.Attribute[t.Any], ...] = tuple() """Since we are writing our own __init_subclass__, which is an alternative way for __prepare__, we want openllm.LLMConfig to be attrs-like dataclass that has pydantic-like interface. @@ -630,8 +799,15 @@ class LLMConfig: __openllm_attrs__: tuple[str, ...] = tuple() """Internal attribute tracking to store converted LLMConfig attributes to correct attrs""" - __openllm_timeout__: int = 3600 - """The default timeout to be set for this given LLM.""" + __openllm_hints__: dict[str, t.Any] = Field(None, init=False) + """An internal cache of resolved types for this LLMConfig.""" + + __openllm_accepted_keys__: set[str] = Field(None, init=False) + """The accepted keys for this LLMConfig.""" + + # NOTE: The following will be populated from __config__ + __openllm_url__: str = Field(None, init=False) + """The resolved url for this LLMConfig.""" __openllm_requires_gpu__: bool = False """Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.""" @@ -639,6 +815,13 @@ class LLMConfig: __openllm_trust_remote_code__: bool = False """Whether to always trust remote code""" + __openllm_requirements__: list[str] | None = None + """The default PyPI requirements needed to run this given LLM. By default, we will depend on + bentoml, torch, transformers.""" + + __openllm_env__: openllm.utils.ModelEnv = Field(None, init=False) + """A ModelEnv instance for this LLMConfig.""" + __openllm_model_name__: str = "" """The normalized version of __openllm_start_name__, determined by __openllm_name_type__""" @@ -649,21 +832,8 @@ class LLMConfig: """the default name typed for this model. "dasherize" will convert the name to lowercase and replace spaces with dashes. "lowercase" will convert the name to lowercase.""" - __openllm_env__: openllm.utils.ModelEnv = Field(None, init=False) - """A ModelEnv instance for this LLMConfig.""" - - __openllm_hints__: dict[str, t.Any] = Field(None, init=False) - """An internal cache of resolved types for this LLMConfig.""" - - __openllm_url__: str = Field(None, init=False) - """The resolved url for this LLMConfig.""" - - __openllm_accepted_keys__: set[str] = Field(None, init=False) - """The accepted keys for this LLMConfig.""" - - __openllm_requirements__: list[str] | None = None - """The default PyPI requirements needed to run this given LLM. By default, we will depend on - bentoml, torch, transformers.""" + __openllm_timeout__: int = 3600 + """The default timeout to be set for this given LLM.""" __openllm_workers_per_resource__: int | float = 1 """The default number of workers per resource. 
By default, we will use 1 worker per resource. @@ -671,6 +841,18 @@ class LLMConfig: https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details. """ + __openllm_default_id__: str = Field(None) + """Return the default model to use when using 'openllm start '. + This could be one of the keys in 'self.model_ids' or custom users model.""" + + __openllm_model_ids__: list[str] = Field(None) + """A list of supported pretrained models tag for this given runnable. + + For example: + For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base", + "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"] + """ + GenerationConfig: type = type """Users can override this subclass of any given LLMConfig to provide GenerationConfig default value. For example: @@ -689,35 +871,9 @@ class LLMConfig: """The result generated GenerationConfig class for this LLMConfig. This will be used to create the generation_config argument that can be used throughout the lifecycle.""" - def __init_subclass__( - cls, - *, - name_type: t.Literal["dasherize", "lowercase"] = "dasherize", - default_timeout: int | None = None, - trust_remote_code: bool = False, - requires_gpu: bool = False, - url: str | None = None, - requirements: list[str] | None = None, - workers_per_resource: int | float = 1, - ): - if name_type == "dasherize": - model_name = inflection.underscore(cls.__name__.replace("Config", "")) - start_name = inflection.dasherize(model_name) - else: - model_name = cls.__name__.replace("Config", "").lower() - start_name = model_name - - cls.__openllm_name_type__ = name_type - cls.__openllm_requires_gpu__ = requires_gpu - cls.__openllm_timeout__ = default_timeout or 3600 - cls.__openllm_trust_remote_code__ = trust_remote_code - - cls.__openllm_model_name__ = model_name - cls.__openllm_start_name__ = start_name - cls.__openllm_env__ = openllm.utils.ModelEnv(model_name) - cls.__openllm_url__ = url or "(not set)" - cls.__openllm_requirements__ = requirements - cls.__openllm_workers_per_resource__ = workers_per_resource + def __init_subclass__(cls): + # NOTE: auto assignment attributes generated from __config__ + _make_assignment_with_prefix_script(cls, _gen_default_model_config(cls))(cls) # NOTE: Since we want to enable a pydantic-like experience # this means we will have to hide the attr abstraction, and generate @@ -727,7 +883,7 @@ class LLMConfig: cd = cls.__dict__ def field_env_key(key: str) -> str: - return f"OPENLLM_{model_name.upper()}_{key.upper()}" + return f"OPENLLM_{cls.__openllm_model_name__.upper()}_{key.upper()}" ca_names = {name for name, attr in cd.items() if isinstance(attr, _CountingAttr)} ca_list: list[tuple[str, _CountingAttr[t.Any]]] = [] @@ -763,22 +919,14 @@ class LLMConfig: gen_attribute = gen_attribute.evolve(metadata=metadata) own_attrs.append(gen_attribute) + # This is to handle subclass of subclass of all provided LLMConfig. + # refer to attrs for the original implementation. base_attrs, base_attr_map = _collect_base_attrs(cls, {a.name for a in own_attrs}) # __openllm_attrs__ is a tracking tuple[attr.Attribute[t.Any]] # that we construct ourself. cls.__openllm_attrs__ = tuple(a.name for a in own_attrs) - # NOTE: Enable some default attributes that can be shared across all LLMConfig - if len(DEFAULT_LLMCONFIG_ATTRS) > 0: - # NOTE: update the hints for default variables we dynamically added. 
- hints.update({k: hints for k, _, _, hints in DEFAULT_LLMCONFIG_ATTRS}) - base_attrs = [ - attr.Attribute.from_counting_attr(k, cls.Field(default, env=field_env_key(k), description=docs), hints) - for k, default, docs, hints in DEFAULT_LLMCONFIG_ATTRS - if k not in cls.__openllm_attrs__ - ] + base_attrs - attrs: list[attr.Attribute[t.Any]] = own_attrs + base_attrs # Mandatory vs non-mandatory attr order only matters when they are part of @@ -810,38 +958,33 @@ class LLMConfig: _has_post_init = bool(getattr(cls, "__attrs_post_init__", False)) AttrsTuple = _make_attr_tuple_class(cls.__name__, cls.__openllm_attrs__) + # NOTE: the protocol for attrs-decorated class + cls.__attrs_attrs__ = AttrsTuple(attrs) # NOTE: generate a __attrs_init__ for the subclass cls.__attrs_init__ = _add_method_dunders( cls, _make_init( - cls, - AttrsTuple(attrs), - _has_pre_init, - _has_post_init, - False, - True, - True, - base_attr_map, - False, - None, - attrs_init=True, + cls, # cls (the attrs-decorated class) + cls.__attrs_attrs__, # tuple of attr.Attribute of cls + _has_pre_init, # pre_init + _has_post_init, # post_init + False, # frozen + True, # slots + True, # cache_hash + base_attr_map, # base_attr_map + False, # is_exc (check if it is exception) + None, # cls_on_setattr (essentially attr.setters) + attrs_init=True, # whether to create __attrs_init__ instead of __init__ ), ) - cls.__attrs_attrs__ = AttrsTuple(attrs) # NOTE: Finally, set the generation_class for this given config. cls.generation_class = _make_internal_generation_class(cls) hints.update(t.get_type_hints(cls.generation_class)) - cls.__openllm_hints__ = hints - cls.__openllm_accepted_keys__ = set(cls.__openllm_attrs__) | set(attr.fields_dict(cls.generation_class)) - @property - def name_type(self) -> t.Literal["dasherize", "lowercase"]: - return self.__openllm_name_type__ - def __init__( self, *, @@ -849,33 +992,56 @@ class LLMConfig: __openllm_extras__: dict[str, t.Any] | None = None, **attrs: t.Any, ): - self.__openllm_extras__ = openllm.utils.first_not_none(__openllm_extras__, default={}) + # create a copy of the list of keys as cache + _cached_keys = tuple(attrs.keys()) + + self.__openllm_extras__ = first_not_none(__openllm_extras__, default={}) config_merger.merge( self.__openllm_extras__, {k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__} ) - attrs = {k: v for k, v in attrs.items() if k not in self.__openllm_extras__ and v is not None} + for k in _cached_keys: + if k in self.__openllm_extras__ or attrs.get(k) is None: + del attrs[k] + _cached_keys = tuple(k for k in _cached_keys if k in attrs) + _generation_cl_dict = attr.fields_dict(self.generation_class) if generation_config is None: - generation_config = {k: v for k, v in attrs.items() if k in attr.fields_dict(self.generation_class)} + generation_config = {k: v for k, v in attrs.items() if k in _generation_cl_dict} + else: + generation_keys = {k for k in attrs if k in _generation_cl_dict} + if len(generation_keys) > 0: + logger.warning( + "When 'generation_config' is passed, \ + the following keys are ignored and won't be used: %s. If you wish to use those values, \ + pass it into 'generation_config'.", + ", ".join(generation_keys), + ) + for k in _cached_keys: + if k in generation_keys: + del attrs[k] + _cached_keys = tuple(k for k in _cached_keys if k in attrs) self.generation_config = self.generation_class(**generation_config) + base_attrs: tuple[attr.Attribute[t.Any], ...] 
= attr.fields(self.__class__) + base_attrs += ( + attr.Attribute.from_counting_attr( + name="generation_config", + ca=dantic.Field( + self.generation_config, description=inspect.cleandoc(self.generation_class.__doc__ or "") + ), + type=self.generation_class, + ), + ) + # mk the class __repr__ function with the updated fields. + self.__class__.__repr__ = _add_method_dunders(self.__class__, _make_repr(base_attrs, None, self.__class__)) - attrs = {k: v for k, v in attrs.items() if k not in generation_config} + for k in _cached_keys: + if k in generation_config: + del attrs[k] - self.__attrs_init__(**{k: v for k, v in attrs.items() if k in self.__openllm_attrs__}) - - # The rest update to extras - attrs = {k: v for k, v in attrs.items() if k not in self.__openllm_attrs__} - config_merger.merge(self.__openllm_extras__, attrs) - - def __repr__(self) -> str: - bases = f"{self.__class__.__qualname__.rsplit('>.', 1)[-1]}(generation_config={repr(self.generation_class())}" - if len(self.__openllm_attrs__) > 0: - bases += ", " + ", ".join([f"{k}={getattr(self, k)}" for k in self.__openllm_attrs__]) + ")" - else: - bases += ")" - return bases + # The rest of attrs should only be the attributes to be passed to __attrs_init__ + self.__attrs_init__(**attrs) def __getattr__(self, item: str) -> t.Any: if hasattr(self.generation_config, item): @@ -975,43 +1141,46 @@ class LLMConfig: config = transformers.GenerationConfig(**bentoml_cattr.unstructure(self.generation_config)) return config.to_dict() if return_as_dict else config + @classmethod @t.overload def to_click_options( - self, f: t.Callable[..., openllm.LLMConfig] + cls, f: t.Callable[..., openllm.LLMConfig] ) -> F[P, ClickFunctionWrapper[..., openllm.LLMConfig]]: ... + @classmethod @t.overload - def to_click_options(self, f: t.Callable[P, O_co]) -> F[P, ClickFunctionWrapper[P, O_co]]: + def to_click_options(cls, f: t.Callable[P, O_co]) -> F[P, ClickFunctionWrapper[P, O_co]]: ... - def to_click_options(self, f: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]: + @classmethod + def to_click_options(cls, f: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]: """ Convert current model to click options. This can be used as a decorator for click commands. Note that the identifier for all LLMConfig will be prefixed with '_*', and the generation config will be prefixed with '_generation_*'. """ - for name, field in attr.fields_dict(self.generation_class).items(): - ty = self.__openllm_hints__.get(name) + for name, field in attr.fields_dict(cls.generation_class).items(): + ty = cls.__openllm_hints__.get(name) if t.get_origin(ty) is t.Union: # NOTE: Union type is currently not yet supported, we probably just need to use environment instead. 
continue - f = attrs_to_options(name, field, self.__openllm_model_name__, typ=ty, suffix_generation=True)(f) - f = optgroup.group(f"{self.generation_class.__name__} generation options")(f) + f = attrs_to_options(name, field, cls.__openllm_model_name__, typ=ty, suffix_generation=True)(f) + f = optgroup.group(f"{cls.generation_class.__name__} generation options")(f) - if len(self.__class__.__openllm_attrs__) == 0: + if len(cls.__openllm_attrs__) == 0: # NOTE: in this case, the function is already a ClickFunctionWrapper # hence the casting return f - for name, field in attr.fields_dict(self.__class__).items(): - ty = self.__openllm_hints__.get(name) + for name, field in attr.fields_dict(cls).items(): + ty = cls.__openllm_hints__.get(name) if t.get_origin(ty) is t.Union: # NOTE: Union type is currently not yet supported, we probably just need to use environment instead. continue - f = attrs_to_options(name, field, self.__openllm_model_name__, typ=ty)(f) + f = attrs_to_options(name, field, cls.__openllm_model_name__, typ=ty)(f) - return optgroup.group(f"{self.__class__.__name__} options")(f) + return optgroup.group(f"{cls.__name__} options")(f) bentoml_cattr.register_unstructure_hook_factory( diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index da883bb1..2e4ea206 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -32,6 +32,7 @@ from bentoml.types import ModelSignature, ModelSignatureDict import openllm from .exceptions import ForbiddenAttributeError, OpenLLMException +from .models.auto import AutoConfig from .utils import ENV_VARS_TRUE_VALUES, LazyLoader, bentoml_cattr if t.TYPE_CHECKING: @@ -180,37 +181,19 @@ def import_model( torch.cuda.empty_cache() -_required_namespace = {"default_id", "model_ids"} - -_reserved_namespace = _required_namespace | { - "config_class", - "model", - "tokenizer", - "import_kwargs", -} +_reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"} class LLMInterface(ABC): """This defines the loose contract for all openllm.LLM implementations.""" - default_id: str - """Return the default model to use when using 'openllm start '. - This could be one of the keys in 'self.model_ids' or custom users model.""" - - model_ids: list[str] - """A list of supported pretrained models tag for this given runnable. - - For example: - For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base", - "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"] - """ - config_class: type[openllm.LLMConfig] """The config class to use for this LLM. If you are creating a custom LLM, you must specify this class.""" - import_kwargs: dict[str, t.Any] | None = None - """The default import kwargs to used when importing the model. - This will be passed into 'openllm.LLM.import_model'.""" + @property + def import_kwargs(self) -> dict[str, t.Any] | None: + """The default import kwargs to used when importing the model. + This will be passed into 'openllm.LLM.import_model'.""" @abstractmethod def generate(self, prompt: str, **preprocess_generate_kwds: t.Any) -> t.Any: @@ -265,34 +248,29 @@ class LLMInterface(ABC): raise NotImplementedError +def _default_post_init(self: LLM): + # load_in_mha: Whether to apply BetterTransformer (or Torch MultiHeadAttention) during inference load. + # See https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/ + # for more information. 
+ # NOTE: set a default variable to transform to BetterTransformer by default for inference + self.load_in_mha = ( + os.environ.get(self.config_class.__openllm_env__.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES + ) + if self.config_class.__openllm_requires_gpu__: + # For all models that requires GPU, no need to offload it to BetterTransformer + # use bitsandbytes instead + + self.load_in_mha = False + + class LLMMetaclass(ABCMeta): def __new__( mcls, cls_name: str, bases: tuple[type[t.Any], ...], namespace: dict[str, t.Any], **attrs: t.Any ) -> type: """Metaclass for creating a LLM.""" if LLMInterface not in bases: # only actual openllm.LLM should hit this branch. - if "__annotations__" not in namespace: - annotations_dict: dict[str, t.Any] = {} - namespace["__annotations__"] = annotations_dict - - # NOTE: check for required attributes - if "__openllm_internal__" not in namespace: - _required_namespace.add("config_class") - for k in _required_namespace: - if k not in namespace: - raise RuntimeError(f"Missing required key '{k}'. Make sure to define it within the LLM subclass.") - # NOTE: set implementation branch prefix_class_name_config = cls_name - if "__llm_implementation__" in namespace: - raise RuntimeError( - f"""\ - __llm_implementation__ should not be set directly. Instead make sure that your class - name follows the convention prefix: - - For Tensorflow implementation: 'TF{cls_name}' - - For Flax implementation: 'Flax{cls_name}' - - For PyTorch implementation: '{cls_name}'""" - ) if cls_name.startswith("Flax"): implementation = "flax" prefix_class_name_config = cls_name[4:] @@ -302,39 +280,37 @@ class LLMMetaclass(ABCMeta): else: implementation = "pt" namespace["__llm_implementation__"] = implementation + config_class = AutoConfig.infer_class_from_name(prefix_class_name_config) # NOTE: setup config class branch if "__openllm_internal__" in namespace: # NOTE: we will automatically find the subclass for this given config class if "config_class" not in namespace: # this branch we will automatically get the class - namespace["config_class"] = getattr(openllm, f"{prefix_class_name_config}Config") + namespace["config_class"] = config_class else: logger.debug(f"Using config class {namespace['config_class']} for {cls_name}.") + # NOTE: check for required attributes + else: + if "config_class" not in namespace: + raise RuntimeError( + "Missing required key 'config_class'. Make sure to define it within the LLM subclass." + ) - config_class: type[openllm.LLMConfig] = namespace["config_class"] + # NOTE: the llm_post_init branch + if "llm_post_init" in namespace: + original_llm_post_init = namespace["llm_post_init"] - # NOTE: update the annotations for self.config - namespace["__annotations__"]["config"] = t.get_type_hints(config_class) + def wrapped_llm_post_init(self: LLM) -> None: + """We need to both initialize private attributes and call the user-defined model_post_init + method. + """ + _default_post_init(self) + original_llm_post_init(self) - for key in ("__openllm_start_name__", "__openllm_requires_gpu__"): - namespace[key] = getattr(config_class, key) - - # load_in_mha: Whether to apply BetterTransformer (or Torch MultiHeadAttention) during inference load. - # See https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/ - # for more information. 
- # NOTE: set a default variable to transform to BetterTransformer by default for inference - if "load_in_mha" not in namespace: - load_in_mha = ( - os.environ.get(config_class().__openllm_env__.bettertransformer, str(False)).upper() - in ENV_VARS_TRUE_VALUES - ) - namespace["load_in_mha"] = load_in_mha - - if namespace["__openllm_requires_gpu__"]: - # For all models that requires GPU, no need to offload it to BetterTransformer - # use bitsandbytes instead - namespace["load_in_mha"] = False + namespace["llm_post_init"] = wrapped_llm_post_init + else: + namespace["llm_post_init"] = _default_post_init # NOTE: import_model branch if "import_model" not in namespace: @@ -343,16 +319,10 @@ class LLMMetaclass(ABCMeta): else: logger.debug("Using custom 'import_model' for %s", cls_name) - # NOTE: populate with default cache. - namespace.update({k: None for k in ("__llm_bentomodel__", "__llm_model__", "__llm_tokenizer__")}) - cls: type[LLM] = super().__new__(t.cast("type[type[LLM]]", mcls), cls_name, bases, namespace, **attrs) + cls.__openllm_post_init__ = None if cls.llm_post_init is LLMInterface.llm_post_init else cls.llm_post_init - cls.__openllm_custom_load__ = None if cls.load_model is LLMInterface.load_model else cls.load_model - - if getattr(cls, "config_class") is None: - raise RuntimeError(f"'config_class' must be defined for '{cls.__name__}'") return cls else: # the LLM class itself being created, no need to setup @@ -362,23 +332,19 @@ class LLMMetaclass(ABCMeta): class LLM(LLMInterface, metaclass=LLMMetaclass): if t.TYPE_CHECKING: # NOTE: the following will be populated by metaclass - __llm_bentomodel__: bentoml.Model | None = None - __llm_model__: LLMModel | None = None - __llm_tokenizer__: LLMTokenizer | None = None __llm_implementation__: t.Literal["pt", "tf", "flax"] - - __openllm_start_name__: str - __openllm_requires_gpu__: bool __openllm_post_init__: t.Callable[[t.Self], None] | None - __openllm_custom_load__: t.Callable[[t.Self, t.Any, t.Any], None] | None load_in_mha: bool _llm_attrs: dict[str, t.Any] _llm_args: tuple[t.Any, ...] - # NOTE: the following is the similar interface to HuggingFace pretrained protocol. + __llm_bentomodel__: bentoml.Model | None = None + __llm_model__: LLMModel | None = None + __llm_tokenizer__: LLMTokenizer | None = None + # NOTE: the following is the similar interface to HuggingFace pretrained protocol. @classmethod def from_pretrained( cls, model_id: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **attrs: t.Any @@ -446,15 +412,33 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): Note that this tag will be generated based on `self.default_id` or the given `pretrained` kwds. passed from the __init__ constructor. - ``llm_post_init`` can also be implemented if you need to do any - additional initialization after everything is setup. + ``llm_post_init`` can also be implemented if you need to do any additional + initialization after everything is setup. 
+ + Note: If you need to implement a custom `load_model`, the following is an example from Falcon implementation: + + ```python + def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any: + torch_dtype = attrs.pop("torch_dtype", torch.bfloat16) + device_map = attrs.pop("device_map", "auto") + + _ref = bentoml.transformers.get(tag) + + model = bentoml.transformers.load_model(_ref, device_map=device_map, torch_dtype=torch_dtype, **attrs) + return transformers.pipeline("text-generation", model=model, tokenizer=_ref.custom_objects["tokenizer"]) + ``` Args: model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used. - llm_config: The config to use for this LLM. Defaults to None. If not passed, we will use 'self.config_class' - to construct default configuration. + llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM + will use `config_class` to construct default configuration. *args: The args to be passed to the model. **attrs: The kwargs to be passed to the model. + + The following are optional: + openllm_model_version: version for this `model_id`. By default, users can ignore this if using pretrained + weights as OpenLLM will use the commit_hash of given model_id. + However, if `model_id` is a path, this argument is recomended to include. """ if llm_config is not None: @@ -466,10 +450,8 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): attrs = self.config.__openllm_extras__ if model_id is None: - model_id = os.environ.get(self.config.__openllm_env__.model_id, None) - if not model_id: - assert self.default_id, "A default model is required for any LLM." - model_id = self.default_id + model_id = os.environ.get(self.config.__openllm_env__.model_id, self.config.__openllm_default_id__) + assert model_id is not None # NOTE: This is the actual given path or pretrained weight for this LLM. self._model_id = model_id @@ -479,7 +461,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): self._llm_attrs = attrs if self.__openllm_post_init__: - self.__openllm_post_init__(self) + self.llm_post_init() def __setattr__(self, attr: str, value: t.Any): if attr in _reserved_namespace: @@ -500,7 +482,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): def identifying_params(self) -> dict[str, t.Any]: return { "configuration": self.config.model_dump_json().decode(), - "model_ids": orjson.dumps(self.model_ids).decode(), + "model_ids": orjson.dumps(self.config.__openllm_model_ids__).decode(), } @t.overload @@ -643,7 +625,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): return self._bentomodel.tag @property - def model(self) -> LLMModel | torch.nn.Module: + def model(self) -> LLMModel: """The model to use for this LLM. 
This shouldn't be set at runtime, rather let OpenLLM handle it.""" # Run check for GPU trust_remote_code = self._llm_attrs.pop("trust_remote_code", self.config.__openllm_trust_remote_code__) @@ -818,7 +800,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): (_Runnable,), { "SUPPORTED_RESOURCES": ("nvidia.com/gpu", "cpu") - if self.__openllm_requires_gpu__ + if self.config.__openllm_requires_gpu__ else ("nvidia.com/gpu",), "llm_type": self.llm_type, "identifying_params": self.identifying_params, diff --git a/src/openllm/_package.py b/src/openllm/_package.py index a1887aa6..5b39727c 100644 --- a/src/openllm/_package.py +++ b/src/openllm/_package.py @@ -76,16 +76,17 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions: # first, then proceed to install everything inside the wheels/ folder. packages: list[str] = ["openllm"] - ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__) if llm.config.__openllm_requirements__ is not None: packages.extend(llm.config.__openllm_requirements__) if not (str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false"): packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}") - to_use_framework = ModelEnv.get_framework_env() + to_use_framework = llm.config.__openllm_env__.get_framework_env() if to_use_framework == "flax": - assert utils.is_flax_available(), f"Flax is not available, while {ModelEnv.framework} is set to 'flax'" + assert ( + utils.is_flax_available() + ), f"Flax is not available, while {llm.config.__openllm_env__.framework} is set to 'flax'" packages.extend( [ f"flax>={importlib.metadata.version('flax')}", @@ -94,7 +95,9 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions: ] ) elif to_use_framework == "tf": - assert utils.is_tf_available(), f"TensorFlow is not available, while {ModelEnv.framework} is set to 'tf'" + assert ( + utils.is_tf_available() + ), f"TensorFlow is not available, while {llm.config.__openllm_env__.framework} is set to 'tf'" candidates = ( "tensorflow", "tensorflow-cpu", @@ -128,7 +131,6 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions: def construct_docker_options(llm: openllm.LLM, _: FS) -> DockerOptions: - ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__) _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "") _bentoml_config_options += ( " " @@ -141,7 +143,7 @@ def construct_docker_options(llm: openllm.LLM, _: FS) -> DockerOptions: return DockerOptions( cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version env={ - ModelEnv.framework: ModelEnv.get_framework_env(), + llm.config.__openllm_env__.framework: llm.config.__openllm_env__.get_framework_env(), "OPENLLM_MODEL": llm.config.__openllm_model_name__, "BENTOML_DEBUG": str(get_debug_mode()), "BENTOML_CONFIG_OPTIONS": _bentoml_config_options, @@ -200,12 +202,12 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be raise bentoml.exceptions.NotFound("Overwriting previously saved Bento.") _previously_built = True except bentoml.exceptions.NotFound: - logger.info("Building Bento for LLM '%s'", llm.__openllm_start_name__) + logger.info("Building Bento for LLM '%s'", llm.config.__openllm_start_name__) bento = bentoml.bentos.build( f"{service_name}:svc", name=bento_tag.name, labels=labels, - description=f"OpenLLM service for {llm.__openllm_start_name__}", + description=f"OpenLLM service for {llm.config.__openllm_start_name__}", include=[ f for f in 
llm_fs.walk.files(filter=["*.py"]) ], # NOTE: By default, we are using _service.py as the default service, for now. diff --git a/src/openllm/cli.py b/src/openllm/cli.py index 017d48fc..1fc46dfe 100644 --- a/src/openllm/cli.py +++ b/src/openllm/cli.py @@ -384,12 +384,10 @@ def start_model_command( ModelEnv = openllm.utils.ModelEnv(model_name) llm_config = openllm.AutoConfig.for_model(model_name) - for_doc = openllm.AutoLLM.for_model(model_name) docstring = f"""\ {ModelEnv.start_docstring} \b -Available model_id(s) to use with '{model_name}' are: {for_doc.model_ids} [default: {for_doc.default_id}] -Tip: One can pass one of the aforementioned to '--model-id' to use other pretrained weights. +Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.__openllm_default_id__}] """ command_attrs: dict[str, t.Any] = { "name": ModelEnv.model_name, @@ -399,7 +397,7 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai } aliases: list[str] = [] - if llm_config.name_type == "dasherize": + if llm_config.__openllm_name_type__ == "dasherize": aliases.append(llm_config.__openllm_start_name__) command_attrs["aliases"] = aliases if len(aliases) > 0 else None @@ -429,8 +427,9 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai @group.command(**command_attrs) @llm_config.to_click_options @parse_serve_args(_serve_grpc) - @click.option("--server-timeout", type=int, default=3600, help="Server timeout in seconds") - @model_id_option + @cog.optgroup.group("General LLM Options") + @cog.optgroup.option("--server-timeout", type=int, default=3600, help="Server timeout in seconds") + @model_id_option(cog.optgroup, model_env=ModelEnv) @click.option( "--device", type=tuple, @@ -439,6 +438,7 @@ Tip: One can pass one of the aforementioned to '--model-id' to use other pretrai envvar="CUDA_VISIBLE_DEVICES", callback=parse_device_callback, help=f"Assign GPU devices (if available) for {model_name}.", + show_envvar=True, ) def model_start( server_timeout: int, @@ -575,12 +575,20 @@ output_option = click.option( default="pretty", help="Showing output type. 
Default to 'pretty'", ) -model_id_option = click.option( - "--model-id", - type=click.STRING, - default=None, - help="Optional model_id name or path for (fine-tune) weight.", -) + + +def model_id_option(factory: t.Any, model_env: openllm.utils.ModelEnv | None = None): + envvar = None + if model_env is not None: + envvar = model_env.model_id + return factory.option( + "--model-id", + type=click.STRING, + default=None, + help="Optional model_id name or path for (fine-tune) weight.", + envvar=envvar, + show_envvar=True if envvar is not None else False, + ) def cli_factory() -> click.Group: @@ -627,7 +635,7 @@ def cli_factory() -> click.Group: @click.argument( "model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]) ) - @model_id_option + @model_id_option(click) @output_option @click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.") def build(model_name: str, model_id: str | None, overwrite: bool, output: OutputLiteral): @@ -713,8 +721,12 @@ def cli_factory() -> click.Group: runtime_impl += ("flax",) if model.config.__openllm_model_name__ in openllm.MODEL_TF_MAPPING_NAMES: runtime_impl += ("tf",) - json_data[m] = {"model_id": model.model_ids, "description": docs, "runtime_impl": runtime_impl} - converted.extend([convert_transformers_model_name(i) for i in model.model_ids]) + json_data[m] = { + "model_id": model.config.__openllm_model_ids__, + "description": docs, + "runtime_impl": runtime_impl, + } + converted.extend([convert_transformers_model_name(i) for i in model.config.__openllm_model_ids__]) except Exception as err: failed_initialized.append((m, err)) @@ -783,7 +795,7 @@ def cli_factory() -> click.Group: @click.argument( "model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]) ) - @model_id_option + @model_id_option(click) @output_option def download_models(model_name: str, model_id: str | None, output: OutputLiteral): """Setup LLM interactively. diff --git a/src/openllm/models/auto/configuration_auto.py b/src/openllm/models/auto/configuration_auto.py index f45ec170..a36012d5 100644 --- a/src/openllm/models/auto/configuration_auto.py +++ b/src/openllm/models/auto/configuration_auto.py @@ -87,6 +87,7 @@ class _LazyConfigMapping(ConfigOrderedDict): CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) +CONFIG_NAME_ALIASES: dict[str, str] = {"chat_glm": "chatglm", "stable_lm": "stablelm", "star_coder": "starcoder"} class AutoConfig: @@ -102,3 +103,15 @@ class AutoConfig: f"Unrecognized configuration class for {model_name}. " f"Model name should be one of {', '.join(CONFIG_MAPPING.keys())}." ) + + @classmethod + def infer_class_from_name(cls, name: str) -> type[openllm.LLMConfig]: + model_name = inflection.underscore(name) + if model_name in CONFIG_NAME_ALIASES: + model_name = CONFIG_NAME_ALIASES[model_name] + if model_name in CONFIG_MAPPING: + return CONFIG_MAPPING[model_name] + raise ValueError( + f"Unrecognized configuration class for {model_name}. " + f"Model name should be one of {', '.join(CONFIG_MAPPING.keys())}." 
+ ) diff --git a/src/openllm/models/chatglm/configuration_chatglm.py b/src/openllm/models/chatglm/configuration_chatglm.py index d5230d2e..d330a161 100644 --- a/src/openllm/models/chatglm/configuration_chatglm.py +++ b/src/openllm/models/chatglm/configuration_chatglm.py @@ -16,15 +16,7 @@ from __future__ import annotations import openllm -class ChatGLMConfig( - openllm.LLMConfig, - name_type="lowercase", - trust_remote_code=True, - default_timeout=3600000, - requires_gpu=True, - url="https://github.com/THUDM/ChatGLM-6B", - requirements=["cpm_kernels", "sentencepiece"], -): +class ChatGLMConfig(openllm.LLMConfig): """ ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework. @@ -41,11 +33,24 @@ class ChatGLMConfig( Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information. """ - retain_history: bool = False - """Whether to retain history given to the model. If set to True, then the model will retain given history.""" + __config__ = { + "name_type": "lowercase", + "trust_remote_code": True, + "timeout": 3600000, + "requires_gpu": True, + "url": "https://github.com/THUDM/ChatGLM-6B", + "requirements": ["cpm_kernels", "sentencepiece"], + "default_id": "thudm/chatglm-6b-int4", + "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4"], + } - use_half_precision: bool = True - """Whether to use half precision for model.""" + retain_history: bool = openllm.LLMConfig.Field( + False, + description="""Whether to retain history given to the model. + If set to True, then the model will retain given history.""", + ) + + use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.") class GenerationConfig: max_new_tokens: int = 2048 diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py index 01b1154b..a3672163 100644 --- a/src/openllm/models/chatglm/modeling_chatglm.py +++ b/src/openllm/models/chatglm/modeling_chatglm.py @@ -41,11 +41,8 @@ class InvalidScoreLogitsProcessor(LogitsProcessor): class ChatGLM(openllm.LLM): __openllm_internal__ = True - default_id = "thudm/chatglm-6b-int4" - - model_ids = ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4"] - - device = torch.device("cuda") + def llm_post_init(self): + self.device = torch.device("cuda") def import_model( self, diff --git a/src/openllm/models/dolly_v2/configuration_dolly_v2.py b/src/openllm/models/dolly_v2/configuration_dolly_v2.py index 2b148bce..49584ee0 100644 --- a/src/openllm/models/dolly_v2/configuration_dolly_v2.py +++ b/src/openllm/models/dolly_v2/configuration_dolly_v2.py @@ -20,12 +20,7 @@ from __future__ import annotations import openllm -class DollyV2Config( - openllm.LLMConfig, - default_timeout=3600000, - trust_remote_code=True, - url="https://github.com/databrickslabs/dolly", -): +class DollyV2Config(openllm.LLMConfig): """Databricks’ Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. @@ -39,6 +34,14 @@ class DollyV2Config( Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information. 
""" + __config__ = { + "timeout": 3600000, + "trust_remote_code": True, + "url": "https://github.com/databrickslabs/dolly", + "default_id": "databricks/dolly-v2-3b", + "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"], + } + return_full_text: bool = openllm.LLMConfig.Field( False, description="Whether to return the full prompt to the users." ) diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py index d01b8cfe..f892a64c 100644 --- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -38,17 +38,16 @@ class DollyV2(openllm.LLM): __openllm_internal__ = True - default_id = "databricks/dolly-v2-3b" + @property + def import_kwargs(self): + return { + "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, + "torch_dtype": torch.bfloat16, + "_tokenizer_padding_side": "left", + } - model_ids = ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"] - - import_kwargs = { - "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, - "torch_dtype": torch.bfloat16, - "_tokenizer_padding_side": "left", - } - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + def llm_post_init(self): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def import_model( self, model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any diff --git a/src/openllm/models/falcon/configuration_falcon.py b/src/openllm/models/falcon/configuration_falcon.py index a2704e25..cca8fad1 100644 --- a/src/openllm/models/falcon/configuration_falcon.py +++ b/src/openllm/models/falcon/configuration_falcon.py @@ -16,15 +16,7 @@ from __future__ import annotations import openllm -class FalconConfig( - openllm.LLMConfig, - name_type="lowercase", - trust_remote_code=True, - requires_gpu=True, - default_timeout=3600000, - url="https://falconllm.tii.ae/", - requirements=["einops", "xformers", "safetensors"], -): +class FalconConfig(openllm.LLMConfig): """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora. It is made available under the TII Falcon LLM License. @@ -32,6 +24,22 @@ class FalconConfig( Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information. 
""" + __config__ = { + "name_type": "lowercase", + "trust_remote_code": True, + "requires_gpu": True, + "timeout": 3600000, + "url": "https://falconllm.tii.ae/", + "requirements": ["einops", "xformers", "safetensors"], + "default_id": "tiiuae/falcon-7b", + "model_ids": [ + "tiiuae/falcon-7b", + "tiiuae/falcon-40b", + "tiiuae/falcon-7b-instruct", + "tiiuae/falcon-40b-instruct", + ], + } + class GenerationConfig: max_new_tokens: int = 200 top_k: int = 10 diff --git a/src/openllm/models/falcon/modeling_falcon.py b/src/openllm/models/falcon/modeling_falcon.py index 41535d0f..34595871 100644 --- a/src/openllm/models/falcon/modeling_falcon.py +++ b/src/openllm/models/falcon/modeling_falcon.py @@ -34,14 +34,12 @@ else: class Falcon(openllm.LLM): __openllm_internal__ = True - default_id = "tiiuae/falcon-7b" - - model_ids = ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"] - - import_kwargs = { - "torch_dtype": torch.bfloat16, - "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, - } + @property + def import_kwargs(self): + return { + "torch_dtype": torch.bfloat16, + "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, + } def import_model( self, model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any diff --git a/src/openllm/models/flan_t5/configuration_flan_t5.py b/src/openllm/models/flan_t5/configuration_flan_t5.py index d67f972c..a23861bd 100644 --- a/src/openllm/models/flan_t5/configuration_flan_t5.py +++ b/src/openllm/models/flan_t5/configuration_flan_t5.py @@ -44,13 +44,25 @@ $ openllm start flan-t5 --model-id google/flan-t5-xxl DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:""" -class FlanT5Config(openllm.LLMConfig, url="https://huggingface.co/docs/transformers/model_doc/flan-t5"): +class FlanT5Config(openllm.LLMConfig): """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf) - it is an enhanced version of T5 that has been finetuned in a mixture of tasks. Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information. 
""" + __config__ = { + "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", + "default_id": "google/flan-t5-large", + "model_ids": [ + "google/flan-t5-small", + "google/flan-t5-base", + "google/flan-t5-large", + "google/flan-t5-xl", + "google/flan-t5-xxl", + ], + } + class GenerationConfig: temperature: float = 0.9 max_new_tokens: int = 2048 diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py index 658436e0..b2cbead6 100644 --- a/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -29,17 +29,8 @@ else: class FlanT5(openllm.LLM): __openllm_internal__ = True - default_id = "google/flan-t5-large" - - model_ids = [ - "google/flan-t5-small", - "google/flan-t5-base", - "google/flan-t5-large", - "google/flan-t5-xl", - "google/flan-t5-xxl", - ] - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + def llm_post_init(self): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def sanitize_parameters( self, diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index 5c08db5e..c176bac1 100644 --- a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -24,16 +24,6 @@ from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE class FlaxFlanT5(openllm.LLM): __openllm_internal__ = True - default_id: str = "google/flan-t5-large" - - model_ids = [ - "google/flan-t5-small", - "google/flan-t5-base", - "google/flan-t5-large", - "google/flan-t5-xl", - "google/flan-t5-xxl", - ] - def sanitize_parameters( self, prompt: str, diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index e950919c..2d254df2 100644 --- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -24,16 +24,6 @@ from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE class TFFlanT5(openllm.LLM): __openllm_internal__ = True - default_id: str = "google/flan-t5-large" - - model_ids = [ - "google/flan-t5-small", - "google/flan-t5-base", - "google/flan-t5-large", - "google/flan-t5-xl", - "google/flan-t5-xxl", - ] - def sanitize_parameters( self, prompt: str, diff --git a/src/openllm/models/stablelm/configuration_stablelm.py b/src/openllm/models/stablelm/configuration_stablelm.py index 4dd777e1..7ddcc9d4 100644 --- a/src/openllm/models/stablelm/configuration_stablelm.py +++ b/src/openllm/models/stablelm/configuration_stablelm.py @@ -16,7 +16,7 @@ from __future__ import annotations import openllm -class StableLMConfig(openllm.LLMConfig, name_type="lowercase", url="https://github.com/Stability-AI/StableLM"): +class StableLMConfig(openllm.LLMConfig): """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models. @@ -30,6 +30,18 @@ class StableLMConfig(openllm.LLMConfig, name_type="lowercase", url="https://gith for more information. 
""" + __config__ = { + "name_type": "lowercase", + "url": "https://github.com/Stability-AI/StableLM", + "default_id": "stabilityai/stablelm-tuned-alpha-3b", + "model_ids": [ + "stabilityai/stablelm-tuned-alpha-3b", + "stabilityai/stablelm-tuned-alpha-7b", + "stabilityai/stablelm-base-alpha-3b", + "stabilityai/stablelm-base-alpha-7b", + ], + } + class GenerationConfig: temperature: float = 0.9 max_new_tokens: int = 128 diff --git a/src/openllm/models/stablelm/modeling_stablelm.py b/src/openllm/models/stablelm/modeling_stablelm.py index 25e30fb7..ac9ff5af 100644 --- a/src/openllm/models/stablelm/modeling_stablelm.py +++ b/src/openllm/models/stablelm/modeling_stablelm.py @@ -42,23 +42,17 @@ logger = logging.getLogger(__name__) class StableLM(openllm.LLM): __openllm_internal__ = True - load_in_mha = True if not torch.cuda.is_available() else False - default_id = "stabilityai/stablelm-tuned-alpha-3b" + def llm_post_init(self): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.load_in_mha = True if not torch.cuda.is_available() else False - model_ids = [ - "stabilityai/stablelm-tuned-alpha-3b", - "stabilityai/stablelm-tuned-alpha-7b", - "stabilityai/stablelm-base-alpha-3b", - "stabilityai/stablelm-base-alpha-7b", - ] - - import_kwargs = { - "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32, - "load_in_8bit": False, - "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, - } - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + @property + def import_kwargs(self): + return { + "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32, + "load_in_8bit": False, + "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, + } def sanitize_parameters( self, @@ -98,7 +92,6 @@ class StableLM(openllm.LLM): def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] - @torch.inference_mode() def generate(self, prompt: str, **attrs: t.Any) -> list[str]: generation_kwargs = { "do_sample": True, diff --git a/src/openllm/models/starcoder/configuration_starcoder.py b/src/openllm/models/starcoder/configuration_starcoder.py index 1346a108..a7593e3a 100644 --- a/src/openllm/models/starcoder/configuration_starcoder.py +++ b/src/openllm/models/starcoder/configuration_starcoder.py @@ -16,14 +16,7 @@ from __future__ import annotations import openllm -class StarCoderConfig( - openllm.LLMConfig, - name_type="lowercase", - requires_gpu=True, - url="https://github.com/bigcode-project/starcoder", - requirements=["bitsandbytes"], - workers_per_resource=0.5, -): +class StarCoderConfig(openllm.LLMConfig): """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded. @@ -34,6 +27,16 @@ class StarCoderConfig( Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information. 
""" + __config__ = { + "name_type": "lowercase", + "requires_gpu": True, + "url": "https://github.com/bigcode-project/starcoder", + "requirements": ["bitsandbytes"], + "workers_per_resource": 0.5, + "default_id": "bigcode/starcoder", + "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"], + } + class GenerationConfig: temperature: float = 0.2 max_new_tokens: int = 256 diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py index 7959ec19..cf70e204 100644 --- a/src/openllm/models/starcoder/modeling_starcoder.py +++ b/src/openllm/models/starcoder/modeling_starcoder.py @@ -40,18 +40,17 @@ FIM_INDICATOR = "" class StarCoder(openllm.LLM): __openllm_internal__ = True - default_id = "bigcode/starcoder" + def llm_post_init(self): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model_ids = ["bigcode/starcoder", "bigcode/starcoderbase"] - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - import_kwargs = { - "_tokenizer_padding_side": "left", - "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, - "load_in_8bit": True if torch.cuda.device_count() > 1 else False, - "torch_dtype": torch.float16, - } + @property + def import_kwargs(self): + return { + "_tokenizer_padding_side": "left", + "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, + "load_in_8bit": True if torch.cuda.device_count() > 1 else False, + "torch_dtype": torch.float16, + } def import_model( self, diff --git a/src/openllm/utils/dantic.py b/src/openllm/utils/dantic.py index 3c5bfb43..a7f9cc79 100644 --- a/src/openllm/utils/dantic.py +++ b/src/openllm/utils/dantic.py @@ -38,7 +38,10 @@ def _default_converter(value: t.Any, env: str | None) -> t.Any: if env is not None: value = os.environ.get(env, value) if value is not None and isinstance(value, str): - return eval(value, {"__builtins__": {}}, {}) + try: + return orjson.loads(value) + except orjson.JSONDecodeError as err: + raise RuntimeError(f"Failed to parse '{value}' from '{env}': {err}") return value @@ -65,7 +68,7 @@ def Field( """ metadata = attrs.pop("metadata", {}) if description is None: - description = "(No description is available)" + description = "(No description provided)" metadata["description"] = description if env is not None: metadata["env"] = env diff --git a/typings/attr/__init__.pyi b/typings/attr/__init__.pyi index 99aa31b7..0b1b5d30 100644 --- a/typings/attr/__init__.pyi +++ b/typings/attr/__init__.pyi @@ -1,3 +1,5 @@ +from __future__ import annotations + import enum import sys from typing import ( @@ -500,3 +502,5 @@ def _make_init( cls_on_setattr: Any, attrs_init: bool, ) -> Callable[_P, Any]: ... +def _make_method(name: str, script: str, filename: str, globs: dict[str, Any]) -> Callable[..., Any]: ... +def _make_repr(attrs: tuple[Attribute[Any]], ns: str | None, cls: AttrsInstance) -> Callable[[AttrsInstance], str]: ... diff --git a/typings/attr/_compat.pyi b/typings/attr/_compat.pyi new file mode 100644 index 00000000..8f559fce --- /dev/null +++ b/typings/attr/_compat.pyi @@ -0,0 +1,4 @@ +import threading + +set_closure_cell = ... +repr_context: threading.local = ...