diff --git a/src/openllm/_configuration.py b/src/openllm/_configuration.py index 3aa8c435..28dc39c3 100644 --- a/src/openllm/_configuration.py +++ b/src/openllm/_configuration.py @@ -38,7 +38,6 @@ class FlanT5Config(openllm.LLMConfig): """ from __future__ import annotations -import copy import os import types import typing as t @@ -55,7 +54,7 @@ from click_option_group import optgroup import openllm from .exceptions import GpuNotAvailableError, OpenLLMException -from .utils import _object_setattr +from .utils import LazyType from .utils.dantic import allows_multiple, parse_default if t.TYPE_CHECKING: @@ -70,15 +69,20 @@ if t.TYPE_CHECKING: import transformers from pydantic.fields import FieldInfo from transformers.generation.beam_constraints import Constraint + + DictStrAny = dict[str, t.Any] else: from transformers.utils.dummy_pt_objects import Constraint + DictStrAny = dict transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") torch = openllm.utils.LazyLoader("torch", globals(), "torch") tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow") __all__ = ["LLMConfig", "ModelSignature"] +_object_setattr = object.__setattr__ + def field_to_options( name: str, field: FieldInfo, model_name: str, suffix_generation: bool = False @@ -87,6 +91,7 @@ def field_to_options( envvar = field.json_schema_extra.get("env") if field.json_schema_extra else None dasherized = inflection.dasherize(name) underscored = inflection.underscore(name) + full_option_name = f"--{dasherized}" if field.annotation is bool: full_option_name += f"/--no-{dasherized}" @@ -101,7 +106,7 @@ def field_to_options( type=field.annotation, required=field.is_required(), default=parse_default(field.default, field.annotation), - show_default=False, + show_default=True if field.default else False, multiple=allows_multiple(field.annotation), help=field.description, show_envvar=True if envvar else False, @@ -109,14 +114,13 @@ def field_to_options( ) -def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, str]: - kwargs: dict[str, t.Any] = {} - for key, field in model.model_fields.items(): - if field.json_schema_extra is not None: - if "env" not in field.json_schema_extra: - raise RuntimeError(f"Invalid {model} passed. Only accept LLMConfig or LLMConfig.generation_config") - kwargs[key] = os.environ.get(field.json_schema_extra["env"], field.default) - return {k: v for k, v in kwargs.items() if v is not None} +def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, t.Any]: + # NOTE: We can safe cast here since all of the fields in GenerationConfig or LLMConfig + # will have a `env` field in `json_schema_extra` + return { + key: os.environ.get(t.cast("dict[str, t.Any]", field.json_schema_extra)["env"], field.default) + for key, field in model.model_fields.items() + } class GenerationConfig(pydantic.BaseModel): @@ -136,7 +140,7 @@ class GenerationConfig(pydantic.BaseModel): description="""The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.""", ) - min_new_tokens: t.Optional[int] = pydantic.Field( + min_new_tokens: int = pydantic.Field( None, description="The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt." 
) early_stopping: bool = pydantic.Field( @@ -150,7 +154,7 @@ class GenerationConfig(pydantic.BaseModel): (canonical beam search algorithm) """, ) - max_time: t.Optional[float] = pydantic.Field( + max_time: float = pydantic.Field( None, description="""The maximum amount of time you allow the computation to run for in seconds. generation will still finish the current pass after allocated time has been passed.""", @@ -163,7 +167,7 @@ class GenerationConfig(pydantic.BaseModel): description="""Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.""", ) - penalty_alpha: t.Optional[float] = pydantic.Field( + penalty_alpha: float = pydantic.Field( None, description="""The values balance the model confidence and the degeneration penalty in contrastive search decoding.""", @@ -242,14 +246,15 @@ class GenerationConfig(pydantic.BaseModel): no_repeat_ngram_size: int = pydantic.Field( 0, description="If set to int > 0, all ngrams of that size can only occur once." ) - bad_words_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field( + bad_words_ids: t.List[t.List[int]] = pydantic.Field( None, description="""List of token ids that are not allowed to be generated. In order to get the token ids of the words that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`. """, ) - force_words_ids: t.Optional[t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]]] = pydantic.Field( + # NOTE: t.Union is not yet supported on CLI, but the environment variable should already be available. + force_words_ids: t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]] = pydantic.Field( None, description="""List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of words that must be included, the opposite to `bad_words_ids`. @@ -265,13 +270,13 @@ class GenerationConfig(pydantic.BaseModel): algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization. """, ) - constraints: t.Optional[t.List["Constraint"]] = pydantic.Field( + constraints: t.List["Constraint"] = pydantic.Field( None, description="""Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by ``Constraint`` objects, in the most sensible way possible. """, ) - forced_bos_token_id: t.Optional[int] = pydantic.Field( + forced_bos_token_id: int = pydantic.Field( None, description="""The id of the token to force as the first generated token after the ``decoder_start_token_id``. Useful for multilingual models like @@ -279,7 +284,7 @@ class GenerationConfig(pydantic.BaseModel): to be the target language token. """, ) - forced_eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field( + forced_eos_token_id: t.Union[int, t.List[int]] = pydantic.Field( None, description="""The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens.""", @@ -289,26 +294,26 @@ class GenerationConfig(pydantic.BaseModel): description="""Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. 
Note that using `remove_invalid_values` can slow down generation.""", ) - exponential_decay_length_penalty: t.Optional[t.Tuple[int, float]] = pydantic.Field( + exponential_decay_length_penalty: t.Tuple[int, float] = pydantic.Field( None, description="""This tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty starts and `decay_factor` represents the factor of exponential decay """, ) - suppress_tokens: t.Optional[t.List[int]] = pydantic.Field( + suppress_tokens: t.List[int] = pydantic.Field( None, description="""A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled. """, ) - begin_suppress_tokens: t.Optional[t.List[int]] = pydantic.Field( + begin_suppress_tokens: t.List[int] = pydantic.Field( None, description="""A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. """, ) - forced_decoder_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field( + forced_decoder_ids: t.List[t.List[int]] = pydantic.Field( None, description="""A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always @@ -338,9 +343,9 @@ class GenerationConfig(pydantic.BaseModel): ) # NOTE: Special tokens that can be used at generation time - pad_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *padding* token.") - bos_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.") - eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field( + pad_token_id: int = pydantic.Field(None, description="The id of the *padding* token.") + bos_token_id: int = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.") + eos_token_id: t.Union[int, t.List[int]] = pydantic.Field( None, description="""The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.""", @@ -353,7 +358,7 @@ class GenerationConfig(pydantic.BaseModel): `encoder_input_ids` cannot occur in the `decoder_input_ids`. """, ) - decoder_start_token_id: t.Optional[int] = pydantic.Field( + decoder_start_token_id: int = pydantic.Field( None, description="""If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token. @@ -361,7 +366,7 @@ class GenerationConfig(pydantic.BaseModel): ) # NOTE: pydantic definition - model_config = dict(arbitrary_types_allowed=True, extra="forbid") + model_config = {"extra": "forbid", "arbitrary_types_allowed": True} if t.TYPE_CHECKING: # The following is handled via __pydantic_init_subclass__ @@ -395,28 +400,11 @@ class GenerationConfig(pydantic.BaseModel): # NOTE: I don't know how to do this more efficiently in pydantic v2 yet, will probably # need to consult the pydantic team on this. 
for key, field in self.model_fields.items(): - json_schema: dict[str, t.Any] = ( - copy.deepcopy(field.json_schema_extra) if field.json_schema_extra is not None else {} - ) - env_key = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}" - if "env" in json_schema: - field.default = os.environ.get(json_schema["env"], field.default) + if not field.json_schema_extra: + field.json_schema_extra = {} + if "env" in field.json_schema_extra: continue - json_schema["env"] = env_key - # then assign json_schema back to field - field.json_schema_extra = json_schema - field.default = os.environ.get(env_key, field.default) - - def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]: - for name, field in self.model_fields.items(): - if t.get_origin(field.annotation) is t.Union: - # NOTE: Union type is currently not yet supported, we probably just need to use environment instead. - continue - f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f) - return optgroup.group( - f"{self.__class__.__name__} generation options", - help=f"[Auto-generated from '{self.__class__.__qualname__}']", - )(f) + field.json_schema_extra["env"] = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}" class LLMConfig(pydantic.BaseModel, ABC): @@ -428,23 +416,15 @@ class LLMConfig(pydantic.BaseModel, ABC): return getattr(self.generation_config, attr) return getattr(self, attr) - def __repr_args__(self) -> ReprArgs: - """Overwrite from default BaseModel and don't show __pydantic_extra__.""" - yield from ( - (k, v) - for k, v in self.__dict__.items() - if not k.startswith("_") and (k not in self.model_fields or self.model_fields[k].repr) - ) - yield from ((k, getattr(self, k)) for k, v in self.model_computed_fields.items() if v.repr) - if t.TYPE_CHECKING: # The following is handled via __pydantic_init_subclass__, and is only used for TYPE_CHECKING - __openllm_model_name__: str = "" - __openllm_start_name__: str = "" - __openllm_timeout__: int = 0 + __openllm_model_name__: str + __openllm_start_name__: str + __openllm_timeout__: int = 3600 __openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize" __openllm_trust_remote_code__: bool = False __openllm_requires_gpu__: bool = False + __openllm_env__: openllm.utils.ModelEnv GenerationConfig: type[t.Any] = GenerationConfig def __init_subclass__( @@ -488,6 +468,8 @@ class LLMConfig(pydantic.BaseModel, ABC): cls.__openllm_model_name__ = cls.__name__.replace("Config", "").lower() cls.__openllm_start_name__ = cls.__openllm_model_name__ + cls.__openllm_env__ = openllm.utils.ModelEnv(cls.__openllm_model_name__) + if hasattr(cls, "GenerationConfig"): cls.generation_config = t.cast( "type[GenerationConfig]", @@ -508,12 +490,10 @@ class LLMConfig(pydantic.BaseModel, ABC): def model_post_init(self, _: t.Any): if self.__pydantic_extra__: - generation_config = self.__pydantic_extra__.pop("generation_config", None) + generation_config: dict[str, t.Any] | None = self.__pydantic_extra__.pop("generation_config", None) if generation_config is not None: - assert isinstance(generation_config, dict), "generation_config must be a dict." - self.generation_config = self.generation_config.model_copy( - update=t.cast("dict[str, t.Any]", generation_config), deep=True - ) + assert LazyType[DictStrAny](dict).isinstance(generation_config), "generation_config must be a dict." 
+ self.generation_config = self.generation_config.model_copy(update=generation_config, deep=True) else: # The rest of the extras fields should just be the generation_config. self.generation_config = self.generation_config.model_copy(update=self.__pydantic_extra__, deep=True) @@ -551,64 +531,48 @@ class LLMConfig(pydantic.BaseModel, ABC): except pydantic.ValidationError as e: raise openllm.exceptions.ValidationError(f"Failed to dump configuration to dict: {e}") from e - def with_options(self, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig: + @classmethod + def model_construct_env(cls, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig: """A helpers that respect configuration values that sets from environment variables for any given configuration class. """ - from_env_ = self.from_env() - # filtered out None values - attrs = {k: v for k, v in attrs.items() if v is not None} - generation_keys = {k for k in attrs if k in self.generation_config.model_fields} - - generation_attrs = {k: v for k, v in attrs.items() if k in generation_keys} - config_attrs = {k: v for k, v in attrs.items() if k not in generation_keys} - - # NOTE: first set the default config kwargs. - # We will always respect envvar as default, then the one that is pass - attrs = {**generate_kwargs_from_envvar(self), **config_attrs} + env_json_string = os.environ.get(cls.__openllm_env__.model_config, None) + if env_json_string is not None: + try: + self = cls.model_construct(**orjson.loads(env_json_string)) + except pydantic.ValidationError as e: + raise RuntimeError(f"Failed to parse '{cls.__openllm_env__.model_config}' as valid JSON string.") from e + else: + self = cls.model_construct() if __llm_config__ is not None: # NOTE: Only hit this branch on the server. Client shouldn't use __llm_config__ - attrs = {**attrs, **__llm_config__.model_dump()} + # as it is not set. 
+ return self.model_construct(**__llm_config__.model_dump(flatten=True)) - # NOTE: Then we setup generation config values - attrs["generation_config"] = { - **generate_kwargs_from_envvar(self.generation_config), - **attrs.get("generation_config", {}), - **generation_attrs, - } + # filtered out None values + attrs = {k: v for k, v in attrs.items() if v is not None} - if from_env_: - return from_env_.model_construct(**attrs) - return self.model_construct(**attrs) + construct_attrs = generate_kwargs_from_envvar(self) + construct_attrs.update(generate_kwargs_from_envvar(self.generation_config)) + construct_attrs.update(attrs) - @classmethod - def from_env(cls) -> LLMConfig | None: - envvar = openllm.utils.MODEL_CONFIG_ENV_VAR(cls.__openllm_model_name__) - env_json_string = os.environ.get(envvar, None) - if env_json_string is None: - return - - try: - return cls.model_construct(**orjson.loads(env_json_string)) - except pydantic.ValidationError as e: - raise RuntimeError(f"Failed to parse environment variable '{envvar}' as a valid JSON string.") from e + return self.model_construct(**construct_attrs) def model_validate_click(self, **attrs: t.Any) -> tuple[LLMConfig, dict[str, t.Any]]: """Parse given click attributes into a LLMConfig and return the remaining click attributes.""" - llm_config_attrs = { - k[len(self.__openllm_model_name__) + 1 :]: v - for k, v in attrs.items() - if k[len(self.__openllm_model_name__) + 1 :] in self.model_fields - } - llm_config_attrs["generation_config"] = { - k[len(self.__openllm_model_name__ + "_generation") + 1 :]: v - for k, v in attrs.items() - if k[len(self.__openllm_model_name__ + "_generation") + 1 :] in self.generation_config.model_fields - } - return self.with_options(**llm_config_attrs), { - k: v for k, v in attrs.items() if not k.startswith(self.__openllm_model_name__) - } + llm_config_attrs = {} + key_to_remove: list[str] = [] + + for k, v in attrs.items(): + if k.startswith(f"{self.__openllm_model_name__}_"): + llm_config_attrs[k[len(self.__openllm_model_name__) + 1 :]] = v + key_to_remove.append(k) + elif k.startswith(f"{self.__openllm_model_name__}_generation_"): + llm_config_attrs[k[len(self.__openllm_model_name__ + "_generation") + 1 :]] = v + key_to_remove.append(k) + + return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove} @t.overload def to_generation_config(self, return_as_dict: t.Literal[True] = ...) -> dict[str, t.Any]: @@ -627,17 +591,25 @@ class LLMConfig(pydantic.BaseModel, ABC): return config - def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]: + def to_click_options(self, f: F[P]) -> t.Callable[[F[P]], click.Command]: """ Convert current model to click options. This can be used as a decorator for click commands. Note that the identifier for all LLMConfig will be prefixed with '_*', and the generation config will be prefixed with '_generation_*'. """ - wrapped_generation = self.generation_config.to_click_options(f) + + for name, field in self.generation_config.model_fields.items(): + if t.get_origin(field.annotation) is t.Union: + # NOTE: Union type is currently not yet supported, we probably just need to use environment instead. 
+ continue + f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f) + f = optgroup.group(f"{self.__class__.__name__} generation options")(f) + if len(self.model_fields.values()) == 0: - return wrapped_generation + return f for name, field in self.model_fields.items(): - wrapped_generation = field_to_options(name, field, self.__openllm_model_name__)(wrapped_generation) - return optgroup.group( - f"{self.__class__.__name__} options", help=f"[Auto-generated from '{self.__class__.__qualname__}']" - )(wrapped_generation) + if t.get_origin(field.annotation) is t.Union: + # NOTE: Union type is currently not yet supported, we probably just need to use environment instead. + continue + f = field_to_options(name, field, self.__openllm_model_name__)(f) + return optgroup.group(f"{self.__class__.__name__} options")(f) diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index a9a99021..5b8044a9 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -440,8 +440,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): # NOTE: The section below defines a loose contract with langchain's LLM interface. @property def llm_type(self) -> str: - assert self.default_model is not None - return openllm.utils.convert_transformers_model_name(self.default_model) + return openllm.utils.convert_transformers_model_name(self._pretrained) @property def identifying_params(self) -> dict[str, t.Any]: @@ -637,10 +636,10 @@ def Runner(start_name: str, **attrs: t.Any) -> bentoml.Runner: behaviour """ init_local = attrs.pop("init_local", False) - envvar = openllm.utils.get_framework_env(start_name) - if envvar == "flax": + ModelEnv = openllm.utils.ModelEnv(start_name) + if ModelEnv.get_framework_env() == "flax": runner = openllm.AutoFlaxLLM.create_runner(start_name, **attrs) - elif envvar == "tf": + elif ModelEnv.get_framework_env() == "tf": runner = openllm.AutoTFLLM.create_runner(start_name, **attrs) else: runner = openllm.AutoLLM.create_runner(start_name, **attrs) diff --git a/src/openllm/_package.py b/src/openllm/_package.py index 1255c7c2..c3649133 100644 --- a/src/openllm/_package.py +++ b/src/openllm/_package.py @@ -70,6 +70,8 @@ def build_editable(path: str) -> str | None: def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions: packages: list[str] = [] + + ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__) if llm.requirements is not None: packages.extend(llm.requirements) @@ -89,11 +91,9 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions: ] ) - to_use_framework = utils.get_framework_env(llm.__openllm_start_name__) + to_use_framework = ModelEnv.get_framework_env() if to_use_framework == "flax": - assert ( - utils.is_flax_available() - ), f"Flax is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'flax'" + assert utils.is_flax_available(), f"Flax is not available, while {ModelEnv.framework} is set to 'flax'" packages.extend( [ f"flax>={importlib.metadata.version('flax')}", @@ -102,9 +102,7 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions: ] ) elif to_use_framework == "tf": - assert ( - utils.is_tf_available() - ), f"TensorFlow is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'tf'" + assert utils.is_tf_available(), f"TensorFlow is not available, while {ModelEnv.framework} is set to 'tf'" candidates = ( "tensorflow", "tensorflow-cpu", @@ -137,11 +135,12 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> 
PythonOptions: return PythonOptions(packages=packages, wheels=wheels, lock_packages=True) -def construct_docker_options(llm: openllm.LLM, llm_fs: FS) -> DockerOptions: +def construct_docker_options(llm: openllm.LLM, _: FS) -> DockerOptions: + ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__) return DockerOptions( cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version env={ - utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__): utils.get_framework_env(llm.__openllm_start_name__), + ModelEnv.framework: ModelEnv.get_framework_env(), "OPENLLM_MODEL": llm.config.__openllm_model_name__, }, system_packages=["git"], @@ -165,14 +164,16 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be current_model_envvar = os.environ.pop("OPENLLM_MODEL", None) _previously_built = False - logger.debug("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs) + ModelEnv = openllm.utils.ModelEnv(model_name) + + logger.info("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs) # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path try: os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name) - to_use_framework = openllm.utils.get_framework_env(model_name) + to_use_framework = ModelEnv.get_framework_env() if to_use_framework == "flax": llm = openllm.AutoFlaxLLM.for_model(model_name, **attrs) elif to_use_framework == "tf": diff --git a/src/openllm/_service.py b/src/openllm/_service.py index 8385cd90..8979facf 100644 --- a/src/openllm/_service.py +++ b/src/openllm/_service.py @@ -30,7 +30,7 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r route="/v1/generate", ) async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput: - config = llm_config.with_options(__llm_config__=qa.llm_config).model_dump() + config = llm_config.model_construct_env(__llm_config__=qa.llm_config).model_dump() responses = await runner.generate.async_run(qa.prompt, **config) return openllm.GenerationOutput(responses=responses, configuration=config) @@ -39,5 +39,5 @@ async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput: def metadata_v1(_: str) -> dict[str, t.Any]: return { "model_name": llm_config.__openllm_model_name__, - "framework": openllm.utils.get_framework_env(llm_config.__openllm_model_name__), + "framework": llm_config.__openllm_env__.get_framework_env(), } diff --git a/src/openllm/cli.py b/src/openllm/cli.py index 7241dfba..ded1313a 100644 --- a/src/openllm/cli.py +++ b/src/openllm/cli.py @@ -314,7 +314,7 @@ def start_model_command( Note that the internal commands will return the llm_config and a boolean determine whether the server is run with GPU or not. 
""" - envvar = openllm.utils.get_framework_env(model_name) + ModelEnv = openllm.utils.ModelEnv(model_name) model_command_decr: dict[str, t.Any] = { "name": inflection.underscore(model_name), "context_settings": _context_settings or {}, @@ -330,16 +330,15 @@ def start_model_command( { "name": config.__openllm_model_name__, "short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)", - "help": getattr( - openllm.utils.get_lazy_module(model_name), - f"START_{inflection.underscore(model_name).upper()}_COMMAND_DOCSTRING", - ), + "help": ModelEnv.start_docstring, "aliases": aliases if len(aliases) > 0 else None, } ) + gpu_available = False try: - config.check_if_gpu_is_available(envvar) + config.check_if_gpu_is_available(ModelEnv.get_framework_env()) + gpu_available = True except openllm.exceptions.GpuNotAvailableError: # NOTE: The model requires GPU, therefore we will return a dummy command model_command_decr.update( @@ -353,7 +352,7 @@ def start_model_command( @factory.command(**model_command_decr) def noop() -> openllm.LLMConfig: click.secho("No GPU available, therefore this command is disabled", fg="red") - openllm.utils.analytics.track_start_init(config, False) + openllm.utils.analytics.track_start_init(config, gpu_available) return config return noop @@ -371,15 +370,24 @@ def start_model_command( configure_logging() - updated_config, server_kwds = config.model_validate_click(**attrs) - openllm.utils.analytics.track_start_init(updated_config, False) + updated_config, server_attrs = config.model_validate_click(**attrs) - server_kwds.update({"working_dir": os.path.dirname(__file__)}) + # NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still + # run this model on GPU + try: + updated_config.check_if_gpu_is_available(ModelEnv.get_framework_env()) + gpu_available = True + except openllm.exceptions.GpuNotAvailableError: + gpu_available = False + + openllm.utils.analytics.track_start_init(updated_config, gpu_available) + + server_attrs.update({"working_dir": os.path.dirname(__file__)}) if _serve_grpc: - server_kwds["grpc_protocol_version"] = "v1" + server_attrs["grpc_protocol_version"] = "v1" # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream. 
- development = server_kwds.pop("development") - server_kwds.setdefault("production", not development) + development = server_attrs.pop("development") + server_attrs.setdefault("production", not development) start_env = os.environ.copy() @@ -395,17 +403,17 @@ def start_model_command( start_env.update( { - openllm.utils.FRAMEWORK_ENV_VAR(model_name): envvar, - openllm.utils.MODEL_CONFIG_ENV_VAR(model_name): updated_config.model_dump_json(), + ModelEnv.framework: ModelEnv.get_framework_env(), + ModelEnv.model_config: updated_config.model_dump_json(), "OPENLLM_MODEL": model_name, "BENTOML_DEBUG": str(get_debug_mode()), "BENTOML_CONFIG_OPTIONS": _bentoml_config_options, } ) - if envvar == "flax": + if ModelEnv.get_framework_env() == "flax": llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained) - elif envvar == "tf": + elif ModelEnv.get_framework_env() == "tf": llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained) else: llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained) @@ -416,7 +424,7 @@ def start_model_command( ) click.secho(f"Starting LLM Server for '{model_name}'\n", fg="blue") server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer") - server: bentoml.server.Server = server_cls("_service.py:svc", **server_kwds) + server: bentoml.server.Server = server_cls("_service.py:svc", **server_attrs) server.timeout = 90 try: @@ -449,8 +457,10 @@ def _start( _serve_grpc = attrs.pop("_serve_grpc", False) + ModelEnv = openllm.utils.ModelEnv(model_name) + if framework is not None: - os.environ[openllm.utils.FRAMEWORK_ENV_VAR(model_name)] = framework + os.environ[ModelEnv.framework] = framework start_model_command(model_name, t.cast(OpenLLMCommandGroup, cli), _serve_grpc=_serve_grpc)( standalone_mode=False, **attrs ) @@ -585,9 +595,12 @@ def list_supported_models(output: t.Literal["json", "pretty", "porcelain"]): except Exception as err: failed_initialized.append((m, err)) _console.print(table) - _console.print("\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n") - for m, err in failed_initialized: - _console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red")) + if len(failed_initialized) > 0: + _console.print( + "\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n" + ) + for m, err in failed_initialized: + _console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red")) elif output == "json": result_json: dict[str, dict[t.Literal["variants", "description"], t.Any]] = {} for m in models: diff --git a/src/openllm/models/auto/configuration_auto.py b/src/openllm/models/auto/configuration_auto.py index 9405a1c3..f7861af0 100644 --- a/src/openllm/models/auto/configuration_auto.py +++ b/src/openllm/models/auto/configuration_auto.py @@ -53,7 +53,7 @@ class _LazyConfigMapping(ConfigOrderedDict): value = self._mapping[key] module_name = inflection.underscore(key) if module_name not in self._modules: - self._modules[module_name] = openllm.utils.get_lazy_module(module_name) + self._modules[module_name] = openllm.utils.ModelEnv(module_name).module if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value) @@ -93,10 +93,10 @@ class AutoConfig: raise EnvironmentError("Cannot instantiate Config. 
Please use `Config.for_model(model_name)` instead.") @classmethod - def for_model(cls, model_name: str, *args: t.Any, **attrs: t.Any) -> openllm.LLMConfig: + def for_model(cls, model_name: str, **attrs: t.Any) -> openllm.LLMConfig: model_name = inflection.underscore(model_name) if model_name in CONFIG_MAPPING: - return CONFIG_MAPPING[model_name]().with_options(*args, **attrs) + return CONFIG_MAPPING[model_name].model_construct_env(**attrs) raise ValueError( f"Unrecognized configuration class for {model_name}. " f"Model name should be one of {', '.join(CONFIG_MAPPING.keys())}." diff --git a/src/openllm/models/auto/factory.py b/src/openllm/models/auto/factory.py index 59481337..40a575cd 100644 --- a/src/openllm/models/auto/factory.py +++ b/src/openllm/models/auto/factory.py @@ -190,7 +190,7 @@ class _LazyAutoMapping(ConfigModelOrderedDict): def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any: module_name = inflection.underscore(model_type) if module_name not in self._modules: - self._modules[module_name] = openllm.utils.get_lazy_module(module_name) + self._modules[module_name] = openllm.utils.ModelEnv(module_name).module return getattribute_from_module(self._modules[module_name], attr) def keys(self): diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py index bb48e312..38d8d9b9 100644 --- a/src/openllm/models/chatglm/modeling_chatglm.py +++ b/src/openllm/models/chatglm/modeling_chatglm.py @@ -81,7 +81,7 @@ class ChatGLM(openllm.LLM): else: prompt_text = prompt - generation_config = self.config.with_options( + generation_config = self.config.model_construct_env( max_new_tokens=max_new_tokens, num_beams=num_beams, top_p=top_p, @@ -123,7 +123,7 @@ class ChatGLM(openllm.LLM): inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device) outputs = self.model.generate( **inputs, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, num_beams=num_beams, top_p=top_p, diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py index 652847a5..5392464f 100644 --- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -79,7 +79,7 @@ class DollyV2(openllm.LLM): ) -> tuple[str, dict[str, t.Any]]: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt) - generation_config = self.config.with_options( + generation_config = self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, @@ -112,7 +112,7 @@ class DollyV2(openllm.LLM): end_key_token_id = None eos_token_id = None - llm_config = self.config.with_options( + llm_config = self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, diff --git a/src/openllm/models/falcon/modeling_falcon.py b/src/openllm/models/falcon/modeling_falcon.py index 7edfa34d..9a544790 100644 --- a/src/openllm/models/falcon/modeling_falcon.py +++ b/src/openllm/models/falcon/modeling_falcon.py @@ -66,7 +66,7 @@ class Falcon(openllm.LLM): eos_token_id: int | None = None, **attrs: t.Any, ) -> tuple[str, dict[str, t.Any]]: - generation_config = self.config.with_options( + generation_config = self.config.model_construct_env( max_new_tokens=max_new_tokens, top_k=top_k, num_return_sequences=num_return_sequences, @@ -95,7 +95,7 @@ class Falcon(openllm.LLM): return self.model( prompt, do_sample=True, - 
generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, top_k=top_k, num_return_sequences=num_return_sequences, diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py index 40c90477..fc578168 100644 --- a/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -48,7 +48,7 @@ class FlanT5(openllm.LLM): repetition_penalty: float | None = None, **attrs: t.Any, ) -> tuple[str, dict[str, t.Any]]: - return prompt, self.config.with_options( + return prompt, self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, @@ -75,7 +75,7 @@ class FlanT5(openllm.LLM): result_tensor = self.model.generate( input_ids, do_sample=True, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index 4253b84f..88d8f7de 100644 --- a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -41,7 +41,7 @@ class FlaxFlanT5(openllm.LLM): repetition_penalty: float | None = None, **attrs: t.Any, ) -> tuple[str, dict[str, t.Any]]: - return prompt, self.config.with_options( + return prompt, self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, @@ -67,7 +67,7 @@ class FlaxFlanT5(openllm.LLM): result_tensor = self.model.generate( input_ids, do_sample=True, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index 9dac1ad0..9ead74ba 100644 --- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -41,7 +41,7 @@ class TFFlanT5(openllm.LLM): repetition_penalty: float | None = None, **attrs: t.Any, ) -> tuple[str, dict[str, t.Any]]: - return prompt, self.config.with_options( + return prompt, self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, @@ -67,7 +67,7 @@ class TFFlanT5(openllm.LLM): outputs = self.model.generate( input_ids, do_sample=True, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py index 9db4e6bc..2f3626ea 100644 --- a/src/openllm/models/starcoder/modeling_starcoder.py +++ b/src/openllm/models/starcoder/modeling_starcoder.py @@ -116,7 +116,7 @@ class StarCoder(openllm.LLM): raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}" - return prompt, self.config.with_options( + return prompt, self.config.model_construct_env( top_p=top_p, temperature=temperature, max_new_tokens=max_new_tokens, @@ -154,7 +154,7 @@ class StarCoder(openllm.LLM): result_tensor = self.model.generate( inputs, do_sample=True, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( top_p=top_p, temperature=temperature, 
max_new_tokens=max_new_tokens, diff --git a/src/openllm/utils/__init__.py b/src/openllm/utils/__init__.py index d7752322..ea32d0b1 100644 --- a/src/openllm/utils/__init__.py +++ b/src/openllm/utils/__init__.py @@ -26,6 +26,7 @@ import re import types import typing as t +import attrs import bentoml import inflection from bentoml._internal.types import LazyType as LazyType @@ -56,27 +57,37 @@ else: logger = logging.getLogger(__name__) -_object_setattr = object.__setattr__ - def get_lazy_module(model_name: str) -> LazyLoader: snaked_model_name = inflection.underscore(model_name) return LazyLoader(snaked_model_name, globals(), f"openllm.models.{snaked_model_name}") -def FRAMEWORK_ENV_VAR(model_name: str) -> str: - return f"OPENLLM_{inflection.underscore(model_name).upper()}_FRAMEWORK" +@attrs.define +class ModelEnv: + model_name: str = attrs.field(converter=inflection.underscore) + @property + def framework(self) -> str: + return f"OPENLLM_{self.model_name.upper()}_FRAMEWORK" -def MODEL_CONFIG_ENV_VAR(model_name: str) -> str: - return f"OPENLLM_{inflection.underscore(model_name).upper()}_CONFIG" + @property + def model_config(self) -> str: + return f"OPENLLM_{self.model_name.upper()}_CONFIG" + @property + def start_docstring(self) -> str: + return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING") -def get_framework_env(model_name: str) -> t.Literal["pt", "flax", "tf"]: - envvar = os.environ.get(FRAMEWORK_ENV_VAR(model_name), "pt") - if envvar not in ("pt", "tf", "flax"): - raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'") - return envvar + @property + def module(self) -> LazyLoader: + return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}") + + def get_framework_env(self) -> t.Literal["pt", "flax", "tf"]: + envvar = os.environ.get(self.framework, "pt") + if envvar not in ("pt", "tf", "flax"): + raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'") + return envvar def convert_transformers_model_name(name: str) -> str: diff --git a/src/openllm_client/_prompt.py b/src/openllm_client/_prompt.py index 41b7afbc..082d9702 100644 --- a/src/openllm_client/_prompt.py +++ b/src/openllm_client/_prompt.py @@ -79,7 +79,7 @@ class PromptTemplate: @classmethod def from_default(cls, model: str) -> PromptTemplate: - template = getattr(openllm.utils.get_lazy_module(model), "DEFAULT_PROMPT_TEMPLATE") + template = getattr(openllm.utils.ModelEnv(model).module, "DEFAULT_PROMPT_TEMPLATE") if template is None: raise ValueError(f"Model {model} does not have a default prompt template.") return cls.from_template(template) diff --git a/src/openllm_client/runtimes/base.py b/src/openllm_client/runtimes/base.py index 389065a1..31d6f8b4 100644 --- a/src/openllm_client/runtimes/base.py +++ b/src/openllm_client/runtimes/base.py @@ -109,7 +109,7 @@ class BaseClient(ClientMixin): def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str: return_raw_response = attrs.pop("return_raw_response", False) prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs) - inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs)) + inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs)) r = openllm.GenerationOutput(**self.call("generate", inputs)) if return_raw_response: @@ -132,7 +132,7 @@ class BaseAsyncClient(ClientMixin): async def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str: 
return_raw_response = attrs.pop("return_raw_response", False) prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs) - inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs)) + inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs)) res = await self.acall("generate", inputs) r = openllm.GenerationOutput(**res)
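
The recurring change in this patch is the new ModelEnv helper in openllm.utils, which folds the old FRAMEWORK_ENV_VAR, MODEL_CONFIG_ENV_VAR, get_framework_env and get_lazy_module free functions into one attrs class. A minimal usage sketch follows; the model name "flan-t5" is only an illustrative assumption, not something this patch pins down.

    import os
    import openllm

    env = openllm.utils.ModelEnv("flan-t5")  # name is underscored by the attrs converter -> "flan_t5"

    env.framework      # "OPENLLM_FLAN_T5_FRAMEWORK"
    env.model_config   # "OPENLLM_FLAN_T5_CONFIG"
    env.module         # lazy handle to openllm.models.flan_t5

    os.environ[env.framework] = "tf"
    env.get_framework_env()  # "tf"; falls back to "pt" when the variable is unset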
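
Downstream, Runner and start_model_command read the same variable to pick the implementation. A short sketch, assuming a Flax implementation exists for the model (as modeling_flax_flan_t5.py suggests for flan-t5):

    import os
    import openllm

    # With the framework variable set, Runner dispatches to the Flax auto class.
    os.environ["OPENLLM_FLAN_T5_FRAMEWORK"] = "flax"
    runner = openllm.Runner("flan-t5")  # resolves to openllm.AutoFlaxLLM.create_runner(...)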
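
with_options and from_env are folded into the single classmethod model_construct_env. As far as this diff shows, values resolve roughly in this order: the per-model *_CONFIG JSON blob if present, then per-field environment variables, then explicit keyword arguments. A hedged sketch; the generation variable name assumes __openllm_env_name__ resolves to FLAN_T5:

    import os
    import openllm

    # Per-field generation defaults can come from the environment...
    os.environ["OPENLLM_FLAN_T5_GENERATION_TEMPERATURE"] = "0.75"

    # ...while explicit keyword arguments passed through model_construct_env
    # are applied last, mirroring how the model implementations call it.
    config = openllm.AutoConfig.for_model("flan-t5", max_new_tokens=256)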
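
Finally, the start command serializes the resolved configuration into the per-model *_CONFIG variable before launching the server, and _service.py reads it back through model_construct_env. A rough sketch of that handoff, under the same naming assumptions:

    import os
    import openllm

    config = openllm.AutoConfig.for_model("flan-t5", temperature=0.9)

    # CLI side: what start_model_command places into the subprocess environment.
    os.environ["OPENLLM_FLAN_T5_CONFIG"] = config.model_dump_json()

    # Service side: model_construct_env rebuilds the config from that JSON blob.
    restored = openllm.AutoConfig.for_model("flan-t5")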