perf: reduce unnecessary object creation for config class

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron
2023-05-28 05:22:22 -07:00
parent 3fb1e5338a
commit 0df8d8b9a6
17 changed files with 187 additions and 191 deletions

View File

@@ -38,7 +38,6 @@ class FlanT5Config(openllm.LLMConfig):
"""
from __future__ import annotations
import copy
import os
import types
import typing as t
@@ -55,7 +54,7 @@ from click_option_group import optgroup
import openllm
from .exceptions import GpuNotAvailableError, OpenLLMException
from .utils import _object_setattr
from .utils import LazyType
from .utils.dantic import allows_multiple, parse_default
if t.TYPE_CHECKING:
@@ -70,15 +69,20 @@ if t.TYPE_CHECKING:
import transformers
from pydantic.fields import FieldInfo
from transformers.generation.beam_constraints import Constraint
DictStrAny = dict[str, t.Any]
else:
from transformers.utils.dummy_pt_objects import Constraint
DictStrAny = dict
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
__all__ = ["LLMConfig", "ModelSignature"]
_object_setattr = object.__setattr__
def field_to_options(
name: str, field: FieldInfo, model_name: str, suffix_generation: bool = False
@@ -87,6 +91,7 @@ def field_to_options(
envvar = field.json_schema_extra.get("env") if field.json_schema_extra else None
dasherized = inflection.dasherize(name)
underscored = inflection.underscore(name)
full_option_name = f"--{dasherized}"
if field.annotation is bool:
full_option_name += f"/--no-{dasherized}"
@@ -101,7 +106,7 @@ def field_to_options(
type=field.annotation,
required=field.is_required(),
default=parse_default(field.default, field.annotation),
show_default=False,
show_default=True if field.default else False,
multiple=allows_multiple(field.annotation),
help=field.description,
show_envvar=True if envvar else False,
@@ -109,14 +114,13 @@ def field_to_options(
)
def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, str]:
kwargs: dict[str, t.Any] = {}
for key, field in model.model_fields.items():
if field.json_schema_extra is not None:
if "env" not in field.json_schema_extra:
raise RuntimeError(f"Invalid {model} passed. Only accept LLMConfig or LLMConfig.generation_config")
kwargs[key] = os.environ.get(field.json_schema_extra["env"], field.default)
return {k: v for k, v in kwargs.items() if v is not None}
def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, t.Any]:
# NOTE: We can safely cast here since all of the fields in GenerationConfig or LLMConfig
# will have an `env` key in `json_schema_extra`
return {
key: os.environ.get(t.cast("dict[str, t.Any]", field.json_schema_extra)["env"], field.default)
for key, field in model.model_fields.items()
}
class GenerationConfig(pydantic.BaseModel):
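A quick sketch of how the simplified generate_kwargs_from_envvar above is meant to behave, assuming it is called from within the same module and that the FlanT5 generation config has already been stamped with OPENLLM_FLAN_T5_GENERATION_* variables during __pydantic_init_subclass__ (the model, field, and variable names here are illustrative):

import os
import openllm

os.environ["OPENLLM_FLAN_T5_GENERATION_TEMPERATURE"] = "0.75"
cfg = openllm.AutoConfig.for_model("flan-t5")
overrides = generate_kwargs_from_envvar(cfg.generation_config)
# Values read from the environment come back as raw strings; fields without a
# matching environment variable fall back to field.default (possibly None).
assert overrides["temperature"] == "0.75"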
@@ -136,7 +140,7 @@ class GenerationConfig(pydantic.BaseModel):
description="""The minimum length of the sequence to be generated. Corresponds to the length of the
input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.""",
)
min_new_tokens: t.Optional[int] = pydantic.Field(
min_new_tokens: int = pydantic.Field(
None, description="The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt."
)
early_stopping: bool = pydantic.Field(
@@ -150,7 +154,7 @@ class GenerationConfig(pydantic.BaseModel):
(canonical beam search algorithm)
""",
)
max_time: t.Optional[float] = pydantic.Field(
max_time: float = pydantic.Field(
None,
description="""The maximum amount of time you allow the computation to run for in seconds. generation will
still finish the current pass after allocated time has been passed.""",
@@ -163,7 +167,7 @@ class GenerationConfig(pydantic.BaseModel):
description="""Number of groups to divide `num_beams` into in order to ensure diversity among different
groups of beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.""",
)
penalty_alpha: t.Optional[float] = pydantic.Field(
penalty_alpha: float = pydantic.Field(
None,
description="""The values balance the model confidence and the degeneration penalty in
contrastive search decoding.""",
@@ -242,14 +246,15 @@ class GenerationConfig(pydantic.BaseModel):
no_repeat_ngram_size: int = pydantic.Field(
0, description="If set to int > 0, all ngrams of that size can only occur once."
)
bad_words_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field(
bad_words_ids: t.List[t.List[int]] = pydantic.Field(
None,
description="""List of token ids that are not allowed to be generated. In order to get the token ids
of the words that should not appear in the generated text, use
`tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`.
""",
)
force_words_ids: t.Optional[t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]]] = pydantic.Field(
# NOTE: t.Union is not yet supported on CLI, but the environment variable should already be available.
force_words_ids: t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]] = pydantic.Field(
None,
description="""List of token ids that must be generated. If given a `List[List[int]]`, this is treated
as a simple list of words that must be included, the opposite to `bad_words_ids`.
@@ -265,13 +270,13 @@ class GenerationConfig(pydantic.BaseModel):
algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization.
""",
)
constraints: t.Optional[t.List["Constraint"]] = pydantic.Field(
constraints: t.List["Constraint"] = pydantic.Field(
None,
description="""Custom constraints that can be added to the generation to ensure that the output
will contain the use of certain tokens as defined by ``Constraint`` objects, in the most sensible way possible.
""",
)
forced_bos_token_id: t.Optional[int] = pydantic.Field(
forced_bos_token_id: int = pydantic.Field(
None,
description="""The id of the token to force as the first generated token after the
``decoder_start_token_id``. Useful for multilingual models like
@@ -279,7 +284,7 @@ class GenerationConfig(pydantic.BaseModel):
to be the target language token.
""",
)
forced_eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field(
forced_eos_token_id: t.Union[int, t.List[int]] = pydantic.Field(
None,
description="""The id of the token to force as the last generated token when `max_length` is reached.
Optionally, use a list to set multiple *end-of-sequence* tokens.""",
@@ -289,26 +294,26 @@ class GenerationConfig(pydantic.BaseModel):
description="""Whether to remove possible *nan* and *inf* outputs of the model to prevent the
generation method to crash. Note that using `remove_invalid_values` can slow down generation.""",
)
exponential_decay_length_penalty: t.Optional[t.Tuple[int, float]] = pydantic.Field(
exponential_decay_length_penalty: t.Tuple[int, float] = pydantic.Field(
None,
description="""This tuple adds an exponentially increasing length penalty, after a certain amount of tokens
have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index`
indicates where penalty starts and `decay_factor` represents the factor of exponential decay
""",
)
suppress_tokens: t.Optional[t.List[int]] = pydantic.Field(
suppress_tokens: t.List[int] = pydantic.Field(
None,
description="""A list of tokens that will be suppressed at generation. The `SupressTokens` logit
processor will set their log probs to `-inf` so that they are not sampled.
""",
)
begin_suppress_tokens: t.Optional[t.List[int]] = pydantic.Field(
begin_suppress_tokens: t.List[int] = pydantic.Field(
None,
description="""A list of tokens that will be suppressed at the beginning of the generation. The
`SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled.
""",
)
forced_decoder_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field(
forced_decoder_ids: t.List[t.List[int]] = pydantic.Field(
None,
description="""A list of pairs of integers which indicates a mapping from generation indices to token indices
that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always
@@ -338,9 +343,9 @@ class GenerationConfig(pydantic.BaseModel):
)
# NOTE: Special tokens that can be used at generation time
pad_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *padding* token.")
bos_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.")
eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field(
pad_token_id: int = pydantic.Field(None, description="The id of the *padding* token.")
bos_token_id: int = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.")
eos_token_id: t.Union[int, t.List[int]] = pydantic.Field(
None,
description="""The id of the *end-of-sequence* token. Optionally, use a list to set
multiple *end-of-sequence* tokens.""",
@@ -353,7 +358,7 @@ class GenerationConfig(pydantic.BaseModel):
`encoder_input_ids` cannot occur in the `decoder_input_ids`.
""",
)
decoder_start_token_id: t.Optional[int] = pydantic.Field(
decoder_start_token_id: int = pydantic.Field(
None,
description="""If an encoder-decoder model starts decoding with a
different token than *bos*, the id of that token.
@@ -361,7 +366,7 @@ class GenerationConfig(pydantic.BaseModel):
)
# NOTE: pydantic definition
model_config = dict(arbitrary_types_allowed=True, extra="forbid")
model_config = {"extra": "forbid", "arbitrary_types_allowed": True}
if t.TYPE_CHECKING:
# The following is handled via __pydantic_init_subclass__
@@ -395,28 +400,11 @@ class GenerationConfig(pydantic.BaseModel):
# NOTE: I don't know how to do this more efficiently in pydantic v2 yet, will probably
# need to consult the pydantic team on this.
for key, field in self.model_fields.items():
json_schema: dict[str, t.Any] = (
copy.deepcopy(field.json_schema_extra) if field.json_schema_extra is not None else {}
)
env_key = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}"
if "env" in json_schema:
field.default = os.environ.get(json_schema["env"], field.default)
if not field.json_schema_extra:
field.json_schema_extra = {}
if "env" in field.json_schema_extra:
continue
json_schema["env"] = env_key
# then assign json_schema back to field
field.json_schema_extra = json_schema
field.default = os.environ.get(env_key, field.default)
def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:
for name, field in self.model_fields.items():
if t.get_origin(field.annotation) is t.Union:
# NOTE: Union type is currently not yet supported, we probably just need to use environment instead.
continue
f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f)
return optgroup.group(
f"{self.__class__.__name__} generation options",
help=f"[Auto-generated from '{self.__class__.__qualname__}']",
)(f)
field.json_schema_extra["env"] = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}"
class LLMConfig(pydantic.BaseModel, ABC):
@@ -428,23 +416,15 @@ class LLMConfig(pydantic.BaseModel, ABC):
return getattr(self.generation_config, attr)
return getattr(self, attr)
def __repr_args__(self) -> ReprArgs:
"""Overwrite from default BaseModel and don't show __pydantic_extra__."""
yield from (
(k, v)
for k, v in self.__dict__.items()
if not k.startswith("_") and (k not in self.model_fields or self.model_fields[k].repr)
)
yield from ((k, getattr(self, k)) for k, v in self.model_computed_fields.items() if v.repr)
if t.TYPE_CHECKING:
# The following is handled via __pydantic_init_subclass__, and is only used for TYPE_CHECKING
__openllm_model_name__: str = ""
__openllm_start_name__: str = ""
__openllm_timeout__: int = 0
__openllm_model_name__: str
__openllm_start_name__: str
__openllm_timeout__: int = 3600
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize"
__openllm_trust_remote_code__: bool = False
__openllm_requires_gpu__: bool = False
__openllm_env__: openllm.utils.ModelEnv
GenerationConfig: type[t.Any] = GenerationConfig
def __init_subclass__(
@@ -488,6 +468,8 @@ class LLMConfig(pydantic.BaseModel, ABC):
cls.__openllm_model_name__ = cls.__name__.replace("Config", "").lower()
cls.__openllm_start_name__ = cls.__openllm_model_name__
cls.__openllm_env__ = openllm.utils.ModelEnv(cls.__openllm_model_name__)
if hasattr(cls, "GenerationConfig"):
cls.generation_config = t.cast(
"type[GenerationConfig]",
@@ -508,12 +490,10 @@ class LLMConfig(pydantic.BaseModel, ABC):
def model_post_init(self, _: t.Any):
if self.__pydantic_extra__:
generation_config = self.__pydantic_extra__.pop("generation_config", None)
generation_config: dict[str, t.Any] | None = self.__pydantic_extra__.pop("generation_config", None)
if generation_config is not None:
assert isinstance(generation_config, dict), "generation_config must be a dict."
self.generation_config = self.generation_config.model_copy(
update=t.cast("dict[str, t.Any]", generation_config), deep=True
)
assert LazyType[DictStrAny](dict).isinstance(generation_config), "generation_config must be a dict."
self.generation_config = self.generation_config.model_copy(update=generation_config, deep=True)
else:
# The rest of the extras fields should just be the generation_config.
self.generation_config = self.generation_config.model_copy(update=self.__pydantic_extra__, deep=True)
@@ -551,64 +531,48 @@ class LLMConfig(pydantic.BaseModel, ABC):
except pydantic.ValidationError as e:
raise openllm.exceptions.ValidationError(f"Failed to dump configuration to dict: {e}") from e
def with_options(self, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig:
@classmethod
def model_construct_env(cls, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig:
"""A helper that respects configuration values set
from environment variables for any given configuration class.
"""
from_env_ = self.from_env()
# filtered out None values
attrs = {k: v for k, v in attrs.items() if v is not None}
generation_keys = {k for k in attrs if k in self.generation_config.model_fields}
generation_attrs = {k: v for k, v in attrs.items() if k in generation_keys}
config_attrs = {k: v for k, v in attrs.items() if k not in generation_keys}
# NOTE: first set the default config kwargs.
# We will always respect envvar as default, then the one that is pass
attrs = {**generate_kwargs_from_envvar(self), **config_attrs}
env_json_string = os.environ.get(cls.__openllm_env__.model_config, None)
if env_json_string is not None:
try:
self = cls.model_construct(**orjson.loads(env_json_string))
except pydantic.ValidationError as e:
raise RuntimeError(f"Failed to parse '{cls.__openllm_env__.model_config}' as valid JSON string.") from e
else:
self = cls.model_construct()
if __llm_config__ is not None:
# NOTE: Only hit this branch on the server. Client shouldn't use __llm_config__
attrs = {**attrs, **__llm_config__.model_dump()}
# as it is not set.
return self.model_construct(**__llm_config__.model_dump(flatten=True))
# NOTE: Then we setup generation config values
attrs["generation_config"] = {
**generate_kwargs_from_envvar(self.generation_config),
**attrs.get("generation_config", {}),
**generation_attrs,
}
# filtered out None values
attrs = {k: v for k, v in attrs.items() if v is not None}
if from_env_:
return from_env_.model_construct(**attrs)
return self.model_construct(**attrs)
construct_attrs = generate_kwargs_from_envvar(self)
construct_attrs.update(generate_kwargs_from_envvar(self.generation_config))
construct_attrs.update(attrs)
@classmethod
def from_env(cls) -> LLMConfig | None:
envvar = openllm.utils.MODEL_CONFIG_ENV_VAR(cls.__openllm_model_name__)
env_json_string = os.environ.get(envvar, None)
if env_json_string is None:
return
try:
return cls.model_construct(**orjson.loads(env_json_string))
except pydantic.ValidationError as e:
raise RuntimeError(f"Failed to parse environment variable '{envvar}' as a valid JSON string.") from e
return self.model_construct(**construct_attrs)
def model_validate_click(self, **attrs: t.Any) -> tuple[LLMConfig, dict[str, t.Any]]:
"""Parse given click attributes into a LLMConfig and return the remaining click attributes."""
llm_config_attrs = {
k[len(self.__openllm_model_name__) + 1 :]: v
for k, v in attrs.items()
if k[len(self.__openllm_model_name__) + 1 :] in self.model_fields
}
llm_config_attrs["generation_config"] = {
k[len(self.__openllm_model_name__ + "_generation") + 1 :]: v
for k, v in attrs.items()
if k[len(self.__openllm_model_name__ + "_generation") + 1 :] in self.generation_config.model_fields
}
return self.with_options(**llm_config_attrs), {
k: v for k, v in attrs.items() if not k.startswith(self.__openllm_model_name__)
}
llm_config_attrs = {}
key_to_remove: list[str] = []
for k, v in attrs.items():
# NOTE: check the longer generation prefix first so it is not swallowed by the plain model-name prefix
if k.startswith(f"{self.__openllm_model_name__}_generation_"):
llm_config_attrs[k[len(self.__openllm_model_name__ + "_generation") + 1 :]] = v
key_to_remove.append(k)
elif k.startswith(f"{self.__openllm_model_name__}_"):
llm_config_attrs[k[len(self.__openllm_model_name__) + 1 :]] = v
key_to_remove.append(k)
return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove}
@t.overload
def to_generation_config(self, return_as_dict: t.Literal[True] = ...) -> dict[str, t.Any]:
@@ -627,17 +591,25 @@ class LLMConfig(pydantic.BaseModel, ABC):
return config
def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:
def to_click_options(self, f: F[P]) -> t.Callable[[F[P]], click.Command]:
"""
Convert current model to click options. This can be used as a decorator for click commands.
Note that the identifier for all LLMConfig will be prefixed with '<model_name>_*', and the generation config
will be prefixed with '<model_name>_generation_*'.
"""
wrapped_generation = self.generation_config.to_click_options(f)
for name, field in self.generation_config.model_fields.items():
if t.get_origin(field.annotation) is t.Union:
# NOTE: Union types are not yet supported on the CLI; we probably just need to use environment variables instead.
continue
f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f)
f = optgroup.group(f"{self.__class__.__name__} generation options")(f)
if len(self.model_fields.values()) == 0:
return wrapped_generation
return f
for name, field in self.model_fields.items():
wrapped_generation = field_to_options(name, field, self.__openllm_model_name__)(wrapped_generation)
return optgroup.group(
f"{self.__class__.__name__} options", help=f"[Auto-generated from '{self.__class__.__qualname__}']"
)(wrapped_generation)
if t.get_origin(field.annotation) is t.Union:
# NOTE: Union types are not yet supported on the CLI; we probably just need to use environment variables instead.
continue
f = field_to_options(name, field, self.__openllm_model_name__)(f)
return optgroup.group(f"{self.__class__.__name__} options")(f)
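Putting the pieces together, a hedged sketch of the naming contract shared by to_click_options and model_validate_click, assuming a FlanT5Config instance obtained via AutoConfig (field names and values are illustrative):

import openllm

flan_t5_config = openllm.AutoConfig.for_model("flan-t5")
# to_click_options prefixes identifiers with '<model_name>_' and '<model_name>_generation_';
# model_validate_click strips them back off and hands untouched keys back to the caller.
attrs = {
    "flan_t5_generation_temperature": 0.75,  # routed into generation_config.temperature
    "pretrained": "google/flan-t5-large",    # not prefixed, returned unchanged
}
config, remaining = flan_t5_config.model_validate_click(**attrs)
assert remaining == {"pretrained": "google/flan-t5-large"}
assert config.generation_config.temperature == 0.75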

View File

@@ -440,8 +440,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
# NOTE: The section below defines a loose contract with langchain's LLM interface.
@property
def llm_type(self) -> str:
assert self.default_model is not None
return openllm.utils.convert_transformers_model_name(self.default_model)
return openllm.utils.convert_transformers_model_name(self._pretrained)
@property
def identifying_params(self) -> dict[str, t.Any]:
@@ -637,10 +636,10 @@ def Runner(start_name: str, **attrs: t.Any) -> bentoml.Runner:
behaviour
"""
init_local = attrs.pop("init_local", False)
envvar = openllm.utils.get_framework_env(start_name)
if envvar == "flax":
ModelEnv = openllm.utils.ModelEnv(start_name)
if ModelEnv.get_framework_env() == "flax":
runner = openllm.AutoFlaxLLM.create_runner(start_name, **attrs)
elif envvar == "tf":
elif ModelEnv.get_framework_env() == "tf":
runner = openllm.AutoTFLLM.create_runner(start_name, **attrs)
else:
runner = openllm.AutoLLM.create_runner(start_name, **attrs)
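For reference, a small sketch of how the Runner dispatch above is driven by the per-model framework variable, assuming Runner is exported at the package root and that flan-t5 maps to a ModelEnv named flan_t5 (names and values are illustrative):

import os
import openllm

os.environ["OPENLLM_FLAN_T5_FRAMEWORK"] = "tf"
# ModelEnv("flan-t5").get_framework_env() reads the variable above, so this call
# dispatches to openllm.AutoTFLLM.create_runner instead of the PyTorch default.
runner = openllm.Runner("flan-t5", init_local=False)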

View File

@@ -70,6 +70,8 @@ def build_editable(path: str) -> str | None:
def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
packages: list[str] = []
ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__)
if llm.requirements is not None:
packages.extend(llm.requirements)
@@ -89,11 +91,9 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
]
)
to_use_framework = utils.get_framework_env(llm.__openllm_start_name__)
to_use_framework = ModelEnv.get_framework_env()
if to_use_framework == "flax":
assert (
utils.is_flax_available()
), f"Flax is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'flax'"
assert utils.is_flax_available(), f"Flax is not available, while {ModelEnv.framework} is set to 'flax'"
packages.extend(
[
f"flax>={importlib.metadata.version('flax')}",
@@ -102,9 +102,7 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
]
)
elif to_use_framework == "tf":
assert (
utils.is_tf_available()
), f"TensorFlow is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'tf'"
assert utils.is_tf_available(), f"TensorFlow is not available, while {ModelEnv.framework} is set to 'tf'"
candidates = (
"tensorflow",
"tensorflow-cpu",
@@ -137,11 +135,12 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
return PythonOptions(packages=packages, wheels=wheels, lock_packages=True)
def construct_docker_options(llm: openllm.LLM, llm_fs: FS) -> DockerOptions:
def construct_docker_options(llm: openllm.LLM, _: FS) -> DockerOptions:
ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__)
return DockerOptions(
cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
env={
utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__): utils.get_framework_env(llm.__openllm_start_name__),
ModelEnv.framework: ModelEnv.get_framework_env(),
"OPENLLM_MODEL": llm.config.__openllm_model_name__,
},
system_packages=["git"],
@@ -165,14 +164,16 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
current_model_envvar = os.environ.pop("OPENLLM_MODEL", None)
_previously_built = False
logger.debug("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs)
ModelEnv = openllm.utils.ModelEnv(model_name)
logger.info("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs)
# NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
# during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
try:
os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)
to_use_framework = openllm.utils.get_framework_env(model_name)
to_use_framework = ModelEnv.get_framework_env()
if to_use_framework == "flax":
llm = openllm.AutoFlaxLLM.for_model(model_name, **attrs)
elif to_use_framework == "tf":

View File

@@ -30,7 +30,7 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r
route="/v1/generate",
)
async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
config = llm_config.with_options(__llm_config__=qa.llm_config).model_dump()
config = llm_config.model_construct_env(__llm_config__=qa.llm_config).model_dump()
responses = await runner.generate.async_run(qa.prompt, **config)
return openllm.GenerationOutput(responses=responses, configuration=config)
@@ -39,5 +39,5 @@ async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
def metadata_v1(_: str) -> dict[str, t.Any]:
return {
"model_name": llm_config.__openllm_model_name__,
"framework": openllm.utils.get_framework_env(llm_config.__openllm_model_name__),
"framework": llm_config.__openllm_env__.get_framework_env(),
}

View File

@@ -314,7 +314,7 @@ def start_model_command(
Note that the internal commands will return the llm_config and a boolean determining
whether the server is run with GPU or not.
"""
envvar = openllm.utils.get_framework_env(model_name)
ModelEnv = openllm.utils.ModelEnv(model_name)
model_command_decr: dict[str, t.Any] = {
"name": inflection.underscore(model_name),
"context_settings": _context_settings or {},
@@ -330,16 +330,15 @@ def start_model_command(
{
"name": config.__openllm_model_name__,
"short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
"help": getattr(
openllm.utils.get_lazy_module(model_name),
f"START_{inflection.underscore(model_name).upper()}_COMMAND_DOCSTRING",
),
"help": ModelEnv.start_docstring,
"aliases": aliases if len(aliases) > 0 else None,
}
)
gpu_available = False
try:
config.check_if_gpu_is_available(envvar)
config.check_if_gpu_is_available(ModelEnv.get_framework_env())
gpu_available = True
except openllm.exceptions.GpuNotAvailableError:
# NOTE: The model requires GPU, therefore we will return a dummy command
model_command_decr.update(
@@ -353,7 +352,7 @@ def start_model_command(
@factory.command(**model_command_decr)
def noop() -> openllm.LLMConfig:
click.secho("No GPU available, therefore this command is disabled", fg="red")
openllm.utils.analytics.track_start_init(config, False)
openllm.utils.analytics.track_start_init(config, gpu_available)
return config
return noop
@@ -371,15 +370,24 @@ def start_model_command(
configure_logging()
updated_config, server_kwds = config.model_validate_click(**attrs)
openllm.utils.analytics.track_start_init(updated_config, False)
updated_config, server_attrs = config.model_validate_click(**attrs)
server_kwds.update({"working_dir": os.path.dirname(__file__)})
# NOTE: check for GPU one more time in case this model doesn't require a GPU but users can still
# run this model on GPU
try:
updated_config.check_if_gpu_is_available(ModelEnv.get_framework_env())
gpu_available = True
except openllm.exceptions.GpuNotAvailableError:
gpu_available = False
openllm.utils.analytics.track_start_init(updated_config, gpu_available)
server_attrs.update({"working_dir": os.path.dirname(__file__)})
if _serve_grpc:
server_kwds["grpc_protocol_version"] = "v1"
server_attrs["grpc_protocol_version"] = "v1"
# NOTE: currently, there are no development args in bentoml.Server. To be fixed upstream.
development = server_kwds.pop("development")
server_kwds.setdefault("production", not development)
development = server_attrs.pop("development")
server_attrs.setdefault("production", not development)
start_env = os.environ.copy()
@@ -395,17 +403,17 @@ def start_model_command(
start_env.update(
{
openllm.utils.FRAMEWORK_ENV_VAR(model_name): envvar,
openllm.utils.MODEL_CONFIG_ENV_VAR(model_name): updated_config.model_dump_json(),
ModelEnv.framework: ModelEnv.get_framework_env(),
ModelEnv.model_config: updated_config.model_dump_json(),
"OPENLLM_MODEL": model_name,
"BENTOML_DEBUG": str(get_debug_mode()),
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
}
)
if envvar == "flax":
if ModelEnv.get_framework_env() == "flax":
llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained)
elif envvar == "tf":
elif ModelEnv.get_framework_env() == "tf":
llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained)
else:
llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained)
@@ -416,7 +424,7 @@ def start_model_command(
)
click.secho(f"Starting LLM Server for '{model_name}'\n", fg="blue")
server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
server: bentoml.server.Server = server_cls("_service.py:svc", **server_kwds)
server: bentoml.server.Server = server_cls("_service.py:svc", **server_attrs)
server.timeout = 90
try:
@@ -449,8 +457,10 @@ def _start(
_serve_grpc = attrs.pop("_serve_grpc", False)
ModelEnv = openllm.utils.ModelEnv(model_name)
if framework is not None:
os.environ[openllm.utils.FRAMEWORK_ENV_VAR(model_name)] = framework
os.environ[ModelEnv.framework] = framework
start_model_command(model_name, t.cast(OpenLLMCommandGroup, cli), _serve_grpc=_serve_grpc)(
standalone_mode=False, **attrs
)
@@ -585,9 +595,12 @@ def list_supported_models(output: t.Literal["json", "pretty", "porcelain"]):
except Exception as err:
failed_initialized.append((m, err))
_console.print(table)
_console.print("\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n")
for m, err in failed_initialized:
_console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red"))
if len(failed_initialized) > 0:
_console.print(
"\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n"
)
for m, err in failed_initialized:
_console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red"))
elif output == "json":
result_json: dict[str, dict[t.Literal["variants", "description"], t.Any]] = {}
for m in models:

View File

@@ -53,7 +53,7 @@ class _LazyConfigMapping(ConfigOrderedDict):
value = self._mapping[key]
module_name = inflection.underscore(key)
if module_name not in self._modules:
self._modules[module_name] = openllm.utils.get_lazy_module(module_name)
self._modules[module_name] = openllm.utils.ModelEnv(module_name).module
if hasattr(self._modules[module_name], value):
return getattr(self._modules[module_name], value)
@@ -93,10 +93,10 @@ class AutoConfig:
raise EnvironmentError("Cannot instantiate Config. Please use `Config.for_model(model_name)` instead.")
@classmethod
def for_model(cls, model_name: str, *args: t.Any, **attrs: t.Any) -> openllm.LLMConfig:
def for_model(cls, model_name: str, **attrs: t.Any) -> openllm.LLMConfig:
model_name = inflection.underscore(model_name)
if model_name in CONFIG_MAPPING:
return CONFIG_MAPPING[model_name]().with_options(*args, **attrs)
return CONFIG_MAPPING[model_name].model_construct_env(**attrs)
raise ValueError(
f"Unrecognized configuration class for {model_name}. "
f"Model name should be one of {', '.join(CONFIG_MAPPING.keys())}."

View File

@@ -190,7 +190,7 @@ class _LazyAutoMapping(ConfigModelOrderedDict):
def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
module_name = inflection.underscore(model_type)
if module_name not in self._modules:
self._modules[module_name] = openllm.utils.get_lazy_module(module_name)
self._modules[module_name] = openllm.utils.ModelEnv(module_name).module
return getattribute_from_module(self._modules[module_name], attr)
def keys(self):

View File

@@ -81,7 +81,7 @@ class ChatGLM(openllm.LLM):
else:
prompt_text = prompt
generation_config = self.config.with_options(
generation_config = self.config.model_construct_env(
max_new_tokens=max_new_tokens,
num_beams=num_beams,
top_p=top_p,
@@ -123,7 +123,7 @@ class ChatGLM(openllm.LLM):
inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
outputs = self.model.generate(
**inputs,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
num_beams=num_beams,
top_p=top_p,

View File

@@ -79,7 +79,7 @@ class DollyV2(openllm.LLM):
) -> tuple[str, dict[str, t.Any]]:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)
generation_config = self.config.with_options(
generation_config = self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
@@ -112,7 +112,7 @@ class DollyV2(openllm.LLM):
end_key_token_id = None
eos_token_id = None
llm_config = self.config.with_options(
llm_config = self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,

View File

@@ -66,7 +66,7 @@ class Falcon(openllm.LLM):
eos_token_id: int | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any]]:
generation_config = self.config.with_options(
generation_config = self.config.model_construct_env(
max_new_tokens=max_new_tokens,
top_k=top_k,
num_return_sequences=num_return_sequences,
@@ -95,7 +95,7 @@ class Falcon(openllm.LLM):
return self.model(
prompt,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
top_k=top_k,
num_return_sequences=num_return_sequences,

View File

@@ -48,7 +48,7 @@ class FlanT5(openllm.LLM):
repetition_penalty: float | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any]]:
return prompt, self.config.with_options(
return prompt, self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
@@ -75,7 +75,7 @@ class FlanT5(openllm.LLM):
result_tensor = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,

View File

@@ -41,7 +41,7 @@ class FlaxFlanT5(openllm.LLM):
repetition_penalty: float | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any]]:
return prompt, self.config.with_options(
return prompt, self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
@@ -67,7 +67,7 @@ class FlaxFlanT5(openllm.LLM):
result_tensor = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,

View File

@@ -41,7 +41,7 @@ class TFFlanT5(openllm.LLM):
repetition_penalty: float | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any]]:
return prompt, self.config.with_options(
return prompt, self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
@@ -67,7 +67,7 @@ class TFFlanT5(openllm.LLM):
outputs = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,

View File

@@ -116,7 +116,7 @@ class StarCoder(openllm.LLM):
raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
return prompt, self.config.with_options(
return prompt, self.config.model_construct_env(
top_p=top_p,
temperature=temperature,
max_new_tokens=max_new_tokens,
@@ -154,7 +154,7 @@ class StarCoder(openllm.LLM):
result_tensor = self.model.generate(
inputs,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
top_p=top_p,
temperature=temperature,
max_new_tokens=max_new_tokens,

View File

@@ -26,6 +26,7 @@ import re
import types
import typing as t
import attrs
import bentoml
import inflection
from bentoml._internal.types import LazyType as LazyType
@@ -56,27 +57,37 @@ else:
logger = logging.getLogger(__name__)
_object_setattr = object.__setattr__
def get_lazy_module(model_name: str) -> LazyLoader:
snaked_model_name = inflection.underscore(model_name)
return LazyLoader(snaked_model_name, globals(), f"openllm.models.{snaked_model_name}")
def FRAMEWORK_ENV_VAR(model_name: str) -> str:
return f"OPENLLM_{inflection.underscore(model_name).upper()}_FRAMEWORK"
@attrs.define
class ModelEnv:
model_name: str = attrs.field(converter=inflection.underscore)
@property
def framework(self) -> str:
return f"OPENLLM_{self.model_name.upper()}_FRAMEWORK"
def MODEL_CONFIG_ENV_VAR(model_name: str) -> str:
return f"OPENLLM_{inflection.underscore(model_name).upper()}_CONFIG"
@property
def model_config(self) -> str:
return f"OPENLLM_{self.model_name.upper()}_CONFIG"
@property
def start_docstring(self) -> str:
return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
def get_framework_env(model_name: str) -> t.Literal["pt", "flax", "tf"]:
envvar = os.environ.get(FRAMEWORK_ENV_VAR(model_name), "pt")
if envvar not in ("pt", "tf", "flax"):
raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
return envvar
@property
def module(self) -> LazyLoader:
return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
def get_framework_env(self) -> t.Literal["pt", "flax", "tf"]:
envvar = os.environ.get(self.framework, "pt")
if envvar not in ("pt", "tf", "flax"):
raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
return envvar
def convert_transformers_model_name(name: str) -> str:
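The attrs-based ModelEnv above folds the former free functions (FRAMEWORK_ENV_VAR, MODEL_CONFIG_ENV_VAR, get_framework_env, get_lazy_module) into one object; a brief sketch of the accessors, with an illustrative model name:

import os
from openllm.utils import ModelEnv

env = ModelEnv("flan-t5")      # the attrs converter normalizes the name to "flan_t5"
print(env.framework)           # "OPENLLM_FLAN_T5_FRAMEWORK"
print(env.model_config)        # "OPENLLM_FLAN_T5_CONFIG"

os.environ[env.framework] = "flax"
assert env.get_framework_env() == "flax"  # anything outside {"pt", "tf", "flax"} raises ValueError
# env.module lazily resolves openllm.models.flan_t5, and env.start_docstring reads
# START_FLAN_T5_COMMAND_DOCSTRING from that module.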

View File

@@ -79,7 +79,7 @@ class PromptTemplate:
@classmethod
def from_default(cls, model: str) -> PromptTemplate:
template = getattr(openllm.utils.get_lazy_module(model), "DEFAULT_PROMPT_TEMPLATE")
template = getattr(openllm.utils.ModelEnv(model).module, "DEFAULT_PROMPT_TEMPLATE")
if template is None:
raise ValueError(f"Model {model} does not have a default prompt template.")
return cls.from_template(template)

View File

@@ -109,7 +109,7 @@ class BaseClient(ClientMixin):
def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str:
return_raw_response = attrs.pop("return_raw_response", False)
prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs)
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs))
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs))
r = openllm.GenerationOutput(**self.call("generate", inputs))
if return_raw_response:
@@ -132,7 +132,7 @@ class BaseAsyncClient(ClientMixin):
async def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str:
return_raw_response = attrs.pop("return_raw_response", False)
prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs)
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs))
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs))
res = await self.acall("generate", inputs)
r = openllm.GenerationOutput(**res)
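Finally, a hedged sketch of how a client call now feeds generation overrides through model_construct_env; the HTTPClient class name and server address are assumptions, since the diff only shows the shared BaseClient/BaseAsyncClient mixins:

import openllm

client = openllm.client.HTTPClient("http://localhost:3000")  # hypothetical client class and address
# Extra keyword arguments are folded into llm_config via self.config.model_construct_env(**attrs)
# and shipped to the /v1/generate endpoint as part of GenerationInput.
result = client.query("Explain LLM serving in one sentence.", temperature=0.2, max_new_tokens=64)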