perf: build quantization and better transformer behaviour (#28)

Fixes quantization_config and low_cpu_mem_usage to be available on the PyTorch implementation only. See the changelog for more details on #28.
2026-05-19 05:57:39 -04:00 · 2023-06-17 08:56:14 -04:00
parent 233d4697b5
commit 6f724416c0
23 changed files with 1159 additions and 853 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,22 +28,6 @@ defaults:
  run:
    shell: bash --noprofile --norc -exo pipefail {0}
 jobs:
-  codestyle_check:
-    runs-on: ubuntu-latest
-    if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-      - name: Setup CI
-        uses: ./.github/actions/setup-repo
-      - name: Running changelog check
-        run: hatch run changelog
-      - name: Format and lint check
-        run: hatch run fmt
-      - name: Type check
-        if: ${{ github.event_name == 'pull_request' }}
-        run: git diff --name-only --diff-filter=AM "origin/$GITHUB_BASE_REF" -z -- '*.py{,i}' | xargs -0 --no-run-if-empty hatch run dev:typing
  tests:
    runs-on: ubuntu-latest
    if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@

 ci:
  autoupdate_schedule: weekly
-  skip: [check-models-table-update, check-models-table-update]
+  skip: [check-models-table-update, check-models-table-update, changelog-dry-run]
 exclude: '.*\.(css|js|svg)$'
 repos:
  - repo: https://github.com/charliermarsh/ruff-pre-commit
@@ -51,13 +51,16 @@ repos:
              typings/.*|
              .github/.*
          )$
-  - repo: local
-    hooks:
      - id: check-models-table-update
        name: check if table in README.md is up-to-date
        entry: ./tools/assert-model-table-latest
        language: script
        files: README.md
+      - id: changelog-dry-run
+        name: Running changelog dry-run
+        entry: hatch run changelog
+        language: system
+        files: CHANGELOG.md
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
--- a/changelog.d/27.feature.md
+++ b/changelog.d/27.feature.md
@@ -1,14 +1,22 @@
 Added support for quantization during serving time.
-`openllm start` now support `--quantize 8bit` and `--quantize 4bit`
-`GPTQ` quantization support is on the roadmap and currently
-being worked on.
+
+`openllm start` now supports `--quantize int8` and `--quantize int4`. `GPTQ`
+quantization support is on the roadmap and currently being worked on.
+
 `openllm start` now also support `--bettertransformer` to use
-`BetterTransformer` for serving
-Refactored `openllm.LLMConfig` to be able to use with `__getitem__`
-to acecss the config value: `openllm.DollyV2Config()['requirements']`
-the order being: `__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`
+`BetterTransformer` for serving.
+
+Refactored `openllm.LLMConfig` to be able to use with `__getitem__`:
+`openllm.DollyV2Config()['requirements']`.
+
+The access order being:
+`__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`.
+
 Added `towncrier` workflow to easily generate changelog entries
+
 Added `use_pipeline`, `bettertransformer` flag into ModelSettings
-`LLMConfig` now supported `__dataclass_transform__` protocol to help
-with type-checking
-Changed `openllm download-models` to `openllm download`
+
+`LLMConfig` now supports the `__dataclass_transform__` protocol to help with
+type-checking.
+
+`openllm download-models` now becomes `openllm download`
--- a/changelog.d/28.change.md
+++ b/changelog.d/28.change.md
@@ -0,0 +1,14 @@
+`--quantize` now takes `int8, int4` instead of `8bit, 4bit` to be consistent
+with bitsandbytes concept.
+
+The `openllm` CLI now caches all available model commands, allowing faster startup time.
+
+Fixes `openllm start model-id --debug` to filter out debug log messages from
+`bentoml.Server`.
+
+`--model-id` from `openllm start` now supports choices for easier selection.
+
+Updated `ModelConfig` implementation with `__getitem__` and auto-generated values.
+
+Cleaned up the CLI and improved loading time; `openllm start` should be 'blazingly
+fast'.
--- a/src/openllm/init.py
+++ b/src/openllm/init.py
@@ -25,7 +25,7 @@ deploy, and monitor any LLMs with ease.
 """
 from __future__ import annotations

-import logging as _
+import logging
 import typing as t

 from . import utils as utils
@@ -33,15 +33,11 @@ from .__about__ import __version__ as __version__
 from .exceptions import MissingDependencyError

 if utils.DEBUG:
-    from bentoml._internal.configuration import set_debug_mode, set_quiet_mode
+    utils.set_debug_mode(True)
+    utils.set_quiet_mode(False)

-    set_debug_mode(True)
-    set_quiet_mode(False)
-
-    from bentoml._internal.log import configure_logging
-
-    configure_logging()
-    _.basicConfig(level=_.NOTSET)
+    utils.configure_logging()
+    logging.basicConfig(level=logging.NOTSET)


 _import_structure = {
@@ -147,7 +143,6 @@ if t.TYPE_CHECKING:
    from . import exceptions as exceptions
    from . import models as models
    from . import playground as playground
-
    # Specific types import
    from ._configuration import LLMConfig as LLMConfig
    from ._llm import LLM as LLM
@@ -160,7 +155,8 @@ if t.TYPE_CHECKING:
    from .cli import start as start
    from .cli import start_grpc as start_grpc
    from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING
-    from .models.auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
+    from .models.auto import \
+        MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
    from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
    from .models.auto import MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
    from .models.auto import AutoConfig as AutoConfig
@@ -234,5 +230,11 @@ else:
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
-        extra_objects={"__version__": __version__},
+        extra_objects={
+            "__version__": __version__,
+            # The below is a special mapping that allows openllm to be used as a dictionary.
+            # This is purely for convenience's sake, and should not be used in performance-critical
+            # code. This is also not considered as a public API.
+            "__openllm_special__": {"flax": "AutoFlaxLLM", "tf": "AutoTFLLM", "pt": "AutoLLM"},
+        },
    )
--- a/src/openllm/_configuration.py
+++ b/src/openllm/_configuration.py
@@ -395,16 +395,7 @@ bentoml_cattr.register_unstructure_hook_factory(
 )


-def _populate_value_from_env_var(
-    key: str, transform: t.Callable[[str], str] | None = None, fallback: t.Any = None
-) -> t.Any:
-    if transform is not None and callable(transform):
-        key = transform(key)
-
-    return os.environ.get(key, fallback)
-
-
-def _field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
+def _field_env_key(model_name: str, key: str, suffix: str | t.Literal[""] | None = None) -> str:
    return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))


@@ -425,6 +416,7 @@ class ModelSettings(t.TypedDict, total=False):
    url: str
    requires_gpu: bool
    trust_remote_code: bool
+    service_name: NotRequired[str]
    requirements: t.Optional[ListStr]

    # llm implementation specifics
@@ -448,128 +440,174 @@ class ModelSettings(t.TypedDict, total=False):
    generation_class: t.Type[GenerationConfig]


-_ModelSettings: type[attr.AttrsInstance] = codegen.add_method_dunders(
-    type("__openllm_internal__", (ModelSettings,), {"__module__": "openllm._configuration"}),
-    attr.make_class(
-        "ModelSettings",
-        {
-            k: dantic.Field(
+def _settings_field_transformer(
+    _: type[attr.AttrsInstance], __: list[attr.Attribute[t.Any]]
+) -> list[attr.Attribute[t.Any]]:
+    return [
+        attr.Attribute.from_counting_attr(
+            k,
+            dantic.Field(
                kw_only=False if t.get_origin(ann) is not Required else True,
                auto_default=True,
                use_default_converter=False,
                type=ann,
-                metadata={
-                    "target": f"__openllm_{k}__",
-                    "required": False if t.get_origin(ann) is NotRequired else t.get_origin(ann) is Required,
-                },
+                metadata={"target": f"__openllm_{k}__"},
                description=f"ModelSettings field for {k}.",
-            )
-            for k, ann in t.get_type_hints(ModelSettings).items()
-        },
-        bases=(DictStrAny,),
-        slots=True,
-        weakref_slot=True,
-        collect_by_mro=True,
-    ),
-    _overwrite_doc="Internal attrs representation of ModelSettings.",
-)
+            ),
+        )
+        for k, ann in t.get_type_hints(ModelSettings).items()
+    ]


-def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
+@attr.define(slots=True, field_transformer=_settings_field_transformer, frozen=False)
+class _ModelSettingsAttr:
+    """Internal attrs representation of ModelSettings."""
+
+    def __getitem__(self, key: str) -> t.Any:
+        if key in codegen.get_annotations(ModelSettings):
+            return _object_getattribute(self, key)
+        raise KeyError(key)
+
+    @classmethod
+    def default(cls) -> _ModelSettingsAttr:
+        _ = ModelSettings(
+            default_id="__default__",
+            model_ids=["__default__"],
+            name_type="dasherize",
+            requires_gpu=False,
+            url="",
+            use_pipeline=False,
+            model_type="causal_lm",
+            trust_remote_code=False,
+            requirements=None,
+            timeout=3600,
+            service_name="",
+            workers_per_resource=1,
+            runtime="transformers",
+        )
+        return cls(**t.cast(DictStrAny, _))
+
+
+def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]):
    if not lenient_issubclass(cl_, LLMConfig):
-        raise RuntimeError(f"Given LLMConfig must be a subclass type of 'LLMConfig', got '{cl_}' instead.")
+        raise RuntimeError(f"Given '{cl_}' must be a subclass type of 'LLMConfig', got '{cl_}' instead.")

    if not hasattr(cl_, "__config__") or getattr(cl_, "__config__") is None:
        raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.")

-    settings = cl_.__config__
-    assert settings
+    assert cl_.__config__ is not None

-    required = [i.name for i in attr.fields(cls) if i.metadata.get("required", False)]
-
-    missing = set(required) - set(settings.keys())
-
-    if len(missing) > 0:
-        raise ValueError(f"The following keys are required under '__config__': {required} (missing: {missing})")
-
-    if "generation_class" in settings:
+    if "generation_class" in cl_.__config__:
        raise ValueError(
            "'generation_class' shouldn't be defined in '__config__', rather defining "
-            f"all required attributes under '{cl_}.GenerationConfig' when defining the class."
+            f"all required attributes under '{cl_}.GenerationConfig' instead."
        )

-    if not settings["default_id"] or not settings["model_ids"]:
+    _cl_name = cl_.__name__.replace("Config", "")
+
+    _settings_attr = _ModelSettingsAttr.default()
+    try:
+        cls(**t.cast(DictStrAny, cl_.__config__))
+        _settings_attr = attr.evolve(_settings_attr, **t.cast(DictStrAny, cl_.__config__))
+    except TypeError:
        raise ValueError("Either 'default_id' or 'model_ids' are emptied under '__config__' (required fields).")

-    # NOTE: value in __config__ can be None, hense we use setdefault
-    # to update in-place
-    _cl_name = cl_.__name__.replace("Config", "")
-    name_type = settings.setdefault("name_type", "dasherize")
-    model_name = settings.setdefault(
-        "model_name", inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
+    _final_value_dct: DictStrAny = {
+        "model_name": inflection.underscore(_cl_name)
+        if _settings_attr["name_type"] == "dasherize"
+        else _cl_name.lower()
+    }
+    _final_value_dct["start_name"] = (
+        inflection.dasherize(_final_value_dct["model_name"])
+        if _settings_attr["name_type"] == "dasherize"
+        else _final_value_dct["model_name"]
    )
-    partialed = functools.partial(_field_env_key, model_name=model_name, suffix="generation")
+    env = openllm.utils.ModelEnv(_final_value_dct["model_name"])
+    _final_value_dct["env"] = env

-    def auto_env_transformers(_: t.Any, fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
-        _has_own_gen = codegen.has_own_attribute(cl_, "GenerationConfig")
-        return [
-            f.evolve(
-                default=_populate_value_from_env_var(
-                    partialed(key=f.name),
-                    fallback=getattr(cl_.GenerationConfig, f.name, f.default) if _has_own_gen else f.default,
-                ),
-                metadata={"env": partialed(key=f.name), "description": f.metadata.get("description", "(not provided)")},
-                converter=None,
-            )
-            for f in fields
-        ]
+    # bettertransformer support
+    if _settings_attr["bettertransformer"] is None:
+        _final_value_dct["bettertransformer"] = (
+            os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES
+        )
+    if _settings_attr["requires_gpu"]:
+        # if requires_gpu is True, then disable BetterTransformer for quantization.
+        _final_value_dct["bettertransformer"] = False

-    settings.setdefault(
-        "generation_class",
-        attr.make_class(
-            f"{_cl_name}GenerationConfig",
-            [],
-            bases=(GenerationConfig,),
-            slots=True,
-            weakref_slot=True,
-            frozen=False,
-            repr=True,
-            collect_by_mro=True,
-            field_transformer=auto_env_transformers,
+    _final_value_dct["service_name"] = f"generated_{_final_value_dct['model_name']}_service.py"
+    _final_value_dct["generation_class"] = attr.make_class(
+        f"{_cl_name}GenerationConfig",
+        [],
+        bases=(GenerationConfig,),
+        slots=True,
+        weakref_slot=True,
+        frozen=True,
+        repr=True,
+        collect_by_mro=True,
+        field_transformer=_make_env_transformer(
+            cl_,
+            _final_value_dct["model_name"],
+            suffix="generation",
+            default_callback=lambda field_name, field_default: getattr(cl_.GenerationConfig, field_name, field_default)
+            if codegen.has_own_attribute(cl_, "GenerationConfig")
+            else field_default,
+            globs={"cl_": cl_},
        ),
    )

-    env = settings.setdefault("env", openllm.utils.ModelEnv(model_name))
-    requires_gpu = settings.setdefault("requires_gpu", False)
+    return attr.evolve(_settings_attr, **_final_value_dct)

-    # bettertransformer support
-    bettertransformer = settings.setdefault(
-        "bettertransformer",
-        os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES,
+
+bentoml_cattr.register_structure_hook(_ModelSettingsAttr, structure_settings)
+
+
+def _make_env_transformer(
+    cls: type[LLMConfig],
+    model_name: str,
+    suffix: t.LiteralString | None = None,
+    default_callback: t.Callable[[str, t.Any], t.Any] | None = None,
+    globs: DictStrAny | None = None,
+):
+    def identity(_: str, x_value: t.Any) -> t.Any:
+        return x_value
+
+    default_callback = identity if default_callback is None else default_callback
+
+    globs = {} if globs is None else globs
+    globs.update(
+        {
+            "functools": functools,
+            "__populate_env": dantic.env_converter,
+            "__default_callback": default_callback,
+            "__field_env": _field_env_key,
+            "__suffix": suffix or "",
+            "__model_name": model_name,
+        }
    )
-    if requires_gpu:
-        # For all models that requires GPU, no need to offload it to BetterTransformer
-        # use bitsandbytes or gptq instead for latency improvement
-        if bettertransformer:
-            logger.debug("Model requires GPU by default, disabling bettertransformer.")
-        bettertransformer = False
-    settings["bettertransformer"] = bettertransformer

-    # default value
-    settings.setdefault("url", "")
-    settings.setdefault("use_pipeline", False)
-    settings.setdefault("model_type", "causal_lm")
-    settings.setdefault("trust_remote_code", False)
-    settings.setdefault("requirements", None)
-    settings.setdefault("timeout", 3600)
-    settings.setdefault("workers_per_resource", 1)
-    settings.setdefault("runtime", "transformers")
-    settings.setdefault("start_name", inflection.dasherize(model_name) if name_type == "dasherize" else model_name)
+    lines: ListStr = [
+        "__env = lambda field_name: __field_env(__model_name, field_name, __suffix)",
+        "return [",
+        "    f.evolve(",
+        "        default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),",
+        "        metadata={",
+        "            'env': f.metadata.get('env', __env(f.name)),",
+        "            'description': f.metadata.get('description', '(not provided)'),",
+        "        },",
+        "    )",
+        "    for f in fields",
+        "]",
+    ]
+    fields_ann = "list[attr.Attribute[t.Any]]"

-    return cls(**settings)
-
-
-bentoml_cattr.register_structure_hook(_ModelSettings, structure_settings)
+    return codegen.generate_function(
+        cls,
+        "__auto_env",
+        lines,
+        args=("_", "fields"),
+        globs=globs,
+        annotations={"_": "type[LLMConfig]", "fields": fields_ann, "return": fields_ann},
+    )


 def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
@@ -577,6 +615,10 @@ def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
    Use the builtin setattr to set *attr_name* to *value_var*.
    We can't use the cached object.__setattr__ since we are setting
    attributes to a class.
+
+    If add_dunder is True, the generated globs should include a __add_dunder
+    value that will be used to add the dunder methods to the class for the given
+    value_var.
    """
    val = f"__add_dunder(cls, {value_var})" if add_dunder else value_var
    return f"setattr(cls, '{attr_name}', {val})"
@@ -742,6 +784,23 @@ class LLMConfig:

        # NOTE: The following will be populated from __config__ and also
        # considered to be public API.
+        __openllm_default_id__: str = Field(None)
+        """Return the default model to use when using 'openllm start <model_id>'.
+        This could be one of the keys in 'self.model_ids' or custom users model.
+
+        This field is required when defining under '__config__'.
+        """
+
+        __openllm_model_ids__: ListStr = Field(None)
+        """A list of supported pretrained models tag for this given runnable.
+
+        For example:
+            For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
+                                                "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
+
+        This field is required when defining under '__config__'.
+        """
+
        __openllm_url__: str = Field(None, init=False)
        """The resolved url for this LLMConfig."""

@@ -751,46 +810,13 @@ class LLMConfig:
        __openllm_trust_remote_code__: bool = Field(False)
        """Whether to always trust remote code"""

+        __openllm_service_name__: str = Field(None)
+        """Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'"""
+
        __openllm_requirements__: ListStr | None = Field(None)
        """The default PyPI requirements needed to run this given LLM. By default, we will depend on
        bentoml, torch, transformers."""

-        __openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
-        """A ModelEnv instance for this LLMConfig."""
-
-        __openllm_model_name__: str = Field("")
-        """The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
-
-        __openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
-        """The model type for this given LLM. By default, it should be causal language modeling.
-        Currently supported 'causal_lm' or 'seq2seq_lm'
-        """
-
-        __openllm_start_name__: str = Field("")
-        """Default name to be used with `openllm start`"""
-
-        __openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
-        """the default name typed for this model. "dasherize" will convert the name to lowercase and
-        replace spaces with dashes. "lowercase" will convert the name to lowercase."""
-
-        __openllm_timeout__: int = Field(36000)
-        """The default timeout to be set for this given LLM."""
-
-        __openllm_workers_per_resource__: int | float = Field(1)
-        """The number of workers per resource. This is used to determine the number of workers to use for this model.
-        For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
-        OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
-
-        See StarCoder for more advanced usage. See
-        https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details.
-
-        By default, it is set to 1.
-        """
-
-        __openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
-        """The runtime to use for this model. Possible values are `transformers` or `cpp`. See
-        LlaMA for more information."""
-
        __openllm_use_pipeline__: bool = Field(False)
        """Whether this LLM will use HuggingFace Pipeline API. By default, this is set to False.
        The reason for this to be here is because we want to access this object before loading
@@ -804,16 +830,40 @@ class LLMConfig:
        and set to False for every other models.
        """

-        __openllm_default_id__: str = Field(None)
-        """Return the default model to use when using 'openllm start <model_id>'.
-        This could be one of the keys in 'self.model_ids' or custom users model."""
+        __openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
+        """The model type for this given LLM. By default, it should be causal language modeling.
+        Currently supported 'causal_lm' or 'seq2seq_lm'
+        """

-        __openllm_model_ids__: ListStr = Field(None)
-        """A list of supported pretrained models tag for this given runnable.
+        __openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
+        """The runtime to use for this model. Possible values are `transformers` or `cpp`. See
+        LlaMA for more information."""

-        For example:
-            For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
-                                                "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
+        __openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
+        """the default name typed for this model. "dasherize" will convert the name to lowercase and
+        replace spaces with dashes. "lowercase" will convert the name to lowercase."""
+
+        __openllm_model_name__: str = Field(None)
+        """The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
+
+        __openllm_start_name__: str = Field(None)
+        """Default name to be used with `openllm start`"""
+
+        __openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
+        """A ModelEnv instance for this LLMConfig."""
+
+        __openllm_timeout__: int = Field(36000)
+        """The default timeout to be set for this given LLM."""
+
+        __openllm_workers_per_resource__: int | float = Field(1)
+        """The number of workers per resource. This is used to determine the number of workers to use for this model.
+        For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
+        OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
+
+        See StarCoder for more advanced usage. See
+        https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details.
+
+        By default, it is set to 1.
        """

        __openllm_generation_class__: type[GenerationConfig] = Field(None, init=False)
@@ -835,23 +885,10 @@ class LLMConfig:
            cls.__name__ = f"{cls.__name__}Config"

        # NOTE: auto assignment attributes generated from __config__
-        _make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettings))(cls)
+        _make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettingsAttr))(cls)
        # process a fields under cls.__dict__ and auto convert them with dantic.Field
        cd = cls.__dict__
        anns = codegen.get_annotations(cls)
-        partialed = functools.partial(_field_env_key, model_name=cls.__openllm_model_name__)
-
-        def auto_config_env(_: type[LLMConfig], attrs: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
-            return [
-                a.evolve(
-                    default=_populate_value_from_env_var(partialed(key=a.name), fallback=a.default),
-                    metadata={
-                        "env": a.metadata.get("env", partialed(key=a.name)),
-                        "description": a.metadata.get("description", "(not provided)"),
-                    },
-                )
-                for a in attrs
-            ]

        # _CountingAttr is the underlying representation of attr.field
        ca_names = {name for name, attr in cd.items() if isinstance(attr, _CountingAttr)}
@@ -864,9 +901,9 @@ class LLMConfig:
            val = cd.get(attr_name, attr.NOTHING)
            if not LazyType["_CountingAttr[t.Any]"](_CountingAttr).isinstance(val):
                if val is attr.NOTHING:
-                    val = cls.Field(env=partialed(key=attr_name))
+                    val = cls.Field(env=_field_env_key(cls.__openllm_model_name__, attr_name))
                else:
-                    val = cls.Field(default=val, env=partialed(key=attr_name))
+                    val = cls.Field(default=val, env=_field_env_key(cls.__openllm_model_name__, attr_name))
            these[attr_name] = val
        unannotated = ca_names - annotated_names
        if len(unannotated) > 0:
@@ -894,7 +931,7 @@ class LLMConfig:
            False,  # disable auto_attribs, since we already handle these
            False,  # disable kw_only
            True,  # collect_by_mro
-            field_transformer=auto_config_env,
+            field_transformer=_make_env_transformer(cls, cls.__openllm_model_name__),
        )
        _weakref_slot = True  # slots = True
        _base_names = {a.name for a in base_attrs}
@@ -910,7 +947,7 @@ class LLMConfig:
            _make_init(
                cls,  # cls (the attrs-decorated class)
                attrs,  # tuple of attr.Attribute of cls
-                _has_pre_init,  # pre_initjk
+                _has_pre_init,  # pre_init
                _has_post_init,  # post_init
                False,  # frozen
                True,  # slots
@@ -1047,14 +1084,14 @@ class LLMConfig:
    def __getattribute__(self, item: str) -> t.Any:
        if item in _reserved_namespace:
            raise ForbiddenAttributeError(
-                f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified."
+                f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified."
            )
        return _object_getattribute.__get__(self)(item)

    @classmethod
    def check_if_gpu_is_available(cls, implementation: str | None = None, force: bool = False):
        if implementation is None:
-            implementation = cls.__openllm_env__.get_framework_env()
+            implementation = cls.__openllm_env__["framework_value"]

        try:
            if cls.__openllm_requires_gpu__ or force:
@@ -1091,7 +1128,7 @@ class LLMConfig:
        """
        attrs = {k: v for k, v in attrs.items() if v is not None}

-        model_config = cls.__openllm_env__.model_config
+        model_config = cls.__openllm_env__.config

        env_json_string = os.environ.get(model_config, None)

--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -35,14 +35,17 @@ from bentoml._internal.types import ModelSignatureDict
 import openllm

 from .exceptions import ForbiddenAttributeError, OpenLLMException
-from .utils import (LazyLoader, bentoml_cattr, is_bitsandbytes_available,
-                    non_intrusive_setattr)
+from .utils import (DEBUG, LazyLoader, ModelEnv, bentoml_cattr, first_not_none,
+                    get_debug_mode, is_bitsandbytes_available,
+                    is_torch_available, non_intrusive_setattr, pkg)

 if t.TYPE_CHECKING:
    import torch
    import transformers
    from bentoml._internal.runner.strategy import Strategy

+    from .models.auto.factory import _BaseAutoLLMClass
+
    class LLMRunner(bentoml.Runner):
        __doc__: str
        __module__: str
@@ -170,7 +173,7 @@ def import_model(
        # NOTE: We need to free up the cache after importing the model
        # in the case where users first run openllm start without the model
        # available locally.
-        if openllm.utils.is_torch_available() and torch.cuda.is_available():
+        if is_torch_available() and torch.cuda.is_available():
            torch.cuda.empty_cache()


@@ -314,16 +317,25 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
        model_id: str | None = None,
        llm_config: openllm.LLMConfig | None = None,
        *args: t.Any,
+        quantize: t.Literal["int8", "int4", "gptq"] | None = None,
+        bettertransformer: bool | None = None,
        **attrs: t.Any,
    ) -> LLM[_M, _T]:
-        return cls(model_id=model_id, llm_config=llm_config, *args, **attrs)
+        return cls(
+            model_id=model_id,
+            llm_config=llm_config,
+            *args,
+            quantize=quantize,
+            bettertransformer=bettertransformer,
+            **attrs,
+        )

    def __init__(
        self,
        model_id: str | None = None,
        llm_config: openllm.LLMConfig | None = None,
        *args: t.Any,
-        quantize: t.Literal["8bit", "4bit", "gptq"] | None = None,
+        quantize: t.Literal["int8", "int4", "gptq"] | None = None,
        bettertransformer: bool | None = None,
        **attrs: t.Any,
    ):
@@ -402,7 +414,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
            llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
                        will use `config_class` to construct default configuration.
            quantize: The quantization to use for this LLM. Defaults to None. Possible values
-                      include 8bit, 4bit and gptq.
+                      include int8, int4 and gptq.
            bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
            *args: The args to be passed to the model.
            **attrs: The kwargs to be passed to the model.
@@ -431,6 +443,14 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
        int4_quant_type = attrs.pop("llm_bnb_4bit_quant_type", "nf4")
        int4_use_double_quant = attrs.pop("llm_bnb_4bit_use_double_quant", True)

+        if llm_config is not None:
+            logger.debug("Using given 'llm_config=(%s)' to initialize LLM.", llm_config)
+            self.config = llm_config
+        else:
+            self.config = self.config_class.model_construct_env(**attrs)
+            # The rest of the kwargs that are not used by the config class should be stored into __openllm_extras__.
+            attrs = self.config["extras"]
+
        if quantization_config and quantize:
            raise ValueError(
                """'quantization_config' and 'quantize' are mutually exclusive. Either customise
@@ -452,7 +472,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                    self,
                    quantize,
                )
-                if quantize == "8bit":
+                if quantize == "int8":
                    if int8_skip_modules is None:
                        int8_skip_modules = []
                    if "lm_head" not in int8_skip_modules and self.config["model_type"] == "causal_lm":
@@ -465,8 +485,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                        llm_int8_skip_modules=int8_skip_modules,
                        llm_int8_has_fp16_weight=int8_has_fp16_weight,
                    )
-                elif quantize == "4bit":
-                    trf_versions = openllm.utils.pkg.pkg_version_info("transformers")
+                elif quantize == "int4":
+                    trf_versions = pkg.pkg_version_info("transformers")
                    supports_kbits = trf_versions[:2] >= (4, 30)
                    if supports_kbits:
                        quantization_config = transformers.BitsAndBytesConfig(
@@ -477,7 +497,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                        )
                    else:
                        logger.warning(
-                            "'quantize' is set to 4bit, while the current transformers version %s does not support "
+                            "'quantize' is set to int4, while the current transformers version %s does not support "
                            "k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore "
                            "make sure to install the latest version of transformers either via PyPI or "
                            "from git source: 'pip install git+https://github.com/huggingface/transformers'.",
@@ -495,20 +515,12 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                        )
                    raise NotImplementedError("GPTQ is not supported yet.")
                else:
-                    raise ValueError(f"'quantize' must be one of ['8bit', '4bit', 'gptq'], got {quantize} instead.")
+                    raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantize} instead.")

-        attrs.update({"quantization_config": quantization_config})
-
-        if llm_config is not None:
-            logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
-            self.config = llm_config
-        else:
-            self.config = self.config_class.model_construct_env(**attrs)
-            # The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__.
-            attrs = self.config["extras"]
-
-        if not self.config["use_pipeline"]:
-            attrs["low_cpu_mem_usage"] = low_cpu_mem_usage
+        if self.__llm_implementation__ == "pt":
+            if not self.config["use_pipeline"]:
+                attrs["low_cpu_mem_usage"] = low_cpu_mem_usage
+            attrs["quantization_config"] = quantization_config

        model_kwds, tokenizer_kwds = {}, {}
        if self.__llm_init_kwargs__:
@@ -527,8 +539,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
            model_id = os.environ.get(self.config["env"].model_id, self.config["default_id"])

        # NOTE: This is the actual given path or pretrained weight for this LLM.
-        if t.TYPE_CHECKING:
-            assert model_id is not None
+        assert model_id is not None
        self._model_id = model_id

        # parsing tokenizer and model kwargs
@@ -590,6 +601,16 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
            "model_ids": orjson.dumps(self.config["model_ids"]).decode(),
        }

+    @property
+    def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], dict[str, t.Any]], dict[str, t.Any]]:
+        """Return the processed model and tokenizer parameters to be used with
+        'import_model' or any other place that requires loading model and tokenizer.
+
+        See 'openllm.cli.download_models' for example usage.
+        It returns a tuple of ((model_args, model_kwargs), tokenizer_kwargs).
+        """
+        return (self._model_args, self._model_attrs), self._tokenizer_attrs
+
    @staticmethod
    def make_tag(
        model_id: str | None = None,
@@ -638,6 +659,10 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
        return bentoml.Tag.from_taglike(f"{implementation}-{name}:{model_version}")

    def ensure_model_id_exists(self) -> bentoml.Model:
+        """This utility function will download the model if it doesn't exist yet.
+        Make sure to call this function if 'ensure_available' is not set during
+        Auto LLM initialisation.
+        """
        output = subprocess.check_output(
            [
                sys.executable,
@@ -651,7 +676,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                "porcelain",
            ]
        )
-        if openllm.utils.DEBUG:
+        if DEBUG or get_debug_mode():
            # NOTE: This usually only concern BentoML devs.
            pattern = r"^__tag__:[^:\n]+:[^:\n]+"
            matched = re.search(pattern, output.decode("utf-8").strip(), re.MULTILINE)
@@ -665,7 +690,15 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
    @property
    def _bentomodel(self) -> bentoml.Model:
        if self.__llm_bentomodel__ is None:
-            self.__llm_bentomodel__ = self.ensure_model_id_exists()
+            # NOTE: Since PR#28, self.__llm_bentomodel__ changed from
+            # ensure_model_id_exists() into just returning the model ref.
+            # This is because we want to save a few seconds of loading time,
+            # as openllm.Runner and openllm.AutoLLM initialisation is around 700ms
+            # before #28.
+            # If users want to make sure to have the model downloaded,
+            # one should invoke `LLM.ensure_model_id_exists()` manually,
+            # or pass `ensure_available=True` into the Auto LLM initialisation.
+            self.__llm_bentomodel__ = bentoml.transformers.get(self.tag)
        return self.__llm_bentomodel__

    @property
@@ -729,13 +762,14 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                )
        return self.__llm_tokenizer__

+    # The order of these fields matters here; make sure to keep it in sync with
+    # openllm.models.auto.factory._BaseAutoLLMClass.for_model
    def to_runner(
        self,
        models: list[bentoml.Model] | None = None,
        max_batch_size: int | None = None,
        max_latency_ms: int | None = None,
        method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = None,
-        embedded: bool = False,
        scheduling_strategy: type[Strategy] | None = None,
    ) -> LLMRunner:
        """Convert this LLM into a Runner.
@@ -753,6 +787,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
        NOTE: There are some difference between bentoml.models.get().to_runner() and LLM.to_runner(): 'name'.
        - 'name': will be generated by OpenLLM, hence users don't shouldn't worry about this.
            The generated name will be 'llm-<model-start-name>-runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner)
+        - 'embedded': Always disabled. It is no longer configurable, as there is no reason to run an LLM in embedded mode.
        """
        models = models if models is not None else []
        models.append(self._bentomodel)
@@ -768,10 +803,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
            method_configs = {"generate": generate_sig, "generate_iterator": generate_iterator_sig}
        else:
            signatures = ModelSignature.convert_signatures_dict(method_configs)
-            generate_sig = openllm.utils.first_not_none(signatures.get("generate"), default=generate_sig)
-            generate_iterator_sig = openllm.utils.first_not_none(
-                signatures.get("generate_iterator"), default=generate_iterator_sig
-            )
+            generate_sig = first_not_none(signatures.get("generate"), default=generate_sig)
+            generate_iterator_sig = first_not_none(signatures.get("generate_iterator"), default=generate_iterator_sig)

        class _Runnable(bentoml.Runnable):
            SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
@@ -860,11 +893,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
                },
            ),
            name=self.runner_name,
+            embedded=False,
            models=models,
            max_batch_size=max_batch_size,
            max_latency_ms=max_latency_ms,
            method_configs=bentoml_cattr.unstructure(method_configs),
-            embedded=embedded,
            scheduling_strategy=scheduling_strategy,
        )

@@ -918,22 +951,28 @@ def Runner(
    ...


-def Runner(model_name: str, **attrs: t.Any) -> LLMRunner:
+def Runner(model_name: str, ensure_available: bool = True, init_local: bool = False, **attrs: t.Any) -> LLMRunner:
    """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'

    Args:
        model_name: Supported model name from 'openllm models'
+        ensure_available: If True, it will ensure the model is available before creating the runner.
+                          Set to False for faster creation time. Note that you will need to make sure
+                          the model for this 'model_id' is available before calling the runner.
+                          One can do this by doing the following:
+                          ```python
+                          runner = openllm.Runner("dolly-v2", ensure_available=False)
+                          runner.llm.ensure_model_id_exists()
+                          ```
+        init_local: If True, it will initialize the model locally. This is useful if you want to
+                    run the model locally. (Symmetrical to bentoml.Runner.init_local())
        **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs
                behaviour
    """
-    init_local = attrs.pop("init_local", False)
-    ModelEnv = openllm.utils.ModelEnv(model_name)
-    if ModelEnv.get_framework_env() == "flax":
-        runner = openllm.AutoFlaxLLM.create_runner(model_name, **attrs)
-    elif ModelEnv.get_framework_env() == "tf":
-        runner = openllm.AutoTFLLM.create_runner(model_name, **attrs)
-    else:
-        runner = openllm.AutoLLM.create_runner(model_name, **attrs)
+    runner = t.cast(
+        "_BaseAutoLLMClass",
+        openllm[ModelEnv(model_name)["framework_value"]],  # type: ignore (internal API)
+    ).create_runner(model_name, ensure_available=ensure_available, **attrs)

    if init_local:
        runner.init_local(quiet=True)
--- a/src/openllm/_package.py
+++ b/src/openllm/_package.py
@@ -29,12 +29,15 @@ from bentoml._internal.bento.build_config import DockerOptions, PythonOptions
 from bentoml._internal.configuration import get_debug_mode

 import openllm
-import openllm.utils as utils
-from openllm.utils import pkg
+
+from .utils import (ModelEnv, codegen, first_not_none, is_flax_available,
+                    is_tf_available, is_torch_available, pkg)

 if t.TYPE_CHECKING:
    from fs.base import FS

+    from .models.auto.factory import _BaseAutoLLMClass
+
 logger = logging.getLogger(__name__)

 OPENLLM_DEV_BUILD = "OPENLLM_DEV_BUILD"
@@ -82,10 +85,10 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
    if not (str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false"):
        packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")

-    env = llm.config["env"]
-    to_use_framework = env.get_framework_env()
-    if to_use_framework == "flax":
-        assert utils.is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
+    env: ModelEnv = llm.config["env"]
+    framework_envvar = env["framework_value"]
+    if framework_envvar == "flax":
+        assert is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
        packages.extend(
            [
                f"flax>={importlib.metadata.version('flax')}",
@@ -93,8 +96,8 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
                f"jaxlib>={importlib.metadata.version('jaxlib')}",
            ]
        )
-    elif to_use_framework == "tf":
-        assert utils.is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
+    elif framework_envvar == "tf":
+        assert is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
        candidates = (
            "tensorflow",
            "tensorflow-cpu",
@@ -116,7 +119,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
            except importlib.metadata.PackageNotFoundError:
                pass
    else:
-        assert utils.is_torch_available(), "PyTorch is not available. Make sure to have it locally installed."
+        assert is_torch_available(), "PyTorch is not available. Make sure to have it locally installed."
        packages.extend([f"torch>={importlib.metadata.version('torch')}"])

    wheels: list[str] = []
@@ -127,7 +130,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
    return PythonOptions(packages=packages, wheels=wheels, lock_packages=True)


-def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float) -> DockerOptions:
+def construct_docker_options(
+    llm: openllm.LLM[t.Any, t.Any],
+    _: FS,
+    workers_per_resource: int | float,
+    quantize: t.LiteralString | None,
+    bettertransformer: bool | None,
+) -> DockerOptions:
    _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
    _bentoml_config_options_opts = [
        "api_server.traffic.timeout=36000",  # NOTE: Currently we hardcode this value
@@ -135,39 +144,112 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
        f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
    ]
    _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
-    env = llm.config["env"]
-    return DockerOptions(
-        cuda_version="11.6",  # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
-        env={
-            env.framework: env.get_framework_env(),
-            "OPENLLM_MODEL": llm.config["model_name"],
-            "OPENLLM_MODEL_ID": llm.model_id,
-            "BENTOML_DEBUG": str(get_debug_mode()),
-            "BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
-        },
-        system_packages=["git"],
+    env: ModelEnv = llm.config["env"]
+
+    env_dict = {
+        env.framework: env.framework_value,
+        env.config: llm.config.model_dump_json().decode(),
+        "OPENLLM_MODEL": llm.config["model_name"],
+        "OPENLLM_MODEL_ID": llm.model_id,
+        "BENTOML_DEBUG": str(get_debug_mode()),
+        "BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
+    }
+
+    # We need to handle None separately here, as env from subprocess doesn't
+    # accept None value.
+    _env = ModelEnv(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize)
+
+    if _env.bettertransformer_value is not None:
+        env_dict[_env.bettertransformer] = _env.bettertransformer_value
+    if _env.quantize_value is not None:
+        env_dict[_env.quantize] = _env.quantize_value
+
+    # NOTE: Torch 2.0 currently only supports 11.6 as the latest CUDA version
+    return DockerOptions(cuda_version="11.6", env=env_dict, system_packages=["git"])
+
+
+@t.overload
+def build(
+    model_name: str,
+    *,
+    model_id: str | None = ...,
+    quantize: t.LiteralString | None = ...,
+    bettertransformer: bool | None = ...,
+    _workers_per_resource: int | float | None = ...,
+    _overwrite_existing_bento: bool = ...,
+    __cli__: t.Literal[False] = ...,
+    **attrs: t.Any,
+) -> bentoml.Bento:
+    ...
+
+
+@t.overload
+def build(
+    model_name: str,
+    *,
+    model_id: str | None = ...,
+    quantize: t.LiteralString | None = ...,
+    bettertransformer: bool | None = ...,
+    _workers_per_resource: int | float | None = ...,
+    _overwrite_existing_bento: bool = ...,
+    __cli__: t.Literal[True] = ...,
+    **attrs: t.Any,
+) -> tuple[bentoml.Bento, bool]:
+    ...
+
+
+def _build_bento(
+    bento_tag: bentoml.Tag,
+    service_name: str,
+    llm_fs: FS,
+    llm: openllm.LLM[t.Any, t.Any],
+    workers_per_resource: int | float,
+    quantize: t.LiteralString | None,
+    bettertransformer: bool | None,
+) -> bentoml.Bento:
+    framework_envvar = llm.config["env"]["framework_value"]
+    labels = dict(llm.identifying_params)
+    labels.update({"_type": llm.llm_type, "_framework": framework_envvar})
+    logger.info("Building Bento for LLM '%s'", llm.config["start_name"])
+    return bentoml.bentos.build(
+        f"{service_name}:svc",
+        name=bento_tag.name,
+        labels=labels,
+        description=f"OpenLLM service for {llm.config['start_name']}",
+        include=[
+            f for f in llm_fs.walk.files(filter=["*.py"])
+        ],  # NOTE: By default, we are using _service.py as the default service, for now.
+        exclude=["/venv", "__pycache__/", "*.py[cod]", "*$py.class"],
+        python=construct_python_options(llm, llm_fs),
+        docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer),
+        version=bento_tag.version,
+        build_ctx=llm_fs.getsyspath("/"),
    )


-@t.overload
-def build(model_name: str, *, __cli__: t.Literal[False] = ..., **attrs: t.Any) -> bentoml.Bento:
-    ...
+def build(
+    model_name: str,
+    *,
+    model_id: str | None = None,
+    quantize: t.LiteralString | None = None,
+    bettertransformer: bool | None = None,
+    _workers_per_resource: int | float | None = None,
+    _overwrite_existing_bento: bool = False,
+    __cli__: bool = False,
+    **attrs: t.Any,
+) -> tuple[bentoml.Bento, bool] | bentoml.Bento:
+    """Package a LLM into a Bento.

+    The LLM will be built into a BentoService with the following behaviour:
+    - If quantize is passed, it will instruct the model to be quantized dynamically during serving time.
+    - If bettertransformer is passed, it will instruct the model to use BetterTransformer during serving time.

-@t.overload
-def build(model_name: str, *, __cli__: t.Literal[True] = ..., **attrs: t.Any) -> tuple[bentoml.Bento, bool]:
-    ...
+    Other parameters including model_name, model_id and attrs will be passed to the LLM class itself.
+    """

-
-def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[bentoml.Bento, bool] | bentoml.Bento:
-    """Package a LLM into a Bento."""
-
-    overwrite_existing_bento = attrs.pop("_overwrite_existing_bento", False)
+    _previously_built = False
    current_model_envvar = os.environ.pop("OPENLLM_MODEL", None)
    current_model_id_envvar = os.environ.pop("OPENLLM_MODEL_ID", None)
-    _previously_built = False
-    workers_per_resource = attrs.pop("_workers_per_resource", None)
-    model_id: str = attrs.pop("model_id", None)

    llm_config = openllm.AutoConfig.for_model(model_name)

@@ -178,52 +260,58 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
    try:
        os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)

-        to_use_framework = llm_config["env"].get_framework_env()
-        if to_use_framework == "flax":
-            llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
-        elif to_use_framework == "tf":
-            llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
-        else:
-            llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
+        framework_envvar = llm_config["env"]["framework_value"]
+        llm = t.cast(
+            "_BaseAutoLLMClass",
+            openllm[framework_envvar],  # type: ignore (internal API)
+        ).for_model(
+            model_name,
+            model_id=model_id,
+            llm_config=llm_config,
+            quantize=quantize,
+            bettertransformer=bettertransformer,
+            **attrs,
+        )

        os.environ["OPENLLM_MODEL_ID"] = llm.model_id

        labels = dict(llm.identifying_params)
-        labels.update({"_type": llm.llm_type, "_framework": to_use_framework})
+        labels.update({"_type": llm.llm_type, "_framework": framework_envvar})
        service_name = f"generated_{llm_config['model_name']}_service.py"
-        workers_per_resource = utils.first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
+        workers_per_resource = first_not_none(_workers_per_resource, default=llm_config["workers_per_resource"])

        with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
            # add service.py definition to this temporary folder
-            utils.codegen.write_service(model_name, llm.model_id, service_name, llm_fs)
+            codegen.write_service(model_name, llm.model_id, service_name, llm_fs)

            bento_tag = bentoml.Tag.from_taglike(f"{llm.llm_type}-service:{llm.tag.version}")
            try:
                bento = bentoml.get(bento_tag)
-                if overwrite_existing_bento:
+                if _overwrite_existing_bento:
+                    logger.info("Overwriting previously saved Bento.")
                    bentoml.delete(bento_tag)
-                    raise bentoml.exceptions.NotFound("Overwriting previously saved Bento.")
+                    bento = _build_bento(
+                        bento_tag,
+                        service_name,
+                        llm_fs,
+                        llm,
+                        workers_per_resource=workers_per_resource,
+                        quantize=quantize,
+                        bettertransformer=bettertransformer,
+                    )
                _previously_built = True
            except bentoml.exceptions.NotFound:
                logger.info("Building Bento for LLM '%s'", llm_config["start_name"])
-                bento = bentoml.bentos.build(
-                    f"{service_name}:svc",
-                    name=bento_tag.name,
-                    labels=labels,
-                    description=f"OpenLLM service for {llm_config['start_name']}",
-                    include=[
-                        f for f in llm_fs.walk.files(filter=["*.py"])
-                    ],  # NOTE: By default, we are using _service.py as the default service, for now.
-                    exclude=["/venv", "__pycache__/", "*.py[cod]", "*$py.class"],
-                    python=construct_python_options(llm, llm_fs),
-                    docker=construct_docker_options(llm, llm_fs, workers_per_resource),
-                    version=bento_tag.version,
-                    build_ctx=llm_fs.getsyspath("/"),
+                bento = _build_bento(
+                    bento_tag,
+                    service_name,
+                    llm_fs,
+                    llm,
+                    workers_per_resource=workers_per_resource,
+                    quantize=quantize,
+                    bettertransformer=bettertransformer,
                )
-            if __cli__:
-                return bento, _previously_built
-            else:
-                return bento
+            return (bento, _previously_built) if __cli__ else bento
    except Exception as e:
        logger.error("\nException caught during building LLM %s: \n", model_name, exc_info=e)
        raise
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -34,7 +34,16 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}")  # openllm: model na
 model_id = os.environ.get("OPENLLM_MODEL_ID", "{__model_id__}")  # openllm: model id

 llm_config = openllm.AutoConfig.for_model(model)
-runner = openllm.Runner(model, model_id=model_id, llm_config=llm_config)
+
+runner = openllm.Runner(
+    model,
+    model_id=model_id,
+    llm_config=llm_config,
+    bettertransformer=llm_config["env"]["bettertransformer_value"],
+    quantize=llm_config["env"]["quantize_value"],
+    ensure_available=False,
+    init_local=False,
+)

 svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])

@@ -57,6 +66,6 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
        model_id=model_id,
        timeout=llm_config["timeout"],
        model_name=llm_config["model_name"],
-        framework=llm_config["env"].get_framework_env(),
+        framework=llm_config["env"]["framework_value"],
        configuration=llm_config.model_dump_json().decode(),
    )
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
--- a/src/openllm/models/auto/factory.py
+++ b/src/openllm/models/auto/factory.py
@@ -94,22 +94,21 @@ class _BaseAutoLLMClass:
        >>> llm = openllm.AutoLLM.for_model("flan-t5")
        ```
        """
-        runner_kwargs_name = [
+        # order matters here
+        runner_kwargs_name = {
            "models",
            "max_batch_size",
            "max_latency_ms",
            "method_configs",
-            "embedded",
            "scheduling_strategy",
-        ]
+        }
        to_runner_attrs = {k: v for k, v in attrs.items() if k in runner_kwargs_name}
-        for k in to_runner_attrs:
-            del attrs[k]
-        normalized = inflection.underscore(model_name)
-        if cls._model_mapping.get(normalized, None, mapping_type="name2model"):
+        attrs = {k: v for k, v in attrs.items() if k not in to_runner_attrs}
+        if cls._model_mapping.get(inflection.underscore(model_name), None, mapping_type="name2model"):
            if not isinstance(llm_config, openllm.LLMConfig):
                # The rest of kwargs is now passed to config
-                llm_config = AutoConfig.for_model(normalized, **attrs)
+                llm_config = AutoConfig.for_model(model_name, **attrs)
+                attrs = llm_config.__openllm_extras__
            # the rest of attrs will be saved to __openllm_extras__
            llm = cls._model_mapping[type(llm_config)].from_pretrained(
                model_id,
--- a/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/src/openllm/models/chatglm/modeling_chatglm.py
@@ -104,25 +104,25 @@ class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrain
            chat_history.append((prompt, generation_result))
        return "".join(generation_result)

-    @torch.inference_mode()
-    def generate(self, prompt: str, use_default_prompt_template: bool = True, **attrs: t.Any) -> str:
-        self.model.eval()
+    def generate(self, prompt: str, **attrs: t.Any) -> str:
+        with torch.inference_mode():
+            self.model.eval()

-        # Only use half precision if the model is not yet quantized
-        if self.config.use_half_precision:
-            self.model.half()
+            # Only use half precision if the model is not yet quantized
+            if self.config.use_half_precision:
+                self.model.half()

-        self.model.cuda()
+            self.model.cuda()

-        logit_processor: list[LogitsProcessor] = LogitsProcessorList()
-        logit_processor.append(InvalidScoreLogitsProcessor())
+            logit_processor: list[LogitsProcessor] = LogitsProcessorList()
+            logit_processor.append(InvalidScoreLogitsProcessor())

-        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
-        outputs = self.model.generate(
-            **inputs,
-            generation_config=self.config.model_construct_env(do_sample=True, **attrs).to_generation_config(),
-            logits_processor=logit_processor,
-        )
-        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
-        response = self.tokenizer.decode(outputs)
-        return self.model.process_response(response)
+            inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
+            outputs = self.model.generate(
+                **inputs,
+                generation_config=self.config.model_construct_env(do_sample=True, **attrs).to_generation_config(),
+                logits_processor=logit_processor,
+            )
+            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
+            response = self.tokenizer.decode(outputs)
+            return self.model.process_response(response)
--- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -98,19 +98,19 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
    ) -> str:
        return generation_result[0]["generated_text"]

-    @torch.inference_mode()
    def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
-        self.model.tokenizer = self.tokenizer
-        llm_config = self.config.model_construct_env(**attrs)
-        decoded: list[dict[t.Literal["generated_text"], str]] = self.model(
-            prompt, generation_config=llm_config.to_generation_config()
-        )
+        with torch.inference_mode():
+            self.model.tokenizer = self.tokenizer
+            llm_config = self.config.model_construct_env(**attrs)
+            decoded: list[dict[t.Literal["generated_text"], str]] = self.model(
+                prompt, generation_config=llm_config.to_generation_config()
+            )

-        if llm_config.return_full_text:
-            return [
-                {k: f"{DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"}
-                for i in decoded
-                for k, generated in i.items()
-            ]
+            if llm_config.return_full_text:
+                return [
+                    {k: f"{DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"}
+                    for i in decoded
+                    for k, generated in i.items()
+                ]

-        return decoded
+            return decoded
--- a/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -74,14 +74,14 @@ class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformer
    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
        return generation_result[0]

-    @torch.inference_mode()
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        if torch.cuda.is_available():
-            self.model.cuda()
-        input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
-        result_tensor = self.model.generate(
-            input_ids,
-            do_sample=True,
-            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-        )
-        return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)
+        with torch.inference_mode():
+            if torch.cuda.is_available():
+                self.model.cuda()
+            input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
+            result_tensor = self.model.generate(
+                input_ids,
+                do_sample=True,
+                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+            )
+            return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)
--- a/src/openllm/models/opt/modeling_opt.py
+++ b/src/openllm/models/opt/modeling_opt.py
@@ -129,15 +129,15 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
        else:
            return "\n".join(generation_result)

-    @torch.inference_mode()
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
-            self.model.cuda()
+        with torch.inference_mode():
+            if torch.cuda.is_available() and torch.cuda.device_count() == 1:
+                self.model.cuda()

-        input_ids = t.cast(torch.Tensor, self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
-        generated_tensors = self.model.generate(
-            input_ids,
-            do_sample=True,
-            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-        )
-        return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
+            input_ids = t.cast(torch.Tensor, self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
+            generated_tensors = self.model.generate(
+                input_ids,
+                do_sample=True,
+                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+            )
+            return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
--- a/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/src/openllm/models/starcoder/modeling_starcoder.py
@@ -120,18 +120,20 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
        return generation_result[0]

-    @torch.inference_mode()
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
-        result_tensor = self.model.generate(
-            inputs,
-            do_sample=True,
-            pad_token_id=self.tokenizer.eos_token_id,
-            # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
-            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-        )
-        # TODO: We will probably want to return the tokenizer here so that we can manually process this
-        # return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
-        return self.tokenizer.batch_decode(
-            result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
-        )
+        with torch.inference_mode():
+            inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
+            result_tensor = self.model.generate(
+                inputs,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id,
+                # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
+                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+            )
+            # TODO: We will probably want to return the tokenizer here so that we can manually process this
+            # return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
+            return self.tokenizer.batch_decode(
+                result_tensor[0],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True,
+            )
--- a/src/openllm/utils/init.py
+++ b/src/openllm/utils/init.py
@@ -35,6 +35,11 @@ from bentoml._internal.utils import (LazyLoader, bentoml_cattr,

 from .lazy import LazyModule

+# NOTE: This set contains the names of modules
+# that are available above and are whitelisted
+# to be included in the extra_objects map.
+_whitelist_modules = {"pkg"}
+
 logger = logging.getLogger(__name__)

 try:
@@ -86,7 +91,9 @@ DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.envi
 # XXX: define all classes, functions import above this line
 # since _extras will be the locals() import from this file.
 _extras: dict[str, t.Any] = {
-    k: v for k, v in locals().items() if not isinstance(v, types.ModuleType) and not k.startswith("_")
+    k: v
+    for k, v in locals().items()
+    if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith("_"))
 }

 _import_structure = {
--- a/src/openllm/utils/dantic.py
+++ b/src/openllm/utils/dantic.py
@@ -97,7 +97,7 @@ def attrs_to_options(
    )


-def _default_converter(value: t.Any, env: str | None) -> t.Any:
+def env_converter(value: t.Any, env: str | None = None) -> t.Any:
    if env is not None:
        value = os.environ.get(env, value)
    if value is not None and isinstance(value, str):
@@ -135,7 +135,8 @@ def Field(
            on kw_only. If kw_only=True, the this field will become 'Required' and the default
            value is omitted. If kw_only=False, then the default value will be used as before.
        use_default_converter: a bool indicating whether to use the default converter. Defaults
-            to True. If set to False, then the default converter will not be used.
+            to True. If set to False, then the default converter will not be used. The default
+            converter overrides the given value with the environment variable of this Field, if set.
        **kwargs: The rest of the arguments are passed to attr.field
    """
    metadata = attrs.pop("metadata", {})
@@ -148,7 +149,7 @@ def Field(

    converter = attrs.pop("converter", None)
    if use_default_converter:
-        converter = functools.partial(_default_converter, env=env)
+        converter = functools.partial(env_converter, env=env)

    if ge is not None:
        piped.append(attr.validators.ge(ge))
--- a/src/openllm/utils/import_utils.py
+++ b/src/openllm/utils/import_utils.py
@@ -15,6 +15,8 @@
 """
 Some imports utils are vendorred from transformers/utils/import_utils.py for performance reasons.
 """
+from __future__ import annotations
+
 import importlib
 import importlib.metadata
 import importlib.util
@@ -24,7 +26,6 @@ import typing as t
 from abc import ABCMeta
 from collections import OrderedDict

-import attr
 import inflection
 from bentoml._internal.utils import LazyLoader
 from packaging import version
@@ -236,31 +237,73 @@ def require_backends(o: t.Any, backends: t.MutableSequence[str]):
        raise ImportError("".join(failed))


-@attr.define
 class ModelEnv:
-    model_name: str = attr.field(converter=inflection.underscore)
+    model_name: str

-    @property
-    def framework(self) -> str:
-        return f"OPENLLM_{self.model_name.upper()}_FRAMEWORK"
+    if t.TYPE_CHECKING:
+        config: property
+        model_id: property
+        quantize: property
+        framework: property
+        bettertransformer: property

-    @property
-    def model_config(self) -> str:
-        return f"OPENLLM_{self.model_name.upper()}_CONFIG"
+        framework_value: property
+        quantize_value: property
+        bettertransformer_value: property

-    @property
-    def model_id(self) -> str:
-        return f"OPENLLM_{self.model_name.upper()}_MODEL_ID"
+    def __getitem__(self, item: str | t.Any) -> t.Any:
+        if hasattr(self, item):
+            return getattr(self, item)
+        raise KeyError(f"Key {item} not found in {self}")

-    @property
-    def bettertransformer(self) -> str:
-        return f"OPENLLM_{self.model_name.upper()}_BETTERTRANSFORMER"
+    def __new__(cls, model_name: str, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None):
+        from .._configuration import _field_env_key
+        from . import codegen

-    def gen_env_key(self, key: str) -> str:
-        return f"OPENLLM_{self.model_name.upper()}_{key.upper()}"
+        model_name = inflection.underscore(model_name)

-    def convert_to_bettertransformer(self) -> bool:
-        return os.environ.get(self.bettertransformer, str(False)).lower() == "true"
+        res = super().__new__(cls)
+        res.model_name = model_name
+
+        # gen properties env key
+        attributes = {"config", "model_id", "quantize", "framework", "bettertransformer"}
+        for att in attributes:
+            setattr(res, att, _field_env_key(model_name, att.upper()))
+
+        # gen properties env value
+        attributes_with_values = {
+            "quantize": (bool, quantize),
+            "bettertransformer": (bool, bettertransformer),
+            "framework": (str, "pt"),
+        }
+        globs: dict[str, t.Any] = {
+            "__bool_vars_value": ENV_VARS_TRUE_VALUES,
+            "__env_get": os.environ.get,
+            "self": res,
+        }
+
+        for attribute, (default_type, default_value) in attributes_with_values.items():
+            lines: list[str] = []
+            if default_type is bool:
+                lines.append(
+                    f"return str(__env_get(self['{attribute}'], str(__env_default)).upper() in __bool_vars_value)"
+                )
+            else:
+                lines.append(f"return __env_get(self['{attribute}'], __env_default)")
+
+            setattr(
+                res,
+                f"{attribute}_value",
+                codegen.generate_function(
+                    cls,
+                    "_env_get_" + attribute,
+                    lines,
+                    ("__env_default",),
+                    globs,
+                )(default_value),
+            )
+
+        return res

    @property
    def start_docstring(self) -> str:
@@ -269,9 +312,3 @@ class ModelEnv:
    @property
    def module(self) -> LazyLoader:
        return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
-
-    def get_framework_env(self) -> t.Literal["pt", "flax", "tf"]:
-        envvar = os.environ.get(self.framework, "pt")
-        if envvar not in ("pt", "tf", "flax"):
-            raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
-        return envvar
--- a/src/openllm/utils/lazy.py
+++ b/src/openllm/utils/lazy.py
@@ -21,6 +21,19 @@ import os
 import types
 import typing as t

+from ..exceptions import ForbiddenAttributeError, OpenLLMException
+
+
+class UsageNotAllowedError(OpenLLMException):
+    """Raised when LazyModule.__getitem__ is forbidden."""
+
+
+class MissingAttributesError(OpenLLMException):
+    """Raised when the given key is not available in the LazyModule special mapping."""
+
+
+_reserved_namespace = {"__openllm_special__"}
+

 class LazyModule(types.ModuleType):
    """
@@ -49,9 +62,7 @@ class LazyModule(types.ModuleType):
            for value in values:
                self._class_to_module[value] = key
        # Needed for autocompletion in an IDE
-        self.__all__ = (
-            list(import_structure.keys()) + list(itertools.chain(*import_structure.values())) + list(_extra_objects)
-        )
+        self.__all__ = list(import_structure.keys()) + list(itertools.chain(*import_structure.values()))
        self.__file__ = module_file
        self.__spec__ = module_spec
        self.__path__ = [os.path.dirname(module_file)]
@@ -71,13 +82,30 @@ class LazyModule(types.ModuleType):
                result.append(attribute)
        return result

+    def __getitem__(self, key: str) -> t.Any:
+        if self._objects.get("__openllm_special__") is None:
+            raise UsageNotAllowedError(f"'{self._name}' is not allowed to be used as a dict.")
+        _special_mapping = self._objects.get("__openllm_special__", {})
+        try:
+            if key in _special_mapping:
+                return getattr(self, _special_mapping.__getitem__(key))
+            raise MissingAttributesError(f"Requested '{key}' is not available in given mapping.")
+        except AttributeError as e:
+            raise KeyError(f"'{self._name}' has no attribute {_special_mapping[key]}") from e
+        except Exception as e:
+            raise KeyError(f"Failed to lookup '{key}' in '{self._name}'") from e
+
    def __getattr__(self, name: str) -> t.Any:
+        if name in _reserved_namespace:
+            raise ForbiddenAttributeError(
+                f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified."
+            )
        if name in self._objects:
-            return self._objects[name]
+            return self._objects.__getitem__(name)
        if name in self._modules:
            value = self._get_module(name)
        elif name in self._class_to_module.keys():
-            module = self._get_module(self._class_to_module[name])
+            module = self._get_module(self._class_to_module.__getitem__(name))
            value = getattr(module, name)
        else:
            raise AttributeError(f"module {self.__name__} has no attribute {name}")
--- a/src/openllm_client/_prompt.py
+++ b/src/openllm_client/_prompt.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 from __future__ import annotations

-import dataclasses
 import typing as t

+import attr
+
 import openllm
 from openllm._prompt import PromptFormatter

@@ -34,13 +35,11 @@ class PartialDict(DictStrStr):
        return "{" + key + "}"


-@dataclasses.dataclass(slots=True)
+@attr.define(slots=True)
 class PromptTemplate:
    template: str
    input_variables: t.Sequence[str]

-    model_config = {"extra": "forbid"}
-
    def to_str(self, __partial_dict__: PartialDict | None = None, **attrs: str) -> str:
        """Generate a prompt from the template and input variables"""
        if __partial_dict__:
--- a/src/openllm_client/runtimes/base.py
+++ b/src/openllm_client/runtimes/base.py
@@ -25,6 +25,7 @@ import httpx
 import openllm

 if t.TYPE_CHECKING:
+    from openllm.models.auto.factory import _BaseAutoLLMClass

    class AnnotatedClient(bentoml.client.Client):
        def health(self, *args: t.Any, **attrs: t.Any) -> t.Any:
@@ -107,12 +108,10 @@ class ClientMixin:
    @property
    def llm(self) -> openllm.LLM[t.Any, t.Any]:
        if self.__llm__ is None:
-            if self.framework == "flax":
-                self.__llm__ = openllm.AutoFlaxLLM.for_model(self.model_name)
-            elif self.framework == "tf":
-                self.__llm__ = openllm.AutoTFLLM.for_model(self.model_name)
-            else:
-                self.__llm__ = openllm.AutoLLM.for_model(self.model_name)
+            self.__llm__ = t.cast(
+                "_BaseAutoLLMClass",
+                openllm[self.framework],  # type: ignore (internal API)
+            ).for_model(self.model_name)
        return self.__llm__

    @property
--- a/tests/test_configuration.py
+++ b/tests/test_configuration.py
@@ -33,10 +33,10 @@ logger = logging.getLogger(__name__)


 def test_missing_default():
-    with pytest.raises(ValueError, match="The following keys are required*"):
+    with pytest.raises(ValueError, match="Either 'default_id' or 'model_ids'*"):
        make_llm_config("MissingDefaultId", {"name_type": "lowercase", "requirements": ["bentoml"]})

-    with pytest.raises(ValueError, match="The following keys are required*"):
+    with pytest.raises(ValueError, match="Either 'default_id' or 'model_ids'*"):
        make_llm_config("MissingModelId", {"default_id": "huggingface/t5-tiny-testing", "requirements": ["bentoml"]})