perf: faster LLM loading

using attrs for faster class creation as opposed to a metaclass

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron
2023-06-14 01:36:42 -04:00
parent ebcedc35de
commit 111d205f63
8 changed files with 328 additions and 379 deletions

View File

@@ -66,7 +66,7 @@ from deepmerge.merger import Merger
import openllm
from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException
from .utils import DEBUG, LazyType, bentoml_cattr, dantic, first_not_none, lenient_issubclass
from .utils import DEBUG, LazyType, bentoml_cattr, codegen, dantic, first_not_none, lenient_issubclass
if hasattr(t, "Required"):
from typing import Required
@@ -85,7 +85,7 @@ if t.TYPE_CHECKING:
import tensorflow as tf
import torch
import transformers
from attr import _CountingAttr, _make_init, _make_method, _make_repr, _transform_attrs
from attr import _CountingAttr, _make_init, _make_repr, _transform_attrs # type: ignore
from transformers.generation.beam_constraints import Constraint
from ._types import ClickFunctionWrapper, F, O_co, P
@@ -103,7 +103,7 @@ else:
ItemgetterAny = itemgetter
# NOTE: Using internal API from attr here, since we are actually
# allowing subclass of openllm.LLMConfig to become 'attrs'-ish
from attr._make import _CountingAttr, _make_init, _make_method, _make_repr, _transform_attrs
from attr._make import _CountingAttr, _make_init, _make_repr, _transform_attrs
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
@@ -389,89 +389,10 @@ def _populate_value_from_env_var(
return os.environ.get(key, fallback)
# sentinel object for unequivocal object() getattr
# (a unique instance, so a getattr default can never collide with a real attribute value)
_sentinel = object()
def _field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))
def _has_own_attribute(cls: type[t.Any], attrib_name: t.Any):
"""
Check whether *cls* defines *attrib_name* (and doesn't just inherit it).
"""
attr = getattr(cls, attrib_name, _sentinel)
if attr is _sentinel:
return False
for base_cls in cls.__mro__[1:]:
a = getattr(base_cls, attrib_name, None)
if attr is a:
return False
return True
def _get_annotations(cls: type[t.Any]) -> DictStrAny:
    """
    Get annotations for *cls*.

    Only annotations declared on *cls* itself are returned; purely inherited
    ``__annotations__`` yield an empty mapping instead.
    """
    if not _has_own_attribute(cls, "__annotations__"):
        return DictStrAny()
    return cls.__annotations__
# Textual prefixes (including common aliases) that identify a ClassVar annotation;
# compared as strings so quoted annotations never need to be evaluated.
_classvar_prefixes = (
    "typing.ClassVar",
    "t.ClassVar",
    "ClassVar",
    "typing_extensions.ClassVar",
)
def _is_class_var(annot: str | t.Any) -> bool:
    """
    Check whether *annot* is a typing.ClassVar.

    The string comparison hack is used to avoid evaluating all string
    annotations which would put attrs-based classes at a performance
    disadvantage compared to plain old classes.
    """
    text = str(annot)
    # Annotations may themselves be quoted strings; unwrap one quoting level.
    if text.startswith(("'", '"')) and text.endswith(("'", '"')):
        text = text[1:-1]
    return text.startswith(_classvar_prefixes)
def _add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str | None = None) -> _T:
    """
    Add __module__ and __qualname__ to a *method* if possible.

    Also attaches a generated ``__doc__`` (or *_overwrite_doc* when given).
    Objects that reject attribute assignment are returned unchanged.
    """
    try:
        method_or_cls.__module__ = cls.__module__
    except AttributeError:
        pass
    try:
        method_or_cls.__qualname__ = f"{cls.__qualname__}.{method_or_cls.__name__}"
    except AttributeError:
        pass
    try:
        method_or_cls.__doc__ = (
            _overwrite_doc or f"Method or class generated by LLMConfig for class {cls.__qualname__}."
        )
    except AttributeError:
        pass
    return method_or_cls
# cached it here to save one lookup per assignment
# NOTE(review): bound once at import time; assumes object.__getattribute__ is not monkey-patched later.
_object_getattribute = object.__getattribute__
@@ -506,8 +427,8 @@ class ModelSettings(t.TypedDict, total=False):
generation_class: t.Type[GenerationConfig]
_ModelSettings: type[attr.AttrsInstance] = _add_method_dunders(
type("__internal__", (ModelSettings,), {"__module__": "openllm._configuration"}),
_ModelSettings: type[attr.AttrsInstance] = codegen.add_method_dunders(
type("__openllm_internal__", (ModelSettings,), {"__module__": "openllm._configuration"}),
attr.make_class(
"ModelSettings",
{
@@ -563,7 +484,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
partialed = functools.partial(_field_env_key, model_name=model_name, suffix="generation")
def auto_env_transformers(_: t.Any, fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
_has_own_gen = _has_own_attribute(cl_, "GenerationConfig")
_has_own_gen = codegen.has_own_attribute(cl_, "GenerationConfig")
return [
f.evolve(
default=_populate_value_from_env_var(
@@ -576,20 +497,20 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
for f in fields
]
_target: DictStrAny = {
"default_id": settings["default_id"],
"model_ids": settings["model_ids"],
"url": settings.get("url", ""),
"requires_gpu": settings.get("requires_gpu", False),
"trust_remote_code": settings.get("trust_remote_code", False),
"requirements": settings.get("requirements", None),
"name_type": name_type,
"model_name": model_name,
"start_name": start_name,
"env": openllm.utils.ModelEnv(model_name),
"timeout": settings.get("timeout", 3600),
"workers_per_resource": settings.get("workers_per_resource", 1),
"generation_class": attr.make_class(
return cls(
default_id=settings["default_id"],
model_ids=settings["model_ids"],
url=settings.get("url", ""),
requires_gpu=settings.get("requires_gpu", False),
trust_remote_code=settings.get("trust_remote_code", False),
requirements=settings.get("requirements", None),
name_type=name_type,
model_name=model_name,
start_name=start_name,
env=openllm.utils.ModelEnv(model_name),
timeout=settings.get("timeout", 3600),
workers_per_resource=settings.get("workers_per_resource", 1),
generation_class=attr.make_class(
f"{_cl_name}GenerationConfig",
[],
bases=(GenerationConfig,),
@@ -599,18 +520,12 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
repr=True,
field_transformer=auto_env_transformers,
),
}
return cls(**_target)
)
bentoml_cattr.register_structure_hook(_ModelSettings, structure_settings)
def _generate_unique_filename(cls: type[t.Any], func_name: str):
return f"<LLMConfig generated {func_name} {cls.__module__}." f"{getattr(cls, '__qualname__', cls.__name__)}>"
def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
"""
Use the builtin setattr to set *attr_name* to *value_var*.
@@ -632,7 +547,7 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
"cls": cls,
"_cached_attribute": attributes,
"_cached_getattribute_get": _object_getattribute.__get__,
"__add_dunder": _add_method_dunders,
"__add_dunder": codegen.add_method_dunders,
}
annotations: DictStrAny = {"return": None}
@@ -643,19 +558,9 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
lines.append(_setattr_class(arg_name, attr_name, add_dunder=attr_name in _dunder_add))
annotations[attr_name] = field.type
script = "def __assign_attr(cls, %s):\n %s\n" % (", ".join(args), "\n ".join(lines) if lines else "pass")
assign_method = _make_method(
"__assign_attr",
script,
_generate_unique_filename(cls, "__assign_attr"),
globs,
return codegen.generate_function(
cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=annotations
)
assign_method.__annotations__ = annotations
if DEBUG:
logger.info("Generated script:\n%s", script)
return assign_method
_reserved_namespace = {"__config__", "GenerationConfig"}
@@ -841,7 +746,7 @@ class LLMConfig:
_make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettings))(cls)
# process a fields under cls.__dict__ and auto convert them with dantic.Field
cd = cls.__dict__
anns = _get_annotations(cls)
anns = codegen.get_annotations(cls)
partialed = functools.partial(_field_env_key, model_name=cls.__openllm_model_name__)
def auto_config_env(_: type[LLMConfig], attrs: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
@@ -861,7 +766,7 @@ class LLMConfig:
these: dict[str, _CountingAttr[t.Any]] = {}
annotated_names: set[str] = set()
for attr_name, typ in anns.items():
if _is_class_var(typ):
if codegen.is_class_var(typ):
continue
annotated_names.add(attr_name)
val = cd.get(attr_name, attr.NOTHING)
@@ -907,7 +812,7 @@ class LLMConfig:
cls.__attrs_attrs__ = attrs
# generate a __attrs_init__ for the subclass, since we will
# implement a custom __init__
cls.__attrs_init__ = _add_method_dunders(
cls.__attrs_init__ = codegen.add_method_dunders(
cls,
_make_init(
cls, # cls (the attrs-decorated class)
@@ -924,7 +829,7 @@ class LLMConfig:
),
)
# __repr__ function with the updated fields.
cls.__repr__ = _add_method_dunders(cls, _make_repr(cls.__attrs_attrs__, None, cls))
cls.__repr__ = codegen.add_method_dunders(cls, _make_repr(cls.__attrs_attrs__, None, cls))
# Traverse the MRO to collect existing slots
# and check for an existing __weakref__.
existing_slots: DictStrAny = dict()

View File

@@ -19,11 +19,11 @@ import functools
import logging
import os
import re
import sys
import types
import typing as t
from abc import ABC, ABCMeta, abstractmethod
from abc import ABC, abstractmethod
import attr
import bentoml
import inflection
import orjson
@@ -33,7 +33,6 @@ from bentoml._internal.types import ModelSignatureDict
import openllm
from .exceptions import ForbiddenAttributeError, OpenLLMException
from .models.auto import AutoConfig
from .utils import ENV_VARS_TRUE_VALUES, LazyLoader, bentoml_cattr
if t.TYPE_CHECKING:
@@ -64,6 +63,8 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = {
"flax": ("FlaxAutoModelForCausalLM", "FlaxAutoModelForSeq2SeqLM"),
}
TOKENIZER_PREFIX = "_tokenizer_"
def convert_transformers_model_name(name: str) -> str:
if os.path.exists(os.path.dirname(name)):
@@ -178,9 +179,12 @@ class LLMInterface(ABC):
"""The config class to use for this LLM. If you are creating a custom LLM, you must specify this class."""
@property
def import_kwargs(self) -> dict[str, t.Any] | None:
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
"""The default import kwargs to use when importing the model.
This will be passed into 'openllm.LLM.import_model'."""
This will be passed into 'openllm.LLM.import_model'.
It returns two dictionaries: one for model kwargs and one for tokenizer kwargs.
"""
return
@abstractmethod
def generate(self, prompt: str, **preprocess_generate_kwds: t.Any) -> t.Any:
@@ -254,132 +258,87 @@ _M = t.TypeVar("_M")
_T = t.TypeVar("_T")
class LLMMetaclass(ABCMeta):
def __new__(
mcls, cls_name: str, bases: tuple[type[t.Any], ...], namespace: dict[str, t.Any], **attrs: t.Any
) -> type:
"""Metaclass for creating a LLM."""
if LLMInterface not in bases: # only actual openllm.LLM should hit this branch.
annotations_dict: dict[str, t.Any] = {}
# NOTE: set implementation branch
prefix_class_name_config = cls_name
if cls_name.startswith("Flax"):
implementation = "flax"
prefix_class_name_config = cls_name[4:]
annotations_dict.update(
{
"__llm_model__": "transformers.FlaxPreTrainedModel",
"model": "transformers.FlaxPreTrainedModel",
}
)
elif cls_name.startswith("TF"):
implementation = "tf"
prefix_class_name_config = cls_name[2:]
annotations_dict.update(
{
"__llm_model__": "transformers.TFPreTrainedModel",
"model": "transformers.TFPreTrainedModel",
}
)
else:
implementation = "pt"
annotations_dict.update(
{
"__llm_model__": "transformers.PreTrainedModel",
"model": "transformers.PreTrainedModel",
}
)
namespace["__llm_implementation__"] = implementation
config_class = AutoConfig.infer_class_from_name(prefix_class_name_config)
annotations_dict["config_class"] = f"type[openllm.{config_class.__qualname__}]"
annotations_dict["config"] = f"openllm.{config_class.__qualname__}"
# NOTE: setup config class branch
if "__openllm_internal__" in namespace:
# NOTE: we will automatically find the subclass for this given config class
if "config_class" not in namespace:
# this branch we will automatically get the class
namespace["config_class"] = config_class
else:
logger.debug(f"Using config class {namespace['config_class']} for {cls_name}.")
# NOTE: check for required attributes
else:
if "config_class" not in namespace:
raise RuntimeError(
"Missing required key 'config_class'. Make sure to define it within the LLM subclass."
)
# NOTE: the llm_post_init branch
if "llm_post_init" in namespace:
original_llm_post_init = namespace["llm_post_init"]
def wrapped_llm_post_init(self: LLM[t.Any, t.Any]) -> None:
"""We need to both initialize private attributes and call the user-defined model_post_init
method.
"""
_default_post_init(self)
original_llm_post_init(self)
namespace["llm_post_init"] = wrapped_llm_post_init
else:
namespace["llm_post_init"] = _default_post_init
# NOTE: import_model branch
if "import_model" not in namespace:
# using the default import model
namespace["import_model"] = functools.partial(import_model, _model_framework=implementation)
else:
logger.debug("Using custom 'import_model' for %s", cls_name)
namespace.update({k: None for k in {"__llm_model__", "__llm_tokenizer__", "__llm_bentomodel__"}})
tokenizer_type = "typing.Optional[transformers.PreTrainedTokenizerFast]"
annotations_dict.update(
{
"load_in_mha": "bool",
"tokenizer": tokenizer_type,
"__llm_tokenizer__": tokenizer_type,
"__llm_bentomodel__": "typing.Optional[bentoml.Model]",
}
)
# NOTE: transformers to ForwardRef
for key, value in annotations_dict.items():
if (3, 10) > sys.version_info >= (3, 9, 8) or sys.version_info >= (3, 10, 1):
annotations_dict[key] = t.ForwardRef(value, is_argument=False, is_class=True)
else:
annotations_dict[key] = t.ForwardRef(value, is_argument=False)
namespace["__annotations__"] = annotations_dict
cls: type[LLM[t.Any, t.Any]] = super().__new__(
t.cast("type[type[LLM[t.Any, t.Any]]]", mcls), cls_name, bases, namespace, **attrs
)
cls.__openllm_post_init__ = None if cls.llm_post_init is LLMInterface.llm_post_init else cls.llm_post_init
cls.__openllm_custom_load__ = None if cls.load_model is LLMInterface.load_model else cls.load_model
return cls
else:
# the LLM class itself being created, no need to setup
return super().__new__(mcls, cls_name, bases, namespace, **attrs)
class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
@attr.define(slots=True, repr=False)
class LLM(LLMInterface, t.Generic[_M, _T]):
if t.TYPE_CHECKING:
# NOTE: the following will be populated by metaclass
# The following will be populated by metaclass
__llm_trust_remote_code__: bool
__llm_implementation__: t.Literal["pt", "tf", "flax"]
__llm_model__: _M | None
__llm_tokenizer__: _T | None
__llm_tag__: bentoml.Tag | None
__llm_bentomodel__: bentoml.Model | None
__openllm_post_init__: t.Callable[[t.Self], None] | None
__openllm_custom_load__: t.Callable[[t.Self, t.Any, t.Any], None] | None
__openllm_custom_import_kwargs__: property | None
load_in_mha: bool
_llm_attrs: dict[str, t.Any]
_llm_args: tuple[t.Any, ...]
_model_args: tuple[t.Any, ...]
_model_attrs: dict[str, t.Any]
_tokenizer_attrs: dict[str, t.Any]
# NOTE: the following is the similar interface to HuggingFace pretrained protocol.
def __init_subclass__(cls):
cd = cls.__dict__
prefix_class_name_config = cls.__name__
if prefix_class_name_config.startswith("Flax"):
implementation = "flax"
prefix_class_name_config = prefix_class_name_config[4:]
elif prefix_class_name_config.startswith("TF"):
implementation = "tf"
prefix_class_name_config = prefix_class_name_config[2:]
else:
implementation = "pt"
cls.__llm_implementation__ = implementation
config_class = openllm.AutoConfig.infer_class_from_name(prefix_class_name_config)
if "__openllm_internal__" in cd:
if "config_class" not in cd:
cls.config_class = config_class
else:
logger.debug(f"Using config class {cd['config_class']} for {cls.__name__}.")
else:
if "config_class" not in cd:
raise RuntimeError(
"Missing required key 'config_class'. Make sure to define it within the LLM subclass."
)
if cls.llm_post_init is not LLMInterface.llm_post_init:
original_llm_post_init = cd["llm_post_init"]
def wrapped_llm_post_init(self: t.Self) -> None:
"""We need to both initialize private attributes and call the user-defined model_post_init
method.
"""
_default_post_init(self)
original_llm_post_init(self)
cls.llm_post_init = wrapped_llm_post_init
else:
setattr(cls, "llm_post_init", _default_post_init)
if cls.import_model is LLMInterface.import_model:
# using the default import model
setattr(cls, "import_model", functools.partial(import_model, _model_framework=implementation))
else:
logger.debug("Using custom 'import_model' for %s", cls.__name__)
cls.__openllm_post_init__ = None if cls.llm_post_init is LLMInterface.llm_post_init else cls.llm_post_init
cls.__openllm_custom_load__ = None if cls.load_model is LLMInterface.load_model else cls.load_model
cls.__openllm_custom_import_kwargs__ = (
None if cls.import_kwargs is LLMInterface.import_kwargs else cls.import_kwargs
)
for at in {"bentomodel", "tag", "model", "tokenizer"}:
setattr(cls, f"__llm_{at}__", None)
# The following is the similar interface to HuggingFace pretrained protocol.
@classmethod
def from_pretrained(
cls, model_id: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, **attrs: t.Any
cls,
model_id: str | None = None,
llm_config: openllm.LLMConfig | None = None,
*args: t.Any,
**attrs: t.Any,
) -> LLM[_M, _T]:
return cls(model_id=model_id, llm_config=llm_config, *args, **attrs)
@@ -474,6 +433,7 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
"""
load_in_mha = attrs.pop("load_in_mha", False)
openllm_model_version = attrs.pop("openllm_model_version", None)
if llm_config is not None:
logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
@@ -490,17 +450,37 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
# NOTE: This is the actual given path or pretrained weight for this LLM.
self._model_id = model_id
# parsing tokenizer and model kwargs
tokenizer_kwds = {k[len(TOKENIZER_PREFIX) :]: v for k, v in attrs.items() if k.startswith(TOKENIZER_PREFIX)}
for k in attrs:
if k in tokenizer_kwds:
del attrs[k]
# the rest of kwargs should be model kwargs
model_kwds = attrs
if self.import_kwargs:
_custom_model_kwds, _custom_tokenizer_kwds = self.import_kwargs
model_kwds.update(_custom_model_kwds)
tokenizer_kwds.update(_custom_tokenizer_kwds)
# handle trust_remote_code
self.__llm_trust_remote_code__ = model_kwds.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
# NOTE: Save the args and kwargs for latter load
self._llm_args = args
self._llm_attrs = attrs
self._model_args = args
self._model_attrs = model_kwds
self._tokenizer_attrs = tokenizer_kwds
if self.__openllm_post_init__:
self.llm_post_init()
# finally: we allow users to overwrite the load_in_mha defined by the LLM subclass.
if load_in_mha:
logger.debug("Overwriting 'load_in_mha=%s' (base load_in_mha=%s)", load_in_mha, self.load_in_mha)
self.load_in_mha = load_in_mha
self._openllm_model_version = openllm_model_version
def __setattr__(self, attr: str, value: t.Any):
if attr in _reserved_namespace:
raise ForbiddenAttributeError(
@@ -511,10 +491,18 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
super().__setattr__(attr, value)
def __repr__(self) -> str:
keys = {"model_id", "runner_name", "llm_type", "config"}
return f"{self.__class__.__name__}({', '.join(f'{k}={getattr(self, k)!r}' for k in keys)})"
@property
def model_id(self) -> str:
return self._model_id
@property
def runner_name(self) -> str:
return f"llm-{self.config.__openllm_start_name__}-runner"
# NOTE: The section below defines a loose contract with langchain's LLM interface.
@property
def llm_type(self) -> str:
@@ -527,42 +515,14 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
"model_ids": orjson.dumps(self.config.__openllm_model_ids__).decode(),
}
@t.overload
def make_tag(
self,
model_id: str | None = None,
return_unused_kwargs: t.Literal[False] = ...,
trust_remote_code: bool = ...,
**attrs: t.Any,
) -> bentoml.Tag:
...
@t.overload
def make_tag(
self,
model_id: str | None = None,
return_unused_kwargs: t.Literal[True] = ...,
trust_remote_code: bool = ...,
**attrs: t.Any,
) -> tuple[bentoml.Tag, dict[str, t.Any]]:
...
def make_tag(
self,
model_id: str | None = None,
return_unused_kwargs: bool = False,
trust_remote_code: bool = False,
**attrs: t.Any,
) -> bentoml.Tag | tuple[bentoml.Tag, dict[str, t.Any]]:
def make_tag(self, model_id: str | None = None) -> bentoml.Tag:
"""Generate a ``bentoml.Tag`` from a given transformers model name.
Note that this depends on your model to have a config class available.
Args:
model_name_or_path: The transformers model name or path to load the model from.
If it is a path, then `openllm_model_version` must be passed in as a kwarg.
return_unused_kwargs: Whether to return unused kwargs. Defaults to False. If set, it will return a tuple
of ``bentoml.Tag`` and a dict of unused kwargs.
model_id: The transformers model name or path to load the model from.
If it is a path, then `openllm_model_version` must be passed in as a kwarg.
trust_remote_code: Whether to trust the remote code. Defaults to False.
**attrs: Additional kwargs to pass to the ``transformers.AutoConfig`` constructor.
If you pass ``return_unused_kwargs=True``, it will be ignored.
@@ -572,88 +532,43 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
"""
if model_id is None:
model_id = self._model_id
if "return_unused_kwargs" in attrs:
logger.debug("Ignoring 'return_unused_kwargs' in 'generate_tag_from_model_name'.")
attrs.pop("return_unused_kwargs", None)
config, attrs = t.cast(
"tuple[transformers.PretrainedConfig, dict[str, t.Any]]",
transformers.AutoConfig.from_pretrained(
model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **attrs
),
)
name = convert_transformers_model_name(model_id)
if os.path.exists(os.path.dirname(model_id)):
# If the model_name_or_path is a path, we assume it's a local path,
# then users must pass a version for this.
model_version = attrs.pop("openllm_model_version", None)
if model_version is None:
config = t.cast(
"transformers.PretrainedConfig",
transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=self.__llm_trust_remote_code__),
)
model_version = getattr(config, "_commit_hash", None)
if model_version is None:
if self._openllm_model_version is not None:
logger.warning(
"""\
When passing a path, it is recommended to also pass 'openllm_model_version'
into Runner/AutoLLM intialization.
For example:
>>> import openllm
>>> runner = openllm.Runner('/path/to/fine-tuning/model', openllm_model_version='lora-version')
Example with AutoLLM:
>>> import openllm
>>> model = openllm.AutoLLM.for_model('/path/to/fine-tuning/model', openllm_model_version='lora-version')
No worries, OpenLLM will generate one for you. But for your own convenience, make sure to
specify 'openllm_model_version'.
"""
)
model_version = bentoml.Tag.from_taglike(name).make_new_version().version
else:
model_version = getattr(config, "_commit_hash", None)
if model_version is None:
logger.warning(
"Given %s from '%s' doesn't contain a commit hash. We will generate"
" the tag without specific version.",
t.cast("type[transformers.PretrainedConfig]", config.__class__),
"Given %s from '%s' doesn't contain a commit hash, and 'openllm_model_version' is not given. "
"We will generate the tag without specific version.",
config.__class__,
model_id,
)
tag = bentoml.Tag.from_taglike(f"{self.__llm_implementation__}-{name}:{model_version}")
model_version = bentoml.Tag.from_taglike(name).make_new_version().version
else:
logger.debug("Using %s for '%s' as model version", self._openllm_model_version, model_id)
model_version = self._openllm_model_version
if return_unused_kwargs:
return tag, attrs
return tag
return bentoml.Tag.from_taglike(f"{self.__llm_implementation__}-{name}:{model_version}")
def ensure_model_id_exists(self) -> bentoml.Model:
trust_remote_code = self._llm_attrs.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
tag, kwds = self.make_tag(return_unused_kwargs=True, trust_remote_code=trust_remote_code, **self._llm_attrs)
try:
return bentoml.transformers.get(tag)
return bentoml.transformers.get(self.tag)
except bentoml.exceptions.NotFound:
logger.info("'%s' with tag (%s) not found, importing from HuggingFace Hub.", self.__class__.__name__, tag)
tokenizer_kwds = {k[len("_tokenizer_") :]: v for k, v in kwds.items() if k.startswith("_tokenizer_")}
kwds = {k: v for k, v in kwds.items() if not k.startswith("_tokenizer_")}
if self.import_kwargs:
tokenizer_kwds = {
**{
k[len("_tokenizer_") :]: v for k, v in self.import_kwargs.items() if k.startswith("_tokenizer_")
},
**tokenizer_kwds,
}
kwds = {
**{k: v for k, v in self.import_kwargs.items() if not k.startswith("_tokenizer_")},
**kwds,
}
logger.info(
"'%s' with tag (%s) not found, importing from HuggingFace Hub.", self.__class__.__name__, self.tag
)
return self.import_model(
self._model_id,
tag,
*self._llm_args,
tokenizer_kwds=tokenizer_kwds,
trust_remote_code=trust_remote_code,
**kwds,
self.tag,
*self._model_args,
tokenizer_kwds=self._tokenizer_attrs,
trust_remote_code=self.__llm_trust_remote_code__,
**self._model_attrs,
)
finally:
if openllm.utils.is_torch_available() and torch.cuda.is_available():
@@ -667,31 +582,30 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
@property
def tag(self) -> bentoml.Tag:
return self._bentomodel.tag
if self.__llm_tag__ is None:
self.__llm_tag__ = self.make_tag()
return self.__llm_tag__
@property
def model(self) -> _M:
"""The model to use for this LLM. This shouldn't be set at runtime, rather let OpenLLM handle it."""
# Run check for GPU
trust_remote_code = self._llm_attrs.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
self.config.check_if_gpu_is_available(implementation=self.__llm_implementation__)
kwds = {k: v for k, v in self._llm_attrs.items() if not k.startswith("_tokenizer_")}
kwds = self._model_attrs
kwds["trust_remote_code"] = self.__llm_trust_remote_code__
if self.import_kwargs:
kwds = {**{k: v for k, v in self.import_kwargs.items() if not k.startswith("_tokenizer_")}, **kwds}
kwds["trust_remote_code"] = trust_remote_code
if self.load_in_mha and "_pretrained_class" not in self._bentomodel.info.metadata:
# This is a pipeline, provide a accelerator args
kwds["accelerator"] = "bettertransformer"
if self.__llm_model__ is None:
if self.__openllm_custom_load__:
self.__llm_model__ = self.load_model(self.tag, *self._llm_args, **kwds)
self.__llm_model__ = self.load_model(self.tag, *self._model_args, **kwds)
else:
self.__llm_model__ = self._bentomodel.load_model(*self._llm_args, **kwds)
self.__llm_model__ = self._bentomodel.load_model(*self._model_args, **kwds)
# TODO: self.config.__openllm_runtime__: t.Literal['python', 'cpp']
if (
self.load_in_mha
and all(i in self._bentomodel.info.metadata for i in ("_framework", "_pretrained_class"))
@@ -743,11 +657,7 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
- 'name': will be generated by OpenLLM, hence users shouldn't worry about this.
The generated name will be 'llm-<model-start-name>-runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner)
"""
name = f"llm-{self.config.__openllm_start_name__}-runner"
models = models if models is not None else []
# NOTE: The side effect of this is that will load the imported model during runner creation.
models.append(self._bentomodel)
if scheduling_strategy is None:
@@ -776,6 +686,12 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
cls.llm_type = llm_type
cls.identifying_params = identifying_params
def __init__(__self: _Runnable):
# NOTE: The side effect of this line
# is that it will load the imported model during
# runner creation. So don't remove it!!
self.model
@bentoml.Runnable.method(
batchable=generate_sig.batchable,
batch_dim=generate_sig.batch_dim,
@@ -803,7 +719,7 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
def generate_iterator(__self, prompt: str, **attrs: t.Any) -> t.Iterator[t.Any]:
yield self.generate_iterator(prompt, **attrs)
def _wrapped_generate_run(__self: bentoml.Runner, *args: t.Any, **kwargs: t.Any) -> t.Any:
def _wrapped_generate_run(__self: LLMRunner, prompt: str, **kwargs: t.Any) -> t.Any:
"""Wrapper for runner.generate.run() to handle the prompt and postprocessing.
This will be used for LangChain API.
@@ -814,10 +730,6 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
runner("What is the meaning of life?")
```
"""
if len(args) > 1:
raise RuntimeError("Only one positional argument is allowed for generate()")
prompt = args[0] if len(args) == 1 else kwargs.pop("prompt", "")
prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **kwargs)
generated_result = __self.generate.run(prompt, **generate_kwargs)
return self.postprocess_generate(prompt, generated_result, **postprocess_kwargs)
@@ -849,7 +761,7 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
"identifying_params": self.identifying_params,
},
),
name=name,
name=self.runner_name,
models=models,
max_batch_size=max_batch_size,
max_latency_ms=max_latency_ms,
@@ -868,6 +780,13 @@ class LLM(LLMInterface, t.Generic[_M, _T], metaclass=LLMMetaclass):
First, it runs `self.sanitize_parameters` to sanitize the parameters.
The the sanitized prompt and kwargs will be pass into self.generate.
Finally, run self.postprocess_generate to postprocess the generated result.
This allows users to do the following:
```python
llm = openllm.AutoLLM.for_model("dolly-v2")
llm("What is the meaning of life?")
```
"""
prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **attrs)
generated_result = self.generate(prompt, **generate_kwargs)

View File

@@ -884,11 +884,9 @@ def cli_factory() -> click.Group:
else:
model = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
tag = model.make_tag(trust_remote_code=config.__openllm_trust_remote_code__)
if len(bentoml.models.list(tag)) == 0:
if len(bentoml.models.list(model.tag)) == 0:
if output == "pretty":
_echo(f"{tag} does not exists yet!. Downloading...", fg="yellow", nl=True)
_echo(f"{model.tag} does not exists yet!. Downloading...", fg="yellow", nl=True)
m = model.ensure_model_id_exists()
if output == "pretty":
_echo(f"Saved model: {m.tag}")
@@ -899,9 +897,9 @@ def cli_factory() -> click.Group:
).decode()
)
else:
_echo(tag)
_echo(model.tag)
else:
m = bentoml.transformers.get(tag)
m = bentoml.transformers.get(model.tag)
if output == "pretty":
_echo(f"{model_name} is already setup for framework '{env}': {str(m.tag)}", nl=True, fg="yellow")
elif output == "json":

View File

@@ -37,11 +37,12 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
@property
def import_kwargs(self):
return {
model_kwds = {
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
"torch_dtype": torch.bfloat16,
"_tokenizer_padding_side": "left",
}
tokenizer_kwds = {"padding_side": "left"}
return model_kwds, tokenizer_kwds
def llm_post_init(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

View File

@@ -36,10 +36,12 @@ class Falcon(openllm.LLM["transformers.TextGenerationPipeline", "transformers.Pr
@property
def import_kwargs(self):
return {
model_kwds = {
"torch_dtype": torch.bfloat16,
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
}
tokenizer_kwds: dict[str, t.Any] = {}
return model_kwds, tokenizer_kwds
def import_model(
self, model_id: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any

View File

@@ -51,11 +51,13 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
@property
def import_kwargs(self):
return {
model_kwds = {
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
"load_in_8bit": False,
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
}
tokenizer_kwds: dict[str, t.Any] = {}
return model_kwds, tokenizer_kwds
def sanitize_parameters(
self,

View File

@@ -45,12 +45,13 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
@property
def import_kwargs(self):
return {
"_tokenizer_padding_side": "left",
model_kwds = {
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
"load_in_8bit": True if torch.cuda.device_count() > 1 else False,
"torch_dtype": torch.float16,
}
tokenizer_kwds = {"padding_side": "left"}
return model_kwds, tokenizer_kwds
def import_model(
self,

View File

@@ -33,10 +33,21 @@ if t.TYPE_CHECKING:
from black.lines import LinesBlock
from fs.base import FS
DictStrAny = dict[str, t.Any]
class ModifyNodeProtocol(t.Protocol):
def __call__(self, node: Node, model_name: str, formatter: type[ModelNameFormatter]) -> None:
...
from attr import _make_method
else:
# NOTE: Using internal API from attr here, since we are actually
# allowing subclass of openllm.LLMConfig to become 'attrs'-ish
from attr._make import _make_method
DictStrAny = dict
_T = t.TypeVar("_T")
logger = logging.getLogger(__name__)
@@ -165,3 +176,113 @@ def _parse_service_file(src_contents: str, model_name: str, model_id: str, mode:
return newline
return ""
return "".join(dst_contents)
# NOTE: The following is extracted from attrs' internal APIs
# sentinel object for unequivocal object() getattr
# (unique instance: a getattr default that can never collide with a real attribute value)
_sentinel = object()
def has_own_attribute(cls: type[t.Any], attrib_name: t.Any):
    """
    Check whether *cls* defines *attrib_name* (and doesn't just inherit it).
    """
    absent = object()
    candidate = getattr(cls, attrib_name, absent)
    if candidate is absent:
        return False
    # Inherited when some base class yields the very same object.
    return not any(getattr(base_cls, attrib_name, None) is candidate for base_cls in cls.__mro__[1:])
def get_annotations(cls: type[t.Any]) -> DictStrAny:
    """
    Get annotations for *cls*.

    Returns an empty mapping when ``__annotations__`` is merely inherited
    from a base class rather than declared on *cls* itself.
    """
    return cls.__annotations__ if has_own_attribute(cls, "__annotations__") else DictStrAny()
# Prefixes (covering the usual import aliases) that mark a ClassVar annotation;
# matched as plain strings so quoted annotations never have to be evaluated.
_classvar_prefixes = (
    "typing.ClassVar",
    "t.ClassVar",
    "ClassVar",
    "typing_extensions.ClassVar",
)
def is_class_var(annot: str | t.Any) -> bool:
    """
    Check whether *annot* is a typing.ClassVar.

    The string comparison hack is used to avoid evaluating all string
    annotations which would put attrs-based classes at a performance
    disadvantage compared to plain old classes.
    """
    stringified = str(annot)
    quotes = ("'", '"')
    # A quoted (stringified) annotation: strip the surrounding quotes first.
    if stringified.startswith(quotes) and stringified.endswith(quotes):
        stringified = stringified[1:-1]
    return stringified.startswith(_classvar_prefixes)
def add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str | None = None) -> _T:
    """
    Add __module__ and __qualname__ to a *method* if possible.

    A generated ``__doc__`` (or *_overwrite_doc* when provided) is attached as
    well; objects that reject attribute assignment are returned unchanged.
    """
    doc = _overwrite_doc or f"Method or class generated by LLMConfig for class {cls.__qualname__}."
    updates = (
        ("__module__", lambda: cls.__module__),
        # __name__ access happens lazily so a missing __name__ is swallowed too.
        ("__qualname__", lambda: f"{cls.__qualname__}.{method_or_cls.__name__}"),
        ("__doc__", lambda: doc),
    )
    for dunder, make_value in updates:
        try:
            setattr(method_or_cls, dunder, make_value())
        except AttributeError:
            pass
    return method_or_cls
def generate_unique_filename(cls: type[t.Any], func_name: str):
    """Return a pseudo-filename identifying code generated on behalf of *cls*."""
    qualified = getattr(cls, "__qualname__", cls.__name__)
    return f"<{cls.__name__} generated {func_name} {cls.__module__}.{qualified}>"
def generate_function(
    typ: type[t.Any],
    func_name: str,
    lines: list[str] | None,
    args: tuple[str, ...] | None,
    globs: dict[str, t.Any],
    annotations: dict[str, t.Any] | None = None,
):
    """
    Compile *lines* into a function object named *func_name*.

    Args:
        typ: Class the generated function is associated with; used only for the
            synthetic filename attached to the compiled code.
        func_name: Name of the function to generate.
        lines: Body lines; a falsy value produces a ``pass`` body.
        args: Argument names for the signature, or ``None`` for no arguments.
        globs: Globals made available to the generated code.
        annotations: Optional mapping assigned to the result's ``__annotations__``.
    """
    from . import DEBUG

    signature = ", ".join(args) if args is not None else ""
    body = "\n ".join(lines) if lines else "pass"
    script = "def %s(%s):\n %s\n" % (func_name, signature, body)
    meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs)
    if annotations:
        meth.__annotations__ = annotations
    if DEBUG:
        logger.info("Generated script for %s:\n\n%s", typ, script)
    return meth