From 19f20c7dad2b391c1b4867bae950ada6fc58387d Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Sat, 22 Jul 2023 17:15:03 -0400
Subject: [PATCH] perf(serialisation): implement wrapper to reduce callstack
 (#132)

---
 changelog.d/132.breaking.md                   |   4 +
 src/openllm/_llm.py                           | 228 +++++++++++++-----
 .../models/baichuan/modeling_baichuan.py      |   3 -
 .../models/chatglm/modeling_chatglm.py        |   3 -
 .../models/dolly_v2/modeling_dolly_v2.py      |  16 +-
 src/openllm/models/falcon/modeling_falcon.py  |  16 --
 .../models/flan_t5/modeling_flan_t5.py        |   3 -
 .../models/gpt_neox/modeling_gpt_neox.py      |   5 +-
 src/openllm/models/llama/modeling_llama.py    |   3 -
 .../models/llama/modeling_vllm_llama.py       |   5 +-
 src/openllm/models/mpt/modeling_mpt.py        |   5 +-
 src/openllm/models/opt/modeling_opt.py        |   8 +-
 .../models/stablelm/modeling_stablelm.py      |   1 -
 .../models/starcoder/modeling_starcoder.py    |   3 -
 src/openllm/serialisation/__init__.py         |  54 +++--
 src/openllm/serialisation/ggml.py             |  40 ++-
 src/openllm/serialisation/transformers.py     |  86 ++-----
 17 files changed, 259 insertions(+), 224 deletions(-)
 create mode 100644 changelog.d/132.breaking.md

diff --git a/changelog.d/132.breaking.md b/changelog.d/132.breaking.md
new file mode 100644
index 00000000..4ae5b446
--- /dev/null
+++ b/changelog.d/132.breaking.md
@@ -0,0 +1,4 @@
+Updated the signatures of `load_model` and `load_tokenizer` so that they no longer accept `tag`.
+The tag can be accessed via `llm.tag`; if you are using `openllm.serialisation` or `bentoml.transformers`, you can use `self._bentomodel` instead.
+
+Updated the shared serialisation logic to reduce the call stack, saving three levels of call trace.
diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index 5b58d19a..e1210054 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -38,12 +38,15 @@ from bentoml._internal.models.model import ModelSignature
 
 from ._configuration import AdapterType
 from ._configuration import FineTuneConfig
+from ._configuration import _object_getattribute
+from ._configuration import _setattr_class
 from ._quantisation import infer_quantisation_config
 from .exceptions import ForbiddenAttributeError
 from .exceptions import GpuNotAvailableError
 from .utils import DEBUG
 from .utils import ENV_VARS_TRUE_VALUES
 from .utils import MYPY
+from .utils import SHOW_CODEGEN
 from .utils import EnvVarMixin
 from .utils import LazyLoader
 from .utils import ReprMixin
@@ -77,12 +80,12 @@ if t.TYPE_CHECKING:
     import vllm
 
     import transformers
-    from bentoml._internal.runner.strategy import Strategy
 
     from ._configuration import PeftType
     from ._types import AdaptersMapping
     from ._types import AdaptersTuple
     from ._types import DictStrAny
+    from ._types import ListStr
     from ._types import LiteralRuntime
     from ._types import LLMEmbeddings
     from ._types import LLMRunnable
@@ -244,7 +247,7 @@ _reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"}
 
 M = t.TypeVar(
     "M",
-    bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM]",
+    bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]",
 )
 T = t.TypeVar(
     "T",
@@ -348,10 +351,10 @@ class LLMInterface(ABC, t.Generic[M, T]):
         """
         raise NotImplementedError
 
-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> M:
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> M:
         """This function can be implemented to override the default load_model behaviour.
 
-        See falcon for example implementation.
+        See gpt_neox or opt for example implementations. The tag can be accessed via ``self.tag``.
         """
         raise NotImplementedError
 
@@ -394,8 +397,10 @@ class LLMInterface(ABC, t.Generic[M, T]):
     - `OPTForConditionalGeneration` -> `pt`
     - `TFOPTForConditionalGeneration` -> `tf`
     - `FlaxOPTForConditionalGeneration` -> `flax`
+
+    Additionally, classes for the vLLM backend follow the naming: `VLLMLlaMA` -> `vllm`
     """
-    __llm_model__: M | peft.PeftModel | None
+    __llm_model__: M | None
     """A reference to the actual model. Instead of access this directly, you should use `model` property instead."""
     __llm_tokenizer__: T | None
     """A reference to the actual tokenizer. Instead of access this directly, you should use `tokenizer` property instead."""
@@ -404,13 +409,6 @@ class LLMInterface(ABC, t.Generic[M, T]):
     __llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None
     """A reference to the the cached LoRA adapter mapping."""
 
-    __llm_custom_import__: bool
-    """Whether this LLM has a custom import_model"""
-    __llm_custom_load__: bool
-    """A boolean to determine whether a custom 'load_model' is implemented"""
-    __llm_custom_tokenizer__: bool
-    """A boolean to determine whether a custom 'load_tokenizer' is implemented"""
-
     if t.TYPE_CHECKING and not MYPY:
 
         def __attrs_init__(
@@ -432,6 +430,121 @@ class LLMInterface(ABC, t.Generic[M, T]):
             """Generated __attrs_init__ for openllm.LLM."""
 
 
+if t.TYPE_CHECKING:
+    _R = t.TypeVar("_R")
+
+    class _import_model_wrapper(t.Generic[_R, M, T]):
+        def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R:
+            ...
+
+    class _load_model_wrapper(t.Generic[M, T]):
+        def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
+            ...
+
+    class _load_tokenizer_wrapper(t.Generic[M, T]):
+        def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T:
+            ...
+
+    class _llm_post_init_wrapper(t.Generic[M, T]):
+        def __call__(self, llm: LLM[M, T]) -> None:
+            ...
+
+
+def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]):
+    @functools.wraps(f)
+    def wrapper(
+        self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any
+    ) -> bentoml.Model:
+        trust_remote_code = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__)
+        # prepend the decls and merge the attrs stored in ``self.llm_parameters``;
+        # attrs passed at call time take precedence over the stored ones
+        (model_decls, model_attrs), _ = self.llm_parameters
+        decls = (*model_decls, *decls)
+        attrs = {**model_attrs, **attrs}
+        return f(self, *decls, trust_remote_code=trust_remote_code, **attrs)
+
+    return wrapper
+
+
+def _wrapped_load_model(f: _load_model_wrapper[M, T]):
+    @functools.wraps(f)
+    def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
+        # prepend the decls and merge the attrs stored in ``self.llm_parameters``;
+        # attrs passed at call time take precedence over the stored ones
+        (model_decls, model_attrs), _ = self.llm_parameters
+        decls = (*model_decls, *decls)
+        attrs = {**model_attrs, **attrs}
+        return f(self, *decls, **attrs)
+
+    return wrapper
+
+
+def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]):
+    @functools.wraps(f)
+    def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
+        _, model_tokenizer_attrs = self.llm_parameters
+        tokenizer_attrs = {**model_tokenizer_attrs, **tokenizer_attrs}
+        return f(self, **tokenizer_attrs)
+
+    return wrapper
+
+
+def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]:
+    @functools.wraps(f)
+    def wrapper(self: LLM[M, T]):
+        _default_post_init(self)
+        f(self)
+
+    return wrapper
+
+
+def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
+    """Build the function that installs the wrapped hooks and ``__llm_*__`` cache slots on ``cls``.
+
+    Hooks left at the ``LLMInterface`` default fall back to ``openllm.serialisation`` (no-op for ``llm_post_init``).
+    """
+    attributes = {
+        "import_model": _wrapped_import_model,
+        "load_model": _wrapped_load_model,
+        "load_tokenizer": _wrapped_load_tokenizer,
+        "llm_post_init": _wrapped_llm_post_init,
+    }
+    anns: DictStrAny = {}
+    globs: DictStrAny = {
+        "cls": cls,
+        "_cached_attribute": attributes,
+        "_cached_getattribute_get": _object_getattribute.__get__,
+        "LLMInterface": LLMInterface,
+        "openllm": openllm,
+    }
+    # the LLMInterface lookup helper is loop-invariant, so it is emitted once at the top of the script
+    lines: ListStr = ["_cached_LLMInterface_getattr=_cached_getattribute_get(LLMInterface)"]
+    cached_func_name = f"_cached_{cls.__name__}_func"
+    for func, impl in attributes.items():
+        impl_name = f"__wrapped_{func}"
+        globs[impl_name] = impl
+        # wrap the hook defined on cls (not the wrapper itself); llm_post_init has no serialisation default
+        fallback = "(lambda self: None)" if func == "llm_post_init" else f"openllm.serialisation.{func}"
+        lines.extend(
+            [
+                f"{cached_func_name}=cls.{func}",
+                f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_getattr('{func}') else {fallback}",
+                _setattr_class(func, f"{impl_name}(_impl_{cls.__name__}_{func})"),
+            ]
+        )
+
+    # cached attribute initialisation
+    interface_anns = codegen.get_annotations(LLMInterface)
+    for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
+        lines.append(_setattr_class(f"__llm_{v}__", None))
+        anns[f"__llm_{v}__"] = interface_anns.get(f"__llm_{v}__")
+
+    if SHOW_CODEGEN:
+        logger.info("Generated script for %s:\n\n%s", cls.__name__, "\n".join(lines))
+
+    return codegen.generate_function(cls, "__assign_attr", lines, args=("cls",), globs=globs, annotations=anns)
+
+
 _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
 
 
@@ -480,46 +593,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
         elif "config_class" not in cd:
             raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
 
-        _custom_import = True
-        if cls.import_model is LLMInterface[M, T].import_model:
-            # using the default import model if no custom import is set
-            _custom_import = False
-            setattr(cls, "import_model", openllm.serialisation.import_model)
-        else:
-            import_func = getattr(cls, "import_model")
-
-            def _wrapped_import_model(
-                self: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any
-            ) -> bentoml.Model:
-                # wrapped around custom init to provide some meta compression
-                # for all decls and attrs
-                (model_decls, model_attrs), _ = self.llm_parameters
-
-                decls = (*model_decls, *decls)
-                attrs = {**model_attrs, **attrs}
-
-                return import_func(self, *decls, trust_remote_code=trust_remote_code, **attrs)
-
-            setattr(cls, "import_model", functools.update_wrapper(_wrapped_import_model, cls.import_model))
-
-        if cls.llm_post_init is LLMInterface[M, T].llm_post_init:
-            # using the default post init if no custom post init is set
-            wrapped_post_init = _default_post_init
-        else:
-            original_post_init = getattr(cls, "llm_post_init")
-
-            def wrapped_post_init(self: LLM[M, T]) -> None:
-                _default_post_init(self)
-                original_post_init(self)
-
-        setattr(cls, "llm_post_init", wrapped_post_init)
-
-        cls.__llm_custom_import__ = _custom_import
-        cls.__llm_custom_load__ = False if cls.load_model is LLMInterface[M, T].load_model else True
-        cls.__llm_custom_tokenizer__ = False if cls.load_tokenizer is LLMInterface[M, T].load_tokenizer else True
-
-        for at in {"bentomodel", "model", "tokenizer", "adapter_map"}:
-            setattr(cls, f"__llm_{at}__", None)
+        _make_assignment_script(cls)(cls)
 
         # update docstring for given entrypoint
         for fn in {"generate", "generate_one", "generate_iterator"}:
@@ -546,7 +620,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
                 M,
                 BetterTransformer.reverse(t.cast("transformers.PreTrainedModel", self.__llm_model__)),
             )
-
         openllm.serialisation.save_pretrained(self, save_directory, **attrs)
 
     @classmethod
@@ -997,16 +1070,16 @@ class LLM(LLMInterface[M, T], ReprMixin):
             raise GpuNotAvailableError(f"{self} only supports running with GPU (None available).") from None
 
         if self.__llm_model__ is None:
-            self.__llm_model__ = t.cast(
-                M, openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
-            )
-        return t.cast(M, self.__llm_model__)
+            # NOTE: the signature of load_model here is the wrapper under _wrapped_load_model
+            self.__llm_model__ = self.load_model(*self._model_decls, **self._model_attrs)
+        return self.__llm_model__
 
     @property
     def tokenizer(self) -> T:
         """The tokenizer to use for this LLM. This shouldn't be set at runtime, rather let OpenLLM handle it."""
         if self.__llm_tokenizer__ is None:
-            self.__llm_tokenizer__ = t.cast(T, openllm.serialisation.load_tokenizer(self))
+            # NOTE: the signature of load_tokenizer here is the wrapper under _wrapped_load_tokenizer
+            self.__llm_tokenizer__ = self.load_tokenizer(**self._tokenizer_attrs)
         return self.__llm_tokenizer__
 
     def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig:
@@ -1204,7 +1277,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
         models: list[bentoml.Model] | None = None,
         max_batch_size: int | None = None,
         max_latency_ms: int | None = None,
-        scheduling_strategy: type[Strategy] | None = None,
+        scheduling_strategy: type[bentoml.Strategy] | None = None,
     ) -> LLMRunner:
         """Convert this LLM into a Runner.
 
@@ -1292,6 +1365,7 @@ def Runner(
     model_name: str,
     *,
     model_id: str | None = None,
+    model_version: str | None = ...,
     init_local: t.Literal[False, True] = ...,
     **attrs: t.Any,
 ) -> LLMRunner:
@@ -1303,12 +1377,46 @@ def Runner(
     model_name: str,
     *,
     model_id: str = ...,
+    model_version: str | None = ...,
     models: list[bentoml.Model] | None = ...,
     max_batch_size: int | None = ...,
     max_latency_ms: int | None = ...,
     method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ...,
     embedded: t.Literal[True, False] = ...,
-    scheduling_strategy: type[Strategy] | None = ...,
+    scheduling_strategy: type[bentoml.Strategy] | None = ...,
+    **attrs: t.Any,
+) -> LLMRunner:
+    ...
+
+
+@overload
+def Runner(
+    model_name: str,
+    *,
+    ensure_available: bool | None = None,
+    init_local: bool = ...,
+    implementation: LiteralRuntime | None = None,
+    llm_config: openllm.LLMConfig | None = None,
+    **attrs: t.Any,
+) -> LLMRunner:
+    ...
+
+
+@overload
+def Runner(
+    model_name: str,
+    *args: t.Any,
+    model_id: str | None = ...,
+    model_version: str | None = ...,
+    llm_config: openllm.LLMConfig | None = ...,
+    runtime: t.Literal["ggml", "transformers"] | None = ...,
+    quantize: t.Literal["int8", "int4", "gptq"] | None = ...,
+    bettertransformer: str | bool | None = ...,
+    adapter_id: str | None = ...,
+    adapter_name: str | None = ...,
+    adapter_map: dict[str, str | None] | None = ...,
+    quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
+    serialisation: t.Literal["safetensors", "legacy"] = ...,
     **attrs: t.Any,
 ) -> LLMRunner:
     ...
diff --git a/src/openllm/models/baichuan/modeling_baichuan.py b/src/openllm/models/baichuan/modeling_baichuan.py
index 0398e6b0..058bd2f3 100644
--- a/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/src/openllm/models/baichuan/modeling_baichuan.py
@@ -32,9 +32,6 @@ else:
 class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
     __openllm_internal__ = True
 
-    def llm_post_init(self):
-        self.device = torch.device("cuda")
-
     def sanitize_parameters(
         self,
         prompt: str,
diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py
index 74c6e0e1..b79f1744 100644
--- a/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/src/openllm/models/chatglm/modeling_chatglm.py
@@ -32,9 +32,6 @@ else:
 class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
     __openllm_internal__ = True
 
-    def llm_post_init(self):
-        self.device = torch.device("cuda")
-
     def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
         _, tokenizer_attrs = self.llm_parameters
 
diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
index c2b08358..0f69aaf5 100644
--- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -22,14 +22,12 @@ from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
 from .configuration_dolly_v2 import END_KEY
 from .configuration_dolly_v2 import RESPONSE_KEY
 from .configuration_dolly_v2 import get_special_token_id
-from ...utils import normalize_attrs_to_model_tokenizer_pair
 
 
 if t.TYPE_CHECKING:
     import tensorflow as tf
     import torch
 
-    import bentoml
     import transformers
 else:
     tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
@@ -261,18 +259,10 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
         tokenizer_kwds = {"padding_side": "left"}
         return model_kwds, tokenizer_kwds
 
-    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
-        (_, model_attrs), tokenizer_attrs = self.llm_parameters
-        normalized_model_attrs, normalized_tokenizer_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs)
-        attrs = {**model_attrs, **normalized_model_attrs}
-        tokenizer_attrs = {**tokenizer_attrs, **normalized_tokenizer_attrs}
-        _ref = openllm.serialisation.get(self)
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
         return get_pipeline(
-            model=transformers.AutoModelForCausalLM.from_pretrained(_ref.path, **attrs),
-            tokenizer=transformers.AutoTokenizer.from_pretrained(_ref.path, **tokenizer_attrs),
+            model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
+            tokenizer=self.tokenizer,
             _init=True,
             return_full_text=self.config.return_full_text,
         )
diff --git a/src/openllm/models/falcon/modeling_falcon.py b/src/openllm/models/falcon/modeling_falcon.py
index 893d6f86..8ee85779 100644
--- a/src/openllm/models/falcon/modeling_falcon.py
+++ b/src/openllm/models/falcon/modeling_falcon.py
@@ -24,7 +24,6 @@ from ..._prompt import default_formatter
 if t.TYPE_CHECKING:
     import torch
 
-    import bentoml
     import transformers
 else:
     torch = openllm.utils.LazyLoader("torch", globals(), "torch")
@@ -40,21 +39,6 @@ class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine
         tokenizer_kwds: dict[str, t.Any] = {}
         return model_kwds, tokenizer_kwds
 
-    def llm_post_init(self):
-        self.device = torch.device("cuda")
-
-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
-        trust_remote_code = attrs.pop("trust_remote_code", True)
-        return transformers.AutoModelForCausalLM.from_pretrained(
-            openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs
-        )
-
-    def load_tokenizer(self, tag: bentoml.Tag, **attrs: t.Any) -> t.Any:
-        trust_remote_code = attrs.pop("trust_remote_code", True)
-        return transformers.AutoTokenizer.from_pretrained(
-            openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs
-        )
-
     def sanitize_parameters(
         self,
         prompt: str,
diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py
index f6a0f551..11ef2adc 100644
--- a/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -31,9 +31,6 @@ else:
 class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
     __openllm_internal__ = True
 
-    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
     def sanitize_parameters(
         self,
         prompt: str,
diff --git a/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/src/openllm/models/gpt_neox/modeling_gpt_neox.py
index daf42615..8f573e83 100644
--- a/src/openllm/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/openllm/models/gpt_neox/modeling_gpt_neox.py
@@ -25,7 +25,6 @@ from ..._prompt import default_formatter
 if t.TYPE_CHECKING:
     import torch
 
-    import bentoml
     import transformers
 else:
     transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
@@ -77,8 +76,8 @@ class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNe
     def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
         return generation_result[0]
 
-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
-        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs)
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
+        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
         if self.config.use_half_precision:
             model.half()
         return model
diff --git a/src/openllm/models/llama/modeling_llama.py b/src/openllm/models/llama/modeling_llama.py
index a2417f64..94a4b677 100644
--- a/src/openllm/models/llama/modeling_llama.py
+++ b/src/openllm/models/llama/modeling_llama.py
@@ -40,9 +40,6 @@ logger = logging.getLogger(__name__)
 class LlaMA(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
     __openllm_internal__ = True
 
-    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
     def sanitize_parameters(
         self,
         prompt: str,
diff --git a/src/openllm/models/llama/modeling_vllm_llama.py b/src/openllm/models/llama/modeling_vllm_llama.py
index 53cec550..c5fb33b1 100644
--- a/src/openllm/models/llama/modeling_vllm_llama.py
+++ b/src/openllm/models/llama/modeling_vllm_llama.py
@@ -26,7 +26,6 @@ if t.TYPE_CHECKING:
     import torch
     import vllm
 
-    import bentoml
     import transformers
 else:
     transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
@@ -79,8 +78,8 @@ class VLLMLlaMA(openllm.LLM["vllm.LLM", "transformers.LlamaTokenizerFast"]):
     def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
         return generation_result[0]
 
-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
-        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs)
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> t.Any:
+        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
         if self.config.use_half_precision:
             model.half()
         return model
diff --git a/src/openllm/models/mpt/modeling_mpt.py b/src/openllm/models/mpt/modeling_mpt.py
index 7c94d206..e6473c5e 100644
--- a/src/openllm/models/mpt/modeling_mpt.py
+++ b/src/openllm/models/mpt/modeling_mpt.py
@@ -63,7 +63,6 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
     __openllm_internal__ = True
 
     def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 
     @property
@@ -110,12 +109,12 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
         finally:
             torch.cuda.empty_cache()
 
-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
         torch_dtype = attrs.pop("torch_dtype", self.dtype)
         device_map = attrs.pop("device_map", None)
         trust_remote_code = attrs.pop("trust_remote_code", True)
 
-        _ref = bentoml.transformers.get(tag)
+        _ref = bentoml.transformers.get(self.tag)
         config = get_mpt_config(
             _ref.path,
             self.config.max_sequence_length,
diff --git a/src/openllm/models/opt/modeling_opt.py b/src/openllm/models/opt/modeling_opt.py
index 775acf8f..5a97d347 100644
--- a/src/openllm/models/opt/modeling_opt.py
+++ b/src/openllm/models/opt/modeling_opt.py
@@ -39,7 +39,6 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
     __openllm_internal__ = True
 
     def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
     @property
@@ -75,13 +74,10 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
             labels=generate_labels(self),
         )
 
-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
         torch_dtype = attrs.pop("torch_dtype", self.dtype)
-        trust_remote_code = attrs.pop("trust_remote_code", False)
-
-        _ref = bentoml.transformers.get(tag)
         model: transformers.OPTForCausalLM = transformers.AutoModelForCausalLM.from_pretrained(
-            _ref.path, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, **attrs
+            bentoml.transformers.get(self.tag).path, *args, torch_dtype=torch_dtype, **attrs
         )
         return model
 
diff --git a/src/openllm/models/stablelm/modeling_stablelm.py b/src/openllm/models/stablelm/modeling_stablelm.py
index 7945ce39..b31d1768 100644
--- a/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/src/openllm/models/stablelm/modeling_stablelm.py
@@ -37,7 +37,6 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
     __openllm_internal__ = True
 
     def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.bettertransformer = True if not torch.cuda.is_available() else False
 
     @property
diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py
index 7f3a3e85..362ff5d6 100644
--- a/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/src/openllm/models/starcoder/modeling_starcoder.py
@@ -42,9 +42,6 @@ FIM_INDICATOR = "<FILL_HERE>"
 class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
     __openllm_internal__ = True
 
-    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
     @property
     def import_kwargs(self):
         model_kwds = {
diff --git a/src/openllm/serialisation/__init__.py b/src/openllm/serialisation/__init__.py
index 93a756ff..754756d3 100644
--- a/src/openllm/serialisation/__init__.py
+++ b/src/openllm/serialisation/__init__.py
@@ -39,19 +39,25 @@ llm.save_pretrained("./path/to/local-dolly")
 from __future__ import annotations
 import typing as t
 
-import openllm
+import cloudpickle
 
-from .constants import HUB_ATTRS
+import openllm
+from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
+
+from ..exceptions import OpenLLMException
+from ..utils import LazyLoader
 from ..utils import LazyModule
 
 
 if t.TYPE_CHECKING:
     import bentoml
+    import transformers
 
     from .._llm import M
     from .._llm import T
     from .._types import ModelProtocol
-    from .._types import TokenizerProtocol
+else:
+    transformers = LazyLoader("transformers", globals(), "transformers")
 
 
 def import_model(
@@ -87,9 +93,6 @@ def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs
 
 
 def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]:
-    if llm.__llm_custom_load__:
-        hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs}
-        return llm.load_model(llm.tag, *decls, **hub_attrs, **attrs)
     if llm.runtime == "transformers":
         return openllm.transformers.load_model(llm, *decls, **attrs)
     elif llm.runtime == "ggml":
@@ -98,16 +101,37 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
         raise ValueError(f"Unknown runtime: {llm.config['runtime']}")
 
 
-def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
-    if llm.__llm_custom_tokenizer__:
-        (_, _), tokenizer_attrs = llm.llm_parameters
-        return llm.load_tokenizer(llm.tag, **tokenizer_attrs)
-    elif llm.runtime == "transformers":
-        return openllm.transformers.load_tokenizer(llm)
-    elif llm.runtime == "ggml":
-        return openllm.ggml.load_tokenizer(llm)
+def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
+    """Load the tokenizer from BentoML store.
+
+    By default, it will try to find the bentomodel in the store.
+    If the model is not found, it raises a ``bentoml.exceptions.NotFound``.
+    """
+    from .transformers import infer_tokenizers_class_for_llm
+
+    bentomodel_fs = llm._bentomodel._fs
+    if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
+        with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
+            try:
+                tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
+            except KeyError:
+                # This could happen if users implement their own import_model
+                raise OpenLLMException(
+                    "Model does not have tokenizer. Make sure to save \
+                    the tokenizer within the model via 'custom_objects'.\
+                    For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
+                ) from None
     else:
-        raise ValueError(f"Unknown runtime: {llm.config['runtime']}")
+        tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained(
+            bentomodel_fs.getsyspath("/"),
+            trust_remote_code=llm.__llm_trust_remote_code__,
+            **tokenizer_attrs,
+        )
+
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    return tokenizer
 
 
 _extras = {
diff --git a/src/openllm/serialisation/ggml.py b/src/openllm/serialisation/ggml.py
index 79cddb9f..d2b05543 100644
--- a/src/openllm/serialisation/ggml.py
+++ b/src/openllm/serialisation/ggml.py
@@ -82,34 +82,30 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
     raise NotImplementedError("Currently work in progress.")
 
 
-def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
+def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> TokenizerProtocol[T]:
     """Load the tokenizer from BentoML store.
 
     By default, it will try to find the bentomodel whether it is in store..
     If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
     """
-    (_, _), tokenizer_attrs = llm.llm_parameters
-    if llm.__llm_custom_tokenizer__:
-        tokenizer = llm.load_tokenizer(llm.tag, **tokenizer_attrs)
+    bentomodel_fs = llm._bentomodel._fs
+    if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
+        with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
+            try:
+                tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
+            except KeyError:
+                # This could happen if users implement their own import_model
+                raise OpenLLMException(
+                    "Model does not have tokenizer. Make sure to save \
+                    the tokenizer within the model via 'custom_objects'.\
+                    For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
+                ) from None
     else:
-        bentomodel_fs = llm._bentomodel._fs
-        if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
-            with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
-                try:
-                    tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
-                except KeyError:
-                    # This could happen if users implement their own import_model
-                    raise OpenLLMException(
-                        "Model does not have tokenizer. Make sure to save \
-                        the tokenizer within the model via 'custom_objects'.\
-                        For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
-                    ) from None
-        else:
-            tokenizer = transformers.AutoTokenizer.from_pretrained(
-                bentomodel_fs.getsyspath("/"),
-                trust_remote_code=llm.__llm_trust_remote_code__,
-                **tokenizer_attrs,
-            )
+        tokenizer = transformers.AutoTokenizer.from_pretrained(
+            bentomodel_fs.getsyspath("/"),
+            trust_remote_code=llm.__llm_trust_remote_code__,
+            **tokenizer_attrs,
+        )
     return t.cast("TokenizerProtocol[T]", tokenizer)
 
 
diff --git a/src/openllm/serialisation/transformers.py b/src/openllm/serialisation/transformers.py
index 41dd80eb..d85526db 100644
--- a/src/openllm/serialisation/transformers.py
+++ b/src/openllm/serialisation/transformers.py
@@ -18,11 +18,8 @@ import copy
 import importlib
 import typing as t
 
-import cloudpickle
-
 import bentoml
 from bentoml._internal.frameworks.transformers import make_default_signatures
-from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
 from bentoml._internal.models.model import ModelOptions
 
 from .constants import FRAMEWORK_TO_AUTOCLASS_MAPPING
@@ -48,8 +45,6 @@ if t.TYPE_CHECKING:
     from .._llm import M
     from .._llm import T
     from .._types import DictStrAny
-    from .._types import ModelProtocol
-    from .._types import TokenizerProtocol
 else:
     autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
     _transformers = LazyLoader("_transformers", globals(), "transformers")
@@ -77,7 +72,7 @@ def process_transformers_config(
     return config, hub_attrs, attrs
 
 
-def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
+def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> T:
     tokenizer_class = __llm.config["tokenizer_class"]
     if tokenizer_class is None:
         tokenizer_class = "AutoTokenizer"
@@ -138,21 +133,18 @@ def import_model(
         **attrs: Kwargs to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants).
     """
     config, hub_attrs, attrs = process_transformers_config(llm.model_id, trust_remote_code, **attrs)
-
-    # NOTE: get the base args and attrs, then
-    # allow override via import_model
-    (model_decls, model_attrs), tokenizer_attrs = llm.llm_parameters
-    decls = (*model_decls, *decls)
-    attrs = {**model_attrs, **attrs}
-
-    safe_serialisation = llm._serialisation_format == "safetensors"
+    _, tokenizer_attrs = llm.llm_parameters
     quantize_method = llm._quantize_method
-
+    safe_serialisation = first_not_none(
+        attrs.get("safe_serialization"), default=llm._serialisation_format == "safetensors"
+    )
+    if llm.__llm_implementation__ == "vllm":
+        # Disable safe serialization with vLLM
+        safe_serialisation = False
     metadata: DictStrAny = {
         "safe_serialisation": safe_serialisation,
         "_quantize": quantize_method if quantize_method is not None else False,
     }
-
     signatures: DictStrAny = {}
     if quantize_method == "gptq":
         if not is_autogptq_available():
@@ -260,18 +252,13 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo
         raise
 
 
-def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]:
+def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
     """Load the model from BentoML store.
 
     By default, it will try to find check the model in the local store.
     If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
     """
     config, hub_attrs, attrs = process_transformers_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
-    # NOTE: get the base args and attrs, then
-    # allow override via import_model
-    (model_decls, model_attrs), _ = llm.llm_parameters
-    decls = (*model_decls, *decls)
-    attrs = {**model_attrs, **attrs}
     metadata = llm._bentomodel.info.metadata
     safe_serialization = first_not_none(
         t.cast(t.Optional[bool], metadata.get("safe_serialisation", None)),
@@ -285,17 +272,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
             )
         if llm.config["model_type"] != "causal_lm":
             raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-        return t.cast(
-            "ModelProtocol[M]",
-            autogptq.AutoGPTQForCausalLM.from_quantized(
-                llm._bentomodel.path,
-                *decls,
-                quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
-                trust_remote_code=llm.__llm_trust_remote_code__,
-                use_safetensors=safe_serialization,
-                **hub_attrs,
-                **attrs,
-            ),
+        return autogptq.AutoGPTQForCausalLM.from_quantized(
+            llm._bentomodel.path,
+            *decls,
+            quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
+            trust_remote_code=llm.__llm_trust_remote_code__,
+            use_safetensors=safe_serialization,
+            **hub_attrs,
+            **attrs,
         )
     model = infer_autoclass_from_llm_config(llm, config).from_pretrained(
         llm._bentomodel.path,
@@ -316,46 +300,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
             model = model.to("cuda")
         except torch.cuda.OutOfMemoryError as err:
             raise RuntimeError(
-                f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8' for dynamic quantization."
+                f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
             ) from err
     if llm.bettertransformer and llm.__llm_implementation__ == "pt" and not isinstance(model, _transformers.Pipeline):
         # BetterTransformer is currently only supported on PyTorch.
         from optimum.bettertransformer import BetterTransformer
 
         model = BetterTransformer.transform(model)  # type: ignore
-    return t.cast("ModelProtocol[M]", model)
-
-
-def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
-    """Load the tokenizer from BentoML store.
-
-    By default, it will try to find the bentomodel whether it is in store..
-    If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
-    """
-    (_, _), tokenizer_attrs = llm.llm_parameters
-    bentomodel_fs = llm._bentomodel._fs
-    if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
-        with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
-            try:
-                tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
-            except KeyError:
-                # This could happen if users implement their own import_model
-                raise OpenLLMException(
-                    "Model does not have tokenizer. Make sure to save \
-                    the tokenizer within the model via 'custom_objects'.\
-                    For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
-                ) from None
-    else:
-        tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained(
-            bentomodel_fs.getsyspath("/"),
-            trust_remote_code=llm.__llm_trust_remote_code__,
-            **tokenizer_attrs,
-        )
-
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    return tokenizer
+    return t.cast("M", model)
 
 
 def save_pretrained(