perf(serialisation): implement wrapper to reduce callstack (#132)

2026-05-19 14:16:22 -04:00 · 2023-07-22 17:15:03 -04:00
parent ecf31e90b7
commit 19f20c7dad
17 changed files with 259 additions and 224 deletions
--- a/changelog.d/132.breaking.md
+++ b/changelog.d/132.breaking.md
@@ -0,0 +1,4 @@
+Updated the signatures of `load_model` and `load_tokenizer`: they no longer accept a `tag` argument.
+The tag can be accessed via `llm.tag`; if you are using `openllm.serialisation` or `bentoml.transformers`, you can use `self._bentomodel` instead.
+
+Updated the shared serialisation logic to shorten the call stack, saving three levels in the call trace.
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -38,12 +38,15 @@ from bentoml._internal.models.model import ModelSignature

 from ._configuration import AdapterType
 from ._configuration import FineTuneConfig
+from ._configuration import _object_getattribute
+from ._configuration import _setattr_class
 from ._quantisation import infer_quantisation_config
 from .exceptions import ForbiddenAttributeError
 from .exceptions import GpuNotAvailableError
 from .utils import DEBUG
 from .utils import ENV_VARS_TRUE_VALUES
 from .utils import MYPY
+from .utils import SHOW_CODEGEN
 from .utils import EnvVarMixin
 from .utils import LazyLoader
 from .utils import ReprMixin
@@ -77,12 +80,12 @@ if t.TYPE_CHECKING:
    import vllm

    import transformers
-    from bentoml._internal.runner.strategy import Strategy

    from ._configuration import PeftType
    from ._types import AdaptersMapping
    from ._types import AdaptersTuple
    from ._types import DictStrAny
+    from ._types import ListStr
    from ._types import LiteralRuntime
    from ._types import LLMEmbeddings
    from ._types import LLMRunnable
@@ -244,7 +247,7 @@ _reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"}

 M = t.TypeVar(
    "M",
-    bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM]",
+    bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]",
 )
 T = t.TypeVar(
    "T",
@@ -348,10 +351,10 @@ class LLMInterface(ABC, t.Generic[M, T]):
        """
        raise NotImplementedError

-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> M:
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> M:
        """This function can be implemented to override the default load_model behaviour.

-        See falcon for example implementation.
+        See falcon for example implementation. Tag can be accessed via ``self.tag``
        """
        raise NotImplementedError

@@ -394,8 +397,10 @@ class LLMInterface(ABC, t.Generic[M, T]):
    - `OPTForConditionalGeneration` -> `pt`
    - `TFOPTForConditionalGeneration` -> `tf`
    - `FlaxOPTForConditionalGeneration` -> `flax`
+
+    An additional naming for all VLLM backend: VLLMLlaMA -> `vllm`
    """
-    __llm_model__: M | peft.PeftModel | None
+    __llm_model__: M | None
    """A reference to the actual model. Instead of access this directly, you should use `model` property instead."""
    __llm_tokenizer__: T | None
    """A reference to the actual tokenizer. Instead of access this directly, you should use `tokenizer` property instead."""
@@ -404,13 +409,6 @@ class LLMInterface(ABC, t.Generic[M, T]):
    __llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None
    """A reference to the the cached LoRA adapter mapping."""

-    __llm_custom_import__: bool
-    """Whether this LLM has a custom import_model"""
-    __llm_custom_load__: bool
-    """A boolean to determine whether a custom 'load_model' is implemented"""
-    __llm_custom_tokenizer__: bool
-    """A boolean to determine whether a custom 'load_tokenizer' is implemented"""
-
    if t.TYPE_CHECKING and not MYPY:

        def __attrs_init__(
@@ -432,6 +430,121 @@ class LLMInterface(ABC, t.Generic[M, T]):
            """Generated __attrs_init__ for openllm.LLM."""


+if t.TYPE_CHECKING:  # typing-only callable shapes for the hook implementations; never evaluated at runtime
+    _R = t.TypeVar("_R")
+
+    class _import_model_wrapper(t.Generic[_R, M, T]):  # shape of an ``import_model`` implementation
+        def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R:
+            ...
+
+    class _load_model_wrapper(t.Generic[M, T]):  # shape of a ``load_model`` implementation
+        def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
+            ...
+
+    class _load_tokenizer_wrapper(t.Generic[M, T]):  # shape of a ``load_tokenizer`` implementation
+        def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T:
+            ...
+
+    class _llm_post_init_wrapper(t.Generic[M, T]):  # shape of an ``llm_post_init`` hook
+        def __call__(self, llm: LLM[M, T]) -> None:  # hooks return nothing; callers discard the result
+            ...
+
+
+def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]):
+    @functools.wraps(f)
+    def wrapper(
+        self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any
+    ) -> bentoml.Model:
+        trust_remote_code = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__)
+        # Prepend the positional args stored in self.llm_parameters to the call-site ones, and
+        # layer the call-site keyword attrs over the stored ones (call-site values win).
+        (model_decls, model_attrs), _ = self.llm_parameters
+        decls = (*model_decls, *decls)
+        attrs = {**model_attrs, **attrs}
+        return f(self, *decls, trust_remote_code=trust_remote_code, **attrs)
+
+    return wrapper
+
+
+def _wrapped_load_model(f: _load_model_wrapper[M, T]):
+    @functools.wraps(f)
+    def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
+        # Prepend the positional args stored in self.llm_parameters to the call-site ones, and
+        # layer the call-site keyword attrs over the stored ones (call-site values win).
+        (model_decls, model_attrs), _ = self.llm_parameters
+        decls = (*model_decls, *decls)
+        attrs = {**model_attrs, **attrs}
+        return f(self, *decls, **attrs)
+
+    return wrapper
+
+
+def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]):
+    @functools.wraps(f)
+    def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
+        _, model_tokenizer_attrs = self.llm_parameters  # second element holds the stored tokenizer attrs
+        tokenizer_attrs = {**model_tokenizer_attrs, **tokenizer_attrs}  # call-site attrs override stored ones
+        return f(self, **tokenizer_attrs)
+
+    return wrapper
+
+
+def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]:
+    @functools.wraps(f)
+    def wrapper(self: LLM[M, T]):
+        _default_post_init(self)  # the shared post-init always runs before the wrapped hook
+        f(self)  # the hook's return value is discarded
+
+    return wrapper
+
+
+def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
+    """Generate the function that installs the wrapped hooks and resets the cached ``__llm_*__`` attributes on ``cls``."""
+    attributes = {
+        "import_model": _wrapped_import_model,
+        "load_model": _wrapped_load_model,
+        "load_tokenizer": _wrapped_load_tokenizer,
+        "llm_post_init": _wrapped_llm_post_init,
+    }
+    args: ListStr = []
+    anns: DictStrAny = {}
+    # The LLMInterface attribute getter is loop-invariant, so the generated script binds it once up front.
+    lines: ListStr = ["_cached_LLMInterface_getattr=_cached_getattribute_get(LLMInterface)"]
+    globs: DictStrAny = {
+        "cls": cls,
+        "_cached_attribute": attributes,
+        "_cached_getattribute_get": _object_getattribute.__get__,
+        "LLMInterface": LLMInterface,
+        "openllm": openllm,
+    }
+    # hook initialisation: wrap the subclass override when there is one, otherwise a default implementation
+    for func, impl in attributes.items():
+        globs[f"__wrapped_{func}"] = impl
+        impl_name = f"__wrapped_{func}"
+        cached_func_name = f"_cached_{cls.__name__}_func"
+        # The post-init default is a no-op because the wrapper already runs the shared post-init itself.
+        fallback = "(lambda self: None)" if func == "llm_post_init" else f"openllm.serialisation.{func}"
+        func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_getattr('{func}') else {fallback}"
+        lines.extend(
+            [
+                f"{cached_func_name}=cls.{func}",
+                func_call,
+                _setattr_class(func, f"{impl_name}(_impl_{cls.__name__}_{func})"),
+            ]
+        )
+
+    # cached attribute initialisation: reset each cache slot and carry over its LLMInterface annotation
+    interface_anns = codegen.get_annotations(LLMInterface)
+    for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
+        lines.append(_setattr_class(f"__llm_{v}__", None))
+        anns[f"__llm_{v}__"] = interface_anns.get(f"__llm_{v}__")
+
+    if SHOW_CODEGEN:
+        logger.info("Generated script for %s:\n\n%s", cls.__name__, "\n".join(lines))
+
+    return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
+
+
 _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])


@@ -480,46 +593,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
        elif "config_class" not in cd:
            raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")

-        _custom_import = True
-        if cls.import_model is LLMInterface[M, T].import_model:
-            # using the default import model if no custom import is set
-            _custom_import = False
-            setattr(cls, "import_model", openllm.serialisation.import_model)
-        else:
-            import_func = getattr(cls, "import_model")
-
-            def _wrapped_import_model(
-                self: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any
-            ) -> bentoml.Model:
-                # wrapped around custom init to provide some meta compression
-                # for all decls and attrs
-                (model_decls, model_attrs), _ = self.llm_parameters
-
-                decls = (*model_decls, *decls)
-                attrs = {**model_attrs, **attrs}
-
-                return import_func(self, *decls, trust_remote_code=trust_remote_code, **attrs)
-
-            setattr(cls, "import_model", functools.update_wrapper(_wrapped_import_model, cls.import_model))
-
-        if cls.llm_post_init is LLMInterface[M, T].llm_post_init:
-            # using the default post init if no custom post init is set
-            wrapped_post_init = _default_post_init
-        else:
-            original_post_init = getattr(cls, "llm_post_init")
-
-            def wrapped_post_init(self: LLM[M, T]) -> None:
-                _default_post_init(self)
-                original_post_init(self)
-
-        setattr(cls, "llm_post_init", wrapped_post_init)
-
-        cls.__llm_custom_import__ = _custom_import
-        cls.__llm_custom_load__ = False if cls.load_model is LLMInterface[M, T].load_model else True
-        cls.__llm_custom_tokenizer__ = False if cls.load_tokenizer is LLMInterface[M, T].load_tokenizer else True
-
-        for at in {"bentomodel", "model", "tokenizer", "adapter_map"}:
-            setattr(cls, f"__llm_{at}__", None)
+        _make_assignment_script(cls)(cls)

        # update docstring for given entrypoint
        for fn in {"generate", "generate_one", "generate_iterator"}:
@@ -546,7 +620,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
                M,
                BetterTransformer.reverse(t.cast("transformers.PreTrainedModel", self.__llm_model__)),
            )
-
        openllm.serialisation.save_pretrained(self, save_directory, **attrs)

    @classmethod
@@ -997,16 +1070,16 @@ class LLM(LLMInterface[M, T], ReprMixin):
            raise GpuNotAvailableError(f"{self} only supports running with GPU (None available).") from None

        if self.__llm_model__ is None:
-            self.__llm_model__ = t.cast(
-                M, openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
-            )
-        return t.cast(M, self.__llm_model__)
+            # NOTE: the signature of load_model here is the wrapper under _wrapped_load_model
+            self.__llm_model__ = self.load_model(*self._model_decls, **self._model_attrs)
+        return self.__llm_model__

    @property
    def tokenizer(self) -> T:
        """The tokenizer to use for this LLM. This shouldn't be set at runtime, rather let OpenLLM handle it."""
        if self.__llm_tokenizer__ is None:
-            self.__llm_tokenizer__ = t.cast(T, openllm.serialisation.load_tokenizer(self))
+            # NOTE: the signature of load_tokenizer here is the wrapper under _wrapped_load_tokenizer
+            self.__llm_tokenizer__ = self.load_tokenizer(**self._tokenizer_attrs)
        return self.__llm_tokenizer__

    def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig:
@@ -1204,7 +1277,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
        models: list[bentoml.Model] | None = None,
        max_batch_size: int | None = None,
        max_latency_ms: int | None = None,
-        scheduling_strategy: type[Strategy] | None = None,
+        scheduling_strategy: type[bentoml.Strategy] | None = None,
    ) -> LLMRunner:
        """Convert this LLM into a Runner.

@@ -1292,6 +1365,7 @@ def Runner(
    model_name: str,
    *,
    model_id: str | None = None,
+    model_version: str | None = ...,
    init_local: t.Literal[False, True] = ...,
    **attrs: t.Any,
 ) -> LLMRunner:
@@ -1303,12 +1377,46 @@ def Runner(
    model_name: str,
    *,
    model_id: str = ...,
+    model_version: str | None = ...,
    models: list[bentoml.Model] | None = ...,
    max_batch_size: int | None = ...,
    max_latency_ms: int | None = ...,
    method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ...,
    embedded: t.Literal[True, False] = ...,
-    scheduling_strategy: type[Strategy] | None = ...,
+    scheduling_strategy: type[bentoml.Strategy] | None = ...,
+    **attrs: t.Any,
+) -> LLMRunner:
+    ...
+
+
+@overload
+def Runner(
+    model_name: str,
+    *,
+    ensure_available: bool | None = None,
+    init_local: bool = ...,
+    implementation: LiteralRuntime | None = None,
+    llm_config: openllm.LLMConfig | None = None,
+    **attrs: t.Any,
+) -> LLMRunner:
+    ...
+
+
+@overload
+def Runner(
+    model_name: str,
+    *args: t.Any,
+    model_id: str | None = ...,
+    model_version: str | None = ...,
+    llm_config: openllm.LLMConfig | None = ...,
+    runtime: t.Literal["ggml", "transformers"] | None = ...,
+    quantize: t.Literal["int8", "int4", "gptq"] | None = ...,
+    bettertransformer: str | bool | None = ...,
+    adapter_id: str | None = ...,
+    adapter_name: str | None = ...,
+    adapter_map: dict[str, str | None] | None = ...,
+    quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
+    serialisation: t.Literal["safetensors", "legacy"] = ...,
    **attrs: t.Any,
 ) -> LLMRunner:
    ...
--- a/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/src/openllm/models/baichuan/modeling_baichuan.py
@@ -32,9 +32,6 @@ else:
 class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
    __openllm_internal__ = True

-    def llm_post_init(self):
-        self.device = torch.device("cuda")
-
    def sanitize_parameters(
        self,
        prompt: str,
--- a/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/src/openllm/models/chatglm/modeling_chatglm.py
@@ -32,9 +32,6 @@ else:
 class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
    __openllm_internal__ = True

-    def llm_post_init(self):
-        self.device = torch.device("cuda")
-
    def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
        _, tokenizer_attrs = self.llm_parameters

--- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -22,14 +22,12 @@ from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
 from .configuration_dolly_v2 import END_KEY
 from .configuration_dolly_v2 import RESPONSE_KEY
 from .configuration_dolly_v2 import get_special_token_id
-from ...utils import normalize_attrs_to_model_tokenizer_pair


 if t.TYPE_CHECKING:
    import tensorflow as tf
    import torch

-    import bentoml
    import transformers
 else:
    tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
@@ -261,18 +259,10 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
        tokenizer_kwds = {"padding_side": "left"}
        return model_kwds, tokenizer_kwds

-    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
-        (_, model_attrs), tokenizer_attrs = self.llm_parameters
-        normalized_model_attrs, normalized_tokenizer_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs)
-        attrs = {**model_attrs, **normalized_model_attrs}
-        tokenizer_attrs = {**tokenizer_attrs, **normalized_tokenizer_attrs}
-        _ref = openllm.serialisation.get(self)
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
        return get_pipeline(
-            model=transformers.AutoModelForCausalLM.from_pretrained(_ref.path, **attrs),
-            tokenizer=transformers.AutoTokenizer.from_pretrained(_ref.path, **tokenizer_attrs),
+            model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
+            tokenizer=self.tokenizer,
            _init=True,
            return_full_text=self.config.return_full_text,
        )
--- a/src/openllm/models/falcon/modeling_falcon.py
+++ b/src/openllm/models/falcon/modeling_falcon.py
@@ -24,7 +24,6 @@ from ..._prompt import default_formatter
 if t.TYPE_CHECKING:
    import torch

-    import bentoml
    import transformers
 else:
    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
@@ -40,21 +39,6 @@ class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine
        tokenizer_kwds: dict[str, t.Any] = {}
        return model_kwds, tokenizer_kwds

-    def llm_post_init(self):
-        self.device = torch.device("cuda")
-
-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
-        trust_remote_code = attrs.pop("trust_remote_code", True)
-        return transformers.AutoModelForCausalLM.from_pretrained(
-            openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs
-        )
-
-    def load_tokenizer(self, tag: bentoml.Tag, **attrs: t.Any) -> t.Any:
-        trust_remote_code = attrs.pop("trust_remote_code", True)
-        return transformers.AutoTokenizer.from_pretrained(
-            openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs
-        )
-
    def sanitize_parameters(
        self,
        prompt: str,
--- a/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -31,9 +31,6 @@ else:
 class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
    __openllm_internal__ = True

-    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
    def sanitize_parameters(
        self,
        prompt: str,
--- a/src/openllm/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/openllm/models/gpt_neox/modeling_gpt_neox.py
@@ -25,7 +25,6 @@ from ..._prompt import default_formatter
 if t.TYPE_CHECKING:
    import torch

-    import bentoml
    import transformers
 else:
    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
@@ -77,8 +76,8 @@ class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNe
    def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
        return generation_result[0]

-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
-        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs)
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
+        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
        if self.config.use_half_precision:
            model.half()
        return model
--- a/src/openllm/models/llama/modeling_llama.py
+++ b/src/openllm/models/llama/modeling_llama.py
@@ -40,9 +40,6 @@ logger = logging.getLogger(__name__)
 class LlaMA(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
    __openllm_internal__ = True

-    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
    def sanitize_parameters(
        self,
        prompt: str,
--- a/src/openllm/models/llama/modeling_vllm_llama.py
+++ b/src/openllm/models/llama/modeling_vllm_llama.py
@@ -26,7 +26,6 @@ if t.TYPE_CHECKING:
    import torch
    import vllm

-    import bentoml
    import transformers
 else:
    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
@@ -79,8 +78,8 @@ class VLLMLlaMA(openllm.LLM["vllm.LLM", "transformers.LlamaTokenizerFast"]):
    def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
        return generation_result[0]

-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
-        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs)
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> t.Any:
+        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
        if self.config.use_half_precision:
            model.half()
        return model
--- a/src/openllm/models/mpt/modeling_mpt.py
+++ b/src/openllm/models/mpt/modeling_mpt.py
@@ -63,7 +63,6 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
    __openllm_internal__ = True

    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

    @property
@@ -110,12 +109,12 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
        finally:
            torch.cuda.empty_cache()

-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
        torch_dtype = attrs.pop("torch_dtype", self.dtype)
        device_map = attrs.pop("device_map", None)
        trust_remote_code = attrs.pop("trust_remote_code", True)

-        _ref = bentoml.transformers.get(tag)
+        _ref = bentoml.transformers.get(self.tag)
        config = get_mpt_config(
            _ref.path,
            self.config.max_sequence_length,
--- a/src/openllm/models/opt/modeling_opt.py
+++ b/src/openllm/models/opt/modeling_opt.py
@@ -39,7 +39,6 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
    __openllm_internal__ = True

    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    @property
@@ -75,13 +74,10 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
            labels=generate_labels(self),
        )

-    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
        torch_dtype = attrs.pop("torch_dtype", self.dtype)
-        trust_remote_code = attrs.pop("trust_remote_code", False)
-
-        _ref = bentoml.transformers.get(tag)
        model: transformers.OPTForCausalLM = transformers.AutoModelForCausalLM.from_pretrained(
-            _ref.path, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, **attrs
+            bentoml.transformers.get(self.tag).path, *args, torch_dtype=torch_dtype, **attrs
        )
        return model

--- a/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/src/openllm/models/stablelm/modeling_stablelm.py
@@ -37,7 +37,6 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
    __openllm_internal__ = True

    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.bettertransformer = True if not torch.cuda.is_available() else False

    @property
--- a/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/src/openllm/models/starcoder/modeling_starcoder.py
@@ -42,9 +42,6 @@ FIM_INDICATOR = "<FILL_HERE>"
 class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
    __openllm_internal__ = True

-    def llm_post_init(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
    @property
    def import_kwargs(self):
        model_kwds = {
--- a/src/openllm/serialisation/init.py
+++ b/src/openllm/serialisation/init.py
@@ -39,19 +39,25 @@ llm.save_pretrained("./path/to/local-dolly")
 from __future__ import annotations
 import typing as t

-import openllm
+import cloudpickle

-from .constants import HUB_ATTRS
+import openllm
+from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
+
+from ..exceptions import OpenLLMException
+from ..utils import LazyLoader
 from ..utils import LazyModule


 if t.TYPE_CHECKING:
    import bentoml
+    import transformers

    from .._llm import M
    from .._llm import T
    from .._types import ModelProtocol
-    from .._types import TokenizerProtocol
+else:
+    transformers = LazyLoader("transformers", globals(), "transformers")


 def import_model(
@@ -87,9 +93,6 @@ def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs


 def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]:
-    if llm.__llm_custom_load__:
-        hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs}
-        return llm.load_model(llm.tag, *decls, **hub_attrs, **attrs)
    if llm.runtime == "transformers":
        return openllm.transformers.load_model(llm, *decls, **attrs)
    elif llm.runtime == "ggml":
@@ -98,16 +101,37 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
        raise ValueError(f"Unknown runtime: {llm.config['runtime']}")


-def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
-    if llm.__llm_custom_tokenizer__:
-        (_, _), tokenizer_attrs = llm.llm_parameters
-        return llm.load_tokenizer(llm.tag, **tokenizer_attrs)
-    elif llm.runtime == "transformers":
-        return openllm.transformers.load_tokenizer(llm)
-    elif llm.runtime == "ggml":
-        return openllm.ggml.load_tokenizer(llm)
+def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
+    """Load the tokenizer from the BentoML store.
+
+    Prefers the tokenizer pickled in the bentomodel's custom objects, otherwise loads it from the model files.
+    If the model is not found in the store, a ``bentoml.exceptions.NotFound`` is expected to be raised.
+    """
+    from .transformers import infer_tokenizers_class_for_llm
+
+    bentomodel_fs = llm._bentomodel._fs
+    if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
+        with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
+            try:
+                tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]  # NOTE: unpickles store content; only load trusted models
+            except KeyError:
+                # This could happen if users implement their own import_model
+                raise OpenLLMException(
+                    "Model does not have tokenizer. Make sure to save "
+                    "the tokenizer within the model via 'custom_objects'. "
+                    "For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})"
+                ) from None
    else:
-        raise ValueError(f"Unknown runtime: {llm.config['runtime']}")
+        tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained(
+            bentomodel_fs.getsyspath("/"),
+            trust_remote_code=llm.__llm_trust_remote_code__,
+            **tokenizer_attrs,
+        )
+
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    return tokenizer


 _extras = {
--- a/src/openllm/serialisation/ggml.py
+++ b/src/openllm/serialisation/ggml.py
@@ -82,34 +82,30 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
    raise NotImplementedError("Currently work in progress.")


-def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
+def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> TokenizerProtocol[T]:
    """Load the tokenizer from BentoML store.

    By default, it will try to find the bentomodel whether it is in store..
    If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
    """
-    (_, _), tokenizer_attrs = llm.llm_parameters
-    if llm.__llm_custom_tokenizer__:
-        tokenizer = llm.load_tokenizer(llm.tag, **tokenizer_attrs)
+    bentomodel_fs = llm._bentomodel._fs
+    if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
+        with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
+            try:
+                tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]  # NOTE: unpickles store content; only load trusted models
+            except KeyError:
+                # This could happen if users implement their own import_model
+                raise OpenLLMException(
+                    "Model does not have tokenizer. Make sure to save "
+                    "the tokenizer within the model via 'custom_objects'. "
+                    "For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})"
+                ) from None
    else:
-        bentomodel_fs = llm._bentomodel._fs
-        if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
-            with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
-                try:
-                    tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
-                except KeyError:
-                    # This could happen if users implement their own import_model
-                    raise OpenLLMException(
-                        "Model does not have tokenizer. Make sure to save \
-                        the tokenizer within the model via 'custom_objects'.\
-                        For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
-                    ) from None
-        else:
-            tokenizer = transformers.AutoTokenizer.from_pretrained(
-                bentomodel_fs.getsyspath("/"),
-                trust_remote_code=llm.__llm_trust_remote_code__,
-                **tokenizer_attrs,
-            )
+        tokenizer = transformers.AutoTokenizer.from_pretrained(
+            bentomodel_fs.getsyspath("/"),
+            trust_remote_code=llm.__llm_trust_remote_code__,
+            **tokenizer_attrs,
+        )
    return t.cast("TokenizerProtocol[T]", tokenizer)


--- a/src/openllm/serialisation/transformers.py
+++ b/src/openllm/serialisation/transformers.py
@@ -18,11 +18,8 @@ import copy
 import importlib
 import typing as t

-import cloudpickle
-
 import bentoml
 from bentoml._internal.frameworks.transformers import make_default_signatures
-from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
 from bentoml._internal.models.model import ModelOptions

 from .constants import FRAMEWORK_TO_AUTOCLASS_MAPPING
@@ -48,8 +45,6 @@ if t.TYPE_CHECKING:
    from .._llm import M
    from .._llm import T
    from .._types import DictStrAny
-    from .._types import ModelProtocol
-    from .._types import TokenizerProtocol
 else:
    autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
    _transformers = LazyLoader("_transformers", globals(), "transformers")
@@ -77,7 +72,7 @@ def process_transformers_config(
    return config, hub_attrs, attrs


-def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
+def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> T:
    tokenizer_class = __llm.config["tokenizer_class"]
    if tokenizer_class is None:
        tokenizer_class = "AutoTokenizer"
@@ -138,21 +133,18 @@ def import_model(
        **attrs: Kwargs to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants).
    """
    config, hub_attrs, attrs = process_transformers_config(llm.model_id, trust_remote_code, **attrs)
-
-    # NOTE: get the base args and attrs, then
-    # allow override via import_model
-    (model_decls, model_attrs), tokenizer_attrs = llm.llm_parameters
-    decls = (*model_decls, *decls)
-    attrs = {**model_attrs, **attrs}
-
-    safe_serialisation = llm._serialisation_format == "safetensors"
+    _, tokenizer_attrs = llm.llm_parameters
    quantize_method = llm._quantize_method
-
+    safe_serialisation = first_not_none(
+        attrs.get("safe_serialization"), default=llm._serialisation_format == "safetensors"
+    )
+    if llm.__llm_implementation__ == "vllm":
+        # Disable safe serialization with vLLM
+        safe_serialisation = False
    metadata: DictStrAny = {
        "safe_serialisation": safe_serialisation,
        "_quantize": quantize_method if quantize_method is not None else False,
    }
-
    signatures: DictStrAny = {}
    if quantize_method == "gptq":
        if not is_autogptq_available():
@@ -260,18 +252,13 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo
        raise


-def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]:
+def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
    """Load the model from BentoML store.

    By default, it will try to find check the model in the local store.
    If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
    """
    config, hub_attrs, attrs = process_transformers_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
-    # NOTE: get the base args and attrs, then
-    # allow override via import_model
-    (model_decls, model_attrs), _ = llm.llm_parameters
-    decls = (*model_decls, *decls)
-    attrs = {**model_attrs, **attrs}
    metadata = llm._bentomodel.info.metadata
    safe_serialization = first_not_none(
        t.cast(t.Optional[bool], metadata.get("safe_serialisation", None)),
@@ -285,17 +272,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
            )
        if llm.config["model_type"] != "causal_lm":
            raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-        return t.cast(
-            "ModelProtocol[M]",
-            autogptq.AutoGPTQForCausalLM.from_quantized(
-                llm._bentomodel.path,
-                *decls,
-                quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
-                trust_remote_code=llm.__llm_trust_remote_code__,
-                use_safetensors=safe_serialization,
-                **hub_attrs,
-                **attrs,
-            ),
+        return autogptq.AutoGPTQForCausalLM.from_quantized(
+            llm._bentomodel.path,
+            *decls,
+            quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
+            trust_remote_code=llm.__llm_trust_remote_code__,
+            use_safetensors=safe_serialization,
+            **hub_attrs,
+            **attrs,
        )
    model = infer_autoclass_from_llm_config(llm, config).from_pretrained(
        llm._bentomodel.path,
@@ -316,46 +300,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
            model = model.to("cuda")
        except torch.cuda.OutOfMemoryError as err:
            raise RuntimeError(
-                f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8' for dynamic quantization."
+                f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
            ) from err
    if llm.bettertransformer and llm.__llm_implementation__ == "pt" and not isinstance(model, _transformers.Pipeline):
        # BetterTransformer is currently only supported on PyTorch.
        from optimum.bettertransformer import BetterTransformer

        model = BetterTransformer.transform(model)  # type: ignore
-    return t.cast("ModelProtocol[M]", model)
-
-
-def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
-    """Load the tokenizer from BentoML store.
-
-    By default, it will try to find the bentomodel whether it is in store..
-    If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
-    """
-    (_, _), tokenizer_attrs = llm.llm_parameters
-    bentomodel_fs = llm._bentomodel._fs
-    if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
-        with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
-            try:
-                tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
-            except KeyError:
-                # This could happen if users implement their own import_model
-                raise OpenLLMException(
-                    "Model does not have tokenizer. Make sure to save \
-                    the tokenizer within the model via 'custom_objects'.\
-                    For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
-                ) from None
-    else:
-        tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained(
-            bentomodel_fs.getsyspath("/"),
-            trust_remote_code=llm.__llm_trust_remote_code__,
-            **tokenizer_attrs,
-        )
-
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    return tokenizer
+    return t.cast("M", model)


 def save_pretrained(