From 19f20c7dad2b391c1b4867bae950ada6fc58387d Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sat, 22 Jul 2023 17:15:03 -0400 Subject: [PATCH] perf(serialisation): implement wrapper to reduce callstack (#132) --- changelog.d/132.breaking.md | 4 + src/openllm/_llm.py | 228 +++++++++++++----- .../models/baichuan/modeling_baichuan.py | 3 - .../models/chatglm/modeling_chatglm.py | 3 - .../models/dolly_v2/modeling_dolly_v2.py | 16 +- src/openllm/models/falcon/modeling_falcon.py | 16 -- .../models/flan_t5/modeling_flan_t5.py | 3 - .../models/gpt_neox/modeling_gpt_neox.py | 5 +- src/openllm/models/llama/modeling_llama.py | 3 - .../models/llama/modeling_vllm_llama.py | 5 +- src/openllm/models/mpt/modeling_mpt.py | 5 +- src/openllm/models/opt/modeling_opt.py | 8 +- .../models/stablelm/modeling_stablelm.py | 1 - .../models/starcoder/modeling_starcoder.py | 3 - src/openllm/serialisation/__init__.py | 54 +++-- src/openllm/serialisation/ggml.py | 40 ++- src/openllm/serialisation/transformers.py | 86 ++----- 17 files changed, 259 insertions(+), 224 deletions(-) create mode 100644 changelog.d/132.breaking.md diff --git a/changelog.d/132.breaking.md b/changelog.d/132.breaking.md new file mode 100644 index 00000000..4ae5b446 --- /dev/null +++ b/changelog.d/132.breaking.md @@ -0,0 +1,4 @@ +Updated the signatures of `load_model` and `load_tokenizer` so that they no longer accept a tag. +The tag can be accessed via `llm.tag`; when using `openllm.serialisation` or `bentoml.transformers`, you can use `self._bentomodel` instead. + +Updated the shared serialisation logic to reduce the call stack, saving three calls in the call trace. 
diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index 5b58d19a..e1210054 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -38,12 +38,15 @@ from bentoml._internal.models.model import ModelSignature from ._configuration import AdapterType from ._configuration import FineTuneConfig +from ._configuration import _object_getattribute +from ._configuration import _setattr_class from ._quantisation import infer_quantisation_config from .exceptions import ForbiddenAttributeError from .exceptions import GpuNotAvailableError from .utils import DEBUG from .utils import ENV_VARS_TRUE_VALUES from .utils import MYPY +from .utils import SHOW_CODEGEN from .utils import EnvVarMixin from .utils import LazyLoader from .utils import ReprMixin @@ -77,12 +80,12 @@ if t.TYPE_CHECKING: import vllm import transformers - from bentoml._internal.runner.strategy import Strategy from ._configuration import PeftType from ._types import AdaptersMapping from ._types import AdaptersTuple from ._types import DictStrAny + from ._types import ListStr from ._types import LiteralRuntime from ._types import LLMEmbeddings from ._types import LLMRunnable @@ -244,7 +247,7 @@ _reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"} M = t.TypeVar( "M", - bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM]", + bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]", ) T = t.TypeVar( "T", @@ -348,10 +351,10 @@ class LLMInterface(ABC, t.Generic[M, T]): """ raise NotImplementedError - def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> M: + def load_model(self, *args: t.Any, **attrs: t.Any) -> M: """This function can be implemented to override the default load_model behaviour. - See falcon for example implementation. 
+ See falcon for example implementation. Tag can be accessed via ``self.tag`` """ raise NotImplementedError @@ -394,8 +397,10 @@ class LLMInterface(ABC, t.Generic[M, T]): - `OPTForConditionalGeneration` -> `pt` - `TFOPTForConditionalGeneration` -> `tf` - `FlaxOPTForConditionalGeneration` -> `flax` + + An additional naming for all VLLM backend: VLLMLlaMA -> `vllm` """ - __llm_model__: M | peft.PeftModel | None + __llm_model__: M | None """A reference to the actual model. Instead of access this directly, you should use `model` property instead.""" __llm_tokenizer__: T | None """A reference to the actual tokenizer. Instead of access this directly, you should use `tokenizer` property instead.""" @@ -404,13 +409,6 @@ class LLMInterface(ABC, t.Generic[M, T]): __llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None """A reference to the the cached LoRA adapter mapping.""" - __llm_custom_import__: bool - """Whether this LLM has a custom import_model""" - __llm_custom_load__: bool - """A boolean to determine whether a custom 'load_model' is implemented""" - __llm_custom_tokenizer__: bool - """A boolean to determine whether a custom 'load_tokenizer' is implemented""" - if t.TYPE_CHECKING and not MYPY: def __attrs_init__( @@ -432,6 +430,121 @@ class LLMInterface(ABC, t.Generic[M, T]): """Generated __attrs_init__ for openllm.LLM.""" +if t.TYPE_CHECKING: + _R = t.TypeVar("_R") + + class _import_model_wrapper(t.Generic[_R, M, T]): + def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R: + ... + + class _load_model_wrapper(t.Generic[M, T]): + def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: + ... + + class _load_tokenizer_wrapper(t.Generic[M, T]): + def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T: + ... + + class _llm_post_init_wrapper(t.Generic[M, T]): + def __call__(self, llm: LLM[M, T]) -> T: + ... 
+ + +def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]): + @functools.wraps(f) + def wrapper( + self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any + ) -> bentoml.Model: + trust_remote_code = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__) + # wrapped around custom init to provide some meta compression + # for all decls and attrs + (model_decls, model_attrs), _ = self.llm_parameters + decls = (*model_decls, *decls) + attrs = {**model_attrs, **attrs} + return f(self, *decls, trust_remote_code=trust_remote_code, **attrs) + + return wrapper + + +def _wrapped_load_model(f: _load_model_wrapper[M, T]): + @functools.wraps(f) + def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: + # wrapped around custom init to provide some meta compression + # for all decls and attrs + (model_decls, model_attrs), _ = self.llm_parameters + decls = (*model_decls, *decls) + attrs = {**model_attrs, **attrs} + return f(self, *decls, **attrs) + + return wrapper + + +def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]): + @functools.wraps(f) + def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T: + _, model_tokenizer_attrs = self.llm_parameters + tokenizer_attrs = {**model_tokenizer_attrs, **tokenizer_attrs} + return f(self, **tokenizer_attrs) + + return wrapper + + +def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]: + @functools.wraps(f) + def wrapper(self: LLM[M, T]): + _default_post_init(self) + f(self) + + return wrapper + + +def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]: + attributes = { + "import_model": _wrapped_import_model, + "load_model": _wrapped_load_model, + "load_tokenizer": _wrapped_load_tokenizer, + "llm_post_init": _wrapped_llm_post_init, + } + args: ListStr = [] + anns: DictStrAny = {} + lines: ListStr = [] + globs: DictStrAny = { + "cls": cls, + "_cached_attribute": attributes, + 
"_cached_getattribute_get": _object_getattribute.__get__, + "LLMInterface": LLMInterface, + "openllm": openllm, + } + # function initialisation + for func, impl in attributes.items(): + globs[f"__wrapped_{func}"] = impl + impl_name = f"__wrapped_{func}" + cached_func_name = f"_cached_{cls.__name__}_func" + if func == "llm_post_init": + func_call = f"_impl_{cls.__name__}_{func}={impl_name}" + else: + func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_getattr('{func}') else openllm.serialisation.{func}" + lines.extend( + [ + "_cached_LLMInterface_getattr=_cached_getattribute_get(LLMInterface)", + f"{cached_func_name}=cls.{func}", + func_call, + _setattr_class(func, f"{impl_name}(_impl_{cls.__name__}_{func})"), + ] + ) + + # cached attribute initialisation + interface_anns = codegen.get_annotations(LLMInterface) + for v in {"bentomodel", "model", "tokenizer", "adapter_map"}: + lines.append(_setattr_class(f"__llm_{v}__", None)) + anns[f"__llm_{v}__"] = interface_anns.get("__llm_{v}__") + + if SHOW_CODEGEN: + logger.info("Generated script for %s:\n\n%s", cls.__name__, "\n".join(lines)) + + return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=anns) + + _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"]) @@ -480,46 +593,7 @@ class LLM(LLMInterface[M, T], ReprMixin): elif "config_class" not in cd: raise RuntimeError("Missing required key 'config_class'. 
Make sure to define it within the LLM subclass.") - _custom_import = True - if cls.import_model is LLMInterface[M, T].import_model: - # using the default import model if no custom import is set - _custom_import = False - setattr(cls, "import_model", openllm.serialisation.import_model) - else: - import_func = getattr(cls, "import_model") - - def _wrapped_import_model( - self: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any - ) -> bentoml.Model: - # wrapped around custom init to provide some meta compression - # for all decls and attrs - (model_decls, model_attrs), _ = self.llm_parameters - - decls = (*model_decls, *decls) - attrs = {**model_attrs, **attrs} - - return import_func(self, *decls, trust_remote_code=trust_remote_code, **attrs) - - setattr(cls, "import_model", functools.update_wrapper(_wrapped_import_model, cls.import_model)) - - if cls.llm_post_init is LLMInterface[M, T].llm_post_init: - # using the default post init if no custom post init is set - wrapped_post_init = _default_post_init - else: - original_post_init = getattr(cls, "llm_post_init") - - def wrapped_post_init(self: LLM[M, T]) -> None: - _default_post_init(self) - original_post_init(self) - - setattr(cls, "llm_post_init", wrapped_post_init) - - cls.__llm_custom_import__ = _custom_import - cls.__llm_custom_load__ = False if cls.load_model is LLMInterface[M, T].load_model else True - cls.__llm_custom_tokenizer__ = False if cls.load_tokenizer is LLMInterface[M, T].load_tokenizer else True - - for at in {"bentomodel", "model", "tokenizer", "adapter_map"}: - setattr(cls, f"__llm_{at}__", None) + _make_assignment_script(cls)(cls) # update docstring for given entrypoint for fn in {"generate", "generate_one", "generate_iterator"}: @@ -546,7 +620,6 @@ class LLM(LLMInterface[M, T], ReprMixin): M, BetterTransformer.reverse(t.cast("transformers.PreTrainedModel", self.__llm_model__)), ) - openllm.serialisation.save_pretrained(self, save_directory, **attrs) @classmethod @@ -997,16 +1070,16 
@@ class LLM(LLMInterface[M, T], ReprMixin): raise GpuNotAvailableError(f"{self} only supports running with GPU (None available).") from None if self.__llm_model__ is None: - self.__llm_model__ = t.cast( - M, openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs) - ) - return t.cast(M, self.__llm_model__) + # NOTE: the signature of load_model here is the wrapper under _wrapped_load_model + self.__llm_model__ = self.load_model(*self._model_decls, **self._model_attrs) + return self.__llm_model__ @property def tokenizer(self) -> T: """The tokenizer to use for this LLM. This shouldn't be set at runtime, rather let OpenLLM handle it.""" if self.__llm_tokenizer__ is None: - self.__llm_tokenizer__ = t.cast(T, openllm.serialisation.load_tokenizer(self)) + # NOTE: the signature of load_tokenizer here is the wrapper under _wrapped_load_tokenizer + self.__llm_tokenizer__ = self.load_tokenizer(**self._tokenizer_attrs) return self.__llm_tokenizer__ def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig: @@ -1204,7 +1277,7 @@ class LLM(LLMInterface[M, T], ReprMixin): models: list[bentoml.Model] | None = None, max_batch_size: int | None = None, max_latency_ms: int | None = None, - scheduling_strategy: type[Strategy] | None = None, + scheduling_strategy: type[bentoml.Strategy] | None = None, ) -> LLMRunner: """Convert this LLM into a Runner. 
@@ -1292,6 +1365,7 @@ def Runner( model_name: str, *, model_id: str | None = None, + model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any, ) -> LLMRunner: @@ -1303,12 +1377,46 @@ def Runner( model_name: str, *, model_id: str = ..., + model_version: str | None = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ..., embedded: t.Literal[True, False] = ..., - scheduling_strategy: type[Strategy] | None = ..., + scheduling_strategy: type[bentoml.Strategy] | None = ..., + **attrs: t.Any, +) -> LLMRunner: + ... + + +@overload +def Runner( + model_name: str, + *, + ensure_available: bool | None = None, + init_local: bool = ..., + implementation: LiteralRuntime | None = None, + llm_config: openllm.LLMConfig | None = None, + **attrs: t.Any, +) -> LLMRunner: + ... + + +@overload +def Runner( + model_name: str, + *args: t.Any, + model_id: str | None = ..., + model_version: str | None = ..., + llm_config: openllm.LLMConfig | None = ..., + runtime: t.Literal["ggml", "transformers"] | None = ..., + quantize: t.Literal["int8", "int4", "gptq"] | None = ..., + bettertransformer: str | bool | None = ..., + adapter_id: str | None = ..., + adapter_name: str | None = ..., + adapter_map: dict[str, str | None] | None = ..., + quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, + serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any, ) -> LLMRunner: ... 
diff --git a/src/openllm/models/baichuan/modeling_baichuan.py b/src/openllm/models/baichuan/modeling_baichuan.py index 0398e6b0..058bd2f3 100644 --- a/src/openllm/models/baichuan/modeling_baichuan.py +++ b/src/openllm/models/baichuan/modeling_baichuan.py @@ -32,9 +32,6 @@ else: class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]): __openllm_internal__ = True - def llm_post_init(self): - self.device = torch.device("cuda") - def sanitize_parameters( self, prompt: str, diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py index 74c6e0e1..b79f1744 100644 --- a/src/openllm/models/chatglm/modeling_chatglm.py +++ b/src/openllm/models/chatglm/modeling_chatglm.py @@ -32,9 +32,6 @@ else: class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]): __openllm_internal__ = True - def llm_post_init(self): - self.device = torch.device("cuda") - def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model: _, tokenizer_attrs = self.llm_parameters diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py index c2b08358..0f69aaf5 100644 --- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -22,14 +22,12 @@ from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE from .configuration_dolly_v2 import END_KEY from .configuration_dolly_v2 import RESPONSE_KEY from .configuration_dolly_v2 import get_special_token_id -from ...utils import normalize_attrs_to_model_tokenizer_pair if t.TYPE_CHECKING: import tensorflow as tf import torch - import bentoml import transformers else: tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow") @@ -261,18 +259,10 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken tokenizer_kwds = {"padding_side": "left"} return model_kwds, 
tokenizer_kwds - def llm_post_init(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: - (_, model_attrs), tokenizer_attrs = self.llm_parameters - normalized_model_attrs, normalized_tokenizer_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs) - attrs = {**model_attrs, **normalized_model_attrs} - tokenizer_attrs = {**tokenizer_attrs, **normalized_tokenizer_attrs} - _ref = openllm.serialisation.get(self) + def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: return get_pipeline( - model=transformers.AutoModelForCausalLM.from_pretrained(_ref.path, **attrs), - tokenizer=transformers.AutoTokenizer.from_pretrained(_ref.path, **tokenizer_attrs), + model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), + tokenizer=self.tokenizer, _init=True, return_full_text=self.config.return_full_text, ) diff --git a/src/openllm/models/falcon/modeling_falcon.py b/src/openllm/models/falcon/modeling_falcon.py index 893d6f86..8ee85779 100644 --- a/src/openllm/models/falcon/modeling_falcon.py +++ b/src/openllm/models/falcon/modeling_falcon.py @@ -24,7 +24,6 @@ from ..._prompt import default_formatter if t.TYPE_CHECKING: import torch - import bentoml import transformers else: torch = openllm.utils.LazyLoader("torch", globals(), "torch") @@ -40,21 +39,6 @@ class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine tokenizer_kwds: dict[str, t.Any] = {} return model_kwds, tokenizer_kwds - def llm_post_init(self): - self.device = torch.device("cuda") - - def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any: - trust_remote_code = attrs.pop("trust_remote_code", True) - return transformers.AutoModelForCausalLM.from_pretrained( - openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs - ) - - def load_tokenizer(self, tag: 
bentoml.Tag, **attrs: t.Any) -> t.Any: - trust_remote_code = attrs.pop("trust_remote_code", True) - return transformers.AutoTokenizer.from_pretrained( - openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs - ) - def sanitize_parameters( self, prompt: str, diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py index f6a0f551..11ef2adc 100644 --- a/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -31,9 +31,6 @@ else: class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]): __openllm_internal__ = True - def llm_post_init(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - def sanitize_parameters( self, prompt: str, diff --git a/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/src/openllm/models/gpt_neox/modeling_gpt_neox.py index daf42615..8f573e83 100644 --- a/src/openllm/models/gpt_neox/modeling_gpt_neox.py +++ b/src/openllm/models/gpt_neox/modeling_gpt_neox.py @@ -25,7 +25,6 @@ from ..._prompt import default_formatter if t.TYPE_CHECKING: import torch - import bentoml import transformers else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") @@ -77,8 +76,8 @@ class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNe def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] - def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any: - model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs) + def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM: + model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs) if self.config.use_half_precision: model.half() return model diff --git 
a/src/openllm/models/llama/modeling_llama.py b/src/openllm/models/llama/modeling_llama.py index a2417f64..94a4b677 100644 --- a/src/openllm/models/llama/modeling_llama.py +++ b/src/openllm/models/llama/modeling_llama.py @@ -40,9 +40,6 @@ logger = logging.getLogger(__name__) class LlaMA(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]): __openllm_internal__ = True - def llm_post_init(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - def sanitize_parameters( self, prompt: str, diff --git a/src/openllm/models/llama/modeling_vllm_llama.py b/src/openllm/models/llama/modeling_vllm_llama.py index 53cec550..c5fb33b1 100644 --- a/src/openllm/models/llama/modeling_vllm_llama.py +++ b/src/openllm/models/llama/modeling_vllm_llama.py @@ -26,7 +26,6 @@ if t.TYPE_CHECKING: import torch import vllm - import bentoml import transformers else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") @@ -79,8 +78,8 @@ class VLLMLlaMA(openllm.LLM["vllm.LLM", "transformers.LlamaTokenizerFast"]): def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] - def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any: - model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs) + def load_model(self, *args: t.Any, **attrs: t.Any) -> t.Any: + model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs) if self.config.use_half_precision: model.half() return model diff --git a/src/openllm/models/mpt/modeling_mpt.py b/src/openllm/models/mpt/modeling_mpt.py index 7c94d206..e6473c5e 100644 --- a/src/openllm/models/mpt/modeling_mpt.py +++ b/src/openllm/models/mpt/modeling_mpt.py @@ -63,7 +63,6 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken __openllm_internal__ = True def llm_post_init(self): - self.device = 
torch.device("cuda" if torch.cuda.is_available() else "cpu") self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32 @property @@ -110,12 +109,12 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken finally: torch.cuda.empty_cache() - def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel: + def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel: torch_dtype = attrs.pop("torch_dtype", self.dtype) device_map = attrs.pop("device_map", None) trust_remote_code = attrs.pop("trust_remote_code", True) - _ref = bentoml.transformers.get(tag) + _ref = bentoml.transformers.get(self.tag) config = get_mpt_config( _ref.path, self.config.max_sequence_length, diff --git a/src/openllm/models/opt/modeling_opt.py b/src/openllm/models/opt/modeling_opt.py index 775acf8f..5a97d347 100644 --- a/src/openllm/models/opt/modeling_opt.py +++ b/src/openllm/models/opt/modeling_opt.py @@ -39,7 +39,6 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer __openllm_internal__ = True def llm_post_init(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32 @property @@ -75,13 +74,10 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer labels=generate_labels(self), ) - def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM: + def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM: torch_dtype = attrs.pop("torch_dtype", self.dtype) - trust_remote_code = attrs.pop("trust_remote_code", False) - - _ref = bentoml.transformers.get(tag) model: transformers.OPTForCausalLM = transformers.AutoModelForCausalLM.from_pretrained( - _ref.path, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, **attrs + bentoml.transformers.get(self.tag).path, *args, 
torch_dtype=torch_dtype, **attrs ) return model diff --git a/src/openllm/models/stablelm/modeling_stablelm.py b/src/openllm/models/stablelm/modeling_stablelm.py index 7945ce39..b31d1768 100644 --- a/src/openllm/models/stablelm/modeling_stablelm.py +++ b/src/openllm/models/stablelm/modeling_stablelm.py @@ -37,7 +37,6 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN __openllm_internal__ = True def llm_post_init(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.bettertransformer = True if not torch.cuda.is_available() else False @property diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py index 7f3a3e85..362ff5d6 100644 --- a/src/openllm/models/starcoder/modeling_starcoder.py +++ b/src/openllm/models/starcoder/modeling_starcoder.py @@ -42,9 +42,6 @@ FIM_INDICATOR = "" class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]): __openllm_internal__ = True - def llm_post_init(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - @property def import_kwargs(self): model_kwds = { diff --git a/src/openllm/serialisation/__init__.py b/src/openllm/serialisation/__init__.py index 93a756ff..754756d3 100644 --- a/src/openllm/serialisation/__init__.py +++ b/src/openllm/serialisation/__init__.py @@ -39,19 +39,25 @@ llm.save_pretrained("./path/to/local-dolly") from __future__ import annotations import typing as t -import openllm +import cloudpickle -from .constants import HUB_ATTRS +import openllm +from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME + +from ..exceptions import OpenLLMException +from ..utils import LazyLoader from ..utils import LazyModule if t.TYPE_CHECKING: import bentoml + import transformers from .._llm import M from .._llm import T from .._types import ModelProtocol - from .._types import TokenizerProtocol +else: + transformers = 
LazyLoader("transformers", globals(), "transformers") def import_model( @@ -87,9 +93,6 @@ def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]: - if llm.__llm_custom_load__: - hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs} - return llm.load_model(llm.tag, *decls, **hub_attrs, **attrs) if llm.runtime == "transformers": return openllm.transformers.load_model(llm, *decls, **attrs) elif llm.runtime == "ggml": @@ -98,16 +101,37 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod raise ValueError(f"Unknown runtime: {llm.config['runtime']}") -def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]: - if llm.__llm_custom_tokenizer__: - (_, _), tokenizer_attrs = llm.llm_parameters - return llm.load_tokenizer(llm.tag, **tokenizer_attrs) - elif llm.runtime == "transformers": - return openllm.transformers.load_tokenizer(llm) - elif llm.runtime == "ggml": - return openllm.ggml.load_tokenizer(llm) +def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: + """Load the tokenizer from BentoML store. + + By default, it will try to find the bentomodel whether it is in store.. + If model is not found, it will raises a ``bentoml.exceptions.NotFound``. + """ + from .transformers import infer_tokenizers_class_for_llm + + bentomodel_fs = llm._bentomodel._fs + if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME): + with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile: + try: + tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"] + except KeyError: + # This could happen if users implement their own import_model + raise OpenLLMException( + "Model does not have tokenizer. 
Make sure to save \ + the tokenizer within the model via 'custom_objects'.\ + For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))" + ) from None else: - raise ValueError(f"Unknown runtime: {llm.config['runtime']}") + tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained( + bentomodel_fs.getsyspath("/"), + trust_remote_code=llm.__llm_trust_remote_code__, + **tokenizer_attrs, + ) + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer _extras = { diff --git a/src/openllm/serialisation/ggml.py b/src/openllm/serialisation/ggml.py index 79cddb9f..d2b05543 100644 --- a/src/openllm/serialisation/ggml.py +++ b/src/openllm/serialisation/ggml.py @@ -82,34 +82,30 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod raise NotImplementedError("Currently work in progress.") -def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]: +def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> TokenizerProtocol[T]: """Load the tokenizer from BentoML store. By default, it will try to find the bentomodel whether it is in store.. If model is not found, it will raises a ``bentoml.exceptions.NotFound``. """ - (_, _), tokenizer_attrs = llm.llm_parameters - if llm.__llm_custom_tokenizer__: - tokenizer = llm.load_tokenizer(llm.tag, **tokenizer_attrs) + bentomodel_fs = llm._bentomodel._fs + if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME): + with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile: + try: + tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"] + except KeyError: + # This could happen if users implement their own import_model + raise OpenLLMException( + "Model does not have tokenizer. 
Make sure to save \ + the tokenizer within the model via 'custom_objects'.\ + For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))" + ) from None else: - bentomodel_fs = llm._bentomodel._fs - if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME): - with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile: - try: - tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"] - except KeyError: - # This could happen if users implement their own import_model - raise OpenLLMException( - "Model does not have tokenizer. Make sure to save \ - the tokenizer within the model via 'custom_objects'.\ - For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))" - ) from None - else: - tokenizer = transformers.AutoTokenizer.from_pretrained( - bentomodel_fs.getsyspath("/"), - trust_remote_code=llm.__llm_trust_remote_code__, - **tokenizer_attrs, - ) + tokenizer = transformers.AutoTokenizer.from_pretrained( + bentomodel_fs.getsyspath("/"), + trust_remote_code=llm.__llm_trust_remote_code__, + **tokenizer_attrs, + ) return t.cast("TokenizerProtocol[T]", tokenizer) diff --git a/src/openllm/serialisation/transformers.py b/src/openllm/serialisation/transformers.py index 41dd80eb..d85526db 100644 --- a/src/openllm/serialisation/transformers.py +++ b/src/openllm/serialisation/transformers.py @@ -18,11 +18,8 @@ import copy import importlib import typing as t -import cloudpickle - import bentoml from bentoml._internal.frameworks.transformers import make_default_signatures -from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME from bentoml._internal.models.model import ModelOptions from .constants import FRAMEWORK_TO_AUTOCLASS_MAPPING @@ -48,8 +45,6 @@ if t.TYPE_CHECKING: from .._llm import M from .._llm import T from .._types import DictStrAny - from .._types import ModelProtocol - from .._types import TokenizerProtocol else: autogptq = LazyLoader("autogptq", globals(), "auto_gptq") 
_transformers = LazyLoader("_transformers", globals(), "transformers") @@ -77,7 +72,7 @@ def process_transformers_config( return config, hub_attrs, attrs -def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]: +def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> T: tokenizer_class = __llm.config["tokenizer_class"] if tokenizer_class is None: tokenizer_class = "AutoTokenizer" @@ -138,21 +133,18 @@ def import_model( **attrs: Kwargs to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants). """ config, hub_attrs, attrs = process_transformers_config(llm.model_id, trust_remote_code, **attrs) - - # NOTE: get the base args and attrs, then - # allow override via import_model - (model_decls, model_attrs), tokenizer_attrs = llm.llm_parameters - decls = (*model_decls, *decls) - attrs = {**model_attrs, **attrs} - - safe_serialisation = llm._serialisation_format == "safetensors" + _, tokenizer_attrs = llm.llm_parameters quantize_method = llm._quantize_method - + safe_serialisation = first_not_none( + attrs.get("safe_serialization"), default=llm._serialisation_format == "safetensors" + ) + if llm.__llm_implementation__ == "vllm": + # Disable safe serialization with vLLM + safe_serialisation = False metadata: DictStrAny = { "safe_serialisation": safe_serialisation, "_quantize": quantize_method if quantize_method is not None else False, } - signatures: DictStrAny = {} if quantize_method == "gptq": if not is_autogptq_available(): @@ -260,18 +252,13 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo raise -def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]: +def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M: """Load the model from BentoML store. By default, it will try to find check the model in the local store. If model is not found, it will raises a ``bentoml.exceptions.NotFound``. 
""" config, hub_attrs, attrs = process_transformers_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs) - # NOTE: get the base args and attrs, then - # allow override via import_model - (model_decls, model_attrs), _ = llm.llm_parameters - decls = (*model_decls, *decls) - attrs = {**model_attrs, **attrs} metadata = llm._bentomodel.info.metadata safe_serialization = first_not_none( t.cast(t.Optional[bool], metadata.get("safe_serialisation", None)), @@ -285,17 +272,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod ) if llm.config["model_type"] != "causal_lm": raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") - return t.cast( - "ModelProtocol[M]", - autogptq.AutoGPTQForCausalLM.from_quantized( - llm._bentomodel.path, - *decls, - quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), - trust_remote_code=llm.__llm_trust_remote_code__, - use_safetensors=safe_serialization, - **hub_attrs, - **attrs, - ), + return autogptq.AutoGPTQForCausalLM.from_quantized( + llm._bentomodel.path, + *decls, + quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), + trust_remote_code=llm.__llm_trust_remote_code__, + use_safetensors=safe_serialization, + **hub_attrs, + **attrs, ) model = infer_autoclass_from_llm_config(llm, config).from_pretrained( llm._bentomodel.path, @@ -316,46 +300,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod model = model.to("cuda") except torch.cuda.OutOfMemoryError as err: raise RuntimeError( - f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8' for dynamic quantization." + f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization." 
) from err if llm.bettertransformer and llm.__llm_implementation__ == "pt" and not isinstance(model, _transformers.Pipeline): # BetterTransformer is currently only supported on PyTorch. from optimum.bettertransformer import BetterTransformer model = BetterTransformer.transform(model) # type: ignore - return t.cast("ModelProtocol[M]", model) - - -def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]: - """Load the tokenizer from BentoML store. - - By default, it will try to find the bentomodel whether it is in store.. - If model is not found, it will raises a ``bentoml.exceptions.NotFound``. - """ - (_, _), tokenizer_attrs = llm.llm_parameters - bentomodel_fs = llm._bentomodel._fs - if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME): - with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile: - try: - tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"] - except KeyError: - # This could happen if users implement their own import_model - raise OpenLLMException( - "Model does not have tokenizer. Make sure to save \ - the tokenizer within the model via 'custom_objects'.\ - For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))" - ) from None - else: - tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained( - bentomodel_fs.getsyspath("/"), - trust_remote_code=llm.__llm_trust_remote_code__, - **tokenizer_attrs, - ) - - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - return tokenizer + return t.cast("M", model) def save_pretrained(