perf(serialisation): implement wrapper to reduce callstack (#132)

This commit is contained in:
Aaron Pham
2023-07-22 17:15:03 -04:00
committed by GitHub
parent ecf31e90b7
commit 19f20c7dad
17 changed files with 259 additions and 224 deletions

View File

@@ -0,0 +1,4 @@
Updated the signatures of `load_model` and `load_tokenizer` so that they no longer accept a `tag` argument.
The tag can be accessed via `llm.tag`; if you are using `openllm.serialisation` or `bentoml.transformers`, you can use `self._bentomodel` instead.
Refactored the shared serialisation logic to shorten the call stack, removing three calls from the call trace.

View File

@@ -38,12 +38,15 @@ from bentoml._internal.models.model import ModelSignature
from ._configuration import AdapterType
from ._configuration import FineTuneConfig
from ._configuration import _object_getattribute
from ._configuration import _setattr_class
from ._quantisation import infer_quantisation_config
from .exceptions import ForbiddenAttributeError
from .exceptions import GpuNotAvailableError
from .utils import DEBUG
from .utils import ENV_VARS_TRUE_VALUES
from .utils import MYPY
from .utils import SHOW_CODEGEN
from .utils import EnvVarMixin
from .utils import LazyLoader
from .utils import ReprMixin
@@ -77,12 +80,12 @@ if t.TYPE_CHECKING:
import vllm
import transformers
from bentoml._internal.runner.strategy import Strategy
from ._configuration import PeftType
from ._types import AdaptersMapping
from ._types import AdaptersTuple
from ._types import DictStrAny
from ._types import ListStr
from ._types import LiteralRuntime
from ._types import LLMEmbeddings
from ._types import LLMRunnable
@@ -244,7 +247,7 @@ _reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"}
M = t.TypeVar(
"M",
bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM]",
bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]",
)
T = t.TypeVar(
"T",
@@ -348,10 +351,10 @@ class LLMInterface(ABC, t.Generic[M, T]):
"""
raise NotImplementedError
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> M:
def load_model(self, *args: t.Any, **attrs: t.Any) -> M:
"""This function can be implemented to override the default load_model behaviour.
See falcon for example implementation.
See falcon for example implementation. Tag can be accessed via ``self.tag``
"""
raise NotImplementedError
@@ -394,8 +397,10 @@ class LLMInterface(ABC, t.Generic[M, T]):
- `OPTForConditionalGeneration` -> `pt`
- `TFOPTForConditionalGeneration` -> `tf`
- `FlaxOPTForConditionalGeneration` -> `flax`
An additional naming for all VLLM backend: VLLMLlaMA -> `vllm`
"""
__llm_model__: M | peft.PeftModel | None
__llm_model__: M | None
"""A reference to the actual model. Instead of access this directly, you should use `model` property instead."""
__llm_tokenizer__: T | None
"""A reference to the actual tokenizer. Instead of access this directly, you should use `tokenizer` property instead."""
@@ -404,13 +409,6 @@ class LLMInterface(ABC, t.Generic[M, T]):
__llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None
"""A reference to the the cached LoRA adapter mapping."""
__llm_custom_import__: bool
"""Whether this LLM has a custom import_model"""
__llm_custom_load__: bool
"""A boolean to determine whether a custom 'load_model' is implemented"""
__llm_custom_tokenizer__: bool
"""A boolean to determine whether a custom 'load_tokenizer' is implemented"""
if t.TYPE_CHECKING and not MYPY:
def __attrs_init__(
@@ -432,6 +430,121 @@ class LLMInterface(ABC, t.Generic[M, T]):
"""Generated __attrs_init__ for openllm.LLM."""
if t.TYPE_CHECKING:
    # Typing-only callable shapes for the hooks that the ``_wrapped_*`` decorators below accept.
    # Nothing in this branch exists at runtime.
    _R = t.TypeVar("_R")

    class _import_model_wrapper(t.Generic[_R, M, T]):
        """Shape of an ``import_model`` implementation: takes the LLM, extra args and ``trust_remote_code``."""

        def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R:
            ...

    class _load_model_wrapper(t.Generic[M, T]):
        """Shape of a ``load_model`` implementation: takes the LLM plus model args and returns the model."""

        def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
            ...

    class _load_tokenizer_wrapper(t.Generic[M, T]):
        """Shape of a ``load_tokenizer`` implementation: takes the LLM plus keyword args and returns the tokenizer."""

        def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T:
            ...

    class _llm_post_init_wrapper(t.Generic[M, T]):
        """Shape of an ``llm_post_init`` hook: takes the LLM and returns nothing."""

        # The hook's result is discarded by ``_wrapped_llm_post_init``, whose own return type is
        # ``Callable[[LLM[M, T]], None]``, so the hook is typed as returning ``None`` rather than ``T``.
        def __call__(self, llm: LLM[M, T]) -> None:
            ...
def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]):
    """Decorate an ``import_model`` implementation so the LLM's stored model arguments are always applied.

    The returned wrapper passes ``trust_remote_code`` through ``first_not_none`` with
    ``self.__llm_trust_remote_code__`` as the default, places the positional model arguments
    from ``self.llm_parameters`` ahead of the caller's, and lets the caller's keyword
    arguments override the stored ones before delegating to ``f``.
    """

    def wrapper(
        self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any
    ) -> bentoml.Model:
        resolved_trust = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__)
        # llm_parameters is ((model_decls, model_attrs), tokenizer_attrs); only the model half is used.
        model_params, _ = self.llm_parameters
        base_decls, base_attrs = model_params
        merged_attrs = dict(base_attrs)
        merged_attrs.update(attrs)  # caller-supplied keywords win over the stored ones
        return f(self, *base_decls, *decls, trust_remote_code=resolved_trust, **merged_attrs)

    return functools.wraps(f)(wrapper)
def _wrapped_load_model(f: _load_model_wrapper[M, T]):
    """Decorate a ``load_model`` implementation so the LLM's stored model arguments are always applied.

    The returned wrapper places the positional model arguments from ``self.llm_parameters``
    ahead of the caller's and lets the caller's keyword arguments override the stored ones
    before delegating to ``f``.
    """

    def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
        # llm_parameters is ((model_decls, model_attrs), tokenizer_attrs); only the model half is used.
        model_params, _ = self.llm_parameters
        base_decls, base_attrs = model_params
        merged_attrs = dict(base_attrs)
        merged_attrs.update(attrs)  # caller-supplied keywords win over the stored ones
        return f(self, *base_decls, *decls, **merged_attrs)

    return functools.wraps(f)(wrapper)
def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]):
    """Decorate a ``load_tokenizer`` implementation so the LLM's stored tokenizer arguments are always applied.

    The returned wrapper starts from the tokenizer keyword arguments in ``self.llm_parameters``
    and lets the caller's keyword arguments override them before delegating to ``f``.
    """

    def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
        # llm_parameters is (model_parameters, tokenizer_attrs); only the tokenizer half is used.
        stored_attrs = self.llm_parameters[1]
        merged_attrs = dict(stored_attrs)
        merged_attrs.update(tokenizer_attrs)  # caller-supplied keywords win over the stored ones
        return f(self, **merged_attrs)

    return functools.wraps(f)(wrapper)
def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]:
    """Decorate an ``llm_post_init`` hook so ``_default_post_init`` always runs before it.

    The returned wrapper calls ``_default_post_init(self)`` first, then ``f(self)``; the
    result of ``f`` is discarded.
    """

    def wrapper(self: LLM[M, T]):
        # Order matters: the default initialisation runs before the wrapped hook.
        _default_post_init(self)
        f(self)

    return functools.wraps(f)(wrapper)
def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
    """Build the generated ``__assign_attr`` function that wires the serialisation hooks onto ``cls``.

    For each of ``import_model``, ``load_model``, ``load_tokenizer`` and ``llm_post_init`` the
    generated script picks the implementation to use (the one found on ``cls`` unless it is the
    very object stored on ``LLMInterface``, in which case a fallback is used) and emits the
    assignment line produced by ``_setattr_class`` for that implementation passed through its
    ``_wrapped_*`` decorator. It then emits assignments resetting the ``__llm_bentomodel__``,
    ``__llm_model__``, ``__llm_tokenizer__`` and ``__llm_adapter_map__`` slots to ``None``.

    Args:
        cls: The LLM subclass being set up.

    Returns:
        The function produced by ``codegen.generate_function``.
    """

    def _noop_llm_post_init(self: LLM[M, T]) -> None:
        """Fallback hook used when ``cls`` does not override ``llm_post_init``."""

    attributes = {
        "import_model": _wrapped_import_model,
        "load_model": _wrapped_load_model,
        "load_tokenizer": _wrapped_load_tokenizer,
        "llm_post_init": _wrapped_llm_post_init,
    }
    args: ListStr = []
    anns: DictStrAny = {}
    globs: DictStrAny = {
        "cls": cls,
        "_cached_attribute": attributes,
        "_cached_getattribute_get": _object_getattribute.__get__,
        "LLMInterface": LLMInterface,
        "openllm": openllm,
        "__noop_llm_post_init": _noop_llm_post_init,
    }
    # The LLMInterface attribute getter is loop-invariant, so it is emitted once up front
    # instead of once per hook.
    lines: ListStr = ["_cached_LLMInterface_getattr=_cached_getattribute_get(LLMInterface)"]

    # function initialisation
    cached_func_name = f"_cached_{cls.__name__}_func"
    for func, impl in attributes.items():
        impl_name = f"__wrapped_{func}"
        globs[impl_name] = impl
        chosen_name = f"_impl_{cls.__name__}_{func}"
        # Previously the ``llm_post_init`` branch assigned the decorator itself as the
        # implementation, so a subclass's own hook was never invoked. It now follows the same
        # "own implementation unless inherited from LLMInterface" rule as the other hooks, with
        # a no-op fallback so that only ``_default_post_init`` runs when nothing is overridden.
        if func == "llm_post_init":
            fallback = "__noop_llm_post_init"
        else:
            fallback = f"openllm.serialisation.{func}"
        lines.extend(
            [
                f"{cached_func_name}=cls.{func}",
                f"{chosen_name}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_getattr('{func}') else {fallback}",
                _setattr_class(func, f"{impl_name}({chosen_name})"),
            ]
        )

    # cached attribute initialisation
    interface_anns = codegen.get_annotations(LLMInterface)
    # A tuple (not a set) keeps the order of the generated lines stable between runs.
    for v in ("bentomodel", "model", "tokenizer", "adapter_map"):
        slot = f"__llm_{v}__"
        lines.append(_setattr_class(slot, None))
        # The lookup key must be interpolated; the literal "__llm_{v}__" never matched any
        # annotation, so every slot annotation was stored as None.
        anns[slot] = interface_anns.get(slot)

    if SHOW_CODEGEN:
        logger.info("Generated script for %s:\n\n%s", cls.__name__, "\n".join(lines))
    return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
_AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
@@ -480,46 +593,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
elif "config_class" not in cd:
raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
_custom_import = True
if cls.import_model is LLMInterface[M, T].import_model:
# using the default import model if no custom import is set
_custom_import = False
setattr(cls, "import_model", openllm.serialisation.import_model)
else:
import_func = getattr(cls, "import_model")
def _wrapped_import_model(
self: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any
) -> bentoml.Model:
# wrapped around custom init to provide some meta compression
# for all decls and attrs
(model_decls, model_attrs), _ = self.llm_parameters
decls = (*model_decls, *decls)
attrs = {**model_attrs, **attrs}
return import_func(self, *decls, trust_remote_code=trust_remote_code, **attrs)
setattr(cls, "import_model", functools.update_wrapper(_wrapped_import_model, cls.import_model))
if cls.llm_post_init is LLMInterface[M, T].llm_post_init:
# using the default post init if no custom post init is set
wrapped_post_init = _default_post_init
else:
original_post_init = getattr(cls, "llm_post_init")
def wrapped_post_init(self: LLM[M, T]) -> None:
_default_post_init(self)
original_post_init(self)
setattr(cls, "llm_post_init", wrapped_post_init)
cls.__llm_custom_import__ = _custom_import
cls.__llm_custom_load__ = False if cls.load_model is LLMInterface[M, T].load_model else True
cls.__llm_custom_tokenizer__ = False if cls.load_tokenizer is LLMInterface[M, T].load_tokenizer else True
for at in {"bentomodel", "model", "tokenizer", "adapter_map"}:
setattr(cls, f"__llm_{at}__", None)
_make_assignment_script(cls)(cls)
# update docstring for given entrypoint
for fn in {"generate", "generate_one", "generate_iterator"}:
@@ -546,7 +620,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
M,
BetterTransformer.reverse(t.cast("transformers.PreTrainedModel", self.__llm_model__)),
)
openllm.serialisation.save_pretrained(self, save_directory, **attrs)
@classmethod
@@ -997,16 +1070,16 @@ class LLM(LLMInterface[M, T], ReprMixin):
raise GpuNotAvailableError(f"{self} only supports running with GPU (None available).") from None
if self.__llm_model__ is None:
self.__llm_model__ = t.cast(
M, openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
)
return t.cast(M, self.__llm_model__)
# NOTE: the signature of load_model here is the wrapper under _wrapped_load_model
self.__llm_model__ = self.load_model(*self._model_decls, **self._model_attrs)
return self.__llm_model__
@property
def tokenizer(self) -> T:
"""The tokenizer to use for this LLM. This shouldn't be set at runtime, rather let OpenLLM handle it."""
if self.__llm_tokenizer__ is None:
self.__llm_tokenizer__ = t.cast(T, openllm.serialisation.load_tokenizer(self))
# NOTE: the signature of load_tokenizer here is the wrapper under _wrapped_load_tokenizer
self.__llm_tokenizer__ = self.load_tokenizer(**self._tokenizer_attrs)
return self.__llm_tokenizer__
def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig:
@@ -1204,7 +1277,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
models: list[bentoml.Model] | None = None,
max_batch_size: int | None = None,
max_latency_ms: int | None = None,
scheduling_strategy: type[Strategy] | None = None,
scheduling_strategy: type[bentoml.Strategy] | None = None,
) -> LLMRunner:
"""Convert this LLM into a Runner.
@@ -1292,6 +1365,7 @@ def Runner(
model_name: str,
*,
model_id: str | None = None,
model_version: str | None = ...,
init_local: t.Literal[False, True] = ...,
**attrs: t.Any,
) -> LLMRunner:
@@ -1303,12 +1377,46 @@ def Runner(
model_name: str,
*,
model_id: str = ...,
model_version: str | None = ...,
models: list[bentoml.Model] | None = ...,
max_batch_size: int | None = ...,
max_latency_ms: int | None = ...,
method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ...,
embedded: t.Literal[True, False] = ...,
scheduling_strategy: type[Strategy] | None = ...,
scheduling_strategy: type[bentoml.Strategy] | None = ...,
**attrs: t.Any,
) -> LLMRunner:
...
# Typing-only overload of ``Runner``: ``model_name`` followed by the keyword-only options
# ``ensure_available``, ``init_local``, ``implementation`` and ``llm_config``; any further
# keyword arguments are collected in ``attrs``. The body is a stub (``...``).
@overload
def Runner(
model_name: str,
*,
ensure_available: bool | None = None,
init_local: bool = ...,
implementation: LiteralRuntime | None = None,
llm_config: openllm.LLMConfig | None = None,
**attrs: t.Any,
) -> LLMRunner:
...
# Typing-only overload of ``Runner``: ``model_name`` plus arbitrary positional arguments,
# followed by keyword-only options for the model id/version, config, runtime, quantisation,
# bettertransformer, adapters and serialisation format; any further keyword arguments are
# collected in ``attrs``. The body is a stub (``...``).
@overload
def Runner(
model_name: str,
*args: t.Any,
model_id: str | None = ...,
model_version: str | None = ...,
llm_config: openllm.LLMConfig | None = ...,
runtime: t.Literal["ggml", "transformers"] | None = ...,
quantize: t.Literal["int8", "int4", "gptq"] | None = ...,
bettertransformer: str | bool | None = ...,
adapter_id: str | None = ...,
adapter_name: str | None = ...,
adapter_map: dict[str, str | None] | None = ...,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
serialisation: t.Literal["safetensors", "legacy"] = ...,
**attrs: t.Any,
) -> LLMRunner:
...

View File

@@ -32,9 +32,6 @@ else:
class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
__openllm_internal__ = True
def llm_post_init(self):
self.device = torch.device("cuda")
def sanitize_parameters(
self,
prompt: str,

View File

@@ -32,9 +32,6 @@ else:
class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
__openllm_internal__ = True
def llm_post_init(self):
self.device = torch.device("cuda")
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
_, tokenizer_attrs = self.llm_parameters

View File

@@ -22,14 +22,12 @@ from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
from .configuration_dolly_v2 import END_KEY
from .configuration_dolly_v2 import RESPONSE_KEY
from .configuration_dolly_v2 import get_special_token_id
from ...utils import normalize_attrs_to_model_tokenizer_pair
if t.TYPE_CHECKING:
import tensorflow as tf
import torch
import bentoml
import transformers
else:
tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
@@ -261,18 +259,10 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
tokenizer_kwds = {"padding_side": "left"}
return model_kwds, tokenizer_kwds
def llm_post_init(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
(_, model_attrs), tokenizer_attrs = self.llm_parameters
normalized_model_attrs, normalized_tokenizer_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs)
attrs = {**model_attrs, **normalized_model_attrs}
tokenizer_attrs = {**tokenizer_attrs, **normalized_tokenizer_attrs}
_ref = openllm.serialisation.get(self)
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
return get_pipeline(
model=transformers.AutoModelForCausalLM.from_pretrained(_ref.path, **attrs),
tokenizer=transformers.AutoTokenizer.from_pretrained(_ref.path, **tokenizer_attrs),
model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
tokenizer=self.tokenizer,
_init=True,
return_full_text=self.config.return_full_text,
)

View File

@@ -24,7 +24,6 @@ from ..._prompt import default_formatter
if t.TYPE_CHECKING:
import torch
import bentoml
import transformers
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
@@ -40,21 +39,6 @@ class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine
tokenizer_kwds: dict[str, t.Any] = {}
return model_kwds, tokenizer_kwds
def llm_post_init(self):
self.device = torch.device("cuda")
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
trust_remote_code = attrs.pop("trust_remote_code", True)
return transformers.AutoModelForCausalLM.from_pretrained(
openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs
)
def load_tokenizer(self, tag: bentoml.Tag, **attrs: t.Any) -> t.Any:
trust_remote_code = attrs.pop("trust_remote_code", True)
return transformers.AutoTokenizer.from_pretrained(
openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs
)
def sanitize_parameters(
self,
prompt: str,

View File

@@ -31,9 +31,6 @@ else:
class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
__openllm_internal__ = True
def llm_post_init(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def sanitize_parameters(
self,
prompt: str,

View File

@@ -25,7 +25,6 @@ from ..._prompt import default_formatter
if t.TYPE_CHECKING:
import torch
import bentoml
import transformers
else:
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
@@ -77,8 +76,8 @@ class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNe
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
return generation_result[0]
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs)
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
if self.config.use_half_precision:
model.half()
return model

View File

@@ -40,9 +40,6 @@ logger = logging.getLogger(__name__)
class LlaMA(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
__openllm_internal__ = True
def llm_post_init(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def sanitize_parameters(
self,
prompt: str,

View File

@@ -26,7 +26,6 @@ if t.TYPE_CHECKING:
import torch
import vllm
import bentoml
import transformers
else:
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
@@ -79,8 +78,8 @@ class VLLMLlaMA(openllm.LLM["vllm.LLM", "transformers.LlamaTokenizerFast"]):
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
return generation_result[0]
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs)
def load_model(self, *args: t.Any, **attrs: t.Any) -> t.Any:
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
if self.config.use_half_precision:
model.half()
return model

View File

@@ -63,7 +63,6 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
__openllm_internal__ = True
def llm_post_init(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
@property
@@ -110,12 +109,12 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
finally:
torch.cuda.empty_cache()
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
torch_dtype = attrs.pop("torch_dtype", self.dtype)
device_map = attrs.pop("device_map", None)
trust_remote_code = attrs.pop("trust_remote_code", True)
_ref = bentoml.transformers.get(tag)
_ref = bentoml.transformers.get(self.tag)
config = get_mpt_config(
_ref.path,
self.config.max_sequence_length,

View File

@@ -39,7 +39,6 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
__openllm_internal__ = True
def llm_post_init(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
@property
@@ -75,13 +74,10 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
labels=generate_labels(self),
)
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
torch_dtype = attrs.pop("torch_dtype", self.dtype)
trust_remote_code = attrs.pop("trust_remote_code", False)
_ref = bentoml.transformers.get(tag)
model: transformers.OPTForCausalLM = transformers.AutoModelForCausalLM.from_pretrained(
_ref.path, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, **attrs
bentoml.transformers.get(self.tag).path, *args, torch_dtype=torch_dtype, **attrs
)
return model

View File

@@ -37,7 +37,6 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
__openllm_internal__ = True
def llm_post_init(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.bettertransformer = True if not torch.cuda.is_available() else False
@property

View File

@@ -42,9 +42,6 @@ FIM_INDICATOR = "<FILL_HERE>"
class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
__openllm_internal__ = True
def llm_post_init(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@property
def import_kwargs(self):
model_kwds = {

View File

@@ -39,19 +39,25 @@ llm.save_pretrained("./path/to/local-dolly")
from __future__ import annotations
import typing as t
import openllm
import cloudpickle
from .constants import HUB_ATTRS
import openllm
from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
from ..exceptions import OpenLLMException
from ..utils import LazyLoader
from ..utils import LazyModule
if t.TYPE_CHECKING:
import bentoml
import transformers
from .._llm import M
from .._llm import T
from .._types import ModelProtocol
from .._types import TokenizerProtocol
else:
transformers = LazyLoader("transformers", globals(), "transformers")
def import_model(
@@ -87,9 +93,6 @@ def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs
def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]:
if llm.__llm_custom_load__:
hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs}
return llm.load_model(llm.tag, *decls, **hub_attrs, **attrs)
if llm.runtime == "transformers":
return openllm.transformers.load_model(llm, *decls, **attrs)
elif llm.runtime == "ggml":
@@ -98,16 +101,37 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
raise ValueError(f"Unknown runtime: {llm.config['runtime']}")
def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
if llm.__llm_custom_tokenizer__:
(_, _), tokenizer_attrs = llm.llm_parameters
return llm.load_tokenizer(llm.tag, **tokenizer_attrs)
elif llm.runtime == "transformers":
return openllm.transformers.load_tokenizer(llm)
elif llm.runtime == "ggml":
return openllm.ggml.load_tokenizer(llm)
def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
"""Load the tokenizer from BentoML store.
By default, it will try to find the bentomodel whether it is in store..
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
"""
from .transformers import infer_tokenizers_class_for_llm
bentomodel_fs = llm._bentomodel._fs
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
try:
tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
except KeyError:
# This could happen if users implement their own import_model
raise OpenLLMException(
"Model does not have tokenizer. Make sure to save \
the tokenizer within the model via 'custom_objects'.\
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
) from None
else:
raise ValueError(f"Unknown runtime: {llm.config['runtime']}")
tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained(
bentomodel_fs.getsyspath("/"),
trust_remote_code=llm.__llm_trust_remote_code__,
**tokenizer_attrs,
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
return tokenizer
_extras = {

View File

@@ -82,34 +82,30 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
raise NotImplementedError("Currently work in progress.")
def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> TokenizerProtocol[T]:
"""Load the tokenizer from BentoML store.
By default, it will try to find the bentomodel whether it is in store..
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
"""
(_, _), tokenizer_attrs = llm.llm_parameters
if llm.__llm_custom_tokenizer__:
tokenizer = llm.load_tokenizer(llm.tag, **tokenizer_attrs)
bentomodel_fs = llm._bentomodel._fs
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
try:
tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
except KeyError:
# This could happen if users implement their own import_model
raise OpenLLMException(
"Model does not have tokenizer. Make sure to save \
the tokenizer within the model via 'custom_objects'.\
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
) from None
else:
bentomodel_fs = llm._bentomodel._fs
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
try:
tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
except KeyError:
# This could happen if users implement their own import_model
raise OpenLLMException(
"Model does not have tokenizer. Make sure to save \
the tokenizer within the model via 'custom_objects'.\
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
) from None
else:
tokenizer = transformers.AutoTokenizer.from_pretrained(
bentomodel_fs.getsyspath("/"),
trust_remote_code=llm.__llm_trust_remote_code__,
**tokenizer_attrs,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
bentomodel_fs.getsyspath("/"),
trust_remote_code=llm.__llm_trust_remote_code__,
**tokenizer_attrs,
)
return t.cast("TokenizerProtocol[T]", tokenizer)

View File

@@ -18,11 +18,8 @@ import copy
import importlib
import typing as t
import cloudpickle
import bentoml
from bentoml._internal.frameworks.transformers import make_default_signatures
from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
from bentoml._internal.models.model import ModelOptions
from .constants import FRAMEWORK_TO_AUTOCLASS_MAPPING
@@ -48,8 +45,6 @@ if t.TYPE_CHECKING:
from .._llm import M
from .._llm import T
from .._types import DictStrAny
from .._types import ModelProtocol
from .._types import TokenizerProtocol
else:
autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
_transformers = LazyLoader("_transformers", globals(), "transformers")
@@ -77,7 +72,7 @@ def process_transformers_config(
return config, hub_attrs, attrs
def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> T:
tokenizer_class = __llm.config["tokenizer_class"]
if tokenizer_class is None:
tokenizer_class = "AutoTokenizer"
@@ -138,21 +133,18 @@ def import_model(
**attrs: Kwargs to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants).
"""
config, hub_attrs, attrs = process_transformers_config(llm.model_id, trust_remote_code, **attrs)
# NOTE: get the base args and attrs, then
# allow override via import_model
(model_decls, model_attrs), tokenizer_attrs = llm.llm_parameters
decls = (*model_decls, *decls)
attrs = {**model_attrs, **attrs}
safe_serialisation = llm._serialisation_format == "safetensors"
_, tokenizer_attrs = llm.llm_parameters
quantize_method = llm._quantize_method
safe_serialisation = first_not_none(
attrs.get("safe_serialization"), default=llm._serialisation_format == "safetensors"
)
if llm.__llm_implementation__ == "vllm":
# Disable safe serialization with vLLM
safe_serialisation = False
metadata: DictStrAny = {
"safe_serialisation": safe_serialisation,
"_quantize": quantize_method if quantize_method is not None else False,
}
signatures: DictStrAny = {}
if quantize_method == "gptq":
if not is_autogptq_available():
@@ -260,18 +252,13 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo
raise
def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]:
def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
"""Load the model from BentoML store.
By default, it will try to find check the model in the local store.
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
"""
config, hub_attrs, attrs = process_transformers_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
# NOTE: get the base args and attrs, then
# allow override via import_model
(model_decls, model_attrs), _ = llm.llm_parameters
decls = (*model_decls, *decls)
attrs = {**model_attrs, **attrs}
metadata = llm._bentomodel.info.metadata
safe_serialization = first_not_none(
t.cast(t.Optional[bool], metadata.get("safe_serialisation", None)),
@@ -285,17 +272,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
)
if llm.config["model_type"] != "causal_lm":
raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
return t.cast(
"ModelProtocol[M]",
autogptq.AutoGPTQForCausalLM.from_quantized(
llm._bentomodel.path,
*decls,
quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
trust_remote_code=llm.__llm_trust_remote_code__,
use_safetensors=safe_serialization,
**hub_attrs,
**attrs,
),
return autogptq.AutoGPTQForCausalLM.from_quantized(
llm._bentomodel.path,
*decls,
quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
trust_remote_code=llm.__llm_trust_remote_code__,
use_safetensors=safe_serialization,
**hub_attrs,
**attrs,
)
model = infer_autoclass_from_llm_config(llm, config).from_pretrained(
llm._bentomodel.path,
@@ -316,46 +300,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
model = model.to("cuda")
except torch.cuda.OutOfMemoryError as err:
raise RuntimeError(
f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8' for dynamic quantization."
f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
) from err
if llm.bettertransformer and llm.__llm_implementation__ == "pt" and not isinstance(model, _transformers.Pipeline):
# BetterTransformer is currently only supported on PyTorch.
from optimum.bettertransformer import BetterTransformer
model = BetterTransformer.transform(model) # type: ignore
return t.cast("ModelProtocol[M]", model)
def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
"""Load the tokenizer from BentoML store.
By default, it will try to find the bentomodel whether it is in store..
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
"""
(_, _), tokenizer_attrs = llm.llm_parameters
bentomodel_fs = llm._bentomodel._fs
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
try:
tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
except KeyError:
# This could happen if users implement their own import_model
raise OpenLLMException(
"Model does not have tokenizer. Make sure to save \
the tokenizer within the model via 'custom_objects'.\
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
) from None
else:
tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained(
bentomodel_fs.getsyspath("/"),
trust_remote_code=llm.__llm_trust_remote_code__,
**tokenizer_attrs,
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
return tokenizer
return t.cast("M", model)
def save_pretrained(