mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-05-19 14:16:22 -04:00
perf(serialisation): implement wrapper to reduce callstack (#132)
This commit is contained in:
4
changelog.d/132.breaking.md
Normal file
4
changelog.d/132.breaking.md
Normal file
@@ -0,0 +1,4 @@
|
||||
Updated the signatures of `load_model` and `load_tokenizer` so that they no longer accept a `tag` argument.
|
||||
The tag can be accessed via `llm.tag`; when using `openllm.serialisation` or `bentoml.transformers`, you can use `self._bentomodel` instead.
|
||||
|
||||
Updated the shared serialisation logic to shorten the call stack, saving three frames per call trace.
|
||||
@@ -38,12 +38,15 @@ from bentoml._internal.models.model import ModelSignature
|
||||
|
||||
from ._configuration import AdapterType
|
||||
from ._configuration import FineTuneConfig
|
||||
from ._configuration import _object_getattribute
|
||||
from ._configuration import _setattr_class
|
||||
from ._quantisation import infer_quantisation_config
|
||||
from .exceptions import ForbiddenAttributeError
|
||||
from .exceptions import GpuNotAvailableError
|
||||
from .utils import DEBUG
|
||||
from .utils import ENV_VARS_TRUE_VALUES
|
||||
from .utils import MYPY
|
||||
from .utils import SHOW_CODEGEN
|
||||
from .utils import EnvVarMixin
|
||||
from .utils import LazyLoader
|
||||
from .utils import ReprMixin
|
||||
@@ -77,12 +80,12 @@ if t.TYPE_CHECKING:
|
||||
import vllm
|
||||
|
||||
import transformers
|
||||
from bentoml._internal.runner.strategy import Strategy
|
||||
|
||||
from ._configuration import PeftType
|
||||
from ._types import AdaptersMapping
|
||||
from ._types import AdaptersTuple
|
||||
from ._types import DictStrAny
|
||||
from ._types import ListStr
|
||||
from ._types import LiteralRuntime
|
||||
from ._types import LLMEmbeddings
|
||||
from ._types import LLMRunnable
|
||||
@@ -244,7 +247,7 @@ _reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"}
|
||||
|
||||
M = t.TypeVar(
|
||||
"M",
|
||||
bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM]",
|
||||
bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]",
|
||||
)
|
||||
T = t.TypeVar(
|
||||
"T",
|
||||
@@ -348,10 +351,10 @@ class LLMInterface(ABC, t.Generic[M, T]):
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> M:
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> M:
|
||||
"""This function can be implemented to override the default load_model behaviour.
|
||||
|
||||
See falcon for example implementation.
|
||||
See falcon for example implementation. Tag can be accessed via ``self.tag``
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -394,8 +397,10 @@ class LLMInterface(ABC, t.Generic[M, T]):
|
||||
- `OPTForConditionalGeneration` -> `pt`
|
||||
- `TFOPTForConditionalGeneration` -> `tf`
|
||||
- `FlaxOPTForConditionalGeneration` -> `flax`
|
||||
|
||||
An additional naming for all VLLM backend: VLLMLlaMA -> `vllm`
|
||||
"""
|
||||
__llm_model__: M | peft.PeftModel | None
|
||||
__llm_model__: M | None
|
||||
"""A reference to the actual model. Instead of access this directly, you should use `model` property instead."""
|
||||
__llm_tokenizer__: T | None
|
||||
"""A reference to the actual tokenizer. Instead of access this directly, you should use `tokenizer` property instead."""
|
||||
@@ -404,13 +409,6 @@ class LLMInterface(ABC, t.Generic[M, T]):
|
||||
__llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None
|
||||
"""A reference to the cached LoRA adapter mapping."""
|
||||
|
||||
__llm_custom_import__: bool
|
||||
"""Whether this LLM has a custom import_model"""
|
||||
__llm_custom_load__: bool
|
||||
"""A boolean to determine whether a custom 'load_model' is implemented"""
|
||||
__llm_custom_tokenizer__: bool
|
||||
"""A boolean to determine whether a custom 'load_tokenizer' is implemented"""
|
||||
|
||||
if t.TYPE_CHECKING and not MYPY:
|
||||
|
||||
def __attrs_init__(
|
||||
@@ -432,6 +430,121 @@ class LLMInterface(ABC, t.Generic[M, T]):
|
||||
"""Generated __attrs_init__ for openllm.LLM."""
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
    # Type-checking-only helpers: each class describes the call signature of
    # an *unwrapped* user implementation accepted by the matching
    # ``_wrapped_*`` decorator. Nothing in this branch executes at runtime.
    _R = t.TypeVar("_R")

    class _import_model_wrapper(t.Generic[_R, M, T]):
        """Call signature of an ``import_model`` implementation."""

        def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R:
            ...

    class _load_model_wrapper(t.Generic[M, T]):
        """Call signature of a ``load_model`` implementation."""

        def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
            ...

    class _load_tokenizer_wrapper(t.Generic[M, T]):
        """Call signature of a ``load_tokenizer`` implementation."""

        def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T:
            ...

    class _llm_post_init_wrapper(t.Generic[M, T]):
        """Call signature of an ``llm_post_init`` hook."""

        # The hook's result is discarded by ``_wrapped_llm_post_init``, which
        # itself is typed as returning ``None``; the hook is therefore typed
        # ``-> None`` rather than ``-> T``.
        def __call__(self, llm: LLM[M, T]) -> None:
            ...
|
||||
|
||||
|
||||
def _wrapped_import_model(
    f: _import_model_wrapper[bentoml.Model, M, T],
) -> t.Callable[..., bentoml.Model]:
    """Wrap an ``import_model`` implementation so it receives the LLM's stored arguments.

    The returned function has the signature
    ``(self, *decls, trust_remote_code=None, **attrs)`` and forwards to ``f``
    after merging in the values from ``self.llm_parameters``.

    Args:
        f: The ``import_model`` implementation to wrap.

    Returns:
        A function carrying ``f``'s metadata (via ``functools.wraps``) that
        returns whatever ``f`` returns.
    """

    @functools.wraps(f)
    def wrapper(
        self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any
    ) -> bentoml.Model:
        # When the caller does not pass a value, use the one stored on the
        # instance (presumably ``first_not_none`` picks the first non-None
        # argument and otherwise the default -- confirm against its definition).
        trust_remote_code = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__)
        # Stored positional arguments are placed before the caller's, and
        # caller keyword arguments override stored ones on key collision.
        (model_decls, model_attrs), _ = self.llm_parameters
        decls = (*model_decls, *decls)
        attrs = {**model_attrs, **attrs}
        return f(self, *decls, trust_remote_code=trust_remote_code, **attrs)

    return wrapper
|
||||
|
||||
|
||||
def _wrapped_load_model(f: _load_model_wrapper[M, T]):
|
||||
@functools.wraps(f)
|
||||
def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
# wrapped around custom init to provide some meta compression
|
||||
# for all decls and attrs
|
||||
(model_decls, model_attrs), _ = self.llm_parameters
|
||||
decls = (*model_decls, *decls)
|
||||
attrs = {**model_attrs, **attrs}
|
||||
return f(self, *decls, **attrs)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]):
|
||||
@functools.wraps(f)
|
||||
def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
|
||||
_, model_tokenizer_attrs = self.llm_parameters
|
||||
tokenizer_attrs = {**model_tokenizer_attrs, **tokenizer_attrs}
|
||||
return f(self, **tokenizer_attrs)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]:
    """Wrap an ``llm_post_init`` hook so the default post-init always runs first.

    Args:
        f: The ``llm_post_init`` hook to wrap.

    Returns:
        A function ``(self) -> None`` carrying ``f``'s metadata that calls
        ``_default_post_init(self)`` and then ``f(self)``; ``f``'s result is
        discarded.
    """

    @functools.wraps(f)
    def wrapper(self: LLM[M, T]) -> None:
        # Order matters: the default initialisation runs before the hook.
        _default_post_init(self)
        f(self)

    return wrapper
|
||||
|
||||
|
||||
def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
    """Generate the ``__assign_attr`` function that installs wrapped methods on ``cls``.

    The generated script, for each of ``import_model``, ``load_model``,
    ``load_tokenizer`` and ``llm_post_init``, picks an implementation and
    re-assigns the attribute on ``cls`` to that implementation passed through
    the matching ``_wrapped_*`` decorator. It also resets the cached
    ``__llm_bentomodel__``, ``__llm_model__``, ``__llm_tokenizer__`` and
    ``__llm_adapter_map__`` attributes to ``None``.

    Args:
        cls: The LLM subclass the script is generated for.

    Returns:
        The function produced by ``codegen.generate_function``; it is called
        with the class to apply the assignments.
    """
    attributes = {
        "import_model": _wrapped_import_model,
        "load_model": _wrapped_load_model,
        "load_tokenizer": _wrapped_load_tokenizer,
        "llm_post_init": _wrapped_llm_post_init,
    }
    args: ListStr = []
    anns: DictStrAny = {}
    globs: DictStrAny = {
        "cls": cls,
        "_cached_attribute": attributes,
        "_cached_getattribute_get": _object_getattribute.__get__,
        "LLMInterface": LLMInterface,
        "openllm": openllm,
    }
    # The bound LLMInterface attribute getter does not depend on the function
    # being wrapped, so it is emitted once rather than once per function.
    lines: ListStr = ["_cached_LLMInterface_getattr=_cached_getattribute_get(LLMInterface)"]

    # function initialisation
    cached_func_name = f"_cached_{cls.__name__}_func"
    for func, impl in attributes.items():
        impl_name = f"__wrapped_{func}"
        globs[impl_name] = impl
        target = f"_impl_{cls.__name__}_{func}"
        if func == "llm_post_init":
            func_call = f"{target}={impl_name}"
        else:
            # Keep the class's own implementation unless it is still the
            # LLMInterface one; in that case use ``openllm.serialisation.<func>``.
            func_call = (
                f"{target}={cached_func_name} "
                f"if {cached_func_name} is not _cached_LLMInterface_getattr('{func}') "
                f"else openllm.serialisation.{func}"
            )
        lines.extend(
            [
                f"{cached_func_name}=cls.{func}",
                func_call,
                _setattr_class(func, f"{impl_name}({target})"),
            ]
        )

    # cached attribute initialisation
    interface_anns = codegen.get_annotations(LLMInterface)
    # A tuple (not a set) keeps the generated script order deterministic.
    for v in ("bentomodel", "model", "tokenizer", "adapter_map"):
        attr_name = f"__llm_{v}__"
        lines.append(_setattr_class(attr_name, None))
        # The lookup key must be the interpolated attribute name; a plain
        # "__llm_{v}__" string never matches and always yields None.
        anns[attr_name] = interface_anns.get(attr_name)

    if SHOW_CODEGEN:
        logger.info("Generated script for %s:\n\n%s", cls.__name__, "\n".join(lines))

    return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
|
||||
|
||||
|
||||
_AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
|
||||
|
||||
|
||||
@@ -480,46 +593,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
elif "config_class" not in cd:
|
||||
raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
|
||||
|
||||
_custom_import = True
|
||||
if cls.import_model is LLMInterface[M, T].import_model:
|
||||
# using the default import model if no custom import is set
|
||||
_custom_import = False
|
||||
setattr(cls, "import_model", openllm.serialisation.import_model)
|
||||
else:
|
||||
import_func = getattr(cls, "import_model")
|
||||
|
||||
def _wrapped_import_model(
|
||||
self: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any
|
||||
) -> bentoml.Model:
|
||||
# wrapped around custom init to provide some meta compression
|
||||
# for all decls and attrs
|
||||
(model_decls, model_attrs), _ = self.llm_parameters
|
||||
|
||||
decls = (*model_decls, *decls)
|
||||
attrs = {**model_attrs, **attrs}
|
||||
|
||||
return import_func(self, *decls, trust_remote_code=trust_remote_code, **attrs)
|
||||
|
||||
setattr(cls, "import_model", functools.update_wrapper(_wrapped_import_model, cls.import_model))
|
||||
|
||||
if cls.llm_post_init is LLMInterface[M, T].llm_post_init:
|
||||
# using the default post init if no custom post init is set
|
||||
wrapped_post_init = _default_post_init
|
||||
else:
|
||||
original_post_init = getattr(cls, "llm_post_init")
|
||||
|
||||
def wrapped_post_init(self: LLM[M, T]) -> None:
|
||||
_default_post_init(self)
|
||||
original_post_init(self)
|
||||
|
||||
setattr(cls, "llm_post_init", wrapped_post_init)
|
||||
|
||||
cls.__llm_custom_import__ = _custom_import
|
||||
cls.__llm_custom_load__ = False if cls.load_model is LLMInterface[M, T].load_model else True
|
||||
cls.__llm_custom_tokenizer__ = False if cls.load_tokenizer is LLMInterface[M, T].load_tokenizer else True
|
||||
|
||||
for at in {"bentomodel", "model", "tokenizer", "adapter_map"}:
|
||||
setattr(cls, f"__llm_{at}__", None)
|
||||
_make_assignment_script(cls)(cls)
|
||||
|
||||
# update docstring for given entrypoint
|
||||
for fn in {"generate", "generate_one", "generate_iterator"}:
|
||||
@@ -546,7 +620,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
M,
|
||||
BetterTransformer.reverse(t.cast("transformers.PreTrainedModel", self.__llm_model__)),
|
||||
)
|
||||
|
||||
openllm.serialisation.save_pretrained(self, save_directory, **attrs)
|
||||
|
||||
@classmethod
|
||||
@@ -997,16 +1070,16 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
raise GpuNotAvailableError(f"{self} only supports running with GPU (None available).") from None
|
||||
|
||||
if self.__llm_model__ is None:
|
||||
self.__llm_model__ = t.cast(
|
||||
M, openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
|
||||
)
|
||||
return t.cast(M, self.__llm_model__)
|
||||
# NOTE: the signature of load_model here is the wrapper under _wrapped_load_model
|
||||
self.__llm_model__ = self.load_model(*self._model_decls, **self._model_attrs)
|
||||
return self.__llm_model__
|
||||
|
||||
@property
|
||||
def tokenizer(self) -> T:
|
||||
"""The tokenizer to use for this LLM. This shouldn't be set at runtime, rather let OpenLLM handle it."""
|
||||
if self.__llm_tokenizer__ is None:
|
||||
self.__llm_tokenizer__ = t.cast(T, openllm.serialisation.load_tokenizer(self))
|
||||
# NOTE: the signature of load_tokenizer here is the wrapper under _wrapped_load_tokenizer
|
||||
self.__llm_tokenizer__ = self.load_tokenizer(**self._tokenizer_attrs)
|
||||
return self.__llm_tokenizer__
|
||||
|
||||
def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig:
|
||||
@@ -1204,7 +1277,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
models: list[bentoml.Model] | None = None,
|
||||
max_batch_size: int | None = None,
|
||||
max_latency_ms: int | None = None,
|
||||
scheduling_strategy: type[Strategy] | None = None,
|
||||
scheduling_strategy: type[bentoml.Strategy] | None = None,
|
||||
) -> LLMRunner:
|
||||
"""Convert this LLM into a Runner.
|
||||
|
||||
@@ -1292,6 +1365,7 @@ def Runner(
|
||||
model_name: str,
|
||||
*,
|
||||
model_id: str | None = None,
|
||||
model_version: str | None = ...,
|
||||
init_local: t.Literal[False, True] = ...,
|
||||
**attrs: t.Any,
|
||||
) -> LLMRunner:
|
||||
@@ -1303,12 +1377,46 @@ def Runner(
|
||||
model_name: str,
|
||||
*,
|
||||
model_id: str = ...,
|
||||
model_version: str | None = ...,
|
||||
models: list[bentoml.Model] | None = ...,
|
||||
max_batch_size: int | None = ...,
|
||||
max_latency_ms: int | None = ...,
|
||||
method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ...,
|
||||
embedded: t.Literal[True, False] = ...,
|
||||
scheduling_strategy: type[Strategy] | None = ...,
|
||||
scheduling_strategy: type[bentoml.Strategy] | None = ...,
|
||||
**attrs: t.Any,
|
||||
) -> LLMRunner:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def Runner(
|
||||
model_name: str,
|
||||
*,
|
||||
ensure_available: bool | None = None,
|
||||
init_local: bool = ...,
|
||||
implementation: LiteralRuntime | None = None,
|
||||
llm_config: openllm.LLMConfig | None = None,
|
||||
**attrs: t.Any,
|
||||
) -> LLMRunner:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def Runner(
|
||||
model_name: str,
|
||||
*args: t.Any,
|
||||
model_id: str | None = ...,
|
||||
model_version: str | None = ...,
|
||||
llm_config: openllm.LLMConfig | None = ...,
|
||||
runtime: t.Literal["ggml", "transformers"] | None = ...,
|
||||
quantize: t.Literal["int8", "int4", "gptq"] | None = ...,
|
||||
bettertransformer: str | bool | None = ...,
|
||||
adapter_id: str | None = ...,
|
||||
adapter_name: str | None = ...,
|
||||
adapter_map: dict[str, str | None] | None = ...,
|
||||
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
|
||||
serialisation: t.Literal["safetensors", "legacy"] = ...,
|
||||
**attrs: t.Any,
|
||||
) -> LLMRunner:
|
||||
...
|
||||
|
||||
@@ -32,9 +32,6 @@ else:
|
||||
class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda")
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
|
||||
@@ -32,9 +32,6 @@ else:
|
||||
class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda")
|
||||
|
||||
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
|
||||
_, tokenizer_attrs = self.llm_parameters
|
||||
|
||||
|
||||
@@ -22,14 +22,12 @@ from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
|
||||
from .configuration_dolly_v2 import END_KEY
|
||||
from .configuration_dolly_v2 import RESPONSE_KEY
|
||||
from .configuration_dolly_v2 import get_special_token_id
|
||||
from ...utils import normalize_attrs_to_model_tokenizer_pair
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import tensorflow as tf
|
||||
import torch
|
||||
|
||||
import bentoml
|
||||
import transformers
|
||||
else:
|
||||
tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
|
||||
@@ -261,18 +259,10 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
|
||||
tokenizer_kwds = {"padding_side": "left"}
|
||||
return model_kwds, tokenizer_kwds
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
|
||||
(_, model_attrs), tokenizer_attrs = self.llm_parameters
|
||||
normalized_model_attrs, normalized_tokenizer_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs)
|
||||
attrs = {**model_attrs, **normalized_model_attrs}
|
||||
tokenizer_attrs = {**tokenizer_attrs, **normalized_tokenizer_attrs}
|
||||
_ref = openllm.serialisation.get(self)
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
|
||||
return get_pipeline(
|
||||
model=transformers.AutoModelForCausalLM.from_pretrained(_ref.path, **attrs),
|
||||
tokenizer=transformers.AutoTokenizer.from_pretrained(_ref.path, **tokenizer_attrs),
|
||||
model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
|
||||
tokenizer=self.tokenizer,
|
||||
_init=True,
|
||||
return_full_text=self.config.return_full_text,
|
||||
)
|
||||
|
||||
@@ -24,7 +24,6 @@ from ..._prompt import default_formatter
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import bentoml
|
||||
import transformers
|
||||
else:
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
@@ -40,21 +39,6 @@ class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine
|
||||
tokenizer_kwds: dict[str, t.Any] = {}
|
||||
return model_kwds, tokenizer_kwds
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda")
|
||||
|
||||
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
return transformers.AutoModelForCausalLM.from_pretrained(
|
||||
openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs
|
||||
)
|
||||
|
||||
def load_tokenizer(self, tag: bentoml.Tag, **attrs: t.Any) -> t.Any:
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
return transformers.AutoTokenizer.from_pretrained(
|
||||
openllm.serialisation.get(self).path, trust_remote_code=trust_remote_code, **attrs
|
||||
)
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
|
||||
@@ -31,9 +31,6 @@ else:
|
||||
class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
|
||||
@@ -25,7 +25,6 @@ from ..._prompt import default_formatter
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
|
||||
import bentoml
|
||||
import transformers
|
||||
else:
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
@@ -77,8 +76,8 @@ class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNe
|
||||
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs)
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
|
||||
if self.config.use_half_precision:
|
||||
model.half()
|
||||
return model
|
||||
|
||||
@@ -40,9 +40,6 @@ logger = logging.getLogger(__name__)
|
||||
class LlaMA(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
|
||||
@@ -26,7 +26,6 @@ if t.TYPE_CHECKING:
|
||||
import torch
|
||||
import vllm
|
||||
|
||||
import bentoml
|
||||
import transformers
|
||||
else:
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
@@ -79,8 +78,8 @@ class VLLMLlaMA(openllm.LLM["vllm.LLM", "transformers.LlamaTokenizerFast"]):
|
||||
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, **attrs)
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> t.Any:
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
|
||||
if self.config.use_half_precision:
|
||||
model.half()
|
||||
return model
|
||||
|
||||
@@ -63,7 +63,6 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
||||
|
||||
@property
|
||||
@@ -110,12 +109,12 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
|
||||
finally:
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
|
||||
torch_dtype = attrs.pop("torch_dtype", self.dtype)
|
||||
device_map = attrs.pop("device_map", None)
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
|
||||
_ref = bentoml.transformers.get(tag)
|
||||
_ref = bentoml.transformers.get(self.tag)
|
||||
config = get_mpt_config(
|
||||
_ref.path,
|
||||
self.config.max_sequence_length,
|
||||
|
||||
@@ -39,7 +39,6 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
||||
|
||||
@property
|
||||
@@ -75,13 +74,10 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
|
||||
labels=generate_labels(self),
|
||||
)
|
||||
|
||||
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
|
||||
torch_dtype = attrs.pop("torch_dtype", self.dtype)
|
||||
trust_remote_code = attrs.pop("trust_remote_code", False)
|
||||
|
||||
_ref = bentoml.transformers.get(tag)
|
||||
model: transformers.OPTForCausalLM = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
_ref.path, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, **attrs
|
||||
bentoml.transformers.get(self.tag).path, *args, torch_dtype=torch_dtype, **attrs
|
||||
)
|
||||
return model
|
||||
|
||||
|
||||
@@ -37,7 +37,6 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
self.bettertransformer = True if not torch.cuda.is_available() else False
|
||||
|
||||
@property
|
||||
|
||||
@@ -42,9 +42,6 @@ FIM_INDICATOR = "<FILL_HERE>"
|
||||
class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
@property
|
||||
def import_kwargs(self):
|
||||
model_kwds = {
|
||||
|
||||
@@ -39,19 +39,25 @@ llm.save_pretrained("./path/to/local-dolly")
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
import cloudpickle
|
||||
|
||||
from .constants import HUB_ATTRS
|
||||
import openllm
|
||||
from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
|
||||
|
||||
from ..exceptions import OpenLLMException
|
||||
from ..utils import LazyLoader
|
||||
from ..utils import LazyModule
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import bentoml
|
||||
import transformers
|
||||
|
||||
from .._llm import M
|
||||
from .._llm import T
|
||||
from .._types import ModelProtocol
|
||||
from .._types import TokenizerProtocol
|
||||
else:
|
||||
transformers = LazyLoader("transformers", globals(), "transformers")
|
||||
|
||||
|
||||
def import_model(
|
||||
@@ -87,9 +93,6 @@ def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs
|
||||
|
||||
|
||||
def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]:
|
||||
if llm.__llm_custom_load__:
|
||||
hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs}
|
||||
return llm.load_model(llm.tag, *decls, **hub_attrs, **attrs)
|
||||
if llm.runtime == "transformers":
|
||||
return openllm.transformers.load_model(llm, *decls, **attrs)
|
||||
elif llm.runtime == "ggml":
|
||||
@@ -98,16 +101,37 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
|
||||
raise ValueError(f"Unknown runtime: {llm.config['runtime']}")
|
||||
|
||||
|
||||
def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
|
||||
if llm.__llm_custom_tokenizer__:
|
||||
(_, _), tokenizer_attrs = llm.llm_parameters
|
||||
return llm.load_tokenizer(llm.tag, **tokenizer_attrs)
|
||||
elif llm.runtime == "transformers":
|
||||
return openllm.transformers.load_tokenizer(llm)
|
||||
elif llm.runtime == "ggml":
|
||||
return openllm.ggml.load_tokenizer(llm)
|
||||
def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
|
||||
"""Load the tokenizer from BentoML store.
|
||||
|
||||
By default, it looks up the bentomodel in the local store.
|
||||
If the model is not found, it raises ``bentoml.exceptions.NotFound``.
|
||||
"""
|
||||
from .transformers import infer_tokenizers_class_for_llm
|
||||
|
||||
bentomodel_fs = llm._bentomodel._fs
|
||||
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
|
||||
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
|
||||
try:
|
||||
tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
|
||||
except KeyError:
|
||||
# This could happen if users implement their own import_model
|
||||
raise OpenLLMException(
|
||||
"Model does not have tokenizer. Make sure to save \
|
||||
the tokenizer within the model via 'custom_objects'.\
|
||||
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
|
||||
) from None
|
||||
else:
|
||||
raise ValueError(f"Unknown runtime: {llm.config['runtime']}")
|
||||
tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained(
|
||||
bentomodel_fs.getsyspath("/"),
|
||||
trust_remote_code=llm.__llm_trust_remote_code__,
|
||||
**tokenizer_attrs,
|
||||
)
|
||||
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
return tokenizer
|
||||
|
||||
|
||||
_extras = {
|
||||
|
||||
@@ -82,34 +82,30 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
|
||||
raise NotImplementedError("Currently work in progress.")
|
||||
|
||||
|
||||
def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
|
||||
def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> TokenizerProtocol[T]:
|
||||
"""Load the tokenizer from BentoML store.
|
||||
|
||||
By default, it looks up the bentomodel in the local store.
|
||||
If the model is not found, it raises ``bentoml.exceptions.NotFound``.
|
||||
"""
|
||||
(_, _), tokenizer_attrs = llm.llm_parameters
|
||||
if llm.__llm_custom_tokenizer__:
|
||||
tokenizer = llm.load_tokenizer(llm.tag, **tokenizer_attrs)
|
||||
bentomodel_fs = llm._bentomodel._fs
|
||||
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
|
||||
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
|
||||
try:
|
||||
tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
|
||||
except KeyError:
|
||||
# This could happen if users implement their own import_model
|
||||
raise OpenLLMException(
|
||||
"Model does not have tokenizer. Make sure to save \
|
||||
the tokenizer within the model via 'custom_objects'.\
|
||||
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
|
||||
) from None
|
||||
else:
|
||||
bentomodel_fs = llm._bentomodel._fs
|
||||
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
|
||||
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
|
||||
try:
|
||||
tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
|
||||
except KeyError:
|
||||
# This could happen if users implement their own import_model
|
||||
raise OpenLLMException(
|
||||
"Model does not have tokenizer. Make sure to save \
|
||||
the tokenizer within the model via 'custom_objects'.\
|
||||
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
|
||||
) from None
|
||||
else:
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
||||
bentomodel_fs.getsyspath("/"),
|
||||
trust_remote_code=llm.__llm_trust_remote_code__,
|
||||
**tokenizer_attrs,
|
||||
)
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
||||
bentomodel_fs.getsyspath("/"),
|
||||
trust_remote_code=llm.__llm_trust_remote_code__,
|
||||
**tokenizer_attrs,
|
||||
)
|
||||
return t.cast("TokenizerProtocol[T]", tokenizer)
|
||||
|
||||
|
||||
|
||||
@@ -18,11 +18,8 @@ import copy
|
||||
import importlib
|
||||
import typing as t
|
||||
|
||||
import cloudpickle
|
||||
|
||||
import bentoml
|
||||
from bentoml._internal.frameworks.transformers import make_default_signatures
|
||||
from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
|
||||
from bentoml._internal.models.model import ModelOptions
|
||||
|
||||
from .constants import FRAMEWORK_TO_AUTOCLASS_MAPPING
|
||||
@@ -48,8 +45,6 @@ if t.TYPE_CHECKING:
|
||||
from .._llm import M
|
||||
from .._llm import T
|
||||
from .._types import DictStrAny
|
||||
from .._types import ModelProtocol
|
||||
from .._types import TokenizerProtocol
|
||||
else:
|
||||
autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
|
||||
_transformers = LazyLoader("_transformers", globals(), "transformers")
|
||||
@@ -77,7 +72,7 @@ def process_transformers_config(
|
||||
return config, hub_attrs, attrs
|
||||
|
||||
|
||||
def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
|
||||
def infer_tokenizers_class_for_llm(__llm: openllm.LLM[t.Any, T]) -> T:
|
||||
tokenizer_class = __llm.config["tokenizer_class"]
|
||||
if tokenizer_class is None:
|
||||
tokenizer_class = "AutoTokenizer"
|
||||
@@ -138,21 +133,18 @@ def import_model(
|
||||
**attrs: Kwargs to be passed into AutoModelForSeq2SeqLM or AutoModelForCausalLM (+ TF, Flax variants).
|
||||
"""
|
||||
config, hub_attrs, attrs = process_transformers_config(llm.model_id, trust_remote_code, **attrs)
|
||||
|
||||
# NOTE: get the base args and attrs, then
|
||||
# allow override via import_model
|
||||
(model_decls, model_attrs), tokenizer_attrs = llm.llm_parameters
|
||||
decls = (*model_decls, *decls)
|
||||
attrs = {**model_attrs, **attrs}
|
||||
|
||||
safe_serialisation = llm._serialisation_format == "safetensors"
|
||||
_, tokenizer_attrs = llm.llm_parameters
|
||||
quantize_method = llm._quantize_method
|
||||
|
||||
safe_serialisation = first_not_none(
|
||||
attrs.get("safe_serialization"), default=llm._serialisation_format == "safetensors"
|
||||
)
|
||||
if llm.__llm_implementation__ == "vllm":
|
||||
# Disable safe serialization with vLLM
|
||||
safe_serialisation = False
|
||||
metadata: DictStrAny = {
|
||||
"safe_serialisation": safe_serialisation,
|
||||
"_quantize": quantize_method if quantize_method is not None else False,
|
||||
}
|
||||
|
||||
signatures: DictStrAny = {}
|
||||
if quantize_method == "gptq":
|
||||
if not is_autogptq_available():
|
||||
@@ -260,18 +252,13 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo
|
||||
raise
|
||||
|
||||
|
||||
def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> ModelProtocol[M]:
|
||||
def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
"""Load the model from BentoML store.
|
||||
|
||||
By default, it will try to find check the model in the local store.
|
||||
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
|
||||
"""
|
||||
config, hub_attrs, attrs = process_transformers_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
|
||||
# NOTE: get the base args and attrs, then
|
||||
# allow override via import_model
|
||||
(model_decls, model_attrs), _ = llm.llm_parameters
|
||||
decls = (*model_decls, *decls)
|
||||
attrs = {**model_attrs, **attrs}
|
||||
metadata = llm._bentomodel.info.metadata
|
||||
safe_serialization = first_not_none(
|
||||
t.cast(t.Optional[bool], metadata.get("safe_serialisation", None)),
|
||||
@@ -285,17 +272,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
|
||||
)
|
||||
if llm.config["model_type"] != "causal_lm":
|
||||
raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
|
||||
return t.cast(
|
||||
"ModelProtocol[M]",
|
||||
autogptq.AutoGPTQForCausalLM.from_quantized(
|
||||
llm._bentomodel.path,
|
||||
*decls,
|
||||
quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
|
||||
trust_remote_code=llm.__llm_trust_remote_code__,
|
||||
use_safetensors=safe_serialization,
|
||||
**hub_attrs,
|
||||
**attrs,
|
||||
),
|
||||
return autogptq.AutoGPTQForCausalLM.from_quantized(
|
||||
llm._bentomodel.path,
|
||||
*decls,
|
||||
quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
|
||||
trust_remote_code=llm.__llm_trust_remote_code__,
|
||||
use_safetensors=safe_serialization,
|
||||
**hub_attrs,
|
||||
**attrs,
|
||||
)
|
||||
model = infer_autoclass_from_llm_config(llm, config).from_pretrained(
|
||||
llm._bentomodel.path,
|
||||
@@ -316,46 +300,14 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> Mod
|
||||
model = model.to("cuda")
|
||||
except torch.cuda.OutOfMemoryError as err:
|
||||
raise RuntimeError(
|
||||
f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8' for dynamic quantization."
|
||||
f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
|
||||
) from err
|
||||
if llm.bettertransformer and llm.__llm_implementation__ == "pt" and not isinstance(model, _transformers.Pipeline):
|
||||
# BetterTransformer is currently only supported on PyTorch.
|
||||
from optimum.bettertransformer import BetterTransformer
|
||||
|
||||
model = BetterTransformer.transform(model) # type: ignore
|
||||
return t.cast("ModelProtocol[M]", model)
|
||||
|
||||
|
||||
def load_tokenizer(llm: openllm.LLM[t.Any, T]) -> TokenizerProtocol[T]:
|
||||
"""Load the tokenizer from BentoML store.
|
||||
|
||||
By default, it will try to find the bentomodel whether it is in store..
|
||||
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
|
||||
"""
|
||||
(_, _), tokenizer_attrs = llm.llm_parameters
|
||||
bentomodel_fs = llm._bentomodel._fs
|
||||
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
|
||||
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile:
|
||||
try:
|
||||
tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
|
||||
except KeyError:
|
||||
# This could happen if users implement their own import_model
|
||||
raise OpenLLMException(
|
||||
"Model does not have tokenizer. Make sure to save \
|
||||
the tokenizer within the model via 'custom_objects'.\
|
||||
For example: bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer}))"
|
||||
) from None
|
||||
else:
|
||||
tokenizer = infer_tokenizers_class_for_llm(llm).from_pretrained(
|
||||
bentomodel_fs.getsyspath("/"),
|
||||
trust_remote_code=llm.__llm_trust_remote_code__,
|
||||
**tokenizer_attrs,
|
||||
)
|
||||
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
return tokenizer
|
||||
return t.cast("M", model)
|
||||
|
||||
|
||||
def save_pretrained(
|
||||
|
||||
Reference in New Issue
Block a user