# Copyright 2023 BentoML Team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from __future__ import annotations
|
|
import collections
|
|
import functools
|
|
import hashlib
|
|
import inspect
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
import types
|
|
import typing as t
|
|
from abc import ABC
|
|
from abc import abstractmethod
|
|
from pathlib import Path
|
|
|
|
import attr
|
|
import orjson
|
|
from huggingface_hub import hf_hub_download
|
|
|
|
import bentoml
|
|
import openllm
|
|
from bentoml._internal.models import ModelStore
|
|
from bentoml._internal.models.model import ModelSignature
|
|
|
|
from ._configuration import AdapterType
|
|
from ._configuration import FineTuneConfig
|
|
from ._configuration import _object_getattribute
|
|
from ._configuration import _setattr_class
|
|
from ._quantisation import infer_quantisation_config
|
|
from .exceptions import ForbiddenAttributeError
|
|
from .exceptions import GpuNotAvailableError
|
|
from .utils import DEBUG
|
|
from .utils import ENV_VARS_TRUE_VALUES
|
|
from .utils import MYPY
|
|
from .utils import SHOW_CODEGEN
|
|
from .utils import EnvVarMixin
|
|
from .utils import LazyLoader
|
|
from .utils import ReprMixin
|
|
from .utils import bentoml_cattr
|
|
from .utils import codegen
|
|
from .utils import first_not_none
|
|
from .utils import in_docker
|
|
from .utils import is_peft_available
|
|
from .utils import is_torch_available
|
|
from .utils import non_intrusive_setattr
|
|
from .utils import normalize_attrs_to_model_tokenizer_pair
|
|
from .utils import requires_dependencies
|
|
from .utils import resolve_filepath
|
|
from .utils import validate_is_path
|
|
|
|
|
|
# NOTE: We need to do this so that overload can register
|
|
# correct overloads to typing registry
|
|
if sys.version_info[:2] >= (3, 11):
|
|
from typing import NotRequired
|
|
from typing import overload
|
|
else:
|
|
from typing_extensions import NotRequired
|
|
from typing_extensions import overload
|
|
|
|
if t.TYPE_CHECKING:
|
|
import auto_gptq as autogptq
|
|
import peft
|
|
import torch
|
|
import vllm
|
|
|
|
import transformers
|
|
|
|
from ._configuration import PeftType
|
|
from ._types import AdaptersMapping
|
|
from ._types import AdaptersTuple
|
|
from ._types import DictStrAny
|
|
from ._types import ListStr
|
|
from ._types import LiteralRuntime
|
|
from ._types import LLMEmbeddings
|
|
from ._types import LLMRunnable
|
|
from ._types import LLMRunner
|
|
from ._types import ModelSignatureDict as _ModelSignatureDict
|
|
from ._types import PeftAdapterOutput
|
|
from ._types import TupleAny
|
|
from .utils.representation import ReprArgs
|
|
|
|
UserDictAny = collections.UserDict[str, t.Any]
|
|
ResolvedAdaptersMapping = dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]]
|
|
else:
|
|
DictStrAny = dict
|
|
TupleAny = tuple
|
|
UserDictAny = collections.UserDict
|
|
LLMRunnable = bentoml.Runnable
|
|
LLMRunner = bentoml.Runner
|
|
LLMEmbeddings = dict
|
|
|
|
autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
|
|
vllm = LazyLoader("vllm", globals(), "vllm")
|
|
transformers = LazyLoader("transformers", globals(), "transformers")
|
|
torch = LazyLoader("torch", globals(), "torch")
|
|
peft = LazyLoader("peft", globals(), "peft")
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ModelSignatureDict(t.TypedDict, total=False):
|
|
batchable: bool
|
|
batch_dim: t.Union[t.Tuple[int, int], int]
|
|
input_spec: NotRequired[t.Union[t.Any, t.Tuple[t.Any]]]
|
|
output_spec: NotRequired[t.Any]
|
|
|
|
|
|
def normalise_model_name(name: str) -> str:
|
|
return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else re.sub("[^a-zA-Z0-9]+", "-", name)
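# e.g. 'facebook/opt-125m' -> 'facebook-opt-125m'; a local path resolves to its basename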
|
|
|
|
|
|
def make_tag(
|
|
model_id: str,
|
|
model_version: str | None = None,
|
|
trust_remote_code: bool = False,
|
|
implementation: LiteralRuntime = "pt",
|
|
quiet: bool = False,
|
|
) -> bentoml.Tag:
|
|
"""Generate a ``bentoml.Tag`` from a given transformers model name.
|
|
|
|
Note that this depends on your model to have a config class available.
|
|
|
|
Args:
|
|
model_id: The transformers model name or path to load the model from.
|
|
model_version: Optional model version to be saved with this tag. Defaults to None.
    If model_id is a custom path, then the version will be the hash of the last modified
    time of the given ``model_id``.
trust_remote_code: Whether to trust the remote code. Defaults to False.
implementation: Given implementation for said LLM. One of t.Literal['pt', 'tf', 'flax', 'vllm'].
quiet: Whether to suppress warning logs. Defaults to False.

Returns:
    A ``bentoml.Tag`` for the given model.
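Example (a minimal sketch; the exact tag string depends on the implementation prefix and the resolved revision):

```python
tag = make_tag("facebook/opt-125m")  # e.g. 'pt-facebook-opt-125m:<commit_hash>'
```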
|
|
"""
|
|
model_name = normalise_model_name(model_id)
|
|
if os.getenv("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES:
|
|
return bentoml.models.get(f"{implementation}-{model_name}").tag
|
|
|
|
if validate_is_path(model_id):
|
|
model_id = resolve_filepath(model_id)
|
|
# special case: if the path is inside the model store, then we return the tag directly.
# this happens within the container, where we use the relative path
|
|
if in_docker() and os.getenv("BENTO_PATH") is not None:
|
|
_store = ModelStore(Path(model_id).parent.parent)
|
|
tag = _store.list()[0].tag
|
|
model_version = tag.version
|
|
model_name = tag.name
|
|
else:
|
|
if model_version is None: # noqa: PLR5501
|
|
if not quiet:
|
|
logger.warning(
|
|
"Given 'model_id=%s' is a path, and 'model_version' is not passed. OpenLLM will generate the version based on the last modified time of this given directory.",
|
|
model_id,
|
|
)
|
|
model_version = generate_hash_from_file(model_id)
|
|
else:
|
|
config = t.cast(
|
|
"transformers.PretrainedConfig",
|
|
transformers.AutoConfig.from_pretrained(
|
|
model_id,
|
|
trust_remote_code=trust_remote_code,
|
|
revision=first_not_none(model_version, default="main"),
|
|
),
|
|
)
|
|
|
|
model_version = getattr(config, "_commit_hash", None)
|
|
if model_version is None:
|
|
raise ValueError(
|
|
f"Internal errors when parsing config for pretrained {model_id} ('commit_hash' not found)"
|
|
)
|
|
|
|
return bentoml.Tag.from_taglike(
|
|
f"{model_name if in_docker() and os.getenv('BENTO_PATH') is not None else implementation + '-' + model_name}:{model_version}".strip().lower()
|
|
)
|
|
|
|
|
|
@functools.lru_cache(maxsize=128)
|
|
def generate_hash_from_file(f: str, algorithm: t.Literal["md5", "sha1"] = "sha1") -> str:
|
|
"""Generate a hash from given file's modification time.
|
|
|
|
Args:
|
|
f: The file to generate the hash from.
|
|
algorithm: The hashing algorithm to use. Defaults to 'sha1' (similar to how Git generates its commit hashes).
|
|
|
|
Returns:
|
|
The generated hash.
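Example (a sketch; the path below is a placeholder):

```python
model_version = generate_hash_from_file("/path/to/local/model")
```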
|
|
"""
|
|
modification_time = str(os.path.getmtime(resolve_filepath(f)))
|
|
hashed = getattr(hashlib, algorithm)(modification_time.encode())
|
|
return hashed.hexdigest()
|
|
|
|
|
|
# the below is similar to peft.utils.other.CONFIG_NAME
|
|
PEFT_CONFIG_NAME = "adapter_config.json"
|
|
|
|
|
|
def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapping:
|
|
"""Resolve the type of the PeftConfig given the adapter_map.
|
|
|
|
This is similar to how PeftConfig resolve its config type.
|
|
|
|
Args:
|
|
adapter_map: The given mapping from either SDK or CLI. See CLI docs for more information.
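For example (a sketch; the adapter ids below are placeholders):

```python
mapping = resolve_peft_config_type({"username/my-lora-adapter": None, "/path/to/another/adapter": "summarise"})
```

A ``None`` name is resolved to the ``'default'`` adapter.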
|
|
"""
|
|
resolved: AdaptersMapping = {}
|
|
_has_set_default = False
|
|
for path_or_adapter_id, name in adapter_map.items():
|
|
if _has_set_default:
|
|
raise ValueError("Only one adapter can be set as default.")
|
|
resolve_name = name
|
|
if resolve_name is None:
|
|
resolve_name = "default"
|
|
_has_set_default = True
|
|
if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)):
|
|
config_file = os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
|
|
else:
|
|
try:
|
|
config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
|
|
except Exception as err:
|
|
raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
|
|
with open(config_file, "r") as file:
|
|
resolved_config = orjson.loads(file.read())
|
|
# all peft_type should be available in PEFT_CONFIG_NAME
|
|
_peft_type: AdapterType = resolved_config["peft_type"].lower()
|
|
if _peft_type not in resolved:
|
|
resolved[_peft_type] = ()
|
|
resolved[_peft_type] += (_AdaptersTuple((path_or_adapter_id, resolve_name, resolved_config)),)
|
|
return resolved
|
|
|
|
|
|
_reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"}
|
|
|
|
M = t.TypeVar(
|
|
"M",
|
|
bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLM, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]",
|
|
)
|
|
T = t.TypeVar(
|
|
"T",
|
|
bound="t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]",
|
|
)
|
|
|
|
|
|
def _default_post_init(self: LLM[t.Any, t.Any]) -> None:
|
|
self.device = None
|
|
|
|
if self.__llm_implementation__ == "pt" and is_torch_available():
|
|
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
|
class LLMInterface(ABC, t.Generic[M, T]):
|
|
"""This defines the loose contract for all openllm.LLM implementations."""
|
|
|
|
@property
|
|
def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None:
|
|
"""The default import kwargs to used when importing the model.
|
|
|
|
This will be passed into 'openllm.LLM.import_model'.
|
|
It returns two dictionaries: one for model kwargs and one for tokenizer kwargs.
|
|
|
|
Returns:
|
|
Optional tuple of model kwargs and tokenizer kwargs
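For example, a subclass might return (a sketch; the values are illustrative):

```python
@property
def import_kwargs(self):
    return {"device_map": "auto"}, {"padding_side": "left"}
```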
|
|
"""
|
|
|
|
def embeddings(self, prompts: list[str]) -> LLMEmbeddings:
|
|
"""The implementation for generating text embeddings from given prompt.
|
|
|
|
It takes the prompt and outputs the embeddings for this given LLM.
|
|
|
|
Returns:
|
|
The embeddings for the given prompt.
|
|
"""
|
|
raise NotImplementedError
|
|
|
|
@abstractmethod
|
|
def generate(self, prompt: str, **preprocess_generate_kwds: t.Any) -> t.Any:
|
|
"""The implementation for text generation from given prompt.
|
|
|
|
It takes the prompt and 'generation_kwargs' from 'self.sanitize_parameters' and then
passes them to 'self.model.generate'.
|
|
"""
|
|
raise NotImplementedError
|
|
|
|
def generate_one(
|
|
self,
|
|
prompt: str,
|
|
stop: list[str],
|
|
**preprocess_generate_kwds: t.Any,
|
|
) -> t.Sequence[dict[t.Literal["generated_text"], str]]:
|
|
"""The entrypoint for generating one prompt.
|
|
|
|
This provides additional stop tokens for generation at the token level.
|
|
This is useful when running with agents, or initial streaming support.
|
|
"""
|
|
raise NotImplementedError
|
|
|
|
def generate_iterator(self, prompt: str, **attrs: t.Any) -> t.Iterator[t.Any]:
|
|
"""T iterator version of `generate` function."""
|
|
raise NotImplementedError(
|
|
"Currently generate_iterator requires SSE (Server-side events) support, which is not yet implemented."
|
|
)
|
|
|
|
def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
|
|
"""This handler will sanitize all attrs and setup prompt text.
|
|
|
|
It takes a prompt that is given by the user, attrs that can be parsed with the prompt.
|
|
|
|
Returns a tuple of three items:

- The processed prompt text.
- The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig.
- The attributes dictionary that will be passed into `self.postprocess_generate`.
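A minimal sketch of an implementation (``PROMPT_TEMPLATE`` and ``temperature`` are illustrative, not part of the interface):

```python
def sanitize_parameters(self, prompt: str, temperature: float | None = None, **attrs: t.Any):
    return PROMPT_TEMPLATE.format(instruction=prompt), {"temperature": temperature, **attrs}, {}
```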
|
|
"""
|
|
return prompt, attrs, attrs
|
|
|
|
def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
|
|
"""This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.).
|
|
|
|
You can customize how the output of the LLM looks with this hook. By default, it is a simple echo.
|
|
|
|
NOTE: this will be used from the client side.
|
|
"""
|
|
return generation_result
|
|
|
|
def llm_post_init(self) -> None:
|
|
"""This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals."""
|
|
pass
|
|
|
|
def import_model(self, *args: t.Any, trust_remote_code: bool, **attrs: t.Any) -> bentoml.Model:
|
|
"""This function can be implemented if default import_model doesn't satisfy your needs.
|
|
|
|
Note that tokenizer attrs can be accessed via ``llm.llm_parameters``.
|
|
|
|
```python
|
|
_, tokenizer_attrs = llm.llm_parameters
|
|
```
|
|
|
|
By default, `model_decls` and `model_attrs` are already sanitised and concatenated into `args` and `attrs`.
|
|
"""
|
|
raise NotImplementedError
|
|
|
|
def load_model(self, *args: t.Any, **attrs: t.Any) -> M:
|
|
"""This function can be implemented to override the default load_model behaviour.
|
|
|
|
See falcon for example implementation. Tag can be accessed via ``self.tag``
|
|
"""
|
|
raise NotImplementedError
|
|
|
|
def load_tokenizer(self, tag: bentoml.Tag, **attrs: t.Any) -> T:
|
|
"""This function can be implemented to override how to load the tokenizer.
|
|
|
|
See falcon for example implementation.
|
|
"""
|
|
raise NotImplementedError
|
|
|
|
# NOTE: All fields below are attributes that can be accessed by users.
|
|
config_class: type[openllm.LLMConfig]
|
|
"""The config class to use for this LLM. If you are creating a custom LLM, you must specify this class."""
|
|
|
|
bettertransformer: bool
|
|
"""Whether to load this LLM with FasterTransformer enabled. The order of loading is:
|
|
|
|
- If pass within `for_model`, `from_pretrained` or `__init__`.
|
|
- If `self.bettertransformer` is set within `llm_post_init`.
|
|
- Finally, if none of the above, default to self.config['bettertransformer']
|
|
|
|
> **Note** that if LoRA is enabled, bettertransformer will be disabled.
|
|
"""
|
|
|
|
device: torch.device
|
|
"""The device to be used for this LLM. If the implementation is 'pt', then it will be torch.device, else string."""
|
|
|
|
# NOTE: The following will be populated by __init_subclass__, note that these should be immutable.
|
|
__llm_trust_remote_code__: bool
|
|
"""This is used to determine during 'import_model' whether to trust remote code or not.
|
|
|
|
This works synonymously with the `trust_remote_code` kwarg in transformers Auto classes. If not passed,
it falls back to config_class['trust_remote_code'] by default.
|
|
"""
|
|
__llm_implementation__: LiteralRuntime
|
|
"""This is used to determine which implementation that this LLM has.
|
|
|
|
Usually, this will be inferred from the class name, which follows HuggingFace's naming convention:
|
|
|
|
- `OPTForConditionalGeneration` -> `pt`
|
|
- `TFOPTForConditionalGeneration` -> `tf`
|
|
- `FlaxOPTForConditionalGeneration` -> `flax`
|
|
|
|
An additional naming convention applies for the vLLM backend: `VLLMLlaMA` -> `vllm`
|
|
"""
|
|
__llm_model__: M | None
|
|
"""A reference to the actual model. Instead of access this directly, you should use `model` property instead."""
|
|
__llm_tokenizer__: T | None
|
|
"""A reference to the actual tokenizer. Instead of access this directly, you should use `tokenizer` property instead."""
|
|
__llm_bentomodel__: bentoml.Model | None
|
|
"""A reference to the bentomodel used for this LLM. Instead of access this directly, you should use `_bentomodel` property instead."""
|
|
__llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None
|
|
"""A reference to the the cached LoRA adapter mapping."""
|
|
|
|
if t.TYPE_CHECKING and not MYPY:
|
|
|
|
def __attrs_init__(
|
|
self,
|
|
config: openllm.LLMConfig,
|
|
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
|
|
model_id: str,
|
|
runtime: t.Literal["ggml", "transformers"],
|
|
model_decls: TupleAny,
|
|
model_attrs: DictStrAny,
|
|
tokenizer_attrs: DictStrAny,
|
|
tag: bentoml.Tag,
|
|
adapters_mapping: AdaptersMapping | None,
|
|
model_version: str | None,
|
|
quantize_method: t.Literal["int8", "int4", "gptq"] | None,
|
|
serialisation_format: t.Literal["safetensors", "legacy"],
|
|
**attrs: t.Any,
|
|
) -> None:
|
|
"""Generated __attrs_init__ for openllm.LLM."""
|
|
|
|
|
|
if t.TYPE_CHECKING:
|
|
_R = t.TypeVar("_R")
|
|
|
|
class _import_model_wrapper(t.Generic[_R, M, T]):
|
|
def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R:
|
|
...
|
|
|
|
class _load_model_wrapper(t.Generic[M, T]):
|
|
def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
|
|
...
|
|
|
|
class _load_tokenizer_wrapper(t.Generic[M, T]):
|
|
def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T:
|
|
...
|
|
|
|
class _llm_post_init_wrapper(t.Generic[M, T]):
|
|
def __call__(self, llm: LLM[M, T]) -> None:
|
|
...
|
|
|
|
|
|
def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]):
|
|
@functools.wraps(f)
|
|
def wrapper(
|
|
self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any
|
|
) -> bentoml.Model:
|
|
trust_remote_code = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__)
|
|
# wrapped around custom init to provide some meta compression
|
|
# for all decls and attrs
|
|
(model_decls, model_attrs), _ = self.llm_parameters
|
|
decls = (*model_decls, *decls)
|
|
attrs = {**model_attrs, **attrs}
|
|
return f(self, *decls, trust_remote_code=trust_remote_code, **attrs)
|
|
|
|
return wrapper
|
|
|
|
|
|
def _wrapped_load_model(f: _load_model_wrapper[M, T]):
|
|
@functools.wraps(f)
|
|
def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
|
|
# wrapped around custom init to provide some meta compression
|
|
# for all decls and attrs
|
|
(model_decls, model_attrs), _ = self.llm_parameters
|
|
decls = (*model_decls, *decls)
|
|
attrs = {**model_attrs, **attrs}
|
|
return f(self, *decls, **attrs)
|
|
|
|
return wrapper
|
|
|
|
|
|
def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]):
|
|
@functools.wraps(f)
|
|
def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
|
|
_, model_tokenizer_attrs = self.llm_parameters
|
|
tokenizer_attrs = {**model_tokenizer_attrs, **tokenizer_attrs}
|
|
return f(self, **tokenizer_attrs)
|
|
|
|
return wrapper
|
|
|
|
|
|
def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]:
|
|
@functools.wraps(f)
|
|
def wrapper(self: LLM[M, T]):
|
|
_default_post_init(self)
|
|
f(self)
|
|
|
|
return wrapper
|
|
|
|
|
|
def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
|
|
attributes = {
|
|
"import_model": _wrapped_import_model,
|
|
"load_model": _wrapped_load_model,
|
|
"load_tokenizer": _wrapped_load_tokenizer,
|
|
"llm_post_init": _wrapped_llm_post_init,
|
|
}
|
|
args: ListStr = []
|
|
anns: DictStrAny = {}
|
|
lines: ListStr = []
|
|
globs: DictStrAny = {
|
|
"cls": cls,
|
|
"_cached_attribute": attributes,
|
|
"_cached_getattribute_get": _object_getattribute.__get__,
|
|
"LLMInterface": LLMInterface,
|
|
"openllm": openllm,
|
|
}
|
|
# function initialisation
|
|
for func, impl in attributes.items():
|
|
globs[f"__wrapped_{func}"] = impl
|
|
impl_name = f"__wrapped_{func}"
|
|
cached_func_name = f"_cached_{cls.__name__}_func"
|
|
if func == "llm_post_init":
|
|
func_call = f"_impl_{cls.__name__}_{func}={impl_name}"
|
|
else:
|
|
func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_getattr('{func}') else openllm.serialisation.{func}"
|
|
lines.extend(
|
|
[
|
|
"_cached_LLMInterface_getattr=_cached_getattribute_get(LLMInterface)",
|
|
f"{cached_func_name}=cls.{func}",
|
|
func_call,
|
|
_setattr_class(func, f"{impl_name}(_impl_{cls.__name__}_{func})"),
|
|
]
|
|
)
|
|
|
|
# cached attribute initialisation
|
|
interface_anns = codegen.get_annotations(LLMInterface)
|
|
for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
|
|
lines.append(_setattr_class(f"__llm_{v}__", None))
|
|
anns[f"__llm_{v}__"] = interface_anns.get("__llm_{v}__")
|
|
|
|
if SHOW_CODEGEN:
|
|
logger.info("Generated script for %s:\n\n%s", cls.__name__, "\n".join(lines))
|
|
|
|
return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
|
|
|
|
|
|
_AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
|
|
|
|
|
|
@attr.define(slots=True, repr=False, init=False)
|
|
class LLM(LLMInterface[M, T], ReprMixin):
|
|
config: openllm.LLMConfig
|
|
"""The config instance to use for this LLM. This will be created based on config_class and available
|
|
when initialising the LLM."""
|
|
|
|
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
|
|
"""Quantisation config for quantised model on the fly."""
|
|
|
|
_model_id: str
|
|
_runtime: t.Literal["ggml", "transformers"]
|
|
_model_decls: TupleAny
|
|
_model_attrs: DictStrAny
|
|
_tokenizer_attrs: DictStrAny
|
|
_tag: bentoml.Tag
|
|
_adapters_mapping: AdaptersMapping | None
|
|
_model_version: str
|
|
_quantize_method: t.Literal["int8", "int4", "gptq"] | None
|
|
_serialisation_format: t.Literal["safetensors", "legacy"]
|
|
|
|
tokenizer_cls: t.ClassVar[str | None] = None
|
|
|
|
@staticmethod
|
|
def _infer_implementation_from_name(name: str) -> tuple[LiteralRuntime, str]:
|
|
if name.startswith("Flax"):
|
|
return "flax", name[4:]
|
|
elif name.startswith("TF"):
|
|
return "tf", name[2:]
|
|
elif name.startswith("VLLM"):
|
|
return "vllm", name[4:]
|
|
else:
|
|
return "pt", name
|
|
|
|
def __init_subclass__(cls: type[openllm.LLM[M, T]]) -> None:
|
|
cd = cls.__dict__
|
|
implementation, config_class_name = cls._infer_implementation_from_name(cls.__name__)
|
|
cls.__llm_implementation__ = implementation
|
|
config_class = openllm.AutoConfig.infer_class_from_name(config_class_name)
|
|
|
|
if "__openllm_internal__" in cd:
|
|
if "config_class" not in cd:
|
|
cls.config_class = config_class
|
|
elif "config_class" not in cd:
|
|
raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
|
|
|
|
_make_assignment_script(cls)(cls)
|
|
|
|
# update docstring for given entrypoint
|
|
for fn in {"generate", "generate_one", "generate_iterator"}:
|
|
original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
|
|
original_fn.__doc__ = (
|
|
original_fn.__doc__
|
|
or f"""\
|
|
'{fn}' implementation {cls.__name__}.
|
|
|
|
Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
|
|
The original can then be accessed with 'self.model.get_base_model()'.
|
|
"""
|
|
)
|
|
setattr(cls, fn, original_fn)
|
|
|
|
# The following is the similar interface to HuggingFace pretrained protocol.
|
|
def save_pretrained(self, save_directory: str | Path, **attrs: t.Any) -> None:
|
|
if isinstance(save_directory, Path):
|
|
save_directory = str(save_directory)
|
|
if self.__llm_model__ is not None and self.bettertransformer and self.__llm_implementation__ == "pt":
|
|
from optimum.bettertransformer import BetterTransformer
|
|
|
|
self.__llm_model__ = t.cast(
|
|
M,
|
|
BetterTransformer.reverse(t.cast("transformers.PreTrainedModel", self.__llm_model__)),
|
|
)
|
|
openllm.serialisation.save_pretrained(self, save_directory, **attrs)
|
|
|
|
@classmethod
|
|
@overload
|
|
def from_pretrained(
|
|
cls,
|
|
model_id: str | None = ...,
|
|
model_version: str | None = ...,
|
|
llm_config: openllm.LLMConfig | None = ...,
|
|
*args: t.Any,
|
|
runtime: t.Literal["ggml", "transformers"] | None = ...,
|
|
quantize: t.Literal["int8", "int4"] = ...,
|
|
bettertransformer: str | bool | None = ...,
|
|
adapter_id: str | None = ...,
|
|
adapter_name: str | None = ...,
|
|
adapter_map: dict[str, str | None] | None = ...,
|
|
quantization_config: transformers.BitsAndBytesConfig | None = ...,
|
|
serialisation: t.Literal["safetensors", "legacy"] = ...,
|
|
**attrs: t.Any,
|
|
) -> LLM[M, T]:
|
|
...
|
|
|
|
@classmethod
|
|
@overload
|
|
def from_pretrained(
|
|
cls,
|
|
model_id: str | None = ...,
|
|
model_version: str | None = ...,
|
|
llm_config: openllm.LLMConfig | None = ...,
|
|
*args: t.Any,
|
|
runtime: t.Literal["ggml", "transformers"] | None = ...,
|
|
quantize: t.Literal["gptq"] = ...,
|
|
bettertransformer: str | bool | None = ...,
|
|
adapter_id: str | None = ...,
|
|
adapter_name: str | None = ...,
|
|
adapter_map: dict[str, str | None] | None = ...,
|
|
quantization_config: autogptq.BaseQuantizeConfig | None = ...,
|
|
serialisation: t.Literal["safetensors", "legacy"] = ...,
|
|
**attrs: t.Any,
|
|
) -> LLM[M, T]:
|
|
...
|
|
|
|
@classmethod
|
|
def from_pretrained(
|
|
cls,
|
|
model_id: str | None = None,
|
|
model_version: str | None = None,
|
|
llm_config: openllm.LLMConfig | None = None,
|
|
*args: t.Any,
|
|
runtime: t.Literal["ggml", "transformers"] | None = None,
|
|
quantize: t.Literal["int8", "int4", "gptq"] | None = None,
|
|
bettertransformer: str | bool | None = None,
|
|
adapter_id: str | None = None,
|
|
adapter_name: str | None = None,
|
|
adapter_map: dict[str, str | None] | None = None,
|
|
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
|
|
serialisation: t.Literal["safetensors", "legacy"] = "safetensors",
|
|
**attrs: t.Any,
|
|
) -> LLM[M, T]:
|
|
"""Instantiate a pretrained LLM.
|
|
|
|
``LLM.from_pretrained`` follows the same design principle as HuggingFace's `from_pretrained` method, plus the following:
|
|
|
|
### Optimization options:
|
|
|
|
> This is most notable during serving time.
|
|
|
|
- quantize: quantize the model with the given quantization method. Currently supports int8, int4, and gptq quantization.

- bettertransformer: apply BetterTransformer to the given pretrained weights.
|
|
|
|
> Currently, the above two options are mutually exclusive.
|
|
|
|
#### Quantisation options
|
|
|
|
For customising the quantisation config, ``openllm.LLM`` accepts any arbitrary arguments that can be passed to ``transformers.BitsAndBytesConfig``,
plus the ``quantize`` value. For example, for ``int8`` quantisation, specify the following:
|
|
```python
|
|
model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False)
|
|
```
|
|
|
|
For all GPTQ-related options, it accepts any value prefixed with `gptq_*`. The parsed values are then passed
to ``auto_gptq.BaseQuantizeConfig``.
|
|
|
|
### Adapter options:
|
|
|
|
> This is used in conjunction with the fine-tuning features
|
|
|
|
- adapter_id: Optional [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to apply to said model.
|
|
- adapter_name: Optional name of the adapter to apply to said model. If not provided, it will be handled internally by OpenLLM.
|
|
- adapter_map: optional dictionary of adapter_id to adapter_name. Note that this is mutually exclusive with adapter_id/adapter_name arguments.
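For example (a sketch; the adapter id below is a placeholder):

```python
llm = openllm.AutoLLM.from_pretrained("opt", adapter_id="username/my-lora-adapter", adapter_name="my-adapter")
```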
|
|
|
|
Args:
|
|
model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
|
|
> **Warning**: If a custom path is passed, make sure it contains all the files required to construct
|
|
> ``transformers.PretrainedConfig``, ``transformers.PreTrainedModel``, and ``transformers.PreTrainedTokenizer``.
|
|
model_name: Optional model name to be saved with this LLM. Default to None. It will be inferred automatically from model_id.
|
|
If model_id is a custom path, it will be the basename of the given path.
|
|
model_version: Optional version for this given model id. Default to None. This is useful for saving from custom path.
|
|
If set to None, the version will either be the git hash from given pretrained model, or the hash inferred
|
|
from last modified time of the given directory.
|
|
llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
|
|
will use `config_class` to construct default configuration.
|
|
quantize: The quantization to use for this LLM. Defaults to None. Possible values
|
|
include int8, int4 and gptq.
|
|
runtime: Optional runtime to run this LLM. Defaults to 'transformers'. 'ggml' support is a work in progress.
|
|
quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogptq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize`
|
|
serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
|
|
Default behaviour is similar to ``safe_serialization=False``.
|
|
bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
|
|
adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None.
|
|
adapter_name: The adapter name to use for this LLM. Defaults to None.
|
|
adapter_map: The adapter map to use for this LLM. Defaults to None. Note that this is mutually exclusive with adapter_id/adapter_name arguments.
|
|
*args: The args to be passed to the model.
|
|
**attrs: The kwargs to be passed to the model.
|
|
"""
|
|
cfg_cls = cls.config_class
|
|
if model_id is None:
|
|
model_id = first_not_none(
|
|
cfg_cls.__openllm_env__["model_id_value"], default=cfg_cls.__openllm_default_id__
|
|
)
|
|
if runtime is None:
|
|
runtime = cfg_cls.__openllm_runtime__
|
|
|
|
model_id, *maybe_revision = model_id.rsplit(":")
|
|
if len(maybe_revision) > 0:
|
|
if model_version is not None:
|
|
logger.warning(
|
|
"revision is specified within 'model_id' (%s), which will override the 'model_version=%s'",
|
|
maybe_revision[0],
|
|
model_version,
|
|
)
|
|
model_version = maybe_revision[0]
|
|
|
|
# quantization setup
|
|
if quantization_config and quantize:
|
|
raise ValueError(
|
|
"""'quantization_config' and 'quantize' are mutually exclusive. Either customise
|
|
your quantization_config or use the 'quantize' argument."""
|
|
)
|
|
if quantization_config is None and quantize is not None:
|
|
quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
|
|
|
|
if quantize == "gptq":
|
|
# We will use safetensors for gptq
|
|
serialisation = "safetensors"
|
|
|
|
# NOTE: LoRA adapter setup
|
|
if adapter_map and adapter_id:
|
|
raise ValueError(
|
|
"""'adapter_map' and 'adapter_id' are mutually exclusive. Either provide a
|
|
'adapter_map' ({adapter_id: adapter_name | None, ...}) or use
|
|
the combination of adapter_id/adapter_name arguments.
|
|
"""
|
|
)
|
|
if adapter_map is None and adapter_id is not None:
|
|
adapter_map = {adapter_id: adapter_name}
|
|
|
|
if adapter_map is not None and not is_peft_available():
|
|
raise RuntimeError(
|
|
"LoRA adapter requires 'peft' to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
|
|
)
|
|
|
|
if llm_config is None:
|
|
llm_config = cls.config_class.model_construct_env(**attrs)
|
|
# The rest of the kwargs that are not used by the config class will be stored in __openllm_extras__.
|
|
attrs = llm_config["extras"]
|
|
|
|
_tag = cls._infer_tag_from_model_id(model_id, model_version)
|
|
if _tag.version is None:
|
|
raise RuntimeError("Failed to resolve model version.")
|
|
|
|
return cls(
|
|
*args,
|
|
model_id=model_id,
|
|
llm_config=llm_config,
|
|
bettertransformer=str(bettertransformer).upper() in ENV_VARS_TRUE_VALUES,
|
|
quantization_config=quantization_config,
|
|
_quantize_method=quantize,
|
|
_adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
|
|
_runtime=runtime,
|
|
_model_version=_tag.version,
|
|
_tag=_tag,
|
|
_serialisation_format=serialisation,
|
|
**attrs,
|
|
)
|
|
|
|
@classmethod
|
|
def _infer_tag_from_model_id(cls, model_id: str, model_version: str | None) -> bentoml.Tag:
|
|
# XXX: Fix me later, if the model is a valid tag, then we return it directly
|
|
# instead of creating a new tag from the model_id. this branch will be hit during `openllm build`
|
|
try:
|
|
return bentoml.models.get(model_id).tag
|
|
except (ValueError, bentoml.exceptions.BentoMLException):
|
|
try:
|
|
return bentoml.Tag.from_taglike(model_id)
|
|
except (ValueError, bentoml.exceptions.BentoMLException):
|
|
return make_tag(
|
|
model_id,
|
|
model_version=model_version,
|
|
trust_remote_code=cls.config_class.__openllm_trust_remote_code__,
|
|
implementation=cls.__llm_implementation__,
|
|
quiet=True,
|
|
)
|
|
|
|
def __init__(
|
|
self,
|
|
*args: t.Any,
|
|
model_id: str,
|
|
llm_config: openllm.LLMConfig,
|
|
bettertransformer: bool | None,
|
|
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
|
|
_adapters_mapping: AdaptersMapping | None,
|
|
_tag: bentoml.Tag,
|
|
_quantize_method: t.Literal["int8", "int4", "gptq"] | None,
|
|
_runtime: t.Literal["ggml", "transformers"],
|
|
_model_version: str,
|
|
_serialisation_format: t.Literal["safetensors", "legacy"],
|
|
**attrs: t.Any,
|
|
):
|
|
"""Initialize the LLM with given pretrained model.
|
|
|
|
> **Warning**
|
|
> To initialize any LLM, you should use `openllm.AutoLLM` or `openllm.LLM.from_pretrained` instead.
|
|
> `__init__` initialization is only for internal use.
|
|
|
|
Note:
|
|
- *args to be passed to the model.
|
|
- **attrs will first be passed to the AutoConfig; the rest will then be passed to import_model
- tokenizer kwargs should be prefixed with _tokenizer_*
|
|
|
|
For custom pretrained path, it is recommended to pass in 'model_version' alongside with the path
|
|
to ensure that it won't be loaded multiple times.
|
|
Internally, if a pretrained model is given as a HuggingFace repository path, OpenLLM will use the commit_hash
|
|
to generate the model version.
|
|
|
|
For better consistency, we recommend users to also push the fine-tuned model to HuggingFace repository.
|
|
|
|
If you need to overwrite the default ``import_model``, implement the following in your subclass:
|
|
|
|
```python
|
|
def import_model(
|
|
self,
|
|
*args: t.Any,
|
|
trust_remote_code: bool,
|
|
**attrs: t.Any,
|
|
):
|
|
_, tokenizer_attrs = self.llm_parameters
|
|
|
|
return bentoml.transformers.save_model(
|
|
tag,
|
|
transformers.AutoModelForCausalLM.from_pretrained(
|
|
self.model_id, device_map="auto", torch_dtype=torch.bfloat16, **attrs
|
|
),
|
|
custom_objects={
|
|
"tokenizer": transformers.AutoTokenizer.from_pretrained(
|
|
self.model_id, padding_size="left", **tokenizer_attrs
|
|
)
|
|
},
|
|
)
|
|
```
|
|
|
|
If your import model doesn't require customization, you can simply pass in `import_kwargs`
at the class level; they will then be passed into the default `import_model` implementation.
|
|
See ``openllm.DollyV2`` for example.
|
|
|
|
```python
|
|
dolly_v2_runner = openllm.Runner(
|
|
"dolly-v2", _tokenizer_padding_size="left", torch_dtype=torch.bfloat16, device_map="cuda"
|
|
)
|
|
```
|
|
|
|
Note: If you implement your own `import_model`, then `import_kwargs` will be the
|
|
base kwargs. You can still override those via ``openllm.Runner``.
|
|
|
|
Note that this tag will be generated based on `self.default_id` or the `model_id`
passed from the __init__ constructor.
|
|
|
|
``llm_post_init`` can also be implemented if you need to do any additional
|
|
initialization after everything is setup.
|
|
|
|
Note: If you need to implement a custom `load_model`, the following is an example from Falcon implementation:
|
|
|
|
```python
|
|
def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
|
|
torch_dtype = attrs.pop("torch_dtype", torch.bfloat16)
|
|
device_map = attrs.pop("device_map", "auto")
|
|
|
|
_ref = bentoml.transformers.get(tag)
|
|
|
|
model = bentoml.transformers.load_model(_ref, device_map=device_map, torch_dtype=torch_dtype, **attrs)
|
|
return transformers.pipeline("text-generation", model=model, tokenizer=_ref.custom_objects["tokenizer"])
|
|
```
|
|
|
|
Args:
|
|
model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
|
|
llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
|
|
will use `config_class` to construct default configuration.
|
|
bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
|
|
quantization_config: ``transformers.BitsAndBytesConfig`` configuration, or 'gptq' denoting this model to be loaded with GPTQ.
|
|
*args: The args to be passed to the model.
|
|
**attrs: The kwargs to be passed to the model.
|
|
"""
|
|
# low_cpu_mem_usage is only available for model
|
|
# this is helpful on system with low memory to avoid OOM
|
|
low_cpu_mem_usage = attrs.pop("low_cpu_mem_usage", True)
|
|
|
|
if self.__llm_implementation__ == "pt":
|
|
attrs.update({"low_cpu_mem_usage": low_cpu_mem_usage, "quantization_config": quantization_config})
|
|
|
|
model_kwds: DictStrAny = {}
|
|
tokenizer_kwds: DictStrAny = {}
|
|
if self.import_kwargs is not None:
|
|
model_kwds, tokenizer_kwds = self.import_kwargs
|
|
|
|
# parsing tokenizer and model kwargs; the hierarchy is: passed params > defaults
|
|
normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
|
|
# NOTE: Save the args and kwargs for latter load
|
|
|
|
self.__attrs_init__(
|
|
llm_config,
|
|
quantization_config,
|
|
model_id,
|
|
_runtime,
|
|
args,
|
|
{**model_kwds, **normalized_model_kwds},
|
|
{**tokenizer_kwds, **normalized_tokenizer_kwds},
|
|
_tag,
|
|
_adapters_mapping,
|
|
_model_version,
|
|
_quantize_method,
|
|
_serialisation_format,
|
|
)
|
|
# handle trust_remote_code
|
|
self.__llm_trust_remote_code__ = self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"])
|
|
|
|
self.llm_post_init()
|
|
|
|
# we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
|
|
if bettertransformer is True:
|
|
logger.debug("Using %r with BetterTransformer", self)
|
|
self.bettertransformer = bettertransformer
|
|
else:
|
|
non_intrusive_setattr(self, "bettertransformer", self.config["bettertransformer"])
|
|
# If LoRA is passed, then disable bettertransformer
|
|
if _adapters_mapping and self.bettertransformer is True:
|
|
logger.debug("LoRA is visible for %s, disabling BetterTransformer", self)
|
|
self.bettertransformer = False
|
|
|
|
def __setattr__(self, attr: str, value: t.Any) -> None:
|
|
if attr in _reserved_namespace:
|
|
raise ForbiddenAttributeError(
|
|
f"{attr} should not be set during runtime "
|
|
f"as these value will be reflected during runtime. "
|
|
f"Instead, you can create a custom LLM subclass {self.__class__.__name__}."
|
|
)
|
|
|
|
super().__setattr__(attr, value)
|
|
|
|
@property
|
|
def adapters_mapping(self) -> AdaptersMapping | None:
|
|
return self._adapters_mapping
|
|
|
|
@adapters_mapping.setter
|
|
def adapters_mapping(self, value: AdaptersMapping) -> None:
|
|
self._adapters_mapping = value
|
|
|
|
@property
|
|
def __repr_keys__(self) -> set[str]:
|
|
return {"model_id", "runner_name", "config", "adapters_mapping", "runtime", "tag"}
|
|
|
|
def __repr_args__(self) -> ReprArgs:
|
|
for k in self.__repr_keys__:
|
|
if k == "config":
|
|
yield k, self.config.model_dump(flatten=True)
|
|
else:
|
|
yield k, getattr(self, k)
|
|
|
|
@property
|
|
def model_id(self) -> str:
|
|
return self._model_id
|
|
|
|
@property
|
|
def runtime(self) -> t.Literal["ggml", "transformers"]:
|
|
return self._runtime
|
|
|
|
@property
|
|
def runner_name(self) -> str:
|
|
return f"llm-{self.config['start_name']}-runner"
|
|
|
|
# NOTE: The section below defines a loose contract with langchain's LLM interface.
|
|
@property
|
|
def llm_type(self) -> str:
|
|
return normalise_model_name(self._model_id)
|
|
|
|
@property
|
|
def identifying_params(self) -> DictStrAny:
|
|
return {
|
|
"configuration": self.config.model_dump_json().decode(),
|
|
"model_ids": orjson.dumps(self.config["model_ids"]).decode(),
|
|
}
|
|
|
|
@property
|
|
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]:
|
|
"""Returning the processed model and tokenizer parameters.
|
|
|
|
These can then be used with 'import_model' or any other place that requires loading model and tokenizer.
|
|
|
|
See 'openllm.cli.download_models' for example usage.
|
|
|
|
Returns:
|
|
``tuple``: It returns a tuple of (model_args, model_kwargs) & tokenizer_kwargs
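For example, given an initialised ``llm`` (a sketch):

```python
(model_args, model_attrs), tokenizer_attrs = llm.llm_parameters
```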
|
|
"""
|
|
return (self._model_decls, self._model_attrs), self._tokenizer_attrs
|
|
|
|
@property
|
|
def tag(self) -> bentoml.Tag:
|
|
return self._tag
|
|
|
|
def ensure_model_id_exists(self) -> bentoml.Model:
|
|
"""This utility function will download the model if it doesn't exist yet.
|
|
|
|
Make sure to call this function if 'ensure_available' is not set during
|
|
Auto LLM initialisation.
|
|
|
|
The equivalent for ``openllm.Runner`` is ``openllm.Runner.download_model``.
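For example (a sketch):

```python
llm = openllm.AutoLLM.for_model("dolly-v2")
llm.ensure_model_id_exists()
```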
|
|
"""
|
|
return openllm.import_model(
|
|
self.config["start_name"],
|
|
model_id=self.model_id,
|
|
model_version=self._model_version,
|
|
runtime=self.runtime,
|
|
implementation=self.__llm_implementation__,
|
|
quantize=self._quantize_method,
|
|
serialisation_format=self._serialisation_format,
|
|
)
|
|
|
|
@property
|
|
def _bentomodel(self) -> bentoml.Model:
|
|
if self.__llm_bentomodel__ is None:
|
|
self.__llm_bentomodel__ = openllm.serialisation.get(self)
|
|
return self.__llm_bentomodel__
|
|
|
|
@property
|
|
def model(self) -> M:
|
|
"""The model to use for this LLM. This shouldn't be set at runtime, rather let OpenLLM handle it."""
|
|
# Run check for GPU
|
|
if self.config["requires_gpu"] and openllm.utils.device_count() < 1:
|
|
raise GpuNotAvailableError(f"{self} only supports running with GPU (None available).") from None
|
|
|
|
if self.__llm_model__ is None:
|
|
# NOTE: the signature of load_model here is the wrapper under _wrapped_load_model
|
|
self.__llm_model__ = self.load_model(*self._model_decls, **self._model_attrs)
|
|
return self.__llm_model__
|
|
|
|
@property
|
|
def tokenizer(self) -> T:
|
|
"""The tokenizer to use for this LLM. This shouldn't be set at runtime, rather let OpenLLM handle it."""
|
|
if self.__llm_tokenizer__ is None:
|
|
# NOTE: the signature of load_tokenizer here is the wrapper under _wrapped_load_tokenizer
|
|
self.__llm_tokenizer__ = self.load_tokenizer(**self._tokenizer_attrs)
|
|
return self.__llm_tokenizer__
|
|
|
|
def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig:
|
|
strategy = first_not_none(
|
|
self.config["fine_tune_strategies"].get(_adapter_type),
|
|
default=FineTuneConfig(adapter_type=t.cast("PeftType", _adapter_type), llm_config_class=self.config_class),
|
|
)
|
|
return strategy.eval() if inference_mode else strategy.train()
|
|
|
|
def _transpose_adapter_mapping(
|
|
self,
|
|
inference_mode: bool = True,
|
|
use_cache: bool = True,
|
|
) -> ResolvedAdaptersMapping:
|
|
if self._adapters_mapping is None:
|
|
raise ValueError("LoRA mapping is not set up correctly.")
|
|
|
|
if use_cache and self.__llm_adapter_map__ is not None:
|
|
# early out if we already serialized everything.
|
|
return self.__llm_adapter_map__
|
|
|
|
if not use_cache:
|
|
logger.debug("Adapter mapping resolution will not be cached. This should only be used during training.")
|
|
|
|
adapter_map: ResolvedAdaptersMapping = {k: {} for k in self._adapters_mapping}
|
|
# temporary check: the first adapter without a name is accepted as 'default';
# a second adapter with its name set to None will raise an error in a later iteration.
|
|
_converted_first_none = False
|
|
for _adapter_type, _adapters_tuples in self._adapters_mapping.items():
|
|
default_config = self._default_ft_config(_adapter_type, inference_mode)
|
|
for adapter in _adapters_tuples:
|
|
if not adapter.name and _converted_first_none:
|
|
raise ValueError(
|
|
f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}"
|
|
)
|
|
name = adapter.name
|
|
if name is None:
|
|
_converted_first_none = True
|
|
name = "default"
|
|
peft_config = (
|
|
default_config.with_config(**adapter.config).to_peft_config()
|
|
if name == "default"
|
|
else FineTuneConfig(
|
|
adapter_type=t.cast("PeftType", _adapter_type),
|
|
adapter_config=adapter.config,
|
|
inference_mode=inference_mode,
|
|
llm_config_class=self.config_class,
|
|
).to_peft_config()
|
|
)
|
|
adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id)
|
|
|
|
if self.__llm_adapter_map__ is None and use_cache:
|
|
self.__llm_adapter_map__ = adapter_map
|
|
return adapter_map
|
|
|
|
@requires_dependencies("peft", extra="fine-tune")
|
|
def prepare_for_training(
|
|
self,
|
|
adapter_type: AdapterType = "lora",
|
|
use_gradient_checkpointing: bool = True,
|
|
**attrs: t.Any,
|
|
) -> tuple[peft.PeftModel, T]:
|
|
from peft import prepare_model_for_kbit_training
|
|
|
|
peft_config = (
|
|
self.config["fine_tune_strategies"]
|
|
.get(
|
|
adapter_type,
|
|
FineTuneConfig(
|
|
adapter_type=t.cast("PeftType", adapter_type),
|
|
llm_config_class=self.config_class,
|
|
),
|
|
)
|
|
.train()
|
|
.with_config(**attrs)
|
|
.to_peft_config()
|
|
)
|
|
wrapped_peft = peft.get_peft_model(
|
|
prepare_model_for_kbit_training(
|
|
self.model,
|
|
use_gradient_checkpointing=use_gradient_checkpointing,
|
|
),
|
|
peft_config,
|
|
)
|
|
if DEBUG:
|
|
wrapped_peft.print_trainable_parameters()
|
|
return wrapped_peft, self.tokenizer
|
|
|
|
@requires_dependencies("peft", extra="fine-tune")
|
|
def apply_adapter(
|
|
self,
|
|
inference_mode: bool = True,
|
|
adapter_type: AdapterType = "lora",
|
|
load_adapters: t.Literal["all"] | list[str] | None = None,
|
|
use_cache: bool = True,
|
|
) -> peft.PeftModel | M:
|
|
"""Apply given LoRA mapping to the model.
|
|
|
|
Note that the base model can still be accessed via self.model.get_base_model().
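For example (a sketch; assumes the LLM was created with an adapter_map):

```python
model = llm.apply_adapter(inference_mode=True, load_adapters="all")
```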
|
|
"""
|
|
assert self.__llm_model__ is not None # noqa: S101
|
|
|
|
# early out if _adapters_mapping is empty or it is already wrapped with peft.
|
|
if not self._adapters_mapping:
|
|
logger.debug("No adapter mapping is found. Skip applying adapter.")
|
|
return self.__llm_model__
|
|
if isinstance(self.__llm_model__, peft.PeftModel):
|
|
logger.debug("Model is already wrapped with peft. Skip applying adapter.")
|
|
return self.__llm_model__
|
|
|
|
_mapping = self._transpose_adapter_mapping(inference_mode=inference_mode, use_cache=use_cache)
|
|
if adapter_type not in _mapping:
|
|
raise ValueError(
|
|
f"Given adapter type {adapter_type} is not supported. Please choose from {list(_mapping.keys())}"
|
|
)
|
|
adapter_mapping = _mapping[adapter_type]
|
|
|
|
self.__llm_model__ = self._wrap_default_peft_model(adapter_mapping, inference_mode=inference_mode)
|
|
|
|
if not isinstance(self.__llm_model__, peft.PeftModel):
|
|
# We hit this branch during inference
|
|
# TODO: load multiple adapters
|
|
return self.__llm_model__
|
|
|
|
# now we loop through the rest with add_adapter
|
|
if len(adapter_mapping) > 0:
|
|
for adapter_name, (_peft_config, _) in adapter_mapping.items():
|
|
self.__llm_model__.add_adapter(adapter_name, _peft_config)
|
|
|
|
# optionally load adapters. In case of multiple adapters, or on Runner,
|
|
# we will need to set load_adapters='all'
|
|
if load_adapters is not None:
|
|
adapters_to_load = adapter_mapping.keys() if load_adapters == "all" else load_adapters
|
|
for adapter_name in adapters_to_load:
|
|
_peft_config, _peft_model_id = adapter_mapping[adapter_name]
|
|
self.__llm_model__.load_adapter(
|
|
_peft_model_id,
|
|
adapter_name=adapter_name,
|
|
is_trainable=not inference_mode,
|
|
**dict(_peft_config.to_dict()),
|
|
)
|
|
|
|
return self.__llm_model__
|
|
|
|
# XXX: Until peft publishes the py.typed PR, we will need to type the wrapper as t.Any (not ideal, since we are duck-typing the actual Peft class here).
|
|
def _wrap_default_peft_model(
|
|
self, adapter_mapping: dict[str, tuple[peft.PeftConfig, str]], inference_mode: bool
|
|
) -> t.Any:
|
|
assert self.__llm_model__ is not None, "Error: Model is not loaded correctly" # noqa: S101
|
|
if isinstance(self.__llm_model__, peft.PeftModel):
|
|
logger.warning("Model is already wrapped with peft. Skip wrapping with default peft model.")
|
|
return self.__llm_model__
|
|
|
|
if "default" not in adapter_mapping:
|
|
raise ValueError(
|
|
"There is no 'default' mapping. Please check the adapter mapping and report this bug to the OpenLLM team."
|
|
)
|
|
|
|
default_config, peft_model_id = adapter_mapping.pop("default")
|
|
|
|
# the below shares similar logic with `get_peft_model`
|
|
# TODO: Support PromptLearningConfig
|
|
if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(
|
|
default_config, peft.PromptLearningConfig
|
|
):
|
|
logger.debug(
|
|
"Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.",
|
|
default_config.task_type,
|
|
)
|
|
model = peft.PeftModel(self.__llm_model__, default_config)
|
|
else:
|
|
# XXX: this is not ideal to serialize like this, maybe for fine-tune we will only support 0.4.0
|
|
# onwards. For now, keep this logic here.
|
|
peft_class = peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING[default_config.task_type]
|
|
if default_config.base_model_name_or_path:
|
|
kwargs: DictStrAny = {"is_trainable": not inference_mode}
|
|
if "config" in inspect.signature(peft_class.from_pretrained).parameters:
|
|
kwargs["config"] = default_config
|
|
else:
|
|
kwargs.update(dict(default_config.to_dict().items()))
|
|
# BUG: This hits during inference, need fixing
|
|
model = peft_class.from_pretrained(self.__llm_model__, peft_model_id, **kwargs)
|
|
else:
|
|
# in this case, the given base_model_name_or_path is None. This will be hit during training
|
|
model = peft_class(self.__llm_model__, default_config)
|
|
return model
|
|
|
|
# order of these fields matter here, make sure to sync it with
|
|
# openllm.models.auto.factory.BaseAutoLLMClass.for_model
|
|
def to_runner(
|
|
self,
|
|
models: list[bentoml.Model] | None = None,
|
|
max_batch_size: int | None = None,
|
|
max_latency_ms: int | None = None,
|
|
scheduling_strategy: type[bentoml.Strategy] | None = None,
|
|
) -> LLMRunner:
|
|
"""Convert this LLM into a Runner.
|
|
|
|
Args:
|
|
models: Any additional ``bentoml.Model`` to be included with this runner.
        By default, this will be determined from the model_name.
|
|
max_batch_size: The maximum batch size for the runner.
|
|
max_latency_ms: The maximum latency for the runner.
|
|
strategy: The strategy to use for this runner.
|
|
embedded: Whether to run this runner in embedded mode.
|
|
scheduling_strategy: Whether to create a custom scheduling strategy for this Runner.
|
|
|
|
Returns:
|
|
A generated LLMRunner for this LLM.
|
|
|
|
NOTE: There are some differences between bentoml.models.get().to_runner() and LLM.to_runner():

- 'name': will be generated by OpenLLM, hence users shouldn't worry about this.
|
|
The generated name will be 'llm-<model-start-name>-runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner)
|
|
- 'embedded': Will be disabled by default. There is no reason to run LLM in embedded mode.
|
|
- 'method_configs': The method configs for the runner will be managed internally by OpenLLM.
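For example, given an initialised ``llm`` (a sketch; 'generate' is exposed as a runner method per the method configs below):

```python
runner = llm.to_runner()
runner.init_local(quiet=True)  # for local development only
result = runner.generate.run("What is the meaning of life?")
```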
|
|
"""
|
|
models = models if models is not None else []
|
|
|
|
try:
|
|
models.append(self._bentomodel)
|
|
except bentoml.exceptions.NotFound:
|
|
models.append(openllm.serialisation.get(self, auto_import=True))
|
|
|
|
if scheduling_strategy is None:
|
|
from ._strategies import CascadingResourceStrategy
|
|
|
|
scheduling_strategy = CascadingResourceStrategy
|
|
|
|
generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
|
|
embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
|
|
generate_iterator_sig = ModelSignature.from_dict(
|
|
t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True))
|
|
)
|
|
|
|
# NOTE: returning the two langchain API's to the runner
|
|
return llm_runner_class(self)(
|
|
llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig),
|
|
name=self.runner_name,
|
|
embedded=False,
|
|
models=models,
|
|
max_batch_size=max_batch_size,
|
|
max_latency_ms=max_latency_ms,
|
|
method_configs=bentoml_cattr.unstructure(
|
|
{
|
|
"embeddings": embeddings_sig,
|
|
"__call__": generate_sig,
|
|
"generate": generate_sig,
|
|
"generate_one": generate_sig,
|
|
"generate_iterator": generate_iterator_sig,
|
|
}
|
|
),
|
|
scheduling_strategy=scheduling_strategy,
|
|
)
|
|
|
|
def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
|
|
"""The scikit-compatible API for self(...)."""
|
|
return self.__call__(prompt, **attrs)
|
|
|
|
def __call__(self, prompt: str, **attrs: t.Any) -> t.Any:
|
|
"""Returns the generation result and format the result.
|
|
|
|
First, it runs `self.sanitize_parameters` to sanitize the parameters.
Then the sanitized prompt and kwargs will be passed into self.generate.
Finally, it runs self.postprocess_generate to postprocess the generated result.
|
|
|
|
This allows users to do the following:
|
|
|
|
```python
|
|
llm = openllm.AutoLLM.for_model("dolly-v2")
|
|
llm("What is the meaning of life?")
|
|
```
|
|
"""
|
|
prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **attrs)
|
|
generated_result = self.generate(prompt, **generate_kwargs)
|
|
return self.postprocess_generate(prompt, generated_result, **postprocess_kwargs)
|
|
|
|
|
|
@overload
|
|
def Runner(
|
|
model_name: str,
|
|
*,
|
|
model_id: str | None = None,
|
|
model_version: str | None = ...,
|
|
init_local: t.Literal[False, True] = ...,
|
|
**attrs: t.Any,
|
|
) -> LLMRunner:
|
|
...
|
|
|
|
|
|
@overload
|
|
def Runner(
|
|
model_name: str,
|
|
*,
|
|
model_id: str = ...,
|
|
model_version: str | None = ...,
|
|
models: list[bentoml.Model] | None = ...,
|
|
max_batch_size: int | None = ...,
|
|
max_latency_ms: int | None = ...,
|
|
method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ...,
|
|
embedded: t.Literal[True, False] = ...,
|
|
scheduling_strategy: type[bentoml.Strategy] | None = ...,
|
|
**attrs: t.Any,
|
|
) -> LLMRunner:
|
|
...
|
|
|
|
|
|
@overload
|
|
def Runner(
|
|
model_name: str,
|
|
*,
|
|
ensure_available: bool | None = None,
|
|
init_local: bool = ...,
|
|
implementation: LiteralRuntime | None = None,
|
|
llm_config: openllm.LLMConfig | None = None,
|
|
**attrs: t.Any,
|
|
) -> LLMRunner:
|
|
...
|
|
|
|
|
|
@overload
|
|
def Runner(
|
|
model_name: str,
|
|
*args: t.Any,
|
|
model_id: str | None = ...,
|
|
model_version: str | None = ...,
|
|
llm_config: openllm.LLMConfig | None = ...,
|
|
runtime: t.Literal["ggml", "transformers"] | None = ...,
|
|
quantize: t.Literal["int8", "int4", "gptq"] | None = ...,
|
|
bettertransformer: str | bool | None = ...,
|
|
adapter_id: str | None = ...,
|
|
adapter_name: str | None = ...,
|
|
adapter_map: dict[str, str | None] | None = ...,
|
|
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
|
|
serialisation: t.Literal["safetensors", "legacy"] = ...,
|
|
**attrs: t.Any,
|
|
) -> LLMRunner:
|
|
...
|
|
|
|
|
|
def Runner(
|
|
model_name: str,
|
|
ensure_available: bool | None = None,
|
|
init_local: bool = False,
|
|
implementation: LiteralRuntime | None = None,
|
|
llm_config: openllm.LLMConfig | None = None,
|
|
**attrs: t.Any,
|
|
) -> LLMRunner:
|
|
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.
|
|
|
|
The behaviour of `ensure_available` (which is synonymous to the one in `AutoLLM.for_model`) depends on `init_local`.
By default, `ensure_available` mirrors `init_local`, meaning that when the runner is created inside a service,
it won't download the model. So before running your BentoML Service, you should add an `on_startup`
hook to download the model if you don't want to do it manually:
|
|
|
|
```python
|
|
|
|
runner = openllm.Runner("dolly-v2")
|
|
|
|
@svc.on_startup
|
|
def download():
|
|
runner.download_model()
|
|
```
|
|
|
|
If `init_local=True` (for development workflows), it will also enable `ensure_available`.
The default value of `ensure_available` is None: if set, that value is used, otherwise it falls back to the aforementioned behaviour.
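For a development workflow, for instance (a sketch):

```python
runner = openllm.Runner("dolly-v2", init_local=True)
```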
|
|
|
|
Args:
|
|
model_name: Supported model name from 'openllm models'
|
|
ensure_available: If True, it will download the model if it is not available. If False, it will skip downloading the model.
|
|
If False, make sure the model is available locally.
|
|
implementation: The given Runner implementation one choose for this Runner. By default, it is retrieved from the enviroment variable
|
|
of the respected model_name. For example: 'flan-t5' -> "OPENLLM_FLAN_T5_FRAMEWORK"
|
|
llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``.
|
|
init_local: If True, it will initialize the model locally. This is useful if you want to
|
|
run the model locally. (Symmetrical to bentoml.Runner.init_local())
|
|
**attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs
|
|
behaviour
|
|
"""
|
|
if llm_config is not None:
|
|
attrs.update(
|
|
{
|
|
"bettertransformer": llm_config["env"]["bettertransformer_value"],
|
|
"quantize": llm_config["env"]["quantize_value"],
|
|
"runtime": llm_config["env"]["runtime_value"],
|
|
"serialisation": first_not_none(
|
|
os.getenv("OPENLLM_SERIALIZATION"), attrs.get("serialisation"), default="safetensors"
|
|
),
|
|
}
|
|
)
|
|
|
|
default_implementation = llm_config["default_implementation"] if llm_config is not None else "pt"
|
|
|
|
implementation = first_not_none(
|
|
implementation, default=EnvVarMixin(model_name, default_implementation)["framework_value"]
|
|
)
|
|
|
|
runner = openllm.infer_auto_class(implementation).create_runner(
|
|
model_name,
|
|
llm_config=llm_config,
|
|
ensure_available=ensure_available if ensure_available is not None else init_local,
|
|
**attrs,
|
|
)
|
|
|
|
if init_local:
|
|
runner.init_local(quiet=True)
|
|
|
|
return runner
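

# Hedged usage sketch (illustrative only; nothing runs at import time): creating a quantised
# Runner with an adapter map and wiring it into a BentoML service. The service name, model id,
# and adapter mapping below are assumptions made for illustration.
def _example_quantised_runner_service() -> bentoml.Service:
    runner = Runner(
        "llama",
        model_id="huggyllama/llama-7b",
        quantize="int8",
        adapter_map={"my-org/my-lora-adapter": "default"},  # hypothetical adapter id/name pair
    )
    # Ensure the weights exist before serving; inside a service you could instead attach this
    # call to an `on_startup` hook, as the Runner docstring above shows.
    runner.download_model()
    return bentoml.Service(name="example-llm-service", runners=[runner])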


def method_signature(sig: ModelSignature) -> ModelSignatureDict:
    return bentoml_cattr.unstructure(sig)
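

# Hedged sketch (illustrative only): ``method_signature`` unstructures a bentoml
# ``ModelSignature`` into its dict form so it can be splatted into ``bentoml.Runnable.method``.
# The ``batchable=False`` signature below is an assumption for illustration.
def _example_method_signature() -> ModelSignatureDict:
    # unstructure() turns the attrs-based ModelSignature into a plain dict of its fields.
    return method_signature(ModelSignature(batchable=False))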


class SetAdapterOutput(t.TypedDict):
    success: bool
    message: str


def llm_runnable_class(
    self: openllm.LLM[M, T],
    embeddings_sig: ModelSignature,
    generate_sig: ModelSignature,
    generate_iterator_sig: ModelSignature,
) -> type[LLMRunnable]:
    class _Runnable(bentoml.Runnable):
        SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu", "cpu")
        SUPPORTS_CPU_MULTI_THREADING = True

        def __init__(__self: _Runnable):
            # NOTE: The side effect of this line is that it will load the imported model
            # during runner startup. So don't remove it!!
            __self.model = self.model  # keep a loaded reference
            if self.adapters_mapping is not None:
                logger.info("Applying LoRA to %s...", self.runner_name)
                self.apply_adapter(inference_mode=True, load_adapters="all")

        @bentoml.Runnable.method(batchable=False)
        def set_adapter(__self: _Runnable, adapter_name: str) -> SetAdapterOutput:
            success = False
            message = None
            if not is_peft_available():
                message = "peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'"
            elif self.__llm_adapter_map__ is None:
                message = "No adapters available for current running server."
            elif not isinstance(__self.model, peft.PeftModel):
                message = "Model is not a PeftModel"
            if message is not None:
                return SetAdapterOutput(success=success, message=message)

            try:
                t.cast("peft.PeftModel", __self.model).set_adapter(adapter_name)
                return SetAdapterOutput(success=True, message=f"Successfully set current adapter to {adapter_name}")
            except ValueError:
                logger.info("Adapter %s not found", adapter_name)
                return SetAdapterOutput(
                    success=success,
                    message=f"Adapter {adapter_name} not found. Available adapters: {list(t.cast('peft.PeftModel', __self.model).peft_config)}",
                )

        @bentoml.Runnable.method(**method_signature(embeddings_sig))
        def embeddings(__self: _Runnable, prompt: str | list[str]) -> LLMEmbeddings:
            if isinstance(prompt, str):
                prompt = [prompt]
            return self.embeddings(prompt)

        @bentoml.Runnable.method(**method_signature(generate_sig))
        def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
            return self.generate(prompt, **attrs)

        @bentoml.Runnable.method(**method_signature(generate_sig))
        def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
            return self.generate(prompt, **attrs)

        @bentoml.Runnable.method(**method_signature(generate_sig))
        def generate_one(
            __self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any
        ) -> t.Sequence[dict[t.Literal["generated_text"], str]]:
            return self.generate_one(prompt, stop, **attrs)

        @bentoml.Runnable.method(**method_signature(generate_iterator_sig))
        def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[t.Any, None, None]:
            yield self.generate_iterator(prompt, **attrs)

    return types.new_class(
        self.__class__.__name__ + "Runnable",
        (_Runnable,),
        {},
        lambda ns: ns.update(
            {
                "SUPPORTED_RESOURCES": ("nvidia.com/gpu", "amd.com/gpu")
                if self.config["requires_gpu"]
                else ("nvidia.com/gpu", "amd.com/gpu", "cpu"),
                "__module__": self.__module__,
                "__doc__": self.config["env"].start_docstring,
            }
        ),
    )
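

# Hedged usage sketch (illustrative only; nothing runs at import time): once the Runnable class
# above is wrapped into a Runner, each ``bentoml.Runnable.method`` becomes a runner method.
# The adapter name "my-lora" is an assumption for illustration.
def _example_runnable_methods(runner: LLMRunner) -> None:
    runner.init_local(quiet=True)  # load the model in-process, mirroring Runner(init_local=True)
    print(runner.generate.run("What is the meaning of life?"))
    print(runner.embeddings.run(["What is the meaning of life?"]))
    print(runner.set_adapter.run("my-lora"))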


def llm_runner_class(self: openllm.LLM[M, T]) -> type[LLMRunner]:
    def available_adapters(__self: LLMRunner) -> PeftAdapterOutput:
        if not is_peft_available():
            return {
                "success": False,
                "result": {},
                "error_msg": "peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'",
            }
        if self.__llm_adapter_map__ is None:
            return {
                "success": False,
                "result": {},
                "error_msg": "No adapters available for current running server.",
            }
        if not isinstance(__self.model, peft.PeftModel):
            return {"success": False, "result": {}, "error_msg": "Model is not a PeftModel"}
        return {"success": True, "result": __self.model.peft_config, "error_msg": ""}

    def _wrapped_generate_run(__self: LLMRunner, prompt: str, **kwargs: t.Any) -> t.Any:
        """Wrapper for runner.generate.run() that handles prompt sanitisation and postprocessing.

        This will be used for the LangChain API.

        Usage:
        ```python
        runner = openllm.Runner("dolly-v2", init_local=True)
        runner("What is the meaning of life?")
        ```
        """
        prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **kwargs)
        generated_result = __self.generate.run(prompt, **generate_kwargs)
        return self.postprocess_generate(prompt, generated_result, **postprocess_kwargs)

    def _wrapped_embeddings_run(__self: LLMRunner, prompt: str | list[str]) -> LLMEmbeddings:
        """``llm.embed`` is a light wrapper around ``runner.embeddings.run()``.

        Usage:
        ```python
        runner = openllm.Runner('llama', implementation='pt')
        runner.embed("What is the meaning of life?")
        ```
        """
        if isinstance(prompt, str):
            prompt = [prompt]
        return __self.embeddings.run(prompt)

    def _wrapped_repr_keys(_: LLMRunner) -> set[str]:
        return {"config", "llm_type", "runner_methods", "runtime", "llm_tag"}

    def _wrapped_repr_args(__self: LLMRunner) -> ReprArgs:
        yield "runner_methods", {
            method.name: {
                "batchable": method.config.batchable,
                "batch_dim": method.config.batch_dim if method.config.batchable else None,
            }
            for method in __self.runner_methods
        }
        yield "config", self.config.model_dump(flatten=True)
        yield "llm_type", __self.llm_type
        yield "runtime", self.runtime
        yield "llm_tag", self.tag

    return types.new_class(
        self.__class__.__name__ + "Runner",
        (bentoml.Runner,),
        exec_body=lambda ns: ns.update(
            {
                "llm_type": self.llm_type,
                "identifying_params": self.identifying_params,
                "llm_tag": self.tag,
                "llm": self,  # NOTE: self reference to LLM
                "config": self.config,
                "peft_adapters": property(fget=available_adapters),
                "download_model": self.ensure_model_id_exists,
                "__call__": _wrapped_generate_run,
                "embed": _wrapped_embeddings_run,
                "__module__": self.__module__,
                "__doc__": self.config["env"].start_docstring,
                "__repr__": ReprMixin.__repr__,
                "__repr_keys__": property(_wrapped_repr_keys),
                "__repr_args__": _wrapped_repr_args,
            }
        ),
    )
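

# Hedged sketch (illustrative only; nothing runs at import time): the attributes attached in
# ``llm_runner_class`` above give the returned Runner a small convenience surface on top of the
# regular runner methods. The "dolly-v2" model name is an assumption for illustration.
def _example_runner_surface() -> None:
    runner = Runner("dolly-v2", init_local=True)
    print(runner("What is the meaning of life?"))        # routed through _wrapped_generate_run
    print(runner.embed("What is the meaning of life?"))  # routed through _wrapped_embeddings_run
    print(runner.peft_adapters)                          # property backed by available_adapters
    runner.download_model()                              # bound to LLM.ensure_model_id_exists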