# mypy: disable-error-code="name-defined,attr-defined"
from __future__ import annotations
import abc
import gc
import inspect
import logging
import os
import types
import typing as t

import attr
import fs.path
import inflection
import orjson

from huggingface_hub import hf_hub_download

import bentoml
import openllm
import openllm_core

from bentoml._internal.models.model import ModelSignature
from openllm_core._configuration import FineTuneConfig
from openllm_core._configuration import LLMConfig
from openllm_core._schema import EmbeddingsOutput
from openllm_core._typing_compat import AdaptersMapping
from openllm_core._typing_compat import AdaptersTuple
from openllm_core._typing_compat import AdapterType
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import LiteralBackend
from openllm_core._typing_compat import LiteralString
from openllm_core._typing_compat import LLMRunnable
from openllm_core._typing_compat import LLMRunner
from openllm_core._typing_compat import M
from openllm_core._typing_compat import ModelSignatureDict
from openllm_core._typing_compat import PeftAdapterOutput
from openllm_core._typing_compat import T
from openllm_core._typing_compat import TupleAny
from openllm_core._typing_compat import overload
from openllm_core.utils import DEBUG
from openllm_core.utils import MYPY
from openllm_core.utils import EnvVarMixin
from openllm_core.utils import LazyLoader
from openllm_core.utils import ReprMixin
from openllm_core.utils import apply
from openllm_core.utils import bentoml_cattr
from openllm_core.utils import codegen
from openllm_core.utils import device_count
from openllm_core.utils import first_not_none
from openllm_core.utils import generate_hash_from_file
from openllm_core.utils import is_peft_available
from openllm_core.utils import is_torch_available
from openllm_core.utils import normalize_attrs_to_model_tokenizer_pair
from openllm_core.utils import resolve_filepath
from openllm_core.utils import validate_is_path

from ._assign import make_llm_attributes
from ._quantisation import infer_quantisation_config
from .exceptions import ForbiddenAttributeError
from .exceptions import GpuNotAvailableError
from .exceptions import OpenLLMException
from .utils import infer_auto_class

if t.TYPE_CHECKING:

  import auto_gptq as autogptq
  import peft
  import torch
  import transformers

  from openllm_core._configuration import PeftType
  from openllm_core.utils.representation import ReprArgs
else:
  autogptq = LazyLoader('autogptq', globals(), 'auto_gptq')
  transformers = LazyLoader('transformers', globals(), 'transformers')
  torch = LazyLoader('torch', globals(), 'torch')
  peft = LazyLoader('peft', globals(), 'peft')

ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConfig', str]]]

logger = logging.getLogger(__name__)

def normalise_model_name(name: str) -> str:
  if validate_is_path(name): return os.path.basename(resolve_filepath(name))
  name = name.replace('/', '--')
  return inflection.dasherize(name)
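
# For reference (illustrative values): normalise_model_name('facebook/opt-1.3b') -> 'facebook--opt-1.3b',
# while a local path such as '/tmp/my-weights' resolves to its basename, 'my-weights'.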

# the below is similar to peft.utils.other.CONFIG_NAME
PEFT_CONFIG_NAME = 'adapter_config.json'

def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapping:
  '''Resolve the type of the PeftConfig given the adapter_map.

  This is similar to how PeftConfig resolves its config type.

  Args:
    adapter_map: The given mapping from either SDK or CLI. See CLI docs for more information.
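
  A minimal sketch (the adapter id below is a placeholder; any local directory containing an 'adapter_config.json' works as well):

  ```python
  mapping = resolve_peft_config_type({'tloen/alpaca-lora-7b': None})
  # -> {'lora': (AdaptersTuple(adapter_id='tloen/alpaca-lora-7b', name='default', config={...}),)}
  ```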
  '''
  resolved: AdaptersMapping = {}
  _has_set_default = False
  for path_or_adapter_id, name in adapter_map.items():
    resolve_name = name
    if resolve_name is None:
      if _has_set_default: raise ValueError('Only one adapter can be set as default.')
      resolve_name = 'default'
      _has_set_default = True
    if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)):
      config_file = os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
    else:
      try:
        config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
      except Exception as err:
        raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
    with open(config_file, 'r') as file:
      resolved_config = orjson.loads(file.read())
    # all peft_type should be available in PEFT_CONFIG_NAME
    _peft_type: AdapterType = resolved_config['peft_type'].lower()
    if _peft_type not in resolved: resolved[_peft_type] = ()
    resolved[_peft_type] += (_AdaptersTuple((path_or_adapter_id, resolve_name, resolved_config)),)
  return resolved

_reserved_namespace = {'config_class', 'model', 'tokenizer', 'import_kwargs'}

class LLMFunction(abc.ABC):
  @abc.abstractmethod
  def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
    '''This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decides to do so).

    You can customize how the output of the LLM looks with this hook. By default, it is a simple echo.

    > [!NOTE]
    > This will be used from the client side.
    '''
    raise NotImplementedError

  @abc.abstractmethod
  def generate(self, prompt: str, **preprocess_generate_kwds: t.Any) -> t.Any:
    '''Text generation implementation for any given prompt.

    It takes the prompt and 'generation_kwargs'. The main implementation will parse all of the kwargs
    correctly for you, so that subclass implementations don't have to repeat this boilerplate.
    '''
    raise NotImplementedError

  @abc.abstractmethod
  def generate_iterator(self, prompt: str, /, **attrs: t.Any) -> t.Iterator[t.Any]:
    '''The iterator implementation of generate.

    This will be used for token streaming and SSE support.

    Args:
      prompt: the input prompt
      **attrs: Relevant attributes to be passed to the stream generation implementation.

    Returns:
      An iterator of incoming token generation. Each item is a dictionary containing the generated text, token usage and the finish reason.
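
    A sketch of consuming the stream (``llm`` is any constructed ``openllm.LLM``):

    ```python
    for chunk in llm.generate_iterator('What is the meaning of life?'):
      print(chunk['text'])
    ```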
    '''
    raise NotImplementedError

  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
    '''The entrypoint for generating one prompt.

    This provides additional stop tokens for generating per token level. This is useful when running with agents, or initial streaming support.
    '''
    raise NotImplementedError

  def embeddings(self, prompts: list[str]) -> EmbeddingsOutput:
    '''The implementation for generating text embeddings from a given prompt.

    It takes a list of prompts and outputs the embeddings for this given LLM.

    Returns:
      The embeddings for the given prompt.
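
    A sketch of a direct call (only available when the backend implements it; see ``supports_embeddings``):

    ```python
    result = llm.embeddings(['What is the meaning of life?'])  # returns an EmbeddingsOutput
    ```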
    '''
    raise NotImplementedError

class LLMSerialisation(abc.ABC, t.Generic[M, T]):
  def import_model(self, *args: t.Any, trust_remote_code: bool, **attrs: t.Any) -> bentoml.Model:
    '''Import both model and tokenizer weights as a BentoML model.

    Note that tokenizer attrs can be accessed via ``llm.llm_parameters``.

    ```python
    _, tokenizer_attrs = llm.llm_parameters
    ```

    By default, `model_decls` and `model_attrs` are already sanitised and concatenated into `args` and `attrs`.
    '''
    raise NotImplementedError

  def load_model(self, *args: t.Any, **attrs: t.Any) -> M:
    '''This function can be implemented to override the default load_model behaviour.

    See falcon for example implementation. Tag can be accessed via ``self.tag``.
    '''
    raise NotImplementedError

  def load_tokenizer(self, tag: bentoml.Tag, **attrs: t.Any) -> T:
    '''This function can be implemented to override how to load the tokenizer.

    See falcon for example implementation.
    '''
    raise NotImplementedError

class LLMInterface(LLMFunction, LLMSerialisation[M, T], abc.ABC):
  def llm_post_init(self) -> None:
    '''This function can be implemented if you need to initialise any additional variables that don't concern OpenLLM internals.

    By default, this will add `self.device` if the implementation is PyTorch.
    '''
    pass

  def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
    '''This handler will sanitize all attrs and setup prompt text.

    It takes a prompt that is given by the user, and attrs that can be parsed with the prompt.

    Returns a tuple of three items:
    - The processed prompt text.
    - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig.
    - The attributes dictionary that will be passed into `self.postprocess_generate`.
    '''
    raise NotImplementedError

  @property
  def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None:
    '''The default import kwargs to use when importing the model.

    This will be passed into 'openllm.LLM.import_model'.
    It returns two dictionaries: one for model kwargs and one for tokenizer kwargs.

    Returns:
      Optional tuple of model kwargs and tokenizer kwargs
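
    A sketch of overriding this in a subclass (the values are illustrative defaults, not requirements):

    ```python
    @property
    def import_kwargs(self):
      return {'device_map': 'auto'}, {'padding_side': 'left'}
    ```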
    '''

  # NOTE: All fields below are attributes that can be accessed by users.
  config_class: t.Type[LLMConfig]
  '''The config class to use for this LLM. If you are creating a custom LLM, you must specify this class.'''
  device: 'torch.device'
  '''The device to be used for this LLM. If the implementation is 'pt', then it will be torch.device, else string.'''
  tokenizer_id: t.Union[t.Literal['local'], LiteralString]
  '''Optional tokenizer_id for loading with vLLM if the model supports vLLM.'''

  # NOTE: The following will be populated by __init_subclass__, note that these should be immutable.
  __llm_backend__: LiteralBackend
  '''This is used to determine which framework implementation to use for this given LLM.

  Usually, this will be inferred from the class name, which follows HuggingFace's naming convention:

  - `OPTForConditionalGeneration` -> `pt`
  - `TFOPTForConditionalGeneration` -> `tf`
  - `FlaxOPTForConditionalGeneration` -> `flax`

  For all VLLM backend: VLLMLlama -> `vllm`
  For all GGML backend: GGMLLlama -> `ggml`
  For all MLC backend: MLCLlama -> `mlc`
  '''
  __llm_model__: t.Optional[M]
  '''A reference to the actual model. Instead of accessing this directly, you should use the `model` property instead.'''
  __llm_tokenizer__: t.Optional[T]
  '''A reference to the actual tokenizer. Instead of accessing this directly, you should use the `tokenizer` property instead.'''
  __llm_adapter_map__: t.Optional[ResolvedAdaptersMapping]
  '''A reference to the cached LoRA adapter mapping.'''

  __llm_supports_embeddings__: bool
  '''A boolean to determine whether the model implements ``LLM.embeddings``.'''
  __llm_supports_generate__: bool
  '''A boolean to determine whether the model implements ``LLM.generate``.'''
  __llm_supports_generate_one__: bool
  '''A boolean to determine whether the model implements ``LLM.generate_one``.'''
  __llm_supports_generate_iterator__: bool
  '''A boolean to determine whether the model implements ``LLM.generate_iterator``.'''

_DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer'

_AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class('AdaptersTuple', ['adapter_id', 'name', 'config'])

@attr.define(slots=True, repr=False, init=False)
class LLM(LLMInterface[M, T], ReprMixin):
  if t.TYPE_CHECKING: __name__: str
  if t.TYPE_CHECKING and not MYPY:

    def __attrs_init__(self,
                       config: LLMConfig,
                       quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
                       model_id: str,
                       model_decls: TupleAny,
                       model_attrs: DictStrAny,
                       tokenizer_attrs: DictStrAny,
                       tag: bentoml.Tag,
                       adapters_mapping: t.Optional[AdaptersMapping],
                       model_version: t.Optional[str],
                       quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
                       serialisation_format: t.Literal['safetensors', 'legacy'],
                       _local: bool,
                       **attrs: t.Any) -> None:
      '''Generated __attrs_init__ for openllm.LLM.'''

  config: LLMConfig
  '''The config instance to use for this LLM. This will be created based on config_class and available when initialising the LLM.'''
  quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
  '''Quantisation config for quantised model on the fly.'''

  _model_id: str
  _model_decls: TupleAny
  _model_attrs: DictStrAny
  _tokenizer_attrs: DictStrAny
  _tag: bentoml.Tag
  _adapters_mapping: AdaptersMapping | None
  _model_version: str
  _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None
  _serialisation_format: t.Literal['safetensors', 'legacy']
  _local: bool

  def __init_subclass__(cls: type[LLM[M, T]]) -> None:
    cd = cls.__dict__
    if cls.__name__.startswith('Flax'):
      cls.__llm_backend__, config_class = 'flax', openllm.AutoConfig.infer_class_from_name(cls.__name__[4:])
    elif cls.__name__.startswith('TF'):
      cls.__llm_backend__, config_class = 'tf', openllm.AutoConfig.infer_class_from_name(cls.__name__[2:])
    elif cls.__name__.startswith('VLLM'):
      cls.__llm_backend__, config_class = 'vllm', openllm.AutoConfig.infer_class_from_name(cls.__name__[4:])
    else:
      cls.__llm_backend__, config_class = 'pt', openllm.AutoConfig.infer_class_from_name(cls.__name__)
    if '__openllm_internal__' not in cd and 'config_class' not in cd:
      raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
    if '__openllm_internal__' in cd and 'config_class' not in cd: cls.config_class = config_class
    if 'tokenizer_id' not in cd and cls.__llm_backend__ == 'vllm': cls.tokenizer_id = _DEFAULT_TOKENIZER
    make_llm_attributes(cls)(cls)

  @overload
  def __getitem__(self, item: t.Literal['trust_remote_code']) -> bool:
    ...

  @overload
  def __getitem__(self, item: t.Literal['backend']) -> LiteralBackend:
    ...

  @overload
  def __getitem__(self, item: t.Literal['model']) -> M | None:
    ...

  @overload
  def __getitem__(self, item: t.Literal['tokenizer']) -> T | None:
    ...

  @overload
  def __getitem__(self, item: t.Literal['adapter_map']) -> ResolvedAdaptersMapping | None:
    ...

  @overload
  def __getitem__(self, item: t.Literal['supports_embeddings']) -> bool:
    ...

  @overload
  def __getitem__(self, item: t.Literal['supports_generate']) -> bool:
    ...

  @overload
  def __getitem__(self, item: t.Literal['supports_generate_one']) -> bool:
    ...

  @overload
  def __getitem__(self, item: t.Literal['supports_generate_iterator']) -> bool:
    ...

  def __getitem__(self, item: t.Union[LiteralString, t.Any]) -> t.Any:
    if item is None: raise TypeError(f"{self} doesn't understand how to index None.")
    item = inflection.underscore(item)
    internal_attributes = f'__llm_{item}__'
    if hasattr(self, internal_attributes): return getattr(self, internal_attributes)
    elif hasattr(self, item): return getattr(self, item)
    else: raise KeyError(item)

  @classmethod
  def from_pretrained(cls,
                      model_id: str | None = None,
                      model_version: str | None = None,
                      llm_config: LLMConfig | None = None,
                      *args: t.Any,
                      quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
                      adapter_id: str | None = None,
                      adapter_name: str | None = None,
                      adapter_map: dict[str, str | None] | None = None,
                      quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
                      serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
                      **attrs: t.Any) -> LLM[M, T]:
    '''Instantiate a pretrained LLM.

    ``LLM.from_pretrained`` follows the same design principle as HuggingFace's `from_pretrained` method, plus the following:

    ### Optimization options:

    > This is most notable during serving time.

    - quantize: quantize the model with the given quantization method. Currently int8 and int4 quantization are supported.

    > Currently, the above two options are mutually exclusive.

    #### Quantisation options

    For customising options for the quantisation config, ``openllm.LLM`` accepts all arbitrary arguments that are passed to ``transformers.BitsAndBytesConfig``
    plus the ``quantize`` value. For example, for ``int8`` quantisation, specify the following:

    ```python
    model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False)
    ```

    For all GPTQ-related options, it accepts all values prefixed with `gptq_*`. The parsed values will then be passed
    to ``auto_gptq.BaseQuantizeConfig``.

    ### Adapter options:

    > This is used in conjunction with the fine-tuning features.

    - adapter_id: Optional [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to apply to said model.
    - adapter_name: Optional name of the adapter to apply to said model. If not provided, it will be handled internally by OpenLLM.
    - adapter_map: optional dictionary of adapter_id to adapter_name. Note that this is mutually exclusive with the adapter_id/adapter_name arguments (see the sketch below).
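
    A sketch of loading a single LoRA adapter (the adapter id is illustrative):

    ```python
    llm = openllm.AutoLLM.from_pretrained("llama", adapter_id="tloen/alpaca-lora-7b", adapter_name="alpaca")
    ```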

    Args:
      model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
        > [!WARNING] If a custom path is passed, make sure it contains all files required to construct
        > ``transformers.PretrainedConfig``, ``transformers.PreTrainedModel``, and ``transformers.PreTrainedTokenizer``.
      model_name: Optional model name to be saved with this LLM. Defaults to None. It will be inferred automatically from model_id.
        If model_id is a custom path, it will be the basename of the given path.
      model_version: Optional version for this given model id. Defaults to None. This is useful for saving from a custom path.
        If set to None, the version will either be the git hash from the given pretrained model, or the hash inferred
        from the last modified time of the given directory.
      llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
        will use `config_class` to construct default configuration.
      quantize: The quantization to use for this LLM. Defaults to None. Possible values
        include int8, int4 and gptq.
      quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogptq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize`.
      serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save the model using safetensors.
        Default behaviour is similar to ``safe_serialization=False``.
      adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None.
      adapter_name: The adapter name to use for this LLM. Defaults to None.
      adapter_map: The adapter map to use for this LLM. Defaults to None. Note that this is mutually exclusive with the adapter_id/adapter_name arguments.
      *args: The args to be passed to the model.
      **attrs: The kwargs to be passed to the model.
    '''
    cfg_cls = cls.config_class
    _local = False
    _model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__)
    if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True
    quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None)

    # quantization setup
    if quantization_config and quantize:
      raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
    if quantization_config is None and quantize is not None:
      quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
    if quantize == 'gptq': serialisation = 'safetensors'
    elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy'  # Currently work-in-progress

    # NOTE: LoRA adapter setup
    if adapter_map and adapter_id:
      raise ValueError(
          "'adapter_map' and 'adapter_id' are mutually exclusive. Either provide a 'adapter_map' ({adapter_id: adapter_name | None, ...}) or use the combination of adapter_id/adapter_name arguments. "
      )
    if adapter_map is None and adapter_id is not None: adapter_map = {adapter_id: adapter_name}
    if adapter_map is not None and not is_peft_available():
      raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
    if adapter_map: logger.debug('OpenLLM will apply the following adapters layers: %s', list(adapter_map))

    if llm_config is None:
      llm_config = cls.config_class.model_construct_env(**attrs)
      # The rest of the kwargs that are not used by the config class should be stored in __openllm_extras__.
      attrs = llm_config['extras']

    try:
      _tag = cls.generate_tag(_model_id, model_version)
      if _tag.version is None:
        raise ValueError(f'Failed to resolve the correct model version for {cfg_cls.__openllm_start_name__}')
    except Exception as err:
      raise OpenLLMException(f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={_model_id}' (lookup to see its traceback):\n{err}") from err

    return cls(*args,
               model_id=_model_id,
               llm_config=llm_config,
               quantization_config=quantization_config,
               _quantize_method=quantize,
               _model_version=_tag.version,
               _tag=_tag,
               _serialisation_format=serialisation,
               _local=_local,
               _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
               **attrs)

  @classmethod
  @apply(str.lower)
  def _generate_tag_str(cls, model_id: str, model_version: str | None) -> str:
    '''Generate a compliant ``bentoml.Tag`` from model_id.

    If model_id is a pretrained_id from HF, then it will have the following format: <backend>-<normalise_model_id>:<revision>
    If model_id contains the revision itself, then the same format as above applies.
    If model_id is a path, then it will be <backend>-<basename_of_path>:<generated_sha1> if model_version is not passed, otherwise <backend>-<basename_of_path>:<model_version>

    > [!NOTE] The generated SHA1 for the path case is based on the last modified time of the given directory.

    Args:
      model_id: Model id for this given LLM. It can be a pretrained weights URL or a custom path.
      model_version: Specific revision for this model_id or custom version.

    Returns:
      ``str``: Generated tag format that can be parsed by ``bentoml.Tag``
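
    For illustration, assuming a PyTorch-backed subclass (class name and hashes below are placeholders):

    ```python
    SomeLLM._generate_tag_str('facebook/opt-1.3b', None)  # -> 'pt-facebook--opt-1.3b:<commit_hash>'
    SomeLLM._generate_tag_str('/path/to/weights', '1')    # -> 'pt-weights:1'
    ```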
    '''
    # specific branch for running in docker or kubernetes, this is very hacky,
    # and probably needs a better way to support custom paths
    if os.environ.get('BENTO_PATH') is not None: return ':'.join(fs.path.parts(model_id)[-2:])

    model_name = normalise_model_name(model_id)
    model_id, *maybe_revision = model_id.rsplit(':')
    if len(maybe_revision) > 0:
      if model_version is not None:
        logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.", maybe_revision[0], model_version)
      return f'{cls.__llm_backend__}-{model_name}:{maybe_revision[0]}'

    tag_name = f'{cls.__llm_backend__}-{model_name}'
    if openllm_core.utils.check_bool_env('OPENLLM_USE_LOCAL_LATEST', False):
      return str(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
    if validate_is_path(model_id):
      model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
    else:
      from .serialisation.transformers._helpers import process_config
      model_version = getattr(
          process_config(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default='main'))[0], '_commit_hash', None)
      if model_version is None:
        raise ValueError(f"Internal errors when parsing config for pretrained '{model_id}' ('commit_hash' not found)")
    return f'{tag_name}:{model_version}'

  @classmethod
  def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag:
    return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))

  def __init__(self,
               *args: t.Any,
               model_id: str,
               llm_config: LLMConfig,
               quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
               _adapters_mapping: AdaptersMapping | None,
               _tag: bentoml.Tag,
               _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None,
               _model_version: str,
               _serialisation_format: t.Literal['safetensors', 'legacy'],
               _local: bool,
               **attrs: t.Any,
               ):
    '''Initialize the LLM with a given pretrained model.

    > [!WARNING]
    > To initialize any LLM, you should use `openllm.AutoLLM` or `openllm.LLM.from_pretrained` instead.
    > `__init__` initialization is only for internal use.

    > [!NOTE]
    > - *args to be passed to the model.
    > - **attrs will first be parsed to the AutoConfig, then the rest will be parsed to the import_model
    > - for tokenizer kwargs, it should be prefixed with _tokenizer_*

    For a custom pretrained path, it is recommended to pass in 'model_version' alongside the path
    to ensure that it won't be loaded multiple times.
    Internally, if a pretrained model is given as a HuggingFace repository path, OpenLLM will use the commit_hash
    to generate the model version.

    For better consistency, we recommend users to also push the fine-tuned model to a HuggingFace repository.

    If you need to overwrite the default ``import_model``, implement the following in your subclass:

    ```python
    def import_model(
        self,
        *args: t.Any,
        trust_remote_code: bool,
        **attrs: t.Any,
    ):
        _, tokenizer_attrs = self.llm_parameters

        return bentoml.transformers.save_model(
            tag,
            transformers.AutoModelForCausalLM.from_pretrained(
                self.model_id, device_map="auto", torch_dtype=torch.bfloat16, **attrs
            ),
            custom_objects={
                "tokenizer": transformers.AutoTokenizer.from_pretrained(
                    self.model_id, padding_side="left", **tokenizer_attrs
                )
            },
        )
    ```

    If your import model doesn't require customization, you can simply pass in `import_kwargs`
    at the class level, which will then be passed into the default `import_model` implementation.
    See ``openllm.DollyV2`` for example.

    ```python
    dolly_v2_runner = openllm.Runner(
        "dolly-v2", _tokenizer_padding_side="left", torch_dtype=torch.bfloat16, device_map="cuda"
    )
    ```

    Note: If you implement your own `import_model`, then `import_kwargs` will be the
    base kwargs. You can still override those via ``openllm.Runner``.

    Note that the tag will be generated based on `self.default_id` or the `model_id`
    passed to this constructor.

    ``llm_post_init`` can also be implemented if you need to do any additional
    initialization after everything is set up.

    Note: If you need to implement a custom `load_model`, the following is an example from the Falcon implementation:

    ```python
    def load_model(self, tag: bentoml.Tag, *args: t.Any, **attrs: t.Any) -> t.Any:
        torch_dtype = attrs.pop("torch_dtype", torch.bfloat16)
        device_map = attrs.pop("device_map", "auto")

        _ref = bentoml.transformers.get(tag)

        model = bentoml.transformers.load_model(_ref, device_map=device_map, torch_dtype=torch_dtype, **attrs)
        return transformers.pipeline("text-generation", model=model, tokenizer=_ref.custom_objects["tokenizer"])
    ```

    Args:
      model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
      llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
        will use `config_class` to construct default configuration.
      quantization_config: ``transformers.BitsAndBytesConfig`` configuration, or 'gptq' denoting this model to be loaded with GPTQ.
      *args: The args to be passed to the model.
      **attrs: The kwargs to be passed to the model.
    '''
    # low_cpu_mem_usage is only available for model
    # this is helpful on systems with low memory to avoid OOM
    low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True)
    if self.__llm_backend__ == 'pt':
      attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage, 'quantization_config': quantization_config})
    model_kwds: DictStrAny = {}
    tokenizer_kwds: DictStrAny = {}
    if self.import_kwargs is not None: model_kwds, tokenizer_kwds = self.import_kwargs
    # set default tokenizer kwargs
    tokenizer_kwds.update({'padding_side': 'left', 'truncation_side': 'left'})

    # parsing tokenizer and model kwargs, as the hierarchy is param pass > default
    normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
    # NOTE: Save the args and kwargs for later loading
    self.__attrs_init__(llm_config,
                        quantization_config,
                        model_id,
                        args, {
                            **model_kwds, **normalized_model_kwds
                        }, {
                            **tokenizer_kwds, **normalized_tokenizer_kwds
                        },
                        _tag,
                        _adapters_mapping,
                        _model_version,
                        _quantize_method,
                        _serialisation_format,
                        _local)

    self.llm_post_init()

  def __setattr__(self, attr: str, value: t.Any) -> None:
    if attr in _reserved_namespace:
      raise ForbiddenAttributeError(
          f'{attr} should not be set during runtime as these values will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.')
    super().__setattr__(attr, value)

  @property
  def trust_remote_code(self) -> bool:
    return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE'), default=self.config['trust_remote_code'])

  @property
  def adapters_mapping(self) -> AdaptersMapping | None:
    return self._adapters_mapping

  @adapters_mapping.setter
  def adapters_mapping(self, value: AdaptersMapping) -> None:
    self._adapters_mapping = value

  @property
  def __repr_keys__(self) -> set[str]:
    return {'model_id', 'runner_name', 'config', 'adapters_mapping', 'tag'}

  def __repr_args__(self) -> ReprArgs:
    for k in self.__repr_keys__:
      if k == 'config': yield k, self.config.model_dump(flatten=True)
      else: yield k, getattr(self, k)
    yield 'backend', self.__llm_backend__

  @property
  def model_id(self) -> str:
    return self._model_id

  @property
  def runner_name(self) -> str:
    return f"llm-{self.config['start_name']}-runner"

  # NOTE: The section below defines a loose contract with langchain's LLM interface.
  @property
  def llm_type(self) -> str:
    return normalise_model_name(self._model_id)

  @property
  def identifying_params(self) -> DictStrAny:
    return {'configuration': self.config.model_dump_json().decode(), 'model_ids': orjson.dumps(self.config['model_ids']).decode()}

  @property
  def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]:
    return (self._model_decls, self._model_attrs), self._tokenizer_attrs

  @property
  def tag(self) -> bentoml.Tag:
    return self._tag

  def ensure_model_id_exists(self) -> bentoml.Model:
    return openllm.import_model(self.config['start_name'],
                                model_id=self.model_id,
                                model_version=self._model_version,
                                backend=self.__llm_backend__,
                                quantize=self._quantize_method,
                                serialisation_format=self._serialisation_format)

  @property
  def _bentomodel(self) -> bentoml.Model:
    return openllm.serialisation.get(self, auto_import=True)

  def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
    '''This handler will sanitize all attrs and setup prompt text.

    It takes a prompt that is given by the user, and attrs that can be parsed with the prompt.

    Returns a tuple of three items:
    - The processed prompt text.
    - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig.
    - The attributes dictionary that will be passed into `self.postprocess_generate`.
    '''
    return self.config.sanitize_parameters(prompt, **attrs)

  def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
    '''This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decides to do so).

    You can customize how the output of the LLM looks with this hook. By default, it is a simple echo.

    > [!NOTE]
    > This will be used from the client side.
    '''
    if isinstance(generation_result, dict) and 'text' in generation_result: return generation_result['text']
    return self.config.postprocess_generate(prompt, generation_result, **attrs)

  @property
  def model(self) -> M:
    # Run check for GPU
    if self.config['requires_gpu'] and device_count() < 1:
      raise GpuNotAvailableError(f'{self} only supports running with GPU (None available).') from None
    # NOTE: the signature of load_model here is the wrapper under _wrapped_load_model
    if self.__llm_model__ is None:
      model = self.load_model(*self._model_decls, **self._model_attrs)
      # If OOM, then it probably means you don't have enough VRAM to run this model.
      if self.__llm_backend__ == 'pt' and is_torch_available():
        loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
        if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
          try:
            model = model.to('cuda')
          except Exception as err:
            raise OpenLLMException(
                f'Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.'
            ) from err
      self.__llm_model__ = model
    return self.__llm_model__

  @property
  def tokenizer(self) -> T:
    # NOTE: the signature of load_tokenizer here is the wrapper under _wrapped_load_tokenizer
    if self.__llm_tokenizer__ is None: self.__llm_tokenizer__ = self.load_tokenizer(**self._tokenizer_attrs)
    return self.__llm_tokenizer__

  def _transpose_adapter_mapping(self, inference_mode: bool = True, use_cache: bool = True) -> ResolvedAdaptersMapping:
    if self._adapters_mapping is None: raise ValueError('LoRA mapping is not set up correctly.')
    # early out if we already serialized everything.
    if use_cache and self.__llm_adapter_map__ is not None: return self.__llm_adapter_map__
    if not use_cache:
      logger.debug('Adapter mapping resolution will not be cached. This should only be used during training.')
    adapter_map: ResolvedAdaptersMapping = {k: {} for k in self._adapters_mapping}
    # Accept the first adapter without a name as 'default', then raise an error
    # if another adapter in a later iteration also has its name set to None.
    _converted_first_none = False
    for _adapter_type, _adapters_tuples in self._adapters_mapping.items():
      strategy = first_not_none(self.config['fine_tune_strategies'].get(_adapter_type),
                                default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type), llm_config_class=self.config_class))
      default_config = strategy.eval() if inference_mode else strategy.train()
      for adapter in _adapters_tuples:
        if not adapter.name and _converted_first_none:
          raise ValueError(f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}")
        name = adapter.name
        if name is None:
          _converted_first_none = True
          name = 'default'
        peft_config = default_config.with_config(**adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(
            adapter_type=t.cast('PeftType', _adapter_type), adapter_config=adapter.config, inference_mode=inference_mode, llm_config_class=self.config_class).to_peft_config()
        adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id)
    if self.__llm_adapter_map__ is None and use_cache: self.__llm_adapter_map__ = adapter_map
    return adapter_map
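
  # A sketch of the fine-tuning entrypoint below (argument values are illustrative):
  #   model, tokenizer = llm.prepare_for_training(adapter_type='lora', r=8, lora_alpha=16)
  # It returns a peft-wrapped model plus the tokenizer, ready to be passed to a HF Trainer.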

  def prepare_for_training(self, adapter_type: AdapterType = 'lora', use_gradient_checkpointing: bool = True, **attrs: t.Any) -> tuple[peft.PeftModel, T]:
    from peft import prepare_model_for_kbit_training
    peft_config = self.config['fine_tune_strategies'].get(adapter_type, FineTuneConfig(adapter_type=t.cast('PeftType', adapter_type),
                                                                                       llm_config_class=self.config_class)).train().with_config(**attrs).to_peft_config()
    wrapped_peft = peft.get_peft_model(prepare_model_for_kbit_training(  # type: ignore[no-untyped-call]
        self.model, use_gradient_checkpointing=use_gradient_checkpointing), peft_config)
    if DEBUG: wrapped_peft.print_trainable_parameters()
    return wrapped_peft, self.tokenizer

  def apply_adapter(self, inference_mode: bool = True, adapter_type: AdapterType = 'lora', load_adapters: t.Literal['all'] | list[str] | None = None, use_cache: bool = True) -> M:
    '''Apply given LoRA mapping to the model. Note that the base model can still be accessed via self.model.get_base_model().'''
    if self.__llm_model__ is None: raise ValueError('Error: Model is not loaded correctly')
    # early out if _adapters_mapping is empty or it is already wrapped with peft.
    if not self._adapters_mapping: return self.__llm_model__
    if isinstance(self.__llm_model__, peft.PeftModel): return self.__llm_model__

    _mapping = self._transpose_adapter_mapping(inference_mode=inference_mode, use_cache=use_cache)
    if adapter_type not in _mapping:
      raise ValueError(f'Given adapter type {adapter_type} is not supported. Please choose from {list(_mapping.keys())}')
    adapter_mapping = _mapping[adapter_type]

    self.__llm_model__ = self._wrap_default_peft_model(adapter_mapping, inference_mode=inference_mode)
    # now we loop through the rest with add_adapter
    if len(adapter_mapping) > 0:
      for adapter_name, (_peft_config, _) in adapter_mapping.items():
        t.cast(peft.PeftModel, self.__llm_model__).add_adapter(adapter_name, _peft_config)

    # optionally load adapters. In case of multiple adapters, or on Runner,
    # we will need to set load_adapters='all'
    if load_adapters is not None:
      adapters_to_load = adapter_mapping.keys() if load_adapters == 'all' else load_adapters
      for adapter_name in adapters_to_load:
        _peft_config, _peft_model_id = adapter_mapping[adapter_name]
        t.cast(peft.PeftModel, self.__llm_model__).load_adapter(_peft_model_id, adapter_name=adapter_name, is_trainable=not inference_mode, **dict(_peft_config.to_dict()))

    return self.__llm_model__

  def _wrap_default_peft_model(self, adapter_mapping: dict[str, tuple[peft.PeftConfig, str]], inference_mode: bool) -> M:
    if self.__llm_model__ is None: raise ValueError('Error: Model is not loaded correctly')
    if isinstance(self.__llm_model__, peft.PeftModel): return self.__llm_model__
    if not isinstance(self.__llm_model__, transformers.PreTrainedModel):
      raise ValueError('Loading LoRA layers currently only runs on PyTorch models.')

    if 'default' not in adapter_mapping:
      raise ValueError("There is no 'default' mapping. Please check the adapter mapping and report this bug to the OpenLLM team.")
    default_config, peft_model_id = adapter_mapping.pop('default')

    # the below shares similar logic with `get_peft_model`
    # TODO: Support PromptLearningConfig
    if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(default_config, peft.PromptLearningConfig):
      logger.debug("Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.", default_config.task_type)
      model = peft.PeftModel(self.__llm_model__, default_config)
    else:
      # XXX: this is not ideal to serialize like this, maybe for fine-tune we will only support 0.4.0
      # onwards. For now, keep this logic here.
      peft_class = peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING[default_config.task_type]
      if default_config.base_model_name_or_path:
        kwargs: DictStrAny = {'is_trainable': not inference_mode}
        if 'config' in inspect.signature(peft_class.from_pretrained).parameters: kwargs['config'] = default_config
        else: kwargs.update(dict(default_config.to_dict().items()))
        # BUG: This hits during inference, need fixing
        model = peft_class.from_pretrained(self.__llm_model__, peft_model_id, **kwargs)
      else:
        # in this case, the given base_model_name_or_path is None. This will be hit during training
        model = peft_class(self.__llm_model__, default_config)
    return model

  # order of these fields matters here, make sure to sync it with
  # openllm.models.auto.factory.BaseAutoLLMClass.for_model
  def to_runner(self,
                models: list[bentoml.Model] | None = None,
                max_batch_size: int | None = None,
                max_latency_ms: int | None = None,
                scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]:
    '''Convert this LLM into a Runner.

    Args:
      models: Any additional ``bentoml.Model`` to be included with this Runner.
        By default, this will be determined from the model_name.
      max_batch_size: The maximum batch size for the runner.
      max_latency_ms: The maximum latency for the runner.
      strategy: The strategy to use for this runner.
      embedded: Whether to run this runner in embedded mode.
      scheduling_strategy: A custom scheduling strategy for this Runner.

    Returns:
      A generated LLMRunner for this LLM.

    > [!NOTE]: There are some differences between bentoml.models.get().to_runner() and LLM.to_runner():
    >
    > - 'name': will be generated by OpenLLM, hence users shouldn't worry about this. The generated name will be 'llm-<model-start-name>-runner' (e.g. llm-dolly-v2-runner, llm-chatglm-runner)
    > - 'embedded': Will be disabled by default. There is no reason to run LLM in embedded mode.
    > - 'method_configs': The method configs for the runner will be managed internally by OpenLLM.
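
    A minimal sketch of wiring this into a BentoML service:

    ```python
    llm = openllm.AutoLLM.for_model("dolly-v2")
    svc = bentoml.Service(name="llm-service", runners=[llm.to_runner()])
    ```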
    '''
    models = models if models is not None else []

    try:
      models.append(self._bentomodel)
    except bentoml.exceptions.NotFound as err:
      raise RuntimeError(f'Failed to locate {self._bentomodel}:{err}') from None

    generate_sig = ModelSignature.from_dict(ModelSignatureDict(batchable=False))
    embeddings_sig = ModelSignature.from_dict(ModelSignatureDict(batchable=True, batch_dim=0))
    generate_iterator_sig = ModelSignature.from_dict(ModelSignatureDict(batchable=False))

    # NOTE: returning the two langchain API's to the runner
    return llm_runner_class(self)(llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig),
                                  name=self.runner_name,
                                  embedded=False,
                                  models=models,
                                  max_batch_size=max_batch_size,
                                  max_latency_ms=max_latency_ms,
                                  method_configs=bentoml_cattr.unstructure({
                                      'embeddings': embeddings_sig,
                                      '__call__': generate_sig,
                                      'generate': generate_sig,
                                      'generate_one': generate_sig,
                                      'generate_iterator': generate_iterator_sig
                                  }),
                                  scheduling_strategy=scheduling_strategy)

  # NOTE: Scikit API
  def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
    return self(prompt, **attrs)

  def __call__(self, prompt: str, **attrs: t.Any) -> t.Any:
    '''Returns the generation result and formats the result.

    First, it runs `self.sanitize_parameters` to sanitize the parameters.
    Then the sanitized prompt and kwargs will be passed into self.generate.
    Finally, run self.postprocess_generate to postprocess the generated result.

    This allows users to do the following:

    ```python
    llm = openllm.AutoLLM.for_model("dolly-v2")
    llm("What is the meaning of life?")
    ```
    '''
    prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **attrs)
    return self.postprocess_generate(prompt, self.generate(prompt, **generate_kwargs), **postprocess_kwargs)

  def generate(self, prompt: str, **attrs: t.Any) -> t.List[t.Any]:
    # TODO: support different generation strategies, similar to self.model.generate
    for it in self.generate_iterator(prompt, **attrs):
      pass
    return [it]

  def generate_iterator(self,
                        prompt: str,
                        /,
                        *,
                        context_length: int | None = None,
                        echo: bool = False,
                        stream_interval: int = 2,
                        stop: str | t.Iterable[str] | None = None,
                        stop_token_ids: list[int] | None = None,
                        **attrs: t.Any) -> t.Iterator[t.Any]:
    # NOTE: encoder-decoder models will need to implement their own generate_iterator for now
    from ._generation import get_context_length
    from ._generation import is_partial_stop
    from ._generation import prepare_logits_processor

    len_prompt = len(prompt)
    if stop_token_ids is None: stop_token_ids = []
    stop_token_ids.append(self.tokenizer.eos_token_id)

    logits_processor = prepare_logits_processor(self.config)

    input_ids = self.tokenizer(prompt).input_ids

    if context_length is None: context_length = get_context_length(self.model.config)
    max_src_len = context_length - self.config['max_new_tokens'] - 1

    input_ids = input_ids[-max_src_len:]
    output_ids = list(input_ids)
    input_echo_len = len(input_ids)

    past_key_values = out = token = None
    for i in range(self.config['max_new_tokens']):
      torch.cuda.synchronize()
      if i == 0:  # prefill
        out = self.model(torch.as_tensor([input_ids]), use_cache=True)
        logits = out.logits
        past_key_values = out.past_key_values
      else:  # decoding
        out = self.model(input_ids=torch.as_tensor([[token]]), use_cache=True, past_key_values=past_key_values)
        logits = out.logits
        past_key_values = out.past_key_values

      if logits_processor:
        if self.config['repetition_penalty'] > 1.0:
          tmp_output_ids: t.Any = torch.as_tensor([output_ids], device=logits.device)
        else:
          tmp_output_ids = None
        last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
      else:
        last_token_logits = logits[0, -1, :]

      # Switch to CPU to avoid some bugs in the mps backend.
      if self.device.type == 'mps': last_token_logits = last_token_logits.float().to('cpu')

      if self.config['temperature'] < 1e-5 or self.config['top_p'] < 1e-8:
        token = int(torch.argmax(last_token_logits))  # greedy
      else:
        probs = torch.softmax(last_token_logits, dim=-1)
        token = int(torch.multinomial(probs, num_samples=1))
      output_ids.append(token)
      torch.cuda.synchronize()

      if token in stop_token_ids: stopped = True
      else: stopped = False

      # Yield the output tokens
      if i % stream_interval == 0 or i == self.config['max_new_tokens'] - 1 or stopped:
        if echo:
          tmp_output_ids = output_ids
          rfind_start = len_prompt
        else:
          tmp_output_ids = output_ids[input_echo_len:]
          rfind_start = 0
        output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)

        partially_stopped = False
        if stop:
          if isinstance(stop, str):
            pos = output.rfind(stop, rfind_start)
            if pos != -1: output, stopped = output[:pos], True
            else: partially_stopped = is_partial_stop(output, stop)
          elif isinstance(stop, t.Iterable):
            for each_stop in stop:
              pos = output.rfind(each_stop, rfind_start)
              if pos != -1:
                output, stopped = output[:pos], True
                break
              else:
                partially_stopped = is_partial_stop(output, each_stop)
                if partially_stopped: break
          else: raise ValueError('Invalid stop field type.')

        # Prevent yielding partial stop sequence
        if not partially_stopped:
          yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': None}
      if stopped: break

    # Finish stream event, which contains finish reason
    if i == self.config['max_new_tokens'] - 1: finish_reason = 'length'
    elif stopped: finish_reason = 'stop'
    else: finish_reason = None
    yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': finish_reason}
    # Clean
    del past_key_values, out
    gc.collect()
    torch.cuda.empty_cache()

@overload
def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
  ...

@overload
def Runner(model_name: str,
           *,
           model_id: str = ...,
           model_version: str | None = ...,
           models: list[bentoml.Model] | None = ...,
           max_batch_size: int | None = ...,
           max_latency_ms: int | None = ...,
           method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ...,
           scheduling_strategy: type[bentoml.Strategy] | None = ...,
           **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
  ...

@overload
def Runner(model_name: str,
           *,
           ensure_available: bool = ...,
           init_local: bool = ...,
           backend: LiteralBackend | None = None,
           llm_config: LLMConfig | None = None,
           **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
  ...

@overload
def Runner(model_name: str,
           *,
           model_id: str | None = ...,
           model_version: str | None = ...,
           llm_config: LLMConfig | None = ...,
           quantize: t.Literal['int8', 'int4', 'gptq'] | None = ...,
           adapter_id: str | None = ...,
           adapter_name: str | None = ...,
           adapter_map: dict[str, str | None] | None = ...,
           quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
           serialisation: t.Literal['safetensors', 'legacy'] = ...,
           **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
  ...

def Runner(model_name: str,
           ensure_available: bool = False,
           init_local: bool = False,
           backend: LiteralBackend | None = None,
           llm_config: LLMConfig | None = None,
           **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
  '''Create a Runner for a given LLM. For a list of currently supported LLMs, check out 'openllm models'.

  The behaviour of `ensure_available` (which mirrors `AutoLLM.for_model`) depends on `init_local`.
  By default, `ensure_available` follows `init_local`, meaning that when the runner is created inside a service,
  it won't download the model. So before running your BentoML Service, you should create an `on_startup`
  hook to download the model if you don't want to do it manually:

  ```python
  runner = openllm.Runner("dolly-v2")

  @svc.on_startup
  def download():
    runner.download_model()
  ```

  If `init_local=True` (for development workflows), it will also enable `ensure_available`.
  If `ensure_available` is set explicitly, that given value is used; otherwise it falls back to the aforementioned behaviour.
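
  For development, a sketch of loading everything in the current process instead:

  ```python
  runner = openllm.Runner("dolly-v2", init_local=True)
  runner("What is the meaning of life?")
  ```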

  Args:
    model_name: Supported model name from 'openllm models'
    ensure_available: If True, it will download the model if it is not available. If False, it will skip downloading the model.
      If False, make sure the model is available locally.
    backend: The backend implementation to choose for this Runner. If `OPENLLM_BACKEND` is set, it will respect it.
    llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``.
    init_local: If True, it will initialize the model locally. This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local())
    **attrs: The rest of the kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour.
  '''
  if llm_config is not None:
    attrs.update({
        'model_id': llm_config['env']['model_id_value'],
        'quantize': llm_config['env']['quantize_value'],
        'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
    })

  backend = t.cast(LiteralBackend, first_not_none(backend, default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
  if init_local: ensure_available = True
  runner = infer_auto_class(backend).create_runner(model_name, llm_config=llm_config, ensure_available=ensure_available, **attrs)
  if init_local: runner.init_local(quiet=True)
  return runner

def method_signature(sig: ModelSignature) -> ModelSignatureDict:
  return bentoml_cattr.unstructure(sig)

class SetAdapterOutput(t.TypedDict):
  success: bool
  message: str

def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature, generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]:
  class _Runnable(bentoml.Runnable):
    SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
    SUPPORTS_CPU_MULTI_THREADING = True
    backend = self.__llm_backend__

    def __init__(__self: _Runnable):
      # NOTE: The side effect of this line is that it will load the
      # imported model during runner startup. So don't remove it!!
      if not self.model: raise RuntimeError('Failed to load the model correctly (See traceback above)')
      if self.adapters_mapping is not None:
        logger.info('Applying LoRA to %s...', self.runner_name)
        self.apply_adapter(inference_mode=True, load_adapters='all')

    def set_adapter(__self: _Runnable, adapter_name: str) -> None:
      if self.__llm_adapter_map__ is None: raise ValueError('No adapters available for current running server.')
      elif not isinstance(self.model, peft.PeftModel): raise RuntimeError('Model is not a PeftModel')
      if adapter_name != 'default': self.model.set_adapter(adapter_name)
      logger.info('Successfully applied LoRA layer %s', adapter_name)

    @bentoml.Runnable.method(**method_signature(embeddings_sig))  # type: ignore
    def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[EmbeddingsOutput]:
      return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]

    @bentoml.Runnable.method(**method_signature(generate_sig))  # type: ignore
    def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
      adapter_name = attrs.pop('adapter_name', None)
      if adapter_name is not None: __self.set_adapter(adapter_name)
      return self.generate(prompt, **attrs)

    @bentoml.Runnable.method(**method_signature(generate_sig))  # type: ignore
    def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
      adapter_name = attrs.pop('adapter_name', None)
      if adapter_name is not None: __self.set_adapter(adapter_name)
      if __self.backend == 'vllm': attrs.setdefault('request_id', openllm_core.utils.gen_random_uuid())
      return self.generate(prompt, **attrs)

    @bentoml.Runnable.method(**method_signature(generate_sig))  # type: ignore
    def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
      adapter_name = attrs.pop('adapter_name', None)
      if adapter_name is not None: __self.set_adapter(adapter_name)
      return self.generate_one(prompt, stop, **attrs)

    @bentoml.Runnable.method(**method_signature(generate_iterator_sig))  # type: ignore
    def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]:
      adapter_name = attrs.pop('adapter_name', None)
      if adapter_name is not None: __self.set_adapter(adapter_name)
      pre = 0
      for outputs in self.generate_iterator(prompt, request_id=openllm_core.utils.gen_random_uuid(), **attrs):
        output_text = outputs['text'][0] if __self.backend == 'vllm' else outputs['text']
        output_text = output_text.strip().split(' ')
        now = len(output_text) - 1
        if now > pre:
          yield ' '.join(output_text[pre:now]) + ' '
          pre = now
      yield ' '.join(output_text[pre:]) + ' '
      return ' '.join(output_text) + ' '

  return types.new_class(
      self.__class__.__name__ + 'Runnable', (_Runnable,), {},
      lambda ns: ns.update({
          'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
          '__module__': self.__module__,
          '__doc__': self.config['env'].start_docstring
      }))

def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
  def available_adapters(_: LLMRunner[M, T]) -> PeftAdapterOutput:
    if not is_peft_available():
      return PeftAdapterOutput(success=False, result={}, error_msg="peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'")
    if self.__llm_adapter_map__ is None:
      return PeftAdapterOutput(success=False, result={}, error_msg='No adapters available for current running server.')
    if not isinstance(self.model, peft.PeftModel):
      return PeftAdapterOutput(success=False, result={}, error_msg='Model is not a PeftModel')
    return PeftAdapterOutput(success=True, result=self.model.peft_config, error_msg='')

  def _wrapped_generate_run(__self: LLMRunner[M, T], prompt: str, **kwargs: t.Any) -> t.Any:
    '''Wrapper for runner.generate.run() to handle the prompt and postprocessing.

    This will be used for the LangChain API.

    Usage:

    ```python
    runner = openllm.Runner("dolly-v2", init_local=True)
    runner("What is the meaning of life?")
    ```
    '''
    prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **kwargs)
    return self.postprocess_generate(prompt, __self.generate.run(prompt, **generate_kwargs), **postprocess_kwargs)

  def _wrapped_embeddings_run(__self: LLMRunner[M, T], prompt: str | list[str]) -> EmbeddingsOutput:
    '''``llm.embed`` is a light wrapper around runner.embeddings.run().

    Usage:

    ```python
    runner = openllm.Runner('llama', backend='pt')
    runner.embed("What is the meaning of life?")
    ```
    '''
    return __self.embeddings.run([prompt] if isinstance(prompt, str) else prompt)

  def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]:
    return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}

  def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs:
    yield 'runner_methods', {method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None} for method in __self.runner_methods}
    yield 'config', self.config.model_dump(flatten=True)
    yield 'llm_type', __self.llm_type
    yield 'backend', self.__llm_backend__
    yield 'llm_tag', self.tag

  return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,),
                         exec_body=lambda ns: ns.update({
                             'llm_type': self.llm_type,
                             'identifying_params': self.identifying_params,
                             'llm_tag': self.tag,
                             'llm': self,
                             'config': self.config,
                             'backend': self.__llm_backend__,
                             'peft_adapters': property(fget=available_adapters),
                             'download_model': self.ensure_model_id_exists,
                             '__call__': _wrapped_generate_run,
                             'embed': _wrapped_embeddings_run,
                             '__module__': self.__module__,
                             '__doc__': self.config['env'].start_docstring,
                             '__repr__': ReprMixin.__repr__,
                             '__repr_keys__': property(_wrapped_repr_keys),
                             '__repr_args__': _wrapped_repr_args,
                             'supports_embeddings': self['supports_embeddings'],
                             'supports_hf_agent': self['supports_generate_one'],
                             'has_adapters': self._adapters_mapping is not None
                         }))

__all__ = ['LLMRunner', 'LLMRunnable', 'Runner', 'LLM', 'llm_runner_class', 'llm_runnable_class', 'EmbeddingsOutput']