Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-03-02 21:56:10 -05:00)
fix(gptq): use upstream integration (#297)
* wip
* feat: GPTQ transformers integration
* fix: only load if variable is available and add changelog
* chore: remove boilerplate check

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
@@ -39,10 +39,11 @@ classifiers = [
 ]
 dependencies = [
   "bentoml[io]>=1.1.2",
-  "transformers[torch,tokenizers,accelerate]>=4.29.0",
+  "transformers[torch,tokenizers,accelerate]>=4.32.1",
   "openllm-client",
   "safetensors",
-  "optimum",
+  "optimum>=1.12.0",
+  "accelerate",
   "ghapi",
   "tabulate[widechars]>=0.9.0",
   "click>=8.1.3",
@@ -99,13 +100,13 @@ all = ["openllm[full]"]
 baichuan = ["cpm-kernels", "sentencepiece"]
 chatglm = ["cpm-kernels", "sentencepiece"]
 falcon = ["einops", "xformers"]
-fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
+fine-tune = ["peft>=0.5.0", "bitsandbytes", "datasets", "accelerate", "trl"]
 flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
 full = [
-  "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]",
+  "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]",
 ]
 ggml = ["ctransformers"]
-gptq = ["auto-gptq[triton]"]
+gptq = ["auto-gptq[triton]>=0.4.2", "optimum>=1.12.0"]
 grpc = ["openllm-client[grpc]"]
 llama = ["fairscale", "sentencepiece"]
 mpt = ["triton", "einops"]
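The `gptq` extra now pins `auto-gptq>=0.4.2` alongside `optimum>=1.12.0`, matching the first transformers release (4.32.1) whose native GPTQ loading path works against those versions. A hedged sketch of an environment probe in the spirit of the `is_autogptq_available()`/`is_optimum_supports_gptq()` helpers used later in this diff (the helper name and exact version cut-offs below are illustrative, not the library's actual implementation):

```python
# Illustrative only: check that the environment satisfies the new GPTQ pins.
import importlib.metadata

def _supports_upstream_gptq() -> bool:
  try:
    import transformers  # needs >=4.32.1 for transformers.GPTQConfig
    optimum_version = tuple(int(p) for p in importlib.metadata.version('optimum').split('.')[:2])
    return hasattr(transformers, 'GPTQConfig') and optimum_version >= (1, 12)
  except (ImportError, importlib.metadata.PackageNotFoundError):
    return False
```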
@@ -150,7 +151,7 @@ dependencies = [
   # avoid https://github.com/pallets/click/issues/2558
   "click==8.1.3",
   "bentoml==1.1.2",
-  "transformers>=4.31.0",
+  "transformers>=4.32.1",
   "pandas-stubs",
   "types-psutil",
   "types-tabulate",

@@ -28,6 +28,7 @@ from openllm_core._typing_compat import AdaptersTuple
 from openllm_core._typing_compat import AdapterType
 from openllm_core._typing_compat import DictStrAny
 from openllm_core._typing_compat import LiteralBackend
+from openllm_core._typing_compat import LiteralQuantise
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import LLMRunnable
 from openllm_core._typing_compat import LLMRunner
@@ -63,7 +64,6 @@ from .utils import infer_auto_class

 if t.TYPE_CHECKING:
-  import auto_gptq as autogptq
   import peft
   import torch
   import transformers
@@ -71,7 +71,6 @@ if t.TYPE_CHECKING:
   from openllm_core._configuration import PeftType
   from openllm_core.utils.representation import ReprArgs
 else:
-  autogptq = LazyLoader('autogptq', globals(), 'auto_gptq')
   transformers = LazyLoader('transformers', globals(), 'transformers')
   torch = LazyLoader('torch', globals(), 'torch')
   peft = LazyLoader('peft', globals(), 'peft')
@@ -80,6 +79,8 @@ ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConf

 logger = logging.getLogger(__name__)

+_object_setattr = object.__setattr__
+
 def normalise_model_name(name: str) -> str:
   if validate_is_path(name): return os.path.basename(resolve_filepath(name))
   name = name.replace('/', '--')
@@ -280,7 +281,8 @@ class LLM(LLMInterface[M, T], ReprMixin):

   def __attrs_init__(self,
                      config: LLMConfig,
-                     quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
+                     quantize: t.Optional[LiteralQuantise],
+                     quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig]],
                      model_id: str,
                      model_decls: TupleAny,
                      model_attrs: DictStrAny,
@@ -288,17 +290,16 @@ class LLM(LLMInterface[M, T], ReprMixin):
                      tag: bentoml.Tag,
                      adapters_mapping: t.Optional[AdaptersMapping],
                      model_version: t.Optional[str],
-                     quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
-                     serialisation_format: t.Literal['safetensors', 'legacy'],
+                     serialisation: t.Literal['safetensors', 'legacy'],
                      _local: bool,
                      **attrs: t.Any) -> None:
     '''Generated __attrs_init__ for openllm.LLM.'''

   config: LLMConfig
   '''The config instance to use for this LLM. This will be created based on config_class and available when initialising the LLM.'''
-  quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
+  quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None
   '''Quantisation config for quantised model on the fly.'''

+  _quantize: LiteralQuantise | None
   _model_id: str
   _model_decls: TupleAny
   _model_attrs: DictStrAny
@@ -306,8 +307,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
   _tag: bentoml.Tag
   _adapters_mapping: AdaptersMapping | None
   _model_version: str
-  _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None
-  _serialisation_format: t.Literal['safetensors', 'legacy']
+  _serialisation: t.Literal['safetensors', 'legacy']
   _local: bool

   def __init_subclass__(cls: type[LLM[M, T]]) -> None:
@@ -376,11 +376,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
                       model_version: str | None = None,
                       llm_config: LLMConfig | None = None,
                       *args: t.Any,
-                      quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+                      quantize: LiteralQuantise | None = None,
                       adapter_id: str | None = None,
                       adapter_name: str | None = None,
                       adapter_map: dict[str, str | None] | None = None,
-                      quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
+                      quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None,
                       serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
                       **attrs: t.Any) -> LLM[M, T]:
     '''Instantiate a pretrained LLM.
@@ -403,9 +403,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
     model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False)
     ```

-    For all GPTQ-related options, it accepts all value prefixed with `gptq_*`. The parsed value then could be parsed
-    to ``auto_gptq.BaseQuantizeConfig``.
-
     ### Adapter options:

     > This is used in conjunction with the fine-tuning features
@@ -427,7 +424,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
         will use `config_class` to construct default configuration.
       quantize: The quantization to use for this LLM. Defaults to None. Possible values
                 include int8, int4 and gptq.
-      quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogtpq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize`
+      quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `transformers.GPTQConfig`) to use. Note that this is mutually exclusive with `quantize`
       serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
                      Default behaviour is similar to ``safe_serialization=False``.
       adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None.
@@ -440,13 +437,15 @@ class LLM(LLMInterface[M, T], ReprMixin):
     _local = False
     _model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__)
     if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True
-    quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None)
+    quantize = first_not_none(quantize, t.cast(t.Optional[LiteralQuantise], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None)

     # quantization setup
     if quantization_config and quantize:
       raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
     if quantization_config is None and quantize is not None:
-      quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
+      # in case users input `tokenizer` to __init__, default to the _model_id
+      _gptq_tokenizer = attrs.pop('tokenizer', _model_id)
+      quantization_config, attrs = infer_quantisation_config(cls, quantize, tokenizer=_gptq_tokenizer, **attrs)
     if quantize == 'gptq': serialisation = 'safetensors'
     elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy' # Currently working-in-progress

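The GPTQ calibration tokenizer now rides along automatically: if the caller does not pass `tokenizer=` explicitly, `from_pretrained` forwards the resolved model id itself into `infer_quantisation_config`. A hedged usage sketch (model ids illustrative):

```python
import openllm

# tokenizer defaults to the resolved model_id
llm = openllm.AutoLLM.from_pretrained('opt', quantize='gptq')
# or point the calibration tokenizer somewhere else explicitly
llm = openllm.AutoLLM.from_pretrained('opt', model_id='facebook/opt-2.7b', quantize='gptq', tokenizer='facebook/opt-2.7b')
```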
@@ -476,10 +475,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
                model_id=_model_id,
                llm_config=llm_config,
                quantization_config=quantization_config,
-               _quantize_method=quantize,
+               _quantize=quantize,
                _model_version=_tag.version,
                _tag=_tag,
-               _serialisation_format=serialisation,
+               _serialisation=serialisation,
                _local=_local,
                _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
                **attrs)
@@ -534,12 +533,12 @@ class LLM(LLMInterface[M, T], ReprMixin):
                *args: t.Any,
                model_id: str,
                llm_config: LLMConfig,
-               quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
+               quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None,
                _adapters_mapping: AdaptersMapping | None,
                _tag: bentoml.Tag,
-               _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None,
+               _quantize: LiteralQuantise | None,
                _model_version: str,
-               _serialisation_format: t.Literal['safetensors', 'legacy'],
+               _serialisation: t.Literal['safetensors', 'legacy'],
                _local: bool,
                **attrs: t.Any,
   ):
@@ -641,6 +640,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
     # NOTE: Save the args and kwargs for latter load
     self.__attrs_init__(llm_config,
                         quantization_config,
+                        _quantize,
                         model_id,
                         args, {
                             **model_kwds, **normalized_model_kwds
@@ -650,8 +650,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
                         _tag,
                         _adapters_mapping,
                         _model_version,
-                        _quantize_method,
-                        _serialisation_format,
+                        _serialisation,
                         _local)

     self.llm_post_init()
@@ -672,7 +671,7 @@ class LLM(LLMInterface[M, T], ReprMixin):

   @adapters_mapping.setter
   def adapters_mapping(self, value: AdaptersMapping) -> None:
-    self._adapters_mapping = value
+    _object_setattr(self, '_adapters_mapping', value)

   @property
   def __repr_keys__(self) -> set[str]:
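The setter switches to the module-level `_object_setattr` alias added earlier in this diff. That is the usual escape hatch for attrs classes whose generated `__setattr__` is restricted (frozen or otherwise guarded): `object.__setattr__` writes the attribute directly, bypassing the class override. A minimal stand-alone sketch of the pattern (the `Box` class here is hypothetical):

```python
import attr

_object_setattr = object.__setattr__

@attr.define(frozen=True)
class Box:
  value: int

b = Box(1)
# b.value = 2 would raise attr.exceptions.FrozenInstanceError
_object_setattr(b, 'value', 2)  # bypasses the frozen __setattr__ guard
print(b.value)  # 2
```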
@@ -709,13 +708,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
   def tag(self) -> bentoml.Tag:
     return self._tag

-  def ensure_model_id_exists(self) -> bentoml.Model:
+  def save_pretrained(self) -> bentoml.Model:
     return openllm.import_model(self.config['start_name'],
                                 model_id=self.model_id,
                                 model_version=self._model_version,
                                 backend=self.__llm_backend__,
-                                quantize=self._quantize_method,
-                                serialisation_format=self._serialisation_format)
+                                quantize=self._quantize,
+                                serialisation=self._serialisation)

   @property
   def _bentomodel(self) -> bentoml.Model:
@@ -1085,11 +1084,11 @@ def Runner(model_name: str,
            model_id: str | None = ...,
            model_version: str | None = ...,
            llm_config: LLMConfig | None = ...,
-           quantize: t.Literal['int8', 'int4', 'gptq'] | None = ...,
+           quantize: LiteralQuantise | None = ...,
            adapter_id: str | None = ...,
            adapter_name: str | None = ...,
            adapter_map: dict[str, str | None] | None = ...,
-           quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
+           quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None,
            serialisation: t.Literal['safetensors', 'legacy'] = ...,
            **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
   ...
@@ -1270,7 +1269,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
       'config': self.config,
       'backend': self.__llm_backend__,
       'peft_adapters': property(fget=available_adapters),
-      'download_model': self.ensure_model_id_exists,
+      'download_model': self.save_pretrained,
       '__call__': _wrapped_generate_run,
       'embed': _wrapped_embeddings_run,
       '__module__': self.__module__,

@@ -3,47 +3,68 @@ from __future__ import annotations
 import logging
 import typing as t

+import torch
+import transformers
+
+from openllm_core._typing_compat import LiteralQuantise
 from openllm_core._typing_compat import overload
-from openllm_core.utils import LazyLoader
 from openllm_core.utils import is_autogptq_available
 from openllm_core.utils import is_bitsandbytes_available
-from openllm_core.utils import is_transformers_supports_kbit
-from openllm_core.utils import pkg
+from openllm_core.utils import is_optimum_supports_gptq

 if t.TYPE_CHECKING:
   from openllm_core._typing_compat import DictStrAny

   from ._llm import LLM

-autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')

 logger = logging.getLogger(__name__)

-QuantiseMode = t.Literal['int8', 'int4', 'gptq']
-
 @overload
 def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
   ...

 @overload
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[transformers.GPTQConfig, DictStrAny]:
   ...

-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]:
   # 8 bit configuration
   int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
   int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
   int8_skip_modules: list[str] | None = attrs.pop('llm_int8_skip_modules', None)
   int8_has_fp16_weight = attrs.pop('llm_int8_has_fp16_weight', False)

-  autogptq_attrs: DictStrAny = {
-      'bits': attrs.pop('gptq_bits', 4),
-      'group_size': attrs.pop('gptq_group_size', -1),
-      'damp_percent': attrs.pop('gptq_damp_percent', 0.01),
-      'desc_act': attrs.pop('gptq_desc_act', True),
-      'sym': attrs.pop('gptq_sym', True),
-      'true_sequential': attrs.pop('gptq_true_sequential', True),
-  }
+  def create_gptq_config() -> transformers.GPTQConfig:
+    gptq_bits = attrs.pop('bits', 4)
+    gptq_tokenizer = attrs.pop('tokenizer', None)
+    gptq_dataset = attrs.pop('dataset', 'c4')
+    gptq_group_size = attrs.pop('group_size', 128)
+    gptq_damp_percent = attrs.pop('damp_percent', 0.1)
+    gptq_desc_act = attrs.pop('desc_act', False)
+    gptq_sym = attrs.pop('sym', True)
+    gptq_true_sequential = attrs.pop('true_sequential', True)
+    gptq_use_cuda_fp16 = attrs.pop('use_cuda_fp16', True if torch.cuda.is_available() else False)
+    gptq_model_seqlen = attrs.pop('model_seqlen', None)
+    gptq_block_name_to_quantize = attrs.pop('block_name_to_quantize', None)
+    gptq_module_name_preceding_first_block = attrs.pop('module_name_preceding_first_block', None)
+    gptq_batch_size = attrs.pop('batch_size', 1)
+    gptq_pad_token_id = attrs.pop('pad_token_id', None)
+    gptq_disable_exllama = attrs.pop('disable_exllama', False)
+    return transformers.GPTQConfig(bits=gptq_bits,
+                                   tokenizer=gptq_tokenizer,
+                                   dataset=gptq_dataset,
+                                   group_size=gptq_group_size,
+                                   damp_percent=gptq_damp_percent,
+                                   desc_act=gptq_desc_act,
+                                   sym=gptq_sym,
+                                   true_sequential=gptq_true_sequential,
+                                   use_cuda_fp16=gptq_use_cuda_fp16,
+                                   model_seqlen=gptq_model_seqlen,
+                                   block_name_to_quantize=gptq_block_name_to_quantize,
+                                   module_name_preceding_first_block=gptq_module_name_preceding_first_block,
+                                   batch_size=gptq_batch_size,
+                                   pad_token_id=gptq_pad_token_id,
+                                   disable_exllama=gptq_disable_exllama)

   def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
     if int8_skip_modules is None: int8_skip_modules = []
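`create_gptq_config()` simply forwards user attributes into `transformers.GPTQConfig`, the object the upstream transformers GPTQ integration consumes. A hedged stand-alone sketch of the equivalent call with the defaults above (model and tokenizer ids are illustrative; assumes `transformers>=4.32.1`, `optimum>=1.12.0` and `auto-gptq>=0.4.2` are installed):

```python
import transformers

# mirrors create_gptq_config() with all defaults left in place
gptq_config = transformers.GPTQConfig(bits=4, tokenizer='facebook/opt-125m', dataset='c4',
                                      group_size=128, damp_percent=0.1, desc_act=False)
# passing the config at load time triggers quantisation via optimum/auto-gptq
model = transformers.AutoModelForCausalLM.from_pretrained('facebook/opt-125m',
                                                          quantization_config=gptq_config,
                                                          device_map='auto')
```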
@@ -69,24 +90,18 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
     raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
   if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules)
   elif quantise == 'int4':
-    if is_transformers_supports_kbit():
-      quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
-                                                            bnb_4bit_compute_dtype=int4_compute_dtype,
-                                                            bnb_4bit_quant_type=int4_quant_type,
-                                                            bnb_4bit_use_double_quant=int4_use_double_quant)
-    else:
-      logger.warning(
-          "'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.",
-          pkg.pkg_version_info('transformers'))
-      quantisation_config = create_int8_config(int8_skip_modules)
+    quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
+                                                          bnb_4bit_compute_dtype=int4_compute_dtype,
+                                                          bnb_4bit_quant_type=int4_quant_type,
+                                                          bnb_4bit_use_double_quant=int4_use_double_quant)
   elif quantise == 'gptq':
-    if not is_autogptq_available():
+    if not is_autogptq_available() or not is_optimum_supports_gptq():
       logger.warning(
-          "'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
+          "'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
       )
       quantisation_config = create_int8_config(int8_skip_modules)
     else:
-      quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
+      quantisation_config = create_gptq_config()
   else:
     raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")
   return quantisation_config, attrs

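With transformers now pinned at >=4.32.1, the k-bit capability probe is redundant (4-bit support landed in 4.30), so the int4 branch builds the `BitsAndBytesConfig` unconditionally. A hedged sketch of the resulting int4 path with illustrative values (the real values come from the `int4_*` keyword pops, which sit outside this hunk):

```python
import torch
import transformers

# illustrative values; the function derives these from int4_* keyword attrs
int4_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
                                              bnb_4bit_compute_dtype=torch.bfloat16,
                                              bnb_4bit_quant_type='nf4',
                                              bnb_4bit_use_double_quant=True)
```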
@@ -109,12 +109,11 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
     ],
     'num_tokens': 20
 }))
-async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
+async def embeddings_v1(phrases: list[str]) -> list[openllm.EmbeddingsOutput]:
   embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type]
-  responses = (await embed_call.async_run(phrases))[0]
-  return openllm.EmbeddingsOutput(embeddings=responses['embeddings'], num_tokens=responses['num_tokens'])
+  return await embed_call.async_run(phrases)

-if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
+if runner.supports_hf_agent:

   async def hf_agent(request: Request) -> Response:
     json_str = await request.body()

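Note the contract change: the embeddings endpoint now returns the runner output as a list instead of unwrapping element `[0]` into a single `EmbeddingsOutput`. A hedged client-side sketch (endpoint path and port assumed from the usual OpenLLM HTTP service layout, not stated in this diff):

```python
import httpx

resp = httpx.post('http://localhost:3000/v1/embeddings', json=['hello', 'world'])
resp.raise_for_status()
payload = resp.json()  # previously a single object; now a list of embedding outputs
```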
@@ -128,19 +128,19 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any],
                              quantize: LiteralString | None,
                              adapter_map: dict[str, str | None] | None,
                              dockerfile_template: str | None,
-                             serialisation_format: t.Literal['safetensors', 'legacy'],
+                             serialisation: t.Literal['safetensors', 'legacy'],
                              container_registry: LiteralContainerRegistry,
                              container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
   from openllm.cli._factory import parse_config_options
   environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
   env: openllm_core.utils.EnvVarMixin = llm.config['env']
-  if env['backend_value'] == 'vllm': serialisation_format = 'legacy'
+  if env['backend_value'] == 'vllm': serialisation = 'legacy'
   env_dict = {
       env.backend: env['backend_value'],
       env.config: f"'{llm.config.model_dump_json().decode()}'",
       env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}',
       'OPENLLM_MODEL': llm.config['model_name'],
-      'OPENLLM_SERIALIZATION': serialisation_format,
+      'OPENLLM_SERIALIZATION': serialisation,
       'OPENLLM_ADAPTER_MAP': f"'{orjson.dumps(adapter_map).decode()}'",
       'BENTOML_DEBUG': str(True),
       'BENTOML_QUIET': str(False),
@@ -207,7 +207,7 @@ def create_bento(bento_tag: bentoml.Tag,
                  dockerfile_template: str | None,
                  adapter_map: dict[str, str | None] | None = None,
                  extra_dependencies: tuple[str, ...] | None = None,
-                 serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
+                 serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
                  container_registry: LiteralContainerRegistry = 'ecr',
                  container_version_strategy: LiteralContainerVersionStrategy = 'release',
                  _bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
@@ -246,7 +246,7 @@ def create_bento(bento_tag: bentoml.Tag,
                  quantize,
                  adapter_map,
                  dockerfile_template,
-                 serialisation_format,
+                 serialisation,
                  container_registry,
                  container_version_strategy))

@@ -22,6 +22,7 @@ from bentoml._internal.configuration.containers import BentoMLContainer
 from openllm_core._typing_compat import Concatenate
 from openllm_core._typing_compat import DictStrAny
 from openllm_core._typing_compat import LiteralBackend
+from openllm_core._typing_compat import LiteralQuantise
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import ParamSpec
 from openllm_core._typing_compat import get_literal_args
@@ -131,15 +132,15 @@ Available official model_id(s): [default: {llm_config['default_id']}]
                  model_version: str | None,
                  workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
                  device: t.Tuple[str, ...],
-                 quantize: t.Literal['int8', 'int4', 'gptq'] | None,
+                 quantize: LiteralQuantise | None,
                  backend: LiteralBackend,
-                 serialisation_format: t.Literal['safetensors', 'legacy'],
+                 serialisation: t.Literal['safetensors', 'legacy'],
                  cors: bool,
                  adapter_id: str | None,
                  return_process: bool,
                  **attrs: t.Any,
   ) -> LLMConfig | subprocess.Popen[bytes]:
-    if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
+    if serialisation == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
       termui.echo(
           f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
           fg='yellow')
@@ -184,11 +185,11 @@ Available official model_id(s): [default: {llm_config['default_id']}]
         'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),
         'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
         'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
-        'OPENLLM_SERIALIZATION': serialisation_format,
-        env.backend: env['backend_value']
+        'OPENLLM_SERIALIZATION': serialisation,
+        env.backend: env['backend_value'],
     })
     if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value'])
-    if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))
+    if env['quantize_value']: start_env[env.quantize] = str(env['quantize_value'])

     llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model,
                                                                          model_id=start_env[env.model_id],
@@ -196,7 +197,8 @@ Available official model_id(s): [default: {llm_config['default_id']}]
                                                                          llm_config=config,
                                                                          ensure_available=True,
                                                                          adapter_map=adapter_map,
-                                                                         serialisation=serialisation_format)
+                                                                         quantize=env['quantize_value'],
+                                                                         serialisation=serialisation)
     start_env.update({env.config: llm.config.model_dump_json().decode()})

     server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
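The start path now sources the quantisation mode from the model's environment variable rather than only from the parsed CLI flag, so a value exported before `openllm start` is honoured and forwarded into `for_model(..., quantize=...)`. A hedged sketch (the exact variable name is generated by `EnvVarMixin`; the one below is an assumed spelling for OPT):

```python
import os

# assumed EnvVarMixin naming scheme for the opt model
os.environ['OPENLLM_OPT_QUANTIZE'] = 'int8'
# a subsequent `openllm start opt` would then pick up quantize='int8'
```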
@@ -262,8 +264,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab

           - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
           - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
-          ''',
-      ),
+          '''),
       quantize_option(factory=cog.optgroup),
       serialisation_option(factory=cog.optgroup),
       cog.optgroup.option('--device',
@@ -457,7 +458,7 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--serialisation',
                     '--serialization',
-                    'serialisation_format',
+                    'serialisation',
                     type=click.Choice(['safetensors', 'legacy']),
                     default='safetensors',
                     show_default=True,

@@ -26,6 +26,7 @@ if t.TYPE_CHECKING:
   from openllm_core._typing_compat import LiteralBackend
   from openllm_core._typing_compat import LiteralContainerRegistry
   from openllm_core._typing_compat import LiteralContainerVersionStrategy
+  from openllm_core._typing_compat import LiteralQuantise
   from openllm_core._typing_compat import LiteralString

 logger = logging.getLogger(__name__)
@@ -37,7 +38,7 @@ def _start(model_name: str,
            timeout: int = 30,
            workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
            device: tuple[str, ...] | t.Literal['all'] | None = None,
-           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+           quantize: LiteralQuantise | None = None,
            adapter_map: dict[LiteralString, str | None] | None = None,
            backend: LiteralBackend | None = None,
            additional_args: list[str] | None = None,
@@ -109,7 +110,7 @@ def _build(model_name: str,
            model_id: str | None = None,
            model_version: str | None = None,
            bento_version: str | None = None,
-           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+           quantize: LiteralQuantise | None = None,
            adapter_map: dict[str, str | None] | None = None,
            build_ctx: str | None = None,
            enable_features: tuple[str, ...] | None = None,
@@ -120,7 +121,7 @@ def _build(model_name: str,
            container_version_strategy: LiteralContainerVersionStrategy | None = None,
            push: bool = False,
            containerize: bool = False,
-           serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
+           serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
            additional_args: list[str] | None = None,
            bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
   """Package a LLM into a Bento.
@@ -160,14 +161,14 @@ def _build(model_name: str,
-    container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
+    container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
     container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
-    serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
+    serialisation: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
     additional_args: Additional arguments to pass to ``openllm build``.
     bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.

   Returns:
       ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
   """
-  args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format]
+  args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation]
   if quantize: args.extend(['--quantize', quantize])
   if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
   if push: args.extend(['--push'])
@@ -203,8 +204,8 @@ def _import_model(model_name: str,
                   model_id: str | None = None,
                   model_version: str | None = None,
                   backend: LiteralBackend = 'pt',
-                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-                  serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
+                  quantize: LiteralQuantise | None = None,
+                  serialisation: t.Literal['legacy', 'safetensors'] = 'safetensors',
                   additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
   """Import a LLM into local store.

@@ -228,7 +229,7 @@ def _import_model(model_name: str,
       - int8: Quantize the model with 8bit (bitsandbytes required)
       - int4: Quantize the model with 4bit (bitsandbytes required)
       - gptq: Quantize the model with GPTQ (auto-gptq required)
-    serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
+    serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
                    Default behaviour is similar to ``safe_serialization=False``.
     additional_args: Additional arguments to pass to ``openllm import``.

@@ -236,7 +237,7 @@ def _import_model(model_name: str,
       ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
   """
   from .entrypoint import import_command
-  args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format]
+  args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation]
   if model_id is not None: args.append(model_id)
   if model_version is not None: args.extend(['--model-version', str(model_version)])
   if additional_args is not None: args.extend(additional_args)

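For callers of the public SDK helpers, the only visible change is the keyword rename. A hedged usage sketch (model ids illustrative):

```python
import openllm

# was: openllm.import_model('opt', ..., serialisation_format='safetensors')
model = openllm.import_model('opt', model_id='facebook/opt-2.7b', serialisation='safetensors')
bento = openllm.build('opt', serialisation='safetensors')
```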
@@ -54,7 +54,6 @@ import openllm
 from bentoml._internal.configuration.containers import BentoMLContainer
 from bentoml._internal.models.model import ModelStore
 from openllm import bundle
-from openllm import serialisation
 from openllm.exceptions import OpenLLMException
 from openllm.models.auto import CONFIG_MAPPING
 from openllm.models.auto import MODEL_FLAX_MAPPING_NAMES
@@ -67,6 +66,7 @@ from openllm.utils import infer_auto_class
 from openllm_core._typing_compat import Concatenate
 from openllm_core._typing_compat import DictStrAny
 from openllm_core._typing_compat import LiteralBackend
+from openllm_core._typing_compat import LiteralQuantise
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import ParamSpec
 from openllm_core._typing_compat import Self
@@ -84,7 +84,6 @@ from openllm_core.utils import first_not_none
 from openllm_core.utils import get_debug_mode
 from openllm_core.utils import get_quiet_mode
 from openllm_core.utils import is_torch_available
-from openllm_core.utils import is_transformers_supports_agent
 from openllm_core.utils import resolve_user_filepath
 from openllm_core.utils import set_debug_mode
 from openllm_core.utils import set_quiet_mode
@@ -343,8 +342,8 @@ def import_command(
     output: LiteralOutput,
     machine: bool,
     backend: LiteralBackend,
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None,
-    serialisation_format: t.Literal['safetensors', 'legacy'],
+    quantize: LiteralQuantise | None,
+    serialisation: t.Literal['safetensors', 'legacy'],
 ) -> bentoml.Model:
   """Setup LLM interactively.

@@ -369,7 +368,7 @@ def import_command(

   \b
   ```bash
-  $ openllm download opt facebook/opt-2.7b
+  $ openllm import opt facebook/opt-2.7b
   ```

   \b
@@ -400,17 +399,19 @@ def import_command(
   env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize)
   backend = first_not_none(backend, default=env['backend_value'])
   llm = infer_auto_class(backend).for_model(
-      model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
+      model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False,
+      quantize=env['quantize_value'],
+      serialisation=serialisation
   )
   _previously_saved = False
   try:
-    _ref = serialisation.get(llm)
+    _ref = openllm.serialisation.get(llm)
     _previously_saved = True
   except openllm.exceptions.OpenLLMException:
     if not machine and output == 'pretty':
       msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
       termui.echo(msg, fg='yellow', nl=True)
-    _ref = serialisation.get(llm, auto_import=True)
+    _ref = openllm.serialisation.get(llm, auto_import=True)
   if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
   if machine: return _ref
   elif output == 'pretty':
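A side effect of renaming the `serialisation_format` parameter to `serialisation` is that it would shadow the `openllm.serialisation` module inside `import_command`, which is presumably why the bare `from openllm import serialisation` import was dropped above and both call sites now use the fully qualified `openllm.serialisation.get(...)`.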
@@ -472,7 +473,7 @@ def build_command(
     bento_version: str | None,
     overwrite: bool,
     output: LiteralOutput,
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None,
+    quantize: LiteralQuantise | None,
     enable_features: tuple[str, ...] | None,
     workers_per_resource: float | None,
     adapter_id: tuple[str, ...],
@@ -483,7 +484,7 @@ def build_command(
     dockerfile_template: t.TextIO | None,
     containerize: bool,
     push: bool,
-    serialisation_format: t.Literal['safetensors', 'legacy'],
+    serialisation: t.Literal['safetensors', 'legacy'],
     container_registry: LiteralContainerRegistry,
     container_version_strategy: LiteralContainerVersionStrategy,
     force_push: bool,
@@ -517,12 +518,12 @@ def build_command(
   # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
   # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
   try:
-    os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']})
+    os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation, env.backend: env['backend_value']})
     if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value'])
     if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value'])

     llm = infer_auto_class(env['backend_value']).for_model(
-        model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs
+        model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, quantize=env['quantize_value'], serialisation=serialisation, **attrs
     )

     labels = dict(llm.identifying_params)
@@ -798,7 +799,6 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
   except http.client.BadStatusLine:
     raise click.ClickException(f'{endpoint} is neither a HTTP server nor reachable.') from None
   if agent == 'hf':
-    if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. Upgrade with 'pip install -U transformers'")
     _memoized = {k: v[0] for k, v in _memoized.items() if v}
     client._hf_agent.set_stream(logger.info)
     if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta')
@@ -49,7 +49,7 @@ class BaseAutoLLMClass:
     ```
     '''
     llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
-    if ensure_available: llm.ensure_model_id_exists()
+    if ensure_available: llm.save_pretrained()
     return llm

   @classmethod

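Downstream code that relied on the old method name needs a one-line change; the runner's `download_model` attribute keeps working since it now points at the renamed method. A hedged migration sketch:

```python
import openllm

llm = openllm.AutoLLM.from_pretrained('opt')
bentomodel = llm.save_pretrained()  # was: llm.ensure_model_id_exists()
```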
@@ -37,6 +37,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
   from .transformers._helpers import process_config

+  config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code)

   bentomodel_fs = fs.open_fs(llm._bentomodel.path)
   if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
     with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile:

@@ -14,13 +14,14 @@ import openllm

 from bentoml._internal.configuration.containers import BentoMLContainer
 from bentoml._internal.models.model import ModelOptions
+from openllm_core._typing_compat import M
+from openllm_core._typing_compat import T

 from ._helpers import check_unintialised_params
 from ._helpers import infer_autoclass_from_llm
 from ._helpers import infer_tokenizers_from_llm
 from ._helpers import make_model_signatures
 from ._helpers import process_config
-from ._helpers import update_model
 from .weights import HfIgnore

 if t.TYPE_CHECKING:
@@ -32,8 +33,6 @@ if t.TYPE_CHECKING:

   from bentoml._internal.models import ModelStore
   from openllm_core._typing_compat import DictStrAny
-  from openllm_core._typing_compat import M
-  from openllm_core._typing_compat import T
 else:
-  autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq')
   torch = openllm.utils.LazyLoader('torch', globals(), 'torch')
@@ -63,16 +62,23 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
   """
   config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
   _, tokenizer_attrs = llm.llm_parameters
-  quantize_method = llm._quantize_method
-  safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors')
+  quantize = llm._quantize
+  safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors')
   # Disable safe serialization with vLLM
   if llm.__llm_backend__ == 'vllm': safe_serialisation = False
-  metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method}
+  metadata: DictStrAny = {'safe_serialisation': safe_serialisation}
+  if quantize: metadata['_quantize'] = quantize
+  architectures = getattr(config, 'architectures', [])
+  if not architectures: raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
+  metadata['_pretrained_class'] = architectures[0]

   signatures: DictStrAny = {}

-  if quantize_method == 'gptq':
-    if not openllm.utils.is_autogptq_available():
-      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
+  if quantize == 'gptq':
+    if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
+      raise openllm.exceptions.OpenLLMException(
+          "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
+      )
     if llm.config['model_type'] != 'causal_lm':
       raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
     signatures['generate'] = {'batchable': False}
@@ -82,7 +88,8 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
   if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False):
     attrs.pop('quantization_config')
   if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation
-  metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__
+  metadata['_framework'] = llm.__llm_backend__
+  signatures.update(make_model_signatures(llm))

   tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
   if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
@@ -95,42 +102,22 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
                               options=ModelOptions(),
                               context=openllm.utils.generate_context(framework_name='openllm'),
                               labels=openllm.utils.generate_labels(llm),
-                              signatures=signatures if signatures else make_model_signatures(llm))
+                              metadata=metadata,
+                              signatures=signatures)
   with openllm.utils.analytics.set_bentoml_tracking():
     try:
       bentomodel.enter_cloudpickle_context(external_modules, imported_modules)
       tokenizer.save_pretrained(bentomodel.path)
-      if quantize_method == 'gptq':
-        if not openllm.utils.is_autogptq_available():
-          raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
-        if llm.config['model_type'] != 'causal_lm':
-          raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-        logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
-        model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id,
-                                                            *decls,
-                                                            quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
-                                                            trust_remote_code=trust_remote_code,
-                                                            use_safetensors=safe_serialisation,
-                                                            **hub_attrs,
-                                                            **attrs)
-        update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework})
-        model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation)
+      if llm._local:
+        # possible local path
+        logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
+        model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
+        # for trust_remote_code to work
+        bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
+        model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
       else:
-        architectures = getattr(config, 'architectures', [])
-        if not architectures:
-          raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
-        architecture = architectures[0]
-        update_model(bentomodel, metadata={'_pretrained_class': architecture})
-        if llm._local:
-          # possible local path
-          logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
-          model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
-          # for trust_remote_code to work
-          bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
-          model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
-        else:
-          # we will clone the all tings into the bentomodel path without loading model into memory
-          snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
+        # we will clone the all tings into the bentomodel path without loading model into memory
+        snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
     except Exception:
       raise
     else:
@@ -165,29 +152,27 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:

 def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
   config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs)
-  safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
-                                                    attrs.pop('safe_serialization', None),
-                                                    default=llm._serialisation_format == 'safetensors')
-  if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
-    if not openllm.utils.is_autogptq_available():
-      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
-    if llm.config['model_type'] != 'causal_lm':
-      raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-    return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,
-                                                       *decls,
-                                                       quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
-                                                       trust_remote_code=llm.trust_remote_code,
-                                                       use_safetensors=safe_serialization,
-                                                       **hub_attrs,
-                                                       **attrs)
+  auto_class = infer_autoclass_from_llm(llm, config)

-  device_map: str | None = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
-  model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path,
-                                                                *decls,
-                                                                config=config,
-                                                                trust_remote_code=llm.trust_remote_code,
-                                                                device_map=device_map,
-                                                                **hub_attrs,
-                                                                **attrs).eval()
-  if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
+  device_map = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
+  if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
+    if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
+      raise openllm.exceptions.OpenLLMException(
+          "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
+      )
+    if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")

+    model = auto_class.from_pretrained(llm._bentomodel.path, device_map='auto', **hub_attrs, **attrs)
+    # TODO: Use the below logic once TheBloke finished migration to new GPTQConfig from transformers
+    # from accelerate import init_empty_weights
+    # from optimum.gptq import load_quantized_model
+    # # disable exllama if gptq is loaded on CPU
+    # disable_exllama = not torch.cuda.is_available()
+    # with init_empty_weights():
+    #   empty = auto_class.from_pretrained(llm.model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map='auto')
+    # empty.tie_weights()
+    # model = load_quantized_model(empty, save_folder=llm._bentomodel.path, device_map='auto', disable_exllama=disable_exllama)
+  else:
+    model = auto_class.from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.trust_remote_code, device_map=device_map, **hub_attrs, **attrs).eval()
+    if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
   return t.cast('M', model)

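Loading a GPTQ checkpoint now goes through the plain transformers auto class, since transformers reads the `quantization_config` embedded in the checkpoint itself; the optimum `load_quantized_model` path stays parked in the TODO until popular hubs finish migrating to the new config format. A hedged stand-alone sketch of the new load path (repository id illustrative):

```python
import transformers

# the quantisation settings are picked up from the checkpoint's own
# quantization_config entry; no auto_gptq-specific loader is needed
model = transformers.AutoModelForCausalLM.from_pretrained('TheBloke/Llama-2-7B-GPTQ', device_map='auto')
```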
@@ -5,7 +5,6 @@ import typing as t
 import openllm
 import openllm_core

-from bentoml._internal.models.model import ModelInfo
 from bentoml._internal.models.model import ModelSignature
 from openllm.serialisation.constants import FRAMEWORK_TO_AUTOCLASS_MAPPING
 from openllm.serialisation.constants import HUB_ATTRS
@@ -16,8 +15,6 @@ if t.TYPE_CHECKING:

   from transformers.models.auto.auto_factory import _BaseAutoModelClass

-  import bentoml
-
   from bentoml._internal.models.model import ModelSignaturesType
   from openllm_core._typing_compat import DictStrAny
   from openllm_core._typing_compat import M
@@ -25,8 +22,6 @@ if t.TYPE_CHECKING:
 else:
   transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')

-_object_setattr = object.__setattr__
-
 def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
   '''A helper function that correctly parse config and attributes for transformers.PretrainedConfig.
@@ -73,24 +68,6 @@ def check_unintialised_params(model: torch.nn.Module) -> None:
   if len(unintialized) > 0:
     raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}')

-def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model:
-  based: DictStrAny = copy.deepcopy(bentomodel.info.metadata)
-  based.update(metadata)
-  _object_setattr(
-      bentomodel,
-      '_info',
-      ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged
-          tag=bentomodel.info.tag,
-          module=bentomodel.info.module,
-          labels=bentomodel.info.labels,
-          options=bentomodel.info.options.to_dict(),
-          signatures=bentomodel.info.signatures,
-          context=bentomodel.info.context,
-          api_version=bentomodel.info.api_version,
-          creation_time=bentomodel.info.creation_time,
-          metadata=based))
-  return bentomodel
-
 # NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures
 def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
   infer_fn: tuple[str, ...] = ('__call__',)

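`update_model` existed only to rebuild the frozen `ModelInfo` after creation via `object.__setattr__`. It is dead code now because `import_model` (earlier in this diff) computes `_pretrained_class` and `_quantize` before calling `bentoml.models.create(...)` and passes `metadata=metadata` up front, so the model's metadata never needs to be patched afterwards; the `ModelInfo` import and the `_object_setattr` alias in this helper module go with it.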
@@ -11,11 +11,12 @@ import openllm

 if t.TYPE_CHECKING:
   from openllm_core._typing_compat import LiteralBackend
+  from openllm_core._typing_compat import LiteralQuantise

 logger = logging.getLogger(__name__)

 @contextlib.contextmanager
-def build_bento(model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
+def build_bento(model: str, model_id: str | None = None, quantize: LiteralQuantise | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
   logger.info('Building BentoML for %s', model)
   bento = openllm.build(model, model_id=model_id, quantize=quantize)
   yield bento

@@ -19,13 +19,7 @@ if t.TYPE_CHECKING:
   from openllm_core._typing_compat import LiteralBackend

 def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
-  return {
-      'backend': llm.__llm_backend__,
-      'framework': 'openllm',
-      'model_name': llm.config['model_name'],
-      'architecture': llm.config['architecture'],
-      'serialisation_format': llm._serialisation_format
-  }
+  return {'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation}

 def infer_auto_class(backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
   import openllm

@@ -24,6 +24,7 @@ import openllm
 from openllm._llm import normalise_model_name
 from openllm_core._typing_compat import DictStrAny
 from openllm_core._typing_compat import ListAny
+from openllm_core._typing_compat import LiteralQuantise

 logger = logging.getLogger(__name__)

@@ -141,14 +142,7 @@ class DockerHandle(_Handle):
     return container.status in ['running', 'created']

 @contextlib.contextmanager
-def _local_handle(model: str,
-                  model_id: str,
-                  image_tag: str,
-                  deployment_mode: t.Literal['container', 'local'],
-                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-                  *,
-                  _serve_grpc: bool = False,
-                  ):
+def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
   with openllm.utils.reserve_free_port() as port:
     pass

@@ -169,14 +163,7 @@ def _local_handle(model: str,
   proc.stderr.close()

 @contextlib.contextmanager
-def _container_handle(model: str,
-                      model_id: str,
-                      image_tag: str,
-                      deployment_mode: t.Literal['container', 'local'],
-                      quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-                      *,
-                      _serve_grpc: bool = False,
-                      ):
+def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
   envvar = openllm.utils.EnvVarMixin(model)

   with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port: