diff --git a/changelog.d/297.refactor.md b/changelog.d/297.refactor.md new file mode 100644 index 00000000..d88ff541 --- /dev/null +++ b/changelog.d/297.refactor.md @@ -0,0 +1 @@ +Refactor GPTQ to use official implementation from transformers>=4.32 diff --git a/openllm-client/pyproject.toml b/openllm-client/pyproject.toml index f4466905..573f73cf 100644 --- a/openllm-client/pyproject.toml +++ b/openllm-client/pyproject.toml @@ -105,7 +105,7 @@ dependencies = [ # avoid https://github.com/pallets/click/issues/2558 "click==8.1.3", "bentoml==1.1.2", - "transformers>=4.31.0", + "transformers>=4.32.1", "pandas-stubs", "types-psutil", "types-tabulate", diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py index ae453ec2..5188c187 100644 --- a/openllm-client/src/openllm_client/_base.py +++ b/openllm-client/src/openllm_client/_base.py @@ -19,7 +19,6 @@ from openllm_core._typing_compat import overload from openllm_core.utils import bentoml_cattr from openllm_core.utils import ensure_exec_coro from openllm_core.utils import is_transformers_available -from openllm_core.utils import is_transformers_supports_agent from .benmin import AsyncClient as AsyncBentoClient from .benmin import Client as BentoClient @@ -94,8 +93,6 @@ class _ClientAttr: raise RuntimeError("transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.") if not self.supports_hf_agent: raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.') - if not is_transformers_supports_agent(): - raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'") import transformers return transformers.HfAgent(urljoin(self._address, '/hf/agent')) @@ -215,8 +212,6 @@ class _AsyncClient(_ClientAttr): else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'") async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: - if not is_transformers_supports_agent(): - raise RuntimeError('This version of transformers does not support agent.run. 
Make sure to upgrade to transformers>4.30.0') if len(args) > 1: raise ValueError("'args' should only take one positional argument.") from transformers.tools.agents import clean_code_for_run from transformers.tools.agents import get_tool_creation_code diff --git a/openllm-core/pyproject.toml b/openllm-core/pyproject.toml index f519700a..b025e889 100644 --- a/openllm-core/pyproject.toml +++ b/openllm-core/pyproject.toml @@ -118,7 +118,7 @@ dependencies = [ # avoid https://github.com/pallets/click/issues/2558 "click==8.1.3", "bentoml==1.1.2", - "transformers>=4.31.0", + "transformers>=4.32.1", "pandas-stubs", "types-psutil", "types-tabulate", diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index 21939026..1054a233 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -11,7 +11,6 @@ import bentoml from bentoml._internal.types import ModelSignatureDict as ModelSignatureDict if t.TYPE_CHECKING: - import auto_gptq as autogptq import peft import transformers import vllm @@ -26,11 +25,7 @@ if t.TYPE_CHECKING: from .utils.lazy import VersionInfo -M = t.TypeVar( - 'M', - bound= - 't.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]' -) +M = t.TypeVar('M', bound='t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, peft.PeftModel]') T = t.TypeVar('T', bound='t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]') def get_literal_args(typ: t.Any) -> tuple[str, ...]: @@ -43,6 +38,7 @@ ListStr = t.List[str] TupleAny = t.Tuple[t.Any, ...] 
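The `_typing_compat.py` hunk here drops `auto_gptq` from the `M` TypeVar bound and introduces a single `LiteralQuantise = t.Literal['int8', 'int4', 'gptq']` alias (added just below) in place of the inline literal that was previously repeated across signatures. A minimal sketch of how such an alias is typically consumed — the `load` helper is illustrative only and not part of this patch:

```python
import typing as t

# Mirrors the alias added to openllm_core/_typing_compat.py in this patch.
LiteralQuantise = t.Literal['int8', 'int4', 'gptq']

def load(model_id: str, quantize: t.Optional[LiteralQuantise] = None) -> None:
    # Type checkers flag unsupported values at every call site that reuses the
    # alias; at runtime the accepted values can be recovered with get_args().
    if quantize is not None and quantize not in t.get_args(LiteralQuantise):
        raise ValueError(f'unsupported quantisation scheme: {quantize!r}')
    print(f'loading {model_id} with quantize={quantize}')

load('facebook/opt-125m', quantize='gptq')
```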
At = t.TypeVar('At', bound=attr.AttrsInstance) +LiteralQuantise = t.Literal['int8', 'int4', 'gptq'] LiteralBackend = t.Literal['pt', 'tf', 'flax', 'vllm', 'ggml', 'mlc'] AdapterType = t.Literal['lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3'] diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index bdf4e20a..dfe76ea6 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -332,8 +332,6 @@ _import_structure: dict[str, list[str]] = { 'is_bitsandbytes_available', 'is_peft_available', 'is_datasets_available', - 'is_transformers_supports_kbit', - 'is_transformers_supports_agent', 'is_jupyter_available', 'is_jupytext_available', 'is_notebook_available', @@ -344,7 +342,8 @@ _import_structure: dict[str, list[str]] = { 'is_fairscale_available', 'is_grpc_available', 'is_grpc_health_available', - 'is_transformers_available' + 'is_transformers_available', + 'is_optimum_supports_gptq', ] } @@ -368,13 +367,12 @@ if t.TYPE_CHECKING: from .import_utils import is_jupyter_available as is_jupyter_available from .import_utils import is_jupytext_available as is_jupytext_available from .import_utils import is_notebook_available as is_notebook_available + from .import_utils import is_optimum_supports_gptq as is_optimum_supports_gptq from .import_utils import is_peft_available as is_peft_available from .import_utils import is_sentencepiece_available as is_sentencepiece_available from .import_utils import is_tf_available as is_tf_available from .import_utils import is_torch_available as is_torch_available from .import_utils import is_transformers_available as is_transformers_available - from .import_utils import is_transformers_supports_agent as is_transformers_supports_agent - from .import_utils import is_transformers_supports_kbit as is_transformers_supports_kbit from .import_utils import is_triton_available as is_triton_available from .import_utils import is_vllm_available as is_vllm_available from .import_utils import is_xformers_available as is_xformers_available diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py index fe5d44c6..e0a4aef2 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -74,11 +74,8 @@ def is_grpc_available() -> bool: def is_grpc_health_available() -> bool: return _grpc_health_available -def is_transformers_supports_kbit() -> bool: - return pkg.pkg_version_info('transformers')[:2] >= (4, 30) - -def is_transformers_supports_agent() -> bool: - return pkg.pkg_version_info('transformers')[:2] >= (4, 29) +def is_optimum_supports_gptq() -> bool: + return pkg.pkg_version_info('optimum')[:2] >= (0, 12) def is_jupyter_available() -> bool: return _jupyter_available diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index 129f5075..c2d225b1 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -39,10 +39,11 @@ classifiers = [ ] dependencies = [ "bentoml[io]>=1.1.2", - "transformers[torch,tokenizers,accelerate]>=4.29.0", + "transformers[torch,tokenizers,accelerate]>=4.32.1", "openllm-client", "safetensors", - "optimum", + "optimum>=1.12.0", + "accelerate", "ghapi", "tabulate[widechars]>=0.9.0", "click>=8.1.3", @@ -99,13 +100,13 @@ all = ["openllm[full]"] baichuan = ["cpm-kernels", "sentencepiece"] chatglm = ["cpm-kernels", "sentencepiece"] falcon = 
["einops", "xformers"] -fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"] +fine-tune = ["peft>=0.5.0", "bitsandbytes", "datasets", "accelerate", "trl"] flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"] full = [ - "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]", + "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]", ] ggml = ["ctransformers"] -gptq = ["auto-gptq[triton]"] +gptq = ["auto-gptq[triton]>=0.4.2", "optimum>=1.12.0"] grpc = ["openllm-client[grpc]"] llama = ["fairscale", "sentencepiece"] mpt = ["triton", "einops"] @@ -150,7 +151,7 @@ dependencies = [ # avoid https://github.com/pallets/click/issues/2558 "click==8.1.3", "bentoml==1.1.2", - "transformers>=4.31.0", + "transformers>=4.32.1", "pandas-stubs", "types-psutil", "types-tabulate", diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 89160e3c..7e0ad43f 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -28,6 +28,7 @@ from openllm_core._typing_compat import AdaptersTuple from openllm_core._typing_compat import AdapterType from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import LiteralBackend +from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import LiteralString from openllm_core._typing_compat import LLMRunnable from openllm_core._typing_compat import LLMRunner @@ -63,7 +64,6 @@ from .utils import infer_auto_class if t.TYPE_CHECKING: - import auto_gptq as autogptq import peft import torch import transformers @@ -71,7 +71,6 @@ if t.TYPE_CHECKING: from openllm_core._configuration import PeftType from openllm_core.utils.representation import ReprArgs else: - autogptq = LazyLoader('autogptq', globals(), 'auto_gptq') transformers = LazyLoader('transformers', globals(), 'transformers') torch = LazyLoader('torch', globals(), 'torch') peft = LazyLoader('peft', globals(), 'peft') @@ -80,6 +79,8 @@ ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConf logger = logging.getLogger(__name__) +_object_setattr = object.__setattr__ + def normalise_model_name(name: str) -> str: if validate_is_path(name): return os.path.basename(resolve_filepath(name)) name = name.replace('/', '--') @@ -280,7 +281,8 @@ class LLM(LLMInterface[M, T], ReprMixin): def __attrs_init__(self, config: LLMConfig, - quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]], + quantize: t.Optional[LiteralQuantise], + quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig]], model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, @@ -288,17 +290,16 @@ class LLM(LLMInterface[M, T], ReprMixin): tag: bentoml.Tag, adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str], - quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']], - serialisation_format: t.Literal['safetensors', 'legacy'], + serialisation: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any) -> None: '''Generated __attrs_init__ for openllm.LLM.''' config: LLMConfig '''The config instance to use for this LLM. 
This will be created based on config_class and available when initialising the LLM.''' - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None + quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None '''Quantisation config for quantised model on the fly.''' - + _quantize: LiteralQuantise | None _model_id: str _model_decls: TupleAny _model_attrs: DictStrAny @@ -306,8 +307,7 @@ class LLM(LLMInterface[M, T], ReprMixin): _tag: bentoml.Tag _adapters_mapping: AdaptersMapping | None _model_version: str - _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None - _serialisation_format: t.Literal['safetensors', 'legacy'] + _serialisation: t.Literal['safetensors', 'legacy'] _local: bool def __init_subclass__(cls: type[LLM[M, T]]) -> None: @@ -376,11 +376,11 @@ class LLM(LLMInterface[M, T], ReprMixin): model_version: str | None = None, llm_config: LLMConfig | None = None, *args: t.Any, - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, + quantize: LiteralQuantise | None = None, adapter_id: str | None = None, adapter_name: str | None = None, adapter_map: dict[str, str | None] | None = None, - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, + quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None, serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors', **attrs: t.Any) -> LLM[M, T]: '''Instantiate a pretrained LLM. @@ -403,9 +403,6 @@ class LLM(LLMInterface[M, T], ReprMixin): model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False) ``` - For all GPTQ-related options, it accepts all value prefixed with `gptq_*`. The parsed value then could be parsed - to ``auto_gptq.BaseQuantizeConfig``. - ### Adapter options: > This is used in conjunction with the fine-tuning features @@ -427,7 +424,7 @@ class LLM(LLMInterface[M, T], ReprMixin): will use `config_class` to construct default configuration. quantize: The quantization to use for this LLM. Defaults to None. Possible values include int8, int4 and gptq. - quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogtpq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize` + quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `transformers.GPTQConfig`) to use. Note that this is mutually exclusive with `quantize` serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. Default behaviour is similar to ``safe_serialization=False``. adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None. @@ -440,13 +437,15 @@ class LLM(LLMInterface[M, T], ReprMixin): _local = False _model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__) if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True - quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None) + quantize = first_not_none(quantize, t.cast(t.Optional[LiteralQuantise], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None) # quantization setup if quantization_config and quantize: raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. 
Either customise your quantization_config or use the 'quantize' argument.") if quantization_config is None and quantize is not None: - quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs) + # in case users input `tokenizer` to __init__, default to the _model_id + _gptq_tokenizer = attrs.pop('tokenizer', _model_id) + quantization_config, attrs = infer_quantisation_config(cls, quantize, tokenizer=_gptq_tokenizer, **attrs) if quantize == 'gptq': serialisation = 'safetensors' elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy' # Currently working-in-progress @@ -476,10 +475,10 @@ class LLM(LLMInterface[M, T], ReprMixin): model_id=_model_id, llm_config=llm_config, quantization_config=quantization_config, - _quantize_method=quantize, + _quantize=quantize, _model_version=_tag.version, _tag=_tag, - _serialisation_format=serialisation, + _serialisation=serialisation, _local=_local, _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, **attrs) @@ -534,12 +533,12 @@ class LLM(LLMInterface[M, T], ReprMixin): *args: t.Any, model_id: str, llm_config: LLMConfig, - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, + quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, - _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, + _quantize: LiteralQuantise | None, _model_version: str, - _serialisation_format: t.Literal['safetensors', 'legacy'], + _serialisation: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any, ): @@ -641,6 +640,7 @@ class LLM(LLMInterface[M, T], ReprMixin): # NOTE: Save the args and kwargs for latter load self.__attrs_init__(llm_config, quantization_config, + _quantize, model_id, args, { **model_kwds, **normalized_model_kwds @@ -650,8 +650,7 @@ class LLM(LLMInterface[M, T], ReprMixin): _tag, _adapters_mapping, _model_version, - _quantize_method, - _serialisation_format, + _serialisation, _local) self.llm_post_init() @@ -672,7 +671,7 @@ class LLM(LLMInterface[M, T], ReprMixin): @adapters_mapping.setter def adapters_mapping(self, value: AdaptersMapping) -> None: - self._adapters_mapping = value + _object_setattr(self, '_adapters_mapping', value) @property def __repr_keys__(self) -> set[str]: @@ -709,13 +708,13 @@ class LLM(LLMInterface[M, T], ReprMixin): def tag(self) -> bentoml.Tag: return self._tag - def ensure_model_id_exists(self) -> bentoml.Model: + def save_pretrained(self) -> bentoml.Model: return openllm.import_model(self.config['start_name'], model_id=self.model_id, model_version=self._model_version, backend=self.__llm_backend__, - quantize=self._quantize_method, - serialisation_format=self._serialisation_format) + quantize=self._quantize, + serialisation=self._serialisation) @property def _bentomodel(self) -> bentoml.Model: @@ -1085,11 +1084,11 @@ def Runner(model_name: str, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., - quantize: t.Literal['int8', 'int4', 'gptq'] | None = ..., + quantize: LiteralQuantise | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, + quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None, serialisation: t.Literal['safetensors', 'legacy'] = ..., **attrs: t.Any) -> 
LLMRunner[t.Any, t.Any]: ... @@ -1270,7 +1269,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: 'config': self.config, 'backend': self.__llm_backend__, 'peft_adapters': property(fget=available_adapters), - 'download_model': self.ensure_model_id_exists, + 'download_model': self.save_pretrained, '__call__': _wrapped_generate_run, 'embed': _wrapped_embeddings_run, '__module__': self.__module__, diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index 83258aac..16f02893 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -3,47 +3,68 @@ from __future__ import annotations import logging import typing as t +import torch +import transformers + +from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import overload -from openllm_core.utils import LazyLoader from openllm_core.utils import is_autogptq_available from openllm_core.utils import is_bitsandbytes_available -from openllm_core.utils import is_transformers_supports_kbit -from openllm_core.utils import pkg +from openllm_core.utils import is_optimum_supports_gptq if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny from ._llm import LLM -autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers') - logger = logging.getLogger(__name__) -QuantiseMode = t.Literal['int8', 'int4', 'gptq'] - @overload def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ... @overload -def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[transformers.GPTQConfig, DictStrAny]: ... 
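The overloads above now return `transformers.GPTQConfig` when `quantise='gptq'`, delegating GPTQ to the official integration that transformers 4.32 exposes through optimum. For reference, a minimal sketch of that upstream API — the model id is only an example, quantising this way assumes `auto-gptq>=0.4.2` and `optimum>=1.12.0` on a CUDA machine, and none of this is part of the patch itself:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = 'facebook/opt-125m'  # example model, not mandated by the patch
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The kwargs mirror the defaults used by create_gptq_config() further below.
gptq_config = GPTQConfig(bits=4, dataset='c4', tokenizer=tokenizer, group_size=128, damp_percent=0.1, desc_act=False)

# from_pretrained() performs the quantisation, delegating to optimum + auto-gptq.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', quantization_config=gptq_config)

# Once saved, the quantised checkpoint reloads without passing a GPTQConfig:
model.save_pretrained('opt-125m-gptq')
model = AutoModelForCausalLM.from_pretrained('opt-125m-gptq', device_map='auto')
```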
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]: +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]: # 8 bit configuration int8_threshold = attrs.pop('llm_int8_threshhold', 6.0) int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False) int8_skip_modules: list[str] | None = attrs.pop('llm_int8_skip_modules', None) int8_has_fp16_weight = attrs.pop('llm_int8_has_fp16_weight', False) - autogptq_attrs: DictStrAny = { - 'bits': attrs.pop('gptq_bits', 4), - 'group_size': attrs.pop('gptq_group_size', -1), - 'damp_percent': attrs.pop('gptq_damp_percent', 0.01), - 'desc_act': attrs.pop('gptq_desc_act', True), - 'sym': attrs.pop('gptq_sym', True), - 'true_sequential': attrs.pop('gptq_true_sequential', True), - } + def create_gptq_config() -> transformers.GPTQConfig: + gptq_bits = attrs.pop('bits', 4) + gptq_tokenizer = attrs.pop('tokenizer', None) + gptq_dataset = attrs.pop('dataset', 'c4') + gptq_group_size = attrs.pop('group_size', 128) + gptq_damp_percent = attrs.pop('damp_percent', 0.1) + gptq_desc_act = attrs.pop('desc_act', False) + gptq_sym = attrs.pop('sym', True) + gptq_true_sequential = attrs.pop('true_sequential', True) + gptq_use_cuda_fp16 = attrs.pop('use_cuda_fp16', True if torch.cuda.is_available() else False) + gptq_model_seqlen = attrs.pop('model_seqlen', None) + gptq_block_name_to_quantize = attrs.pop('block_name_to_quantize', None) + gptq_module_name_preceding_first_block = attrs.pop('module_name_preceding_first_block', None) + gptq_batch_size = attrs.pop('batch_size', 1) + gptq_pad_token_id = attrs.pop('pad_token_id', None) + gptq_disable_exllama = attrs.pop('disable_exllama', False) + return transformers.GPTQConfig(bits=gptq_bits, + tokenizer=gptq_tokenizer, + dataset=gptq_dataset, + group_size=gptq_group_size, + damp_percent=gptq_damp_percent, + desc_act=gptq_desc_act, + sym=gptq_sym, + true_sequential=gptq_true_sequential, + use_cuda_fp16=gptq_use_cuda_fp16, + model_seqlen=gptq_model_seqlen, + block_name_to_quantize=gptq_block_name_to_quantize, + module_name_preceding_first_block=gptq_module_name_preceding_first_block, + batch_size=gptq_batch_size, + pad_token_id=gptq_pad_token_id, + disable_exllama=gptq_disable_exllama) def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig: if int8_skip_modules is None: int8_skip_modules = [] @@ -69,24 +90,18 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'") if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules) elif quantise == 'int4': - if is_transformers_supports_kbit(): - quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True, - bnb_4bit_compute_dtype=int4_compute_dtype, - bnb_4bit_quant_type=int4_quant_type, - bnb_4bit_use_double_quant=int4_use_double_quant) - else: - logger.warning( - "'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. 
k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.", - pkg.pkg_version_info('transformers')) - quantisation_config = create_int8_config(int8_skip_modules) + quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True, + bnb_4bit_compute_dtype=int4_compute_dtype, + bnb_4bit_quant_type=int4_quant_type, + bnb_4bit_use_double_quant=int4_use_double_quant) elif quantise == 'gptq': - if not is_autogptq_available(): + if not is_autogptq_available() or not is_optimum_supports_gptq(): logger.warning( - "'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes." + "'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes." ) quantisation_config = create_int8_config(int8_skip_modules) else: - quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs) + quantisation_config = create_gptq_config() else: raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.") return quantisation_config, attrs diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 70fd1608..fc2be132 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -109,12 +109,11 @@ def metadata_v1(_: str) -> openllm.MetadataOutput: ], 'num_tokens': 20 })) -async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput: +async def embeddings_v1(phrases: list[str]) -> list[openllm.EmbeddingsOutput]: embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type] - responses = (await embed_call.async_run(phrases))[0] - return openllm.EmbeddingsOutput(embeddings=responses['embeddings'], num_tokens=responses['num_tokens']) + return await embed_call.async_run(phrases) -if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent(): +if runner.supports_hf_agent: async def hf_agent(request: Request) -> Response: json_str = await request.body() diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 5cb23f20..262d5730 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -128,19 +128,19 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, - serialisation_format: t.Literal['safetensors', 'legacy'], + serialisation: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: from openllm.cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy()) env: openllm_core.utils.EnvVarMixin = llm.config['env'] - if env['backend_value'] == 'vllm': serialisation_format = 'legacy' + if env['backend_value'] == 'vllm': 
serialisation = 'legacy' env_dict = { env.backend: env['backend_value'], env.config: f"'{llm.config.model_dump_json().decode()}'", env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}', 'OPENLLM_MODEL': llm.config['model_name'], - 'OPENLLM_SERIALIZATION': serialisation_format, + 'OPENLLM_SERIALIZATION': serialisation, 'OPENLLM_ADAPTER_MAP': f"'{orjson.dumps(adapter_map).decode()}'", 'BENTOML_DEBUG': str(True), 'BENTOML_QUIET': str(False), @@ -207,7 +207,7 @@ def create_bento(bento_tag: bentoml.Tag, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] | None = None, - serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors', + serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors', container_registry: LiteralContainerRegistry = 'ecr', container_version_strategy: LiteralContainerVersionStrategy = 'release', _bento_store: BentoStore = Provide[BentoMLContainer.bento_store], @@ -246,7 +246,7 @@ def create_bento(bento_tag: bentoml.Tag, quantize, adapter_map, dockerfile_template, - serialisation_format, + serialisation, container_registry, container_version_strategy)) diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index b1d1b5af..085e0ed1 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ -22,6 +22,7 @@ from bentoml._internal.configuration.containers import BentoMLContainer from openllm_core._typing_compat import Concatenate from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import LiteralBackend +from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import LiteralString from openllm_core._typing_compat import ParamSpec from openllm_core._typing_compat import get_literal_args @@ -131,15 +132,15 @@ Available official model_id(s): [default: {llm_config['default_id']}] model_version: str | None, workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...], - quantize: t.Literal['int8', 'int4', 'gptq'] | None, + quantize: LiteralQuantise | None, backend: LiteralBackend, - serialisation_format: t.Literal['safetensors', 'legacy'], + serialisation: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any, ) -> LLMConfig | subprocess.Popen[bytes]: - if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'): + if serialisation == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'): termui.echo( f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. 
To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg='yellow') @@ -184,11 +185,11 @@ Available official model_id(s): [default: {llm_config['default_id']}] 'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()), 'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()), 'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(), - 'OPENLLM_SERIALIZATION': serialisation_format, - env.backend: env['backend_value'] + 'OPENLLM_SERIALIZATION': serialisation, + env.backend: env['backend_value'], }) if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value']) - if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value'])) + if env['quantize_value']: start_env[env.quantize] = str(env['quantize_value']) llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model, model_id=start_env[env.model_id], @@ -196,7 +197,8 @@ Available official model_id(s): [default: {llm_config['default_id']}] llm_config=config, ensure_available=True, adapter_map=adapter_map, - serialisation=serialisation_format) + quantize=env['quantize_value'], + serialisation=serialisation) start_env.update({env.config: llm.config.model_dump_json().decode()}) server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs) @@ -262,8 +264,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/) - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml) - ''', - ), + '''), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup), cog.optgroup.option('--device', @@ -457,7 +458,7 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--serialisation', '--serialization', - 'serialisation_format', + 'serialisation', type=click.Choice(['safetensors', 'legacy']), default='safetensors', show_default=True, diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index c981f126..3812eb23 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -26,6 +26,7 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralBackend from openllm_core._typing_compat import LiteralContainerRegistry from openllm_core._typing_compat import LiteralContainerVersionStrategy + from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import LiteralString logger = logging.getLogger(__name__) @@ -37,7 +38,7 @@ def _start(model_name: str, timeout: int = 30, workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None, device: tuple[str, ...] 
| t.Literal['all'] | None = None, - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, + quantize: LiteralQuantise | None = None, adapter_map: dict[LiteralString, str | None] | None = None, backend: LiteralBackend | None = None, additional_args: list[str] | None = None, @@ -109,7 +110,7 @@ def _build(model_name: str, model_id: str | None = None, model_version: str | None = None, bento_version: str | None = None, - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, + quantize: LiteralQuantise | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, @@ -120,7 +121,7 @@ def _build(model_name: str, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, - serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors', + serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors', additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento: """Package a LLM into a Bento. @@ -160,14 +161,14 @@ def _build(model_name: str, container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR. container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR. container_version_strategy: The container version strategy. Default to the latest release of OpenLLM. - serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True` + serialisation: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True` additional_args: Additional arguments to pass to ``openllm build``. bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store. Returns: ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud. """ - args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format] + args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation] if quantize: args.extend(['--quantize', quantize]) if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.") if push: args.extend(['--push']) @@ -203,8 +204,8 @@ def _import_model(model_name: str, model_id: str | None = None, model_version: str | None = None, backend: LiteralBackend = 'pt', - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, - serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors', + quantize: LiteralQuantise | None = None, + serialisation: t.Literal['legacy', 'safetensors'] = 'safetensors', additional_args: t.Sequence[str] | None = None) -> bentoml.Model: """Import a LLM into local store. @@ -228,7 +229,7 @@ def _import_model(model_name: str, - int8: Quantize the model with 8bit (bitsandbytes required) - int4: Quantize the model with 4bit (bitsandbytes required) - gptq: Quantize the model with GPTQ (auto-gptq required) - serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. + serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. 
Default behaviour is similar to ``safe_serialization=False``. additional_args: Additional arguments to pass to ``openllm import``. @@ -236,7 +237,7 @@ def _import_model(model_name: str, ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud. """ from .entrypoint import import_command - args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format] + args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation] if model_id is not None: args.append(model_id) if model_version is not None: args.extend(['--model-version', str(model_version)]) if additional_args is not None: args.extend(additional_args) diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py index 93445c34..f3c60091 100644 --- a/openllm-python/src/openllm/cli/entrypoint.py +++ b/openllm-python/src/openllm/cli/entrypoint.py @@ -54,7 +54,6 @@ import openllm from bentoml._internal.configuration.containers import BentoMLContainer from bentoml._internal.models.model import ModelStore from openllm import bundle -from openllm import serialisation from openllm.exceptions import OpenLLMException from openllm.models.auto import CONFIG_MAPPING from openllm.models.auto import MODEL_FLAX_MAPPING_NAMES @@ -67,6 +66,7 @@ from openllm.utils import infer_auto_class from openllm_core._typing_compat import Concatenate from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import LiteralBackend +from openllm_core._typing_compat import LiteralQuantise from openllm_core._typing_compat import LiteralString from openllm_core._typing_compat import ParamSpec from openllm_core._typing_compat import Self @@ -84,7 +84,6 @@ from openllm_core.utils import first_not_none from openllm_core.utils import get_debug_mode from openllm_core.utils import get_quiet_mode from openllm_core.utils import is_torch_available -from openllm_core.utils import is_transformers_supports_agent from openllm_core.utils import resolve_user_filepath from openllm_core.utils import set_debug_mode from openllm_core.utils import set_quiet_mode @@ -343,8 +342,8 @@ def import_command( output: LiteralOutput, machine: bool, backend: LiteralBackend, - quantize: t.Literal['int8', 'int4', 'gptq'] | None, - serialisation_format: t.Literal['safetensors', 'legacy'], + quantize: LiteralQuantise | None, + serialisation: t.Literal['safetensors', 'legacy'], ) -> bentoml.Model: """Setup LLM interactively. 
@@ -369,7 +368,7 @@ def import_command( \b ```bash - $ openllm download opt facebook/opt-2.7b + $ openllm import opt facebook/opt-2.7b ``` \b @@ -400,17 +399,19 @@ def import_command( env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize) backend = first_not_none(backend, default=env['backend_value']) llm = infer_auto_class(backend).for_model( - model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format + model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, + quantize=env['quantize_value'], +serialisation=serialisation ) _previously_saved = False try: - _ref = serialisation.get(llm) + _ref = openllm.serialisation.get(llm) _previously_saved = True except openllm.exceptions.OpenLLMException: if not machine and output == 'pretty': msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..." termui.echo(msg, fg='yellow', nl=True) - _ref = serialisation.get(llm, auto_import=True) + _ref = openllm.serialisation.get(llm, auto_import=True) if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache() if machine: return _ref elif output == 'pretty': @@ -472,7 +473,7 @@ def build_command( bento_version: str | None, overwrite: bool, output: LiteralOutput, - quantize: t.Literal['int8', 'int4', 'gptq'] | None, + quantize: LiteralQuantise | None, enable_features: tuple[str, ...] | None, workers_per_resource: float | None, adapter_id: tuple[str, ...], @@ -483,7 +484,7 @@ def build_command( dockerfile_template: t.TextIO | None, containerize: bool, push: bool, - serialisation_format: t.Literal['safetensors', 'legacy'], + serialisation: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy, force_push: bool, @@ -517,12 +518,12 @@ def build_command( # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError # during build. 
This is a current limitation of bentoml build where we actually import the service.py into sys.path try: - os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']}) + os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation, env.backend: env['backend_value']}) if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value']) if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value']) llm = infer_auto_class(env['backend_value']).for_model( - model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs + model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, quantize=env['quantize_value'], serialisation=serialisation, **attrs ) labels = dict(llm.identifying_params) @@ -798,7 +799,6 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: except http.client.BadStatusLine: raise click.ClickException(f'{endpoint} is neither a HTTP server nor reachable.') from None if agent == 'hf': - if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. Upgrade with 'pip install -U transformers'") _memoized = {k: v[0] for k, v in _memoized.items() if v} client._hf_agent.set_stream(logger.info) if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta') diff --git a/openllm-python/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py index d04d7bef..d309f423 100644 --- a/openllm-python/src/openllm/models/auto/factory.py +++ b/openllm-python/src/openllm/models/auto/factory.py @@ -49,7 +49,7 @@ class BaseAutoLLMClass: ``` ''' llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs) - if ensure_available: llm.ensure_model_id_exists() + if ensure_available: llm.save_pretrained() return llm @classmethod diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index caabd0d6..b7933680 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -37,6 +37,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: from .transformers._helpers import process_config config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code) + bentomodel_fs = fs.open_fs(llm._bentomodel.path) if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME): with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile: diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index 44f36576..cb712ec5 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -14,13 +14,14 @@ import openllm from bentoml._internal.configuration.containers import BentoMLContainer from bentoml._internal.models.model import ModelOptions +from openllm_core._typing_compat import M +from openllm_core._typing_compat import T from ._helpers import check_unintialised_params from ._helpers import 
infer_autoclass_from_llm from ._helpers import infer_tokenizers_from_llm from ._helpers import make_model_signatures from ._helpers import process_config -from ._helpers import update_model from .weights import HfIgnore if t.TYPE_CHECKING: @@ -32,8 +33,6 @@ if t.TYPE_CHECKING: from bentoml._internal.models import ModelStore from openllm_core._typing_compat import DictStrAny - from openllm_core._typing_compat import M - from openllm_core._typing_compat import T else: autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq') torch = openllm.utils.LazyLoader('torch', globals(), 'torch') @@ -63,16 +62,23 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, """ config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs) _, tokenizer_attrs = llm.llm_parameters - quantize_method = llm._quantize_method - safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors') + quantize = llm._quantize + safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors') # Disable safe serialization with vLLM if llm.__llm_backend__ == 'vllm': safe_serialisation = False - metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method} + metadata: DictStrAny = {'safe_serialisation': safe_serialisation} + if quantize: metadata['_quantize'] = quantize + architectures = getattr(config, 'architectures', []) + if not architectures: raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`') + metadata['_pretrained_class'] = architectures[0] + signatures: DictStrAny = {} - if quantize_method == 'gptq': - if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") + if quantize == 'gptq': + if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq(): + raise openllm.exceptions.OpenLLMException( + "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). 
Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'" + ) if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") signatures['generate'] = {'batchable': False} @@ -82,7 +88,8 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False): attrs.pop('quantization_config') if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation - metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__ + metadata['_framework'] = llm.__llm_backend__ + signatures.update(make_model_signatures(llm)) tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -95,42 +102,22 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, options=ModelOptions(), context=openllm.utils.generate_context(framework_name='openllm'), labels=openllm.utils.generate_labels(llm), - signatures=signatures if signatures else make_model_signatures(llm)) + metadata=metadata, + signatures=signatures) with openllm.utils.analytics.set_bentoml_tracking(): try: bentomodel.enter_cloudpickle_context(external_modules, imported_modules) tokenizer.save_pretrained(bentomodel.path) - if quantize_method == 'gptq': - if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") - if llm.config['model_type'] != 'causal_lm': - raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") - logger.debug('Saving model with GPTQ quantisation will require loading model into memory.') - model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id, - *decls, - quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config), - trust_remote_code=trust_remote_code, - use_safetensors=safe_serialisation, - **hub_attrs, - **attrs) - update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework}) - model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation) + if llm._local: + # possible local path + logger.debug('Model will be loaded into memory to save to target store as it is from local path.') + model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs) + # for trust_remote_code to work + bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules) + model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation) else: - architectures = getattr(config, 'architectures', []) - if not architectures: - raise RuntimeError('Failed to determine the architecture for this model. 
Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`') - architecture = architectures[0] - update_model(bentomodel, metadata={'_pretrained_class': architecture}) - if llm._local: - # possible local path - logger.debug('Model will be loaded into memory to save to target store as it is from local path.') - model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs) - # for trust_remote_code to work - bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules) - model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation) - else: - # we will clone the all tings into the bentomodel path without loading model into memory - snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm)) + # we will clone the all tings into the bentomodel path without loading model into memory + snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm)) except Exception: raise else: @@ -165,29 +152,27 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs) - safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)), - attrs.pop('safe_serialization', None), - default=llm._serialisation_format == 'safetensors') - if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq': - if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") - if llm.config['model_type'] != 'causal_lm': - raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") - return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path, - *decls, - quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config), - trust_remote_code=llm.trust_remote_code, - use_safetensors=safe_serialization, - **hub_attrs, - **attrs) + auto_class = infer_autoclass_from_llm(llm, config) + device_map: str | None = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None) - device_map = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None) - model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path, - *decls, - config=config, - trust_remote_code=llm.trust_remote_code, - device_map=device_map, - **hub_attrs, - **attrs).eval() - if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model) + if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq': + if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq(): + raise openllm.exceptions.OpenLLMException( + "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). 
Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'" + ) + if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") + + model = auto_class.from_pretrained(llm._bentomodel.path, device_map='auto', **hub_attrs, **attrs) + # TODO: Use the below logic once TheBloke finished migration to new GPTQConfig from transformers + # from accelerate import init_empty_weights + # from optimum.gptq import load_quantized_model + # # disable exllama if gptq is loaded on CPU + # disable_exllama = not torch.cuda.is_available() + # with init_empty_weights(): + # empty = auto_class.from_pretrained(llm.model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map='auto') + # empty.tie_weights() + # model = load_quantized_model(empty, save_folder=llm._bentomodel.path, device_map='auto', disable_exllama=disable_exllama) + else: + model = auto_class.from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.trust_remote_code, device_map=device_map, **hub_attrs, **attrs).eval() + if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model) return t.cast('M', model) diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index 29fcf8af..57774913 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -5,7 +5,6 @@ import typing as t import openllm import openllm_core -from bentoml._internal.models.model import ModelInfo from bentoml._internal.models.model import ModelSignature from openllm.serialisation.constants import FRAMEWORK_TO_AUTOCLASS_MAPPING from openllm.serialisation.constants import HUB_ATTRS @@ -16,8 +15,6 @@ if t.TYPE_CHECKING: from transformers.models.auto.auto_factory import _BaseAutoModelClass - import bentoml - from bentoml._internal.models.model import ModelSignaturesType from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import M @@ -25,8 +22,6 @@ if t.TYPE_CHECKING: else: transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch') -_object_setattr = object.__setattr__ - def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]: '''A helper function that correctly parse config and attributes for transformers.PretrainedConfig. 
@@ -73,24 +68,6 @@ def check_unintialised_params(model: torch.nn.Module) -> None: if len(unintialized) > 0: raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}') -def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model: - based: DictStrAny = copy.deepcopy(bentomodel.info.metadata) - based.update(metadata) - _object_setattr( - bentomodel, - '_info', - ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged - tag=bentomodel.info.tag, - module=bentomodel.info.module, - labels=bentomodel.info.labels, - options=bentomodel.info.options.to_dict(), - signatures=bentomodel.info.signatures, - context=bentomodel.info.context, - api_version=bentomodel.info.api_version, - creation_time=bentomodel.info.creation_time, - metadata=based)) - return bentomodel - # NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType: infer_fn: tuple[str, ...] = ('__call__',) diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py index 90043c9b..b0d8cdf6 100644 --- a/openllm-python/src/openllm/testing.py +++ b/openllm-python/src/openllm/testing.py @@ -11,11 +11,12 @@ import openllm if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralBackend + from openllm_core._typing_compat import LiteralQuantise logger = logging.getLogger(__name__) @contextlib.contextmanager -def build_bento(model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]: +def build_bento(model: str, model_id: str | None = None, quantize: LiteralQuantise | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]: logger.info('Building BentoML for %s', model) bento = openllm.build(model, model_id=model_id, quantize=quantize) yield bento diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py index 75cf83e8..f8a7bf29 100644 --- a/openllm-python/src/openllm/utils/__init__.py +++ b/openllm-python/src/openllm/utils/__init__.py @@ -19,13 +19,7 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralBackend def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]: - return { - 'backend': llm.__llm_backend__, - 'framework': 'openllm', - 'model_name': llm.config['model_name'], - 'architecture': llm.config['architecture'], - 'serialisation_format': llm._serialisation_format - } + return {'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation} def infer_auto_class(backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]: import openllm diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py index 3d722661..24b29eb6 100644 --- a/openllm-python/tests/models/conftest.py +++ b/openllm-python/tests/models/conftest.py @@ -24,6 +24,7 @@ import openllm from openllm._llm import normalise_model_name from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import ListAny +from openllm_core._typing_compat import LiteralQuantise logger = logging.getLogger(__name__) @@ -141,14 +142,7 @@ class DockerHandle(_Handle): return container.status in ['running', 'created'] @contextlib.contextmanager -def _local_handle(model: str, - model_id: 
str, - image_tag: str, - deployment_mode: t.Literal['container', 'local'], - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, - *, - _serve_grpc: bool = False, - ): +def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False): with openllm.utils.reserve_free_port() as port: pass @@ -169,14 +163,7 @@ def _local_handle(model: str, proc.stderr.close() @contextlib.contextmanager -def _container_handle(model: str, - model_id: str, - image_tag: str, - deployment_mode: t.Literal['container', 'local'], - quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, - *, - _serve_grpc: bool = False, - ): +def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False): envvar = openllm.utils.EnvVarMixin(model) with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port: diff --git a/tools/dependencies.py b/tools/dependencies.py index 9c50a26a..116c6386 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -110,10 +110,11 @@ _TRANSFORMERS_EXT = ['torch', 'tokenizers', 'accelerate'] _BASE_DEPENDENCIES = [ Dependencies(name='bentoml', extensions=_BENTOML_EXT, lower_constraint=lower_bentoml_constraint), - Dependencies(name='transformers', extensions=_TRANSFORMERS_EXT, lower_constraint='4.29.0'), + Dependencies(name='transformers', extensions=_TRANSFORMERS_EXT, lower_constraint='4.32.1'), Dependencies(name='openllm-client'), Dependencies(name='safetensors'), - Dependencies(name='optimum'), + Dependencies(name='optimum', lower_constraint="1.12.0"), + Dependencies(name='accelerate'), Dependencies(name='ghapi'), Dependencies(name='tabulate', extensions=['widechars'], lower_constraint='0.9.0'), Dependencies(name='click', lower_constraint='8.1.3'), @@ -122,7 +123,7 @@ _BASE_DEPENDENCIES = [ ] _ALL_RUNTIME_DEPS = ['flax>=0.7', 'jax', 'jaxlib', 'tensorflow', 'keras'] -FINE_TUNE_DEPS = ['peft>=0.4.0', 'bitsandbytes', 'datasets', 'accelerate', 'trl'] +FINE_TUNE_DEPS = ['peft>=0.5.0', 'bitsandbytes', 'datasets', 'accelerate', 'trl'] FLAN_T5_DEPS = _ALL_RUNTIME_DEPS OPT_DEPS = _ALL_RUNTIME_DEPS GRPC_DEPS = ['openllm-client[grpc]'] @@ -130,7 +131,7 @@ OPENAI_DEPS = ['openai', 'tiktoken'] AGENTS_DEPS = ['transformers[agents]>=4.30', 'diffusers', 'soundfile'] PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat'] GGML_DEPS = ['ctransformers'] -GPTQ_DEPS = ['auto-gptq[triton]'] +GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2', 'optimum>=1.12.0'] VLLM_DEPS = ['vllm>=0.1.4', 'ray'] _base_requirements: dict[str, t.Any] = {