mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-06-12 02:20:32 -04:00
fix(gptq): use upstream integration (#297)
* wip Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> * feat: GPTQ transformers integration Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> * fix: only load if variable is available and add changelog Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> * chore: remove boilerplate check Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -3,47 +3,68 @@ from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import torch
|
||||
import transformers
|
||||
|
||||
from openllm_core._typing_compat import LiteralQuantise
|
||||
from openllm_core._typing_compat import overload
|
||||
from openllm_core.utils import LazyLoader
|
||||
from openllm_core.utils import is_autogptq_available
|
||||
from openllm_core.utils import is_bitsandbytes_available
|
||||
from openllm_core.utils import is_transformers_supports_kbit
|
||||
from openllm_core.utils import pkg
|
||||
from openllm_core.utils import is_optimum_supports_gptq
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
|
||||
from ._llm import LLM
|
||||
|
||||
autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
QuantiseMode = t.Literal['int8', 'int4', 'gptq']
|
||||
|
||||
@overload
|
||||
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
|
||||
...
|
||||
|
||||
@overload
|
||||
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
|
||||
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[transformers.GPTQConfig, DictStrAny]:
|
||||
...
|
||||
|
||||
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
|
||||
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]:
|
||||
# 8 bit configuration
|
||||
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
|
||||
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
|
||||
int8_skip_modules: list[str] | None = attrs.pop('llm_int8_skip_modules', None)
|
||||
int8_has_fp16_weight = attrs.pop('llm_int8_has_fp16_weight', False)
|
||||
|
||||
autogptq_attrs: DictStrAny = {
|
||||
'bits': attrs.pop('gptq_bits', 4),
|
||||
'group_size': attrs.pop('gptq_group_size', -1),
|
||||
'damp_percent': attrs.pop('gptq_damp_percent', 0.01),
|
||||
'desc_act': attrs.pop('gptq_desc_act', True),
|
||||
'sym': attrs.pop('gptq_sym', True),
|
||||
'true_sequential': attrs.pop('gptq_true_sequential', True),
|
||||
}
|
||||
def create_gptq_config() -> transformers.GPTQConfig:
  """Build a ``transformers.GPTQConfig`` from the enclosing function's ``attrs``.

  Each recognised GPTQ option is popped from ``attrs`` (so it is no longer part of the
  attributes handed back to the caller alongside the config) and falls back to the
  default in the table below when the key is absent.

  Returns:
    A ``transformers.GPTQConfig`` constructed from the popped values.
  """
  # NOTE(review): these keys are unprefixed ('bits', 'batch_size', 'tokenizer', ...), so they
  # share a namespace with anything else callers place in ``attrs`` -- confirm this is intended.
  #
  # One table drives both the pop and the keyword passed to GPTQConfig, so the two can never
  # drift apart. Dict order is insertion order, so keys are popped in the order listed.
  gptq_defaults = {
      'bits': 4,
      'tokenizer': None,
      'dataset': 'c4',
      'group_size': 128,
      'damp_percent': 0.1,
      'desc_act': False,
      'sym': True,
      'true_sequential': True,
      # is_available() already yields a bool; evaluated eagerly, as a pop() default always is.
      'use_cuda_fp16': torch.cuda.is_available(),
      'model_seqlen': None,
      'block_name_to_quantize': None,
      'module_name_preceding_first_block': None,
      'batch_size': 1,
      'pad_token_id': None,
      'disable_exllama': False,
  }
  gptq_kwargs = {key: attrs.pop(key, fallback) for key, fallback in gptq_defaults.items()}
  return transformers.GPTQConfig(**gptq_kwargs)
|
||||
|
||||
def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
|
||||
if int8_skip_modules is None: int8_skip_modules = []
|
||||
@@ -69,24 +90,18 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
|
||||
raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
|
||||
if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules)
|
||||
elif quantise == 'int4':
|
||||
if is_transformers_supports_kbit():
|
||||
quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=int4_compute_dtype,
|
||||
bnb_4bit_quant_type=int4_quant_type,
|
||||
bnb_4bit_use_double_quant=int4_use_double_quant)
|
||||
else:
|
||||
logger.warning(
|
||||
"'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.",
|
||||
pkg.pkg_version_info('transformers'))
|
||||
quantisation_config = create_int8_config(int8_skip_modules)
|
||||
quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=int4_compute_dtype,
|
||||
bnb_4bit_quant_type=int4_quant_type,
|
||||
bnb_4bit_use_double_quant=int4_use_double_quant)
|
||||
elif quantise == 'gptq':
|
||||
if not is_autogptq_available():
|
||||
if not is_autogptq_available() or not is_optimum_supports_gptq():
|
||||
logger.warning(
|
||||
"'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
|
||||
"'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
|
||||
)
|
||||
quantisation_config = create_int8_config(int8_skip_modules)
|
||||
else:
|
||||
quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
|
||||
quantisation_config = create_gptq_config()
|
||||
else:
|
||||
raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")
|
||||
return quantisation_config, attrs
|
||||
|
||||
Reference in New Issue
Block a user