mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-03-10 11:10:27 -04:00
perf: unify LLM interface (#518)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
@@ -8,6 +8,8 @@ import transformers
 from openllm_core._typing_compat import LiteralQuantise
+from openllm_core._typing_compat import overload
 from openllm_core.exceptions import MissingDependencyError
+from openllm_core.utils import is_autoawq_available
 from openllm_core.utils import is_autogptq_available
 from openllm_core.utils import is_bitsandbytes_available
 from openllm_core.utils import is_optimum_supports_gptq
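The two added imports pull in the typing `overload` re-export and the `is_autoawq_available` probe used by the new AWQ branch below. As a rough, hypothetical sketch of how an `is_*_available` helper of this kind is commonly written (not necessarily OpenLLM's actual implementation):

import importlib.util

def is_autoawq_available() -> bool:
  # Hypothetical sketch: probe for the 'awq' module (the import name of the
  # autoawq package) without importing it. OpenLLM's real helper may differ.
  return importlib.util.find_spec('awq') is not None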
@@ -20,25 +22,36 @@ if t.TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 @overload
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
+def infer_quantisation_config(self: LLM[t.Any, t.Any], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
   ...
 
 @overload
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[transformers.GPTQConfig, DictStrAny]:
+def infer_quantisation_config(self: LLM[t.Any, t.Any], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[transformers.GPTQConfig, DictStrAny]:
   ...
 
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]:
+@overload
+def infer_quantisation_config(self: LLM[t.Any, t.Any], quantise: t.Literal['awq'], **attrs: t.Any) -> tuple[transformers.AwqConfig, DictStrAny]:
+  ...
+
+def infer_quantisation_config(self: LLM[t.Any, t.Any], quantise: LiteralQuantise,
+                              **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig, DictStrAny]:
   # 8 bit configuration
   int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
   int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
   int8_skip_modules: list[str] | None = attrs.pop('llm_int8_skip_modules', None)
   int8_has_fp16_weight = attrs.pop('llm_int8_has_fp16_weight', False)
 
+  # shared arguments for gptq and awq
+  bits = attrs.pop('bits', 4)
+  group_size = attrs.pop('group_size', 128)
+
+  def create_awq_config() -> transformers.AwqConfig:
+    zero_point = attrs.pop('zero_point', True)
+    return transformers.AwqConfig(bits=bits, group_size=group_size, zero_point=zero_point)
+
   def create_gptq_config() -> transformers.GPTQConfig:
-    gptq_bits = attrs.pop('bits', 4)
-    gptq_tokenizer = attrs.pop('tokenizer', None)
+    gptq_tokenizer = attrs.pop('tokenizer', self.model_id)
     gptq_dataset = attrs.pop('dataset', 'c4')
-    gptq_group_size = attrs.pop('group_size', 128)
     gptq_damp_percent = attrs.pop('damp_percent', 0.1)
     gptq_desc_act = attrs.pop('desc_act', False)
     gptq_sym = attrs.pop('sym', True)
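The `@overload` stubs only affect static type checking: a caller passing `quantise='gptq'` sees `transformers.GPTQConfig` as the return type, `'awq'` maps to `AwqConfig`, and `'int8'`/`'int4'` to `BitsAndBytesConfig`, while the single implementation handles every case at runtime. A minimal, self-contained sketch of the same pattern, with hypothetical names rather than OpenLLM API:

from __future__ import annotations
import typing as t

@t.overload
def make_config(kind: t.Literal['int8']) -> int: ...
@t.overload
def make_config(kind: t.Literal['gptq']) -> str: ...
def make_config(kind: str) -> int | str:
  # The @overload stubs above are erased at runtime; only this body executes.
  # A static type checker narrows the return type from the literal argument.
  return 8 if kind == 'int8' else 'gptq'

print(make_config('int8'))  # 8
print(make_config('gptq'))  # gptq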
@@ -50,10 +63,10 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]:
     gptq_batch_size = attrs.pop('batch_size', 1)
     gptq_pad_token_id = attrs.pop('pad_token_id', None)
     gptq_disable_exllama = attrs.pop('disable_exllama', False)
-    return transformers.GPTQConfig(bits=gptq_bits,
+    return transformers.GPTQConfig(bits=bits,
                                    tokenizer=gptq_tokenizer,
                                    dataset=gptq_dataset,
-                                   group_size=gptq_group_size,
+                                   group_size=group_size,
                                    damp_percent=gptq_damp_percent,
                                    desc_act=gptq_desc_act,
                                    sym=gptq_sym,
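For context, a `transformers.GPTQConfig` like the one returned here is the object transformers expects as `quantization_config` when a model should be GPTQ-quantised while it loads. A hedged, standalone example using plain transformers (placeholder model id, not OpenLLM code):

import transformers

model_id = 'facebook/opt-125m'  # placeholder; any causal-LM checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Same defaults this helper pops from **attrs: 4 bits, group_size=128,
# the 'c4' calibration dataset and damp_percent=0.1.
gptq_config = transformers.GPTQConfig(bits=4, group_size=128, dataset='c4', damp_percent=0.1, tokenizer=tokenizer)

# Quantisation happens while the weights load; this needs auto-gptq, optimum>=0.12
# and a transformers release with GPTQ support (roughly 4.32+), plus accelerate for device_map.
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, quantization_config=gptq_config, device_map='auto')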
@@ -67,25 +80,22 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]:
                                    disable_exllama=gptq_disable_exllama)
 
   def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
-    if int8_skip_modules is None: int8_skip_modules = []
-    if 'lm_head' not in int8_skip_modules and cls.config_class.__openllm_model_type__ == 'causal_lm':
-      logger.debug("Skipping 'lm_head' for quantization for %s", cls.__name__)
-      int8_skip_modules.append('lm_head')
+    # if int8_skip_modules is None: int8_skip_modules = []
+    # if 'lm_head' not in int8_skip_modules and self.config_class.__openllm_model_type__ == 'causal_lm':
+    #   logger.debug("Skipping 'lm_head' for quantization for %s", self.__name__)
+    #   int8_skip_modules.append('lm_head')
     return transformers.BitsAndBytesConfig(load_in_8bit=True,
                                            llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload,
                                            llm_int8_threshhold=int8_threshold,
                                            llm_int8_skip_modules=int8_skip_modules,
-                                           llm_int8_has_fp16_weight=int8_has_fp16_weight,
-                                           )
+                                           llm_int8_has_fp16_weight=int8_has_fp16_weight)
 
   # 4 bit configuration
   int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
   int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4')
   int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True)
 
-  # NOTE: Quantization setup
-  # quantize is a openllm.LLM feature, where we can quantize the model
-  # with bitsandbytes or quantization aware training.
+  # NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training.
   if not is_bitsandbytes_available():
     raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
   if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules)
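Both bitsandbytes paths reduce to a plain `transformers.BitsAndBytesConfig`. With the defaults popped from `**attrs` in this file, the two configurations look roughly as follows (an illustration, not OpenLLM's exact call; note the transformers keyword is spelled `llm_int8_threshold`, with a single 'h'):

import torch
import transformers

# int8: roughly what create_int8_config() builds with its defaults.
int8_config = transformers.BitsAndBytesConfig(load_in_8bit=True,
                                              llm_int8_threshold=6.0,
                                              llm_int8_enable_fp32_cpu_offload=False,
                                              llm_int8_has_fp16_weight=False)

# int4: the bnb_4bit_* defaults popped above (nf4, bfloat16 compute, double quantisation).
int4_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
                                              bnb_4bit_quant_type='nf4',
                                              bnb_4bit_compute_dtype=torch.bfloat16,
                                              bnb_4bit_use_double_quant=True)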
@@ -96,12 +106,15 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]:
                                                           bnb_4bit_use_double_quant=int4_use_double_quant)
   elif quantise == 'gptq':
     if not is_autogptq_available() or not is_optimum_supports_gptq():
-      logger.warning(
-          "'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
-      )
-      quantisation_config = create_int8_config(int8_skip_modules)
+      raise MissingDependencyError(
+          "'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[gptq]\"'")
     else:
       quantisation_config = create_gptq_config()
+  elif quantise == 'awq':
+    if not is_autoawq_available():
+      raise MissingDependencyError("quantize='awq' requires 'auto-awq' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[awq]\"'.")
+    else:
+      quantisation_config = create_awq_config()
   else:
-    raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")
+    raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq', 'awq'], got {quantise} instead.")
   return quantisation_config, attrs
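The `(quantisation_config, attrs)` return value splits the keyword arguments into what the quantisation config consumed and what is left for the caller to forward (for example on to `from_pretrained`). A tiny hypothetical sketch of that contract, not taken from OpenLLM's actual helpers:

from __future__ import annotations
import typing as t

def split_config_kwargs(**attrs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
  # Keys the helper recognises are popped into the config; everything else is
  # handed back to the caller so it can flow through untouched.
  config = {'bits': attrs.pop('bits', 4), 'group_size': attrs.pop('group_size', 128)}
  return config, attrs

config, leftover = split_config_kwargs(bits=8, trust_remote_code=True)
print(config)    # {'bits': 8, 'group_size': 128}
print(leftover)  # {'trust_remote_code': True}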