feat(vllm): support GPTQ with 0.2.6 (#797)

* feat(vllm): GPTQ support passthrough

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: run scripts

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

* fix(install): set order of xformers before vllm

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* feat: support GPTQ with vLLM

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-12-18 12:41:19 -05:00
committed by GitHub
parent 5d27337e82
commit 8d63afc9ce
7 changed files with 22 additions and 17 deletions

View File

@@ -184,6 +184,15 @@ openllm start facebook/opt-2.7b
> architecture. Use the `openllm models` command to see the complete list of supported
> models, their architectures, and their variants.
> [!IMPORTANT]
> If you are testing openllm on CPU, you might want to pass in `DTYPE=float32`. By default,
> OpenLLM will set model `dtype` to `bfloat16` for the best performance.
> ```bash
> DTYPE=float32 openllm start facebook/opt-2.7b
> ```
> This also applies to older GPUs. If your GPU doesn't support `bfloat16`, then you also
> want to set `DTYPE=float16`.
## 🧩 Supported models
OpenLLM currently supports the following models. By default, OpenLLM doesn't include dependencies to run all models. The extra model-specific dependencies can be installed with the instructions below.

View File

@@ -119,7 +119,7 @@ openai = ["openai[datalib]>=1", "tiktoken"]
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
qwen = ["cpm-kernels", "tiktoken"]
starcoder = ["bitsandbytes"]
vllm = ["vllm>=0.2.5", "ray==2.6.0"]
vllm = ["vllm==0.2.6", "ray==2.6.0"]
[tool.hatch.version]
fallback-version = "0.0.0"

View File

@@ -231,19 +231,13 @@ class LLM(t.Generic[M, T]):
class _Quantise:
@staticmethod
def pt(llm: LLM, quantise=None):
return quantise
def pt(llm: LLM, quantise=None): return quantise
@staticmethod
def vllm(llm: LLM, quantise=None):
return quantise
def vllm(llm: LLM, quantise=None): return quantise
@staticmethod
def ctranslate(llm: LLM, quantise=None):
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:
raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
if quantise == 'int8':
quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}: raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
if quantise == 'int8': quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
return quantise
@apply(lambda val: tuple(str.lower(i) if i else i for i in val))

View File

@@ -125,6 +125,8 @@ class vLLMRunnable(bentoml.Runnable):
num_gpus, dev = 1, openllm.utils.device_count()
if dev >= 2:
num_gpus = min(dev // 2 * 2, dev)
quantise = llm.quantise if llm.quantise and llm.quantise in {'gptq', 'awq', 'squeezellm'} else None
dtype = torch.float16 if quantise == 'gptq' else llm._torch_dtype # NOTE: quantise GPTQ doesn't support bfloat16 yet.
try:
self.model = vllm.AsyncLLMEngine.from_engine_args(
vllm.AsyncEngineArgs(
@@ -135,10 +137,10 @@ class vLLMRunnable(bentoml.Runnable):
model=llm.bentomodel.path,
tokenizer=llm.bentomodel.path, #
trust_remote_code=llm.trust_remote_code,
dtype=llm._torch_dtype, #
dtype=dtype, #
max_model_len=llm._max_model_len,
gpu_memory_utilization=llm._gpu_memory_utilization, #
quantization=llm.quantise if llm.quantise and llm.quantise in {'awq', 'squeezellm'} else None,
quantization=quantise,
)
)
except Exception as err:

View File

@@ -37,7 +37,7 @@ def import_model(llm, *decls, trust_remote_code, **attrs):
tokenizer.save_pretrained(bentomodel.path)
if llm._quantization_config or (llm.quantise and llm.quantise not in {'squeezellm', 'awq'}):
attrs['quantization_config'] = llm.quantization_config
if llm.quantise == 'gptq':
if llm.quantise == 'gptq' and llm.__llm_backend__ == 'pt':
from optimum.gptq.constants import GPTQ_CONFIG
with open(bentomodel.path_of(GPTQ_CONFIG), 'w', encoding='utf-8') as f:
@@ -106,7 +106,7 @@ def load_model(llm, *decls, **attrs):
if '_quantize' in llm.bentomodel.info.metadata:
_quantise = llm.bentomodel.info.metadata['_quantize']
if _quantise == 'gptq':
if _quantise == 'gptq' and llm.__llm_backend__ == 'pt':
if not is_autogptq_available():
raise OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"