Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-02-19 07:06:02 -05:00)
feat(vllm): support GPTQ with 0.2.6 (#797)
* feat(vllm): GPTQ support passthrough
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
* chore: run scripts
  Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
* fix(install): set order of xformers before vllm
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
* feat: support GPTQ with vLLM
  Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
---------
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
@@ -231,19 +231,13 @@ class LLM(t.Generic[M, T]):
 class _Quantise:
   @staticmethod
-  def pt(llm: LLM, quantise=None):
-    return quantise
-
+  def pt(llm: LLM, quantise=None): return quantise
   @staticmethod
-  def vllm(llm: LLM, quantise=None):
-    return quantise
-
+  def vllm(llm: LLM, quantise=None): return quantise
   @staticmethod
   def ctranslate(llm: LLM, quantise=None):
-    if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:
-      raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
-    if quantise == 'int8':
-      quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
+    if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}: raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
+    if quantise == 'int8': quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
     return quantise

 @apply(lambda val: tuple(str.lower(i) if i else i for i in val))
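
The hunk above collapses the `_Quantise` helpers into one-liners: `pt` and `vllm` simply pass the requested quantisation through, while `ctranslate` validates it and remaps `int8` to a CPU or GPU variant. A minimal standalone sketch of that mapping, with a hypothetical `has_gpus` flag standing in for `llm._has_gpus`:

```python
# Sketch of the ctranslate quantisation mapping shown above.
# `has_gpus` is a hypothetical stand-in for llm._has_gpus.
def ctranslate_quantise(quantise=None, has_gpus=False):
  if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:
    raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
  if quantise == 'int8':
    quantise = 'int8_float16' if has_gpus else 'int8_float32'  # pick the GPU/CPU int8 variant
  return quantise

assert ctranslate_quantise('int8', has_gpus=True) == 'int8_float16'
assert ctranslate_quantise(None) is None
```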
@@ -125,6 +125,8 @@ class vLLMRunnable(bentoml.Runnable):
     num_gpus, dev = 1, openllm.utils.device_count()
     if dev >= 2:
       num_gpus = min(dev // 2 * 2, dev)
+    quantise = llm.quantise if llm.quantise and llm.quantise in {'gptq', 'awq', 'squeezellm'} else None
+    dtype = torch.float16 if quantise == 'gptq' else llm._torch_dtype  # NOTE: quantise GPTQ doesn't support bfloat16 yet.
     try:
       self.model = vllm.AsyncLLMEngine.from_engine_args(
         vllm.AsyncEngineArgs(
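
The two added lines decide what actually reaches vLLM: only `gptq`, `awq` and `squeezellm` are forwarded, and GPTQ pins the dtype to float16 because the GPTQ kernels in vLLM 0.2.6 do not support bfloat16 yet. A condensed sketch of that selection (the helper name is hypothetical):

```python
import torch

def vllm_quantise_and_dtype(quantise, default_dtype=torch.bfloat16):
  # Only quantisation schemes vLLM understands are forwarded; anything else is dropped.
  quantise = quantise if quantise in {'gptq', 'awq', 'squeezellm'} else None
  # GPTQ kernels don't support bfloat16 yet, so fall back to float16 for GPTQ checkpoints.
  dtype = torch.float16 if quantise == 'gptq' else default_dtype
  return quantise, dtype

assert vllm_quantise_and_dtype('gptq') == ('gptq', torch.float16)
assert vllm_quantise_and_dtype('int8') == (None, torch.bfloat16)
```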
@@ -135,10 +137,10 @@ class vLLMRunnable(bentoml.Runnable):
           model=llm.bentomodel.path,
           tokenizer=llm.bentomodel.path, #
           trust_remote_code=llm.trust_remote_code,
-          dtype=llm._torch_dtype, #
+          dtype=dtype, #
           max_model_len=llm._max_model_len,
           gpu_memory_utilization=llm._gpu_memory_utilization, #
-          quantization=llm.quantise if llm.quantise and llm.quantise in {'awq', 'squeezellm'} else None,
+          quantization=quantise,
         )
       )
     except Exception as err:
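
Putting the two pieces together, the engine is now built from the derived `dtype` and `quantise` values rather than the raw attributes on the LLM instance. A minimal sketch of the resulting call, assuming a local GPTQ checkpoint at a hypothetical `MODEL_PATH` and placeholder values for the remaining arguments from the hunk:

```python
import vllm

MODEL_PATH = '/path/to/a-gptq-checkpoint'  # hypothetical local path

# GPTQ checkpoints are passed straight through to vLLM; dtype is pinned to float16 for them.
engine = vllm.AsyncLLMEngine.from_engine_args(
  vllm.AsyncEngineArgs(
    model=MODEL_PATH,
    tokenizer=MODEL_PATH,
    trust_remote_code=False,
    dtype='float16',          # GPTQ requires float16 rather than bfloat16
    quantization='gptq',      # previously only 'awq' and 'squeezellm' were forwarded
    gpu_memory_utilization=0.9,
  )
)
```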
@@ -37,7 +37,7 @@ def import_model(llm, *decls, trust_remote_code, **attrs):
   tokenizer.save_pretrained(bentomodel.path)
   if llm._quantization_config or (llm.quantise and llm.quantise not in {'squeezellm', 'awq'}):
     attrs['quantization_config'] = llm.quantization_config
-  if llm.quantise == 'gptq':
+  if llm.quantise == 'gptq' and llm.__llm_backend__ == 'pt':
     from optimum.gptq.constants import GPTQ_CONFIG

     with open(bentomodel.path_of(GPTQ_CONFIG), 'w', encoding='utf-8') as f:
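
With this change the GPTQ metadata file is only written for the PyTorch backend; `GPTQ_CONFIG` comes from `optimum` (it resolves to `quantize_config.json` in current optimum releases). The hunk does not show what `import_model` serialises, so the sketch below assumes a plain config dict:

```python
import json
import os

from optimum.gptq.constants import GPTQ_CONFIG  # 'quantize_config.json'

def save_gptq_config(model_dir, quantization_config):
  # Persist the GPTQ settings next to the exported weights so the PyTorch
  # backend can reload the checkpoint later; the exact payload written by
  # import_model is not shown in the hunk, so a plain dict is assumed here.
  with open(os.path.join(model_dir, GPTQ_CONFIG), 'w', encoding='utf-8') as f:
    json.dump(quantization_config, f, indent=2)
```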
@@ -106,7 +106,7 @@ def load_model(llm, *decls, **attrs):

   if '_quantize' in llm.bentomodel.info.metadata:
     _quantise = llm.bentomodel.info.metadata['_quantize']
-    if _quantise == 'gptq':
+    if _quantise == 'gptq' and llm.__llm_backend__ == 'pt':
       if not is_autogptq_available():
         raise OpenLLMException(
           "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
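
The load-time guard now only fires for the PyTorch backend, since the vLLM backend handles GPTQ natively. A self-contained approximation of the `is_autogptq_available()` check, assuming importlib-based detection is equivalent to OpenLLM's helper:

```python
import importlib.util

def assert_gptq_dependencies():
  # Approximates the guard above: GPTQ on the PyTorch backend needs both
  # auto-gptq and optimum importable in the current environment.
  missing = [name for name in ('auto_gptq', 'optimum') if importlib.util.find_spec(name) is None]
  if missing:
    raise RuntimeError(
      "GPTQ quantisation requires 'auto-gptq' and 'optimum'. Install them with "
      "pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/"
    )
```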