From 8d63afc9ce97ce8ae81571e7bc2716aa57932864 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Mon, 18 Dec 2023 12:41:19 -0500
Subject: [PATCH] feat(vllm): support GPTQ with 0.2.6 (#797)

* feat(vllm): GPTQ support passthrough

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: run scripts

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

* fix(install): set order of xformers before vllm

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* feat: support GPTQ with vLLM

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
---
 Dockerfile                                          |  2 +-
 openllm-python/README.md                            |  9 +++++++++
 openllm-python/pyproject.toml                       |  2 +-
 openllm-python/src/openllm/_llm.py                  | 14 ++++----------
 openllm-python/src/openllm/_runners.py              |  6 ++++--
 .../openllm/serialisation/transformers/__init__.py  |  4 ++--
 tools/dependencies.py                               |  2 +-
 7 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 83d49840..fe56d1ca 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml /openllm-py
 # below
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip3 install -v --no-cache-dir \
-    "ray==2.6.0" "vllm==0.2.2" xformers && \
+    "ray==2.6.0" "xformers==0.0.23" "vllm==0.2.6" && \
     pip3 install --no-cache-dir -e /openllm-python/
 
 COPY openllm-core/src openllm-core/src
diff --git a/openllm-python/README.md b/openllm-python/README.md
index 8b0b9725..34cf3b09 100644
--- a/openllm-python/README.md
+++ b/openllm-python/README.md
@@ -184,6 +184,15 @@ openllm start facebook/opt-2.7b
 > architecture. Use the `openllm models` command to see the complete list of supported
 > models, their architectures, and their variants.
 
+> [!IMPORTANT]
+> If you are testing OpenLLM on CPU, you might want to pass in `DTYPE=float32`. By default,
+> OpenLLM sets the model `dtype` to `bfloat16` for the best performance.
+> ```bash
+> DTYPE=float32 openllm start facebook/opt-2.7b
+> ```
+> This also applies to older GPUs. If your GPU doesn't support `bfloat16`, you may also
+> want to set `DTYPE=float16`.
+
 ## 🧩 Supported models
 
 OpenLLM currently supports the following models. By default, OpenLLM doesn't include dependencies to run all models. The extra model-specific dependencies can be installed with the instructions below.
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index f35a89ed..d48a964f 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -119,7 +119,7 @@ openai = ["openai[datalib]>=1", "tiktoken"]
 playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
 qwen = ["cpm-kernels", "tiktoken"]
 starcoder = ["bitsandbytes"]
-vllm = ["vllm>=0.2.5", "ray==2.6.0"]
+vllm = ["vllm==0.2.6", "ray==2.6.0"]
 
 [tool.hatch.version]
 fallback-version = "0.0.0"
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 03caae26..afca8ecc 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -231,19 +231,13 @@ class LLM(t.Generic[M, T]):
 
 class _Quantise:
   @staticmethod
-  def pt(llm: LLM, quantise=None):
-    return quantise
-
+  def pt(llm: LLM, quantise=None): return quantise
   @staticmethod
-  def vllm(llm: LLM, quantise=None):
-    return quantise
-
+  def vllm(llm: LLM, quantise=None): return quantise
   @staticmethod
   def ctranslate(llm: LLM, quantise=None):
-    if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:
-      raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
-    if quantise == 'int8':
-      quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
+    if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}: raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
+    if quantise == 'int8': quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
     return quantise
 
   @apply(lambda val: tuple(str.lower(i) if i else i for i in val))
diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py
index e0701692..b1a120c7 100644
--- a/openllm-python/src/openllm/_runners.py
+++ b/openllm-python/src/openllm/_runners.py
@@ -125,6 +125,8 @@ class vLLMRunnable(bentoml.Runnable):
     num_gpus, dev = 1, openllm.utils.device_count()
     if dev >= 2:
       num_gpus = min(dev // 2 * 2, dev)
+    quantise = llm.quantise if llm.quantise and llm.quantise in {'gptq', 'awq', 'squeezellm'} else None
+    dtype = torch.float16 if quantise == 'gptq' else llm._torch_dtype  # NOTE: GPTQ quantisation doesn't support bfloat16 yet.
     try:
       self.model = vllm.AsyncLLMEngine.from_engine_args(
         vllm.AsyncEngineArgs(
@@ -135,10 +137,10 @@ class vLLMRunnable(bentoml.Runnable):
           model=llm.bentomodel.path,
           tokenizer=llm.bentomodel.path,  #
           trust_remote_code=llm.trust_remote_code,
-          dtype=llm._torch_dtype,  #
+          dtype=dtype,  #
           max_model_len=llm._max_model_len,
           gpu_memory_utilization=llm._gpu_memory_utilization,  #
-          quantization=llm.quantise if llm.quantise and llm.quantise in {'awq', 'squeezellm'} else None,
+          quantization=quantise,
         )
       )
     except Exception as err:
diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py
index d07f0af3..47f03478 100644
--- a/openllm-python/src/openllm/serialisation/transformers/__init__.py
+++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py
@@ -37,7 +37,7 @@ def import_model(llm, *decls, trust_remote_code, **attrs):
   tokenizer.save_pretrained(bentomodel.path)
   if llm._quantization_config or (llm.quantise and llm.quantise not in {'squeezellm', 'awq'}):
     attrs['quantization_config'] = llm.quantization_config
-  if llm.quantise == 'gptq':
+  if llm.quantise == 'gptq' and llm.__llm_backend__ == 'pt':
     from optimum.gptq.constants import GPTQ_CONFIG
 
     with open(bentomodel.path_of(GPTQ_CONFIG), 'w', encoding='utf-8') as f:
@@ -106,7 +106,7 @@ def load_model(llm, *decls, **attrs):
 
   if '_quantize' in llm.bentomodel.info.metadata:
     _quantise = llm.bentomodel.info.metadata['_quantize']
-    if _quantise == 'gptq':
+    if _quantise == 'gptq' and llm.__llm_backend__ == 'pt':
       if not is_autogptq_available():
         raise OpenLLMException(
           "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
diff --git a/tools/dependencies.py b/tools/dependencies.py
index af5ffcf8..1a07526d 100755
--- a/tools/dependencies.py
+++ b/tools/dependencies.py
@@ -155,7 +155,7 @@ GGML_DEPS = ['ctransformers']
 CTRANSLATE_DEPS = ['ctranslate2>=3.22.0']
 AWQ_DEPS = ['autoawq']
 GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2']
-VLLM_DEPS = ['vllm>=0.2.5', 'ray==2.6.0']
+VLLM_DEPS = ['vllm==0.2.6', 'ray==2.6.0']
 
 _base_requirements: dict[str, t.Any] = {
   inflection.dasherize(name): config_cls.__openllm_requirements__
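As a rough usage sketch of what this patch enables (the model id and exact flag spelling below are illustrative assumptions, not taken from the patch), a GPTQ-quantised checkpoint could be served on the vLLM backend along these lines:

```bash
# Illustrative sketch only: the model id and flags are assumptions, not part of this patch.
# With GPTQ quantisation, the vLLM runner above falls back to float16,
# since GPTQ doesn't support bfloat16 yet.
openllm start TheBloke/Llama-2-7B-GPTQ --backend vllm --quantize gptq
```

The runner derives `quantise` and `dtype` once and passes them straight to `vllm.AsyncEngineArgs`, so no extra configuration should be needed beyond the quantisation flag.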