From 539f250c0f8637b12276d6580a1ace0e71ebe4a1 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sun, 19 Nov 2023 02:52:32 -0500 Subject: [PATCH] feat(vllm): bump to 0.2.2 (#695) * feat(vllm): bump to 0.2.2 Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: update changelog Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: move up to CUDA 12.1 Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * fix: remove auto-gptq installation since the builder image doesn't have access to GPU Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * fix: update containerization warning Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- README.md | 2 +- changelog.d/695.change.md | 1 + examples/bentofile.yaml | 2 +- openllm-python/README.md | 2 +- openllm-python/pyproject.toml | 6 +++--- openllm-python/src/openllm/_llm.py | 2 +- openllm-python/src/openllm/bundle/_package.py | 10 +--------- .../src/openllm/bundle/oci/Dockerfile | 18 +++++++++--------- .../src/openllm/bundle/oci/__init__.py | 9 +++++---- tools/dependencies.py | 4 ++-- 10 files changed, 25 insertions(+), 31 deletions(-) create mode 100644 changelog.d/695.change.md diff --git a/README.md b/README.md index a32dde26..7dd67ca4 100644 --- a/README.md +++ b/README.md @@ -1276,7 +1276,7 @@ openllm start TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq ``` > [!NOTE] -> In order to run GPTQ, make sure you run `pip install "openllm[gptq]" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/` +> In order to run GPTQ, make sure you run `pip install "openllm[gptq]"` > first to install the dependency. From the GPTQ paper, it is recommended to quantized the weights before serving. > See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization. 
diff --git a/changelog.d/695.change.md b/changelog.d/695.change.md new file mode 100644 index 00000000..af4f2d7a --- /dev/null +++ b/changelog.d/695.change.md @@ -0,0 +1 @@ +Update vLLM to 0.2.2, bringing in additional support and many improvements from upstream diff --git a/examples/bentofile.yaml b/examples/bentofile.yaml index 99331a9d..38cb0624 100644 --- a/examples/bentofile.yaml +++ b/examples/bentofile.yaml @@ -3,4 +3,4 @@ include: - 'api_server.py' python: packages: - - openllm[vllm]==0.4.12 + - openllm[vllm]>=0.4.15 diff --git a/openllm-python/README.md b/openllm-python/README.md index a32dde26..7dd67ca4 100644 --- a/openllm-python/README.md +++ b/openllm-python/README.md @@ -1276,7 +1276,7 @@ openllm start TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq ``` > [!NOTE] -> In order to run GPTQ, make sure you run `pip install "openllm[gptq]" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/` +> In order to run GPTQ, make sure you run `pip install "openllm[gptq]"` > first to install the dependency. From the GPTQ paper, it is recommended to quantized the weights before serving. > See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization. 
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index 5e531e36..2b365b24 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -113,13 +113,13 @@ full = [ "openllm[agents,awq,baichuan,chatglm,ctranslate,falcon,fine-tune,ggml,gptq,grpc,mpt,openai,playground,starcoder,vllm]", ] ggml = ["ctransformers"] -gptq = ["auto-gptq[triton]>=0.4.2", "optimum>=1.12.0"] -grpc = ["bentoml[grpc]>=1.1.9", "openllm-client[grpc]>=0.4.15"] +gptq = ["auto-gptq[triton]>=0.4.2"] +grpc = ["bentoml[grpc]>=1.1.9", "openllm-client[grpc]>=0.4.16"] mpt = ["triton"] openai = ["openai[datalib]>=1", "tiktoken"] playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"] starcoder = ["bitsandbytes"] -vllm = ["vllm>=0.2.1post1", "ray"] +vllm = ["vllm>=0.2.2"] [tool.hatch.version] fallback-version = "0.0.0" diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 4f44084c..db25774e 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -199,7 +199,7 @@ class LLM(t.Generic[M, T], ReprMixin): system_message=system_message, LLM__model_attrs=model_attrs, LLM__tokenizer_attrs=tokenizer_attrs, - llm_dtype__=torch_dtype.lower(), + llm_dtype__=dtype.lower(), llm_backend__=backend, llm_config__=llm_config, llm_trust_remote_code__=trust_remote_code, diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index f885a30c..6405b3b9 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -65,15 +65,7 @@ def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=N built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')] if all(i for i in built_wheels): wheels = [llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in built_wheels] - return PythonOptions( - packages=packages, - wheels=wheels, - 
lock_packages=True, - extra_index_url=[ - 'https://download.pytorch.org/whl/cu118', - 'https://huggingface.github.io/autogptq-index/whl/cu118/', - ], - ) + return PythonOptions(packages=packages, wheels=wheels, lock_packages=True) def construct_docker_options( diff --git a/openllm-python/src/openllm/bundle/oci/Dockerfile b/openllm-python/src/openllm/bundle/oci/Dockerfile index bb586d8f..19dfda13 100644 --- a/openllm-python/src/openllm/bundle/oci/Dockerfile +++ b/openllm-python/src/openllm/bundle/oci/Dockerfile @@ -10,13 +10,13 @@ ENV PATH /opt/conda/bin:$PATH ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - ccache \ - curl \ - libssl-dev ca-certificates make \ - git && \ - rm -rf /var/lib/apt/lists/* + build-essential \ + ca-certificates \ + ccache \ + curl \ + libssl-dev ca-certificates make \ + git && \ + rm -rf /var/lib/apt/lists/* # Install required dependencies @@ -29,8 +29,8 @@ COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml ./ # pip install autoawq --no-cache-dir && \ RUN --mount=type=cache,target=/root/.cache/pip \ pip install --extra-index-url "https://huggingface.github.io/autogptq-index/whl/cu118/" \ - -v --no-cache-dir \ - "ray==2.6.0" "einops" "vllm==0.2.1.post1" "auto-gptq[triton]" "torch==2.0.1" xformers && \ + -v --no-cache-dir \ + "ray==2.6.0" "vllm==0.2.2" xformers && \ pip install --no-cache-dir -e . 
FROM base-container diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index ddb35c4f..cb9b2b21 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -50,12 +50,13 @@ class RefResolver: else: raise ValueError(f'Unknown strategy: {strategy_or_version}') + # fmt: off @property - def tag(self): - return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version) - + def tag(self):return 'latest' if self.strategy in {'latest','nightly'} else repr(self.version) @staticmethod - def construct_base_image(reg, strategy=None): + def construct_base_image(reg,strategy=None): + if reg == 'gh': logger.warning("Setting base registry to 'gh' will affect cold start performance on GCP/AWS.") + elif reg == 'docker': logger.warning('docker is base image is yet to be supported. Falling back to "ecr".'); reg = 'ecr' return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}' diff --git a/tools/dependencies.py b/tools/dependencies.py index d44e39f6..d9bed1dc 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -156,8 +156,8 @@ PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat'] GGML_DEPS = ['ctransformers'] CTRANSLATE_DEPS = ['ctranslate2'] AWQ_DEPS = ['autoawq'] -GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2', 'optimum>=1.12.0'] -VLLM_DEPS = ['vllm>=0.2.1post1', 'ray'] +GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2'] +VLLM_DEPS = ['vllm>=0.2.2'] _base_requirements: dict[str, t.Any] = { inflection.dasherize(name): config_cls.__openllm_requirements__