From 539f250c0f8637b12276d6580a1ace0e71ebe4a1 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Sun, 19 Nov 2023 02:52:32 -0500
Subject: [PATCH] feat(vllm): bump to 0.2.2 (#695)

* feat(vllm): bump to 0.2.2

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update changelog

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: move up to CUDA 12.1

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: remove auto-gptq installation

since the builder image doesn't have access to GPU

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: update containerization warning

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 README.md                                      |  2 +-
 changelog.d/695.change.md                      |  1 +
 examples/bentofile.yaml                        |  2 +-
 openllm-python/README.md                       |  2 +-
 openllm-python/pyproject.toml                  |  6 +++---
 openllm-python/src/openllm/_llm.py             |  2 +-
 openllm-python/src/openllm/bundle/_package.py  | 10 +---------
 .../src/openllm/bundle/oci/Dockerfile          | 18 +++++++++---------
 .../src/openllm/bundle/oci/__init__.py         |  9 +++++----
 tools/dependencies.py                          |  4 ++--
 10 files changed, 25 insertions(+), 31 deletions(-)
 create mode 100644 changelog.d/695.change.md

diff --git a/README.md b/README.md
index a32dde26..7dd67ca4 100644
--- a/README.md
+++ b/README.md
@@ -1276,7 +1276,7 @@ openllm start TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq
 ```
 
 > [!NOTE]
-> In order to run GPTQ, make sure you run `pip install "openllm[gptq]" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/`
+> In order to run GPTQ, make sure you run `pip install "openllm[gptq]"`
 > first to install the dependency. From the GPTQ paper, it is recommended to quantized the weights before serving.
 > See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization.
 
diff --git a/changelog.d/695.change.md b/changelog.d/695.change.md
new file mode 100644
index 00000000..af4f2d7a
--- /dev/null
+++ b/changelog.d/695.change.md
@@ -0,0 +1 @@
+Update vLLM to 0.2.2, which brings additional support and many improvements from upstream
diff --git a/examples/bentofile.yaml b/examples/bentofile.yaml
index 99331a9d..38cb0624 100644
--- a/examples/bentofile.yaml
+++ b/examples/bentofile.yaml
@@ -3,4 +3,4 @@ include:
   - 'api_server.py'
 python:
   packages:
-    - openllm[vllm]==0.4.12
+    - openllm[vllm]>=0.4.15
diff --git a/openllm-python/README.md b/openllm-python/README.md
index a32dde26..7dd67ca4 100644
--- a/openllm-python/README.md
+++ b/openllm-python/README.md
@@ -1276,7 +1276,7 @@ openllm start TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq
 ```
 
 > [!NOTE]
-> In order to run GPTQ, make sure you run `pip install "openllm[gptq]" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/`
+> In order to run GPTQ, make sure you run `pip install "openllm[gptq]"`
 > first to install the dependency. From the GPTQ paper, it is recommended to quantized the weights before serving.
 > See [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) for more information on GPTQ quantization.
 
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 5e531e36..2b365b24 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -113,13 +113,13 @@ full = [
     "openllm[agents,awq,baichuan,chatglm,ctranslate,falcon,fine-tune,ggml,gptq,grpc,mpt,openai,playground,starcoder,vllm]",
 ]
 ggml = ["ctransformers"]
-gptq = ["auto-gptq[triton]>=0.4.2", "optimum>=1.12.0"]
-grpc = ["bentoml[grpc]>=1.1.9", "openllm-client[grpc]>=0.4.15"]
+gptq = ["auto-gptq[triton]>=0.4.2"]
+grpc = ["bentoml[grpc]>=1.1.9", "openllm-client[grpc]>=0.4.16"]
 mpt = ["triton"]
 openai = ["openai[datalib]>=1", "tiktoken"]
 playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
 starcoder = ["bitsandbytes"]
-vllm = ["vllm>=0.2.1post1", "ray"]
+vllm = ["vllm>=0.2.2"]
 
 [tool.hatch.version]
 fallback-version = "0.0.0"
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 4f44084c..db25774e 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -199,7 +199,7 @@ class LLM(t.Generic[M, T], ReprMixin):
       system_message=system_message,
       LLM__model_attrs=model_attrs,
       LLM__tokenizer_attrs=tokenizer_attrs,
-      llm_dtype__=torch_dtype.lower(),
+      llm_dtype__=dtype.lower(),
       llm_backend__=backend,
       llm_config__=llm_config,
       llm_trust_remote_code__=trust_remote_code,
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index f885a30c..6405b3b9 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -65,15 +65,7 @@ def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=N
   built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')]
   if all(i for i in built_wheels):
     wheels = [llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in built_wheels]
-  return PythonOptions(
-    packages=packages,
-    wheels=wheels,
-    lock_packages=True,
-    extra_index_url=[
-      'https://download.pytorch.org/whl/cu118',
-      'https://huggingface.github.io/autogptq-index/whl/cu118/',
-    ],
-  )
+  return PythonOptions(packages=packages, wheels=wheels, lock_packages=True)
 
 
 def construct_docker_options(
diff --git a/openllm-python/src/openllm/bundle/oci/Dockerfile b/openllm-python/src/openllm/bundle/oci/Dockerfile
index bb586d8f..19dfda13 100644
--- a/openllm-python/src/openllm/bundle/oci/Dockerfile
+++ b/openllm-python/src/openllm/bundle/oci/Dockerfile
@@ -10,13 +10,13 @@ ENV PATH /opt/conda/bin:$PATH
 ENV DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-        build-essential \
-        ca-certificates \
-        ccache \
-        curl \
-        libssl-dev ca-certificates make \
-        git && \
-        rm -rf /var/lib/apt/lists/*
+  build-essential \
+  ca-certificates \
+  ccache \
+  curl \
+  libssl-dev ca-certificates make \
+  git && \
+  rm -rf /var/lib/apt/lists/*
 
 
 # Install required dependencies
@@ -29,8 +29,8 @@ COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml ./
 # pip install autoawq --no-cache-dir && \
 RUN --mount=type=cache,target=/root/.cache/pip \
   pip install --extra-index-url "https://huggingface.github.io/autogptq-index/whl/cu118/" \
-              -v --no-cache-dir \
-              "ray==2.6.0" "einops" "vllm==0.2.1.post1" "auto-gptq[triton]" "torch==2.0.1" xformers && \
+  -v --no-cache-dir \
+  "ray==2.6.0" "vllm==0.2.2" xformers && \
   pip install --no-cache-dir -e .
 
 FROM base-container
diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py
index ddb35c4f..cb9b2b21 100644
--- a/openllm-python/src/openllm/bundle/oci/__init__.py
+++ b/openllm-python/src/openllm/bundle/oci/__init__.py
@@ -50,12 +50,13 @@ class RefResolver:
     else:
       raise ValueError(f'Unknown strategy: {strategy_or_version}')
 
+  # fmt: off
   @property
-  def tag(self):
-    return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)
-
+  def tag(self):return 'latest' if self.strategy in {'latest','nightly'} else repr(self.version)
   @staticmethod
-  def construct_base_image(reg, strategy=None):
+  def construct_base_image(reg,strategy=None):
+    if reg == 'gh': logger.warning("Setting base registry to 'gh' will affect cold start performance on GCP/AWS.")
+    elif reg == 'docker': logger.warning('docker base image is yet to be supported. Falling back to "ecr".'); reg = 'ecr'
     return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}'
 
 
diff --git a/tools/dependencies.py b/tools/dependencies.py
index d44e39f6..d9bed1dc 100755
--- a/tools/dependencies.py
+++ b/tools/dependencies.py
@@ -156,8 +156,8 @@ PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat']
 GGML_DEPS = ['ctransformers']
 CTRANSLATE_DEPS = ['ctranslate2']
 AWQ_DEPS = ['autoawq']
-GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2', 'optimum>=1.12.0']
-VLLM_DEPS = ['vllm>=0.2.1post1', 'ray']
+GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2']
+VLLM_DEPS = ['vllm>=0.2.2']
 
 _base_requirements: dict[str, t.Any] = {
   inflection.dasherize(name): config_cls.__openllm_requirements__