From 2036d4e0153f99e87014c5267195a940a06969f9 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Sat, 26 Aug 2023 09:02:52 -0400
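Subject: [PATCH] chore(build): use latest vllm pre-built kernel (#261)

Stop building the vllm, flash-attention-2 and auto-gptq CUDA kernels
from source in the OCI Dockerfile: the three builder stages and their
COPY steps are removed. "vllm>=0.1.4" and "auto-gptq[triton]" are now
installed with pip, with the cu118 AutoGPTQ wheel index added as an
extra index URL. flash-attention-2 is dropped without a pip replacement.

Require vllm>=0.1.4 for the `vllm` extra in pyproject.toml and
tools/dependencies.py, and change the EC2 build instance type from
g5.12xlarge to t3.2xlarge.
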
---
 .github/workflows/build.yml                   |  2 +-
 openllm-python/pyproject.toml                 |  4 +-
 .../src/openllm/bundle/oci/Dockerfile         | 59 ++-----------------
 tools/dependencies.py                         |  2 +-
 4 files changed, 8 insertions(+), 59 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 80a43382..e7d008f3 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -55,7 +55,7 @@ jobs:
     if: >-
       contains(needs.get_commit_message.outputs.message, '[ec2 build]') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
     env:
-      EC2_INSTANCE_TYPE: g5.12xlarge
+      EC2_INSTANCE_TYPE: t3.2xlarge
       EC2_AMI_ID: ami-089dafe9af191a0fd
       EC2_SUBNET_ID: subnet-0ca63188fe98788c1,subnet-05997205433b249d0,subnet-07ef5d3e974275fed,subnet-0161ef0151089bb0b
       EC2_SECURITY_GROUP: sg-051366641bf2b8049
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 838e9536..129f5075 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -102,7 +102,7 @@ falcon = ["einops", "xformers"]
 fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
 flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
 full = [
-    "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]",
+  "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]",
 ]
 ggml = ["ctransformers"]
 gptq = ["auto-gptq[triton]"]
@@ -113,7 +113,7 @@ openai = ["openai", "tiktoken"]
 opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
 playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
 starcoder = ["bitsandbytes"]
-vllm = ["vllm", "ray"]
+vllm = ["vllm>=0.1.4", "ray"]
 
 [tool.hatch.version]
 fallback-version = "0.0.0"
diff --git a/openllm-python/src/openllm/bundle/oci/Dockerfile b/openllm-python/src/openllm/bundle/oci/Dockerfile
index 6f0ef484..c3f4e4ce 100644
--- a/openllm-python/src/openllm/bundle/oci/Dockerfile
+++ b/openllm-python/src/openllm/bundle/oci/Dockerfile
@@ -60,50 +60,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends ninja-build &&
 RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0"  cuda==11.8 && \
     /opt/conda/bin/conda clean -ya
 
-# NOTE: Build vllm CUDA kernels
-FROM kernel-builder as vllm-builder
-
-ENV COMMIT_HASH d1744376ae9fdbfa6a2dc763e1c67309e138fa3d
-ARG COMMIT_HASH=${COMMIT_HASH}
-
-WORKDIR /usr/src
-
-RUN <<EOT
-git clone https://github.com/vllm-project/vllm.git && cd vllm
-git fetch && git checkout ${COMMIT_HASH}
-python setup.py build
-EOT
-
-# NOTE: Build flash-attention-2 CUDA kernels
-FROM kernel-builder as flash-attn-v2-builder
-
-ENV COMMIT_HASH 4c98d0b41f38ee638a979064856ae06fc1aec8b6
-ARG COMMIT_HASH=${COMMIT_HASH}
-
-WORKDIR /usr/src
-
-RUN <<EOT
-pip install packaging
-git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2 && cd flash-attention-v2
-git fetch && git checkout ${COMMIT_HASH}
-python setup.py build
-EOT
-
-# NOTE: Build auto-gptq CUDA kernels
-FROM kernel-builder as auto-gptq-builder
-
-ENV COMMIT_HASH 18326851213568df3c5bbbb1169fe51c7f7d6c60
-ARG COMMIT_HASH=${COMMIT_HASH}
-
-WORKDIR /usr/src
-
-RUN <<EOT
-pip install packaging
-git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
-git fetch && git checkout ${COMMIT_HASH}
-TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX;8.9;9.0" python setup.py build
-EOT
-
 # base image
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 as base-container
 
@@ -122,15 +78,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Copy conda with PyTorch installed
 COPY --from=pytorch-install /opt/conda /opt/conda
 
-# Copy build artefacts for vllm
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artefacts for flash-attention-v2
-COPY --from=flash-attn-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artefacts for auto-gptq
-COPY --from=auto-gptq-builder /usr/src/AutoGPTQ/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
 # Install required dependencies
 COPY openllm-python/src src
 COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml ./
@@ -145,8 +92,10 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
 
 # Install all required dependencies
 RUN --mount=type=cache,target=/root/.cache/pip \
-  pip install --extra-index-url "https://download.pytorch.org/whl/cu118" -v --no-cache-dir \
-    "ray==2.6.0" "einops" "torch>=2.0.1+cu118" xformers "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]"
+  pip install --extra-index-url "https://download.pytorch.org/whl/cu118" \
+              --extra-index-url "https://huggingface.github.io/autogptq-index/whl/cu118/" \
+              -v --no-cache-dir \
+              "ray==2.6.0" "einops" "vllm>=0.1.4" "auto-gptq[triton]" "torch>=2.0.1+cu118" xformers "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]"
 
 FROM base-container
 
diff --git a/tools/dependencies.py b/tools/dependencies.py
index 3e895f39..e86ea33c 100755
--- a/tools/dependencies.py
+++ b/tools/dependencies.py
@@ -125,7 +125,7 @@ AGENTS_DEPS = ['transformers[agents]>=4.30', 'diffusers', 'soundfile']
 PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat']
 GGML_DEPS = ['ctransformers']
 GPTQ_DEPS = ['auto-gptq[triton]']
-VLLM_DEPS = ['vllm', 'ray']
+VLLM_DEPS = ['vllm>=0.1.4', 'ray']
 
 _base_requirements: dict[str, t.Any] = {
     inflection.dasherize(name): config_cls.__openllm_requirements__ for name, config_cls in openllm.CONFIG_MAPPING.items() if config_cls.__openllm_requirements__