mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-03-05 15:46:16 -05:00)
chore(build): use latest vllm pre-built kernel (#261)
@@ -60,50 +60,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends ninja-build &&
 RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8 && \
     /opt/conda/bin/conda clean -ya
 
-# NOTE: Build vllm CUDA kernels
-FROM kernel-builder as vllm-builder
-
-ENV COMMIT_HASH d1744376ae9fdbfa6a2dc763e1c67309e138fa3d
-ARG COMMIT_HASH=${COMMIT_HASH}
-
-WORKDIR /usr/src
-
-RUN <<EOT
-git clone https://github.com/vllm-project/vllm.git && cd vllm
-git fetch && git checkout ${COMMIT_HASH}
-python setup.py build
-EOT
-
-# NOTE: Build flash-attention-2 CUDA kernels
-FROM kernel-builder as flash-attn-v2-builder
-
-ENV COMMIT_HASH 4c98d0b41f38ee638a979064856ae06fc1aec8b6
-ARG COMMIT_HASH=${COMMIT_HASH}
-
-WORKDIR /usr/src
-
-RUN <<EOT
-pip install packaging
-git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2 && cd flash-attention-v2
-git fetch && git checkout ${COMMIT_HASH}
-python setup.py build
-EOT
-
-# NOTE: Build auto-gptq CUDA kernels
-FROM kernel-builder as auto-gptq-builder
-
-ENV COMMIT_HASH 18326851213568df3c5bbbb1169fe51c7f7d6c60
-ARG COMMIT_HASH=${COMMIT_HASH}
-
-WORKDIR /usr/src
-
-RUN <<EOT
-pip install packaging
-git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
-git fetch && git checkout ${COMMIT_HASH}
-TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX;8.9;9.0" python setup.py build
-EOT
-
 # base image
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 as base-container
 
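Note: the three builder stages removed above all follow the same recipe: pin a commit, run "python setup.py build" (compile only, no install), and leave the compiled extensions under build/lib.linux-x86_64-cpython-39. That directory suffix is derived from the target interpreter; a one-liner to reproduce it, shown here purely as an illustration:

    python -c 'import sys, sysconfig; print("lib." + sysconfig.get_platform() + "-" + sys.implementation.cache_tag)'

On the image's CPython 3.9 this prints lib.linux-x86_64-cpython-39, matching the COPY paths removed in the next hunk.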
@@ -122,15 +78,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Copy conda with PyTorch installed
 COPY --from=pytorch-install /opt/conda /opt/conda
 
-# Copy build artefacts for vllm
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artefacts for flash-attention-v2
-COPY --from=flash-attn-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artefacts for auto-gptq
-COPY --from=auto-gptq-builder /usr/src/AutoGPTQ/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
 # Install required dependencies
 COPY openllm-python/src src
 COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml ./
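Note: these removed COPY --from lines dropped the compiled extensions straight into the conda environment's site-packages, which only works while the runtime interpreter stays at CPython 3.9. A quick sanity check one could run against an image built before this commit (the image name is a placeholder, not from the repo):

    docker run --rm --gpus all <pre-change-image> \
        python -c "import vllm, flash_attn, auto_gptq; print('kernel modules importable')"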
@@ -145,8 +92,10 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
 
 # Install all required dependencies
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install --extra-index-url "https://download.pytorch.org/whl/cu118" -v --no-cache-dir \
-    "ray==2.6.0" "einops" "torch>=2.0.1+cu118" xformers "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]"
+    pip install --extra-index-url "https://download.pytorch.org/whl/cu118" \
+    --extra-index-url "https://huggingface.github.io/autogptq-index/whl/cu118/" \
+    -v --no-cache-dir \
+    "ray==2.6.0" "einops" "vllm>=0.1.4" "auto-gptq[triton]" "torch>=2.0.1+cu118" xformers "jax[cuda11_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ".[opt,mpt,fine-tune,llama,chatglm]"
 
 FROM base-container
 
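Note: with this change the source-built kernel stages are gone entirely; vllm ships pre-built CUDA kernels in its PyPI wheel, and auto-gptq comes from the Hugging Face wheel index added as a second extra-index-url. The RUN --mount=type=cache line still requires BuildKit. A build-and-smoke-test sketch (the image tag here is a placeholder):

    DOCKER_BUILDKIT=1 docker build -t openllm-runtime .
    docker run --rm --gpus all openllm-runtime \
        python -c "import torch, vllm, auto_gptq; assert torch.cuda.is_available(); print(vllm.__version__)"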