-## Global Args #################################################################
-ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
-ARG PYTHON_VERSION=3.12
 
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
+ARG BASE_UBI_IMAGE_TAG
+ARG PYTHON_VERSION
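+# (no in-file defaults any more: supply both at build time, e.g.
+#  docker build --build-arg BASE_UBI_IMAGE_TAG=<tag> --build-arg PYTHON_VERSION=3.12 .)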
 
 ## Base Layer ##################################################################
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
 ARG PYTHON_VERSION
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf -y update && microdnf install -y --nodocs \
@@ -19,25 +16,28 @@ ENV LANG=C.UTF-8 \
     LC_ALL=C.UTF-8
 
 # Some utils for dev purposes - tar required for kubectl cp
+
 RUN microdnf install -y --nodocs \
-    which procps findutils tar vim git\
+    which procps findutils tar vim git \
     && microdnf clean all
 
 
 ## Python Installer ############################################################
-FROM base as python-install
+FROM base AS python-install
 ARG PYTHON_VERSION
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf install -y --nodocs \
     python${PYTHON_VERSION}-devel && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
+    pip install --no-cache -U pip wheel uv && \
+    microdnf clean all
 
 
 ## CUDA Base ###################################################################
-FROM python-install as cuda-base
+FROM python-install AS cuda-base
 
 RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
     https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,88 +51,30 @@ RUN microdnf install -y --nodocs \
     ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/
 
 
-
 ## Python cuda base #################################################################
 FROM cuda-base AS python-cuda-base
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 # install cuda and common dependencies
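+# (the pip cache mount is gone: uv keeps its own wheel cache under /root/.cache/uv)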
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     uv pip install \
         -r requirements-cuda.txt
 
 
-## Development #################################################################
-FROM python-cuda-base AS dev
-
-# install build and runtime dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
-    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
-    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
-    uv pip install \
-        -r requirements-cuda.txt \
-        -r requirements-dev.txt
-
-## Builder #####################################################################
-FROM dev AS build
-
-# install build dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
-    uv pip install -r requirements-build.txt
-
-# install compiler cache to speed up compilation leveraging local or remote caching
-# git is required for the cutlass kernels
-RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all
-
-COPY . .
-
-ARG TORCH_CUDA_ARCH_LIST
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-ARG vllm_fa_cmake_gpu_arches
-ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=8
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-# Make sure the cuda environment is in the PATH
-ENV PATH=/usr/local/cuda/bin:$PATH
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=.git,target=/workspace/.git \
-    env CFLAGS="-march=haswell" \
-        CXXFLAGS="$CFLAGS $CXXFLAGS" \
-        CMAKE_BUILD_TYPE=Release \
-        python3 setup.py bdist_wheel --dist-dir=dist
 
 #################### libsodium Build IMAGE ####################
-FROM base as libsodium-builder
+FROM base AS libsodium-builder
 
 RUN microdnf install -y --nodocs gcc gzip \
     && microdnf clean all
 
 WORKDIR /usr/src/libsodium
 
-ARG LIBSODIUM_VERSION=1.0.20
+ARG LIBSODIUM_VERSION
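+# (no default: the libsodium version must now come in via --build-arg)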
 RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
     && tar -xzvf libsodium*.tar.gz \
     && rm -f libsodium*.tar.gz \
@@ -156,25 +98,32 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
 ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
 
 # Triton needs a CC compiler
+
 RUN microdnf install -y --nodocs gcc \
     rsync \
     && microdnf clean all
 
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
-    cd /usr/src/libsodium \
-    && make install
+    make -C /usr/src/libsodium install
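+# (libsodium is built in the libsodium-builder stage; only `make install` runs here,
+#  against the bind-mounted source tree)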
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install \
-        "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"
+COPY LICENSE /licenses/vllm.md
+COPY examples/*.jinja /app/data/template/
+
+# install vllm by running the payload script and then install flashinfer
+
+ARG VLLM_WHEEL_VERSION
+ARG VLLM_WHEEL_INDEX
+ARG FLASHINFER_VERSION
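+# The BuildKit secret is exposed under /run/secrets/ for this RUN only and
+# never ends up in an image layer. FLASHINFER_VERSION is expected to be a
+# full pip requirement spec (name==version or a wheel URL), not a bare version.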
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=payload,target=/workspace/payload \
+    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
+    env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
+        VLLM_WHEEL_VERSION=${VLLM_WHEEL_VERSION} \
+        VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
+        ./payload/run.sh && \
+    uv pip install "${FLASHINFER_VERSION}"
 
 ENV HF_HUB_OFFLINE=1 \
     HOME=/home/vllm \
@@ -199,25 +148,32 @@ ENV HF_HUB_OFFLINE=1 \
 RUN umask 002 && \
     useradd --uid 2000 --gid 0 vllm && \
     mkdir -p /home/vllm && \
-    chmod g+rwx /home/vllm /usr/src /workspace
-
-COPY LICENSE /licenses/vllm.md
-COPY examples/*.jinja /app/data/template/
+    chmod g+rwx /home/vllm
 
 USER 2000
 WORKDIR /home/vllm
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 
 
-FROM vllm-openai as vllm-grpc-adapter
+## TGIS Adapter layer #####################################################################
+FROM vllm-openai AS vllm-grpc-adapter
 
 USER root
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3
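+# ARGs do not carry across stage boundaries, so the wheel version and index
+# are redeclared here before re-running the payload with the adapter pinned.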
+ARG VLLM_TGIS_ADAPTER_VERSION
+ARG VLLM_WHEEL_VERSION
+ARG VLLM_WHEEL_INDEX
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=payload,target=/workspace/payload \
+    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
+    cd /workspace && \
+    env HOME=/root \
+        BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
+        VLLM_WHEEL_VERSION=${VLLM_WHEEL_VERSION} \
+        VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
+        VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
+        ./payload/run.sh
+
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \