-## Global Args #################################################################
-ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
-ARG PYTHON_VERSION=3.12
 
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
+ARG BASE_UBI_IMAGE_TAG
+ARG PYTHON_VERSION
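+# (no in-file defaults any more: supply both at build time, e.g.
+#  docker build --build-arg BASE_UBI_IMAGE_TAG=<tag> --build-arg PYTHON_VERSION=3.12 .)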
 
 ## Base Layer ##################################################################
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
 ARG PYTHON_VERSION
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf -y update && microdnf install -y --nodocs \
@@ -19,25 +16,28 @@ ENV LANG=C.UTF-8 \
     LC_ALL=C.UTF-8
 
 # Some utils for dev purposes - tar required for kubectl cp
+
 RUN microdnf install -y --nodocs \
-    which procps findutils tar vim git\
+    which procps findutils tar vim git \
     && microdnf clean all
 
 
 ## Python Installer ############################################################
-FROM base as python-install
+FROM base AS python-install
 ARG PYTHON_VERSION
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 ENV PYTHON_VERSION=${PYTHON_VERSION}
 RUN microdnf install -y --nodocs \
     python${PYTHON_VERSION}-devel && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
+    pip install --no-cache -U pip wheel uv && \
+    microdnf clean all
 
 
 ## CUDA Base ###################################################################
-FROM python-install as cuda-base
+FROM python-install AS cuda-base
 
 RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
     https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,88 +51,30 @@ RUN microdnf install -y --nodocs \
     ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/
 
 
-
 ## Python cuda base #################################################################
 FROM cuda-base AS python-cuda-base
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 # install cuda and common dependencies
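+# (the pip cache mount is gone: uv keeps its own wheel cache under /root/.cache/uv)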
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     uv pip install \
         -r requirements-cuda.txt
 
 
-## Development #################################################################
-FROM python-cuda-base AS dev
-
-# install build and runtime dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
-    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
-    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
-    uv pip install \
-        -r requirements-cuda.txt \
-        -r requirements-dev.txt
-
-## Builder #####################################################################
-FROM dev AS build
-
-# install build dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
-    uv pip install -r requirements-build.txt
-
-# install compiler cache to speed up compilation leveraging local or remote caching
-# git is required for the cutlass kernels
-RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all
-
-COPY . .
-
-ARG TORCH_CUDA_ARCH_LIST
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-ARG vllm_fa_cmake_gpu_arches
-ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=8
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-# Make sure the cuda environment is in the PATH
-ENV PATH=/usr/local/cuda/bin:$PATH
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=.git,target=/workspace/.git \
-    env CFLAGS="-march=haswell" \
-        CXXFLAGS="$CFLAGS $CXXFLAGS" \
-        CMAKE_BUILD_TYPE=Release \
-        python3 setup.py bdist_wheel --dist-dir=dist
 
 #################### libsodium Build IMAGE ####################
-FROM base as libsodium-builder
+FROM base AS libsodium-builder
 
 RUN microdnf install -y --nodocs gcc gzip \
     && microdnf clean all
 
 WORKDIR /usr/src/libsodium
 
-ARG LIBSODIUM_VERSION=1.0.20
+ARG LIBSODIUM_VERSION
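+# (no default: the libsodium version must now come in via --build-arg)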
 RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
     && tar -xzvf libsodium*.tar.gz \
     && rm -f libsodium*.tar.gz \
@@ -156,25 +98,32 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
 ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
 
 # Triton needs a CC compiler
+
 RUN microdnf install -y --nodocs gcc \
     rsync \
     && microdnf clean all
 
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
-    cd /usr/src/libsodium \
-    && make install
+    make -C /usr/src/libsodium install
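+# (libsodium is built in the libsodium-builder stage; only `make install` runs here,
+#  against the bind-mounted source tree)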
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install \
-        "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"
+COPY LICENSE /licenses/vllm.md
+COPY examples/*.jinja /app/data/template/
+
+# install vllm by running the payload script and then install flashinfer
+
+ARG VLLM_WHEEL_VERSION
+ARG VLLM_WHEEL_INDEX
+ARG FLASHINFER_VERSION
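+# The BuildKit secret is exposed under /run/secrets/ for this RUN only and
+# never ends up in an image layer. FLASHINFER_VERSION is expected to be a
+# full pip requirement spec (name==version or a wheel URL), not a bare version.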
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=payload,target=/workspace/payload \
+    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
+    env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
+        VLLM_WHEEL_VERSION=${VLLM_WHEEL_VERSION} \
+        VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
+        ./payload/run.sh && \
+    uv pip install "${FLASHINFER_VERSION}"
 
 ENV HF_HUB_OFFLINE=1 \
     HOME=/home/vllm \
@@ -199,25 +148,32 @@ ENV HF_HUB_OFFLINE=1 \
 RUN umask 002 && \
     useradd --uid 2000 --gid 0 vllm && \
     mkdir -p /home/vllm && \
-    chmod g+rwx /home/vllm /usr/src /workspace
-
-COPY LICENSE /licenses/vllm.md
-COPY examples/*.jinja /app/data/template/
+    chmod g+rwx /home/vllm
 
 USER 2000
 WORKDIR /home/vllm
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 
 
-FROM vllm-openai as vllm-grpc-adapter
+## TGIS Adapter layer #####################################################################
+FROM vllm-openai AS vllm-grpc-adapter
 
 USER root
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3
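+# ARGs do not carry across stage boundaries, so the wheel version and index
+# are redeclared here before re-running the payload with the adapter pinned.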
+ARG VLLM_TGIS_ADAPTER_VERSION
+ARG VLLM_WHEEL_VERSION
+ARG VLLM_WHEEL_INDEX
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=payload,target=/workspace/payload \
+    --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
+    cd /workspace && \
+    env HOME=/root \
+        BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
+        VLLM_WHEEL_VERSION=${VLLM_WHEEL_VERSION} \
+        VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
+        VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
+        ./payload/run.sh
+
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \