Skip to content

Commit e6312d2

Browse files
authored
Update Dockerfile.gb200 to latest sglang (#8356)
1 parent 8af145b commit e6312d2

File tree

1 file changed

+28
-39
lines changed

1 file changed

+28
-39
lines changed

docker/Dockerfile.gb200

Lines changed: 28 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ ARG CUDA_VERSION=12.8.1
22
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
33

44
ARG BUILD_TYPE=blackwell
5+
ARG DEEPEP_COMMIT=1b14ad661c7640137fcfe93cccb2694ede1220b0
6+
ARG CMAKE_BUILD_PARALLEL_LEVEL=2
57
ENV DEBIAN_FRONTEND=noninteractive \
68
CUDA_HOME=/usr/local/cuda \
79
GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
@@ -16,7 +18,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
1618
tzdata \
1719
software-properties-common netcat-openbsd kmod unzip openssh-server \
1820
curl wget lsof zsh ccache tmux htop git-lfs tree \
19-
python3 python3-pip python3-dev libpython3-dev \
21+
python3 python3-pip python3-dev libpython3-dev python3-venv \
2022
build-essential cmake \
2123
libopenmpi-dev libnuma1 libnuma-dev \
2224
libibverbs-dev libibverbs1 libibumad3 \
@@ -36,13 +38,8 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
3638
&& rm -rf /var/lib/apt/lists/* \
3739
&& apt-get clean
3840

39-
40-
# --- Install SGLang missing package
41-
RUN pip install netifaces
42-
43-
# --- Install nightly PyTorch ---
44-
RUN pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 --force-reinstall
45-
41+
# --- Install packages that SGLang is missing for the blackwell build type
42+
RUN python3 -m pip install openai httpx
4643

4744
# GDRCopy installation
4845
RUN mkdir -p /tmp/gdrcopy && cd /tmp \
@@ -56,12 +53,12 @@ RUN mkdir -p /tmp/gdrcopy && cd /tmp \
5653
RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
5754

5855
# Clone and install SGLang
59-
# FIXME: Forcing SGLang to 2a2d3478afe8cdb336888f2e6faa3775ac40254e because sgl-kernel v0.2.5 is missing aarch64 package
56+
# NOTE: flashinfer v0.2.9rc1 fails to install on aarch64
6057
WORKDIR /sgl-workspace
6158
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six \
6259
&& git clone https://github.com/sgl-project/sglang.git \
6360
&& cd sglang \
64-
&& git checkout 2a2d3478afe8cdb336888f2e6faa3775ac40254e \
61+
&& git checkout a167fd0bcb9ef4b0f4331a109e40c8cdc770b026 \
6562
&& case "$CUDA_VERSION" in \
6663
12.6.1) CUINDEX=126 ;; \
6764
12.8.1) CUINDEX=128 ;; \
@@ -70,38 +67,33 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
7067
&& python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
7168
&& if [ "$CUDA_VERSION" = "12.8.1" ]; then \
7269
python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps ; \
73-
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.4/sgl_kernel-0.2.4+cu128-cp39-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
70+
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.7/sgl_kernel-0.2.7+cu128-cp39-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
7471
fi
7572

76-
77-
# Build NVSHMEM
78-
# Build and install NVSHMEM + DeepEP
79-
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz \
80-
&& git clone https://github.com/fzyzcjy/DeepEP.git \
81-
&& cd DeepEP \
82-
&& git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 \
83-
&& cd .. \
84-
&& tar -xf nvshmem_src_3.2.5-1.txz && mv nvshmem_src nvshmem \
85-
&& cd nvshmem \
86-
&& git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \
87-
&& sed -i '1i#include <unistd.h>' examples/moe_shuffle.cu \
88-
&& rm -f /sgl-workspace/nvshmem_src_3.2.5-1.txz \
89-
&& NVSHMEM_SHMEM_SUPPORT=0 \
90-
NVSHMEM_UCX_SUPPORT=0 \
91-
NVSHMEM_USE_NCCL=0 \
92-
NVSHMEM_MPI_SUPPORT=0 \
93-
NVSHMEM_IBGDA_SUPPORT=1 \
94-
NVSHMEM_PMIX_SUPPORT=0 \
95-
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
96-
NVSHMEM_USE_GDRCOPY=1 \
97-
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" \
98-
&& cmake --build build --target install -j \
99-
&& cd /sgl-workspace/DeepEP \
100-
&& NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
73+
# Build and install NVSHMEM + DeepEP
74+
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
75+
&& git clone https://github.com/fzyzcjy/DeepEP.git \
76+
&& cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. \
77+
&& tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
78+
&& cd nvshmem \
79+
&& rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
80+
&& NVSHMEM_SHMEM_SUPPORT=0 \
81+
NVSHMEM_UCX_SUPPORT=0 \
82+
NVSHMEM_USE_NCCL=0 \
83+
NVSHMEM_MPI_SUPPORT=0 \
84+
NVSHMEM_IBGDA_SUPPORT=1 \
85+
NVSHMEM_PMIX_SUPPORT=0 \
86+
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
87+
NVSHMEM_USE_GDRCOPY=1 \
88+
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" \
89+
&& cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} \
90+
&& cd /sgl-workspace/DeepEP \
91+
&& NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
10192

10293
# Python tools
10394
RUN python3 -m pip install --no-cache-dir \
10495
datamodel_code_generator \
96+
mooncake_transfer_engine==0.3.5 \
10597
pre-commit \
10698
pytest \
10799
black \
@@ -145,9 +137,6 @@ RUN apt update -y \
145137
&& apt update -y \
146138
&& apt install nsight-systems-cli -y
147139

148-
# --- Install Mooncake ---
149-
RUN pip install mooncake-transfer-engine==0.3.5
150-
151140
# Set up locale
152141
RUN locale-gen en_US.UTF-8
153142
ENV LANG en_US.UTF-8

0 commit comments

Comments
 (0)