Skip to content

Commit 4d6a578

Browse files
committed
rdma-tools: add deepep/deepgemm/ucx tests
Signed-off-by: Cyclinder Kuo <[email protected]>
1 parent a10a859 commit 4d6a578

9 files changed

Lines changed: 368 additions & 22 deletions

File tree

.github/workflows/ImageRdmaTools.yaml

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,11 +163,95 @@ jobs:
163163
set -x
164164
cd ${{ env.DOCKERFILE_PATH }}
165165
chmod +x ./generateDockerfile.sh
166-
VAR_NCCL_BASE=true ./generateDockerfile.sh
166+
VAR_NCCL_BASE=false ./generateDockerfile.sh
167167
image_tag=$( git ls-tree --full-tree HEAD -- ${{ env.DOCKERFILE_PATH }} | awk '{ print $3 }' )
168168
[ -n "${image_tag}" ] || { echo "error, failed to get image_tag from commit id"; exit 1 ; }
169169
echo "image_tag=light-${image_tag}" >> $GITHUB_ENV
170170
171+
# Job: collect the build parameters for the DeepEP image variant and expose them
# as job outputs (read back from values the steps append to $GITHUB_ENV).
get_deepep_info:
172+
runs-on: ubuntu-latest
173+
outputs:
174+
code_sha: ${{ env.code_sha }}
175+
push_image: ${{ env.push_image }}
176+
build_platform: ${{ env.build_platform }}
177+
upload_artifact: ${{ env.upload_artifact }}
178+
image_tag: ${{ env.image_tag }}
179+
image_name: ${{ env.image_name }}
180+
dockerfile_dirctory: ${{ env.dockerfile_dirctory }}
181+
build_latest: ${{ env.build_latest }}
182+
generateDockerfileScript: ${{ env.generateDockerfileScript }}
183+
steps:
184+
# Sets code_sha / push_image / build_latest according to the trigger:
# workflow_dispatch -> inputs.ref, push, no "latest" build
# push              -> last path segment of GITHUB_REF, push, also build "latest"
# anything else     -> pull-request head sha, no push, no "latest" build
# NOTE(review): the whole github context is interpolated into a single-quoted
# shell string; a context value containing a single quote would terminate the
# string early. Consider passing it through an env variable - TODO confirm.
- name: get information
185+
run: |
186+
echo '${{ toJSON(github) }}'
187+
echo "image_name=${{ env.IMAGE_NAME }}" >> $GITHUB_ENV
188+
echo "build_platform=linux/amd64" >> $GITHUB_ENV
189+
echo "upload_artifact=false" >> $GITHUB_ENV
190+
echo "dockerfile_dirctory=${{ env.DOCKERFILE_PATH }}" >> $GITHUB_ENV
191+
echo "generateDockerfileScript=${{ env.GEN_DOCKERFILE_SCRIPT }}" >> $GITHUB_ENV
192+
if ${{ github.event_name == 'workflow_dispatch' }}; then
193+
echo "call by workflow_dispatch"
194+
echo "code_sha=${{ github.event.inputs.ref }}" >> $GITHUB_ENV
195+
echo "push_image=true" >> $GITHUB_ENV
196+
echo "build_latest=false" >> $GITHUB_ENV
197+
elif ${{ github.event_name == 'push' }} ; then
198+
echo "call by push tag"
199+
echo "code_sha=${GITHUB_REF##*/}" >> $GITHUB_ENV
200+
echo "push_image=true" >> $GITHUB_ENV
201+
echo "build_latest=true" >> $GITHUB_ENV
202+
else
203+
echo "call by PR"
204+
echo "use sha ${{ github.event.pull_request.head.sha }} , by pr"
205+
echo "code_sha=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
206+
echo "push_image=false" >> $GITHUB_ENV
207+
echo "build_latest=false" >> $GITHUB_ENV
208+
fi
209+
210+
# Checks out the ref chosen above; credentials are not persisted in the git config.
- name: Checkout Source Code
211+
uses: actions/checkout@v4
212+
with:
213+
persist-credentials: false
214+
ref: ${{ env.code_sha }}
215+
216+
# Renders the Dockerfile with the NCCL base and DeepEP enabled, then derives
# image_tag as "deepep-" + the git object id of the DOCKERFILE_PATH tree entry,
# so the tag changes whenever anything under that directory changes.
- name: Generate Dockerfile
217+
run: |
218+
set -x
219+
cd ${{ env.DOCKERFILE_PATH }}
220+
chmod +x ./generateDockerfile.sh
221+
VAR_NCCL_BASE=true ENV_INSTALL_DEEPEP=true ./generateDockerfile.sh
222+
image_tag=$( git ls-tree --full-tree HEAD -- ${{ env.DOCKERFILE_PATH }} | awk '{ print $3 }' )
223+
[ -n "${image_tag}" ] || { echo "error, failed to get image_tag from commit id"; exit 1 ; }
224+
echo "image_tag=deepep-${image_tag}" >> $GITHUB_ENV
225+
226+
# Job: build the DeepEP image through the reusable callBuildImage workflow,
# forwarding every parameter computed by get_deepep_info. The Dockerfile is
# regenerated inside the called workflow with the same NCCL-base + DeepEP flags.
call-deepep-workflow:
227+
needs: [get_deepep_info]
228+
uses: ./.github/workflows/callBuildImage.yaml
229+
with:
230+
code_sha: ${{ needs.get_deepep_info.outputs.code_sha }}
231+
push_image: ${{ needs.get_deepep_info.outputs.push_image }}
232+
image_name: ${{ needs.get_deepep_info.outputs.image_name }}
233+
dockerfile_dirctory: ${{ needs.get_deepep_info.outputs.dockerfile_dirctory }}
234+
build_platform: ${{ needs.get_deepep_info.outputs.build_platform }}
235+
upload_artifact: ${{ needs.get_deepep_info.outputs.upload_artifact }}
236+
image_tag: ${{ needs.get_deepep_info.outputs.image_tag }}
237+
generateDockerfileCmd: "VAR_NCCL_BASE=true ENV_INSTALL_DEEPEP=true ${{ needs.get_deepep_info.outputs.generateDockerfileScript }}"
238+
secrets: inherit
239+
240+
# Job: build the same DeepEP image again under the fixed tag "deepep-latest".
# Runs only when get_deepep_info reported build_latest == 'true' (the push branch).
call-deepep-latest-workflow:
241+
needs: [get_deepep_info]
242+
uses: ./.github/workflows/callBuildImage.yaml
243+
if: ${{ needs.get_deepep_info.outputs.build_latest == 'true' }}
244+
with:
245+
code_sha: ${{ needs.get_deepep_info.outputs.code_sha }}
246+
push_image: ${{ needs.get_deepep_info.outputs.push_image }}
247+
image_name: ${{ needs.get_deepep_info.outputs.image_name }}
248+
dockerfile_dirctory: ${{ needs.get_deepep_info.outputs.dockerfile_dirctory }}
249+
build_platform: ${{ needs.get_deepep_info.outputs.build_platform }}
250+
upload_artifact: ${{ needs.get_deepep_info.outputs.upload_artifact }}
251+
image_tag: deepep-latest
252+
generateDockerfileCmd: "VAR_NCCL_BASE=true ENV_INSTALL_DEEPEP=true ${{ needs.get_deepep_info.outputs.generateDockerfileScript }}"
253+
secrets: inherit
254+
171255
call-light-workflow:
172256
needs: [get_light_info]
173257
uses: ./.github/workflows/callBuildImage.yaml

.github/workflows/ReleaseRdmaTools.yaml

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ jobs:
3939
echo '${{ toJSON(github) }}'
4040
if ${{ github.event_name == 'workflow_dispatch' }}; then
4141
echo "call by workflow_dispatch"
42-
echo "CODE_SHA=${{ github.event.inputs.ref }}" >> $GITHUB_ENV
42+
echo "CODE_SHA=${{ github.event.inputs.tag }}" >> $GITHUB_ENV
4343
elif ${{ github.event_name == 'push' }} ; then
4444
echo "call by push tag"
4545
echo "CODE_SHA=${GITHUB_REF##*/}" >> $GITHUB_ENV
@@ -50,7 +50,7 @@ jobs:
5050
uses: actions/checkout@v4
5151
with:
5252
persist-credentials: false
53-
ref: ${{ steps.get_original_ref.outputs.code_sha }}
53+
ref: ${{ env.CODE_SHA }}
5454

5555
- name: Generate Dockerfile
5656
id: generate_dockerfile
@@ -95,7 +95,7 @@ jobs:
9595
echo '${{ toJSON(github) }}'
9696
if ${{ github.event_name == 'workflow_dispatch' }}; then
9797
echo "call by workflow_dispatch"
98-
echo "CODE_SHA=${{ github.event.inputs.ref }}" >> $GITHUB_ENV
98+
echo "CODE_SHA=${{ github.event.inputs.tag }}" >> $GITHUB_ENV
9999
elif ${{ github.event_name == 'push' }} ; then
100100
echo "call by push tag"
101101
echo "CODE_SHA=${GITHUB_REF##*/}" >> $GITHUB_ENV
@@ -106,7 +106,7 @@ jobs:
106106
uses: actions/checkout@v4
107107
with:
108108
persist-credentials: false
109-
ref: ${{ steps.get_original_ref.outputs.code_sha }}
109+
ref: ${{ env.CODE_SHA }}
110110

111111
- name: Generate Dockerfile
112112
id: generate_dockerfile
@@ -134,8 +134,65 @@ jobs:
134134
generateDockerfileCmd: "VAR_NCCL_BASE=false ${{ needs.get_light_info.outputs.generateDockerfileCmd }}"
135135
secrets: inherit
136136

137+
# Job: resolve the release ref and compute the image tag for the DeepEP image.
# Outputs mix workflow-level env values with CODE_SHA (written to $GITHUB_ENV by
# the first step) and image_tag (a step output of generate_dockerfile).
get_deepep_info:
138+
runs-on: ubuntu-latest
139+
outputs:
140+
chart_path: ${{ env.CHART_PATH }}
141+
code_sha: ${{ env.CODE_SHA }}
142+
dockerfile_dirctory: ${{ env.DOCKERFILE_PATH }}
143+
generateDockerfileCmd: ${{ env.GEN_DOCKERFILE_SCRIPT }}
144+
image_name: ${{ env.IMAGE_NAME }}
145+
build_platform: ${{ env.BUILD_PLATFORM }}
146+
image_tag: ${{ steps.generate_dockerfile.outputs.image_tag }}
147+
steps:
148+
# Stores the ref to release in CODE_SHA: inputs.tag for workflow_dispatch, the
# last path segment of GITHUB_REF for push; any other trigger fails the job.
- name: Get information
149+
id: get_original_ref
150+
run: |
151+
echo '${{ toJSON(github) }}'
152+
if ${{ github.event_name == 'workflow_dispatch' }}; then
153+
echo "call by workflow_dispatch"
154+
echo "CODE_SHA=${{ github.event.inputs.tag }}" >> $GITHUB_ENV
155+
elif ${{ github.event_name == 'push' }} ; then
156+
echo "call by push tag"
157+
echo "CODE_SHA=${GITHUB_REF##*/}" >> $GITHUB_ENV
158+
else
159+
exit 1
160+
fi
161+
162+
# Checks out the release ref; credentials are not persisted in the git config.
- name: Checkout Source Code
163+
uses: actions/checkout@v4
164+
with:
165+
persist-credentials: false
166+
ref: ${{ env.CODE_SHA }}
167+
168+
# Renders the Dockerfile with the NCCL base and DeepEP enabled, then publishes
# image_tag = "deepep-" + the vMAJOR.MINOR.PATCH part of CODE_SHA as a step output.
- name: Generate Dockerfile
169+
id: generate_dockerfile
170+
run: |
171+
set -x
172+
cd ${{ env.DOCKERFILE_PATH }}
173+
chmod +x ./generateDockerfile.sh
174+
VAR_NCCL_BASE=true ENV_INSTALL_DEEPEP=true ./generateDockerfile.sh
175+
# image_tag from current tag; dots are escaped so only a literal vX.Y.Z matches
176+
image_tag=$(grep -Eo 'v[0-9]+\.[0-9]+\.[0-9]+' <<< "${{ env.CODE_SHA }}")
177+
[ -n "${image_tag}" ] || { echo "error, failed to get image_tag from tag"; exit 1 ; }
178+
echo "image_tag=deepep-${image_tag}" >> $GITHUB_OUTPUT
179+
180+
# Job: build the release DeepEP image through the reusable callBuildImage
# workflow. push_image is fixed to true and upload_artifact to false; the
# remaining inputs come from get_deepep_info.
call-image-deepep-workflow:
181+
needs: [get_deepep_info]
182+
uses: ./.github/workflows/callBuildImage.yaml
183+
with:
184+
code_sha: ${{ needs.get_deepep_info.outputs.code_sha }}
185+
push_image: true
186+
image_name: ${{ needs.get_deepep_info.outputs.image_name }}
187+
dockerfile_dirctory: ${{ needs.get_deepep_info.outputs.dockerfile_dirctory }}
188+
build_platform: ${{ needs.get_deepep_info.outputs.build_platform }}
189+
upload_artifact: false
190+
image_tag: ${{ needs.get_deepep_info.outputs.image_tag }}
191+
generateDockerfileCmd: "VAR_NCCL_BASE=true ENV_INSTALL_DEEPEP=true ${{ needs.get_deepep_info.outputs.generateDockerfileCmd }}"
192+
secrets: inherit
193+
137194
call-chart-workflow:
138-
needs: [ get_nccl_info, get_light_info, call-image-nccl-workflow, call-image-light-workflow ]
195+
needs: [ get_nccl_info, get_light_info, get_deepep_info, call-image-nccl-workflow, call-image-light-workflow, call-image-deepep-workflow ]
139196
uses: ./.github/workflows/callBuildChart.yaml
140197
with:
141198
code_sha: ${{ needs.get_nccl_info.outputs.code_sha }}

.github/workflows/callBuildImage.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,7 @@ jobs:
139139
github-token: ${{ secrets.WELAN_PAT }}
140140
platforms: ${{ env.build_platform }}
141141
outputs: type=tar,dest=/tmp/${{ env.upload_image_artifact_name }}.tar
142-
tags: |
143-
${{ env.image_full_name }}:${{ env.image_tag }}
144-
${{ env.image_full_name }}:latest
142+
# Always tag <image>:<image_tag>; additionally tag <image>:latest unless the tag
# belongs to the light- or deepep- variants. The two tags are comma-separated
# because expression string literals have no backslash escapes ('\n' would be
# emitted literally), and this input accepts a CSV list.
tags: ${{ format('{0}:{1}{2}', env.image_full_name, env.image_tag, (!startsWith(env.image_tag, 'light-') && !startsWith(env.image_tag, 'deepep-')) && format(',{0}:latest', env.image_full_name) || '') }}
145143
build-args: |
146144
GIT_COMMIT_VERSION=${{ env.commitver }}
147145
GIT_COMMIT_TIME=${{ env.committime }}

rdma-tools/image/Dockerfile.template

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
#============= build server ============
55

6-
FROM golang:1.24.1 as buildserver
6+
FROM golang:1.24.1 AS buildserver
77

88
# Copy the Go module files
99
COPY /server /server
@@ -21,11 +21,21 @@ ARG TARGETOS
2121
# TARGETARCH is an automatic platform ARG enabled by Docker BuildKit.
2222
ARG TARGETARCH
2323

24+
ENV ENV_BASEIMAGE_CUDA_VERISON=<<ENV_BASEIMAGE_CUDA_VERISON>>
2425
ENV ENV_VERSION_PERFTEST=<<ENV_VERSION_PERFTEST>>
2526
ENV ENV_DOWNLOAD_OFED_DEB_SOURCE=<<ENV_DOWNLOAD_OFED_DEB_SOURCE>>
2627
ENV ENV_VERSION_CUDA_SAMPLE=<<ENV_VERSION_CUDA_SAMPLE>>
2728
ENV ENV_GDRCOPY_COMMIT=<<ENV_GDRCOPY_COMMIT>>
2829
ENV ENV_VERSION_NVBANDWIDTH=<<ENV_VERSION_NVBANDWIDTH>>
30+
ENV ENV_INSTALL_DEEPEP=<<ENV_INSTALL_DEEPEP>>
31+
ENV ENV_DEEPEP_VERSION=<<ENV_DEEPEP_VERSION>>
32+
ENV ENV_DEEPGEMM_VERSION=<<ENV_DEEPGEMM_VERSION>>
33+
ENV ENV_UCX_VERSION=<<ENV_UCX_VERSION>>
34+
ENV ENV_NVSHMEM_VERSION=<<ENV_NVSHMEM_VERSION>>
35+
ENV ENV_CMAKE_CUDA_ARCHITECTURES=<<ENV_CMAKE_CUDA_ARCHITECTURES>>
36+
ENV ENV_BUILD_AND_DOWNLOAD_PARALLEL=<<ENV_BUILD_AND_DOWNLOAD_PARALLEL>>
37+
ENV ENV_GITHUB_ARTIFACTORY=<<ENV_GITHUB_ARTIFACTORY>>
38+
ENV ENV_TORCH_CUDA_ARCH_LIST=<<ENV_TORCH_CUDA_ARCH_LIST>>
2939

3040
# build perftest
3141
# Mellanox OFED (latest): required by perftest
@@ -52,6 +62,31 @@ RUN mkdir /buildGdrcopy || true
5262
RUN mkdir /buildNvbandwidth || true
5363
RUN mkdir /buildCudaSample || true
5464
RUN mkdir /buildnccltest || true
65+
# Create the DeepEP/DeepGEMM/UCX/NVSHMEM output directories unconditionally
# ("|| true" tolerates a failing mkdir), so they exist in this stage even when
# the ENV_INSTALL_DEEPEP-guarded install step does not run.
RUN mkdir /buildDeepEP || true
66+
RUN mkdir /buildDeepGEMM || true
67+
RUN mkdir -p /opt/ucx || true
68+
RUN mkdir -p /opt/deepep-tests || true
69+
RUN mkdir -p /opt/deepep-src || true
70+
RUN mkdir -p /opt/deepgemm-src || true
71+
RUN mkdir -p /opt/nvshmem || true
72+
73+
COPY /install-deepep.sh /install-deepep.sh
74+
COPY /install-ucx.sh /install-ucx.sh
75+
76+
# Build-stage DeepEP step, executed only when ENV_INSTALL_DEEPEP is "true":
# installs the compiler toolchain, MPI/NUMA/RDMA development packages and Python,
# then runs /install-deepep.sh followed by /install-ucx.sh. Any failing command
# in the && chain fails the build.
RUN if [ "${ENV_INSTALL_DEEPEP}" = "true" ] ; then \
77+
apt-get update \
78+
&& apt-get install -y --no-install-recommends \
79+
build-essential cmake git curl wget vim ninja-build \
80+
libtool autoconf automake pkg-config m4 \
81+
libopenmpi-dev libnuma-dev numactl \
82+
librdmacm-dev libibumad-dev libibverbs-dev libnl-3-dev libnl-route-3-dev \
83+
ibverbs-providers python3-dev python3-pip python-is-python3 \
84+
unzip ca-certificates \
85+
&& chmod +x /install-deepep.sh \
86+
&& /install-deepep.sh \
87+
&& chmod +x /install-ucx.sh \
88+
&& /install-ucx.sh ; \
89+
fi
5590

5691
#========== root image ==============
5792
# Runtime stage; the base image name is substituted into the template placeholder.
FROM <<ENV_BASEIMAGE_FULL_NAME>> AS rootfs
@@ -76,7 +111,7 @@ ENV OMPI_ALLOW_RUN_AS_ROOT=1
76111
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
77112

78113
COPY /install-tools.sh /install-tools.sh
79-
COPY /install-tools.sh /install-nccl.sh
114+
COPY /install-nccl.sh /install-nccl.sh
80115
COPY /test.sh /test.sh
81116
COPY /tools /tools
82117
COPY /printpaths.sh /printpaths.sh
@@ -97,13 +132,36 @@ COPY --from=buildtools /buildCudaSample/* /usr/bin/
97132
COPY --from=buildtools /buildGdrcopy /buildGdrcopy
98133
COPY --from=buildtools /buildNvbandwidth/* /usr/bin/
99134
COPY --from=buildtools /usr/local/bin/tcpdump /usr/local/bin/tcpdump
135+
# Bring the DeepEP/DeepGEMM build directories and the UCX, NVSHMEM, test and
# source trees over from the buildtools stage. These copies are unconditional.
COPY --from=buildtools /buildDeepEP /buildDeepEP
136+
COPY --from=buildtools /buildDeepGEMM /buildDeepGEMM
137+
COPY --from=buildtools /opt/ucx /opt/ucx
138+
COPY --from=buildtools /opt/deepep-tests /opt/deepep-tests
139+
COPY --from=buildtools /opt/deepep-src /opt/deepep-src
140+
COPY --from=buildtools /opt/deepgemm-src /opt/deepgemm-src
141+
COPY --from=buildtools /opt/nvshmem /opt/nvshmem
100142

101143
COPY --from=buildserver /server/echo-server /usr/bin/
102144

103145
WORKDIR /
104146
RUN chmod +x /tools/* && mv /tools/* /usr/sbin && rm -rf /tools
105147
RUN chmod +x /install-tools.sh && /install-tools.sh && rm -f /install-tools.sh
106148
RUN chmod +x /install-nccl.sh && export ENV_INSTALL_HPCX=<<ENV_INSTALL_HPCX>> && /install-nccl.sh && rm -f /install-nccl.sh
149+
# Runtime-stage DeepEP setup, executed only when the <<ENV_INSTALL_DEEPEP>>
# placeholder is rendered as "true":
#   - installs Python and pip, then upgrades pip
#   - reduces the CUDA version to major.minor and strips the dot (e.g. 12.4 -> 124)
#     to select the matching PyTorch wheel index (cu<ver>)
#   - installs torch/numpy/packaging, then force-reinstalls the wheels found in
#     /buildDeepEP and /buildDeepGEMM and removes those directories
#   - registers /opt/ucx/lib and /opt/nvshmem/lib with the dynamic linker
#   - adds an unversioned libmlx5.so symlink next to libmlx5.so.1
#   - cleans the apt caches
RUN if [ "<<ENV_INSTALL_DEEPEP>>" = "true" ] ; then \
150+
apt-get update \
151+
&& apt-get install -y --no-install-recommends python3 python3-pip python-is-python3 ca-certificates \
152+
&& python3 -m pip install --upgrade pip \
153+
&& CUDA_SHORT=$(echo "<<ENV_BASEIMAGE_CUDA_VERISON>>" | cut -d. -f1,2) \
154+
&& CU_VER=$(echo "${CUDA_SHORT}" | tr -d .) \
155+
&& pip3 install torch numpy packaging --extra-index-url https://download.pytorch.org/whl/cu${CU_VER} \
156+
&& pip3 install /buildDeepEP/*.whl /buildDeepGEMM/*.whl --force-reinstall \
157+
&& rm -rf /buildDeepEP /buildDeepGEMM \
158+
&& echo "/opt/ucx/lib" > /etc/ld.so.conf.d/ucx.conf \
159+
&& echo "/opt/nvshmem/lib" > /etc/ld.so.conf.d/nvshmem.conf \
160+
&& ldconfig \
161+
&& ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so \
162+
&& apt-get clean \
163+
&& rm -rf /var/lib/apt/lists/* ; \
164+
fi
107165
# check binary
108166
RUN chmod +x /test.sh && ENV_INSTALL_HPCX=<<ENV_INSTALL_HPCX>> /test.sh && rm -f /test.sh
109167

0 commit comments

Comments
 (0)