Skip to content

Commit e237b1f

Browse files
committed
Merge branch 'main' into dpykhtar/add_mmap_bin_files
Signed-off-by: dimapihtar <[email protected]>
2 parents e63094e + 2a4e56c commit e237b1f

File tree

171 files changed

+861
-21976
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

171 files changed

+861
-21976
lines changed

.coveragerc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ omit =
2222
nemo/collections/audio/parts/utils/maxine.py
2323

2424
nemo/core/*
25-
nemo/collections/common/*
25+
nemo/collections/common/*
26+
27+
/workspace/config-3.12.py
28+
/workspace/config-3.py
29+
/workspace/config.py
2630

2731
[paths]
2832
source =

.github/workflows/cicd-main-automodel.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ jobs:
7070
script: L2_VLM_HF_Transformer_PEFT_FSDP2
7171
- runner: self-hosted-azure-gpus-1
7272
script: L2_VLM_HF_Transformer_PEFT_4bit
73+
is-optional: true
7374
- runner: self-hosted-azure
7475
script: L2_VLM_HF_Transformer_SFT_FSDP2
7576
- runner: self-hosted-azure

.github/workflows/cicd-main-export-deploy.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ jobs:
9696
runner: self-hosted-azure-gpus-1
9797
needs: [unit-tests]
9898
runs-on: ${{ matrix.runner }}
99-
name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
99+
name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
100100
steps:
101101
- name: Checkout
102102
uses: actions/checkout@v4
@@ -109,4 +109,4 @@ jobs:
109109
script: ${{ matrix.script }}
110110
tests_to_run: ${{ inputs.test_to_run }}
111111
image: ${{ inputs.image-name }}
112-
is_optional: ${{ matrix.is_optional || false }}
112+
is_optional: ${{ matrix.is-optional || false }}

.github/workflows/cicd-main-speech.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ jobs:
120120
script: L2_Speech_Batch_Size_OOMptimizer
121121
- runner: self-hosted-azure
122122
script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary
123+
is-optional: true
123124
- runner: self-hosted-azure
124125
script: L2_Speech_Transcription_Speech_to_Text_Transcribe
125126
- runner: self-hosted-azure

.github/workflows/cicd-main.yml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ jobs:
135135
runs-on: ubuntu-latest
136136
environment: test
137137
if: |
138-
needs.pre-flight.outputs.test_to_run != '[]'
138+
needs.pre-flight.outputs.test_to_run != '[]'
139139
&& needs.pre-flight.outputs.components_to_run != '[]'
140140
&& needs.pre-flight.outputs.is_ci_workload == 'false'
141141
steps:
@@ -147,10 +147,10 @@ jobs:
147147
uses: ./.github/workflows/_build_container.yml
148148
needs: [pre-flight, code-linting, cicd-wait-in-queue]
149149
if: |
150-
needs.pre-flight.outputs.test_to_run != '[]'
150+
needs.pre-flight.outputs.test_to_run != '[]'
151151
&& needs.pre-flight.outputs.components_to_run != '[]'
152152
&& (
153-
success()
153+
success()
154154
|| (
155155
needs.cicd-wait-in-queue.result == 'skipped'
156156
&& needs.pre-flight.outputs.is_ci_workload == 'true'
@@ -385,8 +385,8 @@ jobs:
385385

386386
- name: Remove label if not cancelled
387387
if: |
388-
steps.result.outputs.code != 'cancelled'
389-
&& github.event.label.name == 'Run CICD'
388+
steps.result.outputs.code != 'cancelled'
389+
&& github.event.label.name == 'Run CICD'
390390
&& github.event.pull_request.head.repo.full_name == github.repository
391391
env:
392392
GH_TOKEN: ${{ github.token }}
@@ -395,8 +395,8 @@ jobs:
395395

396396
- name: Pipeline successful, add PR comment
397397
if: |
398-
steps.result.outputs.code == 'success'
399-
&& github.event_name == 'pull_request'
398+
steps.result.outputs.code == 'success'
399+
&& github.event_name == 'pull_request'
400400
&& env.SLACK_WEBHOOK != ''
401401
uses: peter-evans/create-or-update-comment@v4
402402
env:
@@ -416,8 +416,8 @@ jobs:
416416
417417
- name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
418418
if: |
419-
steps.result.outputs.code == 'failure'
420-
&& github.event.label.name == 'Run CICD'
419+
steps.result.outputs.code == 'failure'
420+
&& github.event.label.name == 'Run CICD'
421421
&& env.SLACK_WEBHOOK != ''
422422
env:
423423
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
@@ -451,7 +451,7 @@ jobs:
451451
needs.pre-flight.outputs.test_to_run != '[]'
452452
&& needs.pre-flight.outputs.components_to_run != '[]'
453453
&& (
454-
success()
454+
success()
455455
|| needs.Nemo_CICD_Test.result == 'success'
456456
)
457457
&& !cancelled()

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# Changelog
22

33
<!-- Next changelog -->
4+
## NVIDIA Neural Modules 2.3.2
5+
6+
This release addresses known security issues. For the latest NVIDIA Vulnerability Disclosure Information, visit https://www.nvidia.com/en-us/security/. For acknowledgement, please reach out to the NVIDIA PSIRT team at psirt@nvidia.com.
7+
48
## NVIDIA Neural Modules 2.3.1
59

610
### Highlights

docker/Dockerfile.ci.export_deploy

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -32,64 +32,65 @@ apt-get install -y bc
3232
apt-get clean
3333
EOF
3434

35-
WORKDIR /tmp/NeMo
35+
WORKDIR /opt/NeMo
3636
ARG TRTLLM_REPO
3737
ARG TRTLLM_TAG
38-
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh bash -ex <<"EOF"
38+
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \
39+
--mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches bash -ex <<"EOF"
3940

40-
bash /tmp/NeMo/install_dep.sh --library trt --mode install
41+
bash /opt/NeMo/install_dep.sh --library trt --mode install
4142
EOF
4243

4344
FROM base-image AS trt-llm-wheel
44-
WORKDIR /tmp/NeMo
45+
WORKDIR /opt/NeMo
4546
ARG TRTLLM_REPO
4647
ARG TRTLLM_TAG
47-
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh bash -ex <<"EOF"
48+
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \
49+
--mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches bash -ex <<"EOF"
4850

49-
bash /tmp/NeMo/install_dep.sh --library trtllm --mode build
51+
bash /opt/NeMo/install_dep.sh --library trtllm --mode build
5052
EOF
5153

5254
FROM base-image as te-wheel
53-
WORKDIR /tmp/NeMo
55+
WORKDIR /opt/NeMo
5456
ARG TE_REPO
5557
ARG TE_TAG
56-
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh \
57-
--mount=type=bind,source=external/patches,target=/tmp/NeMo/external/patches bash -ex <<"EOF"
58+
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \
59+
--mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches bash -ex <<"EOF"
5860

59-
bash /tmp/NeMo/install_dep.sh --library te --mode build
60-
ls -al /tmp/Megatron-LM || true
61+
bash /opt/NeMo/install_dep.sh --library te --mode build
62+
ls -al /opt/Megatron-LM || true
6163
EOF
6264

6365
FROM base-image as mcore-wheel
64-
WORKDIR /tmp/NeMo
66+
WORKDIR /opt/NeMo
6567
ARG MLM_REPO
6668
ARG MLM_TAG
67-
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh bash -ex <<"EOF"
69+
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh bash -ex <<"EOF"
6870

69-
bash /tmp/NeMo/install_dep.sh --library mcore --mode build
70-
ls -al /tmp/Megatron-LM || true
71+
bash /opt/NeMo/install_dep.sh --library mcore --mode build
72+
ls -al /opt/Megatron-LM || true
7173
EOF
7274

7375
FROM base-image
74-
WORKDIR /tmp/NeMo
76+
WORKDIR /opt/NeMo
7577
ENV INSTALL_DIR="/opt"
7678
RUN \
7779
--mount=type=bind,from=trt-llm-wheel,source=/opt/wheels/trtllm,target=/opt/wheels/trtllm \
7880
--mount=type=bind,from=te-wheel,source=/opt/wheels/te,target=/opt/wheels/te \
7981
--mount=type=bind,from=mcore-wheel,source=/opt/wheels/mcore,target=/opt/wheels/mcore \
80-
--mount=type=bind,source=requirements,target=/tmp/NeMo/requirements \
81-
--mount=type=bind,source=tools/ctc_segmentation/requirements.txt,target=/tmp/NeMo/tools/ctc_segmentation/requirements.txt \
82-
--mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh \
83-
--mount=type=bind,source=setup.py,target=/tmp/NeMo/setup.py \
84-
--mount=type=bind,source=external/patches,target=/tmp/NeMo/external/patches \
85-
--mount=type=bind,source=README.md,target=/tmp/NeMo/README.md \
86-
--mount=type=bind,source=nemo/package_info.py,target=/tmp/NeMo/nemo/package_info.py \
87-
--mount=type=bind,source=nemo/__init__.py,target=/tmp/NeMo/nemo/__init__.py bash -ex <<"EOF"
88-
89-
bash /tmp/NeMo/install_dep.sh --library all --mode install
82+
--mount=type=bind,source=requirements,target=/opt/NeMo/requirements \
83+
--mount=type=bind,source=tools/ctc_segmentation/requirements.txt,target=/opt/NeMo/tools/ctc_segmentation/requirements.txt \
84+
--mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \
85+
--mount=type=bind,source=setup.py,target=/opt/NeMo/setup.py \
86+
--mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches \
87+
--mount=type=bind,source=README.md,target=/opt/NeMo/README.md \
88+
--mount=type=bind,source=nemo/package_info.py,target=/opt/NeMo/nemo/package_info.py \
89+
--mount=type=bind,source=nemo/__init__.py,target=/opt/NeMo/nemo/__init__.py bash -ex <<"EOF"
90+
91+
bash /opt/NeMo/install_dep.sh --library all --mode install
9092
pip install --no-cache-dir ".[deploy,test]"
91-
rm -rf $NEMO_DIR || true
92-
93+
9394
EOF
9495

9596
WORKDIR /workspace

docker/common/install_dep.sh

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ trt() {
6969
git submodule update --init --recursive
7070
sed -i "/torch/d" requirements.txt
7171
git lfs pull
72+
patch -p1 < $CURR/external/patches/trt_llm.patch
7273
popd
7374

7475
if [[ "$mode" == "install" ]]; then
@@ -81,11 +82,12 @@ trt() {
8182
bash docker/common/install_ccache.sh
8283

8384
. docker/common/install_tensorrt.sh \
84-
--TRT_VER="10.9.0.34" \
85-
--CUDA_VER="12.8" \
86-
--CUDNN_VER="9.8.0.87-1" \
87-
--NCCL_VER="2.25.1-1+cuda12.8" \
88-
--CUBLAS_VER="12.8.4.1-1"
85+
--TRT_VER="10.10.0.31" \
86+
--CUDA_VER="12.9" \
87+
--CUDNN_VER="9.9.0.52-1" \
88+
--NCCL_VER="2.26.5-1+cuda12.9" \
89+
--CUBLAS_VER="12.9.0.13-1" \
90+
--NVRTC_VER="12.9.41-1"
8991
set -u
9092
fi
9193
fi
@@ -133,12 +135,15 @@ trtllm() {
133135
git submodule update --init --recursive
134136
sed -i "/torch/d" requirements.txt
135137
git lfs pull
138+
patch -p1 < $CURR/external/patches/trt_llm.patch
136139
popd
137140

138141
build() {
139142
if [[ "${NVIDIA_PYTORCH_VERSION}" != "" ]]; then
143+
# CONDA_PREFIX causes an error in trt-llm's build script
144+
unset CONDA_PREFIX
140145
cd $TRTLLM_DIR
141-
python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --dist_dir $WHEELS_DIR --python_bindings --benchmarks
146+
TORCH_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" python3 ./scripts/build_wheel.py --job_count $(nproc) --clean --trt_root /usr/local/tensorrt --dist_dir $WHEELS_DIR --python_bindings --benchmarks
142147
fi
143148
}
144149

@@ -149,8 +154,7 @@ trtllm() {
149154
build
150155
fi
151156

152-
pip install --no-cache-dir $WHEELS_DIR/tensorrt_llm*.whl --extra-index-url https://pypi.nvidia.com &&
153-
sed -i '57d' /usr/local/lib/python3.12/dist-packages/torch_tensorrt/dynamo/conversion/custom_ops_converters.py || true
157+
pip install --no-cache-dir $WHEELS_DIR/tensorrt_llm*.whl --extra-index-url https://pypi.nvidia.com || true
154158
fi
155159
}
156160

@@ -167,15 +171,14 @@ te() {
167171
fi
168172
pushd $TE_DIR
169173
git checkout -f $TE_TAG
170-
patch -p1 </$CURR/external/patches/nemo_2.3.0_te.patch
171174
popd
172175

173176
build() {
174177
if [[ "${NVIDIA_PYTORCH_VERSION}" != "" ]]; then
175178
cd $TE_DIR
176179
git submodule init
177180
git submodule update
178-
pip wheel --wheel-dir $WHEELS_DIR/ $TE_DIR
181+
pip wheel --wheel-dir $WHEELS_DIR/ --no-build-isolation $TE_DIR
179182
fi
180183
}
181184

@@ -308,11 +311,6 @@ extra() {
308311
"git+https://github.com/NVIDIA/nvidia-resiliency-ext.git@b6eb61dbf9fe272b1a943b1b0d9efdde99df0737 ; platform_machine == 'x86_64'" # Compiling NvRX requires CUDA
309312
)
310313
fi
311-
if [[ "${NVIDIA_PYTORCH_VERSION}" != "" ]]; then
312-
patch \
313-
/usr/local/lib/python3.12/dist-packages/torch/accelerator/__init__.py \
314-
/$CURR/external/patches/torch_accelerator_144567_fix.patch
315-
fi
316314

317315
if [[ "$mode" == "install" ]]; then
318316
pip install --force-reinstall --no-deps --no-cache-dir "${DEPS[@]}"

examples/nlp/duplex_text_normalization/README.md

Lines changed: 0 additions & 2 deletions
This file was deleted.

0 commit comments

Comments
 (0)