Skip to content

Commit dcfc176

Browse files
committed
Merge remote-tracking branch 'origin/main' into chtruong/re-add-libsox
Signed-off-by: Charlie Truong <[email protected]>
2 parents 70196a6 + 9bb58fe commit dcfc176

File tree

131 files changed

+4382
-764
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

131 files changed

+4382
-764
lines changed

.github/scripts/components_to_run.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,6 @@ def main(source_sha: str, target_sha: str):
7676

7777
test_modules = list(set(test_modules))
7878

79-
if len(test_modules) == 0:
80-
test_modules = ["nemo2", "automodel", "export-deploy", "speech"]
81-
8279
with open("test_modules.json", "w", encoding="utf-8") as f:
8380
json.dump(test_modules, f)
8481

.github/scripts/nemo_dependencies.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
205205
if parts[1] in top_level_packages and parts[1] != 'collections' and parts[0] != 'tests':
206206
dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path)))
207207
elif parts[0] == 'tests':
208-
dependencies[module_path] = [relative_path]
208+
dependencies[module_path] = [relative_path.replace("/", ".").replace(".py", "")]
209209
elif parts[1] == 'collections':
210210
dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path)))
211211

@@ -252,7 +252,9 @@ def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
252252
for package, deps in dependencies.items():
253253
package_parts = package.split('.')
254254

255-
if os.path.isfile((file_path := f"{os.path.join(*package_parts[:-1])}.py")):
255+
if package_parts[0] == "tests":
256+
simplified_package_path = f"{os.path.join(*package_parts)}.py"
257+
elif os.path.isfile((file_path := f"{os.path.join(*package_parts[:-1])}.py")):
256258
simplified_package_path = file_path
257259
elif os.path.isdir((file_path := f"{os.path.join(*package_parts[:-1])}")):
258260
simplified_package_path = file_path
@@ -267,13 +269,14 @@ def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
267269

268270
if (
269271
len(dep_parts) >= 2
270-
and dep_parts[1] in find_top_level_packages(nemo_root)
272+
and (dep_parts[1] in find_top_level_packages(nemo_root))
271273
and dep_parts[1] != 'collections'
272274
):
273275
simplified_dependencies[simplified_package_path].append(f"{dep_parts[0]}.{dep_parts[1]}")
274-
276+
elif dep_parts[0] == "tests":
277+
simplified_dependencies[simplified_package_path].append(".".join(dep_parts))
275278
elif len(dep_parts) >= 3 and (
276-
simplified_name := f"{dep_parts[0]}.{dep_parts[1]}.{dep_parts[2]}"
279+
simplified_name := f"nemo.{dep_parts[1]}.{dep_parts[2]}"
277280
) in find_collection_modules(nemo_root):
278281
simplified_dependencies[simplified_package_path].append(simplified_name)
279282

@@ -353,7 +356,7 @@ def main():
353356

354357
# Output as JSON
355358
data = json.dumps(dependencies, indent=4)
356-
# print(data)
359+
357360
with open('nemo_dependencies.json', 'w', encoding='utf-8') as f:
358361
f.write(data)
359362

.github/workflows/cicd-main-automodel.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ jobs:
5757
tests_to_run: ${{ inputs.test_to_run }}
5858
image: ${{ inputs.image-name }}
5959
cpu-only: ${{ matrix.cpu-only || false }}
60+
is_optional: ${{ matrix.is-optional || false }}
6061

6162
e2e-tests:
6263
strategy:

.github/workflows/cicd-main-nemo2.yml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ jobs:
4747
runner: self-hosted-azure
4848
- script: L2_NeMo_2_llama3_straggler_detection
4949
runner: self-hosted-azure
50+
- script: L2_NeMo_2_llama3_local_ckpt
51+
runner: self-hosted-azure
5052
- script: L2_NeMo_2_GPT_DDP_Param_Parity_check
5153
runner: self-hosted-azure
5254
- script: L2_NeMo_2_Hyena_Conversion_from_HF
@@ -240,9 +242,20 @@ jobs:
240242
script: L2_NeMo_2_LLAMA4_MOCK_FINETUNE_CP2
241243
- runner: self-hosted-azure
242244
script: L2_NeMo_2_LLAMA4_ENERGON_FINETUNE_EP2
245+
- runner: self-hosted-azure
246+
script: L2_NeMo_2_Diffusion_Recipe_Test
247+
- runner: self-hosted-azure
248+
script: L2_NeMo_2_Diffusion_Taskencoder_Test
249+
- runner: self-hosted-azure
250+
script: L2_NeMo_2_Flux_Import_Test
251+
- runner: self-hosted-azure
252+
script: L2_NeMo_2_Flux_Inference_Test
253+
- runner: self-hosted-azure
254+
script: L2_NeMo_2_Flux_Training_Test
255+
243256
needs: [build]
244257
runs-on: ${{ matrix.runner }}
245-
name: ${{ matrix.script }}
258+
name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
246259
steps:
247260
- name: Checkout
248261
uses: actions/checkout@v4

.github/workflows/cicd-main-speech.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ jobs:
7070
image: ${{ inputs.image-name }}
7171
timeout: ${{ matrix.timeout || 10 }}
7272
cpu-only: ${{ matrix.cpu-only || false }}
73+
is_optional: ${{ matrix.is-optional || false }}
7374

7475
e2e-tests:
7576
strategy:

.github/workflows/cicd-main-unit-tests.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ jobs:
7171
is_unit_test: true
7272
tests_to_run: ${{ inputs.test_to_run }}
7373
cpu-only: ${{ matrix.cpu-only || false }}
74+
is_optional: ${{ matrix.is-optional || false }}
7475

7576
collections-multimodal-tests:
7677
strategy:
@@ -97,7 +98,7 @@ jobs:
9798
is_unit_test: true
9899
tests_to_run: ${{ inputs.test_to_run }}
99100
cpu-only: ${{ matrix.cpu-only || false }}
100-
101+
is_optional: ${{ matrix.is-optional || false }}
101102
collections-vlm-tests:
102103
strategy:
103104
fail-fast: false
@@ -123,6 +124,7 @@ jobs:
123124
is_unit_test: true
124125
tests_to_run: ${{ inputs.test_to_run }}
125126
cpu-only: ${{ matrix.cpu-only || false }}
127+
is_optional: ${{ matrix.is-optional || false }}
126128

127129
core-tests:
128130
strategy:
@@ -180,6 +182,7 @@ jobs:
180182
is_unit_test: true
181183
tests_to_run: ${{ inputs.test_to_run }}
182184
cpu-only: ${{ matrix.cpu-only || false }}
185+
is_optional: ${{ matrix.is-optional || false }}
183186

184187
other-tests:
185188
strategy:
@@ -206,3 +209,4 @@ jobs:
206209
is_unit_test: true
207210
tests_to_run: ${{ inputs.test_to_run }}
208211
cpu-only: ${{ matrix.cpu-only || false }}
212+
is_optional: ${{ matrix.is-optional || false }}

.github/workflows/cicd-main.yml

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,6 @@ on:
2121
- r**
2222
- weekly-bump*
2323
types: [labeled]
24-
push:
25-
branches:
26-
- main
2724
workflow_dispatch:
2825
inputs:
2926
test_to_run:
@@ -134,11 +131,12 @@ jobs:
134131
uses: ./.github/workflows/code-linting.yml
135132

136133
cicd-wait-in-queue:
137-
needs: [pre-flight]
134+
needs: [pre-flight, code-linting]
138135
runs-on: ubuntu-latest
139136
environment: test
140137
if: |
141138
needs.pre-flight.outputs.test_to_run != '[]'
139+
&& needs.pre-flight.outputs.components_to_run != '[]'
142140
&& needs.pre-flight.outputs.is_ci_workload == 'false'
143141
steps:
144142
- name: Running CI tests
@@ -150,6 +148,7 @@ jobs:
150148
needs: [pre-flight, code-linting, cicd-wait-in-queue]
151149
if: |
152150
needs.pre-flight.outputs.test_to_run != '[]'
151+
&& needs.pre-flight.outputs.components_to_run != '[]'
153152
&& (
154153
success()
155154
|| (
@@ -165,6 +164,7 @@ jobs:
165164
cicd-import-tests:
166165
if: |
167166
needs.pre-flight.outputs.test_to_run != '[]'
167+
&& needs.pre-flight.outputs.components_to_run != '[]'
168168
&& (
169169
success()
170170
|| (
@@ -214,6 +214,7 @@ jobs:
214214
runs-on: self-hosted-azure
215215
if: |
216216
needs.pre-flight.outputs.test_to_run != '[]'
217+
&& needs.pre-flight.outputs.components_to_run != '[]'
217218
&& (
218219
success()
219220
|| (
@@ -240,6 +241,7 @@ jobs:
240241
uses: ./.github/workflows/cicd-main-unit-tests.yml
241242
if: |
242243
needs.pre-flight.outputs.test_to_run != '[]'
244+
&& needs.pre-flight.outputs.components_to_run != '[]'
243245
&& (
244246
success()
245247
|| (
@@ -360,12 +362,14 @@ jobs:
360362
GH_TOKEN: ${{ github.token }}
361363
RUN_ID: ${{ github.run_id }}
362364
HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
365+
IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
363366
run: |
364367
# Get workflow run details and check job conclusions
368+
LATEST_ATTEMPT=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion != null) | .conclusion] | last')
365369
NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
366370
NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
367371
368-
if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 && "$HAS_LABEL" == "true" ]]; then
372+
if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 && ("$HAS_LABEL" == "true" || "$IS_SCHEDULED" == "true") ]]; then
369373
RESULT="success"
370374
elif [[ $NUM_CANCELLED -gt 0 ]]; then
371375
RESULT="cancelled"
@@ -445,9 +449,11 @@ jobs:
445449
446450
Coverage:
447451
runs-on: ubuntu-latest
448-
needs: [Nemo_CICD_Test]
452+
needs: [pre-flight, Nemo_CICD_Test]
449453
if: |
450-
(
454+
needs.pre-flight.outputs.test_to_run != '[]'
455+
&& needs.pre-flight.outputs.components_to_run != '[]'
456+
&& (
451457
success()
452458
|| needs.Nemo_CICD_Test.result == 'success'
453459
)

Dockerfile.ci

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616

1717
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
1818

19-
FROM ${BASE_IMAGE}
20-
19+
FROM ${BASE_IMAGE} as base-image
20+
ENV PIP_CONSTRAINT=""
2121
ENV TRANSFORMERS_OFFLINE=0
2222
ENV HYDRA_FULL_ERROR=1
2323
ENV PYTHONUNBUFFERED=1
@@ -29,6 +29,17 @@ apt-get install -y bc libsox-fmt-all
2929
apt-get clean
3030
EOF
3131

32+
FROM base-image as te-wheel
33+
ARG TE_REPO
34+
ARG TE_TAG
35+
RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh \
36+
--mount=type=bind,source=external/patches,target=/tmp/NeMo/external/patches <<"EOF" bash -ex
37+
38+
bash /tmp/NeMo/reinstall.sh --library te --mode build
39+
ls -al /tmp/Megatron-LM || true
40+
EOF
41+
42+
FROM base-image as mcore-wheel
3243
ARG MLM_REPO
3344
ARG MLM_TAG
3445
RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF" bash -ex
@@ -37,8 +48,11 @@ RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF"
3748
ls -al /tmp/Megatron-LM || true
3849
EOF
3950

51+
FROM base-image
4052
WORKDIR /workspace
41-
RUN \
53+
RUN \
54+
--mount=type=bind,from=te-wheel,source=/opt/wheels/te,target=/opt/wheels/te \
55+
--mount=type=bind,from=mcore-wheel,source=/opt/wheels/mcore,target=/opt/wheels/mcore \
4256
--mount=type=bind,source=requirements,target=/tmp/NeMo/requirements \
4357
--mount=type=bind,source=tools/ctc_segmentation/requirements.txt,target=/tmp/NeMo/tools/ctc_segmentation/requirements.txt \
4458
--mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh \
@@ -48,6 +62,7 @@ RUN \
4862
--mount=type=bind,source=nemo/package_info.py,target=/tmp/NeMo/nemo/package_info.py \
4963
--mount=type=bind,source=nemo/__init__.py,target=/tmp/NeMo/nemo/__init__.py <<"EOF" bash -ex
5064

65+
bash /tmp/NeMo/reinstall.sh --library te --mode install
5166
bash /tmp/NeMo/reinstall.sh --library mcore --mode install
5267
bash /tmp/NeMo/reinstall.sh --library nemo --mode install
5368
rm -rf $NEMO_DIR || true

Dockerfile.ci.export_deploy

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616

1717
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
1818

19-
FROM ${BASE_IMAGE} AS trt-base
19+
FROM ${BASE_IMAGE} AS base-image
20+
ENV PIP_CONSTRAINT=""
2021
ARG IMAGE_LABEL
2122
LABEL "nemo.library"=${IMAGE_LABEL}
2223

@@ -39,18 +40,25 @@ RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF"
3940
EOF
4041

4142

42-
FROM trt-base AS trt-llm-wheel
43-
43+
FROM base-image AS trt-llm-wheel
4444
ARG TRTLLM_REPO
4545
ARG TRTLLM_TAG
4646
RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF" bash -ex
4747

4848
bash /tmp/NeMo/reinstall.sh --library trtllm --mode build
4949
EOF
5050

51+
FROM base-image as te-wheel
52+
ARG TE_REPO
53+
ARG TE_TAG
54+
RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh \
55+
--mount=type=bind,source=external/patches,target=/tmp/NeMo/external/patches <<"EOF" bash -ex
56+
57+
bash /tmp/NeMo/reinstall.sh --library te --mode build
58+
ls -al /tmp/Megatron-LM || true
59+
EOF
5160

52-
FROM trt-base as final
53-
61+
FROM base-image as mcore-wheel
5462
ARG MLM_REPO
5563
ARG MLM_TAG
5664
RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF" bash -ex
@@ -59,9 +67,13 @@ RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF"
5967
ls -al /tmp/Megatron-LM || true
6068
EOF
6169

70+
71+
FROM base-image
6272
WORKDIR /workspace
6373
RUN \
6474
--mount=type=bind,from=trt-llm-wheel,source=/opt/wheels/trtllm,target=/opt/wheels/trtllm \
75+
--mount=type=bind,from=te-wheel,source=/opt/wheels/te,target=/opt/wheels/te \
76+
--mount=type=bind,from=mcore-wheel,source=/opt/wheels/mcore,target=/opt/wheels/mcore \
6577
--mount=type=bind,source=requirements,target=/tmp/NeMo/requirements \
6678
--mount=type=bind,source=tools/ctc_segmentation/requirements.txt,target=/tmp/NeMo/tools/ctc_segmentation/requirements.txt \
6779
--mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh \

docs/links_needing_review.json

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -102,14 +102,6 @@
102102
"uri": "https://docs.nvidia.com/nemo-framework/user-guide/latest/multimodalmodels/index.html",
103103
"info": "404 Client Error: Not Found for url: https://docs.nvidia.com/nemo-framework/user-guide/latest/multimodalmodels/index.html"
104104
}
105-
{
106-
"filename": "core/api.rst",
107-
"lineno": 2,
108-
"status": "broken",
109-
"code": 0,
110-
"uri": "https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/core/mixins/device_dtype_mixin.py#L113",
111-
"info": "404 Client Error: Not Found for url: https://github.com/Lightning-AI/pytorch-lightning/blob/master/src/pytorch_lightning/core/mixins/device_dtype_mixin.py"
112-
}
113105
{
114106
"filename": "multimodal/vlm/clip.rst",
115107
"lineno": 140,
@@ -302,14 +294,6 @@
302294
"uri": "https://github.com/webdataset/webdataset#multinode-training",
303295
"info": "Anchor 'multinode-training' not found"
304296
}
305-
{
306-
"filename": "core/api.rst",
307-
"lineno": 6,
308-
"status": "broken",
309-
"code": 0,
310-
"uri": "https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/module.py#L728",
311-
"info": "Anchor 'L728' not found"
312-
}
313297
{
314298
"filename": "nlp/question_answering.rst",
315299
"lineno": 196,

0 commit comments

Comments
 (0)