
Commit b950c28

Merge branch 'main' into farhadr/evo2_cleanup

2 parents: 1f03be2 + 4ab5a21

43 files changed (+690, -105 lines)


.github/codecov.yml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+codecov:
+  require_ci_to_pass: false
+
+coverage:
+  status:
+    project:
+      default:
+        target: auto
+        threshold: 5
+
+comment:
+  layout: "diff, flags, files"
+  behavior: default
+  require_changes: false # if true: only post the comment if coverage changes
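
The new Codecov configuration reports a project status with target: auto and a 5-point threshold, posts a diff/flags/files comment, and does not wait for the rest of CI to pass before reporting (require_ci_to_pass: false). Codecov also documents a public validator for this file; a minimal sketch, assuming the file sits at .github/codecov.yml as added here:

    # Ask Codecov's YAML validator to check the config before merging.
    curl -fsS --data-binary @.github/codecov.yml https://codecov.io/validate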

.github/dependabot.yml

Lines changed: 10 additions & 0 deletions
@@ -10,3 +10,13 @@ updates:
     reviewers:
       - "pstjohn"
       - "jstjohn"
+  - package-ecosystem: "docker"
+    directory: "/"
+    target-branch: "main"
+    open-pull-requests-limit: 1
+    schedule:
+      interval: "weekly"
+    reviewers:
+      - "pstjohn"
+      - "dorotat-nv"
+      - "trvachov"

.github/workflows/bionemo-subpackage-ci.yml

Lines changed: 85 additions & 24 deletions
@@ -5,24 +5,52 @@ on:
   workflow_dispatch:
     inputs:
       subpackages:
-        description: BioNeMo sub-packages (comma-separated) to test or publish.
+        description: "BioNeMo sub-packages (comma-separated) to test or publish."
         required: true
         type: string
       test:
-        description: Test the sub-packages before publishing to PyPI. Strongly recommended for production releases to PyPI. Can be disabled when staging sub-packages on Test PyPI or publishing circular dependencies to PyPI.
+        description: "Test the sub-packages before publishing to PyPI. Strongly recommended for production releases to PyPI. Can be disabled when staging sub-packages on Test PyPI or publishing circular dependencies to PyPI."
         required: false
         type: boolean
         default: true
       publish:
-        description: Publish the built package to PyPI. If testing is specified, requires that all sub-package tests succeed based on dependencies published to Test PyPI or PyPI.
+        description: "Publish the built package to PyPI. If testing is specified, requires that all sub-package tests succeed based on dependencies published to Test PyPI or PyPI."
         required: false
         type: boolean
         default: false
       pypi:
-        description: Publish to PyPI instead of Test PyPI.
+        description: "Publish to PyPI instead of Test PyPI."
         required: false
         type: boolean
         default: false
+      version_overwrite:
+        description: "Overwrite the published version of the sub-package. (Sets skip-existing to False. Requires deleting existing wheels and other artifacts on PyPI.)"
+        required: false
+        type: boolean
+        default: false
+      build_framework:
+        description: "Build framework to use for building and publishing."
+        type: choice
+        options:
+          - "python"
+          - "rust_pyo3_maturin"
+        default: "python"
+        required: true
+      python_version:
+        description: "Python version to use for testing and publishing."
+        required: false
+        type: string
+        default: "3.12"
+      gpu_runner:
+        description: "Specify a GPU runner for testing on NVIDIA GitHub Actions. (For a list of available runners, refer to: https://docs.gha-runners.nvidia.com/runners/)"
+        required: false
+        type: string
+        default: "linux-amd64-gpu-l4-latest-1"
+      cuda_version:
+        description: "NVIDIA CUDA container version to use for testing."
+        required: false
+        type: string
+        default: "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"

 jobs:
   configure-workflow-packages:
@@ -49,18 +77,19 @@ jobs:
         package: ${{ fromJson(needs.configure-workflow-packages.outputs.workflow_packages) }}
       fail-fast: false # Prevent all matrix jobs from failing if one fails.
     name: "[${{ matrix.package }}] Install and test sub-package."
-    runs-on: linux-amd64-gpu-l4-latest-1
-    container: # GPU jobs must run in a container. Use a fresh base container for package installation and testing.
-      image: nvidia/cuda:12.8.1-base-ubuntu22.04
+    # Use GPU runner only when testing, otherwise use a standard runner.
+    runs-on: ${{ github.event.inputs.test == 'true' && github.event.inputs.gpu_runner || 'ubuntu-latest' }}
+    container:
+      # GPU jobs must run in a container. Use a fresh CUDA base container for package installation and testing.
+      # If testing is disabled, use a lightweight container to quickly skip this job.
+      image: ${{ github.event.inputs.test == 'true' && github.event.inputs.cuda_version || 'ubuntu:latest' }}
     steps:
       # Silently skip all steps if testing is disabled, which does not block building or publishing.
       - name: Install git and system dependencies.
         if: ${{ github.event.inputs.test == 'true' }}
         run: |
           apt-get update
-          apt-get install -y git
-          apt-get install -y lsb-release # No longer pre-installed in Ubuntu>=22.04.
-          apt-get install -y build-essential # For installing C build tools, like GCC and make.
+          apt-get install -qyy git curl lsb-release build-essential
       - uses: actions/checkout@v4
         if: ${{ github.event.inputs.test == 'true' }}
         with:
@@ -69,16 +98,41 @@
       - uses: actions/setup-python@v5
         if: ${{ github.event.inputs.test == 'true' }}
         with:
-          python-version: "3.12"
-      - id: install-dispatch-subpackage
+          python-version: ${{ github.event.inputs.python_version }}
+      - id: install-rust
+        if: ${{ github.event.inputs.test == 'true' }}
+        name: Install Rust.
+        run: |
+          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+          . $HOME/.cargo/env
+          rustc --version
+          cargo --version
+          rustup --version
+      - id: install-subpackage-core
         if: ${{ github.event.inputs.test == 'true' }}
         name: Install sub-package.
         run: |
+          # Setup environment, i.e. add Rust to PATH and silence pip root user warnings.
+          . $HOME/.cargo/env
           # Install sub-package and dependencies.
-          pip install --upgrade pip setuptools
-          pip install pytest
+          pip install --upgrade pip setuptools uv maturin
           # Install required core & optional [test] dependencies.
-          pip install --no-cache-dir 'sub-packages/${{ matrix.package }}[test]'
+          uv pip install --no-cache --system pytest sub-packages/${{ matrix.package }}[test]
+      - id: install-subpackage-post
+        if: ${{ github.event.inputs.test == 'true' }}
+        name: Install sub-package dependencies that need to be installed after the core dependencies.
+        run: |
+          # DEV: Post-install dependencies are configured in [project.optional-dependencies].
+          # `uv pip install --extra <optional-dependency> -r <pyproject.toml>` tracks
+          # post-dependencies in the pyproject.toml and avoids installing core dependencies
+          # redundantly, which causes errors with incompatible --config-setting.
+
+          # TransformerEngine
+          uv pip install --no-cache --no-build-isolation --system --extra te -r sub-packages/${{ matrix.package }}/pyproject.toml || echo "[BioNeMo Sub-Package CI] TE will not be installed."
+
+          # # Apex
+          # # NOTE: --cpp_ext and --cuda_ext are required for building fused Apex kernels.
+          # uv pip install --no-cache --no-build-isolation --system --config-setting="--build-option=--cpp_ext" --config-setting="--build-option=--cuda_ext" --extra apex -r sub-packages/${{ matrix.package }}/pyproject.toml || echo "[BioNeMo Sub-Package CI] Apex will not be installed."
       - id: test-dispatch-subpackage
         if: ${{ github.event.inputs.test == 'true' }}
         name: Test sub-package.
@@ -103,23 +157,30 @@
           persist-credentials: false
       - uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          python-version: ${{ github.event.inputs.python_version }}
       - id: build-package
         name: Build a binary wheel and a source tarball for the sub-package.
         run: |
-          if [[ "${{ github.event.inputs.test }}" != "true" ]]; then
+          if [[ "${{ github.event.inputs.test }}" != "true" && "${{ github.event.inputs.version_overwrite }}" != "true" ]]; then
             # For untested sub-packages, append '-dev' to the version for PyPI.
             sed -i 's/[[:space:]]*$//' sub-packages/${{ matrix.package }}/VERSION
             sed -i 's/$/-dev/' sub-packages/${{ matrix.package }}/VERSION
           fi
-          python -m pip install build
-          python -m build sub-packages/${{ matrix.package }}
+          # Build the sub-package.
+          if [[ "${{ github.event.inputs.build_framework }}" == "python" ]]; then
+            pip install build
+            python -m build sub-packages/${{ matrix.package }}
+          elif [[ "${{ github.event.inputs.build_framework }}" == "rust_pyo3_maturin" ]]; then
+            # Install maturin[zig] to build the Rust sub-package with compatibility for manylinux_X_Y using zig.
+            pip install maturin[zig]
+            maturin build --release --zig -m sub-packages/${{ matrix.package }}/Cargo.toml
+          fi
       - id: upload-distribution
         name: Upload distribution packages to the workflow.
         uses: actions/upload-artifact@v4
         with:
-          name: ${{ matrix.package }}-dist
-          path: sub-packages/${{ matrix.package }}/dist
+          name: ${{ matrix.package }}-build-artifacts
+          path: ${{ github.event.inputs.build_framework == 'rust_pyo3_maturin' && format('sub-packages/{0}/target/wheels', matrix.package) || format('sub-packages/{0}/dist', matrix.package) }}

   publish-to-pypi:
     needs: [build-pypi, install-and-test]
@@ -141,7 +202,7 @@
         name: Download the built distribution.
         uses: actions/download-artifact@v4
         with:
-          name: ${{ matrix.package }}-dist
+          name: ${{ matrix.package }}-build-artifacts
           path: sub-packages/${{ matrix.package }}/dist
       - id: publish-to-testpypi
         name: Publish distribution 📦 to Test PyPI for PR.
@@ -151,7 +212,7 @@
           verbose: true
           packages-dir: sub-packages/${{ matrix.package }}/dist
           repository-url: https://test.pypi.org/legacy/
-          skip-existing: true
+          skip-existing: ${{ github.event.inputs.version_overwrite }}
       - id: publish-to-pypi
         name: Publish distribution 📦 to PyPI for Workflow Dispatch.
         # To require testing before publishing to PyPI, add: ... && needs.install-and-test.result == 'success'
@@ -162,4 +223,4 @@
         with:
           verbose: true
           packages-dir: sub-packages/${{ matrix.package }}/dist
-          skip-existing: true
+          skip-existing: ${{ github.event.inputs.version_overwrite }}
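
The runs-on and container.image entries above use the GitHub Actions "A && B || C" expression idiom as a conditional: when the test input is 'true' they resolve to the gpu_runner and cuda_version inputs, otherwise to a plain ubuntu runner and image so the job is skipped cheaply. A hedged sketch of dispatching the updated workflow manually with the GitHub CLI (bionemo-core is only an example sub-package name; unspecified inputs fall back to the defaults declared above):

    # Trigger the sub-package CI by hand; -f sets the workflow_dispatch inputs.
    gh workflow run bionemo-subpackage-ci.yml \
      -f subpackages="bionemo-core" \
      -f test=true \
      -f build_framework=python \
      -f python_version=3.12 \
      -f gpu_runner=linux-amd64-gpu-l4-latest-1

A Rust/PyO3 sub-package would instead pass -f build_framework=rust_pyo3_maturin, which routes the build step through maturin and uploads wheels from target/wheels rather than dist.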

Dockerfile

Lines changed: 14 additions & 5 deletions
@@ -21,7 +21,7 @@
 # training loss curves from NeMo.
 ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.01-py3

-FROM rust:1.82.0 AS rust-env
+FROM rust:1.86.0 AS rust-env

 RUN rustup set profile minimal && \
     rustup install 1.82.0 && \
@@ -114,6 +114,12 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
     pip install .; \
     fi

+# On ARM, bits and bytes needs to be built from scratch
+RUN if [ "$TARGETARCH" = "arm64" ]; then \
+    cd / && pip uninstall bitsandbytes && \
+    git clone --single-branch --branch 0.45.5 https://github.com/bitsandbytes-foundation/bitsandbytes.git && \
+    cd bitsandbytes && pip install . && cd .. && rm -rf bitsandbytes; \
+    fi
 ###############################################################################
 # /end ARM
 ###############################################################################
@@ -128,6 +134,9 @@ RUN pip install hatchling urllib3 # needed to install nemo-run
 ARG NEMU_RUN_TAG=v0.3.0
 RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_TAG} --use-deprecated=legacy-resolver

+# Rapids SingleCell Installation
+RUN pip install 'rapids-singlecell' --extra-index-url=https://pypi.nvidia.com
+
 RUN mkdir -p /workspace/bionemo2/

 WORKDIR /workspace
@@ -140,7 +149,7 @@ RUN rm -rf /opt/pytorch/pytorch/third_party/onnx
 # environment, and does not use the current uv.lock file. Note that with python 3.12, we now need to set
 # UV_BREAK_SYSTEM_PACKAGES, since the pytorch base image has made the decision not to use a virtual environment and UV
 # does not respect the PIP_BREAK_SYSTEM_PACKAGES environment variable set in the base dockerfile.
-COPY --from=ghcr.io/astral-sh/uv:0.4.25 /uv /usr/local/bin/uv
+COPY --from=ghcr.io/astral-sh/uv:0.6.13 /uv /usr/local/bin/uv
 ENV UV_LINK_MODE=copy \
     UV_COMPILE_BYTECODE=1 \
     UV_PYTHON_DOWNLOADS=never \
@@ -175,7 +184,7 @@ uv pip install maturin --no-build-isolation
 git clone https://github.com/NVIDIA/nvidia-resiliency-ext
 uv pip install nvidia-resiliency-ext/
 rm -rf nvidia-resiliency-ext/
-# ngcsdk causes strange dependency conflicts that we will resolve later
+# ngcsdk causes strange dependency conflicts (ngcsdk requires protobuf<4, but nemo_toolkit requires protobuf==4.24.4, deleting it from the uv pip install prevents installation conflicts)
 sed -i "/ngcsdk/d" ./sub-packages/bionemo-core/pyproject.toml
 # Remove llama-index because bionemo doesn't use it and it adds CVEs to container
 sed -i "/llama-index/d" ./3rdparty/NeMo/requirements/requirements_nlp.txt
@@ -185,7 +194,7 @@ uv pip install --no-build-isolation \
     -r /requirements-cve.txt \
     -r /requirements-test.txt

-# Install back ngcsdk. Somehow doing it here avoids a large dependency loop
+# Install back ngcsdk, as a WAR for the protobuf version conflict with nemo_toolkit.
 uv pip install ngcsdk

 # Addressing security scan issue - CVE vulnerability https://github.com/advisories/GHSA-g4r7-86gm-pgqc The package is a
@@ -234,7 +243,7 @@ USER $USERNAME
 COPY --from=bionemo2-base --chown=$USERNAME:$USERNAME --chmod=777 \
     /usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages

-COPY --from=ghcr.io/astral-sh/uv:0.4.25 /uv /usr/local/bin/uv
+COPY --from=ghcr.io/astral-sh/uv:0.6.13 /uv /usr/local/bin/uv
 ENV UV_LINK_MODE=copy \
     UV_COMPILE_BYTECODE=0 \
     UV_PYTHON_DOWNLOADS=never \
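
The new bitsandbytes step only runs when the TARGETARCH build argument resolves to arm64, and TARGETARCH is populated automatically by BuildKit from the selected target platform. A minimal sketch of exercising that path (the image tag and the use of docker buildx here are assumptions, not part of this commit):

    # Build for linux/arm64; BuildKit sets TARGETARCH=arm64, so the
    # bitsandbytes-from-source step runs, while amd64 builds skip it.
    docker buildx build --platform linux/arm64 -t bionemo-framework:arm64-local .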

README.md

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@ fastest performance on the market. You can access BioNeMo Framework as a free co
 or learn more at <https://www.nvidia.com/en-us/clara/bionemo/> about getting an enterprise license for improved
 expert-level support.

+BioNeMo Framework is part of a larger ecosystem of NVIDIA Biopharma products. Get notified of new releases, bug fixes, critical security updates, and more for biopharma. [Subscribe.](https://www.nvidia.com/en-us/clara/biopharma/product-updates/)
+
 ## Structure of the Framework

 The `bionemo-framework` is organized into independently installable namespace packages. These are located under the

ci/benchmarks/partial-conv/esm2_pretrain.yaml

Lines changed: 3 additions & 1 deletion
@@ -25,7 +25,8 @@ script_args:
   batch_size:
     value: 16
   max_steps:
-    value: 26500
+    value: 500000
+  stop_steps: 26500
 script: |-
   WANDB_API_KEY=$BIONEMO_WANDB_API_KEY ${variant}_${model} \
     --train-cluster-path=${data_path}/train_clusters.parquet \
@@ -38,6 +39,7 @@ script: |-
     --val-check-interval=1000 \
     --limit-val-batches=1 \
     --num-steps=${max_steps} \
+    --early-stop-on-step ${stop_steps} \
     --min-seq-length=1024 \
     --max-seq-length=1024 \
     --num-layers=33 \
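
Splitting max_steps (now 500000) from the new stop_steps (26500) presumably lets the learning-rate schedule be sized for a full run while the partial-convergence benchmark still halts at the old step count via --early-stop-on-step. A rough sketch of how the two values land in the launched command after template substitution (the train_esm2 entrypoint name is assumed from the ${variant}_${model} pattern and this file's name):

    # Hypothetical expansion with the new values: --num-steps sizes the schedule,
    # --early-stop-on-step ends the benchmark run at the former 26500-step cutoff.
    train_esm2 --num-steps=500000 --early-stop-on-step 26500  # ...remaining flags as in the script block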

ci/benchmarks/partial-conv/evo2_pretrain.yaml

Lines changed: 2 additions & 1 deletion
@@ -68,7 +68,8 @@ script: |-
     --devices=${gpus} \
     --micro-batch-size=${batch_size} \
     --model-size=${config_name} \
-    --max-steps=${max_steps} --early-stop-on-step ${stop_steps} \
+    --max-steps=${max_steps} \
+    --early-stop-on-step ${stop_steps} \
     --limit-val-batches=20 \
     --log-every-n-steps=50 \
     --val-check-interval=500 \
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+scope: partial-conv
+time_limit: 14400
+script_args:
+  # All arguments referenced in the script string must be specified here.
+  # Arguments not referenced in the script string must have the 'arg' field specified.
+  # See jet/core/configs.py for the specification of the configuration class
+  workspace:
+    value: /workspace/bionemo2
+    key_segment: False
+  data_path:
+    value: /data/cellxgene_scdl
+    key_segment: False
+  model:
+    value: geneformer
+  variant:
+    value: train
+  config_name:
+    value: geneformer_config
+  precision:
+    value: [bf16-mixed]
+  nodes:
+    value: [2]
+  gpus:
+    value: 8
+  batch_size:
+    value: 32
+  max_steps:
+    value: 37000
+  lr:
+    value: 0.001
+  val_check_interval:
+    value: 500
+  acc:
+    value: 1
+
+script: |-
+  WANDB_API_KEY=$BIONEMO_WANDB_API_KEY ${variant}_${model} \
+    --data-dir ${data_path} \
+    --experiment-name ${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s_${precision}prec \
+    --num-gpus ${gpus} \
+    --save-last-checkpoint \
+    --num-nodes ${nodes} \
+    --val-check-interval ${val_check_interval} \
+    --num-dataset-workers 8 \
+    --num-steps ${max_steps} \
+    --seq-length 2048 \
+    --limit-val-batches 8 \
+    --micro-batch-size ${batch_size} \
+    --resume-if-exists \
+    --log-every-n-steps 50 \
+    --lr ${lr} \
+    --create-tensorboard-logger \
+    --result-dir=${tensorboard_dir} \
+    --wandb-project ${wandb_project_name} \
+    --wandb-job-type=${pipeline_label} \
+    --cosine-rampup-frac 0.004331629559040111 \
+    --cosine-hold-frac 0.021658147795200554 \
+    --accumulate-grad-batches ${acc} \
+    --precision ${precision} \
+    --disable-checkpointing;
