address PR comments and add proper docstrings and type hints in the c… #131
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CI pipeline — runner on fela (as bench), GPU work dispatched via single Slurm srun.
#
# The self-hosted runner runs as user bench on fela. All GPU work (build,
# CUDA tests, benchmarks, JIT tests) runs inside a single srun allocation
# to avoid re-queuing overhead. Lint runs directly on fela (no GPU needed).
#
# All state (pip packages, build artifacts, checkout) lives on /net/scratch
# and persists across steps automatically.
name: CI
on:
  push:
    branches: [main, "kdidi/**"]
  pull_request:
    branches: [main]
# One run per ref: a newer push cancels the in-flight run so the single
# self-hosted GPU runner is not tied up by stale commits.
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true
env:
  # Apptainer image (NGC PyTorch 25.11 container — see GPU step comments).
  # NOTE(review): path lives under a personal scratch dir (kdidi) — confirm
  # this is the intended long-term location for a shared CI image.
  SIF: /net/scratch/kdidi/pytorch_25.11-py3-x11.sif
  # Persistent pip prefix shared across runs; bound into the container
  # as /ci-pip-pkgs by every apptainer invocation below.
  PIP_PKGS: /net/scratch/bench/ci-pip-pkgs
jobs:
  ci:
    runs-on: [self-hosted, gpu]
    timeout-minutes: 720
    steps:
      - name: Checkout
        uses: actions/checkout@v5
        with:
          submodules: recursive

      - name: Restore source timestamps
        run: |
          # Restore git commit timestamps on C++/CUDA sources so ninja doesn't
          # rebuild unchanged files. Without this, checkout sets all mtimes to
          # "now", invalidating the JIT extension cache every run (~2h rebuild).
          # NUL-delimited (-z / -0) so paths containing spaces or quotes
          # survive the pipe into xargs intact.
          git ls-files -z -- '*.cpp' '*.cu' '*.cuh' '*.h' '*.hh' '*.cc' | \
            xargs -0 -P8 -I{} bash -c 'ts=$(git log -1 --format="%ct" -- "$1") && touch -d "@$ts" "$1"' _ {}
      - name: Lint
        # Lint problems are surfaced as warnings; they never fail the job
        # (both via continue-on-error and the `|| echo ::warning::` below).
        continue-on-error: true
        run: |
          apptainer exec \
            --bind /net/scratch:/net/scratch \
            --bind $PIP_PKGS:/ci-pip-pkgs \
            --pwd $GITHUB_WORKSPACE \
            $SIF bash << 'LINT'
          set -ex
          # Container's python X.Y — locates the matching site-packages dir
          # inside the persistent /ci-pip-pkgs prefix.
          PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
          export PATH=/ci-pip-pkgs/bin:$HOME/.local/bin:$PATH
          export PYTHONPATH=$PWD:/ci-pip-pkgs/lib/python${PYVER}/site-packages:$PYTHONPATH
          # Best-effort install into the shared prefix; `|| true` is deliberate
          # so cached/offline runs proceed with the already-installed copy.
          pip install --break-system-packages --target /ci-pip-pkgs/lib/python${PYVER}/site-packages \
            pre-commit 2>/dev/null || true
          pre-commit install
          pre-commit run --all-files --show-diff-on-failure || echo "::warning::pre-commit found issues"
          LINT
| - name: API quality gates | |
| run: | | |
| python3 - << 'PY' | |
| import ast | |
| from pathlib import Path | |
| root = Path(".") | |
| checks = [ | |
| ("tmol/ligand/__init__.py", "prepare_ligands"), | |
| ("tmol/io/pose_stack_from_biotite.py", "build_context_from_biotite"), | |
| ("tmol/io/pose_stack_from_biotite.py", "pose_stack_from_biotite"), | |
| ("tmol/database/__init__.py", "add_residue_type"), | |
| ("tmol/score/__init__.py", "beta2016_score_function"), | |
| ("tmol/relax/fast_relax.py", "fast_relax"), | |
| ] | |
| errors = [] | |
| for rel_path, fn_name in checks: | |
| tree = ast.parse((root / rel_path).read_text()) | |
| fn = next( | |
| (n for n in tree.body if isinstance(n, ast.FunctionDef) and n.name == fn_name), | |
| None, | |
| ) | |
| if fn is None: | |
| errors.append(f"{rel_path}: missing function {fn_name}") | |
| continue | |
| doc = ast.get_docstring(fn) or "" | |
| if "Args:" not in doc or "Returns:" not in doc: | |
| errors.append(f"{rel_path}:{fn_name} missing Google-style Args/Returns") | |
| for rel_path in { | |
| "tmol/ligand/__init__.py", | |
| "tmol/io/pose_stack_from_biotite.py", | |
| "tmol/database/__init__.py", | |
| "tmol/score/__init__.py", | |
| "tmol/relax/fast_relax.py", | |
| }: | |
| tree = ast.parse((root / rel_path).read_text()) | |
| for node in ast.walk(tree): | |
| if ( | |
| isinstance(node, ast.Call) | |
| and isinstance(node.func, ast.Name) | |
| and node.func.id == "print" | |
| ): | |
| errors.append(f"{rel_path}: print() is not allowed in library code") | |
| if errors: | |
| raise SystemExit("\n".join(errors)) | |
| PY | |
      # All GPU work in a single srun allocation: setup, build, tests, benchmarks.
      # This avoids re-queuing for each step and keeps the environment consistent.
      # Use Ampere (a6000, sm_86) — the NGC 25.11 container (Python 3.12, PyTorch 2.9).
      # Switch a6000 -> b4000 once Blackwell support is confirmed.
      - name: Build and test (GPU)
        run: |
          mkdir -p $PIP_PKGS
          # --export=ALL forwards the runner's env ($GITHUB_WORKSPACE, $SIF, …)
          # into the srun allocation so the heredoc script can use it.
          srun --gres=gpu:a6000:1 -p gpu --export=ALL \
            -t 10:00:00 --cpus-per-task=16 --mem=128G \
            apptainer exec --nv --writable-tmpfs \
            --bind /net/scratch:/net/scratch \
            --bind $PIP_PKGS:/ci-pip-pkgs \
            --pwd $GITHUB_WORKSPACE \
            $SIF bash << 'GPU_WORK'
          set -ex
          # Container's python X.Y — locates the persistent site-packages dir.
          PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
          export PYTHONUSERBASE=/ci-pip-pkgs
          export PATH=/ci-pip-pkgs/bin:$HOME/.local/bin:$PATH
          export PYTHONPATH=$GITHUB_WORKSPACE:/ci-pip-pkgs/lib/python${PYVER}/site-packages:$PYTHONPATH
          # ── Setup ────────────────────────────────────────────────────── 
          command -v uv >/dev/null 2>&1 || pip install --user uv
          TMPDIR=$GITHUB_WORKSPACE/.ci-tmp
          mkdir -p $TMPDIR
          # Pin container-provided packages so `uv pip compile` cannot pick
          # versions that conflict with the NGC image; the grep drops local/
          # editable installs and tools that must be free to float.
          pip freeze | grep -vE '(file://|^-e |^#|^pandas==|^scipy==|^clang-format==|^black==|^flake8==|^pathspec==|^click==|^mypy-extensions==|^packaging==)' > $TMPDIR/container_pins.txt
          uv pip compile pyproject.toml --all-extras \
            --constraint $TMPDIR/container_pins.txt \
            --output-file $TMPDIR/requirements.txt
          # Never reinstall the GPU stack the container already ships.
          grep -vE "^(torch(|vision|audio)|numpy|nvidia-.*|triton|tensorrt|pynvml|pandas|scipy)==" \
            $TMPDIR/requirements.txt > $TMPDIR/to_install.txt
          uv pip install --prefix /ci-pip-pkgs --python /usr/bin/python3 --break-system-packages \
            -r $TMPDIR/to_install.txt 'packaging>=24.2' 'scikit-build-core>=0.10' 'numpy<2'
          # Smoke-test a fragile native dependency before the long build.
          python3 -c "from openbabel import pybel; print('openbabel OK:', pybel.readstring('smi', 'CCO').formula)"
          # ── Build ────────────────────────────────────────────────────── 
          rm -rf build/
          export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
          export CMAKE_PREFIX_PATH=$(python3 -c 'import torch; print(torch.utils.cmake_prefix_path)')
          # Build only for the compute capability of the allocated GPU.
          GPU_ARCH=$(python3 -c "import torch; c=torch.cuda.get_device_capability(0); print(f'{c[0]}{c[1]}')")
          echo "=== Building for GPU architecture: sm_${GPU_ARCH} ==="
          MAX_JOBS=12 pip install -v --no-build-isolation --no-deps --prefix /ci-pip-pkgs \
            -Ccmake.define.CMAKE_CUDA_ARCHITECTURES="${GPU_ARCH}" \
            -Ccmake.define.TMOL_BUILD_TESTS=ON \
            -Ccmake.define.TMOL_NVCC_THREADS=2 \
            -e .
          # ── Environment info ─────────────────────────────────────────── 
          echo "=== environment ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}, GPUs: {torch.cuda.device_count()}')"
          nvidia-smi || true
          # NOTE(review): every pytest/benchmark invocation below ends in
          # `|| true`, so test failures never fail this job — results are only
          # visible in the uploaded JUnit/benchmark artifacts. Confirm this is
          # intended rather than aggregating exit codes at the end.
          # ── Test CPU ─────────────────────────────────────────────────── 
          echo "=== tests (CPU only) ==="
          CUDA_VISIBLE_DEVICES="" pytest -p no:rerunfailures -p instafail -s -v --tb=short --durations=25 \
            --cov=./tmol --junitxml=testing.cpu.junit.xml -k "not cuda" || true
          # ── Test CUDA ────────────────────────────────────────────────── 
          echo "=== tests (CUDA) ==="
          pytest -p no:rerunfailures -p instafail -s -v --tb=short --durations=25 --cov=./tmol \
            --junitxml=testing.cuda.junit.xml -k "cuda" || true
          # ── Benchmark ────────────────────────────────────────────────── 
          echo "=== benchmarks ==="
          # Per-branch result dir; run number keys each JSON so `compare`
          # below can diff against history accumulated on /net/scratch.
          BENCHMARK_DIR=benchmark/$GITHUB_REPOSITORY/$GITHUB_REF_NAME
          BENCHMARK_RESULT=${BENCHMARK_DIR}/${GITHUB_RUN_NUMBER}.json
          mkdir -p $BENCHMARK_DIR
          pytest -p no:rerunfailures --benchmark-enable --benchmark-only \
            --benchmark-name=short --benchmark-sort=fullname \
            --benchmark-columns=ops,mean,iqr \
            --benchmark-json=${BENCHMARK_RESULT} \
            --benchmark-max-time=.1 || true
          pytest-benchmark compare --name=short --sort=fullname \
            --columns=ops,mean,iqr $(find benchmark -name '*.json') || true
          # ── Test CUDA (JIT) ──────────────────────────────────────────── 
          echo "=== tests (CUDA JIT) ==="
          export TMOL_USE_JIT=1
          export TORCH_CUDA_ARCH_LIST=$(python3 -c "import torch; c=torch.cuda.get_device_capability(0); print(f'{c[0]}.{c[1]}')")
          # Persist JIT artifacts so the timestamp-restore step keeps them warm.
          export TORCH_EXTENSIONS_DIR=/ci-pip-pkgs/torch_extensions
          echo "JIT compiling for TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}"
          pytest -p no:rerunfailures -p instafail -s -v --tb=short --durations=25 --cov=./tmol \
            --junitxml=testing.cuda.jit.junit.xml -k "cuda" || true
          GPU_WORK
      - name: Upload test results
        if: always()  # publish JUnit results even when earlier steps failed
        uses: actions/upload-artifact@v4
        with:
          name: test-results
          path: |
            testing.cpu.junit.xml
            testing.cuda.junit.xml
            testing.cuda.jit.junit.xml
      - name: Upload benchmark results
        if: always()  # benchmark history is useful even on a red run
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: benchmark/**/*