# add new ci pipeline (#135)
---
# CI pipeline — runner on fela (as bench), GPU work dispatched via a single Slurm srun.
#
# The self-hosted runner runs as user bench on fela. All GPU work (build,
# CUDA tests, benchmarks, JIT tests) runs inside a single srun allocation
# to avoid re-queuing overhead. Lint runs directly on fela (no GPU needed).
#
# All state (pip packages, build artifacts, checkout) lives on /net/scratch
# and persists across steps automatically.
name: CI

# yamllint disable-line rule:truthy
on:
  push:
    branches: [main, "kdidi/**"]
  pull_request:
    branches: [main]

# One in-flight run per ref; a newer push cancels the older run.
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true

env:
  # Apptainer image (NGC PyTorch 25.11, Python 3.12).
  SIF: /net/scratch/kdidi/pytorch_25.11-py3-x11.sif
  # Persistent pip prefix shared across runs; bound as /ci-pip-pkgs inside containers.
  PIP_PKGS: /net/scratch/bench/ci-pip-pkgs

jobs:
  ci:
    runs-on: [self-hosted, gpu]
    timeout-minutes: 720
    steps:
      - name: Checkout
        uses: actions/checkout@v5
        with:
          submodules: recursive

      - name: Restore source timestamps
        run: |
          # Restore git commit timestamps on C++/CUDA sources so ninja doesn't
          # rebuild unchanged files. Without this, checkout sets all mtimes to
          # "now", invalidating the JIT extension cache every run (~2h rebuild).
          # NUL-delimited (-z / -0) so paths containing spaces survive the pipe.
          git ls-files -z -- '*.cpp' '*.cu' '*.cuh' '*.h' '*.hh' '*.cc' | \
            xargs -0 -P8 -I{} bash -c 'ts=$(git log -1 --format="%ct" -- "$1") && touch -d "@$ts" "$1"' _ {}

      # Lint needs no GPU, so it runs directly on fela inside the container.
      # Failures are surfaced as warnings, never fail the pipeline.
      - name: Lint
        continue-on-error: true
        run: |
          apptainer exec \
            --bind /net/scratch:/net/scratch \
            --bind "$PIP_PKGS":/ci-pip-pkgs \
            --pwd "$GITHUB_WORKSPACE" \
            "$SIF" bash << 'LINT'
          set -ex
          PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
          export PATH=/ci-pip-pkgs/bin:$HOME/.local/bin:$PATH
          export PYTHONPATH=$PWD:/ci-pip-pkgs/lib/python${PYVER}/site-packages:$PYTHONPATH
          pip install --break-system-packages --target /ci-pip-pkgs/lib/python${PYVER}/site-packages \
            pre-commit 2>/dev/null || true
          pre-commit install
          pre-commit run --all-files --show-diff-on-failure || echo "::warning::pre-commit found issues"
          LINT

      # All GPU work in a single srun allocation: setup, build, tests, benchmarks.
      # This avoids re-queuing for each step and keeps the environment consistent.
      # Use Ampere (a6000, sm_86) — the NGC 25.11 container (Python 3.12, PyTorch 2.9).
      # Switch a6000 -> b4000 once Blackwell support is confirmed.
      - name: Build and test (GPU)
        run: |
          mkdir -p "$PIP_PKGS"
          srun --gres=gpu:a6000:1 -p gpu --export=ALL \
            -t 10:00:00 --cpus-per-task=16 --mem=128G \
            apptainer exec --nv --writable-tmpfs \
            --bind /net/scratch:/net/scratch \
            --bind "$PIP_PKGS":/ci-pip-pkgs \
            --pwd "$GITHUB_WORKSPACE" \
            "$SIF" bash << 'GPU_WORK'
          set -ex
          PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
          export PYTHONUSERBASE=/ci-pip-pkgs
          export PATH=/ci-pip-pkgs/bin:$HOME/.local/bin:$PATH
          export PYTHONPATH=$GITHUB_WORKSPACE:/ci-pip-pkgs/lib/python${PYVER}/site-packages:$PYTHONPATH
          export TMOL_CACHE_ROOT=/net/scratch/bench/ci-caches/tmol
          export TMOL_PRECOMPILED_BUILD_DIR=$TMOL_CACHE_ROOT/precompiled-build
          export TMOL_JIT_CACHE_DIR=$TMOL_CACHE_ROOT/jit-extensions
          mkdir -p "$TMOL_PRECOMPILED_BUILD_DIR" "$TMOL_JIT_CACHE_DIR"
          # ── Setup ────────────────────────────────────────────────────
          command -v uv >/dev/null 2>&1 || pip install --user uv
          TMPDIR=$GITHUB_WORKSPACE/.ci-tmp
          mkdir -p $TMPDIR
          # Pin container-provided packages so uv resolves against them, but
          # drop local/editable installs and packages we manage ourselves.
          pip freeze | grep -vE '(file://|^-e |^#|^pandas==|^scipy==|^clang-format==|^black==|^flake8==|^pathspec==|^click==|^mypy-extensions==|^packaging==)' > $TMPDIR/container_pins.txt
          uv pip compile pyproject.toml --all-extras \
            --constraint $TMPDIR/container_pins.txt \
            --output-file $TMPDIR/requirements.txt
          # Never reinstall the container's CUDA/torch stack on top of itself.
          grep -vE "^(torch(|vision|audio)|numpy|nvidia-.*|triton|tensorrt|pynvml|pandas|scipy)==" \
            $TMPDIR/requirements.txt > $TMPDIR/to_install.txt
          uv pip install --prefix /ci-pip-pkgs --python /usr/bin/python3 --break-system-packages \
            -r $TMPDIR/to_install.txt 'packaging>=24.2' 'scikit-build-core>=0.10' 'numpy<2'
          python3 -c "from openbabel import pybel; print('openbabel OK:', pybel.readstring('smi', 'CCO').formula)"
          # ── Build (persistent cache dir on runner) ───────────────────
          rm -rf build
          ln -sfn "$TMOL_PRECOMPILED_BUILD_DIR" build
          export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
          export CMAKE_PREFIX_PATH=$(python3 -c 'import torch; print(torch.utils.cmake_prefix_path)')
          # Build only for the architecture of the allocated GPU.
          GPU_ARCH=$(python3 -c "import torch; c=torch.cuda.get_device_capability(0); print(f'{c[0]}{c[1]}')")
          echo "=== Building for GPU architecture: sm_${GPU_ARCH} ==="
          MAX_JOBS=12 pip install -v --no-build-isolation --no-deps --prefix /ci-pip-pkgs \
            -Ccmake.define.CMAKE_CUDA_ARCHITECTURES="${GPU_ARCH}" \
            -Ccmake.define.TMOL_BUILD_TESTS=ON \
            -Ccmake.define.TMOL_NVCC_THREADS=2 \
            -e .
          # ── Environment info ─────────────────────────────────────────
          echo "=== environment ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}, GPUs: {torch.cuda.device_count()}')"
          nvidia-smi || true
          # ── Test CPU ─────────────────────────────────────────────────
          echo "=== tests (CPU only) ==="
          unset TMOL_USE_JIT
          export TMOL_JIT_FALLBACK=0
          unset TORCH_EXTENSIONS_DIR
          CUDA_VISIBLE_DEVICES="" pytest -p no:rerunfailures -p instafail -s -v --tb=short --durations=25 \
            --cov=./tmol --junitxml=testing.cpu.junit.xml -o faulthandler_timeout=300 \
            -k "not cuda and not benchmark and not score_function_benchmarks"
          # ── Test CUDA ────────────────────────────────────────────────
          echo "=== tests (CUDA) ==="
          unset TMOL_USE_JIT
          export TMOL_JIT_FALLBACK=0
          unset TORCH_EXTENSIONS_DIR
          pytest -p no:rerunfailures -p instafail -s -v --tb=short --durations=25 --no-cov \
            --junitxml=testing.cuda.junit.xml -o faulthandler_timeout=300 \
            -k "cuda and not benchmark and not score_function_benchmarks"
          # ── Benchmark ────────────────────────────────────────────────
          echo "=== benchmarks ==="
          BENCHMARK_DIR=benchmark/$GITHUB_REPOSITORY/$GITHUB_REF_NAME
          BENCHMARK_RESULT=${BENCHMARK_DIR}/${GITHUB_RUN_NUMBER}.json
          mkdir -p $BENCHMARK_DIR
          # Benchmarks are informational; never fail the pipeline on them.
          pytest -p no:rerunfailures --benchmark-enable --benchmark-only \
            --benchmark-name=short --benchmark-sort=fullname \
            --benchmark-columns=ops,mean,iqr \
            --benchmark-json=${BENCHMARK_RESULT} \
            --benchmark-max-time=.1 || true
          pytest-benchmark compare --name=short --sort=fullname \
            --columns=ops,mean,iqr $(find benchmark -name '*.json') || true
          # ── Test CUDA (JIT) ──────────────────────────────────────────
          echo "=== tests (CUDA JIT) ==="
          export TMOL_USE_JIT=1
          export TMOL_JIT_FALLBACK=0
          export TORCH_CUDA_ARCH_LIST=$(python3 -c "import torch; c=torch.cuda.get_device_capability(0); print(f'{c[0]}.{c[1]}')")
          export TORCH_EXTENSIONS_DIR=$TMOL_JIT_CACHE_DIR
          echo "JIT compiling for TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}"
          pytest -p no:rerunfailures -p instafail -s -v --tb=short --durations=25 --no-cov \
            --junitxml=testing.cuda.jit.junit.xml -o faulthandler_timeout=300 \
            -k "cuda and not benchmark and not score_function_benchmarks"
          GPU_WORK

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results
          path: |
            testing.cpu.junit.xml
            testing.cuda.junit.xml
            testing.cuda.jit.junit.xml

      - name: Upload benchmark results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: benchmark/**/*