# add new ci pipeline (#135)
---
# CI pipeline — runner on fela (as bench), GPU work dispatched via a single Slurm srun.
#
# The self-hosted runner runs as user bench on fela. All GPU work (build,
# CUDA tests, benchmarks, JIT tests) runs inside a single srun allocation
# to avoid re-queuing overhead. Lint runs directly on fela (no GPU needed).
#
# All state (pip packages, build artifacts, checkout) lives on /net/scratch
# and persists across steps automatically.
name: CI

# yamllint disable-line rule:truthy
on:
  push:
    branches: [main, "kdidi/**"]
  pull_request:
    branches: [main]

# One in-flight run per ref; a newer push cancels the older run.
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true

env:
  # Apptainer image (NGC PyTorch 25.11, Python 3.12).
  SIF: /net/scratch/kdidi/pytorch_25.11-py3-x11.sif
  # Persistent pip prefix shared across runs; bound as /ci-pip-pkgs inside containers.
  PIP_PKGS: /net/scratch/bench/ci-pip-pkgs

jobs:
  ci:
    runs-on: [self-hosted, gpu]
    timeout-minutes: 720
    steps:
      - name: Checkout
        uses: actions/checkout@v5
        with:
          submodules: recursive

      - name: Restore source timestamps
        run: |
          # Restore git commit timestamps on C++/CUDA sources so ninja doesn't
          # rebuild unchanged files. Without this, checkout sets all mtimes to
          # "now", invalidating the JIT extension cache every run (~2h rebuild).
          # NUL-delimited (-z / -0) so paths containing spaces survive the pipe.
          git ls-files -z -- '*.cpp' '*.cu' '*.cuh' '*.h' '*.hh' '*.cc' | \
            xargs -0 -P8 -I{} bash -c 'ts=$(git log -1 --format="%ct" -- "$1") && touch -d "@$ts" "$1"' _ {}

      # Lint needs no GPU, so it runs directly on fela inside the container.
      # Failures are surfaced as warnings, never fail the pipeline.
      - name: Lint
        continue-on-error: true
        run: |
          apptainer exec \
            --bind /net/scratch:/net/scratch \
            --bind "$PIP_PKGS":/ci-pip-pkgs \
            --pwd "$GITHUB_WORKSPACE" \
            "$SIF" bash << 'LINT'
          set -ex
          PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
          export PATH=/ci-pip-pkgs/bin:$HOME/.local/bin:$PATH
          export PYTHONPATH=$PWD:/ci-pip-pkgs/lib/python${PYVER}/site-packages:$PYTHONPATH
          pip install --break-system-packages --target /ci-pip-pkgs/lib/python${PYVER}/site-packages \
            pre-commit 2>/dev/null || true
          pre-commit install
          pre-commit run --all-files --show-diff-on-failure || echo "::warning::pre-commit found issues"
          LINT

      # All GPU work in a single srun allocation: setup, build, tests, benchmarks.
      # This avoids re-queuing for each step and keeps the environment consistent.
      # Use Ampere (a6000, sm_86) — the NGC 25.11 container (Python 3.12, PyTorch 2.9).
      # Switch a6000 -> b4000 once Blackwell support is confirmed.
      - name: Build and test (GPU)
        run: |
          mkdir -p "$PIP_PKGS"
          srun --gres=gpu:a6000:1 -p gpu --export=ALL \
            -t 10:00:00 --cpus-per-task=16 --mem=128G \
            apptainer exec --nv --writable-tmpfs \
            --bind /net/scratch:/net/scratch \
            --bind "$PIP_PKGS":/ci-pip-pkgs \
            --pwd "$GITHUB_WORKSPACE" \
            "$SIF" bash << 'GPU_WORK'
          set -ex
          PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
          export PYTHONUSERBASE=/ci-pip-pkgs
          export PATH=/ci-pip-pkgs/bin:$HOME/.local/bin:$PATH
          export PYTHONPATH=$GITHUB_WORKSPACE:/ci-pip-pkgs/lib/python${PYVER}/site-packages:$PYTHONPATH
          export TMOL_CACHE_ROOT=/net/scratch/bench/ci-caches/tmol
          export TMOL_PRECOMPILED_BUILD_DIR=$TMOL_CACHE_ROOT/precompiled-build
          export TMOL_JIT_CACHE_DIR=$TMOL_CACHE_ROOT/jit-extensions
          mkdir -p "$TMOL_PRECOMPILED_BUILD_DIR" "$TMOL_JIT_CACHE_DIR"
          # ── Setup ────────────────────────────────────────────────────
          command -v uv >/dev/null 2>&1 || pip install --user uv
          TMPDIR=$GITHUB_WORKSPACE/.ci-tmp
          mkdir -p $TMPDIR
          # Pin container-provided packages so uv resolves against them, but
          # drop local/editable installs and packages we manage ourselves.
          pip freeze | grep -vE '(file://|^-e |^#|^pandas==|^scipy==|^clang-format==|^black==|^flake8==|^pathspec==|^click==|^mypy-extensions==|^packaging==)' > $TMPDIR/container_pins.txt
          uv pip compile pyproject.toml --all-extras \
            --constraint $TMPDIR/container_pins.txt \
            --output-file $TMPDIR/requirements.txt
          # Never reinstall the container's CUDA/torch stack on top of itself.
          grep -vE "^(torch(|vision|audio)|numpy|nvidia-.*|triton|tensorrt|pynvml|pandas|scipy)==" \
            $TMPDIR/requirements.txt > $TMPDIR/to_install.txt
          uv pip install --prefix /ci-pip-pkgs --python /usr/bin/python3 --break-system-packages \
            -r $TMPDIR/to_install.txt 'packaging>=24.2' 'scikit-build-core>=0.10' 'numpy<2'
          python3 -c "from openbabel import pybel; print('openbabel OK:', pybel.readstring('smi', 'CCO').formula)"
          # ── Build (persistent cache dir on runner) ───────────────────
          rm -rf build
          ln -sfn "$TMOL_PRECOMPILED_BUILD_DIR" build
          export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
          export CMAKE_PREFIX_PATH=$(python3 -c 'import torch; print(torch.utils.cmake_prefix_path)')
          # Build only for the architecture of the allocated GPU.
          GPU_ARCH=$(python3 -c "import torch; c=torch.cuda.get_device_capability(0); print(f'{c[0]}{c[1]}')")
          echo "=== Building for GPU architecture: sm_${GPU_ARCH} ==="
          MAX_JOBS=12 pip install -v --no-build-isolation --no-deps --prefix /ci-pip-pkgs \
            -Ccmake.define.CMAKE_CUDA_ARCHITECTURES="${GPU_ARCH}" \
            -Ccmake.define.TMOL_BUILD_TESTS=ON \
            -Ccmake.define.TMOL_NVCC_THREADS=2 \
            -e .
          # ── Environment info ─────────────────────────────────────────
          echo "=== environment ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}, GPUs: {torch.cuda.device_count()}')"
          nvidia-smi || true
          # ── Test CPU ─────────────────────────────────────────────────
          echo "=== tests (CPU only) ==="
          unset TMOL_USE_JIT
          export TMOL_JIT_FALLBACK=0
          unset TORCH_EXTENSIONS_DIR
          CUDA_VISIBLE_DEVICES="" pytest -p no:rerunfailures -p instafail -s -v --tb=short --durations=25 \
            --cov=./tmol --junitxml=testing.cpu.junit.xml -o faulthandler_timeout=300 \
            -k "not cuda and not benchmark and not score_function_benchmarks"
          # ── Test CUDA ────────────────────────────────────────────────
          echo "=== tests (CUDA) ==="
          unset TMOL_USE_JIT
          export TMOL_JIT_FALLBACK=0
          unset TORCH_EXTENSIONS_DIR
          pytest -p no:rerunfailures -p instafail -s -v --tb=short --durations=25 --no-cov \
            --junitxml=testing.cuda.junit.xml -o faulthandler_timeout=300 \
            -k "cuda and not benchmark and not score_function_benchmarks"
          # ── Benchmark ────────────────────────────────────────────────
          echo "=== benchmarks ==="
          BENCHMARK_DIR=benchmark/$GITHUB_REPOSITORY/$GITHUB_REF_NAME
          BENCHMARK_RESULT=${BENCHMARK_DIR}/${GITHUB_RUN_NUMBER}.json
          mkdir -p $BENCHMARK_DIR
          # Benchmarks are informational; never fail the pipeline on them.
          pytest -p no:rerunfailures --benchmark-enable --benchmark-only \
            --benchmark-name=short --benchmark-sort=fullname \
            --benchmark-columns=ops,mean,iqr \
            --benchmark-json=${BENCHMARK_RESULT} \
            --benchmark-max-time=.1 || true
          pytest-benchmark compare --name=short --sort=fullname \
            --columns=ops,mean,iqr $(find benchmark -name '*.json') || true
          # ── Test CUDA (JIT) ──────────────────────────────────────────
          echo "=== tests (CUDA JIT) ==="
          export TMOL_USE_JIT=1
          export TMOL_JIT_FALLBACK=0
          export TORCH_CUDA_ARCH_LIST=$(python3 -c "import torch; c=torch.cuda.get_device_capability(0); print(f'{c[0]}.{c[1]}')")
          export TORCH_EXTENSIONS_DIR=$TMOL_JIT_CACHE_DIR
          echo "JIT compiling for TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}"
          pytest -p no:rerunfailures -p instafail -s -v --tb=short --durations=25 --no-cov \
            --junitxml=testing.cuda.jit.junit.xml -o faulthandler_timeout=300 \
            -k "cuda and not benchmark and not score_function_benchmarks"
          GPU_WORK

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results
          path: |
            testing.cpu.junit.xml
            testing.cuda.junit.xml
            testing.cuda.jit.junit.xml

      - name: Upload benchmark results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: benchmark/**/*