# GPU Benchmark #36 — workflow file for this run
# NOTE(review): the original capture included GitHub blob-view banner lines
# (run title, hidden-Unicode warning); they are not part of the workflow and
# have been reduced to this comment so the file parses as YAML.
# GPU Benchmark
#
# Runs GPU training benchmark with configurable model size.
# Two modes:
#   - GPU-only (default): benchmark GPU training time
#   - CPU comparison: also run CPU for speedup comparison (use smaller configs)
#
# Examples:
#   gh workflow run gpu-benchmark.yml                                        # GPU-only, 50 S3 samples
#   gh workflow run gpu-benchmark.yml -f s3_samples=0                        # GPU-only, repo test data
#   gh workflow run gpu-benchmark.yml -f compare_cpu=true -f s3_samples=0    # GPU vs CPU comparison
#   gh workflow run gpu-benchmark.yml -f channels=64 -f residual_blocks=16   # Production model
#   gh workflow run gpu-benchmark.yml -f s3_samples=50 -f wandb_project=elf-net-ci  # Log to WandB
name: GPU Benchmark

# Weekly schedule + manual trigger.
# All string-typed inputs carry string defaults so the `inputs.* || 'default'`
# fallbacks below also cover scheduled runs, where `inputs` is empty.
on:
  schedule:
    - cron: '0 6 * * 1'  # Monday 6am UTC
  workflow_dispatch:
    inputs:
      instance_type:
        description: 'EC2 instance type'
        default: 'g6.xlarge'
        type: string
      epochs:
        description: 'Number of training epochs'
        default: '5'
        type: string
      channels:
        description: 'Number of model channels (8=tiny, 32=prod, 64=large)'
        default: '32'
        type: string
      residual_blocks:
        description: 'Number of residual blocks (2=tiny, 16=prod)'
        default: '16'
        type: string
      s3_samples:
        description: 'Number of S3 samples to use (0=use repo test data, 10-200 for S3)'
        default: '50'
        type: string
      max_file_size:
        description: 'Skip S3 samples larger than N MB (0=no limit, 25=safe for L4 24GB)'
        default: '25'
        type: string
      compare_cpu:
        description: 'Also run CPU benchmark for speedup comparison (slower)'
        default: false
        type: boolean
      wandb_project:
        description: 'WandB project name (empty=use WANDB_PROJECT var or elf-net-ci; "none"/"false"/"disabled" to skip)'
        default: ''
        type: string
      dataset_version:
        description: 'Dataset version tag for WandB (auto=hash of sample IDs; override with e.g. oa-s3-v1)'
        default: ''
        type: string
      debug:
        description: 'Debug mode (false/true/N minutes)'
        default: 'false'
        type: string

permissions:
  id-token: write  # Required for AWS OIDC
  contents: read
jobs:
  # Provision an ephemeral EC2 GPU runner via the reusable ec2-gha workflow.
  # The benchmark job below targets it through `needs.ec2.outputs.id`.
  ec2:
    uses: Open-Athena/ec2-gha/.github/workflows/runner.yml@v2
    with:
      ec2_instance_type: ${{ inputs.instance_type || 'g6.xlarge' }}
      ec2_image_id: ami-0365bff494b18bf93  # Deep Learning OSS Nvidia Driver AMI GPU PyTorch 2.7 (Ubuntu 22.04)
      ec2_root_device_size: '+10'  # extra GB on top of the AMI's root volume
      aws_tags: '[{"Key": "project", "Value": "electrai"}]'
      debug: ${{ inputs.debug || 'false' }}
    secrets:
      GH_SA_TOKEN: ${{ secrets.GH_SA_TOKEN }}
| benchmark: | |
| needs: ec2 | |
| runs-on: ${{ needs.ec2.outputs.id }} | |
| env: | |
| EPOCHS: ${{ inputs.epochs || '5' }} | |
| CHANNELS: ${{ inputs.channels || '32' }} | |
| BLOCKS: ${{ inputs.residual_blocks || '16' }} | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Check GPU | |
| run: nvidia-smi | |
| - name: Install uv | |
| uses: astral-sh/setup-uv@v5 | |
| - name: Install dependencies | |
| run: uv sync | |
| - name: Sync S3 training data | |
| if: (inputs.s3_samples || '50') != '0' | |
| id: s3 | |
| env: | |
| S3_SAMPLES: ${{ inputs.s3_samples || '50' }} | |
| MAX_FILE_SIZE: ${{ inputs.max_file_size || '25' }} | |
| run: > | |
| uv run scripts/s3_sync.py | |
| -n "$S3_SAMPLES" | |
| -M "$MAX_FILE_SIZE" | |
| -v | |
| - name: Run GPU benchmark | |
| id: gpu | |
| env: | |
| S3_SAMPLES: ${{ inputs.s3_samples || '50' }} | |
| MAX_FILE_SIZE: ${{ inputs.max_file_size || '25' }} | |
| WANDB_PROJECT: ${{ inputs.wandb_project || vars.WANDB_PROJECT || 'elf-net-ci' }} | |
| WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} | |
| DATASET_VERSION: ${{ inputs.dataset_version || steps.s3.outputs.DATASET_HASH }} | |
| INSTANCE_TYPE: ${{ inputs.instance_type || 'g6.xlarge' }} | |
| run: | | |
| echo "=== GPU BENCHMARK ===" | |
| echo "Config: epochs=$EPOCHS, channels=$CHANNELS, blocks=$BLOCKS, s3_samples=$S3_SAMPLES, max_file_size=${MAX_FILE_SIZE}MB" | |
| echo "" | |
| DATA_ARGS="" | |
| if [ "$S3_SAMPLES" != "0" ] && [ -n "$S3_SAMPLES" ]; then | |
| DATA_ARGS="--data-root data/s3" | |
| echo "Using S3 data ($S3_SAMPLES samples)" | |
| else | |
| echo "Using repo test data (5 samples)" | |
| fi | |
| WANDB_ARGS="" | |
| if echo "$WANDB_PROJECT" | grep -qiE '^(none|false|disabled)$'; then | |
| echo "WandB logging: disabled (project=$WANDB_PROJECT)" | |
| elif [ -n "$WANDB_API_KEY" ] && [ -n "$WANDB_PROJECT" ]; then | |
| WANDB_ARGS="--wandb-project $WANDB_PROJECT" | |
| echo "WandB logging: $WANDB_PROJECT" | |
| else | |
| echo "WandB logging: disabled (no API key)" | |
| fi | |
| START=$(date +%s) | |
| OUTPUT=$(uv run python scripts/e2e_train.py \ | |
| --gpu \ | |
| --epochs "$EPOCHS" \ | |
| --channels "$CHANNELS" \ | |
| --residual-blocks "$BLOCKS" \ | |
| --gradient-checkpoint \ | |
| --max-file-size "$MAX_FILE_SIZE" \ | |
| --verbose \ | |
| --no-check \ | |
| $DATA_ARGS \ | |
| $WANDB_ARGS 2>&1 | tee /dev/stderr) | |
| END=$(date +%s) | |
| GPU_TIME=$((END - START)) | |
| echo "" | |
| echo "GPU_TIME=${GPU_TIME}" >> $GITHUB_OUTPUT | |
| echo "GPU training time: ${GPU_TIME}s" | |
| # Capture WandB run URL from output | |
| WANDB_RUN_URL=$(echo "$OUTPUT" | grep -oP 'WANDB_RUN_URL=\K.*' || true) | |
| if [ -n "$WANDB_RUN_URL" ]; then | |
| echo "WANDB_RUN_URL=${WANDB_RUN_URL}" >> $GITHUB_OUTPUT | |
| echo "WandB run: $WANDB_RUN_URL" | |
| fi | |
| - name: Run CPU benchmark | |
| id: cpu | |
| if: inputs.compare_cpu | |
| env: | |
| S3_SAMPLES: ${{ inputs.s3_samples || '50' }} | |
| MAX_FILE_SIZE: ${{ inputs.max_file_size || '25' }} | |
| run: | | |
| echo "=== CPU BENCHMARK ===" | |
| echo "Config: epochs=$EPOCHS, channels=$CHANNELS, blocks=$BLOCKS, s3_samples=$S3_SAMPLES, max_file_size=${MAX_FILE_SIZE}MB" | |
| echo "" | |
| DATA_ARGS="" | |
| if [ "$S3_SAMPLES" != "0" ] && [ -n "$S3_SAMPLES" ]; then | |
| DATA_ARGS="--data-root data/s3" | |
| echo "Using S3 data ($S3_SAMPLES samples)" | |
| else | |
| echo "Using repo test data (5 samples)" | |
| fi | |
| START=$(date +%s) | |
| uv run python scripts/e2e_train.py \ | |
| --epochs "$EPOCHS" \ | |
| --channels "$CHANNELS" \ | |
| --residual-blocks "$BLOCKS" \ | |
| --gradient-checkpoint \ | |
| --max-file-size "$MAX_FILE_SIZE" \ | |
| --verbose \ | |
| --no-check \ | |
| $DATA_ARGS | |
| END=$(date +%s) | |
| CPU_TIME=$((END - START)) | |
| echo "" | |
| echo "CPU_TIME=${CPU_TIME}" >> $GITHUB_OUTPUT | |
| echo "CPU training time: ${CPU_TIME}s" | |
| - name: Summary | |
| env: | |
| S3_SAMPLES: ${{ inputs.s3_samples || '50' }} | |
| MAX_FILE_SIZE: ${{ inputs.max_file_size || '25' }} | |
| COMPARE_CPU: ${{ inputs.compare_cpu }} | |
| WANDB_PROJECT: ${{ inputs.wandb_project || vars.WANDB_PROJECT || 'elf-net-ci' }} | |
| WANDB_ENTITY: ${{ vars.WANDB_ENTITY || 'PrinceOA' }} | |
| GPU_TIME: ${{ steps.gpu.outputs.GPU_TIME }} | |
| CPU_TIME: ${{ steps.cpu.outputs.CPU_TIME }} | |
| WANDB_RUN_URL: ${{ steps.gpu.outputs.WANDB_RUN_URL }} | |
| GH_TOKEN: ${{ github.token }} | |
| run: | | |
| if [ "$S3_SAMPLES" != "0" ] && [ -n "$S3_SAMPLES" ]; then | |
| DATA_DESC="$S3_SAMPLES samples (S3, ≤${MAX_FILE_SIZE}MB)" | |
| else | |
| DATA_DESC="5 samples (repo test data)" | |
| fi | |
| echo "## GPU Benchmark Results" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "### Configuration" >> $GITHUB_STEP_SUMMARY | |
| echo "| Parameter | Value |" >> $GITHUB_STEP_SUMMARY | |
| echo "|-----------|-------|" >> $GITHUB_STEP_SUMMARY | |
| echo "| Epochs | $EPOCHS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Channels | $CHANNELS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Residual Blocks | $BLOCKS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Data | $DATA_DESC |" >> $GITHUB_STEP_SUMMARY | |
| COMMIT_LINK="[\`${GITHUB_SHA::7}\`](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/commit/${GITHUB_SHA})" | |
| PR_NUM=$(curl -sf -H "Authorization: token $GH_TOKEN" \ | |
| "${GITHUB_API_URL}/repos/${GITHUB_REPOSITORY}/commits/${GITHUB_SHA}/pulls" \ | |
| | python3 -c "import sys,json; ps=json.load(sys.stdin); print(ps[0]['number'] if ps else '')" 2>/dev/null || true) | |
| if [ -n "$PR_NUM" ]; then | |
| COMMIT_LINK="${COMMIT_LINK} ([#${PR_NUM}](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/pull/${PR_NUM}))" | |
| fi | |
| echo "| Commit | ${COMMIT_LINK} |" >> $GITHUB_STEP_SUMMARY | |
| if [ -n "$WANDB_RUN_URL" ]; then | |
| WANDB_RUN_ID=$(echo "$WANDB_RUN_URL" | grep -oP '[^/]+$') | |
| echo "| WandB | [${WANDB_PROJECT}](https://wandb.ai/${WANDB_ENTITY}/$WANDB_PROJECT) run [${WANDB_RUN_ID}](${WANDB_RUN_URL}) |" >> $GITHUB_STEP_SUMMARY | |
| fi | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "### Results" >> $GITHUB_STEP_SUMMARY | |
| echo "| Device | Time | Speedup |" >> $GITHUB_STEP_SUMMARY | |
| echo "|--------|------|---------|" >> $GITHUB_STEP_SUMMARY | |
| if [ "$COMPARE_CPU" = "true" ]; then | |
| SPEEDUP=$(echo "scale=2; $CPU_TIME / $GPU_TIME" | bc) | |
| echo "| GPU | ${GPU_TIME}s | **${SPEEDUP}x** |" >> $GITHUB_STEP_SUMMARY | |
| echo "| CPU | ${CPU_TIME}s | 1.0x |" >> $GITHUB_STEP_SUMMARY | |
| else | |
| echo "| GPU | ${GPU_TIME}s | - |" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "*CPU comparison skipped. Run with \`compare_cpu=true\` to include.*" >> $GITHUB_STEP_SUMMARY | |
| fi | |
| - name: Show GPU memory usage | |
| if: always() | |
| run: nvidia-smi |