From ccb7d05685b587c72383e15265e446eba2c38231 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <209825114+claude[bot]@users.noreply.github.com> Date: Fri, 15 Aug 2025 22:57:25 +0000 Subject: [PATCH 01/14] Implement tar improvements for smaller tarballs and obviate --ignore-zeros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Set --blocking-factor=1 to prevent extra zero-padding blocks - Add --sparse flag to consolidate runs of zeros in input files - Add --label parameter with run basename for identification - Pipe tar output through 'head --bytes -1024' to trim EOF blocks before gzip - Update comments and documentation to reflect that --ignore-zeros is no longer needed - Replace --gzip with manual gzip after EOF trimming for proper concatenation These changes implement the solution described in issue #8 to create efficiently concatenatable tarballs that can be extracted with standard tar commands without requiring --ignore-zeros. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Chris Tomkins-Tinch --- incremental_illumina_upload_to_gs.sh | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 971e757..8b7415b 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -6,7 +6,7 @@ # depends on: # GNU tar (separate install on mac) # google-cloud-sdk -# IMPORTANT: resulting tarball must be extracted with GNU tar and "--ignore-zeros" specified +# Uses optimized tar settings (--blocking-factor=1, --sparse, EOF trimming) for efficient concatenation if [[ "$#" -ne 2 ]]; then echo "--------------------------------------------------------------------" @@ -21,8 +21,8 @@ if [[ "$#" -ne 2 ]]; then echo "" echo "" echo "This script creates incremental gzipped tarballs and syncs them" - echo "to a single tarball in a GS bucket. The resulting tarballs must be" - echo "extracted with GNU tar or compatible with '--ignore-zeros' specifed." + echo "to a single tarball in a GS bucket. The tarballs use optimized" + echo "tar settings for efficient concatenation and standard extraction." echo "" echo "Dependencies: GNU tar, google-cloud-sdk (with crcmod installed)" echo "" @@ -157,15 +157,20 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # see: https://www.gnu.org/software/tar/manual/html_node/Incremental-Dumps.html # https://www.gnu.org/software/tar/manual/html_node/Snapshot-Files.html # '--no-check-device' is for NFS - # '-C "${PATH_TO_UPLOAD}" .' so we don't store the full path (-C is cd) + # '-C "${PATH_TO_UPLOAD}" ." 
so we don't store the full path (-C is cd) + # '--blocking-factor=1' prevents extra zero-padding blocks for efficient concatenation + # '--sparse' consolidates runs of zeros in input files + # '--label' adds human-readable note with run ID + # 'head --bytes -1024' trims EOF blocks (two 512-byte blocks) before gzip compression if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi $TAR_BIN --exclude='Thumbnail_Images' --exclude="Images" --exclude "FocusModelGeneration" --exclude='Autocenter' --exclude='InstrumentAnalyticsLogs' --exclude "Logs" \ --create \ - --gzip \ + --blocking-factor=1 \ + --sparse \ + --label="${RUN_BASENAME}" \ $SHOULD_CHECK_DEVICE_STR \ - --file="${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" \ --listed-incremental="${STAGING_AREA_PATH}/${RUN_BASENAME}/index" \ - -C "${PATH_TO_UPLOAD}" . + -C "${PATH_TO_UPLOAD}" . | head --bytes -1024 | gzip > "${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" # ------------------------------------------------------------------------- # # (WIP alternative to the above tar call) @@ -250,7 +255,7 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. done # create a note about the tarball - echo "$RUN_BASENAME.tar.gz must be unpacked with GNU tar and '--ignore-zeros' specified." | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" + echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" # if only the index file is present, remove it if [[ $(ls -1 "${STAGING_AREA_PATH}/${RUN_BASENAME}" | wc -l) -eq 1 ]]; then From 5142975c0fb336d8d8fa2297b02ad86e8463f1a3 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 19:30:29 -0400 Subject: [PATCH 02/14] add Claude-related infrastructure files --- .claude/agents/expert-developer.md | 7 +++ .claude/settings.local.json | 9 ++++ CLAUDE.md | 79 ++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 .claude/agents/expert-developer.md create mode 100644 .claude/settings.local.json create mode 100644 CLAUDE.md diff --git a/.claude/agents/expert-developer.md b/.claude/agents/expert-developer.md new file mode 100644 index 0000000..9869054 --- /dev/null +++ b/.claude/agents/expert-developer.md @@ -0,0 +1,7 @@ +--- +name: expert-developer +description: use this agent when uncommitted changes exist for files in this repo, and when open issues are outstanding for this repo on GitHub +model: sonnet +--- + +Expert software engineer who can act on my behalf to address open GitHub issues, incorporate changes according to guidance I have issued, and review new code I have written. diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..42e34a5 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,9 @@ +{ + "permissions": { + "allow": [ + "Bash(git checkout:*)" + ], + "deny": [], + "ask": [] + } +} \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..81371f4 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,79 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+ +## Project Overview + +This repository contains bash scripts for incremental upload of Illumina sequencing runs to Google Cloud Storage. The system creates incremental gzipped tarballs and syncs them to a single tarball in a GS bucket, designed to work on Linux-based Illumina sequencers (like NextSeq 2000) or companion computers. + +## Architecture + +**Core Scripts:** +- `incremental_illumina_upload_to_gs.sh` - Main upload script that creates incremental tar archives and uploads them to GS +- `monitor_runs.sh` - Monitoring script that watches for new run directories and launches upload processes +- `simulate_sequencer_write.sh` - Testing utility that simulates a sequencer writing data incrementally + +**Key Components:** +- **Incremental Archiving**: Uses GNU tar with `--listed-incremental` to create incremental backups +- **Chunked Uploads**: Splits large runs into manageable chunks (default 100MB) with retry logic +- **GS Composition**: Uses `gsutil compose` to merge incremental tarballs into single archives +- **Cross-platform Support**: Handles differences between Linux (Illumina sequencers) and macOS + +## Dependencies + +Required tools that must be available: +- `gsutil` (Google Cloud SDK) +- `tar` (GNU tar, installed as `gtar` on macOS) +- `pstree` (for monitoring script, installed via `brew install pstree` on macOS) + +## Environment Variables + +Key configuration variables (with defaults): +- `CHUNK_SIZE_MB=100` - Size of incremental tar chunks +- `DELAY_BETWEEN_INCREMENTS_SEC=30` - Wait time between upload attempts +- `RUN_COMPLETION_TIMEOUT_DAYS=16` - Max time to wait for run completion +- `STAGING_AREA_PATH` - Location for temporary files (defaults to `/usr/local/illumina/seq-run-uploads` on Illumina machines, `/tmp/seq-run-uploads` elsewhere) +- `RSYNC_RETRY_MAX_ATTEMPTS=12` - Maximum retry attempts for uploads +- `INCLUSION_TIME_INTERVAL_DAYS=7` - Age limit for runs to be considered for upload + +## Usage Patterns + +**Main upload script:** +```bash +./incremental_illumina_upload_to_gs.sh /path/to/run gs://bucket-prefix +``` + +**Monitoring script:** +```bash +./monitor_runs.sh /path/to/monitored-directory gs://bucket-prefix +``` + +**Simulation script (for testing):** +```bash +./simulate_sequencer_write.sh /path/to/actual_run /path/to/simulated_run +``` + +## Important Implementation Details + +- **Excluded Directories**: The upload excludes large non-essential directories: `Thumbnail_Images`, `Images`, `FocusModelGeneration`, `Autocenter`, `InstrumentAnalyticsLogs`, `Logs` +- **Individual Files**: `SampleSheet.csv` and `RunInfo.xml` are uploaded separately before tarball creation +- **Run Completion Detection**: Looks for `RTAComplete.txt` or `RTAComplete.xml` files +- **Tarball Extraction**: Resulting tarballs must be extracted with GNU tar using `--ignore-zeros` +- **NFS Support**: Uses `--no-check-device` flag for NFS mounted storage +- **Platform Detection**: Automatically detects Illumina machines vs other environments +- **Cleanup**: Removes local incremental tarballs after successful upload + +## Cron Integration + +The monitoring script is designed to work with cron scheduling. Example crontab entry: +``` +@hourly ~/monitor_runs.sh /usr/local/illumina/runs gs://bucket/flowcells >> ~/upload_monitor.log +``` + +## File Paths + +Staging areas: +- Illumina machines: `/usr/local/illumina/seq-run-uploads` +- Other systems: `/tmp/seq-run-uploads` + +Run detection based on presence of `RunInfo.xml` files in monitored directories. 
\ No newline at end of file From 2eeb65d37a79a52aacb1dd46cba97fc2998bc6cf Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 19:38:50 -0400 Subject: [PATCH 03/14] do not trim EOF blocks from final incremental tarball do not trim EOF blocks from final incremental tarball that mops up the remaining changed files --- incremental_illumina_upload_to_gs.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 8b7415b..fbf5d9d 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -161,8 +161,9 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # '--blocking-factor=1' prevents extra zero-padding blocks for efficient concatenation # '--sparse' consolidates runs of zeros in input files # '--label' adds human-readable note with run ID - # 'head --bytes -1024' trims EOF blocks (two 512-byte blocks) before gzip compression + # 'head --bytes -1024' trims EOF blocks for incremental tarballs; final tarball preserves EOF blocks if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi + if [[ "$run_is_finished" == 'true' ]]; then EOF_PROCESSOR="cat"; else EOF_PROCESSOR="head --bytes -1024"; fi $TAR_BIN --exclude='Thumbnail_Images' --exclude="Images" --exclude "FocusModelGeneration" --exclude='Autocenter' --exclude='InstrumentAnalyticsLogs' --exclude "Logs" \ --create \ --blocking-factor=1 \ @@ -170,7 +171,7 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. --label="${RUN_BASENAME}" \ $SHOULD_CHECK_DEVICE_STR \ --listed-incremental="${STAGING_AREA_PATH}/${RUN_BASENAME}/index" \ - -C "${PATH_TO_UPLOAD}" . | head --bytes -1024 | gzip > "${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" + -C "${PATH_TO_UPLOAD}" . 
| $EOF_PROCESSOR | gzip > "${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" # ------------------------------------------------------------------------- # # (WIP alternative to the above tar call) From c8ed187c59d418e29be3f67d22cccad9c9478da4 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:03:52 -0400 Subject: [PATCH 04/14] more claude config --- .claude/settings.local.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 42e34a5..883804c 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -1,7 +1,9 @@ { "permissions": { "allow": [ - "Bash(git checkout:*)" + "Bash(git checkout:*)", + "Bash(git fetch:*)", + "WebFetch(domain:www.gnu.org)" ], "deny": [], "ask": [] From b5fc8836583d77cf8fe19cb2b803c58a164e2e72 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:05:13 -0400 Subject: [PATCH 05/14] obtain metadata related to the upload and the uploader, and persist via in-band tar labels, plus a verbose json file alongside the README.txt in the destination bucket --- incremental_illumina_upload_to_gs.sh | 213 ++++++++++++++++++++++++++- monitor_runs.sh | 9 +- 2 files changed, 218 insertions(+), 4 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index fbf5d9d..7441a0b 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -23,6 +23,10 @@ if [[ "$#" -ne 2 ]]; then echo "This script creates incremental gzipped tarballs and syncs them" echo "to a single tarball in a GS bucket. The tarballs use optimized" echo "tar settings for efficient concatenation and standard extraction." + echo "Each tar chunk includes metadata labels with run info, timestamps," + echo "and machine details. Use 'tar --test-label -f chunk.tar.gz' to view" + echo "individual chunk labels, or 'tar -tvf combined.tar.gz | grep Volume'" + echo "to see all labels in the final composed archive." 
echo "" echo "Dependencies: GNU tar, google-cloud-sdk (with crcmod installed)" echo "" @@ -73,6 +77,7 @@ chunk_size_bytes=$(expr $CHUNK_SIZE_MB \* 1048576) # $CHUNK_SIZE_MB*1024^2 RUN_COMPLETION_TIMEOUT_SEC=$(expr $RUN_COMPLETION_TIMEOUT_DAYS \* 86400) size_at_last_check=0 +tar_increment_counter=0 TAR_BIN="tar" GSUTIL_CMD='gsutil' @@ -85,6 +90,201 @@ fi $GSUTIL_CMD version -l +# Function to detect external IP address using multiple methods +get_external_ip() { + local ip="" + + # Check if required tools are available + local has_dig=$(command -v dig &> /dev/null && echo "true" || echo "false") + local has_curl=$(command -v curl &> /dev/null && echo "true" || echo "false") + local has_awk=$(command -v awk &> /dev/null && echo "true" || echo "false") + + if [[ "$has_dig" == "true" && "$has_awk" == "true" ]]; then + # Method 1: Google DNS TXT record + ip=$(dig +short txt o-o.myaddr.l.google.com @ns1.google.com 2>/dev/null | awk -F'"' '{print $2}' | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + + # Method 2: OpenDNS + ip=$(dig +short myip.opendns.com @resolver1.opendns.com 2>/dev/null | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + + # Method 3: Cloudflare + ip=$(dig +short txt ch whoami.cloudflare @1.0.0.1 2>/dev/null | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + + # Method 4: Google DNS (alternative) + ip=$(dig +short txt o-o.myaddr.l.google.com @ns1.google.com 2>/dev/null | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + fi + + if [[ "$has_curl" == "true" ]]; then + # Method 5: AWS checkip + ip=$(curl -s --max-time 5 checkip.amazonaws.com 2>/dev/null | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + fi + + # Fallback: try to get local interface IP + if command -v ip &> /dev/null; then + local local_ip=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' 2>/dev/null | head -1) + if [[ -n "$local_ip" && "$local_ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$local_ip" + return + fi + elif command -v route &> /dev/null; then + # macOS/BSD fallback using route command + local local_ip=$(route get 8.8.8.8 2>/dev/null | awk '/interface:/ {getline; if(/inet/) print $2}' | head -1) + if [[ -n "$local_ip" && "$local_ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$local_ip" + return + fi + fi + + # Final fallback + echo "0.0.0.0" +} + +# Function to detect if running via cron +is_cron_execution() { + # Check if CRON_INVOKED is set by monitor_runs.sh + if [[ -n "$CRON_INVOKED" ]]; then + echo "$CRON_INVOKED" + return + fi + + # Detect based on environment characteristics + if [[ -z "$TERM" || "$TERM" == "dumb" ]] && [[ -z "$SSH_CLIENT" ]] && [[ -z "$SSH_TTY" ]]; then + echo "true" + else + echo "false" + fi +} + +# Function to generate enhanced tar label metadata +generate_tar_label() { + local run_basename="$1" + local increment_num="$2" + local timestamp_formatted=$(date +%y-%m-%dT%H:%M) + local hostname=$(hostname) + local username=$(whoami) + local external_ip=$(get_external_ip) + local is_cron=$(is_cron_execution) + + # GNU tar volume labels have a strict 99-byte limit, but support control characters + # Try compact JSON first, then fall back to pipe-separated format if too long + local 
json_metadata="{\"r\":\"${run_basename:0:15}\",\"t\":\"$timestamp_formatted\",\"i\":$increment_num,\"h\":\"${hostname:0:8}\",\"u\":\"${username:0:8}\",\"ip\":\"$external_ip\",\"c\":$([ "$is_cron" = "true" ] && echo 1 || echo 0)}" + + if [[ ${#json_metadata} -le 99 ]]; then + # Use JSON format (human and machine readable) + echo "$json_metadata" + else + # Fallback to pipe-separated format (very compact) + local pipe_format="${run_basename:0:15}|$timestamp_formatted|$increment_num|${hostname:0:8}|${username:0:8}|$external_ip|$([ "$is_cron" = "true" ] && echo 1 || echo 0)" + if [[ ${#pipe_format} -le 99 ]]; then + echo "$pipe_format" + else + # Last resort: compress with gzip and base64 encode + local compressed=$(echo "$json_metadata" | gzip | base64 | tr -d '\n') + local max_len=$((99 - 3)) # Reserve 3 chars for "gz:" prefix + echo "gz:${compressed:0:$max_len}" + fi + fi +} + +# Function to generate verbose metadata JSON file for the entire upload process +generate_verbose_metadata() { + local run_basename="$1" + local run_path="$2" + local destination_bucket="$3" + local start_time="$4" + local current_time=$(date +%s) + local timestamp_formatted=$(date -d "@$current_time" +%Y-%m-%dT%H:%M:%S%z 2>/dev/null || date -r "$current_time" +%Y-%m-%dT%H:%M:%S%z) + local start_timestamp=$(date -d "@$start_time" +%Y-%m-%dT%H:%M:%S%z 2>/dev/null || date -r "$start_time" +%Y-%m-%dT%H:%M:%S%z) + local hostname=$(hostname) + local username=$(whoami) + local external_ip=$(get_external_ip) + local is_cron=$(is_cron_execution) + local script_version=$(grep "^# version: " "$0" | head -1 | sed 's/^# version: //' || echo "unknown") + local final_increment=$tar_increment_counter + local upload_duration=$((current_time - start_time)) + + # Get executable versions + local gsutil_version=$(gsutil version 2>/dev/null | head -1 | awk '{print $3}' || echo "unknown") + local gcloud_version=$(gcloud version --format="value(Google Cloud SDK)" 2>/dev/null || echo "unknown") + local tar_version=$($TAR_BIN --version 2>/dev/null | head -1 || echo "unknown") + local bash_version=$($BASH --version 2>/dev/null | head -1 || echo "$BASH_VERSION") + + # Get run size information + local run_size_bytes=0 + if [ -d "$run_path" ]; then + if [ "$(uname)" != "Darwin" ]; then + run_size_bytes=$(du -sb "$run_path" 2>/dev/null | cut -f1 || echo 0) + else + run_size_bytes=$(du -sk "$run_path" 2>/dev/null | awk '{print $1 * 1024}' || echo 0) + fi + fi + + # Create comprehensive metadata JSON + cat << EOF +{ + "upload_metadata": { + "run_basename": "$run_basename", + "run_path": "$run_path", + "destination_bucket": "$destination_bucket", + "upload_start_time": "$start_timestamp", + "upload_completion_time": "$timestamp_formatted", + "upload_duration_seconds": $upload_duration, + "total_increments": $final_increment, + "run_size_bytes": $run_size_bytes, + "cron_invoked": $is_cron + }, + "uploading_machine_info": { + "hostname": "$hostname", + "username": "$username", + "external_ip": "$external_ip", + "operating_system": "$(uname -s)", + "architecture": "$(uname -m)", + "script_version": "$script_version" + }, + "tool_versions": { + "gsutil_version": "$gsutil_version", + "gcloud_version": "$gcloud_version", + "tar_version": "$tar_version", + "bash_version": "$bash_version" + }, + "environment_variables": { + "chunk_size_mb": $CHUNK_SIZE_MB, + "delay_between_increments_sec": $DELAY_BETWEEN_INCREMENTS_SEC, + "run_completion_timeout_days": $RUN_COMPLETION_TIMEOUT_DAYS, + "staging_area_path": "$STAGING_AREA_PATH", + "source_path_is_on_nfs": 
"$SOURCE_PATH_IS_ON_NFS" + }, + "tar_settings": { + "tar_binary": "$TAR_BIN", + "blocking_factor": 1, + "sparse_enabled": true, + "eof_trimming": "incremental_only", + "excluded_directories": ["Thumbnail_Images", "Images", "FocusModelGeneration", "Autocenter", "InstrumentAnalyticsLogs", "Logs"] + }, + "generation_timestamp": "$timestamp_formatted" +} +EOF +} + # if the run does not already exist on the destination, commence upload process... if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" &> /dev/null; then START_TIME=$(date +%s) @@ -152,6 +352,12 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. echo "commencing sync on latest data" size_at_last_check=$current_size timestamp=$(date +%s) # intentionally called before tar so time is a little older + + # increment counter for this tarball + tar_increment_counter=$((tar_increment_counter + 1)) + + # generate enhanced tar label with metadata + tar_label=$(generate_tar_label "$RUN_BASENAME" "$tar_increment_counter") # increate incremental tarballs # see: https://www.gnu.org/software/tar/manual/html_node/Incremental-Dumps.html @@ -160,7 +366,7 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # '-C "${PATH_TO_UPLOAD}" ." so we don't store the full path (-C is cd) # '--blocking-factor=1' prevents extra zero-padding blocks for efficient concatenation # '--sparse' consolidates runs of zeros in input files - # '--label' adds human-readable note with run ID + # '--label' adds enhanced metadata (JSON or pipe-separated format within 99-byte tar limit) # 'head --bytes -1024' trims EOF blocks for incremental tarballs; final tarball preserves EOF blocks if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi if [[ "$run_is_finished" == 'true' ]]; then EOF_PROCESSOR="cat"; else EOF_PROCESSOR="head --bytes -1024"; fi @@ -168,7 +374,7 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. --create \ --blocking-factor=1 \ --sparse \ - --label="${RUN_BASENAME}" \ + --label="$tar_label" \ $SHOULD_CHECK_DEVICE_STR \ --listed-incremental="${STAGING_AREA_PATH}/${RUN_BASENAME}/index" \ -C "${PATH_TO_UPLOAD}" . | $EOF_PROCESSOR | gzip > "${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" @@ -257,6 +463,9 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # create a note about the tarball echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." 
| gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" + + # create and upload verbose metadata JSON file + generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" # if only the index file is present, remove it if [[ $(ls -1 "${STAGING_AREA_PATH}/${RUN_BASENAME}" | wc -l) -eq 1 ]]; then diff --git a/monitor_runs.sh b/monitor_runs.sh index 3cd4bab..1d2abc3 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -6,7 +6,7 @@ # depends on: # google-cloud-sdk # pstree (separate install on mac, 'brew install pstree') -# IMPORTANT: resulting tarball must be extracted with GNU tar and "--ignore-zeros" specified +# Uses optimized tar settings (--blocking-factor=1, --sparse, EOF trimming) for efficient concatenation if [[ "$#" -ne 2 ]]; then echo "--------------------------------------------------------------------" @@ -99,7 +99,12 @@ while true; do upload_cmd="${SCRIPTPATH}/incremental_illumina_upload_to_gs.sh ${found_dir} ${DESTINATION_BUCKET_PREFIX}" echo " ${upload_cmd}" # fork incremental upload to separate process - (STAGING_AREA_PATH="${STAGING_AREA_PATH}" ${upload_cmd}) & + # pass cron detection info via environment variable + if [[ $CRON -gt 0 ]]; then + (STAGING_AREA_PATH="${STAGING_AREA_PATH}" CRON_INVOKED="true" ${upload_cmd}) & + else + (STAGING_AREA_PATH="${STAGING_AREA_PATH}" CRON_INVOKED="false" ${upload_cmd}) & + fi else echo "Skipping initiation of new upload (upload in progress): ${STAGING_AREA_PATH}/${RUN_BASENAME}" fi From 8be1cc3e725052c455b4f39af5decb64b63a4fae Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:10:47 -0400 Subject: [PATCH 06/14] use last component of run basename for short-form representation, rather than just truncating it --- incremental_illumina_upload_to_gs.sh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 7441a0b..42b1d58 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -184,16 +184,30 @@ generate_tar_label() { local external_ip=$(get_external_ip) local is_cron=$(is_cron_execution) + # Intelligently shorten run basename: extract last component after underscores, then truncate if needed + local short_run_basename="$run_basename" + if [[ "$run_basename" =~ _ ]]; then + # Extract last component after final underscore + short_run_basename="${run_basename##*_}" + # If still too long, truncate to 15 chars + if [[ ${#short_run_basename} -gt 15 ]]; then + short_run_basename="${short_run_basename:0:15}" + fi + else + # No underscores, just truncate + short_run_basename="${run_basename:0:15}" + fi + # GNU tar volume labels have a strict 99-byte limit, but support control characters # Try compact JSON first, then fall back to pipe-separated format if too long - local json_metadata="{\"r\":\"${run_basename:0:15}\",\"t\":\"$timestamp_formatted\",\"i\":$increment_num,\"h\":\"${hostname:0:8}\",\"u\":\"${username:0:8}\",\"ip\":\"$external_ip\",\"c\":$([ "$is_cron" = "true" ] && echo 1 || echo 0)}" + local json_metadata="{\"r\":\"$short_run_basename\",\"t\":\"$timestamp_formatted\",\"i\":$increment_num,\"h\":\"${hostname:0:8}\",\"u\":\"${username:0:8}\",\"ip\":\"$external_ip\",\"c\":$([ "$is_cron" = "true" ] && echo 1 || echo 0)}" if [[ ${#json_metadata} -le 99 ]]; then # Use JSON format (human and 
machine readable) echo "$json_metadata" else # Fallback to pipe-separated format (very compact) - local pipe_format="${run_basename:0:15}|$timestamp_formatted|$increment_num|${hostname:0:8}|${username:0:8}|$external_ip|$([ "$is_cron" = "true" ] && echo 1 || echo 0)" + local pipe_format="$short_run_basename|$timestamp_formatted|$increment_num|${hostname:0:8}|${username:0:8}|$external_ip|$([ "$is_cron" = "true" ] && echo 1 || echo 0)" if [[ ${#pipe_format} -le 99 ]]; then echo "$pipe_format" else From 4bc2a92895f9a1b6319b5008555ef888e103de67 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:35:16 -0400 Subject: [PATCH 07/14] migrate Google Cloud CLI calls from `gsutil` -> `gcloud storage` migrate Google Cloud CLI calls from `gsutil` -> `gcloud storage`. Mostly a drop-in replacement, except for `gsutil compose` which is now `gcloud storage objects compose`. --- .claude/settings.local.json | 3 +- CLAUDE.md | 4 +- incremental_illumina_upload_to_gs.sh | 63 +++++++++++++++------------- monitor_runs.sh | 7 ++-- 4 files changed, 41 insertions(+), 36 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 883804c..1e1c3e6 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -3,7 +3,8 @@ "allow": [ "Bash(git checkout:*)", "Bash(git fetch:*)", - "WebFetch(domain:www.gnu.org)" + "WebFetch(domain:www.gnu.org)", + "WebSearch" ], "deny": [], "ask": [] diff --git a/CLAUDE.md b/CLAUDE.md index 81371f4..8abfc71 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,13 +16,13 @@ This repository contains bash scripts for incremental upload of Illumina sequenc **Key Components:** - **Incremental Archiving**: Uses GNU tar with `--listed-incremental` to create incremental backups - **Chunked Uploads**: Splits large runs into manageable chunks (default 100MB) with retry logic -- **GS Composition**: Uses `gsutil compose` to merge incremental tarballs into single archives +- **GS Composition**: Uses `gcloud storage objects compose` to merge incremental tarballs into single archives - **Cross-platform Support**: Handles differences between Linux (Illumina sequencers) and macOS ## Dependencies Required tools that must be available: -- `gsutil` (Google Cloud SDK) +- `gcloud storage` (Google Cloud SDK) - `tar` (GNU tar, installed as `gtar` on macOS) - `pstree` (for monitoring script, installed via `brew install pstree` on macOS) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 42b1d58..1e39551 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -28,7 +28,7 @@ if [[ "$#" -ne 2 ]]; then echo "individual chunk labels, or 'tar -tvf combined.tar.gz | grep Volume'" echo "to see all labels in the final composed archive." 
echo "" - echo "Dependencies: GNU tar, google-cloud-sdk (with crcmod installed)" + echo "Dependencies: GNU tar, google-cloud-sdk (gcloud CLI with storage commands)" echo "" echo "Usage: $(basename $0) /path/of/run_to_upload gs://bucket-prefix" echo "--------------------------------------------------------------------" @@ -80,15 +80,16 @@ size_at_last_check=0 tar_increment_counter=0 TAR_BIN="tar" -GSUTIL_CMD='gsutil' +GCLOUD_STORAGE_CMD='gcloud storage' if [ "$(uname)" == "Darwin" ]; then TAR_BIN="gtar" # GNU tar must be installed and available on the path as gtar #export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=true # workaround for https://bugs.python.org/issue33725 - GSUTIL_CMD='gsutil -o "GSUtil:parallel_process_count=1"' + # Note: gcloud storage handles parallelization automatically, no manual tuning needed + GCLOUD_STORAGE_CMD='gcloud storage' fi -$GSUTIL_CMD version -l +gcloud version # Function to detect external IP address using multiple methods get_external_ip() { @@ -237,8 +238,12 @@ generate_verbose_metadata() { local upload_duration=$((current_time - start_time)) # Get executable versions - local gsutil_version=$(gsutil version 2>/dev/null | head -1 | awk '{print $3}' || echo "unknown") - local gcloud_version=$(gcloud version --format="value(Google Cloud SDK)" 2>/dev/null || echo "unknown") + local gcloud_version=$(gcloud version --format='value("Google Cloud SDK")' 2>/dev/null || echo "unknown") + local gcloud_storage_version=$(gcloud components list --filter="id:gcloud-storage" --format="value(version.string)" 2>/dev/null) + # If gcloud storage version is empty (bundled), fall back to main gcloud version + if [[ -z "$gcloud_storage_version" ]]; then + gcloud_storage_version="$gcloud_version" + fi local tar_version=$($TAR_BIN --version 2>/dev/null | head -1 || echo "unknown") local bash_version=$($BASH --version 2>/dev/null | head -1 || echo "$BASH_VERSION") @@ -275,8 +280,8 @@ generate_verbose_metadata() { "script_version": "$script_version" }, "tool_versions": { - "gsutil_version": "$gsutil_version", "gcloud_version": "$gcloud_version", + "gcloud_storage_version": "$gcloud_storage_version", "tar_version": "$tar_version", "bash_version": "$bash_version" }, @@ -300,7 +305,7 @@ EOF } # if the run does not already exist on the destination, commence upload process... -if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" &> /dev/null; then +if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" &> /dev/null; then START_TIME=$(date +%s) echo "Does not already exist in bucket: ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" @@ -331,8 +336,8 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. file_basename="$(basename ${filename})" file_extension="${file_basename#*.}" file_basename_no_ext="${file_basename%%.*}" - if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" &> /dev/null; then - $GSUTIL_CMD cp "${PATH_TO_UPLOAD}/${filename}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" + if ! 
$GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" &> /dev/null; then + $GCLOUD_STORAGE_CMD cp "${PATH_TO_UPLOAD}/${filename}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" else echo "Already exists in bucket; skipping upload: ${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" fi @@ -414,17 +419,15 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # try (and retry) to rsync incremental tarballs to bucket retry_count=0 until [ "$retry_count" -ge $RSYNC_RETRY_MAX_ATTEMPTS ]; do - # -m parallel uploads - # -c Causes the rsync command to compute and compare checksums + # --checksums-only: Causes the rsync command to compute and compare checksums # (instead of comparing mtime) for files if the size of source - # and destination match. - # -C If an error occurs, continue to attempt to copy the remaining - # files. If errors occurred, gsutil's exit status will be - # non-zero even if this flag is set. This option is implicitly - # set when running "gsutil -m rsync..." (included below in case '-m' is removed). + # and destination match. (gcloud storage handles parallelization automatically) + # --continue-on-error: If an error occurs, continue to attempt to copy the remaining + # files. If errors occurred, gcloud storage's exit status will be + # non-zero even if this flag is set. gcloud storage handles parallelization automatically. # - # see: https://cloud.google.com/storage/docs/gsutil/commands/rsync - $GSUTIL_CMD rsync -cC -x '.*index$' "${STAGING_AREA_PATH}/${RUN_BASENAME}/" "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts" && break + # see: https://cloud.google.com/sdk/gcloud/reference/storage/rsync + $GCLOUD_STORAGE_CMD rsync --checksums-only --continue-on-error --exclude='.*index$' "${STAGING_AREA_PATH}/${RUN_BASENAME}/" "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts" && break retry_count=$((retry_count+1)) #sleep $RSYNC_RETRY_DELAY_SEC sleep $(expr $RSYNC_RETRY_DELAY_SEC \* $retry_count) # each retry scale delay by a multiple of the count @@ -435,8 +438,8 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. for incremental_tarball in $(find "${STAGING_AREA_PATH}/${RUN_BASENAME}" -type f -name "*.tar.gz"); do # if the local incremental tarball has indeed been synced # remove the local copy of it... - if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/$(basename ${incremental_tarball})"; then - $GSUTIL_CMD cp "${incremental_tarball}" "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/$(basename ${incremental_tarball})" && rm "${incremental_tarball}" + if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/$(basename ${incremental_tarball})"; then + $GCLOUD_STORAGE_CMD cp "${incremental_tarball}" "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/$(basename ${incremental_tarball})" && rm "${incremental_tarball}" else rm "${incremental_tarball}" fi @@ -452,11 +455,11 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. done # make sure the composed tarball does not exist on GS; if it does not... - if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz"; then + if ! 
$GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz"; then # get the archive started with a blank file dummyfile="${STAGING_AREA_PATH}/${RUN_BASENAME}/dummyfile.tar.gz" touch $dummyfile - $GSUTIL_CMD cp "${dummyfile}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" + $GCLOUD_STORAGE_CMD cp "${dummyfile}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" rm "${dummyfile}" fi @@ -465,21 +468,21 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # append the first 31 incremental tarballs # to the main tarball, then remove the incremental tarballs # keep doing this until there are no more incremental tarballs - # see: https://cloud.google.com/storage/docs/gsutil/commands/compose - until [[ "$($GSUTIL_CMD du ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/'*.tar.gz' 2> /dev/null | wc -l | awk '{print $1}' || echo '0')" == "0" ]]; do - first_files=$($GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/"'*.tar.gz' | sort -V | head -n 31 | tr '\n' ' ') + # see: https://cloud.google.com/storage/docs/composing-objects#create-composite-cli + until [[ "$($GCLOUD_STORAGE_CMD du ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/'*.tar.gz' 2> /dev/null | wc -l | awk '{print $1}' || echo '0')" == "0" ]]; do + first_files=$($GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/"'*.tar.gz' | sort -V | head -n 31 | tr '\n' ' ') if [ ${#first_files} -ge 0 ]; then - $GSUTIL_CMD compose "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" \ + $GCLOUD_STORAGE_CMD objects compose "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" \ ${first_files} \ - "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" && sleep 10 && $GSUTIL_CMD rm ${first_files} + "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" && sleep 10 && $GCLOUD_STORAGE_CMD rm ${first_files} fi done # create a note about the tarball - echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" + echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." 
| $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" # create and upload verbose metadata JSON file - generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" + generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" # if only the index file is present, remove it if [[ $(ls -1 "${STAGING_AREA_PATH}/${RUN_BASENAME}" | wc -l) -eq 1 ]]; then diff --git a/monitor_runs.sh b/monitor_runs.sh index 1d2abc3..8238f04 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -66,10 +66,11 @@ INCLUSION_TIME_INTERVAL_DAYS=${INCLUSION_TIME_INTERVAL_DAYS:-'7'} DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'10'} STAGING_AREA_PATH="${STAGING_AREA_PATH:-$DEFAULT_STAGING_AREA}" -GSUTIL_CMD='gsutil' +GCLOUD_STORAGE_CMD='gcloud storage' if [ "$(uname)" == "Darwin" ]; then #export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=true # workaround for https://bugs.python.org/issue33725 - GSUTIL_CMD='gsutil -o "GSUtil:parallel_process_count=1"' + # Note: gcloud storage handles parallelization automatically, no manual tuning needed + GCLOUD_STORAGE_CMD='gcloud storage' fi echo "Location for temp files: ${STAGING_AREA_PATH}" @@ -91,7 +92,7 @@ while true; do RUN_BASENAME="$(basename ${found_dir})" # if the run does not already exist on the destination, commence upload process... RUN_BUCKET_PATH="${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" - if ! $GSUTIL_CMD ls "${RUN_BUCKET_PATH}" &> /dev/null; then + if ! $GCLOUD_STORAGE_CMD ls "${RUN_BUCKET_PATH}" &> /dev/null; then echo "Run does not exist in bucket: ${RUN_BUCKET_PATH}" if ! [ -d "${STAGING_AREA_PATH}/${RUN_BASENAME}" ]; then echo "Run upload not yet in progress; no dir: ${STAGING_AREA_PATH}/${RUN_BASENAME}" From 51b7c47a70c9cd5ac0f25e280159f398e1d3257e Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:54:30 -0400 Subject: [PATCH 08/14] eliminate unnecessary repetition of path construction for tarball destination (store in `FINAL_TARBALL_PATH `) --- incremental_illumina_upload_to_gs.sh | 19 +++++++++++-------- monitor_runs.sh | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 1e39551..6a8bbc0 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -304,11 +304,14 @@ generate_verbose_metadata() { EOF } +# Define the final tarball path once to avoid repetition +FINAL_TARBALL_PATH="${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz" + # if the run does not already exist on the destination, commence upload process... -if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" &> /dev/null; then +if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then START_TIME=$(date +%s) - echo "Does not already exist in bucket: ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" + echo "Does not already exist in bucket: $FINAL_TARBALL_PATH" # quit if the run is stale based on mtime of RunInfo.xml if [ "$(uname)" != "Darwin" ]; then @@ -455,11 +458,11 @@ if ! 
$GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BA done # make sure the composed tarball does not exist on GS; if it does not... - if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz"; then + if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH"; then # get the archive started with a blank file dummyfile="${STAGING_AREA_PATH}/${RUN_BASENAME}/dummyfile.tar.gz" touch $dummyfile - $GCLOUD_STORAGE_CMD cp "${dummyfile}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" + $GCLOUD_STORAGE_CMD cp "${dummyfile}" "$FINAL_TARBALL_PATH" rm "${dummyfile}" fi @@ -472,14 +475,14 @@ if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BA until [[ "$($GCLOUD_STORAGE_CMD du ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/'*.tar.gz' 2> /dev/null | wc -l | awk '{print $1}' || echo '0')" == "0" ]]; do first_files=$($GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/"'*.tar.gz' | sort -V | head -n 31 | tr '\n' ' ') if [ ${#first_files} -ge 0 ]; then - $GCLOUD_STORAGE_CMD objects compose "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" \ + $GCLOUD_STORAGE_CMD objects compose "$FINAL_TARBALL_PATH" \ ${first_files} \ - "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" && sleep 10 && $GCLOUD_STORAGE_CMD rm ${first_files} + "$FINAL_TARBALL_PATH" && sleep 10 && $GCLOUD_STORAGE_CMD rm ${first_files} fi done # create a note about the tarball - echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" + echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands. The $RUN_BASENAME.terra.tsv file can be used to add a row for this tarball to a table on Terra." | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" # create and upload verbose metadata JSON file generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" @@ -492,6 +495,6 @@ if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BA # if staging dir is empty, remove it (rmdir only does this if empty). rmdir "${STAGING_AREA_PATH}/${RUN_BASENAME}" &> /dev/null else - echo "Exiting; already exists: ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" + echo "Exiting; already exists: $FINAL_TARBALL_PATH" exit 0 fi diff --git a/monitor_runs.sh b/monitor_runs.sh index 8238f04..529a171 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -91,7 +91,8 @@ while true; do echo "Path is new enough to attempt an upload: ${found_dir}" RUN_BASENAME="$(basename ${found_dir})" # if the run does not already exist on the destination, commence upload process... - RUN_BUCKET_PATH="${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" + # Define the final tarball path to match the pattern used in incremental script + RUN_BUCKET_PATH="${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz" if ! $GCLOUD_STORAGE_CMD ls "${RUN_BUCKET_PATH}" &> /dev/null; then echo "Run does not exist in bucket: ${RUN_BUCKET_PATH}" if ! 
[ -d "${STAGING_AREA_PATH}/${RUN_BASENAME}" ]; then From ccc75c131c8548b2f5692abbaeb4efa3da65f3ff Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:55:39 -0400 Subject: [PATCH 09/14] create tsv file for appending a row to a Terra table for the uploaded run --- incremental_illumina_upload_to_gs.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 6a8bbc0..3bad90e 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -304,6 +304,18 @@ generate_verbose_metadata() { EOF } +# Function to generate Terra-compatible TSV file for data table import +generate_terra_tsv() { + local run_basename="$1" + local tarball_path="$2" + + # Create TSV with POSIX line endings (LF only) + cat << EOF +entity:flowcell_id biosample_attributes flowcell_tar samplesheets sample_rename_map_tsv +$run_basename $tarball_path +EOF +} + # Define the final tarball path once to avoid repetition FINAL_TARBALL_PATH="${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz" @@ -486,6 +498,9 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then # create and upload verbose metadata JSON file generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" + + # create and upload Terra-compatible TSV file + generate_terra_tsv "$RUN_BASENAME" "$FINAL_TARBALL_PATH" | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.terra.tsv" # if only the index file is present, remove it if [[ $(ls -1 "${STAGING_AREA_PATH}/${RUN_BASENAME}" | wc -l) -eq 1 ]]; then From 5471ff635f84d5665cc434f3a3f2c828fac75f2d Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 22:01:03 -0400 Subject: [PATCH 10/14] parameterize name included in tsv for Terra table row upsert --- CLAUDE.md | 1 + incremental_illumina_upload_to_gs.sh | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 8abfc71..7e87304 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,6 +35,7 @@ Key configuration variables (with defaults): - `STAGING_AREA_PATH` - Location for temporary files (defaults to `/usr/local/illumina/seq-run-uploads` on Illumina machines, `/tmp/seq-run-uploads` elsewhere) - `RSYNC_RETRY_MAX_ATTEMPTS=12` - Maximum retry attempts for uploads - `INCLUSION_TIME_INTERVAL_DAYS=7` - Age limit for runs to be considered for upload +- `TERRA_RUN_TABLE_NAME=flowcell` - Table name for Terra TSV file generation (creates `entity:{table_name}_id` column) ## Usage Patterns diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 3bad90e..e161a33 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -59,6 +59,7 @@ RUN_BASENAME="$(basename ${PATH_TO_UPLOAD})" STAGING_AREA_PATH="${STAGING_AREA_PATH:-$DEFAULT_STAGING_AREA}" RSYNC_RETRY_MAX_ATTEMPTS=${RSYNC_RETRY_MAX_ATTEMPTS:-"12"} RSYNC_RETRY_DELAY_SEC=${RSYNC_RETRY_DELAY_SEC:-"600"} +TERRA_RUN_TABLE_NAME=${TERRA_RUN_TABLE_NAME:-"flowcell"} # ------------------------------- @@ -311,7 +312,7 @@ generate_terra_tsv() { # Create TSV with POSIX line endings (LF only) cat << EOF -entity:flowcell_id biosample_attributes flowcell_tar samplesheets sample_rename_map_tsv +entity:${TERRA_RUN_TABLE_NAME}_id biosample_attributes flowcell_tar 
samplesheets sample_rename_map_tsv $run_basename $tarball_path EOF } From bd0e979638c0139a5356497cfcdcbed1798dd133 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 22:21:19 -0400 Subject: [PATCH 11/14] add hard and optional dependency checking; revise cron detection to be more involved and hopefully cross-platform (GNU/Linux vs macOS/BSD) --- .claude/settings.local.json | 3 +- incremental_illumina_upload_to_gs.sh | 116 ++++++++++++++++++++++++++- monitor_runs.sh | 77 +++++++++++++++++- 3 files changed, 192 insertions(+), 4 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 1e1c3e6..898fdf1 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -4,7 +4,8 @@ "Bash(git checkout:*)", "Bash(git fetch:*)", "WebFetch(domain:www.gnu.org)", - "WebSearch" + "WebSearch", + "WebFetch(domain:man7.org)" ], "deny": [], "ask": [] diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index e161a33..21e929d 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -61,6 +61,82 @@ RSYNC_RETRY_MAX_ATTEMPTS=${RSYNC_RETRY_MAX_ATTEMPTS:-"12"} RSYNC_RETRY_DELAY_SEC=${RSYNC_RETRY_DELAY_SEC:-"600"} TERRA_RUN_TABLE_NAME=${TERRA_RUN_TABLE_NAME:-"flowcell"} +# ------------------------------- +# Dependency checking +# ------------------------------- + +# Hard dependencies - script will exit if any are missing +HARD_DEPENDENCIES=(gcloud tar date basename mkdir rm find head wc cut awk sort tr expr du uname realpath stat touch cat gzip base64 sed ls whoami hostname ps) + +# Optional dependencies - script will work without them but with reduced functionality +OPTIONAL_DEPENDENCIES=(dig curl ip route grep pstree) + +# Check hard dependencies +echo "Checking required dependencies..." +missing_deps=() +for dependency in "${HARD_DEPENDENCIES[@]}"; do + if ! command -v "$dependency" &> /dev/null; then + missing_deps+=("$dependency") + fi +done + +if [[ ${#missing_deps[@]} -gt 0 ]]; then + echo "ERROR! Missing required dependencies. Aborting..." + echo "The following commands need to be installed and available on PATH:" + for dep in "${missing_deps[@]}"; do + echo " - $dep" + done + echo "" + echo "Please install the missing dependencies and try again." + exit 1 +fi + +# Check and track optional dependencies +echo "Checking optional dependencies..." +available_optional_deps=() +for dependency in "${OPTIONAL_DEPENDENCIES[@]}"; do + if command -v "$dependency" &> /dev/null; then + available_optional_deps+=("$dependency") + fi +done + +if [[ ${#available_optional_deps[@]} -eq 0 ]]; then + echo "Warning: No optional tools available. Some features may have reduced functionality." +else + echo "Available optional tools: ${available_optional_deps[*]}" + + # Check specific functionality + ip_tools=(dig curl ip route) + available_ip_tools=() + for tool in "${ip_tools[@]}"; do + if [[ " ${available_optional_deps[*]} " =~ " ${tool} " ]]; then + available_ip_tools+=("$tool") + fi + done + + if [[ ${#available_ip_tools[@]} -eq 0 ]]; then + echo "Warning: No external IP detection tools available. External IP will be set to 0.0.0.0" + fi + + if [[ " ${available_optional_deps[*]} " =~ " pstree " ]]; then + echo "Note: pstree available for enhanced cron detection" + else + echo "Note: Using ps fallback for cron detection (pstree not available)" + fi +fi + +# Check for GNU tar specifically on macOS +if [ "$(uname)" == "Darwin" ]; then + if ! 
command -v gtar &> /dev/null; then + echo "ERROR! macOS detected but GNU tar (gtar) is not available." + echo "Please install GNU tar: brew install gnu-tar" + exit 1 + fi +fi + +echo "All required dependencies satisfied." +echo "" + # ------------------------------- function cleanup(){ @@ -168,8 +244,44 @@ is_cron_execution() { return fi - # Detect based on environment characteristics - if [[ -z "$TERM" || "$TERM" == "dumb" ]] && [[ -z "$SSH_CLIENT" ]] && [[ -z "$SSH_TTY" ]]; then + # Use same cron detection logic as monitor_runs.sh + local cron_detected=0 + if command -v pstree &> /dev/null; then + # Use pstree if available + if pstree -s $$ 2>/dev/null | grep -q cron 2>/dev/null; then + cron_detected=1 + else + cron_detected=0 + fi + else + # Fallback using ps - works on both GNU/Linux and macOS/BSD + if [ "$(uname)" == "Darwin" ]; then + # macOS/BSD ps format - check current and parent processes + local current_pid=$$ + while [[ $current_pid -ne 1 ]]; do + if ps -o comm= -p $current_pid 2>/dev/null | grep -q cron; then + cron_detected=1 + break + fi + local parent_pid=$(ps -o ppid= -p $current_pid 2>/dev/null | tr -d ' ') + [[ -z "$parent_pid" || "$parent_pid" == "0" || "$parent_pid" == "1" ]] && break + current_pid=$parent_pid + done + else + # GNU/Linux ps format - trace up the process tree + local current_pid=$$ + while [[ $current_pid -ne 1 ]]; do + if ps -o comm= -p $current_pid 2>/dev/null | grep -q cron; then + cron_detected=1 + break + fi + current_pid=$(ps -o ppid= -p $current_pid 2>/dev/null | tr -d ' ' || echo 1) + [[ -z "$current_pid" || "$current_pid" == "0" ]] && break + done + fi + fi + + if [[ $cron_detected -gt 0 ]]; then echo "true" else echo "false" diff --git a/monitor_runs.sh b/monitor_runs.sh index 529a171..38cd420 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -30,6 +30,46 @@ fi set -x +# ------------------------------- +# Dependency checking +# ------------------------------- + +# Hard dependencies - script will exit if any are missing +MONITOR_DEPENDENCIES=(gcloud find sort realpath basename grep ps) + +echo "Checking required dependencies for monitor script..." +missing_deps=() +for dependency in "${MONITOR_DEPENDENCIES[@]}"; do + if ! command -v "$dependency" &> /dev/null; then + missing_deps+=("$dependency") + fi +done + +if [[ ${#missing_deps[@]} -gt 0 ]]; then + echo "ERROR! Missing required dependencies. Aborting..." + echo "The following commands need to be installed and available on PATH:" + for dep in "${missing_deps[@]}"; do + echo " - $dep" + done + echo "" + echo "Please install the missing dependencies and try again." + exit 1 +fi + +# Check for optional pstree (preferred for cron detection) +if command -v pstree &> /dev/null; then + echo "Using pstree for cron detection." + CRON_DETECTION_METHOD="pstree" +else + echo "pstree not available, using ps fallback for cron detection." + CRON_DETECTION_METHOD="ps" +fi + +echo "All required dependencies satisfied for monitor script." 
+echo "" + +# ------------------------------- + function absolute_path() { local SOURCE="$1" while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink @@ -76,7 +116,42 @@ fi echo "Location for temp files: ${STAGING_AREA_PATH}" # detect if running via cron, and only run infinitely if not running via cron -CRON="$( pstree -s $$ | grep -c cron )" +if [[ "$CRON_DETECTION_METHOD" == "pstree" ]]; then + if pstree -s $$ 2>/dev/null | grep -q cron 2>/dev/null; then + CRON=1 + else + CRON=0 + fi +else + # Fallback using ps - works on both GNU/Linux and macOS/BSD + # Check if any parent process contains 'cron' in the command name + if [ "$(uname)" == "Darwin" ]; then + # macOS/BSD ps format - check current and parent processes + current_pid=$$ + CRON=0 + while [[ $current_pid -ne 1 ]]; do + if ps -o comm= -p $current_pid 2>/dev/null | grep -q cron; then + CRON=1 + break + fi + parent_pid=$(ps -o ppid= -p $current_pid 2>/dev/null | tr -d ' ') + [[ -z "$parent_pid" || "$parent_pid" == "0" || "$parent_pid" == "1" ]] && break + current_pid=$parent_pid + done + else + # GNU/Linux ps format - trace up the process tree + current_pid=$$ + CRON=0 + while [[ $current_pid -ne 1 ]]; do + if ps -o comm= -p $current_pid 2>/dev/null | grep -q cron; then + CRON=1 + break + fi + current_pid=$(ps -o ppid= -p $current_pid 2>/dev/null | tr -d ' ' || echo 1) + [[ -z "$current_pid" || "$current_pid" == "0" ]] && break + done + fi +fi while true; do echo "" echo "===" From c2a4af555f09a064103de6b0ae97214cc2c0c705 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 22:42:04 -0400 Subject: [PATCH 12/14] parameterize static exclusion list used to omit large debug/diagnostic data from run folder tarballs --- incremental_illumina_upload_to_gs.sh | 25 +++++++++++++++++++++---- monitor_runs.sh | 2 +- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 21e929d..a213b78 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -53,7 +53,7 @@ else fi CHUNK_SIZE_MB=${CHUNK_SIZE_MB:-'100'} -DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'30'} +DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'600'} RUN_COMPLETION_TIMEOUT_DAYS=${RUN_COMPLETION_TIMEOUT_DAYS:-'16'} RUN_BASENAME="$(basename ${PATH_TO_UPLOAD})" STAGING_AREA_PATH="${STAGING_AREA_PATH:-$DEFAULT_STAGING_AREA}" @@ -61,6 +61,15 @@ RSYNC_RETRY_MAX_ATTEMPTS=${RSYNC_RETRY_MAX_ATTEMPTS:-"12"} RSYNC_RETRY_DELAY_SEC=${RSYNC_RETRY_DELAY_SEC:-"600"} TERRA_RUN_TABLE_NAME=${TERRA_RUN_TABLE_NAME:-"flowcell"} +# Default directories to exclude from tar archives (large non-essential directories) +DEFAULT_TAR_EXCLUSIONS=("Thumbnail_Images" "Images" "FocusModelGeneration" "Autocenter" "InstrumentAnalyticsLogs" "Logs") +# Allow override via environment variable (space-separated list) +if [[ -n "$TAR_EXCLUSIONS" ]]; then + IFS=' ' read -ra TAR_EXCLUSIONS_ARRAY <<< "$TAR_EXCLUSIONS" +else + TAR_EXCLUSIONS_ARRAY=("${DEFAULT_TAR_EXCLUSIONS[@]}") +fi + # ------------------------------- # Dependency checking # ------------------------------- @@ -410,7 +419,7 @@ generate_verbose_metadata() { "blocking_factor": 1, "sparse_enabled": true, "eof_trimming": "incremental_only", - "excluded_directories": ["Thumbnail_Images", "Images", "FocusModelGeneration", "Autocenter", "InstrumentAnalyticsLogs", "Logs"] + "excluded_directories": [$(printf '"%s",' "${TAR_EXCLUSIONS_ARRAY[@]}" | sed 's/,$//')] }, 
"generation_timestamp": "$timestamp_formatted" } @@ -495,7 +504,7 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then run_is_finished=$([ -e "${PATH_TO_UPLOAD}/RTAComplete.txt" ] || [ -e "${PATH_TO_UPLOAD}/RTAComplete.xml" ] && echo "true" || echo "false") # if enough additional data has been added, or the run is complete, initiate incremental upload - if [[ $current_size -ge $(($size_at_last_check + $chunk_size_bytes)) ]] || [ "$run_is_finished" = 'true' ]; then + if [[ $current_size -ge $(($size_at_last_check + $chunk_size_bytes)) || "$run_is_finished" == "true" ]]; then echo "commencing sync on latest data" size_at_last_check=$current_size timestamp=$(date +%s) # intentionally called before tar so time is a little older @@ -517,7 +526,15 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then # 'head --bytes -1024' trims EOF blocks for incremental tarballs; final tarball preserves EOF blocks if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi if [[ "$run_is_finished" == 'true' ]]; then EOF_PROCESSOR="cat"; else EOF_PROCESSOR="head --bytes -1024"; fi - $TAR_BIN --exclude='Thumbnail_Images' --exclude="Images" --exclude "FocusModelGeneration" --exclude='Autocenter' --exclude='InstrumentAnalyticsLogs' --exclude "Logs" \ + if [[ -f "$EXCLUSIONS_FILE" && -s "$EXCLUSIONS_FILE" ]]; then EXCLUSION_STR="--exclude-from=$EXCLUSIONS_FILE"; else EXCLUSION_STR=""; fi + + # Build static exclusion arguments from array + STATIC_EXCLUSIONS=() + for exclusion in "${TAR_EXCLUSIONS_ARRAY[@]}"; do + STATIC_EXCLUSIONS+=("--exclude=$exclusion") + done + + $TAR_BIN "${STATIC_EXCLUSIONS[@]}" \ --create \ --blocking-factor=1 \ --sparse \ diff --git a/monitor_runs.sh b/monitor_runs.sh index 38cd420..8ba2e0d 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -103,7 +103,7 @@ PATH_TO_MONITOR="$(realpath ${PATH_TO_MONITOR})" DESTINATION_BUCKET_PREFIX="$2" INCLUSION_TIME_INTERVAL_DAYS=${INCLUSION_TIME_INTERVAL_DAYS:-'7'} -DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'10'} +DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'600'} STAGING_AREA_PATH="${STAGING_AREA_PATH:-$DEFAULT_STAGING_AREA}" GCLOUD_STORAGE_CMD='gcloud storage' From 695c0c1cfe1a881392331005d35684645d7cb41d Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 22:43:22 -0400 Subject: [PATCH 13/14] use dynamic exclusions to avoid tarball bloat by discouraging tar from including multiple snapshots of large in-progress files use dynamic exclusions to avoid tarball bloat by discouraging tar from including multiple snapshots of large in-progress files; this addresses #4 --- .claude/settings.local.json | 3 +- CLAUDE.md | 6 ++- incremental_illumina_upload_to_gs.sh | 67 ++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 898fdf1..52905d5 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -5,7 +5,8 @@ "Bash(git fetch:*)", "WebFetch(domain:www.gnu.org)", "WebSearch", - "WebFetch(domain:man7.org)" + "WebFetch(domain:man7.org)", + "Bash(gh issue view:*)" ], "deny": [], "ask": [] diff --git a/CLAUDE.md b/CLAUDE.md index 7e87304..146171a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,12 +30,13 @@ Required tools that must be available: Key configuration variables (with defaults): - `CHUNK_SIZE_MB=100` - Size of incremental tar chunks -- `DELAY_BETWEEN_INCREMENTS_SEC=30` - 
 .claude/settings.local.json          |  3 +-
 CLAUDE.md                            |  6 ++-
 incremental_illumina_upload_to_gs.sh | 67 ++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 898fdf1..52905d5 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -5,7 +5,8 @@
       "Bash(git fetch:*)",
       "WebFetch(domain:www.gnu.org)",
       "WebSearch",
-      "WebFetch(domain:man7.org)"
+      "WebFetch(domain:man7.org)",
+      "Bash(gh issue view:*)"
     ],
     "deny": [],
     "ask": []
diff --git a/CLAUDE.md b/CLAUDE.md
index 7e87304..146171a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -30,12 +30,13 @@ Required tools that must be available:
 Key configuration variables (with defaults):

 - `CHUNK_SIZE_MB=100` - Size of incremental tar chunks
-- `DELAY_BETWEEN_INCREMENTS_SEC=30` - Wait time between upload attempts
+- `DELAY_BETWEEN_INCREMENTS_SEC=600` - Wait time between upload attempts (optimized to reduce tarball bloat from partial *.cbcl files)
 - `RUN_COMPLETION_TIMEOUT_DAYS=16` - Max time to wait for run completion
 - `STAGING_AREA_PATH` - Location for temporary files (defaults to `/usr/local/illumina/seq-run-uploads` on Illumina machines, `/tmp/seq-run-uploads` elsewhere)
 - `RSYNC_RETRY_MAX_ATTEMPTS=12` - Maximum retry attempts for uploads
 - `INCLUSION_TIME_INTERVAL_DAYS=7` - Age limit for runs to be considered for upload
 - `TERRA_RUN_TABLE_NAME=flowcell` - Table name for Terra TSV file generation (creates `entity:{table_name}_id` column)
+- `TAR_EXCLUSIONS` - Space-separated list of directories to exclude from tar archives (defaults to: "Thumbnail_Images Images FocusModelGeneration Autocenter InstrumentAnalyticsLogs Logs")

 ## Usage Patterns

@@ -56,7 +57,8 @@ Key configuration variables (with defaults):

 ## Important Implementation Details

-- **Excluded Directories**: The upload excludes large non-essential directories: `Thumbnail_Images`, `Images`, `FocusModelGeneration`, `Autocenter`, `InstrumentAnalyticsLogs`, `Logs`
+- **Excluded Directories**: The upload excludes large non-essential directories (configurable via `TAR_EXCLUSIONS` environment variable, defaults to: `Thumbnail_Images`, `Images`, `FocusModelGeneration`, `Autocenter`, `InstrumentAnalyticsLogs`, `Logs`)
+- **Dynamic Exclusions**: During active sequencing, automatically excludes the most recent cycle directory and recently modified files (within 3 minutes) to prevent tarball bloat from partial `*.cbcl` files. Exclusions are disabled for the final tarball when `RTAComplete.txt`/`RTAComplete.xml` is detected.
 - **Individual Files**: `SampleSheet.csv` and `RunInfo.xml` are uploaded separately before tarball creation
 - **Run Completion Detection**: Looks for `RTAComplete.txt` or `RTAComplete.xml` files
 - **Tarball Extraction**: Resulting tarballs must be extracted with GNU tar using `--ignore-zeros`
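
Note on the cycle-directory selection in the function below, with invented
lane/cycle names: given the directories

    Data/Intensities/BaseCalls/L001/C99.1
    Data/Intensities/BaseCalls/L001/C100.1
    Data/Intensities/BaseCalls/L002/C100.1

a reverse version sort places C100.1 paths ahead of C99.1 (numeric, not
lexicographic, comparison of the cycle number), so a pipeline along these
lines selects a newest-cycle path and then generalizes its lane component to
'L*' so the exclusion covers that cycle in every lane:

    find Data/Intensities/BaseCalls/L* -type d \
        -regextype posix-extended -regex '.+/C[0-9]+\.[0-9]+$' \
        | sort -rV | head -n1 \
        | sed -E 's|(BaseCalls/)L[0-9]+|\1L*|'

yielding 'Data/Intensities/BaseCalls/L*/C100.1' for the listing above.
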
diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh
index a213b78..cac81a3 100755
--- a/incremental_illumina_upload_to_gs.sh
+++ b/incremental_illumina_upload_to_gs.sh
@@ -438,6 +438,60 @@ $run_basename $tarball_path
 EOF
 }

+# Function to generate dynamic exclusions for incomplete cycles and recently modified files
+# Only generates exclusions when run is not finished to prevent tarball bloat from partial *.cbcl files
+generate_dynamic_exclusions() {
+    local run_path="$1"
+    local exclusions_file="$2"
+    local run_is_finished="$3"
+
+    # Only generate exclusions if run is not finished
+    if [[ "$run_is_finished" != 'true' ]]; then
+        # Clear the exclusions file
+        > "$exclusions_file"
+
+        # Find and exclude the most recent cycle directory to prevent capturing partial *.cbcl files
+        # Look for pattern: Data/Intensities/BaseCalls/L00*/C###.# (cycle directories)
+        if [[ -d "${run_path}/Data/Intensities/BaseCalls" ]]; then
+            local most_recent_cycle=$(find "${run_path}/Data/Intensities/BaseCalls/"L* \
+                -type d \
+                -regextype posix-extended \
+                -regex '^.+/C[0-9]+\.[0-9]+$' 2>/dev/null | \
+                sort -r -k1,1 -V | \
+                head -n1 | \
+                sed -E 's|(BaseCalls/)L([0-9]+)|\1L*|g' 2>/dev/null || true)
+
+            if [[ -n "$most_recent_cycle" ]]; then
+                # Convert absolute path to relative path for tar exclusion
+                local relative_cycle_path="${most_recent_cycle#${run_path}/}"
+                echo "$relative_cycle_path" >> "$exclusions_file"
+                echo "Dynamic exclusion: $relative_cycle_path (most recent cycle)"
+            fi
+        fi
+
+        # Also exclude files modified within the past 3 minutes;
+        # this avoids capturing files that are actively being written
+        find "$run_path" -mmin -3 -type f 2>/dev/null | while IFS= read -r recent_file; do
+            # Convert to relative path for tar exclusion
+            local relative_file="${recent_file#${run_path}/}"
+            echo "$relative_file" >> "$exclusions_file"
+        done
+
+        # Show exclusions count for logging
+        local exclusion_count=$(wc -l < "$exclusions_file" 2>/dev/null || echo 0)
+        if [[ $exclusion_count -gt 0 ]]; then
+            echo "Generated $exclusion_count dynamic exclusions to prevent capturing incomplete files"
+        fi
+
+        return 0
+    else
+        echo "Run is finished - no dynamic exclusions applied"
+        # Ensure exclusions file doesn't exist for finished runs
+        [[ -f "$exclusions_file" ]] && rm "$exclusions_file"
+        return 1
+    fi
+}
+
 # Define the final tarball path once to avoid repetition
 FINAL_TARBALL_PATH="${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz"

@@ -515,6 +569,17 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then
            # generate enhanced tar label with metadata
            tar_label=$(generate_tar_label "$RUN_BASENAME" "$tar_increment_counter")

+           # Generate dynamic exclusions to prevent tarball bloat from partial *.cbcl files
+           EXCLUSIONS_FILE="${STAGING_AREA_PATH}/${RUN_BASENAME}/dynamic_exclusions.txt"
+           generate_dynamic_exclusions "$PATH_TO_UPLOAD" "$EXCLUSIONS_FILE" "$run_is_finished"
+
+           # If this is the final tarball for a completed run, sync and wait to ensure all writes are flushed
+           if [[ "$run_is_finished" == 'true' ]]; then
+               echo "Run completed - performing final sync and wait before capturing final tarball"
+               sync
+               sleep 10  # Wait 10 seconds to ensure all file writes are fully committed
+           fi
+
            # create incremental tarballs
            # see: https://www.gnu.org/software/tar/manual/html_node/Incremental-Dumps.html
            #      https://www.gnu.org/software/tar/manual/html_node/Snapshot-Files.html
@@ -523,6 +588,7 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then
            # '--blocking-factor=1' prevents extra zero-padding blocks for efficient concatenation
            # '--sparse' consolidates runs of zeros in input files
            # '--label' adds enhanced metadata (JSON or pipe-separated format within 99-byte tar limit)
+           # '--exclude-from' excludes patterns listed in a file, to avoid capturing incomplete *.cbcl files
            # 'head --bytes -1024' trims EOF blocks for incremental tarballs; final tarball preserves EOF blocks
            if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi
            if [[ "$run_is_finished" == 'true' ]]; then EOF_PROCESSOR="cat"; else EOF_PROCESSOR="head --bytes -1024"; fi
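
Note on the EOF handling above: each incremental part is a complete gzip
member whose trailing tar EOF marker (two 512-byte zero blocks, hence the
-1024) has been trimmed, while the final part keeps its EOF blocks. Both gzip
and GNU tar tolerate byte-wise concatenation of such pieces, so reassembly
should look roughly like this, with placeholder object names:

    cat part-1.tar.gz part-2.tar.gz part-final.tar.gz > RUN_XYZ.tar.gz
    tar --extract --listed-incremental=/dev/null --file=RUN_XYZ.tar.gz

'--listed-incremental=/dev/null' tells GNU tar to treat the archive as an
incremental dump during extraction.
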
@@ -535,6 +601,7 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then
            done

            $TAR_BIN "${STATIC_EXCLUSIONS[@]}" \
+               $EXCLUSION_STR \
                --create \
                --blocking-factor=1 \
                --sparse \

From 9359a5ad681cd036113b13267a8831e948d163b1 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Fri, 15 Aug 2025 22:49:37 -0400
Subject: [PATCH 14/14] sort and de-duplicate dynamic exclusions

---
 incremental_illumina_upload_to_gs.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh
index cac81a3..59c4f36 100755
--- a/incremental_illumina_upload_to_gs.sh
+++ b/incremental_illumina_upload_to_gs.sh
@@ -477,6 +477,12 @@ generate_dynamic_exclusions() {
             echo "$relative_file" >> "$exclusions_file"
         done

+        # Sort and deduplicate exclusions (files may match both recent cycle and recent time criteria)
+        if [[ -f "$exclusions_file" && -s "$exclusions_file" ]]; then
+            local temp_file="${exclusions_file}.tmp"
+            sort -u "$exclusions_file" > "$temp_file" && mv "$temp_file" "$exclusions_file"
+        fi
+
         # Show exclusions count for logging
         local exclusion_count=$(wc -l < "$exclusions_file" 2>/dev/null || echo 0)
         if [[ $exclusion_count -gt 0 ]]; then