From ccb7d05685b587c72383e15265e446eba2c38231 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <209825114+claude[bot]@users.noreply.github.com> Date: Fri, 15 Aug 2025 22:57:25 +0000 Subject: [PATCH 01/14] Implement tar improvements for smaller tarballs and obviate --ignore-zeros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Set --blocking-factor=1 to prevent extra zero-padding blocks - Add --sparse flag to consolidate runs of zeros in input files - Add --label parameter with run basename for identification - Pipe tar output through 'head --bytes -1024' to trim EOF blocks before gzip - Update comments and documentation to reflect that --ignore-zeros is no longer needed - Replace --gzip with manual gzip after EOF trimming for proper concatenation These changes implement the solution described in issue #8 to create efficiently concatenatable tarballs that can be extracted with standard tar commands without requiring --ignore-zeros. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Chris Tomkins-Tinch --- incremental_illumina_upload_to_gs.sh | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 971e757..8b7415b 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -6,7 +6,7 @@ # depends on: # GNU tar (separate install on mac) # google-cloud-sdk -# IMPORTANT: resulting tarball must be extracted with GNU tar and "--ignore-zeros" specified +# Uses optimized tar settings (--blocking-factor=1, --sparse, EOF trimming) for efficient concatenation if [[ "$#" -ne 2 ]]; then echo "--------------------------------------------------------------------" @@ -21,8 +21,8 @@ if [[ "$#" -ne 2 ]]; then echo "" echo "" echo "This script creates incremental gzipped tarballs and syncs them" - echo "to a single tarball in a GS bucket. The resulting tarballs must be" - echo "extracted with GNU tar or compatible with '--ignore-zeros' specifed." + echo "to a single tarball in a GS bucket. The tarballs use optimized" + echo "tar settings for efficient concatenation and standard extraction." echo "" echo "Dependencies: GNU tar, google-cloud-sdk (with crcmod installed)" echo "" @@ -157,15 +157,20 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # see: https://www.gnu.org/software/tar/manual/html_node/Incremental-Dumps.html # https://www.gnu.org/software/tar/manual/html_node/Snapshot-Files.html # '--no-check-device' is for NFS - # '-C "${PATH_TO_UPLOAD}" .' so we don't store the full path (-C is cd) + # '-C "${PATH_TO_UPLOAD}" ." 
so we don't store the full path (-C is cd) + # '--blocking-factor=1' prevents extra zero-padding blocks for efficient concatenation + # '--sparse' consolidates runs of zeros in input files + # '--label' adds human-readable note with run ID + # 'head --bytes -1024' trims EOF blocks (two 512-byte blocks) before gzip compression if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi $TAR_BIN --exclude='Thumbnail_Images' --exclude="Images" --exclude "FocusModelGeneration" --exclude='Autocenter' --exclude='InstrumentAnalyticsLogs' --exclude "Logs" \ --create \ - --gzip \ + --blocking-factor=1 \ + --sparse \ + --label="${RUN_BASENAME}" \ $SHOULD_CHECK_DEVICE_STR \ - --file="${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" \ --listed-incremental="${STAGING_AREA_PATH}/${RUN_BASENAME}/index" \ - -C "${PATH_TO_UPLOAD}" . + -C "${PATH_TO_UPLOAD}" . | head --bytes -1024 | gzip > "${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" # ------------------------------------------------------------------------- # # (WIP alternative to the above tar call) @@ -250,7 +255,7 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. done # create a note about the tarball - echo "$RUN_BASENAME.tar.gz must be unpacked with GNU tar and '--ignore-zeros' specified." | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" + echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" # if only the index file is present, remove it if [[ $(ls -1 "${STAGING_AREA_PATH}/${RUN_BASENAME}" | wc -l) -eq 1 ]]; then From 5142975c0fb336d8d8fa2297b02ad86e8463f1a3 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 19:30:29 -0400 Subject: [PATCH 02/14] add Claude-related infrastructure files --- .claude/agents/expert-developer.md | 7 +++ .claude/settings.local.json | 9 ++++ CLAUDE.md | 79 ++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 .claude/agents/expert-developer.md create mode 100644 .claude/settings.local.json create mode 100644 CLAUDE.md diff --git a/.claude/agents/expert-developer.md b/.claude/agents/expert-developer.md new file mode 100644 index 0000000..9869054 --- /dev/null +++ b/.claude/agents/expert-developer.md @@ -0,0 +1,7 @@ +--- +name: expert-developer +description: use this agent when uncommitted changes exist for files in this repo, and when open issues are outstanding for this repo on GitHub +model: sonnet +--- + +Expert software engineer who can act on my behalf to address open GitHub issues, incorporate changes according to guidance I have issued, and review new code I have written. diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..42e34a5 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,9 @@ +{ + "permissions": { + "allow": [ + "Bash(git checkout:*)" + ], + "deny": [], + "ask": [] + } +} \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..81371f4 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,79 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+ +## Project Overview + +This repository contains bash scripts for incremental upload of Illumina sequencing runs to Google Cloud Storage. The system creates incremental gzipped tarballs and syncs them to a single tarball in a GS bucket, designed to work on Linux-based Illumina sequencers (like NextSeq 2000) or companion computers. + +## Architecture + +**Core Scripts:** +- `incremental_illumina_upload_to_gs.sh` - Main upload script that creates incremental tar archives and uploads them to GS +- `monitor_runs.sh` - Monitoring script that watches for new run directories and launches upload processes +- `simulate_sequencer_write.sh` - Testing utility that simulates a sequencer writing data incrementally + +**Key Components:** +- **Incremental Archiving**: Uses GNU tar with `--listed-incremental` to create incremental backups +- **Chunked Uploads**: Splits large runs into manageable chunks (default 100MB) with retry logic +- **GS Composition**: Uses `gsutil compose` to merge incremental tarballs into single archives +- **Cross-platform Support**: Handles differences between Linux (Illumina sequencers) and macOS + +## Dependencies + +Required tools that must be available: +- `gsutil` (Google Cloud SDK) +- `tar` (GNU tar, installed as `gtar` on macOS) +- `pstree` (for monitoring script, installed via `brew install pstree` on macOS) + +## Environment Variables + +Key configuration variables (with defaults): +- `CHUNK_SIZE_MB=100` - Size of incremental tar chunks +- `DELAY_BETWEEN_INCREMENTS_SEC=30` - Wait time between upload attempts +- `RUN_COMPLETION_TIMEOUT_DAYS=16` - Max time to wait for run completion +- `STAGING_AREA_PATH` - Location for temporary files (defaults to `/usr/local/illumina/seq-run-uploads` on Illumina machines, `/tmp/seq-run-uploads` elsewhere) +- `RSYNC_RETRY_MAX_ATTEMPTS=12` - Maximum retry attempts for uploads +- `INCLUSION_TIME_INTERVAL_DAYS=7` - Age limit for runs to be considered for upload + +## Usage Patterns + +**Main upload script:** +```bash +./incremental_illumina_upload_to_gs.sh /path/to/run gs://bucket-prefix +``` + +**Monitoring script:** +```bash +./monitor_runs.sh /path/to/monitored-directory gs://bucket-prefix +``` + +**Simulation script (for testing):** +```bash +./simulate_sequencer_write.sh /path/to/actual_run /path/to/simulated_run +``` + +## Important Implementation Details + +- **Excluded Directories**: The upload excludes large non-essential directories: `Thumbnail_Images`, `Images`, `FocusModelGeneration`, `Autocenter`, `InstrumentAnalyticsLogs`, `Logs` +- **Individual Files**: `SampleSheet.csv` and `RunInfo.xml` are uploaded separately before tarball creation +- **Run Completion Detection**: Looks for `RTAComplete.txt` or `RTAComplete.xml` files +- **Tarball Extraction**: Resulting tarballs must be extracted with GNU tar using `--ignore-zeros` +- **NFS Support**: Uses `--no-check-device` flag for NFS mounted storage +- **Platform Detection**: Automatically detects Illumina machines vs other environments +- **Cleanup**: Removes local incremental tarballs after successful upload + +## Cron Integration + +The monitoring script is designed to work with cron scheduling. Example crontab entry: +``` +@hourly ~/monitor_runs.sh /usr/local/illumina/runs gs://bucket/flowcells >> ~/upload_monitor.log +``` + +## File Paths + +Staging areas: +- Illumina machines: `/usr/local/illumina/seq-run-uploads` +- Other systems: `/tmp/seq-run-uploads` + +Run detection based on presence of `RunInfo.xml` files in monitored directories. 
\ No newline at end of file From 2eeb65d37a79a52aacb1dd46cba97fc2998bc6cf Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 19:38:50 -0400 Subject: [PATCH 03/14] do not trim EOF blocks from final incremental tarball do not trim EOF blocks from final incremental tarball that mops up the remaining changed files --- incremental_illumina_upload_to_gs.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 8b7415b..fbf5d9d 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -161,8 +161,9 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # '--blocking-factor=1' prevents extra zero-padding blocks for efficient concatenation # '--sparse' consolidates runs of zeros in input files # '--label' adds human-readable note with run ID - # 'head --bytes -1024' trims EOF blocks (two 512-byte blocks) before gzip compression + # 'head --bytes -1024' trims EOF blocks for incremental tarballs; final tarball preserves EOF blocks if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi + if [[ "$run_is_finished" == 'true' ]]; then EOF_PROCESSOR="cat"; else EOF_PROCESSOR="head --bytes -1024"; fi $TAR_BIN --exclude='Thumbnail_Images' --exclude="Images" --exclude "FocusModelGeneration" --exclude='Autocenter' --exclude='InstrumentAnalyticsLogs' --exclude "Logs" \ --create \ --blocking-factor=1 \ @@ -170,7 +171,7 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. --label="${RUN_BASENAME}" \ $SHOULD_CHECK_DEVICE_STR \ --listed-incremental="${STAGING_AREA_PATH}/${RUN_BASENAME}/index" \ - -C "${PATH_TO_UPLOAD}" . | head --bytes -1024 | gzip > "${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" + -C "${PATH_TO_UPLOAD}" . 
| $EOF_PROCESSOR | gzip > "${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" # ------------------------------------------------------------------------- # # (WIP alternative to the above tar call) From c8ed187c59d418e29be3f67d22cccad9c9478da4 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:03:52 -0400 Subject: [PATCH 04/14] more claude config --- .claude/settings.local.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 42e34a5..883804c 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -1,7 +1,9 @@ { "permissions": { "allow": [ - "Bash(git checkout:*)" + "Bash(git checkout:*)", + "Bash(git fetch:*)", + "WebFetch(domain:www.gnu.org)" ], "deny": [], "ask": [] From b5fc8836583d77cf8fe19cb2b803c58a164e2e72 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:05:13 -0400 Subject: [PATCH 05/14] obtain metadata related to the upload and the uploader, and persist via in-band tar labels, plus a verbose json file alongside the README.txt in the destination bucket --- incremental_illumina_upload_to_gs.sh | 213 ++++++++++++++++++++++++++- monitor_runs.sh | 9 +- 2 files changed, 218 insertions(+), 4 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index fbf5d9d..7441a0b 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -23,6 +23,10 @@ if [[ "$#" -ne 2 ]]; then echo "This script creates incremental gzipped tarballs and syncs them" echo "to a single tarball in a GS bucket. The tarballs use optimized" echo "tar settings for efficient concatenation and standard extraction." + echo "Each tar chunk includes metadata labels with run info, timestamps," + echo "and machine details. Use 'tar --test-label -f chunk.tar.gz' to view" + echo "individual chunk labels, or 'tar -tvf combined.tar.gz | grep Volume'" + echo "to see all labels in the final composed archive." 
echo "" echo "Dependencies: GNU tar, google-cloud-sdk (with crcmod installed)" echo "" @@ -73,6 +77,7 @@ chunk_size_bytes=$(expr $CHUNK_SIZE_MB \* 1048576) # $CHUNK_SIZE_MB*1024^2 RUN_COMPLETION_TIMEOUT_SEC=$(expr $RUN_COMPLETION_TIMEOUT_DAYS \* 86400) size_at_last_check=0 +tar_increment_counter=0 TAR_BIN="tar" GSUTIL_CMD='gsutil' @@ -85,6 +90,201 @@ fi $GSUTIL_CMD version -l +# Function to detect external IP address using multiple methods +get_external_ip() { + local ip="" + + # Check if required tools are available + local has_dig=$(command -v dig &> /dev/null && echo "true" || echo "false") + local has_curl=$(command -v curl &> /dev/null && echo "true" || echo "false") + local has_awk=$(command -v awk &> /dev/null && echo "true" || echo "false") + + if [[ "$has_dig" == "true" && "$has_awk" == "true" ]]; then + # Method 1: Google DNS TXT record + ip=$(dig +short txt o-o.myaddr.l.google.com @ns1.google.com 2>/dev/null | awk -F'"' '{print $2}' | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + + # Method 2: OpenDNS + ip=$(dig +short myip.opendns.com @resolver1.opendns.com 2>/dev/null | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + + # Method 3: Cloudflare + ip=$(dig +short txt ch whoami.cloudflare @1.0.0.1 2>/dev/null | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + + # Method 4: Google DNS (alternative) + ip=$(dig +short txt o-o.myaddr.l.google.com @ns1.google.com 2>/dev/null | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + fi + + if [[ "$has_curl" == "true" ]]; then + # Method 5: AWS checkip + ip=$(curl -s --max-time 5 checkip.amazonaws.com 2>/dev/null | tr -d '\n\r"' | head -1) + if [[ -n "$ip" && "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$ip" + return + fi + fi + + # Fallback: try to get local interface IP + if command -v ip &> /dev/null; then + local local_ip=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' 2>/dev/null | head -1) + if [[ -n "$local_ip" && "$local_ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$local_ip" + return + fi + elif command -v route &> /dev/null; then + # macOS/BSD fallback using route command + local local_ip=$(route get 8.8.8.8 2>/dev/null | awk '/interface:/ {getline; if(/inet/) print $2}' | head -1) + if [[ -n "$local_ip" && "$local_ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "$local_ip" + return + fi + fi + + # Final fallback + echo "0.0.0.0" +} + +# Function to detect if running via cron +is_cron_execution() { + # Check if CRON_INVOKED is set by monitor_runs.sh + if [[ -n "$CRON_INVOKED" ]]; then + echo "$CRON_INVOKED" + return + fi + + # Detect based on environment characteristics + if [[ -z "$TERM" || "$TERM" == "dumb" ]] && [[ -z "$SSH_CLIENT" ]] && [[ -z "$SSH_TTY" ]]; then + echo "true" + else + echo "false" + fi +} + +# Function to generate enhanced tar label metadata +generate_tar_label() { + local run_basename="$1" + local increment_num="$2" + local timestamp_formatted=$(date +%y-%m-%dT%H:%M) + local hostname=$(hostname) + local username=$(whoami) + local external_ip=$(get_external_ip) + local is_cron=$(is_cron_execution) + + # GNU tar volume labels have a strict 99-byte limit, but support control characters + # Try compact JSON first, then fall back to pipe-separated format if too long + local 
json_metadata="{\"r\":\"${run_basename:0:15}\",\"t\":\"$timestamp_formatted\",\"i\":$increment_num,\"h\":\"${hostname:0:8}\",\"u\":\"${username:0:8}\",\"ip\":\"$external_ip\",\"c\":$([ "$is_cron" = "true" ] && echo 1 || echo 0)}" + + if [[ ${#json_metadata} -le 99 ]]; then + # Use JSON format (human and machine readable) + echo "$json_metadata" + else + # Fallback to pipe-separated format (very compact) + local pipe_format="${run_basename:0:15}|$timestamp_formatted|$increment_num|${hostname:0:8}|${username:0:8}|$external_ip|$([ "$is_cron" = "true" ] && echo 1 || echo 0)" + if [[ ${#pipe_format} -le 99 ]]; then + echo "$pipe_format" + else + # Last resort: compress with gzip and base64 encode + local compressed=$(echo "$json_metadata" | gzip | base64 | tr -d '\n') + local max_len=$((99 - 3)) # Reserve 3 chars for "gz:" prefix + echo "gz:${compressed:0:$max_len}" + fi + fi +} + +# Function to generate verbose metadata JSON file for the entire upload process +generate_verbose_metadata() { + local run_basename="$1" + local run_path="$2" + local destination_bucket="$3" + local start_time="$4" + local current_time=$(date +%s) + local timestamp_formatted=$(date -d "@$current_time" +%Y-%m-%dT%H:%M:%S%z 2>/dev/null || date -r "$current_time" +%Y-%m-%dT%H:%M:%S%z) + local start_timestamp=$(date -d "@$start_time" +%Y-%m-%dT%H:%M:%S%z 2>/dev/null || date -r "$start_time" +%Y-%m-%dT%H:%M:%S%z) + local hostname=$(hostname) + local username=$(whoami) + local external_ip=$(get_external_ip) + local is_cron=$(is_cron_execution) + local script_version=$(grep "^# version: " "$0" | head -1 | sed 's/^# version: //' || echo "unknown") + local final_increment=$tar_increment_counter + local upload_duration=$((current_time - start_time)) + + # Get executable versions + local gsutil_version=$(gsutil version 2>/dev/null | head -1 | awk '{print $3}' || echo "unknown") + local gcloud_version=$(gcloud version --format="value(Google Cloud SDK)" 2>/dev/null || echo "unknown") + local tar_version=$($TAR_BIN --version 2>/dev/null | head -1 || echo "unknown") + local bash_version=$($BASH --version 2>/dev/null | head -1 || echo "$BASH_VERSION") + + # Get run size information + local run_size_bytes=0 + if [ -d "$run_path" ]; then + if [ "$(uname)" != "Darwin" ]; then + run_size_bytes=$(du -sb "$run_path" 2>/dev/null | cut -f1 || echo 0) + else + run_size_bytes=$(du -sk "$run_path" 2>/dev/null | awk '{print $1 * 1024}' || echo 0) + fi + fi + + # Create comprehensive metadata JSON + cat << EOF +{ + "upload_metadata": { + "run_basename": "$run_basename", + "run_path": "$run_path", + "destination_bucket": "$destination_bucket", + "upload_start_time": "$start_timestamp", + "upload_completion_time": "$timestamp_formatted", + "upload_duration_seconds": $upload_duration, + "total_increments": $final_increment, + "run_size_bytes": $run_size_bytes, + "cron_invoked": $is_cron + }, + "uploading_machine_info": { + "hostname": "$hostname", + "username": "$username", + "external_ip": "$external_ip", + "operating_system": "$(uname -s)", + "architecture": "$(uname -m)", + "script_version": "$script_version" + }, + "tool_versions": { + "gsutil_version": "$gsutil_version", + "gcloud_version": "$gcloud_version", + "tar_version": "$tar_version", + "bash_version": "$bash_version" + }, + "environment_variables": { + "chunk_size_mb": $CHUNK_SIZE_MB, + "delay_between_increments_sec": $DELAY_BETWEEN_INCREMENTS_SEC, + "run_completion_timeout_days": $RUN_COMPLETION_TIMEOUT_DAYS, + "staging_area_path": "$STAGING_AREA_PATH", + "source_path_is_on_nfs": 
"$SOURCE_PATH_IS_ON_NFS" + }, + "tar_settings": { + "tar_binary": "$TAR_BIN", + "blocking_factor": 1, + "sparse_enabled": true, + "eof_trimming": "incremental_only", + "excluded_directories": ["Thumbnail_Images", "Images", "FocusModelGeneration", "Autocenter", "InstrumentAnalyticsLogs", "Logs"] + }, + "generation_timestamp": "$timestamp_formatted" +} +EOF +} + # if the run does not already exist on the destination, commence upload process... if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" &> /dev/null; then START_TIME=$(date +%s) @@ -152,6 +352,12 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. echo "commencing sync on latest data" size_at_last_check=$current_size timestamp=$(date +%s) # intentionally called before tar so time is a little older + + # increment counter for this tarball + tar_increment_counter=$((tar_increment_counter + 1)) + + # generate enhanced tar label with metadata + tar_label=$(generate_tar_label "$RUN_BASENAME" "$tar_increment_counter") # increate incremental tarballs # see: https://www.gnu.org/software/tar/manual/html_node/Incremental-Dumps.html @@ -160,7 +366,7 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # '-C "${PATH_TO_UPLOAD}" ." so we don't store the full path (-C is cd) # '--blocking-factor=1' prevents extra zero-padding blocks for efficient concatenation # '--sparse' consolidates runs of zeros in input files - # '--label' adds human-readable note with run ID + # '--label' adds enhanced metadata (JSON or pipe-separated format within 99-byte tar limit) # 'head --bytes -1024' trims EOF blocks for incremental tarballs; final tarball preserves EOF blocks if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi if [[ "$run_is_finished" == 'true' ]]; then EOF_PROCESSOR="cat"; else EOF_PROCESSOR="head --bytes -1024"; fi @@ -168,7 +374,7 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. --create \ --blocking-factor=1 \ --sparse \ - --label="${RUN_BASENAME}" \ + --label="$tar_label" \ $SHOULD_CHECK_DEVICE_STR \ --listed-incremental="${STAGING_AREA_PATH}/${RUN_BASENAME}/index" \ -C "${PATH_TO_UPLOAD}" . | $EOF_PROCESSOR | gzip > "${STAGING_AREA_PATH}/${RUN_BASENAME}/${timestamp}_part-1.tar.gz" @@ -257,6 +463,9 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # create a note about the tarball echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." 
| gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" + + # create and upload verbose metadata JSON file + generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" # if only the index file is present, remove it if [[ $(ls -1 "${STAGING_AREA_PATH}/${RUN_BASENAME}" | wc -l) -eq 1 ]]; then diff --git a/monitor_runs.sh b/monitor_runs.sh index 3cd4bab..1d2abc3 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -6,7 +6,7 @@ # depends on: # google-cloud-sdk # pstree (separate install on mac, 'brew install pstree') -# IMPORTANT: resulting tarball must be extracted with GNU tar and "--ignore-zeros" specified +# Uses optimized tar settings (--blocking-factor=1, --sparse, EOF trimming) for efficient concatenation if [[ "$#" -ne 2 ]]; then echo "--------------------------------------------------------------------" @@ -99,7 +99,12 @@ while true; do upload_cmd="${SCRIPTPATH}/incremental_illumina_upload_to_gs.sh ${found_dir} ${DESTINATION_BUCKET_PREFIX}" echo " ${upload_cmd}" # fork incremental upload to separate process - (STAGING_AREA_PATH="${STAGING_AREA_PATH}" ${upload_cmd}) & + # pass cron detection info via environment variable + if [[ $CRON -gt 0 ]]; then + (STAGING_AREA_PATH="${STAGING_AREA_PATH}" CRON_INVOKED="true" ${upload_cmd}) & + else + (STAGING_AREA_PATH="${STAGING_AREA_PATH}" CRON_INVOKED="false" ${upload_cmd}) & + fi else echo "Skipping initiation of new upload (upload in progress): ${STAGING_AREA_PATH}/${RUN_BASENAME}" fi From 8be1cc3e725052c455b4f39af5decb64b63a4fae Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:10:47 -0400 Subject: [PATCH 06/14] use last component of run basename for short-form representation, rather than just truncating it --- incremental_illumina_upload_to_gs.sh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 7441a0b..42b1d58 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -184,16 +184,30 @@ generate_tar_label() { local external_ip=$(get_external_ip) local is_cron=$(is_cron_execution) + # Intelligently shorten run basename: extract last component after underscores, then truncate if needed + local short_run_basename="$run_basename" + if [[ "$run_basename" =~ _ ]]; then + # Extract last component after final underscore + short_run_basename="${run_basename##*_}" + # If still too long, truncate to 15 chars + if [[ ${#short_run_basename} -gt 15 ]]; then + short_run_basename="${short_run_basename:0:15}" + fi + else + # No underscores, just truncate + short_run_basename="${run_basename:0:15}" + fi + # GNU tar volume labels have a strict 99-byte limit, but support control characters # Try compact JSON first, then fall back to pipe-separated format if too long - local json_metadata="{\"r\":\"${run_basename:0:15}\",\"t\":\"$timestamp_formatted\",\"i\":$increment_num,\"h\":\"${hostname:0:8}\",\"u\":\"${username:0:8}\",\"ip\":\"$external_ip\",\"c\":$([ "$is_cron" = "true" ] && echo 1 || echo 0)}" + local json_metadata="{\"r\":\"$short_run_basename\",\"t\":\"$timestamp_formatted\",\"i\":$increment_num,\"h\":\"${hostname:0:8}\",\"u\":\"${username:0:8}\",\"ip\":\"$external_ip\",\"c\":$([ "$is_cron" = "true" ] && echo 1 || echo 0)}" if [[ ${#json_metadata} -le 99 ]]; then # Use JSON format (human and 
machine readable) echo "$json_metadata" else # Fallback to pipe-separated format (very compact) - local pipe_format="${run_basename:0:15}|$timestamp_formatted|$increment_num|${hostname:0:8}|${username:0:8}|$external_ip|$([ "$is_cron" = "true" ] && echo 1 || echo 0)" + local pipe_format="$short_run_basename|$timestamp_formatted|$increment_num|${hostname:0:8}|${username:0:8}|$external_ip|$([ "$is_cron" = "true" ] && echo 1 || echo 0)" if [[ ${#pipe_format} -le 99 ]]; then echo "$pipe_format" else From 4bc2a92895f9a1b6319b5008555ef888e103de67 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:35:16 -0400 Subject: [PATCH 07/14] migrate Google Cloud CLI calls from `gsutil` -> `gcloud storage` migrate Google Cloud CLI calls from `gsutil` -> `gcloud storage`. Mostly a drop-in replacement, except for `gsutil compose` which is now `gcloud storage objects compose`. --- .claude/settings.local.json | 3 +- CLAUDE.md | 4 +- incremental_illumina_upload_to_gs.sh | 63 +++++++++++++++------------- monitor_runs.sh | 7 ++-- 4 files changed, 41 insertions(+), 36 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 883804c..1e1c3e6 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -3,7 +3,8 @@ "allow": [ "Bash(git checkout:*)", "Bash(git fetch:*)", - "WebFetch(domain:www.gnu.org)" + "WebFetch(domain:www.gnu.org)", + "WebSearch" ], "deny": [], "ask": [] diff --git a/CLAUDE.md b/CLAUDE.md index 81371f4..8abfc71 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,13 +16,13 @@ This repository contains bash scripts for incremental upload of Illumina sequenc **Key Components:** - **Incremental Archiving**: Uses GNU tar with `--listed-incremental` to create incremental backups - **Chunked Uploads**: Splits large runs into manageable chunks (default 100MB) with retry logic -- **GS Composition**: Uses `gsutil compose` to merge incremental tarballs into single archives +- **GS Composition**: Uses `gcloud storage objects compose` to merge incremental tarballs into single archives - **Cross-platform Support**: Handles differences between Linux (Illumina sequencers) and macOS ## Dependencies Required tools that must be available: -- `gsutil` (Google Cloud SDK) +- `gcloud storage` (Google Cloud SDK) - `tar` (GNU tar, installed as `gtar` on macOS) - `pstree` (for monitoring script, installed via `brew install pstree` on macOS) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 42b1d58..1e39551 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -28,7 +28,7 @@ if [[ "$#" -ne 2 ]]; then echo "individual chunk labels, or 'tar -tvf combined.tar.gz | grep Volume'" echo "to see all labels in the final composed archive." 
echo "" - echo "Dependencies: GNU tar, google-cloud-sdk (with crcmod installed)" + echo "Dependencies: GNU tar, google-cloud-sdk (gcloud CLI with storage commands)" echo "" echo "Usage: $(basename $0) /path/of/run_to_upload gs://bucket-prefix" echo "--------------------------------------------------------------------" @@ -80,15 +80,16 @@ size_at_last_check=0 tar_increment_counter=0 TAR_BIN="tar" -GSUTIL_CMD='gsutil' +GCLOUD_STORAGE_CMD='gcloud storage' if [ "$(uname)" == "Darwin" ]; then TAR_BIN="gtar" # GNU tar must be installed and available on the path as gtar #export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=true # workaround for https://bugs.python.org/issue33725 - GSUTIL_CMD='gsutil -o "GSUtil:parallel_process_count=1"' + # Note: gcloud storage handles parallelization automatically, no manual tuning needed + GCLOUD_STORAGE_CMD='gcloud storage' fi -$GSUTIL_CMD version -l +gcloud version # Function to detect external IP address using multiple methods get_external_ip() { @@ -237,8 +238,12 @@ generate_verbose_metadata() { local upload_duration=$((current_time - start_time)) # Get executable versions - local gsutil_version=$(gsutil version 2>/dev/null | head -1 | awk '{print $3}' || echo "unknown") - local gcloud_version=$(gcloud version --format="value(Google Cloud SDK)" 2>/dev/null || echo "unknown") + local gcloud_version=$(gcloud version --format='value("Google Cloud SDK")' 2>/dev/null || echo "unknown") + local gcloud_storage_version=$(gcloud components list --filter="id:gcloud-storage" --format="value(version.string)" 2>/dev/null) + # If gcloud storage version is empty (bundled), fall back to main gcloud version + if [[ -z "$gcloud_storage_version" ]]; then + gcloud_storage_version="$gcloud_version" + fi local tar_version=$($TAR_BIN --version 2>/dev/null | head -1 || echo "unknown") local bash_version=$($BASH --version 2>/dev/null | head -1 || echo "$BASH_VERSION") @@ -275,8 +280,8 @@ generate_verbose_metadata() { "script_version": "$script_version" }, "tool_versions": { - "gsutil_version": "$gsutil_version", "gcloud_version": "$gcloud_version", + "gcloud_storage_version": "$gcloud_storage_version", "tar_version": "$tar_version", "bash_version": "$bash_version" }, @@ -300,7 +305,7 @@ EOF } # if the run does not already exist on the destination, commence upload process... -if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" &> /dev/null; then +if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" &> /dev/null; then START_TIME=$(date +%s) echo "Does not already exist in bucket: ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" @@ -331,8 +336,8 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. file_basename="$(basename ${filename})" file_extension="${file_basename#*.}" file_basename_no_ext="${file_basename%%.*}" - if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" &> /dev/null; then - $GSUTIL_CMD cp "${PATH_TO_UPLOAD}/${filename}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" + if ! 
$GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" &> /dev/null; then + $GCLOUD_STORAGE_CMD cp "${PATH_TO_UPLOAD}/${filename}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" else echo "Already exists in bucket; skipping upload: ${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}_${file_basename}" fi @@ -414,17 +419,15 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # try (and retry) to rsync incremental tarballs to bucket retry_count=0 until [ "$retry_count" -ge $RSYNC_RETRY_MAX_ATTEMPTS ]; do - # -m parallel uploads - # -c Causes the rsync command to compute and compare checksums + # --checksums-only: Causes the rsync command to compute and compare checksums # (instead of comparing mtime) for files if the size of source - # and destination match. - # -C If an error occurs, continue to attempt to copy the remaining - # files. If errors occurred, gsutil's exit status will be - # non-zero even if this flag is set. This option is implicitly - # set when running "gsutil -m rsync..." (included below in case '-m' is removed). + # and destination match. (gcloud storage handles parallelization automatically) + # --continue-on-error: If an error occurs, continue to attempt to copy the remaining + # files. If errors occurred, gcloud storage's exit status will be + # non-zero even if this flag is set. gcloud storage handles parallelization automatically. # - # see: https://cloud.google.com/storage/docs/gsutil/commands/rsync - $GSUTIL_CMD rsync -cC -x '.*index$' "${STAGING_AREA_PATH}/${RUN_BASENAME}/" "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts" && break + # see: https://cloud.google.com/sdk/gcloud/reference/storage/rsync + $GCLOUD_STORAGE_CMD rsync --checksums-only --continue-on-error --exclude='.*index$' "${STAGING_AREA_PATH}/${RUN_BASENAME}/" "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts" && break retry_count=$((retry_count+1)) #sleep $RSYNC_RETRY_DELAY_SEC sleep $(expr $RSYNC_RETRY_DELAY_SEC \* $retry_count) # each retry scale delay by a multiple of the count @@ -435,8 +438,8 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. for incremental_tarball in $(find "${STAGING_AREA_PATH}/${RUN_BASENAME}" -type f -name "*.tar.gz"); do # if the local incremental tarball has indeed been synced # remove the local copy of it... - if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/$(basename ${incremental_tarball})"; then - $GSUTIL_CMD cp "${incremental_tarball}" "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/$(basename ${incremental_tarball})" && rm "${incremental_tarball}" + if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/$(basename ${incremental_tarball})"; then + $GCLOUD_STORAGE_CMD cp "${incremental_tarball}" "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/$(basename ${incremental_tarball})" && rm "${incremental_tarball}" else rm "${incremental_tarball}" fi @@ -452,11 +455,11 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. done # make sure the composed tarball does not exist on GS; if it does not... - if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz"; then + if ! 
$GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz"; then # get the archive started with a blank file dummyfile="${STAGING_AREA_PATH}/${RUN_BASENAME}/dummyfile.tar.gz" touch $dummyfile - $GSUTIL_CMD cp "${dummyfile}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" + $GCLOUD_STORAGE_CMD cp "${dummyfile}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" rm "${dummyfile}" fi @@ -465,21 +468,21 @@ if ! $GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}. # append the first 31 incremental tarballs # to the main tarball, then remove the incremental tarballs # keep doing this until there are no more incremental tarballs - # see: https://cloud.google.com/storage/docs/gsutil/commands/compose - until [[ "$($GSUTIL_CMD du ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/'*.tar.gz' 2> /dev/null | wc -l | awk '{print $1}' || echo '0')" == "0" ]]; do - first_files=$($GSUTIL_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/"'*.tar.gz' | sort -V | head -n 31 | tr '\n' ' ') + # see: https://cloud.google.com/storage/docs/composing-objects#create-composite-cli + until [[ "$($GCLOUD_STORAGE_CMD du ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/'*.tar.gz' 2> /dev/null | wc -l | awk '{print $1}' || echo '0')" == "0" ]]; do + first_files=$($GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/"'*.tar.gz' | sort -V | head -n 31 | tr '\n' ' ') if [ ${#first_files} -ge 0 ]; then - $GSUTIL_CMD compose "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" \ + $GCLOUD_STORAGE_CMD objects compose "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" \ ${first_files} \ - "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" && sleep 10 && $GSUTIL_CMD rm ${first_files} + "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" && sleep 10 && $GCLOUD_STORAGE_CMD rm ${first_files} fi done # create a note about the tarball - echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" + echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." 
| $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" # create and upload verbose metadata JSON file - generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | gsutil cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" + generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" # if only the index file is present, remove it if [[ $(ls -1 "${STAGING_AREA_PATH}/${RUN_BASENAME}" | wc -l) -eq 1 ]]; then diff --git a/monitor_runs.sh b/monitor_runs.sh index 1d2abc3..8238f04 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -66,10 +66,11 @@ INCLUSION_TIME_INTERVAL_DAYS=${INCLUSION_TIME_INTERVAL_DAYS:-'7'} DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'10'} STAGING_AREA_PATH="${STAGING_AREA_PATH:-$DEFAULT_STAGING_AREA}" -GSUTIL_CMD='gsutil' +GCLOUD_STORAGE_CMD='gcloud storage' if [ "$(uname)" == "Darwin" ]; then #export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=true # workaround for https://bugs.python.org/issue33725 - GSUTIL_CMD='gsutil -o "GSUtil:parallel_process_count=1"' + # Note: gcloud storage handles parallelization automatically, no manual tuning needed + GCLOUD_STORAGE_CMD='gcloud storage' fi echo "Location for temp files: ${STAGING_AREA_PATH}" @@ -91,7 +92,7 @@ while true; do RUN_BASENAME="$(basename ${found_dir})" # if the run does not already exist on the destination, commence upload process... RUN_BUCKET_PATH="${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" - if ! $GSUTIL_CMD ls "${RUN_BUCKET_PATH}" &> /dev/null; then + if ! $GCLOUD_STORAGE_CMD ls "${RUN_BUCKET_PATH}" &> /dev/null; then echo "Run does not exist in bucket: ${RUN_BUCKET_PATH}" if ! [ -d "${STAGING_AREA_PATH}/${RUN_BASENAME}" ]; then echo "Run upload not yet in progress; no dir: ${STAGING_AREA_PATH}/${RUN_BASENAME}" From 51b7c47a70c9cd5ac0f25e280159f398e1d3257e Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:54:30 -0400 Subject: [PATCH 08/14] eliminate unnecessary repetition of path construction for tarball destination (store in `FINAL_TARBALL_PATH `) --- incremental_illumina_upload_to_gs.sh | 19 +++++++++++-------- monitor_runs.sh | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 1e39551..6a8bbc0 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -304,11 +304,14 @@ generate_verbose_metadata() { EOF } +# Define the final tarball path once to avoid repetition +FINAL_TARBALL_PATH="${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz" + # if the run does not already exist on the destination, commence upload process... -if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" &> /dev/null; then +if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then START_TIME=$(date +%s) - echo "Does not already exist in bucket: ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" + echo "Does not already exist in bucket: $FINAL_TARBALL_PATH" # quit if the run is stale based on mtime of RunInfo.xml if [ "$(uname)" != "Darwin" ]; then @@ -455,11 +458,11 @@ if ! 
$GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BA done # make sure the composed tarball does not exist on GS; if it does not... - if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz"; then + if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH"; then # get the archive started with a blank file dummyfile="${STAGING_AREA_PATH}/${RUN_BASENAME}/dummyfile.tar.gz" touch $dummyfile - $GCLOUD_STORAGE_CMD cp "${dummyfile}" "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" + $GCLOUD_STORAGE_CMD cp "${dummyfile}" "$FINAL_TARBALL_PATH" rm "${dummyfile}" fi @@ -472,14 +475,14 @@ if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BA until [[ "$($GCLOUD_STORAGE_CMD du ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/'*.tar.gz' 2> /dev/null | wc -l | awk '{print $1}' || echo '0')" == "0" ]]; do first_files=$($GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/parts/"'*.tar.gz' | sort -V | head -n 31 | tr '\n' ' ') if [ ${#first_files} -ge 0 ]; then - $GCLOUD_STORAGE_CMD objects compose "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" \ + $GCLOUD_STORAGE_CMD objects compose "$FINAL_TARBALL_PATH" \ ${first_files} \ - "${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/$RUN_BASENAME.tar.gz" && sleep 10 && $GCLOUD_STORAGE_CMD rm ${first_files} + "$FINAL_TARBALL_PATH" && sleep 10 && $GCLOUD_STORAGE_CMD rm ${first_files} fi done # create a note about the tarball - echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands." | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" + echo "$RUN_BASENAME.tar.gz created using optimized tar settings for efficient concatenation. Can be extracted with standard tar commands. The $RUN_BASENAME.terra.tsv file can be used to add a row for this tarball to a table on Terra." | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.tar.gz.README.txt" # create and upload verbose metadata JSON file generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" @@ -492,6 +495,6 @@ if ! $GCLOUD_STORAGE_CMD ls "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BA # if staging dir is empty, remove it (rmdir only does this if empty). rmdir "${STAGING_AREA_PATH}/${RUN_BASENAME}" &> /dev/null else - echo "Exiting; already exists: ${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" + echo "Exiting; already exists: $FINAL_TARBALL_PATH" exit 0 fi diff --git a/monitor_runs.sh b/monitor_runs.sh index 8238f04..529a171 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -91,7 +91,8 @@ while true; do echo "Path is new enough to attempt an upload: ${found_dir}" RUN_BASENAME="$(basename ${found_dir})" # if the run does not already exist on the destination, commence upload process... - RUN_BUCKET_PATH="${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/${RUN_BASENAME}.tar.gz" + # Define the final tarball path to match the pattern used in incremental script + RUN_BUCKET_PATH="${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz" if ! $GCLOUD_STORAGE_CMD ls "${RUN_BUCKET_PATH}" &> /dev/null; then echo "Run does not exist in bucket: ${RUN_BUCKET_PATH}" if ! 
[ -d "${STAGING_AREA_PATH}/${RUN_BASENAME}" ]; then From ccc75c131c8548b2f5692abbaeb4efa3da65f3ff Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 21:55:39 -0400 Subject: [PATCH 09/14] create tsv file for appending a row to a Terra table for the uploaded run --- incremental_illumina_upload_to_gs.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 6a8bbc0..3bad90e 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -304,6 +304,18 @@ generate_verbose_metadata() { EOF } +# Function to generate Terra-compatible TSV file for data table import +generate_terra_tsv() { + local run_basename="$1" + local tarball_path="$2" + + # Create TSV with POSIX line endings (LF only) + cat << EOF +entity:flowcell_id biosample_attributes flowcell_tar samplesheets sample_rename_map_tsv +$run_basename $tarball_path +EOF +} + # Define the final tarball path once to avoid repetition FINAL_TARBALL_PATH="${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz" @@ -486,6 +498,9 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then # create and upload verbose metadata JSON file generate_verbose_metadata "$RUN_BASENAME" "$PATH_TO_UPLOAD" "$DESTINATION_BUCKET_PREFIX" "$START_TIME" | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.upload_metadata.json" + + # create and upload Terra-compatible TSV file + generate_terra_tsv "$RUN_BASENAME" "$FINAL_TARBALL_PATH" | $GCLOUD_STORAGE_CMD cp - "${DESTINATION_BUCKET_PREFIX}/$RUN_BASENAME/$RUN_BASENAME.terra.tsv" # if only the index file is present, remove it if [[ $(ls -1 "${STAGING_AREA_PATH}/${RUN_BASENAME}" | wc -l) -eq 1 ]]; then From 5471ff635f84d5665cc434f3a3f2c828fac75f2d Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 22:01:03 -0400 Subject: [PATCH 10/14] parameterize name included in tsv for Terra table row upsert --- CLAUDE.md | 1 + incremental_illumina_upload_to_gs.sh | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 8abfc71..7e87304 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,6 +35,7 @@ Key configuration variables (with defaults): - `STAGING_AREA_PATH` - Location for temporary files (defaults to `/usr/local/illumina/seq-run-uploads` on Illumina machines, `/tmp/seq-run-uploads` elsewhere) - `RSYNC_RETRY_MAX_ATTEMPTS=12` - Maximum retry attempts for uploads - `INCLUSION_TIME_INTERVAL_DAYS=7` - Age limit for runs to be considered for upload +- `TERRA_RUN_TABLE_NAME=flowcell` - Table name for Terra TSV file generation (creates `entity:{table_name}_id` column) ## Usage Patterns diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 3bad90e..e161a33 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -59,6 +59,7 @@ RUN_BASENAME="$(basename ${PATH_TO_UPLOAD})" STAGING_AREA_PATH="${STAGING_AREA_PATH:-$DEFAULT_STAGING_AREA}" RSYNC_RETRY_MAX_ATTEMPTS=${RSYNC_RETRY_MAX_ATTEMPTS:-"12"} RSYNC_RETRY_DELAY_SEC=${RSYNC_RETRY_DELAY_SEC:-"600"} +TERRA_RUN_TABLE_NAME=${TERRA_RUN_TABLE_NAME:-"flowcell"} # ------------------------------- @@ -311,7 +312,7 @@ generate_terra_tsv() { # Create TSV with POSIX line endings (LF only) cat << EOF -entity:flowcell_id biosample_attributes flowcell_tar samplesheets sample_rename_map_tsv +entity:${TERRA_RUN_TABLE_NAME}_id biosample_attributes flowcell_tar 
samplesheets sample_rename_map_tsv $run_basename $tarball_path EOF } From bd0e979638c0139a5356497cfcdcbed1798dd133 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 22:21:19 -0400 Subject: [PATCH 11/14] add hard and optional dependency checking; revise cron detection to be more involved and hopefully cross-platform (GNU/Linux vs macOS/BSD) --- .claude/settings.local.json | 3 +- incremental_illumina_upload_to_gs.sh | 116 ++++++++++++++++++++++++++- monitor_runs.sh | 77 +++++++++++++++++- 3 files changed, 192 insertions(+), 4 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 1e1c3e6..898fdf1 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -4,7 +4,8 @@ "Bash(git checkout:*)", "Bash(git fetch:*)", "WebFetch(domain:www.gnu.org)", - "WebSearch" + "WebSearch", + "WebFetch(domain:man7.org)" ], "deny": [], "ask": [] diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index e161a33..21e929d 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -61,6 +61,82 @@ RSYNC_RETRY_MAX_ATTEMPTS=${RSYNC_RETRY_MAX_ATTEMPTS:-"12"} RSYNC_RETRY_DELAY_SEC=${RSYNC_RETRY_DELAY_SEC:-"600"} TERRA_RUN_TABLE_NAME=${TERRA_RUN_TABLE_NAME:-"flowcell"} +# ------------------------------- +# Dependency checking +# ------------------------------- + +# Hard dependencies - script will exit if any are missing +HARD_DEPENDENCIES=(gcloud tar date basename mkdir rm find head wc cut awk sort tr expr du uname realpath stat touch cat gzip base64 sed ls whoami hostname ps) + +# Optional dependencies - script will work without them but with reduced functionality +OPTIONAL_DEPENDENCIES=(dig curl ip route grep pstree) + +# Check hard dependencies +echo "Checking required dependencies..." +missing_deps=() +for dependency in "${HARD_DEPENDENCIES[@]}"; do + if ! command -v "$dependency" &> /dev/null; then + missing_deps+=("$dependency") + fi +done + +if [[ ${#missing_deps[@]} -gt 0 ]]; then + echo "ERROR! Missing required dependencies. Aborting..." + echo "The following commands need to be installed and available on PATH:" + for dep in "${missing_deps[@]}"; do + echo " - $dep" + done + echo "" + echo "Please install the missing dependencies and try again." + exit 1 +fi + +# Check and track optional dependencies +echo "Checking optional dependencies..." +available_optional_deps=() +for dependency in "${OPTIONAL_DEPENDENCIES[@]}"; do + if command -v "$dependency" &> /dev/null; then + available_optional_deps+=("$dependency") + fi +done + +if [[ ${#available_optional_deps[@]} -eq 0 ]]; then + echo "Warning: No optional tools available. Some features may have reduced functionality." +else + echo "Available optional tools: ${available_optional_deps[*]}" + + # Check specific functionality + ip_tools=(dig curl ip route) + available_ip_tools=() + for tool in "${ip_tools[@]}"; do + if [[ " ${available_optional_deps[*]} " =~ " ${tool} " ]]; then + available_ip_tools+=("$tool") + fi + done + + if [[ ${#available_ip_tools[@]} -eq 0 ]]; then + echo "Warning: No external IP detection tools available. External IP will be set to 0.0.0.0" + fi + + if [[ " ${available_optional_deps[*]} " =~ " pstree " ]]; then + echo "Note: pstree available for enhanced cron detection" + else + echo "Note: Using ps fallback for cron detection (pstree not available)" + fi +fi + +# Check for GNU tar specifically on macOS +if [ "$(uname)" == "Darwin" ]; then + if ! 
command -v gtar &> /dev/null; then + echo "ERROR! macOS detected but GNU tar (gtar) is not available." + echo "Please install GNU tar: brew install gnu-tar" + exit 1 + fi +fi + +echo "All required dependencies satisfied." +echo "" + # ------------------------------- function cleanup(){ @@ -168,8 +244,44 @@ is_cron_execution() { return fi - # Detect based on environment characteristics - if [[ -z "$TERM" || "$TERM" == "dumb" ]] && [[ -z "$SSH_CLIENT" ]] && [[ -z "$SSH_TTY" ]]; then + # Use same cron detection logic as monitor_runs.sh + local cron_detected=0 + if command -v pstree &> /dev/null; then + # Use pstree if available + if pstree -s $$ 2>/dev/null | grep -q cron 2>/dev/null; then + cron_detected=1 + else + cron_detected=0 + fi + else + # Fallback using ps - works on both GNU/Linux and macOS/BSD + if [ "$(uname)" == "Darwin" ]; then + # macOS/BSD ps format - check current and parent processes + local current_pid=$$ + while [[ $current_pid -ne 1 ]]; do + if ps -o comm= -p $current_pid 2>/dev/null | grep -q cron; then + cron_detected=1 + break + fi + local parent_pid=$(ps -o ppid= -p $current_pid 2>/dev/null | tr -d ' ') + [[ -z "$parent_pid" || "$parent_pid" == "0" || "$parent_pid" == "1" ]] && break + current_pid=$parent_pid + done + else + # GNU/Linux ps format - trace up the process tree + local current_pid=$$ + while [[ $current_pid -ne 1 ]]; do + if ps -o comm= -p $current_pid 2>/dev/null | grep -q cron; then + cron_detected=1 + break + fi + current_pid=$(ps -o ppid= -p $current_pid 2>/dev/null | tr -d ' ' || echo 1) + [[ -z "$current_pid" || "$current_pid" == "0" ]] && break + done + fi + fi + + if [[ $cron_detected -gt 0 ]]; then echo "true" else echo "false" diff --git a/monitor_runs.sh b/monitor_runs.sh index 529a171..38cd420 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -30,6 +30,46 @@ fi set -x +# ------------------------------- +# Dependency checking +# ------------------------------- + +# Hard dependencies - script will exit if any are missing +MONITOR_DEPENDENCIES=(gcloud find sort realpath basename grep ps) + +echo "Checking required dependencies for monitor script..." +missing_deps=() +for dependency in "${MONITOR_DEPENDENCIES[@]}"; do + if ! command -v "$dependency" &> /dev/null; then + missing_deps+=("$dependency") + fi +done + +if [[ ${#missing_deps[@]} -gt 0 ]]; then + echo "ERROR! Missing required dependencies. Aborting..." + echo "The following commands need to be installed and available on PATH:" + for dep in "${missing_deps[@]}"; do + echo " - $dep" + done + echo "" + echo "Please install the missing dependencies and try again." + exit 1 +fi + +# Check for optional pstree (preferred for cron detection) +if command -v pstree &> /dev/null; then + echo "Using pstree for cron detection." + CRON_DETECTION_METHOD="pstree" +else + echo "pstree not available, using ps fallback for cron detection." + CRON_DETECTION_METHOD="ps" +fi + +echo "All required dependencies satisfied for monitor script." 
+echo "" + +# ------------------------------- + function absolute_path() { local SOURCE="$1" while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink @@ -76,7 +116,42 @@ fi echo "Location for temp files: ${STAGING_AREA_PATH}" # detect if running via cron, and only run infinitely if not running via cron -CRON="$( pstree -s $$ | grep -c cron )" +if [[ "$CRON_DETECTION_METHOD" == "pstree" ]]; then + if pstree -s $$ 2>/dev/null | grep -q cron 2>/dev/null; then + CRON=1 + else + CRON=0 + fi +else + # Fallback using ps - works on both GNU/Linux and macOS/BSD + # Check if any parent process contains 'cron' in the command name + if [ "$(uname)" == "Darwin" ]; then + # macOS/BSD ps format - check current and parent processes + current_pid=$$ + CRON=0 + while [[ $current_pid -ne 1 ]]; do + if ps -o comm= -p $current_pid 2>/dev/null | grep -q cron; then + CRON=1 + break + fi + parent_pid=$(ps -o ppid= -p $current_pid 2>/dev/null | tr -d ' ') + [[ -z "$parent_pid" || "$parent_pid" == "0" || "$parent_pid" == "1" ]] && break + current_pid=$parent_pid + done + else + # GNU/Linux ps format - trace up the process tree + current_pid=$$ + CRON=0 + while [[ $current_pid -ne 1 ]]; do + if ps -o comm= -p $current_pid 2>/dev/null | grep -q cron; then + CRON=1 + break + fi + current_pid=$(ps -o ppid= -p $current_pid 2>/dev/null | tr -d ' ' || echo 1) + [[ -z "$current_pid" || "$current_pid" == "0" ]] && break + done + fi +fi while true; do echo "" echo "===" From c2a4af555f09a064103de6b0ae97214cc2c0c705 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 22:42:04 -0400 Subject: [PATCH 12/14] parameterize static exclusion list used to omit large debug/diagnostic data from run folder tarballs --- incremental_illumina_upload_to_gs.sh | 25 +++++++++++++++++++++---- monitor_runs.sh | 2 +- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh index 21e929d..a213b78 100755 --- a/incremental_illumina_upload_to_gs.sh +++ b/incremental_illumina_upload_to_gs.sh @@ -53,7 +53,7 @@ else fi CHUNK_SIZE_MB=${CHUNK_SIZE_MB:-'100'} -DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'30'} +DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'600'} RUN_COMPLETION_TIMEOUT_DAYS=${RUN_COMPLETION_TIMEOUT_DAYS:-'16'} RUN_BASENAME="$(basename ${PATH_TO_UPLOAD})" STAGING_AREA_PATH="${STAGING_AREA_PATH:-$DEFAULT_STAGING_AREA}" @@ -61,6 +61,15 @@ RSYNC_RETRY_MAX_ATTEMPTS=${RSYNC_RETRY_MAX_ATTEMPTS:-"12"} RSYNC_RETRY_DELAY_SEC=${RSYNC_RETRY_DELAY_SEC:-"600"} TERRA_RUN_TABLE_NAME=${TERRA_RUN_TABLE_NAME:-"flowcell"} +# Default directories to exclude from tar archives (large non-essential directories) +DEFAULT_TAR_EXCLUSIONS=("Thumbnail_Images" "Images" "FocusModelGeneration" "Autocenter" "InstrumentAnalyticsLogs" "Logs") +# Allow override via environment variable (space-separated list) +if [[ -n "$TAR_EXCLUSIONS" ]]; then + IFS=' ' read -ra TAR_EXCLUSIONS_ARRAY <<< "$TAR_EXCLUSIONS" +else + TAR_EXCLUSIONS_ARRAY=("${DEFAULT_TAR_EXCLUSIONS[@]}") +fi + # ------------------------------- # Dependency checking # ------------------------------- @@ -410,7 +419,7 @@ generate_verbose_metadata() { "blocking_factor": 1, "sparse_enabled": true, "eof_trimming": "incremental_only", - "excluded_directories": ["Thumbnail_Images", "Images", "FocusModelGeneration", "Autocenter", "InstrumentAnalyticsLogs", "Logs"] + "excluded_directories": [$(printf '"%s",' "${TAR_EXCLUSIONS_ARRAY[@]}" | sed 's/,$//')] }, 
"generation_timestamp": "$timestamp_formatted" } @@ -495,7 +504,7 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then run_is_finished=$([ -e "${PATH_TO_UPLOAD}/RTAComplete.txt" ] || [ -e "${PATH_TO_UPLOAD}/RTAComplete.xml" ] && echo "true" || echo "false") # if enough additional data has been added, or the run is complete, initiate incremental upload - if [[ $current_size -ge $(($size_at_last_check + $chunk_size_bytes)) ]] || [ "$run_is_finished" = 'true' ]; then + if [[ $current_size -ge $(($size_at_last_check + $chunk_size_bytes)) || "$run_is_finished" == "true" ]]; then echo "commencing sync on latest data" size_at_last_check=$current_size timestamp=$(date +%s) # intentionally called before tar so time is a little older @@ -517,7 +526,15 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then # 'head --bytes -1024' trims EOF blocks for incremental tarballs; final tarball preserves EOF blocks if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi if [[ "$run_is_finished" == 'true' ]]; then EOF_PROCESSOR="cat"; else EOF_PROCESSOR="head --bytes -1024"; fi - $TAR_BIN --exclude='Thumbnail_Images' --exclude="Images" --exclude "FocusModelGeneration" --exclude='Autocenter' --exclude='InstrumentAnalyticsLogs' --exclude "Logs" \ + if [[ -f "$EXCLUSIONS_FILE" && -s "$EXCLUSIONS_FILE" ]]; then EXCLUSION_STR="--exclude-from=$EXCLUSIONS_FILE"; else EXCLUSION_STR=""; fi + + # Build static exclusion arguments from array + STATIC_EXCLUSIONS=() + for exclusion in "${TAR_EXCLUSIONS_ARRAY[@]}"; do + STATIC_EXCLUSIONS+=("--exclude=$exclusion") + done + + $TAR_BIN "${STATIC_EXCLUSIONS[@]}" \ --create \ --blocking-factor=1 \ --sparse \ diff --git a/monitor_runs.sh b/monitor_runs.sh index 38cd420..8ba2e0d 100755 --- a/monitor_runs.sh +++ b/monitor_runs.sh @@ -103,7 +103,7 @@ PATH_TO_MONITOR="$(realpath ${PATH_TO_MONITOR})" DESTINATION_BUCKET_PREFIX="$2" INCLUSION_TIME_INTERVAL_DAYS=${INCLUSION_TIME_INTERVAL_DAYS:-'7'} -DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'10'} +DELAY_BETWEEN_INCREMENTS_SEC=${DELAY_BETWEEN_INCREMENTS_SEC:-'600'} STAGING_AREA_PATH="${STAGING_AREA_PATH:-$DEFAULT_STAGING_AREA}" GCLOUD_STORAGE_CMD='gcloud storage' From 695c0c1cfe1a881392331005d35684645d7cb41d Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 15 Aug 2025 22:43:22 -0400 Subject: [PATCH 13/14] use dynamic exclusions to avoid tarball bloat by discouraging tar from including multiple snapshots of large in-progress files use dynamic exclusions to avoid tarball bloat by discouraging tar from including multiple snapshots of large in-progress files; this addresses #4 --- .claude/settings.local.json | 3 +- CLAUDE.md | 6 ++- incremental_illumina_upload_to_gs.sh | 67 ++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 898fdf1..52905d5 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -5,7 +5,8 @@ "Bash(git fetch:*)", "WebFetch(domain:www.gnu.org)", "WebSearch", - "WebFetch(domain:man7.org)" + "WebFetch(domain:man7.org)", + "Bash(gh issue view:*)" ], "deny": [], "ask": [] diff --git a/CLAUDE.md b/CLAUDE.md index 7e87304..146171a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,12 +30,13 @@ Required tools that must be available: Key configuration variables (with defaults): - `CHUNK_SIZE_MB=100` - Size of incremental tar chunks -- `DELAY_BETWEEN_INCREMENTS_SEC=30` - 
 .claude/settings.local.json          |  3 +-
 CLAUDE.md                            |  6 ++-
 incremental_illumina_upload_to_gs.sh | 67 ++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 898fdf1..52905d5 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -5,7 +5,8 @@
       "Bash(git fetch:*)",
       "WebFetch(domain:www.gnu.org)",
       "WebSearch",
-      "WebFetch(domain:man7.org)"
+      "WebFetch(domain:man7.org)",
+      "Bash(gh issue view:*)"
     ],
     "deny": [],
     "ask": []
diff --git a/CLAUDE.md b/CLAUDE.md
index 7e87304..146171a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -30,12 +30,13 @@ Required tools that must be available:
 Key configuration variables (with defaults):

 - `CHUNK_SIZE_MB=100` - Size of incremental tar chunks
-- `DELAY_BETWEEN_INCREMENTS_SEC=30` - Wait time between upload attempts
+- `DELAY_BETWEEN_INCREMENTS_SEC=600` - Wait time between upload attempts (optimized to reduce tarball bloat from partial *.cbcl files)
 - `RUN_COMPLETION_TIMEOUT_DAYS=16` - Max time to wait for run completion
 - `STAGING_AREA_PATH` - Location for temporary files (defaults to `/usr/local/illumina/seq-run-uploads` on Illumina machines, `/tmp/seq-run-uploads` elsewhere)
 - `RSYNC_RETRY_MAX_ATTEMPTS=12` - Maximum retry attempts for uploads
 - `INCLUSION_TIME_INTERVAL_DAYS=7` - Age limit for runs to be considered for upload
 - `TERRA_RUN_TABLE_NAME=flowcell` - Table name for Terra TSV file generation (creates `entity:{table_name}_id` column)
+- `TAR_EXCLUSIONS` - Space-separated list of directories to exclude from tar archives (defaults to: "Thumbnail_Images Images FocusModelGeneration Autocenter InstrumentAnalyticsLogs Logs")

 ## Usage Patterns

@@ -56,7 +57,8 @@ Key configuration variables (with defaults):

 ## Important Implementation Details

-- **Excluded Directories**: The upload excludes large non-essential directories: `Thumbnail_Images`, `Images`, `FocusModelGeneration`, `Autocenter`, `InstrumentAnalyticsLogs`, `Logs`
+- **Excluded Directories**: The upload excludes large non-essential directories (configurable via `TAR_EXCLUSIONS` environment variable, defaults to: `Thumbnail_Images`, `Images`, `FocusModelGeneration`, `Autocenter`, `InstrumentAnalyticsLogs`, `Logs`)
+- **Dynamic Exclusions**: During active sequencing, automatically excludes the most recent cycle directory and recently modified files (within 3 minutes) to prevent tarball bloat from partial `*.cbcl` files. Exclusions are disabled for the final tarball when `RTAComplete.txt`/`RTAComplete.xml` is detected.
 - **Individual Files**: `SampleSheet.csv` and `RunInfo.xml` are uploaded separately before tarball creation
 - **Run Completion Detection**: Looks for `RTAComplete.txt` or `RTAComplete.xml` files
 - **Tarball Extraction**: Resulting tarballs must be extracted with GNU tar using `--ignore-zeros`
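
Note on the cycle-directory selection in the function below, with invented
lane/cycle names: given the directories

    Data/Intensities/BaseCalls/L001/C99.1
    Data/Intensities/BaseCalls/L001/C100.1
    Data/Intensities/BaseCalls/L002/C100.1

a reverse version sort places C100.1 paths ahead of C99.1 (numeric, not
lexicographic, comparison of the cycle number), so a pipeline along these
lines selects a newest-cycle path and then generalizes its lane component to
'L*' so the exclusion covers that cycle in every lane:

    find Data/Intensities/BaseCalls/L* -type d \
        -regextype posix-extended -regex '.+/C[0-9]+\.[0-9]+$' \
        | sort -rV | head -n1 \
        | sed -E 's|(BaseCalls/)L[0-9]+|\1L*|'

yielding 'Data/Intensities/BaseCalls/L*/C100.1' for the listing above.
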
diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh
index a213b78..cac81a3 100755
--- a/incremental_illumina_upload_to_gs.sh
+++ b/incremental_illumina_upload_to_gs.sh
@@ -438,6 +438,60 @@ $run_basename $tarball_path
 EOF
 }

+# Function to generate dynamic exclusions for incomplete cycles and recently modified files
+# Only generates exclusions when run is not finished to prevent tarball bloat from partial *.cbcl files
+generate_dynamic_exclusions() {
+    local run_path="$1"
+    local exclusions_file="$2"
+    local run_is_finished="$3"
+
+    # Only generate exclusions if run is not finished
+    if [[ "$run_is_finished" != 'true' ]]; then
+        # Clear the exclusions file
+        > "$exclusions_file"
+
+        # Find and exclude the most recent cycle directory to prevent capturing partial *.cbcl files
+        # Look for pattern: Data/Intensities/BaseCalls/L00*/C###.# (cycle directories)
+        if [[ -d "${run_path}/Data/Intensities/BaseCalls" ]]; then
+            local most_recent_cycle=$(find "${run_path}/Data/Intensities/BaseCalls/"L* \
+                -type d \
+                -regextype posix-extended \
+                -regex '^.+/C[0-9]+\.[0-9]+$' 2>/dev/null | \
+                sort -r -k1,1 -V | \
+                head -n1 | \
+                sed -E 's|(BaseCalls/)L([0-9]+)|\1L*|g' 2>/dev/null || true)
+
+            if [[ -n "$most_recent_cycle" ]]; then
+                # Convert absolute path to relative path for tar exclusion
+                local relative_cycle_path="${most_recent_cycle#${run_path}/}"
+                echo "$relative_cycle_path" >> "$exclusions_file"
+                echo "Dynamic exclusion: $relative_cycle_path (most recent cycle)"
+            fi
+        fi
+
+        # Also exclude files modified within the past 3 minutes;
+        # this avoids capturing files that are actively being written
+        find "$run_path" -mmin -3 -type f 2>/dev/null | while IFS= read -r recent_file; do
+            # Convert to relative path for tar exclusion
+            local relative_file="${recent_file#${run_path}/}"
+            echo "$relative_file" >> "$exclusions_file"
+        done
+
+        # Show exclusions count for logging
+        local exclusion_count=$(wc -l < "$exclusions_file" 2>/dev/null || echo 0)
+        if [[ $exclusion_count -gt 0 ]]; then
+            echo "Generated $exclusion_count dynamic exclusions to prevent capturing incomplete files"
+        fi
+
+        return 0
+    else
+        echo "Run is finished - no dynamic exclusions applied"
+        # Ensure exclusions file doesn't exist for finished runs
+        [[ -f "$exclusions_file" ]] && rm "$exclusions_file"
+        return 1
+    fi
+}
+
 # Define the final tarball path once to avoid repetition
 FINAL_TARBALL_PATH="${DESTINATION_BUCKET_PREFIX}/${RUN_BASENAME}/${RUN_BASENAME}.tar.gz"

@@ -515,6 +569,17 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then
            # generate enhanced tar label with metadata
            tar_label=$(generate_tar_label "$RUN_BASENAME" "$tar_increment_counter")

+           # Generate dynamic exclusions to prevent tarball bloat from partial *.cbcl files
+           EXCLUSIONS_FILE="${STAGING_AREA_PATH}/${RUN_BASENAME}/dynamic_exclusions.txt"
+           generate_dynamic_exclusions "$PATH_TO_UPLOAD" "$EXCLUSIONS_FILE" "$run_is_finished"
+
+           # If this is the final tarball for a completed run, sync and wait to ensure all writes are flushed
+           if [[ "$run_is_finished" == 'true' ]]; then
+               echo "Run completed - performing final sync and wait before capturing final tarball"
+               sync
+               sleep 10  # Wait 10 seconds to ensure all file writes are fully committed
+           fi
+
            # create incremental tarballs
            # see: https://www.gnu.org/software/tar/manual/html_node/Incremental-Dumps.html
            #      https://www.gnu.org/software/tar/manual/html_node/Snapshot-Files.html
@@ -523,6 +588,7 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then
            # '--blocking-factor=1' prevents extra zero-padding blocks for efficient concatenation
            # '--sparse' consolidates runs of zeros in input files
            # '--label' adds enhanced metadata (JSON or pipe-separated format within 99-byte tar limit)
+           # '--exclude-from' excludes patterns listed in a file, to avoid capturing incomplete *.cbcl files
            # 'head --bytes -1024' trims EOF blocks for incremental tarballs; final tarball preserves EOF blocks
            if [[ "$SOURCE_PATH_IS_ON_NFS" == "true" ]]; then SHOULD_CHECK_DEVICE_STR="--no-check-device"; else SHOULD_CHECK_DEVICE_STR=""; fi
            if [[ "$run_is_finished" == 'true' ]]; then EOF_PROCESSOR="cat"; else EOF_PROCESSOR="head --bytes -1024"; fi
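
Note on the EOF handling above: each incremental part is a complete gzip
member whose trailing tar EOF marker (two 512-byte zero blocks, hence the
-1024) has been trimmed, while the final part keeps its EOF blocks. Both gzip
and GNU tar tolerate byte-wise concatenation of such pieces, so reassembly
should look roughly like this, with placeholder object names:

    cat part-1.tar.gz part-2.tar.gz part-final.tar.gz > RUN_XYZ.tar.gz
    tar --extract --listed-incremental=/dev/null --file=RUN_XYZ.tar.gz

'--listed-incremental=/dev/null' tells GNU tar to treat the archive as an
incremental dump during extraction.
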
@@ -535,6 +601,7 @@ if ! $GCLOUD_STORAGE_CMD ls "$FINAL_TARBALL_PATH" &> /dev/null; then
            done

            $TAR_BIN "${STATIC_EXCLUSIONS[@]}" \
+               $EXCLUSION_STR \
                --create \
                --blocking-factor=1 \
                --sparse \

From 9359a5ad681cd036113b13267a8831e948d163b1 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Fri, 15 Aug 2025 22:49:37 -0400
Subject: [PATCH 14/14] sort and de-duplicate dynamic exclusions

---
 incremental_illumina_upload_to_gs.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/incremental_illumina_upload_to_gs.sh b/incremental_illumina_upload_to_gs.sh
index cac81a3..59c4f36 100755
--- a/incremental_illumina_upload_to_gs.sh
+++ b/incremental_illumina_upload_to_gs.sh
@@ -477,6 +477,12 @@ generate_dynamic_exclusions() {
             echo "$relative_file" >> "$exclusions_file"
         done

+        # Sort and deduplicate exclusions (files may match both recent cycle and recent time criteria)
+        if [[ -f "$exclusions_file" && -s "$exclusions_file" ]]; then
+            local temp_file="${exclusions_file}.tmp"
+            sort -u "$exclusions_file" > "$temp_file" && mv "$temp_file" "$exclusions_file"
+        fi
+
         # Show exclusions count for logging
         local exclusion_count=$(wc -l < "$exclusions_file" 2>/dev/null || echo 0)
         if [[ $exclusion_count -gt 0 ]]; then