Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions scripts/task_run_unit_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

set -eo pipefail

export PARALLEL_TESTS=true # Enable parallel test execution for unit tests (auto-discovery mode)

# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Source test environment setup (handles package overrides like TVM-FFI)
Expand Down
310 changes: 300 additions & 10 deletions scripts/test_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
# Default environment variables
: "${JUNIT_DIR:=$(realpath ./junit)}"
: "${MAX_JOBS:=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)}"
: "${CUDA_VISIBLE_DEVICES:=0}"
# CUDA_VISIBLE_DEVICES: Not set by default - let detect_gpus() auto-detect via nvidia-smi
: "${SAMPLE_RATE:=5}" # Run every Nth test in sanity mode (5 = ~20% coverage)
: "${PARALLEL_TESTS:=false}" # Disable parallel test execution by default

# Randomize starting offset (0 to SAMPLE_RATE-1) for sampling variety
if [ -z "${SAMPLE_OFFSET:-}" ]; then
Expand Down Expand Up @@ -356,6 +357,284 @@ run_full_test_file() {
echo ""
}

# Detect available GPUs for parallel test execution.
#
# Outputs (stdout): space-separated GPU indices, e.g. "0 1 2 3".
#   Emits "0" when parallel mode is off or no GPUs can be detected.
#
# Resolution order:
#   1. PARALLEL_TESTS != "true"  -> single GPU ("0"), parallelism disabled.
#   2. CUDA_VISIBLE_DEVICES when set and not "-1" (accepts "0,1,2" or "0 1 2").
#   3. nvidia-smi --query-gpu=index (stable, script-friendly interface;
#      --list-gpus output is meant for humans and may change across drivers).
#   4. Default: "0".
detect_gpus() {
    # ${VAR:-} guards keep this function safe if the caller runs `set -u`;
    # CUDA_VISIBLE_DEVICES is intentionally not defaulted at file scope.
    if [ "${PARALLEL_TESTS:-false}" != "true" ]; then
        echo "0"
        return
    fi

    # Parse CUDA_VISIBLE_DEVICES if set ("-1" conventionally means "no GPUs")
    if [ -n "${CUDA_VISIBLE_DEVICES:-}" ] && [ "$CUDA_VISIBLE_DEVICES" != "-1" ]; then
        # Normalize "0,1,2,3" or "0 1 2 3" to single-space-separated indices
        AVAILABLE_GPUS=$(echo "$CUDA_VISIBLE_DEVICES" | tr ',' ' ' | tr -s ' ')
        echo "$AVAILABLE_GPUS"
        return
    fi

    # Fallback: ask the driver. --query-gpu prints one index per line and
    # lets us distinguish "command failed" from "no GPUs" via the exit code.
    if command -v nvidia-smi >/dev/null 2>&1; then
        local gpus
        gpus=$(nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null) || gpus=""
        if [ -n "$gpus" ]; then
            AVAILABLE_GPUS=$(echo "$gpus" | tr '\n' ' ' | sed 's/[[:space:]]*$//')
            echo "$AVAILABLE_GPUS"
            return
        fi
    fi

    # Default to single GPU
    echo "0"
}

# Run test files concurrently across multiple GPUs: one background pytest
# process per file, each pinned to a GPU via CUDA_VISIBLE_DEVICES, with GPUs
# reclaimed as jobs finish so at most one job runs per GPU at any time.
#
# Arguments:
#   $1 - whitespace-separated list of test file paths
#   $2 - mode: "sanity" (run a sampled subset per file) or "full" (whole file)
#
# Globals read:    JUNIT_DIR, PYTEST_COMMAND_PREFIX, PYTEST_FLAGS
# Globals updated: TOTAL_TESTS, PASSED_TESTS, FAILED_TESTS, EXIT_CODE,
#                  TOTAL_TEST_CASES / SAMPLED_TEST_CASES (sanity mode only)
#
# Falls back to the sequential run_* helpers when only one GPU is detected.
run_tests_parallel() {
    local test_files=$1
    local mode=$2 # "sanity" or "full"

    # Detect available GPUs (space-separated index string -> array)
    local gpu_string
    gpu_string=$(detect_gpus)
    local -a GPU_LIST
    IFS=' ' read -r -a GPU_LIST <<< "$gpu_string"
    local NUM_GPUS=${#GPU_LIST[@]}

    # Auto-fallback to sequential if only one GPU
    if [ "$NUM_GPUS" -eq 1 ]; then
        echo "=========================================="
        echo "Only 1 GPU detected - using sequential execution"
        echo "=========================================="
        echo "GPU: ${GPU_LIST[0]}"
        echo ""
        # Run sequentially instead
        if [ "$mode" = "sanity" ]; then
            FILE_COUNT=0
            for test_file in $test_files; do
                FILE_COUNT=$((FILE_COUNT + 1))
                run_sanity_test_file "$test_file" "$FILE_COUNT"
            done
        else
            for test_file in $test_files; do
                run_full_test_file "$test_file"
            done
        fi
        return
    fi

    echo "=========================================="
    echo "PARALLEL EXECUTION MODE"
    echo "=========================================="
    echo "Available GPUs: ${GPU_LIST[*]}"
    echo "Number of GPUs: $NUM_GPUS"
    echo "Test mode: $mode"
    echo ""

    # Create a temporary directory for parallel job state
    # (per-job result and log files live here)
    PARALLEL_TMP_DIR=$(mktemp -d)

    # Preserve existing EXIT trap and add cleanup: `trap -p EXIT` prints
    # "trap -- '<body>' EXIT"; the sed extracts <body> so we can chain our
    # rm -rf in front of it without clobbering the caller's handler.
    PREV_EXIT_TRAP=$(trap -p EXIT | sed -E "s/^trap -- '(.*)' EXIT$/\1/")
    trap 'rm -rf "$PARALLEL_TMP_DIR"; '"$PREV_EXIT_TRAP" EXIT

    # Convert test files to array (word-split on spaces)
    local -a test_files_array
    IFS=' ' read -r -a test_files_array <<< "$test_files"
    local total_files=${#test_files_array[@]}

    echo "Total test files to execute: $total_files"
    echo ""

    # Per-job bookkeeping, all keyed by background-job PID:
    #   test_result_files[pid] = "result_file:test_file:log_file:file_index"
    #   test_pid_map[pid]      = test_file (membership = job still tracked)
    #   test_gpu_map[pid]      = GPU index the job holds
    declare -A test_result_files
    declare -A test_pid_map
    declare -A test_gpu_map

    # Free GPU queue for proper GPU assignment
    local -a available_gpus=("${GPU_LIST[@]}")

    # Function to run a single test file in a background subshell.
    # Writes its outcome to $result_file using the protocol:
    #   "PASSED[:total:sampled]" | "FAILED[:total:sampled]" | "SKIPPED"
    # and its full output to $log_file. Prints a colon-delimited job record
    # "pid:test_file:result_file:log_file:file_index" on stdout for the caller.
    run_single_test_background() {
        local test_file=$1
        local gpu_id=$2
        local file_index=$3
        local result_file="$PARALLEL_TMP_DIR/result_${file_index}"
        local log_file="$PARALLEL_TMP_DIR/log_${file_index}"

        (
            # Set GPU for this test (subshell-local; does not leak out)
            export CUDA_VISIBLE_DEVICES=$gpu_id

            # Redirect output to log file (replayed in order later)
            exec > "$log_file" 2>&1

            echo "=========================================="
            echo "[$file_index/$total_files] Processing: $test_file"
            echo "GPU: $gpu_id"
            echo "=========================================="

            if [ "$mode" = "sanity" ]; then
                # Run sanity test: collect_tests sets ALL_NODE_IDS and
                # COLLECTION_EXIT_CODE in this subshell
                collect_tests "$test_file"

                if [ -z "$ALL_NODE_IDS" ]; then
                    if [ $COLLECTION_EXIT_CODE -ne 0 ]; then
                        echo "⚠️ Collection failed for $test_file (skipping)"
                    else
                        echo "⚠️ No tests found in $test_file"
                    fi
                    echo "SKIPPED" > "$result_file"
                    exit 0
                fi

                TOTAL_IN_FILE=$(echo "$ALL_NODE_IDS" | wc -l)
                sample_tests "$ALL_NODE_IDS"
                SAMPLED_IN_FILE=$(echo "$SAMPLED_NODE_IDS" | wc -l)

                if [ "$SAMPLED_IN_FILE" -eq 0 ]; then
                    echo "⚠️ No tests sampled from $test_file, skipping"
                    echo "SKIPPED" > "$result_file"
                    exit 0
                fi

                mapfile -t SAMPLED_NODE_IDS_ARRAY <<< "$SAMPLED_NODE_IDS"
                # One JUnit XML per file; '/' in the path becomes '_'
                JUNIT_FILENAME="${test_file//\//_}.xml"
                JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}"

                # shellcheck disable=SC2086
                if ${PYTEST_COMMAND_PREFIX} pytest $PYTEST_FLAGS "${JUNIT_FLAG}" "${SAMPLED_NODE_IDS_ARRAY[@]}"; then
                    echo "βœ… PASSED: $test_file ($SAMPLED_IN_FILE/$TOTAL_IN_FILE tests)"
                    echo "PASSED:$TOTAL_IN_FILE:$SAMPLED_IN_FILE" > "$result_file"
                else
                    echo "❌ FAILED: $test_file ($SAMPLED_IN_FILE/$TOTAL_IN_FILE tests)"
                    echo "FAILED:$TOTAL_IN_FILE:$SAMPLED_IN_FILE" > "$result_file"
                fi
            else
                # Run full test (whole file, no sampling)
                JUNIT_FILENAME="${test_file//\//_}.xml"
                JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}"

                # shellcheck disable=SC2086
                if ${PYTEST_COMMAND_PREFIX} pytest $PYTEST_FLAGS "${JUNIT_FLAG}" "${test_file}"; then
                    echo "βœ… PASSED: $test_file"
                    echo "PASSED" > "$result_file"
                else
                    echo "❌ FAILED: $test_file"
                    echo "FAILED" > "$result_file"
                fi
            fi
        ) &

        local pid=$!
        echo "$pid:$test_file:$result_file:$log_file:$file_index"
    }

    # Launch tests in parallel with GPU queue: block while all GPUs are busy,
    # reclaiming GPUs from finished jobs, then dispatch the next file.
    echo "Launching tests in parallel..."
    local test_idx=0
    while [ $test_idx -lt $total_files ]; do
        # Wait for a GPU to become available
        while [ ${#available_gpus[@]} -eq 0 ]; do
            # Check for finished jobs and reclaim their GPUs
            # (kill -0 probes liveness without signalling)
            for pid in "${!test_pid_map[@]}"; do
                if ! kill -0 "$pid" 2>/dev/null; then
                    # Job finished, reclaim its GPU; `|| true` because a failed
                    # test's non-zero status is handled via its result file
                    wait "$pid" 2>/dev/null || true
                    local freed_gpu="${test_gpu_map[$pid]}"
                    available_gpus+=("$freed_gpu")
                    unset "test_pid_map[$pid]"
                    unset "test_gpu_map[$pid]"
                fi
            done
            # Small sleep to avoid busy-waiting
            [ ${#available_gpus[@]} -eq 0 ] && sleep 0.1
        done

        # Get next available GPU (pop head of queue)
        local gpu_id="${available_gpus[0]}"
        available_gpus=("${available_gpus[@]:1}") # Remove first element

        # Launch test on this GPU
        local test_file="${test_files_array[$test_idx]}"
        local file_index=$((test_idx + 1))
        local job_info
        job_info=$(run_single_test_background "$test_file" "$gpu_id" "$file_index")

        # Parse job info (colon-delimited record; assumes test-file paths
        # contain no ':' — TODO confirm against test layout)
        local pid result_file log_file
        IFS=':' read -r pid test_file result_file log_file file_index <<< "$job_info"
        test_result_files[$pid]="$result_file:$test_file:$log_file:$file_index"
        test_pid_map[$pid]="$test_file"
        test_gpu_map[$pid]="$gpu_id"

        test_idx=$((test_idx + 1))
    done

    # Wait for all remaining jobs (barrier before reading result files)
    echo ""
    echo "Waiting for all tests to complete..."
    for pid in "${!test_result_files[@]}"; do
        wait "$pid" 2>/dev/null || true
    done

    echo ""
    echo "All tests completed. Processing results..."
    echo ""

    # Sort results by file_index for deterministic output
    # (associative-array iteration order is unspecified)
    local -a sorted_pids=()
    for pid in "${!test_result_files[@]}"; do
        local result_file test_file log_file file_index
        IFS=':' read -r result_file test_file log_file file_index <<< "${test_result_files[$pid]}"
        sorted_pids+=("$file_index:$pid")
    done
    local sorted_list
    sorted_list=$(printf '%s\n' "${sorted_pids[@]}" | sort -n)
    mapfile -t sorted_pids <<< "$sorted_list"

    # Process results in order, replaying each job's captured log
    for entry in "${sorted_pids[@]}"; do
        local pid="${entry#*:}"
        local result_file test_file log_file file_index
        IFS=':' read -r result_file test_file log_file file_index <<< "${test_result_files[$pid]}"

        # Show log output
        if [ -f "$log_file" ]; then
            cat "$log_file"
            echo ""
        fi

        # Process result per the result-file protocol above
        if [ -f "$result_file" ]; then
            local result
            result=$(cat "$result_file")
            TOTAL_TESTS=$((TOTAL_TESTS + 1))

            if [[ "$result" == PASSED* ]]; then
                PASSED_TESTS=$((PASSED_TESTS + 1))
                if [ "$mode" = "sanity" ]; then
                    local total_in_file sampled_in_file
                    # shellcheck disable=SC2034 # status is part of the read but unused
                    IFS=':' read -r _ total_in_file sampled_in_file <<< "$result"
                    TOTAL_TEST_CASES=$((TOTAL_TEST_CASES + total_in_file))
                    SAMPLED_TEST_CASES=$((SAMPLED_TEST_CASES + sampled_in_file))
                fi
            elif [[ "$result" == FAILED* ]]; then
                # NOTE(review): literal "\n" appended — presumably rendered
                # later via echo -e in the summary; confirm in caller
                FAILED_TESTS="$FAILED_TESTS\n - $test_file"
                # shellcheck disable=SC2034 # EXIT_CODE is used by calling scripts
                EXIT_CODE=1
                if [ "$mode" = "sanity" ]; then
                    local total_in_file sampled_in_file
                    # shellcheck disable=SC2034 # status is part of the read but unused
                    IFS=':' read -r _ total_in_file sampled_in_file <<< "$result"
                    TOTAL_TEST_CASES=$((TOTAL_TEST_CASES + total_in_file))
                    SAMPLED_TEST_CASES=$((SAMPLED_TEST_CASES + sampled_in_file))
                fi
            elif [[ "$result" == SKIPPED* ]]; then
                # Don't count skipped tests as passed
                TOTAL_TESTS=$((TOTAL_TESTS - 1))
            fi
        fi
    done
}

# Print execution summary
print_execution_summary() {
if [ "$SANITY_TEST" == "true" ]; then
Expand Down Expand Up @@ -422,16 +701,27 @@ execute_tests() {

mkdir -p "${JUNIT_DIR}"

if [ "$SANITY_TEST" == "true" ]; then
FILE_COUNT=0
for test_file in $test_files; do
FILE_COUNT=$((FILE_COUNT + 1))
run_sanity_test_file "$test_file" "$FILE_COUNT"
done
# Check if parallel execution is enabled
if [ "$PARALLEL_TESTS" == "true" ]; then
# Run tests in parallel
if [ "$SANITY_TEST" == "true" ]; then
run_tests_parallel "$test_files" "sanity"
else
run_tests_parallel "$test_files" "full"
fi
else
for test_file in $test_files; do
run_full_test_file "$test_file"
done
# Original sequential execution
if [ "$SANITY_TEST" == "true" ]; then
FILE_COUNT=0
for test_file in $test_files; do
FILE_COUNT=$((FILE_COUNT + 1))
run_sanity_test_file "$test_file" "$FILE_COUNT"
done
else
for test_file in $test_files; do
run_full_test_file "$test_file"
done
fi
fi

print_execution_summary
Expand Down
Loading