Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions scripts/task_run_unit_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

set -eo pipefail

export PARALLEL_TESTS=true # Enable parallel test execution for unit tests (auto-discovery mode)

# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Source test environment setup (handles package overrides like TVM-FFI)
Expand Down
310 changes: 300 additions & 10 deletions scripts/test_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
# Default environment variables
: "${JUNIT_DIR:=$(realpath ./junit)}"
: "${MAX_JOBS:=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)}"
: "${CUDA_VISIBLE_DEVICES:=0}"
# CUDA_VISIBLE_DEVICES: Not set by default - let detect_gpus() auto-detect via nvidia-smi
: "${SAMPLE_RATE:=5}" # Run every Nth test in sanity mode (5 = ~20% coverage)
: "${PARALLEL_TESTS:=false}" # Disable parallel test execution by default

# Randomize starting offset (0 to SAMPLE_RATE-1) for sampling variety
if [ -z "${SAMPLE_OFFSET:-}" ]; then
Expand Down Expand Up @@ -356,6 +357,284 @@ run_full_test_file() {
echo ""
}

# Detect available GPUs for parallel test execution.
#
# Outputs (stdout): space-separated GPU indices, e.g. "0 1 2 3".
#   Emits "0" when parallel mode is off or no GPUs can be detected.
#
# Resolution order:
#   1. PARALLEL_TESTS != "true"  -> single GPU ("0"), parallelism disabled.
#   2. CUDA_VISIBLE_DEVICES when set and not "-1" (accepts "0,1,2" or "0 1 2").
#   3. nvidia-smi --query-gpu=index (stable, script-friendly interface;
#      --list-gpus output is meant for humans and may change across drivers).
#   4. Default: "0".
detect_gpus() {
    # ${VAR:-} guards keep this function safe if the caller runs `set -u`;
    # CUDA_VISIBLE_DEVICES is intentionally not defaulted at file scope.
    if [ "${PARALLEL_TESTS:-false}" != "true" ]; then
        echo "0"
        return
    fi

    # Parse CUDA_VISIBLE_DEVICES if set ("-1" conventionally means "no GPUs")
    if [ -n "${CUDA_VISIBLE_DEVICES:-}" ] && [ "$CUDA_VISIBLE_DEVICES" != "-1" ]; then
        # Normalize "0,1,2,3" or "0 1 2 3" to single-space-separated indices
        AVAILABLE_GPUS=$(echo "$CUDA_VISIBLE_DEVICES" | tr ',' ' ' | tr -s ' ')
        echo "$AVAILABLE_GPUS"
        return
    fi

    # Fallback: ask the driver. --query-gpu prints one index per line and
    # lets us distinguish "command failed" from "no GPUs" via the exit code.
    if command -v nvidia-smi >/dev/null 2>&1; then
        local gpus
        gpus=$(nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null) || gpus=""
        if [ -n "$gpus" ]; then
            AVAILABLE_GPUS=$(echo "$gpus" | tr '\n' ' ' | sed 's/[[:space:]]*$//')
            echo "$AVAILABLE_GPUS"
            return
        fi
    fi

    # Default to single GPU
    echo "0"
}

# Run test files concurrently across multiple GPUs: one background pytest
# process per file, each pinned to a GPU via CUDA_VISIBLE_DEVICES, with GPUs
# reclaimed as jobs finish so at most one job runs per GPU at any time.
#
# Arguments:
#   $1 - whitespace-separated list of test file paths
#   $2 - mode: "sanity" (run a sampled subset per file) or "full" (whole file)
#
# Globals read:    JUNIT_DIR, PYTEST_COMMAND_PREFIX, PYTEST_FLAGS
# Globals updated: TOTAL_TESTS, PASSED_TESTS, FAILED_TESTS, EXIT_CODE,
#                  TOTAL_TEST_CASES / SAMPLED_TEST_CASES (sanity mode only)
#
# Falls back to the sequential run_* helpers when only one GPU is detected.
run_tests_parallel() {
    local test_files=$1
    local mode=$2 # "sanity" or "full"

    # Detect available GPUs (space-separated index string -> array)
    local gpu_string
    gpu_string=$(detect_gpus)
    local -a GPU_LIST
    IFS=' ' read -r -a GPU_LIST <<< "$gpu_string"
    local NUM_GPUS=${#GPU_LIST[@]}

    # Auto-fallback to sequential if only one GPU
    if [ "$NUM_GPUS" -eq 1 ]; then
        echo "=========================================="
        echo "Only 1 GPU detected - using sequential execution"
        echo "=========================================="
        echo "GPU: ${GPU_LIST[0]}"
        echo ""
        # Run sequentially instead
        if [ "$mode" = "sanity" ]; then
            FILE_COUNT=0
            for test_file in $test_files; do
                FILE_COUNT=$((FILE_COUNT + 1))
                run_sanity_test_file "$test_file" "$FILE_COUNT"
            done
        else
            for test_file in $test_files; do
                run_full_test_file "$test_file"
            done
        fi
        return
    fi

    echo "=========================================="
    echo "PARALLEL EXECUTION MODE"
    echo "=========================================="
    echo "Available GPUs: ${GPU_LIST[*]}"
    echo "Number of GPUs: $NUM_GPUS"
    echo "Test mode: $mode"
    echo ""

    # Create a temporary directory for parallel job state
    # (per-job result and log files live here)
    PARALLEL_TMP_DIR=$(mktemp -d)

    # Preserve existing EXIT trap and add cleanup: `trap -p EXIT` prints
    # "trap -- '<body>' EXIT"; the sed extracts <body> so we can chain our
    # rm -rf in front of it without clobbering the caller's handler.
    PREV_EXIT_TRAP=$(trap -p EXIT | sed -E "s/^trap -- '(.*)' EXIT$/\1/")
    trap 'rm -rf "$PARALLEL_TMP_DIR"; '"$PREV_EXIT_TRAP" EXIT

    # Convert test files to array (word-split on spaces)
    local -a test_files_array
    IFS=' ' read -r -a test_files_array <<< "$test_files"
    local total_files=${#test_files_array[@]}

    echo "Total test files to execute: $total_files"
    echo ""

    # Per-job bookkeeping, all keyed by background-job PID:
    #   test_result_files[pid] = "result_file:test_file:log_file:file_index"
    #   test_pid_map[pid]      = test_file (membership = job still tracked)
    #   test_gpu_map[pid]      = GPU index the job holds
    declare -A test_result_files
    declare -A test_pid_map
    declare -A test_gpu_map

    # Free GPU queue for proper GPU assignment
    local -a available_gpus=("${GPU_LIST[@]}")

    # Function to run a single test file in a background subshell.
    # Writes its outcome to $result_file using the protocol:
    #   "PASSED[:total:sampled]" | "FAILED[:total:sampled]" | "SKIPPED"
    # and its full output to $log_file. Prints a colon-delimited job record
    # "pid:test_file:result_file:log_file:file_index" on stdout for the caller.
    run_single_test_background() {
        local test_file=$1
        local gpu_id=$2
        local file_index=$3
        local result_file="$PARALLEL_TMP_DIR/result_${file_index}"
        local log_file="$PARALLEL_TMP_DIR/log_${file_index}"

        (
            # Set GPU for this test (subshell-local; does not leak out)
            export CUDA_VISIBLE_DEVICES=$gpu_id

            # Redirect output to log file (replayed in order later)
            exec > "$log_file" 2>&1

            echo "=========================================="
            echo "[$file_index/$total_files] Processing: $test_file"
            echo "GPU: $gpu_id"
            echo "=========================================="

            if [ "$mode" = "sanity" ]; then
                # Run sanity test: collect_tests sets ALL_NODE_IDS and
                # COLLECTION_EXIT_CODE in this subshell
                collect_tests "$test_file"

                if [ -z "$ALL_NODE_IDS" ]; then
                    if [ $COLLECTION_EXIT_CODE -ne 0 ]; then
                        echo "⚠️ Collection failed for $test_file (skipping)"
                    else
                        echo "⚠️ No tests found in $test_file"
                    fi
                    echo "SKIPPED" > "$result_file"
                    exit 0
                fi

                TOTAL_IN_FILE=$(echo "$ALL_NODE_IDS" | wc -l)
                sample_tests "$ALL_NODE_IDS"
                SAMPLED_IN_FILE=$(echo "$SAMPLED_NODE_IDS" | wc -l)

                if [ "$SAMPLED_IN_FILE" -eq 0 ]; then
                    echo "⚠️ No tests sampled from $test_file, skipping"
                    echo "SKIPPED" > "$result_file"
                    exit 0
                fi

                mapfile -t SAMPLED_NODE_IDS_ARRAY <<< "$SAMPLED_NODE_IDS"
                # One JUnit XML per file; '/' in the path becomes '_'
                JUNIT_FILENAME="${test_file//\//_}.xml"
                JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}"

                # shellcheck disable=SC2086
                if ${PYTEST_COMMAND_PREFIX} pytest $PYTEST_FLAGS "${JUNIT_FLAG}" "${SAMPLED_NODE_IDS_ARRAY[@]}"; then
                    echo "βœ… PASSED: $test_file ($SAMPLED_IN_FILE/$TOTAL_IN_FILE tests)"
                    echo "PASSED:$TOTAL_IN_FILE:$SAMPLED_IN_FILE" > "$result_file"
                else
                    echo "❌ FAILED: $test_file ($SAMPLED_IN_FILE/$TOTAL_IN_FILE tests)"
                    echo "FAILED:$TOTAL_IN_FILE:$SAMPLED_IN_FILE" > "$result_file"
                fi
            else
                # Run full test (whole file, no sampling)
                JUNIT_FILENAME="${test_file//\//_}.xml"
                JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}"

                # shellcheck disable=SC2086
                if ${PYTEST_COMMAND_PREFIX} pytest $PYTEST_FLAGS "${JUNIT_FLAG}" "${test_file}"; then
                    echo "βœ… PASSED: $test_file"
                    echo "PASSED" > "$result_file"
                else
                    echo "❌ FAILED: $test_file"
                    echo "FAILED" > "$result_file"
                fi
            fi
        ) &

        local pid=$!
        echo "$pid:$test_file:$result_file:$log_file:$file_index"
    }

    # Launch tests in parallel with GPU queue: block while all GPUs are busy,
    # reclaiming GPUs from finished jobs, then dispatch the next file.
    echo "Launching tests in parallel..."
    local test_idx=0
    while [ $test_idx -lt $total_files ]; do
        # Wait for a GPU to become available
        while [ ${#available_gpus[@]} -eq 0 ]; do
            # Check for finished jobs and reclaim their GPUs
            # (kill -0 probes liveness without signalling)
            for pid in "${!test_pid_map[@]}"; do
                if ! kill -0 "$pid" 2>/dev/null; then
                    # Job finished, reclaim its GPU; `|| true` because a failed
                    # test's non-zero status is handled via its result file
                    wait "$pid" 2>/dev/null || true
                    local freed_gpu="${test_gpu_map[$pid]}"
                    available_gpus+=("$freed_gpu")
                    unset "test_pid_map[$pid]"
                    unset "test_gpu_map[$pid]"
                fi
            done
            # Small sleep to avoid busy-waiting
            [ ${#available_gpus[@]} -eq 0 ] && sleep 0.1
        done

        # Get next available GPU (pop head of queue)
        local gpu_id="${available_gpus[0]}"
        available_gpus=("${available_gpus[@]:1}") # Remove first element

        # Launch test on this GPU
        local test_file="${test_files_array[$test_idx]}"
        local file_index=$((test_idx + 1))
        local job_info
        job_info=$(run_single_test_background "$test_file" "$gpu_id" "$file_index")

        # Parse job info (colon-delimited record; assumes test-file paths
        # contain no ':' — TODO confirm against test layout)
        local pid result_file log_file
        IFS=':' read -r pid test_file result_file log_file file_index <<< "$job_info"
        test_result_files[$pid]="$result_file:$test_file:$log_file:$file_index"
        test_pid_map[$pid]="$test_file"
        test_gpu_map[$pid]="$gpu_id"

        test_idx=$((test_idx + 1))
    done

    # Wait for all remaining jobs (barrier before reading result files)
    echo ""
    echo "Waiting for all tests to complete..."
    for pid in "${!test_result_files[@]}"; do
        wait "$pid" 2>/dev/null || true
    done

    echo ""
    echo "All tests completed. Processing results..."
    echo ""

    # Sort results by file_index for deterministic output
    # (associative-array iteration order is unspecified)
    local -a sorted_pids=()
    for pid in "${!test_result_files[@]}"; do
        local result_file test_file log_file file_index
        IFS=':' read -r result_file test_file log_file file_index <<< "${test_result_files[$pid]}"
        sorted_pids+=("$file_index:$pid")
    done
    local sorted_list
    sorted_list=$(printf '%s\n' "${sorted_pids[@]}" | sort -n)
    mapfile -t sorted_pids <<< "$sorted_list"

    # Process results in order, replaying each job's captured log
    for entry in "${sorted_pids[@]}"; do
        local pid="${entry#*:}"
        local result_file test_file log_file file_index
        IFS=':' read -r result_file test_file log_file file_index <<< "${test_result_files[$pid]}"

        # Show log output
        if [ -f "$log_file" ]; then
            cat "$log_file"
            echo ""
        fi

        # Process result per the result-file protocol above
        if [ -f "$result_file" ]; then
            local result
            result=$(cat "$result_file")
            TOTAL_TESTS=$((TOTAL_TESTS + 1))

            if [[ "$result" == PASSED* ]]; then
                PASSED_TESTS=$((PASSED_TESTS + 1))
                if [ "$mode" = "sanity" ]; then
                    local total_in_file sampled_in_file
                    # shellcheck disable=SC2034 # status is part of the read but unused
                    IFS=':' read -r _ total_in_file sampled_in_file <<< "$result"
                    TOTAL_TEST_CASES=$((TOTAL_TEST_CASES + total_in_file))
                    SAMPLED_TEST_CASES=$((SAMPLED_TEST_CASES + sampled_in_file))
                fi
            elif [[ "$result" == FAILED* ]]; then
                # NOTE(review): literal "\n" appended — presumably rendered
                # later via echo -e in the summary; confirm in caller
                FAILED_TESTS="$FAILED_TESTS\n - $test_file"
                # shellcheck disable=SC2034 # EXIT_CODE is used by calling scripts
                EXIT_CODE=1
                if [ "$mode" = "sanity" ]; then
                    local total_in_file sampled_in_file
                    # shellcheck disable=SC2034 # status is part of the read but unused
                    IFS=':' read -r _ total_in_file sampled_in_file <<< "$result"
                    TOTAL_TEST_CASES=$((TOTAL_TEST_CASES + total_in_file))
                    SAMPLED_TEST_CASES=$((SAMPLED_TEST_CASES + sampled_in_file))
                fi
            elif [[ "$result" == SKIPPED* ]]; then
                # Don't count skipped tests as passed
                TOTAL_TESTS=$((TOTAL_TESTS - 1))
            fi
        fi
    done
}

# Print execution summary
print_execution_summary() {
if [ "$SANITY_TEST" == "true" ]; then
Expand Down Expand Up @@ -422,16 +701,27 @@ execute_tests() {

mkdir -p "${JUNIT_DIR}"

if [ "$SANITY_TEST" == "true" ]; then
FILE_COUNT=0
for test_file in $test_files; do
FILE_COUNT=$((FILE_COUNT + 1))
run_sanity_test_file "$test_file" "$FILE_COUNT"
done
# Check if parallel execution is enabled
if [ "$PARALLEL_TESTS" == "true" ]; then
# Run tests in parallel
if [ "$SANITY_TEST" == "true" ]; then
run_tests_parallel "$test_files" "sanity"
else
run_tests_parallel "$test_files" "full"
fi
else
for test_file in $test_files; do
run_full_test_file "$test_file"
done
# Original sequential execution
if [ "$SANITY_TEST" == "true" ]; then
FILE_COUNT=0
for test_file in $test_files; do
FILE_COUNT=$((FILE_COUNT + 1))
run_sanity_test_file "$test_file" "$FILE_COUNT"
done
else
for test_file in $test_files; do
run_full_test_file "$test_file"
done
fi
fi

print_execution_summary
Expand Down
Loading