diff --git a/presto/cluster/README.md b/presto/cluster/README.md
new file mode 100644
index 00000000..138cc035
--- /dev/null
+++ b/presto/cluster/README.md
@@ -0,0 +1,7 @@
+A set of scripts to run Presto on a Slurm cluster.
+
+# Dispatch a job to create SF1k tables and track job progress.
+SCALE_FACTOR=1000 WORKSPACE= DATA= NUM_NODES=1 NUM_GPUS_PER_NODE=1 ./dispatch.sh create --job-name -A -p
+
+# Dispatch a job to run the SF1k benchmark (after creation) and track job progress.
+SCALE_FACTOR=1000 WORKSPACE= DATA= NUM_NODES= NUM_GPUS_PER_NODE= ./dispatch.sh run --job-name -A -p
\ No newline at end of file
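For reference, here is a hypothetical end-to-end invocation of `dispatch.sh`; the workspace and data paths, job name, account, and partition below are illustrative placeholders, not values taken from this diff:

```sh
# Hypothetical example: create the SF1k tables on one GPU node, then run the benchmark.
SCALE_FACTOR=1000 WORKSPACE=/home/$USER/workspace DATA=/mnt/tpch-data \
  NUM_NODES=1 NUM_GPUS_PER_NODE=1 \
  ./dispatch.sh create --job-name=presto-sf1k -A my_account -p my_partition

SCALE_FACTOR=1000 WORKSPACE=/home/$USER/workspace DATA=/mnt/tpch-data \
  NUM_NODES=1 NUM_GPUS_PER_NODE=1 \
  ./dispatch.sh run --job-name=presto-sf1k -A my_account -p my_partition
```

Everything after the `create`/`run` keyword is forwarded to `sbatch`, so any standard `sbatch` flag can be passed here.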
+ +[ -z "$SLURM_JOB_NAME" ] && echo "required argument '--job-name' not specified" && exit 1 +[ -z "$SLURM_JOB_ACCOUNT" ] && echo "required argument '--account' not specified" && exit 1 +[ -z "$SLURM_JOB_PARTITION" ] && echo "required argument '--partition' not specified" && exit 1 +#[ -z "$SLURM_TIMELIMIT" ] && echo_error "required argument '--time' not specified" && exit 1 +[ -z "$SLURM_NTASKS_PER_NODE" ] && echo "required argument '--ntasks-per-node' not specified" && exit 1 +[ -z "$SLURM_NNODES" ] && echo "required argument '--nodes' not specified" && exit 1 +[ -z "$NUM_NODES" ] && echo "NUM_WORKERS must be set" && exit 1 +[ -z "$NUM_GPUS_PER_NODE" ] && echo "NUM_GPUS_PER_NODE env variable must be set" && exit 1 + +NUM_WORKERS=$(( $NUM_NODES * $NUM_GPUS_PER_NODE )) +LOGS="${WORKSPACE}/velox-testing/presto/cluster/" +CONFIGS="${WORKSPACE}/velox-testing/presto/docker/config/generated" +# Right now we assume one node that everything will run on. +# To support more nodes we just need to split the nodelist and assign the coord/each worker to a separate node. +# This will also require custom configs for each worker. +COORD=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -1) +CUDF_LIB=/usr/lib64/presto-native-libs +if [ "${NUM_WORKERS}" -eq "1" ]; then + SINGLE_NODE_EXECUTION=true +else + SINGLE_NODE_EXECUTION=false +fi + +[ ! -d "$WORKSPACE" ] && echo "WORKSPACE must be a valid directory" && exit 1 +[ ! -d "$DATA" ] && echo "DATA must be a valid directory" && exit 1 + +validate_config_directory + +run_coordinator + +wait_until_coordinator_is_running diff --git a/presto/cluster/slurm_functions.sh b/presto/cluster/slurm_functions.sh new file mode 100644 index 00000000..045ce12b --- /dev/null +++ b/presto/cluster/slurm_functions.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# Takes a list of environment variables. Checks that each one is set and of non-zero length. +function validate_environment_preconditions { + local missing=() + for var in "$@"; do + # -z "${!var+x}" => unset; -z "${!var}" => empty + if [[ -z "${!var+x}" || -z "${!var}" ]]; then + missing+=("$var") + fi + done + if ((${#missing[@]})); then + echo_error "required env var ${missing[*]} not set" + fi +} + +# Execute script through the coordinator image (used for coordinator and cli executables) +function run_coord_image { + [ $# -ne 2 ] && echo_error "$0 expected one argument for '