Skip to content
Draft
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions presto/cluster/README.md
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it might make sense to name the top level directory slurm instead of cluster.

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
These scripts run Presto on a Slurm cluster.

# Dispatch a job to create SF1k tables and track job progress.
WORKSPACE=<workspace_path> DATA=<data_path> ./dispatch.sh create --job-name <job_name> -A <account> -p <platform>

# Dispatch a job to run SF1k benchmark (after creation) and track job progress.
WORKSPACE=<workspace_path> DATA=<data_path> ./dispatch.sh run --job-name <job_name> -A <account> -p <platform>
15 changes: 15 additions & 0 deletions presto/cluster/create_benchmarks.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#SBATCH --time=00:25:00

# Slurm batch job: bring up a coordinator plus CPU workers, then create the
# SF1k benchmark tables. This can take a little while due to the cost of the
# ANALYZE steps; currently 25 min seems to be a decent limit.

source ../../scripts/common.sh
source slurm_functions.sh
source setup_coord.sh

# Workers need to be CPU to run ANALYZE on tables.
for (( worker = 0; worker < NUM_WORKERS; worker++ )); do
  run_worker "$worker" "cpu"
done

wait_for_workers_to_register "$NUM_WORKERS"

create_schema
23 changes: 23 additions & 0 deletions presto/cluster/dispatch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# This is a convenient wrapper for the run_benchmarks and create_benchmarks
# sbatch jobs: it dispatches the requested job type and tracks the current
# job for the user until it finishes, then prints the job output.
#
# Usage: dispatch.sh <create|run> [additional sbatch arguments...]

# Provides echo_error (prints in red and exits 1).
source ../../scripts/common.sh

# Clear logs/output from previous runs; -f so a fresh checkout doesn't error.
rm -f -- *.log
rm -f -- *.out

# At least the job type is required; remaining args are forwarded to sbatch
# (e.g. --job-name, -A, -p as shown in the README).
[ $# -lt 1 ] && echo_error "$0 expected at least one argument for 'create/run'"
JOB_TYPE="$1"
# Fixed: the original used '==' for both tests, so the check could never fire.
[ "$JOB_TYPE" != "create" ] && [ "$JOB_TYPE" != "run" ] && echo_error "parameter must be create or run"
shift 1

sbatch "$@" --nodes=1 --ntasks-per-node=10 "${JOB_TYPE}_benchmarks.sbatch"

echo "Waiting for jobs to finish..."
# Poll squeue until this user's job leaves the queue, refreshing one status line.
while :; do
  line=$(squeue | grep "$(whoami)")
  [ -z "$line" ] && break
  printf "\r%s" "$line"
  sleep 5
done
echo ""
cat ./*.out
16 changes: 16 additions & 0 deletions presto/cluster/run_benchmarks.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
#SBATCH --time=00:20:00

# Slurm batch job: bring up a coordinator plus GPU workers, run the SF1k
# benchmark queries 5 times each, then fetch and summarize the timing results.

source ../../scripts/common.sh
source slurm_functions.sh
source setup_coord.sh

for (( worker = 0; worker < NUM_WORKERS; worker++ )); do
  run_worker "$worker" "gpu"
done

wait_for_workers_to_register "$NUM_WORKERS"

run_queries 5
fetch_query_results
parse_results 5
34 changes: 34 additions & 0 deletions presto/cluster/setup_coord.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Outlined steps to be run from an sbatch script: verify the Slurm context
# we are running in, derive the cluster layout, and start the coordinator.

# These variables are populated by Slurm from the sbatch arguments; fail fast
# when any required one is missing.
if [ -z "$SLURM_JOB_NAME" ]; then echo_error "required argument '--job-name' not specified"; exit 1; fi
if [ -z "$SLURM_JOB_ACCOUNT" ]; then echo_error "required argument '--account' not specified"; exit 1; fi
if [ -z "$SLURM_JOB_PARTITION" ]; then echo_error "required argument '--partition' not specified"; exit 1; fi
#if [ -z "$SLURM_TIMELIMIT" ]; then echo_error "required argument '--time' not specified"; exit 1; fi
if [ -z "$SLURM_NTASKS_PER_NODE" ]; then echo_error "required argument '--ntasks-per-node' not specified"; exit 1; fi
if [ -z "$SLURM_NNODES" ]; then echo_error "required argument '--nodes' not specified"; exit 1; fi

NUM_WORKERS=1
LOGS="${WORKSPACE}/velox-testing/presto/cluster/"
CONFIGS="${WORKSPACE}/velox-testing/presto/docker/config/generated"
# Right now we assume one node that everything will run on.
# To support more nodes we just need to split the nodelist and assign the
# coord/each worker to a separate node. This will also require custom configs
# for each worker.
NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -1)
COORD=${NODE}
CUDF_LIB=/usr/lib64/presto-native-libs
# Single-node execution is enabled whenever the whole cluster is one worker.
if [ "${NUM_WORKERS}" -eq "1" ]; then
  SINGLE_NODE_EXECUTION=true
else
  SINGLE_NODE_EXECUTION=false
fi

if [ ! -d "$WORKSPACE" ]; then echo_error "WORKSPACE must be a valid directory"; exit 1; fi
if [ ! -d "$DATA" ]; then echo_error "DATA must be a valid directory"; exit 1; fi

validate_config_directory

run_coordinator

wait_until_coordinator_is_running
215 changes: 215 additions & 0 deletions presto/cluster/slurm_functions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#!/bin/bash

# Takes a list of environment variable names. Checks that each one is set and
# of non-zero length, reporting every offending variable in a single error.
function validate_environment_preconditions {
  local unset_vars=()
  local name
  for name in "$@"; do
    # ${!name+x} expands empty when unset; ${!name} expands empty when blank.
    [[ -n "${!name+x}" && -n "${!name}" ]] || unset_vars+=("$name")
  done
  if [[ ${#unset_vars[@]} -gt 0 ]]; then
    echo_error "required env var ${unset_vars[*]} not set"
  fi
}

# Execute a script through the coordinator image (used for coordinator and cli
# executables).
#   $1 - command line to run inside the container
#   $2 - "coord" (runs in the background) or "cli" (runs in the foreground)
# Output is captured in ${LOGS}/<type>.log.
function run_coord_image {
  # Fixed: this function consumes two arguments but previously checked for one.
  [ $# -ne 2 ] && echo_error "$0 expected one argument for '<script>' and one for '<coord/cli>'"
  validate_environment_preconditions LOGS CONFIGS WORKSPACE COORD DATA
  local script=$1
  local type=$2
  [ "$type" != "coord" ] && [ "$type" != "cli" ] && echo_error "coord type must be coord/cli"
  local log_file="${type}.log"

  local coord_image="${WORKSPACE}/presto-coordinator.sqsh"
  [ ! -f "${coord_image}" ] && echo_error "coord image does not exist at ${coord_image}"

  mkdir -p "${WORKSPACE}/.hive_metastore"

  # Build the command as an array instead of a flat string: with a string,
  # the embedded quotes around ${script} and the output redirection were
  # passed to srun as literal arguments rather than interpreted by the shell.
  local -a run_cmd=(srun -w "$COORD" --ntasks=1 --overlap
    --container-image="${coord_image}"
    --container-mounts="${WORKSPACE}:/workspace,\
${DATA}:/data,\
${CONFIGS}/etc_common:/opt/presto-server/etc,\
${CONFIGS}/etc_coordinator/node.properties:/opt/presto-server/etc/node.properties,\
${CONFIGS}/etc_coordinator/config_native.properties:/opt/presto-server/etc/config.properties,\
${WORKSPACE}/.hive_metastore:/var/lib/presto/data/hive/metastore"
    -- bash -lc "${script}")

  # Coordinator runs as a background process, whereas we want to wait for cli
  # so that the job will finish when the cli is done (terminating background
  # processes like the coordinator and workers).
  if [ "${type}" == "coord" ]; then
    "${run_cmd[@]}" > "${LOGS}/${log_file}" 2>&1 &
  else
    "${run_cmd[@]}" > "${LOGS}/${log_file}" 2>&1
  fi
}

# Runs a coordinator on a specific node with default configurations.
# Overrides the config files with the coord node and other needed updates.
function run_coordinator {
  # COORD is substituted into the discovery URI below, so validate it too
  # (it was previously missing from the precondition list).
  validate_environment_preconditions CONFIGS SINGLE_NODE_EXECUTION NODE COORD
  local coord_config="${CONFIGS}/etc_coordinator/config_native.properties"
  # Replace placeholder values in the generated configs.
  sed -i "s+discovery\.uri.*+discovery\.uri=http://${COORD}:8080+g" "${coord_config}"
  sed -i "s+single-node-execution-enabled.*+single-node-execution-enabled=${SINGLE_NODE_EXECUTION}+g" "${coord_config}"

  run_coord_image "/opt/presto-server/bin/launcher run" "coord"
}

# Runs a worker on a given node with custom configuration files which are
# generated as necessary.
#   $1 - numeric worker id (also selects the GPU device and the 900<id> port)
#   $2 - worker type: "cpu" or "gpu"
# The worker is launched in the background; logs go to ${LOGS}/worker_<id>.log.
function run_worker {
  [ $# -ne 2 ] && echo_error "$0 expected arguments 'worker_id' and 'worker_type'"
  validate_environment_preconditions LOGS CONFIGS WORKSPACE COORD SINGLE_NODE_EXECUTION NODE CUDF_LIB DATA

  local worker_id=$1
  local worker_type=$2
  [ "$worker_type" != "cpu" ] && [ "$worker_type" != "gpu" ] && echo_error "worker type must be gpu/cpu"
  # CPU workers get more drivers to parallelize across cores; made local so
  # the value no longer leaks into the caller's environment.
  local num_drivers
  if [ "$worker_type" == "cpu" ]; then
    num_drivers=64
  else
    num_drivers=4
  fi

  local worker_image="${WORKSPACE}/presto-native-worker-${worker_type}.sqsh"
  [ ! -f "${worker_image}" ] && echo_error "worker image does not exist at ${worker_image}"

  # Make a copy of the worker config that can be given a unique id for this worker.
  cp -r "${CONFIGS}/etc_worker" "${CONFIGS}/etc_worker_${worker_id}"
  local worker_config="${CONFIGS}/etc_worker_${worker_id}/config_native.properties"
  local worker_node="${CONFIGS}/etc_worker_${worker_id}/node.properties"
  local worker_data="${WORKSPACE}/worker_data_${worker_id}"

  # Create unique configuration/data files for each worker:
  # Give each worker a unique port.
  sed -i "s+http-server\.http\.port.*+http-server\.http\.port=900${worker_id}+g" "${worker_config}"
  # Update discovery based on which node the coordinator is running on.
  sed -i "s+discovery\.uri.*+discovery\.uri=http://${COORD}:8080+g" "${worker_config}"
  sed -i "s+single-node-execution-enabled.*+single-node-execution-enabled=${SINGLE_NODE_EXECUTION}+g" "${worker_config}"
  sed -i "s+task.max-drivers-per-task.*+task.max-drivers-per-task=${num_drivers}+g" "${worker_config}"

  # Give each worker a unique id.
  sed -i "s+node\.id.*+node\.id=worker_${worker_id}+g" "${worker_node}"

  # Create unique data dir per worker.
  mkdir -p "${worker_data}"
  mkdir -p "${WORKSPACE}/.hive_metastore"

  # Run the worker with the new configs.
  # Fixed typo: CUDA_VISIBLE_DIVICES -> CUDA_VISIBLE_DEVICES, so each GPU
  # worker is actually pinned to its own device.
  CUDA_VISIBLE_DEVICES=${worker_id} srun -w "$NODE" --ntasks=1 --overlap \
    --container-image="${worker_image}" \
    --container-mounts="${WORKSPACE}:/workspace,\
${DATA}:/data,\
${CONFIGS}/etc_common:/opt/presto-server/etc,\
${worker_node}:/opt/presto-server/etc/node.properties,\
${worker_config}:/opt/presto-server/etc/config.properties,\
${worker_data}:/var/lib/presto/data,\
${WORKSPACE}/.hive_metastore:/var/lib/presto/data/hive/metastore" \
    --container-env=LD_LIBRARY_PATH="$CUDF_LIB:$LD_LIBRARY_PATH" \
    -- bash -lc "presto_server --etc-dir=/opt/presto-server/etc" > "${LOGS}/worker_${worker_id}.log" 2>&1 &
}

# Run a cli node that will connect to the coordinator and run queries that
# set up a tpch schema based on the create_schema.sql file.
function create_schema {
  local cli_cmd="/opt/presto-cli --server ${COORD}:8080 --catalog hive --schema default < /workspace/create_schema.sql"
  run_coord_image "${cli_cmd}" "cli"
}

# Run a cli node that will connect to the coordinator and run queries from
# queries.sql, with every query repeated <iterations> times.
# Results are stored in cli.log.
function run_queries {
  [ $# -ne 1 ] && echo_error "$0 expected one argument for '<iterations>'"
  local iterations=$1
  local repeated_sql="${WORKSPACE}/iterating_queries.sql"

  # Duplicate each line of queries.sql 'iterations' times, preserving order.
  awk -v n="$iterations" '{ for (i=1; i<=n; i++) print }' "${WORKSPACE}/queries.sql" > "${repeated_sql}"
  run_coord_image "/opt/presto-cli --server ${COORD}:8080 --catalog hive --schema tpch1k < /workspace/iterating_queries.sql" "cli"
  rm "${repeated_sql}"
}

# Check if the coordinator is running by polling its info endpoint via curl.
# Fails (via echo_error) after 10 retries, 5 seconds apart.
function wait_until_coordinator_is_running {
  validate_environment_preconditions COORD LOGS
  local state="INACTIVE"
  local attempt=0
  while [ "$attempt" -lt 10 ]; do
    state=$(curl -s "http://${COORD}:8080/v1/info/state" || true)
    # The endpoint returns a JSON string, quotes included.
    if [ "$state" = '"ACTIVE"' ]; then
      echo_success "$SLURM_JOB_ID coord started. state: $state" >> "$LOGS/out.log"
      return 0
    fi
    attempt=$((attempt + 1))
    sleep 5
  done
  echo_error "$SLURM_JOB_ID coord did not start. state: $state" >> "$LOGS/out.log"
}

# Check N nodes are registered with the coordinator. Fail after 10 retries,
# 5 seconds apart.
#   $1 - expected number of registered workers
function wait_for_workers_to_register {
  validate_environment_preconditions LOGS COORD
  [ $# -ne 1 ] && echo_error "$0 expected one argument for 'expected number of workers'"
  local expected_num_workers=$1
  local num_workers=0
  for i in {1..10}; do
    num_workers=$(curl -s "http://${COORD}:8080/v1/node" | jq length)
    if (( num_workers == expected_num_workers )); then
      echo_success "$SLURM_JOB_ID worker registered. num_nodes: $num_workers" >> "$LOGS/out.log"
      return 0
    fi
    sleep 5
  done
  # Fixed: the failure path previously logged the same "worker registered"
  # success text, masking registration timeouts in out.log.
  echo_error "$SLURM_JOB_ID workers did not register. num_nodes: $num_workers" >> "$LOGS/out.log"
}

# Abort (via echo_error) unless the given path is an existing regular file.
function validate_file_exists {
  if [ ! -f "$1" ]; then
    echo_error "$1 must exist in CONFIGS directory"
    exit 1
  fi
}

# Verify the CONFIGS directory contains every config file the cluster needs.
function validate_config_directory {
  validate_environment_preconditions CONFIGS
  local relative_path
  for relative_path in \
      etc_common/jvm.config \
      etc_common/log.properties \
      etc_coordinator/config_native.properties \
      etc_coordinator/node.properties \
      etc_worker/config_native.properties \
      etc_worker/node.properties; do
    validate_file_exists "${CONFIGS}/${relative_path}"
  done
}

# Reads from cli.log to get the query IDs and fetches their results from the
# coordinator. The results are stored in results.log.
function fetch_query_results {
  validate_environment_preconditions LOGS COORD
  local results_file="${LOGS}/results.log"
  local query_line query_id
  echo "" > "${results_file}"
  while IFS= read -r query_line; do
    # cli.log lines look like "Query <id>, ..."; capture the id up to the comma.
    [[ $query_line =~ Query[[:space:]]+([^,]+), ]] || continue
    query_id="${BASH_REMATCH[1]}"
    curl -s http://${COORD}:8080/v1/query/$query_id >> "${results_file}"
  done < <(grep "Query [^ ]*," "${LOGS}/cli.log")
}

# Parses results.log for the relevant timing data, storing the results in summary.csv.
# Assumes the queries are in order, and numbers them accordingly.
#   $1 - number of iterations each query was run; consecutive groups of this
#        size in results.log get the same query number (int((NR-1)/n)).
# Output: ${LOGS}/summary.csv with header query,state,elapsed_ms,execution_ms,cpu_ms
function parse_results {
[ $# -ne 1 ] && echo_error "$0 expected one argument for '<iterations>'"
local num_iterations=$1

validate_environment_preconditions LOGS
echo "query,state,elapsed_ms,execution_ms,cpu_ms" > ${LOGS}/summary.csv
# The jq program turns each query-result JSON object into one CSV row.
# 'durms' normalizes a duration that is either already a number (taken as ms)
# or a string like "300ms"/"1.5s"/"2m"/"1h" into milliseconds. For each stat
# the *Millis field is preferred, falling back to parsing the textual field,
# then to 0. NOTE(review): if a textual stat field is absent, the capture in
# durms raises a jq error rather than falling through to 0 — confirm the
# coordinator always emits these fields. The trailing awk prefixes each row
# with its query number and rejects a non-positive iteration count.
cat ${LOGS}/results.log | jq -r '
def durms:
if (type=="number") then .
else capture("(?<v>[0-9.]+)\\s*(?<u>ms|s|m|h)") as $c
| ($c.v|tonumber) * (if $c.u=="ms" then 1 elif $c.u=="s" then 1000 elif $c.u=="m" then 60000 else 3600000 end)
end;

. as $r
| [
($r.state // "UNKNOWN"),
($r.queryStats.elapsedTimeMillis // ($r.queryStats.elapsedTime | durms) // 0),
($r.queryStats.executionTimeMillis // ($r.queryStats.executionTime | durms) // 0),
($r.queryStats.totalCpuTimeMillis // ($r.queryStats.totalCpuTime | durms) // 0)
]
| @csv' | awk -v n="${num_iterations:-0}" '
BEGIN { if (n <= 0) { print "error: num_iterations must be > 0" > "/dev/stderr"; exit 1 } }
{ printf("%2d,%s\n", int((NR-1)/n), $0) }
' >> ${LOGS}/summary.csv
}
19 changes: 19 additions & 0 deletions scripts/common.sh
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we give this file a more specific name e.g. echo_helper.sh?

Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Shared echo helpers: colorized status output for the cluster scripts.

# ANSI color escape sequences.
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

# Print an error message in red and abort the current shell with status 1.
function echo_error {
  printf '%b\n' "${RED}$1${NC}"
  exit 1
}

# Print a warning message in yellow.
function echo_warning {
  printf '%b\n' "${YELLOW}$1${NC}"
}

# Print a success message in green.
function echo_success {
  printf '%b\n' "${GREEN}$1${NC}"
}