Skip to content
Draft
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions presto/cluster/README.md
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it might make sense to name the top level directory slurm instead of cluster.

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
These scripts run Presto on a Slurm cluster.

# Dispatch a job to create SF1k tables and track job progress.
WORKSPACE=<workspace_path> DATA=<data_path> ./dispatch.sh create --job-name <job_name> -A <account> -p <platform>

# Dispatch a job to run SF1k benchmark (after creation) and track job progress.
WORKSPACE=<workspace_path> DATA=<data_path> ./dispatch.sh run --job-name <job_name> -A <account> -p <platform>
15 changes: 15 additions & 0 deletions presto/cluster/create_benchmarks.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#SBATCH --time=00:25:00

# Slurm batch job: bring up a coordinator plus CPU workers, then create the
# SF1k benchmark tables. This can take a little while due to the cost of the
# ANALYZE steps; currently 25 min seems to be a decent limit.

source ../../scripts/common.sh
source slurm_functions.sh
source setup_coord.sh

# Workers need to be CPU to run ANALYZE on tables.
for (( worker = 0; worker < NUM_WORKERS; worker++ )); do
  run_worker "$worker" "cpu"
done

wait_for_workers_to_register "$NUM_WORKERS"

create_schema
23 changes: 23 additions & 0 deletions presto/cluster/dispatch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# This is a convenient wrapper for the run_benchmarks and create_benchmarks
# sbatch jobs: it dispatches the requested job type and tracks the current
# job for the user until it finishes, then prints the job output.
#
# Usage: dispatch.sh <create|run> [additional sbatch arguments...]

# Provides echo_error (prints in red and exits 1).
source ../../scripts/common.sh

# Clear logs/output from previous runs; -f so a fresh checkout doesn't error.
rm -f -- *.log
rm -f -- *.out

# At least the job type is required; remaining args are forwarded to sbatch
# (e.g. --job-name, -A, -p as shown in the README).
[ $# -lt 1 ] && echo_error "$0 expected at least one argument for 'create/run'"
JOB_TYPE="$1"
# Fixed: the original used '==' for both tests, so the check could never fire.
[ "$JOB_TYPE" != "create" ] && [ "$JOB_TYPE" != "run" ] && echo_error "parameter must be create or run"
shift 1

sbatch "$@" --nodes=1 --ntasks-per-node=10 "${JOB_TYPE}_benchmarks.sbatch"

echo "Waiting for jobs to finish..."
# Poll squeue until this user's job leaves the queue, refreshing one status line.
while :; do
  line=$(squeue | grep "$(whoami)")
  [ -z "$line" ] && break
  printf "\r%s" "$line"
  sleep 5
done
echo ""
cat ./*.out
16 changes: 16 additions & 0 deletions presto/cluster/run_benchmarks.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
#SBATCH --time=00:20:00

# Slurm batch job: bring up a coordinator plus GPU workers, run the SF1k
# benchmark queries 5 times each, then fetch and summarize the timing results.

source ../../scripts/common.sh
source slurm_functions.sh
source setup_coord.sh

for (( worker = 0; worker < NUM_WORKERS; worker++ )); do
  run_worker "$worker" "gpu"
done

wait_for_workers_to_register "$NUM_WORKERS"

run_queries 5
fetch_query_results
parse_results 5
34 changes: 34 additions & 0 deletions presto/cluster/setup_coord.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Outlined steps to be run from an sbatch script: verify the Slurm context
# we are running in, derive the cluster layout, and start the coordinator.

# These variables are populated by Slurm from the sbatch arguments; fail fast
# when any required one is missing.
if [ -z "$SLURM_JOB_NAME" ]; then echo_error "required argument '--job-name' not specified"; exit 1; fi
if [ -z "$SLURM_JOB_ACCOUNT" ]; then echo_error "required argument '--account' not specified"; exit 1; fi
if [ -z "$SLURM_JOB_PARTITION" ]; then echo_error "required argument '--partition' not specified"; exit 1; fi
#if [ -z "$SLURM_TIMELIMIT" ]; then echo_error "required argument '--time' not specified"; exit 1; fi
if [ -z "$SLURM_NTASKS_PER_NODE" ]; then echo_error "required argument '--ntasks-per-node' not specified"; exit 1; fi
if [ -z "$SLURM_NNODES" ]; then echo_error "required argument '--nodes' not specified"; exit 1; fi

NUM_WORKERS=1
LOGS="${WORKSPACE}/velox-testing/presto/cluster/"
CONFIGS="${WORKSPACE}/velox-testing/presto/docker/config/generated"
# Right now we assume one node that everything will run on.
# To support more nodes we just need to split the nodelist and assign the
# coord/each worker to a separate node. This will also require custom configs
# for each worker.
NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -1)
COORD=${NODE}
CUDF_LIB=/usr/lib64/presto-native-libs
# Single-node execution is enabled whenever the whole cluster is one worker.
if [ "${NUM_WORKERS}" -eq "1" ]; then
  SINGLE_NODE_EXECUTION=true
else
  SINGLE_NODE_EXECUTION=false
fi

if [ ! -d "$WORKSPACE" ]; then echo_error "WORKSPACE must be a valid directory"; exit 1; fi
if [ ! -d "$DATA" ]; then echo_error "DATA must be a valid directory"; exit 1; fi

validate_config_directory

run_coordinator

wait_until_coordinator_is_running
215 changes: 215 additions & 0 deletions presto/cluster/slurm_functions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#!/bin/bash

# Takes a list of environment variable names. Checks that each one is set and
# of non-zero length, reporting every offending variable in a single error.
function validate_environment_preconditions {
  local unset_vars=()
  local name
  for name in "$@"; do
    # ${!name+x} expands empty when unset; ${!name} expands empty when blank.
    [[ -n "${!name+x}" && -n "${!name}" ]] || unset_vars+=("$name")
  done
  if [[ ${#unset_vars[@]} -gt 0 ]]; then
    echo_error "required env var ${unset_vars[*]} not set"
  fi
}

# Execute a script through the coordinator image (used for coordinator and cli
# executables).
#   $1 - command line to run inside the container
#   $2 - "coord" (runs in the background) or "cli" (runs in the foreground)
# Output is captured in ${LOGS}/<type>.log.
function run_coord_image {
  # Fixed: this function consumes two arguments but previously checked for one.
  [ $# -ne 2 ] && echo_error "$0 expected one argument for '<script>' and one for '<coord/cli>'"
  validate_environment_preconditions LOGS CONFIGS WORKSPACE COORD DATA
  local script=$1
  local type=$2
  [ "$type" != "coord" ] && [ "$type" != "cli" ] && echo_error "coord type must be coord/cli"
  local log_file="${type}.log"

  local coord_image="${WORKSPACE}/presto-coordinator.sqsh"
  [ ! -f "${coord_image}" ] && echo_error "coord image does not exist at ${coord_image}"

  mkdir -p "${WORKSPACE}/.hive_metastore"

  # Build the command as an array instead of a flat string: with a string,
  # the embedded quotes around ${script} and the output redirection were
  # passed to srun as literal arguments rather than interpreted by the shell.
  local -a run_cmd=(srun -w "$COORD" --ntasks=1 --overlap
    --container-image="${coord_image}"
    --container-mounts="${WORKSPACE}:/workspace,\
${DATA}:/data,\
${CONFIGS}/etc_common:/opt/presto-server/etc,\
${CONFIGS}/etc_coordinator/node.properties:/opt/presto-server/etc/node.properties,\
${CONFIGS}/etc_coordinator/config_native.properties:/opt/presto-server/etc/config.properties,\
${WORKSPACE}/.hive_metastore:/var/lib/presto/data/hive/metastore"
    -- bash -lc "${script}")

  # Coordinator runs as a background process, whereas we want to wait for cli
  # so that the job will finish when the cli is done (terminating background
  # processes like the coordinator and workers).
  if [ "${type}" == "coord" ]; then
    "${run_cmd[@]}" > "${LOGS}/${log_file}" 2>&1 &
  else
    "${run_cmd[@]}" > "${LOGS}/${log_file}" 2>&1
  fi
}

# Runs a coordinator on a specific node with default configurations.
# Overrides the config files with the coord node and other needed updates.
function run_coordinator {
  # COORD is substituted into the discovery URI below, so validate it too
  # (it was previously missing from the precondition list).
  validate_environment_preconditions CONFIGS SINGLE_NODE_EXECUTION NODE COORD
  local coord_config="${CONFIGS}/etc_coordinator/config_native.properties"
  # Replace placeholder values in the generated configs.
  sed -i "s+discovery\.uri.*+discovery\.uri=http://${COORD}:8080+g" "${coord_config}"
  sed -i "s+single-node-execution-enabled.*+single-node-execution-enabled=${SINGLE_NODE_EXECUTION}+g" "${coord_config}"

  run_coord_image "/opt/presto-server/bin/launcher run" "coord"
}

# Runs a worker on a given node with custom configuration files which are
# generated as necessary.
#   $1 - numeric worker id (also selects the GPU device and the 900<id> port)
#   $2 - worker type: "cpu" or "gpu"
# The worker is launched in the background; logs go to ${LOGS}/worker_<id>.log.
function run_worker {
  [ $# -ne 2 ] && echo_error "$0 expected arguments 'worker_id' and 'worker_type'"
  validate_environment_preconditions LOGS CONFIGS WORKSPACE COORD SINGLE_NODE_EXECUTION NODE CUDF_LIB DATA

  local worker_id=$1
  local worker_type=$2
  [ "$worker_type" != "cpu" ] && [ "$worker_type" != "gpu" ] && echo_error "worker type must be gpu/cpu"
  # CPU workers get more drivers to parallelize across cores; made local so
  # the value no longer leaks into the caller's environment.
  local num_drivers
  if [ "$worker_type" == "cpu" ]; then
    num_drivers=64
  else
    num_drivers=4
  fi

  local worker_image="${WORKSPACE}/presto-native-worker-${worker_type}.sqsh"
  [ ! -f "${worker_image}" ] && echo_error "worker image does not exist at ${worker_image}"

  # Make a copy of the worker config that can be given a unique id for this worker.
  cp -r "${CONFIGS}/etc_worker" "${CONFIGS}/etc_worker_${worker_id}"
  local worker_config="${CONFIGS}/etc_worker_${worker_id}/config_native.properties"
  local worker_node="${CONFIGS}/etc_worker_${worker_id}/node.properties"
  local worker_data="${WORKSPACE}/worker_data_${worker_id}"

  # Create unique configuration/data files for each worker:
  # Give each worker a unique port.
  sed -i "s+http-server\.http\.port.*+http-server\.http\.port=900${worker_id}+g" "${worker_config}"
  # Update discovery based on which node the coordinator is running on.
  sed -i "s+discovery\.uri.*+discovery\.uri=http://${COORD}:8080+g" "${worker_config}"
  sed -i "s+single-node-execution-enabled.*+single-node-execution-enabled=${SINGLE_NODE_EXECUTION}+g" "${worker_config}"
  sed -i "s+task.max-drivers-per-task.*+task.max-drivers-per-task=${num_drivers}+g" "${worker_config}"

  # Give each worker a unique id.
  sed -i "s+node\.id.*+node\.id=worker_${worker_id}+g" "${worker_node}"

  # Create unique data dir per worker.
  mkdir -p "${worker_data}"
  mkdir -p "${WORKSPACE}/.hive_metastore"

  # Run the worker with the new configs.
  # Fixed typo: CUDA_VISIBLE_DIVICES -> CUDA_VISIBLE_DEVICES, so each GPU
  # worker is actually pinned to its own device.
  CUDA_VISIBLE_DEVICES=${worker_id} srun -w "$NODE" --ntasks=1 --overlap \
    --container-image="${worker_image}" \
    --container-mounts="${WORKSPACE}:/workspace,\
${DATA}:/data,\
${CONFIGS}/etc_common:/opt/presto-server/etc,\
${worker_node}:/opt/presto-server/etc/node.properties,\
${worker_config}:/opt/presto-server/etc/config.properties,\
${worker_data}:/var/lib/presto/data,\
${WORKSPACE}/.hive_metastore:/var/lib/presto/data/hive/metastore" \
    --container-env=LD_LIBRARY_PATH="$CUDF_LIB:$LD_LIBRARY_PATH" \
    -- bash -lc "presto_server --etc-dir=/opt/presto-server/etc" > "${LOGS}/worker_${worker_id}.log" 2>&1 &
}

# Run a cli node that will connect to the coordinator and run queries that
# set up a tpch schema based on the create_schema.sql file.
function create_schema {
  local cli_cmd="/opt/presto-cli --server ${COORD}:8080 --catalog hive --schema default < /workspace/create_schema.sql"
  run_coord_image "${cli_cmd}" "cli"
}

# Run a cli node that will connect to the coordinator and run queries from
# queries.sql, with every query repeated <iterations> times.
# Results are stored in cli.log.
function run_queries {
  [ $# -ne 1 ] && echo_error "$0 expected one argument for '<iterations>'"
  local iterations=$1
  local repeated_sql="${WORKSPACE}/iterating_queries.sql"

  # Duplicate each line of queries.sql 'iterations' times, preserving order.
  awk -v n="$iterations" '{ for (i=1; i<=n; i++) print }' "${WORKSPACE}/queries.sql" > "${repeated_sql}"
  run_coord_image "/opt/presto-cli --server ${COORD}:8080 --catalog hive --schema tpch1k < /workspace/iterating_queries.sql" "cli"
  rm "${repeated_sql}"
}

# Check if the coordinator is running by polling its info endpoint via curl.
# Fails (via echo_error) after 10 retries, 5 seconds apart.
function wait_until_coordinator_is_running {
  validate_environment_preconditions COORD LOGS
  local state="INACTIVE"
  local attempt=0
  while [ "$attempt" -lt 10 ]; do
    state=$(curl -s "http://${COORD}:8080/v1/info/state" || true)
    # The endpoint returns a JSON string, quotes included.
    if [ "$state" = '"ACTIVE"' ]; then
      echo_success "$SLURM_JOB_ID coord started. state: $state" >> "$LOGS/out.log"
      return 0
    fi
    attempt=$((attempt + 1))
    sleep 5
  done
  echo_error "$SLURM_JOB_ID coord did not start. state: $state" >> "$LOGS/out.log"
}

# Check N nodes are registered with the coordinator. Fail after 10 retries,
# 5 seconds apart.
#   $1 - expected number of registered workers
function wait_for_workers_to_register {
  validate_environment_preconditions LOGS COORD
  [ $# -ne 1 ] && echo_error "$0 expected one argument for 'expected number of workers'"
  local expected_num_workers=$1
  local num_workers=0
  for i in {1..10}; do
    num_workers=$(curl -s "http://${COORD}:8080/v1/node" | jq length)
    if (( num_workers == expected_num_workers )); then
      echo_success "$SLURM_JOB_ID worker registered. num_nodes: $num_workers" >> "$LOGS/out.log"
      return 0
    fi
    sleep 5
  done
  # Fixed: the failure path previously logged the same "worker registered"
  # success text, masking registration timeouts in out.log.
  echo_error "$SLURM_JOB_ID workers did not register. num_nodes: $num_workers" >> "$LOGS/out.log"
}

# Abort (via echo_error) unless the given path is an existing regular file.
function validate_file_exists {
  if [ ! -f "$1" ]; then
    echo_error "$1 must exist in CONFIGS directory"
    exit 1
  fi
}

# Verify the CONFIGS directory contains every config file the cluster needs.
function validate_config_directory {
  validate_environment_preconditions CONFIGS
  local relative_path
  for relative_path in \
      etc_common/jvm.config \
      etc_common/log.properties \
      etc_coordinator/config_native.properties \
      etc_coordinator/node.properties \
      etc_worker/config_native.properties \
      etc_worker/node.properties; do
    validate_file_exists "${CONFIGS}/${relative_path}"
  done
}

# Reads from cli.log to get the query IDs and fetches their results from the
# coordinator. The results are stored in results.log.
function fetch_query_results {
  validate_environment_preconditions LOGS COORD
  local results_file="${LOGS}/results.log"
  local query_line query_id
  echo "" > "${results_file}"
  while IFS= read -r query_line; do
    # cli.log lines look like "Query <id>, ..."; capture the id up to the comma.
    [[ $query_line =~ Query[[:space:]]+([^,]+), ]] || continue
    query_id="${BASH_REMATCH[1]}"
    curl -s http://${COORD}:8080/v1/query/$query_id >> "${results_file}"
  done < <(grep "Query [^ ]*," "${LOGS}/cli.log")
}

# Parses results.log for the relevant timing data, storing the results in summary.csv.
# Assumes the queries are in order, and numbers them accordingly.
#   $1 - number of iterations each query was run; consecutive groups of this
#        size in results.log get the same query number (int((NR-1)/n)).
# Output: ${LOGS}/summary.csv with header query,state,elapsed_ms,execution_ms,cpu_ms
function parse_results {
[ $# -ne 1 ] && echo_error "$0 expected one argument for '<iterations>'"
local num_iterations=$1

validate_environment_preconditions LOGS
echo "query,state,elapsed_ms,execution_ms,cpu_ms" > ${LOGS}/summary.csv
# The jq program turns each query-result JSON object into one CSV row.
# 'durms' normalizes a duration that is either already a number (taken as ms)
# or a string like "300ms"/"1.5s"/"2m"/"1h" into milliseconds. For each stat
# the *Millis field is preferred, falling back to parsing the textual field,
# then to 0. NOTE(review): if a textual stat field is absent, the capture in
# durms raises a jq error rather than falling through to 0 — confirm the
# coordinator always emits these fields. The trailing awk prefixes each row
# with its query number and rejects a non-positive iteration count.
cat ${LOGS}/results.log | jq -r '
def durms:
if (type=="number") then .
else capture("(?<v>[0-9.]+)\\s*(?<u>ms|s|m|h)") as $c
| ($c.v|tonumber) * (if $c.u=="ms" then 1 elif $c.u=="s" then 1000 elif $c.u=="m" then 60000 else 3600000 end)
end;

. as $r
| [
($r.state // "UNKNOWN"),
($r.queryStats.elapsedTimeMillis // ($r.queryStats.elapsedTime | durms) // 0),
($r.queryStats.executionTimeMillis // ($r.queryStats.executionTime | durms) // 0),
($r.queryStats.totalCpuTimeMillis // ($r.queryStats.totalCpuTime | durms) // 0)
]
| @csv' | awk -v n="${num_iterations:-0}" '
BEGIN { if (n <= 0) { print "error: num_iterations must be > 0" > "/dev/stderr"; exit 1 } }
{ printf("%2d,%s\n", int((NR-1)/n), $0) }
' >> ${LOGS}/summary.csv
}
19 changes: 19 additions & 0 deletions scripts/common.sh
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we give this file a more specific name e.g. echo_helper.sh?

Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Shared echo helpers: colorized status output for the cluster scripts.

# ANSI color escape sequences.
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

# Print an error message in red and abort the current shell with status 1.
function echo_error {
  printf '%b\n' "${RED}$1${NC}"
  exit 1
}

# Print a warning message in yellow.
function echo_warning {
  printf '%b\n' "${YELLOW}$1${NC}"
}

# Print a success message in green.
function echo_success {
  printf '%b\n' "${GREEN}$1${NC}"
}