@@ -203,26 +203,22 @@ main() {
203203 # nlj uses range() function, no data generation needed
204204 ;;
205205 tpch)
206- data_tpch " 1"
206+ data_tpch " 1" " parquet "
207207 ;;
208208 tpch_mem)
209- # same data as for tpch
210- data_tpch " 1"
209+ data_tpch " 1" " parquet"
211210 ;;
212211 tpch_csv)
213- # same data as for tpch
214- data_tpch " 1"
212+ data_tpch " 1" " csv"
215213 ;;
216214 tpch10)
217- data_tpch " 10"
215+ data_tpch " 10" " parquet "
218216 ;;
219217 tpch_mem10)
220- # same data as for tpch10
221- data_tpch " 10"
218+ data_tpch " 10" " parquet"
222219 ;;
223220 tpch_csv10)
224- # same data as for tpch10
225- data_tpch " 10"
221+ data_tpch " 10" " csv"
226222 ;;
227223 clickbench_1)
228224 data_clickbench_1
@@ -537,7 +533,7 @@ main() {
537533# Creates TPCH data at a certain scale factor, if it doesn't already
538534# exist
539535#
540- # call like: data_tpch($scale_factor)
536+ # call like: data_tpch($scale_factor, $format), where $format is 'parquet' or 'csv'
541537#
542538# Creates data in $DATA_DIR/tpch_sf1 for scale factor 1
543539# Creates data in $DATA_DIR/tpch_sf10 for scale factor 10
@@ -548,9 +544,10 @@ data_tpch() {
548544 echo " Internal error: Scale factor not specified"
549545 exit 1
550546 fi
547+ FORMAT=$2
551548
552549 TPCH_DIR=" ${DATA_DIR} /tpch_sf${SCALE_FACTOR} "
553- echo " Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR} ..."
550+ echo " Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR} ..."
554551
555552 # Ensure the target data directory exists
556553 mkdir -p " ${TPCH_DIR} "
@@ -562,15 +559,6 @@ data_tpch() {
562559 exit 1
563560 fi
564561
565- # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
566- FILE=" ${TPCH_DIR} /supplier.tbl"
567- if test -f " ${FILE} " ; then
568- echo " tbl files exist ($FILE exists)."
569- else
570- echo " creating tbl files with tpchgen-cli..."
571- tpchgen-cli --scale-factor " ${SCALE_FACTOR} " --format tbl --output-dir " ${TPCH_DIR} "
572- fi
573-
574562 # Copy expected answers into the ./data/answers directory if it does not already exist
575563 FILE=" ${TPCH_DIR} /answers/q1.out"
576564 if test -f " ${FILE} " ; then
@@ -581,23 +569,32 @@ data_tpch() {
581569 docker run -v " ${TPCH_DIR} " :/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c " cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
582570 fi
583571
584- # Create 'parquet' files, one directory per file
585- FILE=" ${TPCH_DIR} /supplier"
586- if test -d " ${FILE} " ; then
587- echo " parquet files exist ($FILE exists)."
588- else
589- echo " creating parquet files using tpchgen-cli ..."
590- tpchgen-cli --scale-factor " ${SCALE_FACTOR} " --format parquet --parquet-compression=' ZSTD(1)' --parts=1 --output-dir " ${TPCH_DIR} "
572+ if [ " $FORMAT " = " parquet" ]; then
573+ # Create 'parquet' files, one directory per file
574+ FILE=" ${TPCH_DIR} /supplier"
575+ if test -d " ${FILE} " ; then
576+ echo " parquet files exist ($FILE exists)."
577+ else
578+ echo " creating parquet files using tpchgen-cli ..."
579+ tpchgen-cli --scale-factor " ${SCALE_FACTOR} " --format parquet --parquet-compression=' ZSTD(1)' --parts=1 --output-dir " ${TPCH_DIR} "
580+ fi
581+ return
591582 fi
592583
593- # Create 'csv' files
594- FILE=" ${TPCH_DIR} /csv/supplier"
595- if test -d " ${FILE} " ; then
596- echo " csv files exist ($FILE exists)."
597- else
598- echo " creating csv files using tpchgen-cli binary ..."
599- tpchgen-cli --scale-factor " ${SCALE_FACTOR} " --format csv --output-dir " ${TPCH_DIR} /csv"
584+ # Create 'csv' files, one directory per file
585+ if [ " $FORMAT " = " csv" ]; then
586+ FILE=" ${TPCH_DIR} /csv/supplier"
587+ if test -d " ${FILE} " ; then
588+ echo " csv files exist ($FILE exists)."
589+ else
590+ echo " creating csv files using tpchgen-cli binary ..."
591+ tpchgen-cli --scale-factor " ${SCALE_FACTOR} " --format csv --parts=1 --output-dir " ${TPCH_DIR} /csv"
592+ fi
593+ return
600594 fi
595+
596+ echo " Error: unknown format '$FORMAT ' for tpch data generation, expected 'parquet' or 'csv'"
597+ exit 1
601598}
602599
603600# Runs the tpch benchmark
@@ -614,10 +611,10 @@ run_tpch() {
614611 echo " Running tpch benchmark..."
615612
616613 FORMAT=$2
617- debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path " ${TPCH_DIR} " --prefer_hash_join " ${PREFER_HASH_JOIN} " --format ${FORMAT} -o " ${RESULTS_FILE} " ${QUERY_ARG}
614+ debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path " ${TPCH_DIR} " --prefer_hash_join " ${PREFER_HASH_JOIN} " --format ${FORMAT} -o " ${RESULTS_FILE} " ${QUERY_ARG}
618615}
619616
620- # Runs the tpch in memory
617+ # Runs the tpch benchmark in memory (needs tpch parquet data)
621618run_tpch_mem () {
622619 SCALE_FACTOR=$1
623620 if [ -z " $SCALE_FACTOR " ] ; then
@@ -630,7 +627,7 @@ run_tpch_mem() {
630627 echo " RESULTS_FILE: ${RESULTS_FILE} "
631628 echo " Running tpch_mem benchmark..."
632629 # -m means in memory
633- debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path " ${TPCH_DIR} " --prefer_hash_join " ${PREFER_HASH_JOIN} " -m --format parquet -o " ${RESULTS_FILE} " ${QUERY_ARG}
630+ debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path " ${TPCH_DIR} " --prefer_hash_join " ${PREFER_HASH_JOIN} " -m --format parquet -o " ${RESULTS_FILE} " ${QUERY_ARG}
634631}
635632
636633# Runs the compile profile benchmark helper
0 commit comments