Skip to content

Commit 907bce3

Browse files
committed
Remove unneeded benchmarking code
1 parent e0a9157 commit 907bce3

File tree

6 files changed

+35
-287
lines changed

6 files changed

+35
-287
lines changed

benchmarks/README.md

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -246,25 +246,8 @@ You can enable `mimalloc` or `snmalloc` (to use either the mimalloc or snmalloc
246246
cargo run --release --features "mimalloc" --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
247247
```
248248

249-
The benchmark program also supports CSV and Parquet input file formats and a utility is provided to convert from `tbl`
250-
(generated by the `dbgen` utility) to CSV and Parquet.
251-
252-
```bash
253-
cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parquet --format parquet
254-
```
255-
256249
Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.
257250

258-
#### Sorted Conversion
259-
260-
The TPCH tables generated by the dbgen utility are sorted by their first column (their primary key for most tables, the `l_orderkey` column for the `lineitem` table.)
261-
262-
To preserve this sorted order information during conversion (useful for benchmarking execution on pre-sorted data) include the `--sort` flag:
263-
264-
```bash
265-
cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-sorted-parquet --format parquet --sort
266-
```
267-
268251
### Comparing results between runs
269252

270253
Any `dfbench` execution with `-o <dir>` argument will produce a

benchmarks/bench.sh

Lines changed: 35 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -203,26 +203,22 @@ main() {
203203
# nlj uses range() function, no data generation needed
204204
;;
205205
tpch)
206-
data_tpch "1"
206+
data_tpch "1" "parquet"
207207
;;
208208
tpch_mem)
209-
# same data as for tpch
210-
data_tpch "1"
209+
data_tpch "1" "parquet"
211210
;;
212211
tpch_csv)
213-
# same data as for tpch
214-
data_tpch "1"
212+
data_tpch "1" "csv"
215213
;;
216214
tpch10)
217-
data_tpch "10"
215+
data_tpch "10" "parquet"
218216
;;
219217
tpch_mem10)
220-
# same data as for tpch10
221-
data_tpch "10"
218+
data_tpch "10" "parquet"
222219
;;
223220
tpch_csv10)
224-
# same data as for tpch10
225-
data_tpch "10"
221+
data_tpch "10" "csv"
226222
;;
227223
clickbench_1)
228224
data_clickbench_1
@@ -537,7 +533,7 @@ main() {
537533
# Creates TPCH data at a certain scale factor, if it doesn't already
538534
# exist
539535
#
540-
# call like: data_tpch($scale_factor)
536+
# call like: data_tpch($scale_factor, $format)
541537
#
542538
# Creates data in $DATA_DIR/tpch_sf1 for scale factor 1
543539
# Creates data in $DATA_DIR/tpch_sf10 for scale factor 10
@@ -548,9 +544,10 @@ data_tpch() {
548544
echo "Internal error: Scale factor not specified"
549545
exit 1
550546
fi
547+
FORMAT=$2
551548

552549
TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
553-
echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
550+
echo "Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
554551

555552
# Ensure the target data directory exists
556553
mkdir -p "${TPCH_DIR}"
@@ -562,15 +559,6 @@ data_tpch() {
562559
exit 1
563560
fi
564561

565-
# Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
566-
FILE="${TPCH_DIR}/supplier.tbl"
567-
if test -f "${FILE}"; then
568-
echo " tbl files exist ($FILE exists)."
569-
else
570-
echo " creating tbl files with tpchgen-cli..."
571-
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format tbl --output-dir "${TPCH_DIR}"
572-
fi
573-
574562
# Copy expected answers into the ./data/answers directory if it does not already exist
575563
FILE="${TPCH_DIR}/answers/q1.out"
576564
if test -f "${FILE}"; then
@@ -581,23 +569,32 @@ data_tpch() {
581569
docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
582570
fi
583571

584-
# Create 'parquet' files, one directory per file
585-
FILE="${TPCH_DIR}/supplier"
586-
if test -d "${FILE}"; then
587-
echo " parquet files exist ($FILE exists)."
588-
else
589-
echo " creating parquet files using tpchgen-cli ..."
590-
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format parquet --parquet-compression='ZSTD(1)' --parts=1 --output-dir "${TPCH_DIR}"
572+
if [ "$FORMAT" = "parquet" ]; then
573+
# Create 'parquet' files, one directory per file
574+
FILE="${TPCH_DIR}/supplier"
575+
if test -d "${FILE}"; then
576+
echo " parquet files exist ($FILE exists)."
577+
else
578+
echo " creating parquet files using tpchgen-cli ..."
579+
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format parquet --parquet-compression='ZSTD(1)' --parts=1 --output-dir "${TPCH_DIR}"
580+
fi
581+
return
591582
fi
592583

593-
# Create 'csv' files
594-
FILE="${TPCH_DIR}/csv/supplier"
595-
if test -d "${FILE}"; then
596-
echo " csv files exist ($FILE exists)."
597-
else
598-
echo " creating csv files using tpchgen-cli binary ..."
599-
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format csv --output-dir "${TPCH_DIR}/csv"
584+
# Create 'csv' files, one directory per file
585+
if [ "$FORMAT" = "csv" ]; then
586+
FILE="${TPCH_DIR}/csv/supplier"
587+
if test -d "${FILE}"; then
588+
echo " csv files exist ($FILE exists)."
589+
else
590+
echo " creating csv files using tpchgen-cli binary ..."
591+
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format csv --parts=1 --output-dir "${TPCH_DIR}/csv"
592+
fi
593+
return
600594
fi
595+
596+
echo "Error: unknown format '$FORMAT' for tpch data generation, expected 'parquet' or 'csv'"
597+
exit 1
601598
}
602599

603600
# Runs the tpch benchmark
@@ -614,10 +611,10 @@ run_tpch() {
614611
echo "Running tpch benchmark..."
615612

616613
FORMAT=$2
617-
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
614+
debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
618615
}
619616

620-
# Runs the tpch in memory
617+
# Runs the tpch in memory (needs tpch parquet data)
621618
run_tpch_mem() {
622619
SCALE_FACTOR=$1
623620
if [ -z "$SCALE_FACTOR" ] ; then
@@ -630,7 +627,7 @@ run_tpch_mem() {
630627
echo "RESULTS_FILE: ${RESULTS_FILE}"
631628
echo "Running tpch_mem benchmark..."
632629
# -m means in memory
633-
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
630+
debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
634631
}
635632

636633
# Runs the compile profile benchmark helper

benchmarks/src/bin/dfbench.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ enum Options {
4848
Nlj(nlj::RunOpt),
4949
SortTpch(sort_tpch::RunOpt),
5050
Tpch(tpch::RunOpt),
51-
TpchConvert(tpch::ConvertOpt),
5251
}
5352

5453
// Main benchmark runner entrypoint
@@ -65,6 +64,5 @@ pub async fn main() -> Result<()> {
6564
Options::Nlj(opt) => opt.run().await,
6665
Options::SortTpch(opt) => opt.run().await,
6766
Options::Tpch(opt) => Box::pin(opt.run()).await,
68-
Options::TpchConvert(opt) => opt.run().await,
6967
}
7068
}

benchmarks/src/bin/tpch.rs

Lines changed: 0 additions & 65 deletions
This file was deleted.

0 commit comments

Comments
 (0)