Add sorted data benchmark. #19042
base: main
Changes from all commits
**benchmarks/bench.sh**

```diff
@@ -99,6 +99,9 @@ clickbench_partitioned: ClickBench queries against partitioned (100 files) parquet
 clickbench_pushdown: ClickBench queries against partitioned (100 files) parquet w/ filter_pushdown enabled
 clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
 
+# Sorted Data Benchmarks (ORDER BY Optimization)
+clickbench_sorted: ClickBench queries on pre-sorted data using prefer_existing_sort (tests sort elimination optimization)
+
 # H2O.ai Benchmarks (Group By, Join, Window)
 h2o_small: h2oai benchmark with small dataset (1e7 rows) for groupby, default file format is csv
 h2o_medium: h2oai benchmark with medium dataset (1e8 rows) for groupby, default file format is csv
```
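For context on the "sort elimination optimization" mentioned in the usage text, here is a rough sketch of what `prefer_existing_sort` enables, written in the script's own datafusion-cli heredoc style. The DDL, the relative paths, and the binary location are illustrative assumptions, not part of this PR, and exact plan output varies by DataFusion version:

```bash
# Illustrative sketch (not part of the patch). WITH ORDER declares the file's
# existing ordering so the optimizer knows rows already arrive sorted.
./target/release/datafusion-cli << EOF
CREATE EXTERNAL TABLE hits
STORED AS PARQUET
WITH ORDER ("EventTime" ASC)
LOCATION 'data/hits_sorted.parquet';

SET datafusion.optimizer.prefer_existing_sort = true;

-- With the ordering declared, the physical plan for this query should be able
-- to drop the SortExec instead of re-sorting the whole file.
EXPLAIN SELECT "EventTime" FROM hits ORDER BY "EventTime" ASC LIMIT 10;
EOF
```

The benchmark added below automates this via dfbench's `--sorted-by` flag together with `-c datafusion.optimizer.prefer_existing_sort=true`.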
```diff
@@ -318,6 +321,9 @@ main() {
         compile_profile)
             data_tpch "1" "parquet"
             ;;
+        clickbench_sorted)
+            clickbench_sorted
+            ;;
         *)
             echo "Error: unknown benchmark '$BENCHMARK' for data generation"
             usage
```
```diff
@@ -449,7 +455,7 @@ main() {
         h2o_medium_window)
             run_h2o_window "MEDIUM" "CSV" "window"
             ;;
-         h2o_big_window)
+        h2o_big_window)
            run_h2o_window "BIG" "CSV" "window"
            ;;
        h2o_small_parquet)
```
```diff
@@ -501,6 +507,9 @@ main() {
         compile_profile)
             run_compile_profile "${PROFILE_ARGS[@]}"
             ;;
+        clickbench_sorted)
+            run_clickbench_sorted
+            ;;
         *)
             echo "Error: unknown benchmark '$BENCHMARK' for run"
             usage
```
```diff
@@ -1201,6 +1210,113 @@ compare_benchmarks() {
 }
 
+# Creates sorted ClickBench data from hits.parquet (full dataset)
+# The data is sorted by EventTime in ascending order
+# Uses datafusion-cli to reduce dependencies
+clickbench_sorted() {
+    SORTED_FILE="${DATA_DIR}/hits_sorted.parquet"
+    ORIGINAL_FILE="${DATA_DIR}/hits.parquet"
+
+    # Default memory limit is 12GB, can be overridden with DATAFUSION_MEMORY_GB env var
+    MEMORY_LIMIT_GB=${DATAFUSION_MEMORY_GB:-12}
+
+    echo "Creating sorted ClickBench dataset from hits.parquet..."
+    echo "Configuration:"
+    echo "  Memory limit: ${MEMORY_LIMIT_GB}G"
+    echo "  Row group size: 64K rows"
+    echo "  Compression: uncompressed"
+
+    if [ ! -f "${ORIGINAL_FILE}" ]; then
+        echo "hits.parquet not found. Running data_clickbench_1 first..."
+        data_clickbench_1
+    fi
+
+    if [ -f "${SORTED_FILE}" ]; then
+        echo "Sorted hits.parquet already exists at ${SORTED_FILE}"
+        return 0
+    fi
+
+    echo "Sorting hits.parquet by EventTime (this may take several minutes)..."
+
+    pushd "${DATAFUSION_DIR}" > /dev/null
+    echo "Building datafusion-cli..."
+    cargo build --release --bin datafusion-cli
```
**Contributor:** nice
```diff
+    DATAFUSION_CLI="${DATAFUSION_DIR}/target/release/datafusion-cli"
+    popd > /dev/null
+
+    START_TIME=$(date +%s)
+    echo "Start time: $(date '+%Y-%m-%d %H:%M:%S')"
+    echo "Using datafusion-cli to create sorted parquet file..."
+    "${DATAFUSION_CLI}" << EOF
+-- Memory and performance configuration
+SET datafusion.runtime.memory_limit = '${MEMORY_LIMIT_GB}G';
```
**Contributor (author):** I already set the memory limit here, @alamb. I think it's similar to the `-m` limit.
```diff
+SET datafusion.execution.spill_compression = 'uncompressed';
+SET datafusion.execution.sort_spill_reservation_bytes = 10485760; -- 10MB
+SET datafusion.execution.batch_size = 8192;
+SET datafusion.execution.target_partitions = 1;
```
**Contributor (author):** @alamb I tried target_partitions locally; it only avoids OOM when set to 1. Even at 2 it will OOM, so I set it to 1 here. I am not sure why.

**Contributor (author):** But it works for the huge dataset.

**Contributor (alamb):** I think by default there is no memory limit. You can potentially limit the memory usage like this:

```diff
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index e79cca1a2a..d227fffde6 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -1244,8 +1244,8 @@ data_sorted_clickbench() {
     DATAFUSION_CLI="${DATAFUSION_DIR}/target/release/datafusion-cli"
     popd > /dev/null
 
-    echo "Using datafusion-cli to create sorted parquet file..."
-    "${DATAFUSION_CLI}" << EOF
+    echo "Using datafusion-cli (4GB memory) to create sorted parquet file..."
+    "${DATAFUSION_CLI}" -m 4 << EOF
 -- Memory and performance configuration
 SET datafusion.runtime.memory_limit = '${MEMORY_LIMIT_GB}G';
 SET datafusion.execution.spill_compression = 'uncompressed';
```

However, when I tried that it still wasn't able to re-sort the data 🤔

**Contributor (author):** I already set the memory limit here. I think we can keep target_partitions at 1 for the first step; I can investigate more as a follow-up, and maybe we can speed it up.

**Contributor (author):** With more than one partition it fails:

```
Resources exhausted: Additional allocation failed for ExternalSorterMerge[1] with top memory consumers (across reservations) as:
  ExternalSorter[2]#13(can spill: true) consumed 3.7 GB, peak 4.8 GB,
  ExternalSorter[3]#15(can spill: true) consumed 3.5 GB, peak 4.4 GB,
  ExternalSorterMerge[2]#14(can spill: false) consumed 2.3 GB, peak 2.3 GB,
  ExternalSorterMerge[1]#12(can spill: false) consumed 1004.2 MB, peak 1694.0 MB,
  ExternalSorterMerge[3]#16(can spill: false) consumed 845.7 MB, peak 1798.9 MB.
Error: Failed to allocate additional 12.7 MB for ExternalSorterMerge[1] with 998.9 MB already allocated for this reservation - 689.7 KB remain available for the total pool
```

ExternalSorterMerge seems to cause the resources-exhausted error when we have more than one partition (note it reports `can spill: false`, so it cannot release memory by spilling).

**Contributor (author):** Updated. @alamb, I added the duration logs in the latest PR. For the default behavior (12 GB memory and 1 target partition) it is fast on my local Mac, less than 5 minutes:

```
+----------+
| count    |
+----------+
| 99997497 |
+----------+
1 row(s) fetched.
Elapsed 278.468 seconds.

End time: 2025-12-06 16:27:54
✓ Successfully created sorted ClickBench dataset
  Input: 14095 MB
  Output: 36159 MB

Time Statistics:
  Total duration: 280 seconds (00:04:40)
  Throughput: 50 MB/s
```

(The reported throughput is input size over wall time: 14095 MB / 280 s ≈ 50 MB/s, matching the script's `INPUT_MB / DURATION` computation.)
```diff
+
+-- Parquet output configuration
+SET datafusion.execution.parquet.max_row_group_size = 65536;
+SET datafusion.execution.parquet.compression = 'uncompressed';
+
+-- Execute sort and write
+COPY (SELECT * FROM '${ORIGINAL_FILE}' ORDER BY "EventTime")
+TO '${SORTED_FILE}'
+STORED AS PARQUET;
+EOF
```
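As an aside (not part of the patch), once the `COPY` completes, the output can be sanity-checked directly in datafusion-cli; the relative paths and binary location below are assumptions based on the default data layout:

```bash
# Optional sanity check: row counts for the input and the sorted output should
# match; the review thread above shows 99997497 rows for the full dataset.
./target/release/datafusion-cli << EOF
SELECT count(*) FROM 'data/hits.parquet';
SELECT count(*) FROM 'data/hits_sorted.parquet';
EOF
```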
```diff
+    local result=$?
+
+    END_TIME=$(date +%s)
+    DURATION=$((END_TIME - START_TIME))
+    echo "End time: $(date '+%Y-%m-%d %H:%M:%S')"
+
+    if [ $result -eq 0 ]; then
+        echo "✓ Successfully created sorted ClickBench dataset"
+
+        INPUT_SIZE=$(stat -f%z "${ORIGINAL_FILE}" 2>/dev/null || stat -c%s "${ORIGINAL_FILE}" 2>/dev/null)
+        OUTPUT_SIZE=$(stat -f%z "${SORTED_FILE}" 2>/dev/null || stat -c%s "${SORTED_FILE}" 2>/dev/null)
+        INPUT_MB=$((INPUT_SIZE / 1024 / 1024))
+        OUTPUT_MB=$((OUTPUT_SIZE / 1024 / 1024))
+
+        echo "  Input: ${INPUT_MB} MB"
```
**Contributor (alamb):** I ran this and it showed:

```
✓ Successfully created sorted ClickBench dataset
  Input: 14095 MB
  Output: 36159 MB
```

I think that is due to the lack of compression.

**Contributor (author):** Yes @alamb, it was suggested by @2010YOUY01 here: #19042 (comment). Because we want to speed up the sort during data generation and we don't care about compression here, I set it to uncompressed:

```sql
SET datafusion.execution.parquet.compression = 'uncompressed';
```
```diff
+        echo "  Output: ${OUTPUT_MB} MB"
+
+        echo ""
+        echo "Time Statistics:"
+        echo "  Total duration: ${DURATION} seconds ($(printf '%02d:%02d:%02d' $((DURATION/3600)) $((DURATION%3600/60)) $((DURATION%60))))"
+        echo "  Throughput: $((INPUT_MB / DURATION)) MB/s"
+
+        return 0
+    else
+        echo "✗ Error: Failed to create sorted dataset"
+        echo "💡 Tip: Try increasing memory with: DATAFUSION_MEMORY_GB=16 ./bench.sh data clickbench_sorted"
+        return 1
+    fi
+}
+
+# Runs the sorted data benchmark with prefer_existing_sort configuration
+run_clickbench_sorted() {
+    RESULTS_FILE="${RESULTS_DIR}/clickbench_sorted.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running sorted data benchmark with prefer_existing_sort optimization..."
+
+    # Ensure sorted data exists
+    clickbench_sorted
+
+    # Run benchmark with prefer_existing_sort configuration
+    # This allows DataFusion to optimize away redundant sorts while maintaining parallelism
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench \
+        --iterations 5 \
+        --path "${DATA_DIR}/hits_sorted.parquet" \
+        --queries-path "${SCRIPT_DIR}/queries/clickbench/queries/sorted_data" \
+        --sorted-by "EventTime" \
+        -c datafusion.optimizer.prefer_existing_sort=true \
+        -o "${RESULTS_FILE}" \
+        ${QUERY_ARG}
+}
+
 setup_venv() {
     python3 -m venv "$VIRTUAL_ENV"
    PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt
```
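Putting the pieces together, the expected usage mirrors the existing bench.sh subcommands; this is a sketch based on the case arms and the failure tip above, with `DATAFUSION_MEMORY_GB` optional:

```bash
# Generate the sorted dataset (creates hits.parquet first if missing);
# override the default 12 GB sort memory limit if needed.
DATAFUSION_MEMORY_GB=16 ./bench.sh data clickbench_sorted

# Run the sorted-data queries against the pre-sorted file; results are
# written to "${RESULTS_DIR}/clickbench_sorted.json" per run_clickbench_sorted.
./bench.sh run clickbench_sorted
```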
**New file under benchmarks/queries/clickbench/queries/sorted_data/**

```diff
@@ -0,0 +1,3 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+SELECT * FROM hits ORDER BY "EventTime" DESC limit 10;
```
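The query above sorts descending even though the file is sorted ascending. A hypothetical ascending companion query (not in this PR) would exercise the most direct sort-elimination path, since its ORDER BY matches the file's physical order exactly:

```sql
-- Hypothetical addition: the ORDER BY direction matches the on-disk order, so
-- with prefer_existing_sort=true the sort should be elided entirely.
SELECT "EventTime", "UserID" FROM hits ORDER BY "EventTime" ASC LIMIT 10;
```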