From 98c3342a5e071926792c60ffef39faf08bfeddd5 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sat, 25 Jan 2025 23:41:42 +0200 Subject: [PATCH 1/3] bench: add array_agg benchmark --- datafusion/functions-aggregate/Cargo.toml | 4 + .../functions-aggregate/benches/array_agg.rs | 178 ++++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 datafusion/functions-aggregate/benches/array_agg.rs diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml index 81e5233a1516..333f0d9cdd79 100644 --- a/datafusion/functions-aggregate/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -66,3 +66,7 @@ harness = false [[bench]] name = "sum" harness = false + +[[bench]] +name = "array_agg" +harness = false diff --git a/datafusion/functions-aggregate/benches/array_agg.rs b/datafusion/functions-aggregate/benches/array_agg.rs new file mode 100644 index 000000000000..b117694ea092 --- /dev/null +++ b/datafusion/functions-aggregate/benches/array_agg.rs @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray, ListArray}; +use arrow::datatypes::Int64Type; +use arrow::util::bench_util::create_primitive_array; +use arrow_schema::Field; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::Accumulator; +use datafusion_functions_aggregate::array_agg::ArrayAggAccumulator; + +use arrow::util::test_util::seedable_rng; +use arrow_buffer::{NullBufferBuilder, OffsetBuffer}; +use rand::Rng; +use rand::distributions::{Distribution, Standard}; + +fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) { + let list_item_data_type = values.as_list::().values().data_type().clone(); + c.bench_function(name, |b| { + b.iter(|| { + black_box( + ArrayAggAccumulator::try_new(&list_item_data_type) + .unwrap() + .merge_batch(&[values.clone()]) + .unwrap(), + ) + }) + }); +} + +/// Create List array with the given item data type, null density, null locations and non zero length lists density +/// Creates an random (but fixed-seeded) array of a given size and null density +pub fn create_list_array( + size: usize, + null_density: f32, + non_zero_length_lists_probability: f32, +) -> ListArray +where + T: ArrowPrimitiveType, + Standard: Distribution, +{ + let mut nulls_builder = NullBufferBuilder::new(size); + let mut rng = seedable_rng(); + + let offsets = OffsetBuffer::from_lengths((0..size).map(|_| { + let is_null = rng.gen::() < null_density; + + let mut length = rng.gen_range(1..10); + + if is_null { + nulls_builder.append_null(); + + if rng.gen::() > non_zero_length_lists_probability { + length = 0; + } + } else { + nulls_builder.append_non_null(); + } + + length + })); + + let length = *offsets.last().unwrap() as usize; + + let values = create_primitive_array::(length, 0.0); + + let field = Field::new_list_field(T::DATA_TYPE, true); + + ListArray::new( + Arc::new(field), + offsets, + Arc::new(values), + nulls_builder.finish(), + ) +} + +fn array_agg_benchmark(c: &mut Criterion) { + + let values = Arc::new(create_list_array::(8192, 0.0, 0.0)) as ArrayRef; + merge_batch_bench(c, "array_agg i64 merge_batch no nulls", values); + + let values = Arc::new(create_list_array::(8192, 1.0, 0.0)) as ArrayRef; + merge_batch_bench(c, "array_agg i64 merge_batch all nulls, 100% of nulls point to a zero length array", values); + + let values = Arc::new(create_list_array::(8192, 1.0, 0.1)) as ArrayRef; + merge_batch_bench(c, "array_agg i64 merge_batch all nulls, 90% of nulls point to a zero length array", values); + + // All nulls point to a 0 length array + + let values = Arc::new(create_list_array::(8192, 0.3, 0.0)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 30% nulls, 100% of nulls point to a zero length array", + values, + ); + + let values = Arc::new(create_list_array::(8192, 0.7, 0.0)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 70% nulls, 100% of nulls point to a zero length array", + values, + ); + + let values = Arc::new(create_list_array::(8192, 0.3, 0.01)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 30% nulls, 99% of nulls point to a zero length array", + values, + ); + + let values = Arc::new(create_list_array::(8192, 0.7, 0.01)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 70% nulls, 99% of nulls point to a zero length array", + values, + ); + + let values = Arc::new(create_list_array::(8192, 0.3, 0.10)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 30% nulls, 90% of nulls point to a zero length array", + values, + ); + + let values = Arc::new(create_list_array::(8192, 0.7, 0.10)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 70% nulls, 90% of nulls point to a zero length array", + values, + ); + + let values = Arc::new(create_list_array::(8192, 0.3, 0.50)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 30% nulls, 50% of nulls point to a zero length array", + values, + ); + + let values = Arc::new(create_list_array::(8192, 0.7, 0.50)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 70% nulls, 50% of nulls point to a zero length array", + values, + ); + + let values = Arc::new(create_list_array::(8192, 0.3, 1.0)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 30% nulls, 0% of nulls point to a zero length array", + values, + ); + + let values = Arc::new(create_list_array::(8192, 0.7, 1.0)) as ArrayRef; + merge_batch_bench( + c, + "array_agg i64 merge_batch 70% nulls, 0% of nulls point to a zero length array", + values, + ); +} + +criterion_group!(benches, array_agg_benchmark); +criterion_main!(benches); From 21fed11ebc7c15652b21f63c4e066dfc5ce14a9a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sat, 25 Jan 2025 23:59:49 +0200 Subject: [PATCH 2/3] format --- .../functions-aggregate/benches/array_agg.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-aggregate/benches/array_agg.rs b/datafusion/functions-aggregate/benches/array_agg.rs index b117694ea092..bc674ade7c7d 100644 --- a/datafusion/functions-aggregate/benches/array_agg.rs +++ b/datafusion/functions-aggregate/benches/array_agg.rs @@ -27,13 +27,14 @@ use datafusion_functions_aggregate::array_agg::ArrayAggAccumulator; use arrow::util::test_util::seedable_rng; use arrow_buffer::{NullBufferBuilder, OffsetBuffer}; -use rand::Rng; use rand::distributions::{Distribution, Standard}; +use rand::Rng; fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) { let list_item_data_type = values.as_list::().values().data_type().clone(); c.bench_function(name, |b| { b.iter(|| { + #[allow(clippy::unit_arg)] black_box( ArrayAggAccumulator::try_new(&list_item_data_type) .unwrap() @@ -91,15 +92,22 @@ where } fn array_agg_benchmark(c: &mut Criterion) { - let values = Arc::new(create_list_array::(8192, 0.0, 0.0)) as ArrayRef; merge_batch_bench(c, "array_agg i64 merge_batch no nulls", values); let values = Arc::new(create_list_array::(8192, 1.0, 0.0)) as ArrayRef; - merge_batch_bench(c, "array_agg i64 merge_batch all nulls, 100% of nulls point to a zero length array", values); + merge_batch_bench( + c, + "array_agg i64 merge_batch all nulls, 100% of nulls point to a zero length array", + values, + ); let values = Arc::new(create_list_array::(8192, 1.0, 0.1)) as ArrayRef; - merge_batch_bench(c, "array_agg i64 merge_batch all nulls, 90% of nulls point to a zero length array", values); + merge_batch_bench( + c, + "array_agg i64 merge_batch all nulls, 90% of nulls point to a zero length array", + values, + ); // All nulls point to a 0 length array From 07004dc4e3eaff65cbc3e497a5d6ed473c768a8a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sun, 26 Jan 2025 14:04:11 +0200 Subject: [PATCH 3/3] rename variable --- .../functions-aggregate/benches/array_agg.rs | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/datafusion/functions-aggregate/benches/array_agg.rs b/datafusion/functions-aggregate/benches/array_agg.rs index bc674ade7c7d..c4599cdfc9b3 100644 --- a/datafusion/functions-aggregate/benches/array_agg.rs +++ b/datafusion/functions-aggregate/benches/array_agg.rs @@ -45,12 +45,12 @@ fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) { }); } -/// Create List array with the given item data type, null density, null locations and non zero length lists density +/// Create List array with the given item data type, null density, null locations and zero length lists density /// Creates an random (but fixed-seeded) array of a given size and null density pub fn create_list_array( size: usize, null_density: f32, - non_zero_length_lists_probability: f32, + zero_length_lists_probability: f32, ) -> ListArray where T: ArrowPrimitiveType, @@ -67,7 +67,7 @@ where if is_null { nulls_builder.append_null(); - if rng.gen::() > non_zero_length_lists_probability { + if rng.gen::() <= zero_length_lists_probability { length = 0; } } else { @@ -92,17 +92,17 @@ where } fn array_agg_benchmark(c: &mut Criterion) { - let values = Arc::new(create_list_array::(8192, 0.0, 0.0)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 0.0, 1.0)) as ArrayRef; merge_batch_bench(c, "array_agg i64 merge_batch no nulls", values); - let values = Arc::new(create_list_array::(8192, 1.0, 0.0)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 1.0, 1.0)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch all nulls, 100% of nulls point to a zero length array", values, ); - let values = Arc::new(create_list_array::(8192, 1.0, 0.1)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 1.0, 0.9)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch all nulls, 90% of nulls point to a zero length array", @@ -111,42 +111,42 @@ fn array_agg_benchmark(c: &mut Criterion) { // All nulls point to a 0 length array - let values = Arc::new(create_list_array::(8192, 0.3, 0.0)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 0.3, 1.0)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch 30% nulls, 100% of nulls point to a zero length array", values, ); - let values = Arc::new(create_list_array::(8192, 0.7, 0.0)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 0.7, 1.0)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch 70% nulls, 100% of nulls point to a zero length array", values, ); - let values = Arc::new(create_list_array::(8192, 0.3, 0.01)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 0.3, 0.99)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch 30% nulls, 99% of nulls point to a zero length array", values, ); - let values = Arc::new(create_list_array::(8192, 0.7, 0.01)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 0.7, 0.99)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch 70% nulls, 99% of nulls point to a zero length array", values, ); - let values = Arc::new(create_list_array::(8192, 0.3, 0.10)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 0.3, 0.9)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch 30% nulls, 90% of nulls point to a zero length array", values, ); - let values = Arc::new(create_list_array::(8192, 0.7, 0.10)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 0.7, 0.9)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch 70% nulls, 90% of nulls point to a zero length array", @@ -167,14 +167,14 @@ fn array_agg_benchmark(c: &mut Criterion) { values, ); - let values = Arc::new(create_list_array::(8192, 0.3, 1.0)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 0.3, 0.0)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch 30% nulls, 0% of nulls point to a zero length array", values, ); - let values = Arc::new(create_list_array::(8192, 0.7, 1.0)) as ArrayRef; + let values = Arc::new(create_list_array::(8192, 0.7, 0.0)) as ArrayRef; merge_batch_bench( c, "array_agg i64 merge_batch 70% nulls, 0% of nulls point to a zero length array",