Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions datafusion/common/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3525,6 +3525,12 @@ impl ScalarValue {
}
}
}

/// Compacts ([ScalarValue::compact]) the current [ScalarValue] and returns it.
pub fn compacted(mut self) -> Self {
self.compact();
self
}
}

pub fn copy_array_data(data: &ArrayData) -> ArrayData {
Expand Down
83 changes: 78 additions & 5 deletions datafusion/functions-aggregate/src/array_agg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
//! `ARRAY_AGG` aggregate implementation: [`ArrayAgg`]

use arrow::array::{
new_empty_array, Array, ArrayRef, AsArray, BooleanArray, ListArray, StructArray,
make_array, new_empty_array, Array, ArrayRef, AsArray, BooleanArray, ListArray,
StructArray,
};
use arrow::compute::{filter, SortOptions};
use arrow::datatypes::{DataType, Field, FieldRef, Fields};

use datafusion_common::cast::as_list_array;
use datafusion_common::scalar::copy_array_data;
use datafusion_common::utils::{get_row_at_idx, SingleRowListArrayBuilder};
use datafusion_common::{exec_err, ScalarValue};
use datafusion_common::{internal_err, Result};
Expand Down Expand Up @@ -319,7 +321,11 @@ impl Accumulator for ArrayAggAccumulator {
};

if !val.is_empty() {
self.values.push(val);
// The ArrayRef might be holding a reference to its original input buffer, so
// storing it here directly copied/compacted avoids over accounting memory
// not used here.
self.values
.push(make_array(copy_array_data(&val.to_data())));
}

Ok(())
Expand Down Expand Up @@ -429,7 +435,8 @@ impl Accumulator for DistinctArrayAggAccumulator {
if nulls.is_none_or(|nulls| nulls.null_count() < val.len()) {
for i in 0..val.len() {
if nulls.is_none_or(|nulls| nulls.is_valid(i)) {
self.values.insert(ScalarValue::try_from_array(val, i)?);
self.values
.insert(ScalarValue::try_from_array(val, i)?.compacted());
}
}
}
Expand Down Expand Up @@ -558,8 +565,14 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
if nulls.is_none_or(|nulls| nulls.null_count() < val.len()) {
for i in 0..val.len() {
if nulls.is_none_or(|nulls| nulls.is_valid(i)) {
self.values.push(ScalarValue::try_from_array(val, i)?);
self.ordering_values.push(get_row_at_idx(ord, i)?)
self.values
.push(ScalarValue::try_from_array(val, i)?.compacted());
self.ordering_values.push(
get_row_at_idx(ord, i)?
.into_iter()
.map(|v| v.compacted())
.collect(),
)
}
}
}
Expand Down Expand Up @@ -722,6 +735,7 @@ impl OrderSensitiveArrayAggAccumulator {
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::{ListBuilder, StringBuilder};
use arrow::datatypes::{FieldRef, Schema};
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::internal_err;
Expand Down Expand Up @@ -988,6 +1002,56 @@ mod tests {
Ok(())
}

#[test]
fn does_not_over_account_memory() -> Result<()> {
let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string().build_two()?;

acc1.update_batch(&[data(["a", "c", "b"])])?;
acc2.update_batch(&[data(["b", "c", "a"])])?;
acc1 = merge(acc1, acc2)?;

// without compaction, the size is 2652.
assert_eq!(acc1.size(), 732);

Ok(())
}
#[test]
fn does_not_over_account_memory_distinct() -> Result<()> {
let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string()
.distinct()
.build_two()?;

acc1.update_batch(&[string_list_data([
vec!["a", "b", "c"],
vec!["d", "e", "f"],
])])?;
acc2.update_batch(&[string_list_data([vec!["e", "f", "g"]])])?;
acc1 = merge(acc1, acc2)?;

// without compaction, the size is 16660
assert_eq!(acc1.size(), 1660);

Ok(())
}

#[test]
fn does_not_over_account_memory_ordered() -> Result<()> {
let mut acc = ArrayAggAccumulatorBuilder::string()
.order_by_col("col", SortOptions::new(false, false))
.build()?;

acc.update_batch(&[string_list_data([
vec!["a", "b", "c"],
vec!["c", "d", "e"],
vec!["b", "c", "d"],
])])?;

// without compaction, the size is 17112
assert_eq!(acc.size(), 2080);
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in apache/main, this value is 2112. I have no idea why we need 2080 here for the test to pass.

I don't think investing time in investigating this is worth it, so I'm just going to go ahead and merge this, but I'll keep this in mind.


Ok(())
}

struct ArrayAggAccumulatorBuilder {
return_field: FieldRef,
distinct: bool,
Expand Down Expand Up @@ -1066,6 +1130,15 @@ mod tests {
.collect()
}

fn string_list_data<'a>(data: impl IntoIterator<Item = Vec<&'a str>>) -> ArrayRef {
let mut builder = ListBuilder::new(StringBuilder::new());
for string_list in data.into_iter() {
builder.append_value(string_list.iter().map(Some).collect::<Vec<_>>());
}

Arc::new(builder.finish())
}

fn data<T, const N: usize>(list: [T; N]) -> ArrayRef
where
ScalarValue: From<T>,
Expand Down
Loading