-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Implement array_union
#7897
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement array_union
#7897
Changes from 6 commits
282c10b
7315fcf
6420f40
4443883
2f1aa1f
648ae5d
cecbedd
5bee5bb
5bbc444
00d6f3f
4e65f76
3f8405e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,8 +24,10 @@ use arrow::array::*; | |
| use arrow::buffer::OffsetBuffer; | ||
| use arrow::compute; | ||
| use arrow::datatypes::{DataType, Field, UInt64Type}; | ||
| use arrow::row::{RowConverter, SortField, Row}; | ||
| use arrow_buffer::NullBuffer; | ||
|
|
||
| use arrow_schema::FieldRef; | ||
| use datafusion_common::cast::{ | ||
| as_generic_string_array, as_int64_array, as_list_array, as_string_array, | ||
| }; | ||
|
|
@@ -36,6 +38,7 @@ use datafusion_common::{ | |
| }; | ||
|
|
||
| use itertools::Itertools; | ||
| use std::collections::HashSet; | ||
|
|
||
| macro_rules! downcast_arg { | ||
| ($ARG:expr, $ARRAY_TYPE:ident) => {{ | ||
|
|
@@ -1358,6 +1361,89 @@ macro_rules! to_string { | |
| }}; | ||
| } | ||
|
|
||
| fn union_generic_lists<OffsetSize: OffsetSizeTrait>( | ||
| l: &GenericListArray<OffsetSize>, | ||
| r: &GenericListArray<OffsetSize>, | ||
| field: &FieldRef | ||
| ) -> Result<GenericListArray<OffsetSize>> { | ||
| let converter = | ||
| RowConverter::new(vec![SortField::new(l.value_type().clone())])?; | ||
|
|
||
| let nulls = NullBuffer::union(l.nulls(), r.nulls()); | ||
| let l_values = l.values().clone(); | ||
| let r_values = r.values().clone(); | ||
| let l_values = converter.convert_columns(&[l_values])?; | ||
| let r_values = converter.convert_columns(&[r_values])?; | ||
|
|
||
| // Might be worth adding an upstream OffsetBufferBuilder | ||
| let mut offsets = Vec::<OffsetSize>::with_capacity(l.len() + 1); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a really neat implementation @edmondop. |
||
| offsets.push(OffsetSize::usize_as(0)); | ||
| let mut rows = Vec::with_capacity(l_values.num_rows() + r_values.num_rows()); | ||
| let mut dedup = HashSet::new(); | ||
| // Needed to preserve ordering | ||
| let mut row_elements:Vec<Row<'_>> = vec![]; | ||
| for (l_w, r_w) in l.offsets().windows(2).zip(r.offsets().windows(2)) { | ||
| let l_slice = l_w[0].as_usize()..l_w[1].as_usize(); | ||
| let r_slice = r_w[0].as_usize()..r_w[1].as_usize(); | ||
| for i in l_slice { | ||
| let left_row = l_values.row(i); | ||
| if dedup.insert(left_row) { | ||
| row_elements.push(left_row); | ||
edmondop marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
| for i in r_slice { | ||
| let right_row=r_values.row(i); | ||
| if dedup.insert(right_row){ | ||
| row_elements.push(right_row); | ||
| } | ||
| } | ||
|
|
||
| rows.extend(row_elements.iter()); | ||
| offsets.push(OffsetSize::usize_as(rows.len())); | ||
| dedup.clear(); | ||
| row_elements.clear(); | ||
edmondop marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| let values = converter.convert_rows(rows)?; | ||
| let offsets = OffsetBuffer::new(offsets.into()); | ||
| let result = values[0].clone(); | ||
| Ok(GenericListArray::<OffsetSize>::new( | ||
| field.clone(), offsets, result, nulls, | ||
| )) | ||
| } | ||
|
|
||
| /// Array_union SQL function | ||
| pub fn array_union(args: &[ArrayRef]) -> Result<ArrayRef> { | ||
edmondop marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if args.len() != 2 { | ||
| return exec_err!("array_union needs two arguments"); | ||
| } | ||
| let array1 = &args[0]; | ||
| let array2 = &args[1]; | ||
| match (array1.data_type(), array2.data_type()) { | ||
| (DataType::Null, _) => Ok(array2.clone()), | ||
| (_, DataType::Null) => Ok(array1.clone()), | ||
| (DataType::List(field_ref), DataType::List(_)) => { | ||
| check_datatypes("array_union", &[&array1, &array2])?; | ||
| let list1 = array1.as_list::<i32>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I prefer datafusion::common::cast as_list_array and as_large_list_array
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you explain ?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. datafusion::common::cast and arrow::array::cast are doing the similar thing. The difference is that common::cast return datafusion error, while arrow::array::cast return Arror error. To me, return datafusion error in datafusion project make much more senses for me. Also, mixing these two casting is quite messy, we can have arrow casting inside common::cast and call common::cast for other crate.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just found that arrow::array::cast does not return Result<>
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. never mind, I don't have strong opinion on which to use yet.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why don't we directly use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For the offset i32 branch you mean @Weijun-H ? I like the fact that the code has the same structure for LargeList and List
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but I think |
||
| let list2 = array2.as_list::<i32>(); | ||
| let result = union_generic_lists::<i32>(list1, list2, field_ref)?; | ||
| Ok(Arc::new(result)) | ||
| } | ||
| (DataType::LargeList(field_ref), DataType::LargeList(_)) => { | ||
| check_datatypes("array_union", &[&array1, &array2])?; | ||
| let list1 = array1.as_list::<i64>(); | ||
| let list2 = array2.as_list::<i64>(); | ||
| let result = union_generic_lists::<i64>(list1, list2, field_ref)?; | ||
| Ok(Arc::new(result)) | ||
| } | ||
| _ => { | ||
| return internal_err!( | ||
| "array_union only support list with offsets of type int32 and int64" | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Array_to_string SQL function | ||
| pub fn array_to_string(args: &[ArrayRef]) -> Result<ArrayRef> { | ||
| let arr = &args[0]; | ||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.