Add support for FixedSizeList type in arrow_cast, hashing #8344

Changes from all commits: 068e736, 27e1b10, 6d9687a, 5a6ab22, 78e7d2f, b6ce1d2, 7eb3d73

```diff
@@ -27,8 +27,9 @@ use arrow::{downcast_dictionary_array, downcast_primitive_array};
 use arrow_buffer::i256;
 
 use crate::cast::{
-    as_boolean_array, as_generic_binary_array, as_large_list_array, as_list_array,
-    as_primitive_array, as_string_array, as_struct_array,
+    as_boolean_array, as_fixed_size_list_array, as_generic_binary_array,
+    as_large_list_array, as_list_array, as_primitive_array, as_string_array,
+    as_struct_array,
 };
 use crate::error::{DataFusionError, Result, _internal_err};
@@ -267,6 +268,38 @@ where
     Ok(())
 }
 
+fn hash_fixed_list_array(
+    array: &FixedSizeListArray,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) -> Result<()> {
+    let values = array.values().clone();
+    let value_len = array.value_length();
+    let offset_size = value_len as usize / array.len();
+    let nulls = array.nulls();
+    let mut values_hashes = vec![0u64; values.len()];
+    create_hashes(&[values], random_state, &mut values_hashes)?;
+    if let Some(nulls) = nulls {
+        for i in 0..array.len() {
+            if nulls.is_valid(i) {
+                let hash = &mut hashes_buffer[i];
+                for values_hash in &values_hashes[i * offset_size..(i + 1) * offset_size]
+                {
+                    *hash = combine_hashes(*hash, *values_hash);
+                }
+            }
+        }
+    } else {
+        for i in 0..array.len() {
+            let hash = &mut hashes_buffer[i];
+            for values_hash in &values_hashes[i * offset_size..(i + 1) * offset_size] {
+                *hash = combine_hashes(*hash, *values_hash);
+            }
+        }
+    }
+    Ok(())
+}
+
 /// Test version of `create_hashes` that produces the same value for
 /// all hashes (to test collisions)
 ///
@@ -366,6 +399,10 @@ pub fn create_hashes<'a>(
                 let array = as_large_list_array(array)?;
                 hash_list_array(array, random_state, hashes_buffer)?;
             }
+            DataType::FixedSizeList(_,_) => {
+                let array = as_fixed_size_list_array(array)?;
+                hash_fixed_list_array(array, random_state, hashes_buffer)?;
+            }
             _ => {
                 // This is internal because we should have caught this before.
                 return _internal_err!(
@@ -546,6 +583,30 @@ mod tests {
         assert_eq!(hashes[2], hashes[3]);
     }
 
+    #[test]
+    // Tests actual values of hashes, which are different if forcing collisions
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_fixed_size_list_arrays() {
+        let data = vec![
+            Some(vec![Some(0), Some(1), Some(2)]),
+            None,
+            Some(vec![Some(3), None, Some(5)]),
+            Some(vec![Some(3), None, Some(5)]),
+            None,
+            Some(vec![Some(0), Some(1), Some(2)]),
+        ];
+        let list_array =
+            Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
+                data, 3,
+            )) as ArrayRef;
+        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let mut hashes = vec![0; list_array.len()];
+        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
+        assert_eq!(hashes[0], hashes[5]);
+        assert_eq!(hashes[1], hashes[4]);
+        assert_eq!(hashes[2], hashes[3]);
+    }
+
     #[test]
     // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
```

Comment on lines +586 to +609 (Member, Author): a unit test for the hash function.

The second changed file is a sqllogictest file:

```diff
@@ -384,4 +384,41 @@ LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, di
 query T
 select arrow_typeof(arrow_cast(make_array([1, 2, 3]), 'LargeList(LargeList(Int64))'));
 ----
-LargeList(Field { name: "item", data_type: LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+LargeList(Field { name: "item", data_type: LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+
+## FixedSizeList
+
+query ?
+select arrow_cast(null, 'FixedSizeList(1, Int64)');
+----
+NULL
+
+#TODO: arrow-rs doesn't support it yet
```

Contributor: Shouldn't we be casting […]

Member (Author): I noticed that List supports casting from Utf8 to a single-element List. Therefore, I think FixedSizeList should also support it.
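
For reference, the existing List behaviour the author mentions would presumably be exercised by a sqllogictest query along these lines (a sketch only, not part of this diff; the expected output is an assumption, not taken from a test run):

```
# Sketch (not part of this PR): the single-element List cast the comment above
# refers to. The expected output is assumed, not verified.
query ?
select arrow_cast('1', 'List(Int64)');
----
[1]
```

The diff resumes below, with the corresponding FixedSizeList case left commented out.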

```diff
+#query ?
+#select arrow_cast('1', 'FixedSizeList(1, Int64)');
+#----
+#[1]
+
+query ?
+select arrow_cast([1], 'FixedSizeList(1, Int64)');
+----
+[1]
+
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed
+select arrow_cast(make_array(1, 2, 3), 'FixedSizeList(4, Int64)');
+
+query ?
+select arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)');
+----
+[1, 2, 3]
+
+query T
+select arrow_typeof(arrow_cast(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 'FixedSizeList(3, Int64)'));
+----
+FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3)
+
+query ?
+select arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)');
+----
+[1, 2, 3]
```

I didn't see any test coverage for this new code -- e.g. either unit tests for hashing or a higher level test like `GROUP BY <FixedListArray>`. Can you either ensure this code is tested somehow, or else perhaps move the hash support to a different PR so we can merge the `arrow_cast` support?
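
A rough sketch of the higher-level coverage being requested might look like the following sqllogictest snippet; the table name, inserted values, and expected output are assumptions for illustration rather than results taken from this PR:

```
# Hypothetical sketch (not part of this PR): GROUP BY on a FixedSizeList column,
# which would exercise the new hashing path. Table name, values, and expected
# output are assumptions.
statement ok
create table fixed_size_list_group_by as values
  (arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')),
  (arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')),
  (arrow_cast(make_array(4, 5, 6), 'FixedSizeList(3, Int64)'));

query ?I rowsort
select column1, count(*) from fixed_size_list_group_by group by column1;
----
[1, 2, 3] 2
[4, 5, 6] 1
```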