-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Description
Describe the bug
The is_null and is_not_null kernels are incorrect for UnionArrays
I believe the core problem is that UnionArray does not implement Array::logical_nulls
arrow-rs/arrow-array/src/array/union_array.rs
Lines 445 to 525 in b9562b9
| impl Array for UnionArray { | |
| fn as_any(&self) -> &dyn Any { | |
| self | |
| } | |
| fn to_data(&self) -> ArrayData { | |
| self.clone().into() | |
| } | |
| fn into_data(self) -> ArrayData { | |
| self.into() | |
| } | |
| fn data_type(&self) -> &DataType { | |
| &self.data_type | |
| } | |
| fn slice(&self, offset: usize, length: usize) -> ArrayRef { | |
| Arc::new(self.slice(offset, length)) | |
| } | |
| fn len(&self) -> usize { | |
| self.type_ids.len() | |
| } | |
| fn is_empty(&self) -> bool { | |
| self.type_ids.is_empty() | |
| } | |
| fn offset(&self) -> usize { | |
| 0 | |
| } | |
| fn nulls(&self) -> Option<&NullBuffer> { | |
| None | |
| } | |
| /// Union types always return non null as there is no validity buffer. | |
| /// To check validity correctly you must check the underlying vector. | |
| fn is_null(&self, _index: usize) -> bool { | |
| false | |
| } | |
| /// Union types always return non null as there is no validity buffer. | |
| /// To check validity correctly you must check the underlying vector. | |
| fn is_valid(&self, _index: usize) -> bool { | |
| true | |
| } | |
| /// Union types always return 0 null count as there is no validity buffer. | |
| /// To get null count correctly you must check the underlying vector. | |
| fn null_count(&self) -> usize { | |
| 0 | |
| } | |
| fn get_buffer_memory_size(&self) -> usize { | |
| let mut sum = self.type_ids.inner().capacity(); | |
| if let Some(o) = self.offsets.as_ref() { | |
| sum += o.inner().capacity() | |
| } | |
| self.fields | |
| .iter() | |
| .flat_map(|x| x.as_ref().map(|x| x.get_buffer_memory_size())) | |
| .sum::<usize>() | |
| + sum | |
| } | |
| fn get_array_memory_size(&self) -> usize { | |
| let mut sum = self.type_ids.inner().capacity(); | |
| if let Some(o) = self.offsets.as_ref() { | |
| sum += o.inner().capacity() | |
| } | |
| std::mem::size_of::<Self>() | |
| + self | |
| .fields | |
| .iter() | |
| .flat_map(|x| x.as_ref().map(|x| x.get_array_memory_size())) | |
| .sum::<usize>() | |
| + sum | |
| } | |
| } |
And instead falls back to the default implementation (which calls Array::nulls):
arrow-rs/arrow-array/src/array/mod.rs
Lines 212 to 214 in b9562b9
| fn logical_nulls(&self) -> Option<NullBuffer> { | |
| self.nulls().cloned() | |
| } |
To Reproduce
Run these tests in is_null.rs:
#[test]
fn test_null_array_is_not_null() {
let a = NullArray::new(3);
let res = is_not_null(&a).unwrap();
let expected = BooleanArray::from(vec![false, false, false]);
assert_eq!(expected, res);
assert!(res.nulls().is_none());
}
#[test]
fn test_dense_union_is_null() {
// union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
let int_array = Int32Array::from(vec![Some(1), None]);
let float_array = Float64Array::from(vec![Some(3.2), None]);
let str_array = StringArray::from(vec![Some("a"), None]);
let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
let offsets = [0, 1, 0, 1, 0, 1]
.into_iter()
.collect::<ScalarBuffer<i32>>();
let children = vec![
Arc::new(int_array) as Arc<dyn Array>,
Arc::new(float_array),
Arc::new(str_array),
];
let array =
UnionArray::try_new(union_fields(), type_ids, Some(offsets), children)
.unwrap();
let result = is_null(&array).unwrap();
let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
assert_eq!(expected, &result);
}
#[test]
fn test_sparse_union_is_null() {
// union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
let int_array = Int32Array::from(vec![Some(1), None, None, None, None, None]);
let float_array =
Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
let str_array = StringArray::from(vec![None, None, None, None, Some("a"), None]);
let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
let children = vec![
Arc::new(int_array) as Arc<dyn Array>,
Arc::new(float_array),
Arc::new(str_array),
];
let array =
UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();
let result = is_null(&array).unwrap();
let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
assert_eq!(expected, &result);
}
fn union_fields() -> UnionFields {
[
(0, Arc::new(Field::new("A", DataType::Int32, true))),
(1, Arc::new(Field::new("B", DataType::Float64, true))),
(2, Arc::new(Field::new("C", DataType::Utf8, true))),
]
.into_iter()
.collect()
}Full diff
diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs
index ea8e12abbe2..0dd74a2d0b6 100644
--- a/arrow-arith/src/boolean.rs
+++ b/arrow-arith/src/boolean.rs
@@ -354,6 +354,8 @@ pub fn is_not_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
mod tests {
use super::*;
use std::sync::Arc;
+ use arrow_buffer::ScalarBuffer;
+ use arrow_schema::{DataType, Field, UnionFields};
#[test]
fn test_bool_array_and() {
@@ -911,4 +913,65 @@ mod tests {
assert_eq!(expected, res);
assert!(res.nulls().is_none());
}
+
+ #[test]
+ fn test_dense_union_is_null() {
+ // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+ let int_array = Int32Array::from(vec![Some(1), None]);
+ let float_array = Float64Array::from(vec![Some(3.2), None]);
+ let str_array = StringArray::from(vec![Some("a"), None]);
+ let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
+ let offsets = [0, 1, 0, 1, 0, 1]
+ .into_iter()
+ .collect::<ScalarBuffer<i32>>();
+
+ let children = vec![
+ Arc::new(int_array) as Arc<dyn Array>,
+ Arc::new(float_array),
+ Arc::new(str_array),
+ ];
+
+ let array =
+ UnionArray::try_new(union_fields(), type_ids, Some(offsets), children)
+ .unwrap();
+
+ let result = is_null(&array).unwrap();
+
+ let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
+ assert_eq!(expected, &result);
+ }
+
+ #[test]
+ fn test_sparse_union_is_null() {
+ // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+ let int_array = Int32Array::from(vec![Some(1), None, None, None, None, None]);
+ let float_array =
+ Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
+ let str_array = StringArray::from(vec![None, None, None, None, Some("a"), None]);
+ let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
+
+ let children = vec![
+ Arc::new(int_array) as Arc<dyn Array>,
+ Arc::new(float_array),
+ Arc::new(str_array),
+ ];
+
+ let array =
+ UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();
+
+ let result = is_null(&array).unwrap();
+
+ let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
+ assert_eq!(expected, &result);
+ }
+
+ fn union_fields() -> UnionFields {
+ [
+ (0, Arc::new(Field::new("A", DataType::Int32, true))),
+ (1, Arc::new(Field::new("B", DataType::Float64, true))),
+ (2, Arc::new(Field::new("C", DataType::Utf8, true))),
+ ]
+ .into_iter()
+ .collect()
+ }
}Expected behavior
The tests should pass
Instead they currently error as is_null always returns false and is_not_null always returns true:
assertion `left == right` failed
left: BooleanArray
[
false,
true,
false,
true,
false,
true,
]
right: BooleanArray
[
false,
false,
false,
false,
false,
false,
]
Additional context
This was found by @samuelcolvin on apache/datafusion#11162
The reproducers are from his PR on apache/datafusion#11321