Skip to content

Incorrect values for is_null and is_not_null on UnionArray #6017

@alamb

Description

@alamb

Describe the bug
The is_null and is_not_null kernels are incorrect for UnionArrays

I believe the core problem is that UnionArray does not implement Array::logical_nulls

impl Array for UnionArray {
fn as_any(&self) -> &dyn Any {
self
}
fn to_data(&self) -> ArrayData {
self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
fn data_type(&self) -> &DataType {
&self.data_type
}
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
fn len(&self) -> usize {
self.type_ids.len()
}
fn is_empty(&self) -> bool {
self.type_ids.is_empty()
}
fn offset(&self) -> usize {
0
}
fn nulls(&self) -> Option<&NullBuffer> {
None
}
/// Union types always return non null as there is no validity buffer.
/// To check validity correctly you must check the underlying vector.
fn is_null(&self, _index: usize) -> bool {
false
}
/// Union types always return non null as there is no validity buffer.
/// To check validity correctly you must check the underlying vector.
fn is_valid(&self, _index: usize) -> bool {
true
}
/// Union types always return 0 null count as there is no validity buffer.
/// To get null count correctly you must check the underlying vector.
fn null_count(&self) -> usize {
0
}
fn get_buffer_memory_size(&self) -> usize {
let mut sum = self.type_ids.inner().capacity();
if let Some(o) = self.offsets.as_ref() {
sum += o.inner().capacity()
}
self.fields
.iter()
.flat_map(|x| x.as_ref().map(|x| x.get_buffer_memory_size()))
.sum::<usize>()
+ sum
}
fn get_array_memory_size(&self) -> usize {
let mut sum = self.type_ids.inner().capacity();
if let Some(o) = self.offsets.as_ref() {
sum += o.inner().capacity()
}
std::mem::size_of::<Self>()
+ self
.fields
.iter()
.flat_map(|x| x.as_ref().map(|x| x.get_array_memory_size()))
.sum::<usize>()
+ sum
}
}

And instead falls back to the default implementation (which calls Array::nulls):

fn logical_nulls(&self) -> Option<NullBuffer> {
self.nulls().cloned()
}

To Reproduce
Run these tests in is_null.rs:

    #[test]
    fn test_null_array_is_not_null() {
        let a = NullArray::new(3);

        let res = is_not_null(&a).unwrap();

        let expected = BooleanArray::from(vec![false, false, false]);

        assert_eq!(expected, res);
        assert!(res.nulls().is_none());
    }

    #[test]
    fn test_dense_union_is_null() {
        // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
        let int_array = Int32Array::from(vec![Some(1), None]);
        let float_array = Float64Array::from(vec![Some(3.2), None]);
        let str_array = StringArray::from(vec![Some("a"), None]);
        let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
        let offsets = [0, 1, 0, 1, 0, 1]
            .into_iter()
            .collect::<ScalarBuffer<i32>>();

        let children = vec![
            Arc::new(int_array) as Arc<dyn Array>,
            Arc::new(float_array),
            Arc::new(str_array),
        ];

        let array =
            UnionArray::try_new(union_fields(), type_ids, Some(offsets), children)
                .unwrap();

        let result = is_null(&array).unwrap();

        let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
        assert_eq!(expected, &result);
    }

    #[test]
    fn test_sparse_union_is_null() {
        // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
        let int_array = Int32Array::from(vec![Some(1), None, None, None, None, None]);
        let float_array =
            Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
        let str_array = StringArray::from(vec![None, None, None, None, Some("a"), None]);
        let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();

        let children = vec![
            Arc::new(int_array) as Arc<dyn Array>,
            Arc::new(float_array),
            Arc::new(str_array),
        ];

        let array =
            UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();

        let result = is_null(&array).unwrap();

        let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
        assert_eq!(expected, &result);
    }

    fn union_fields() -> UnionFields {
        [
            (0, Arc::new(Field::new("A", DataType::Int32, true))),
            (1, Arc::new(Field::new("B", DataType::Float64, true))),
            (2, Arc::new(Field::new("C", DataType::Utf8, true))),
        ]
            .into_iter()
            .collect()
    }
Full diff

diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs
index ea8e12abbe2..0dd74a2d0b6 100644
--- a/arrow-arith/src/boolean.rs
+++ b/arrow-arith/src/boolean.rs
@@ -354,6 +354,8 @@ pub fn is_not_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
 mod tests {
     use super::*;
     use std::sync::Arc;
+    use arrow_buffer::ScalarBuffer;
+    use arrow_schema::{DataType, Field, UnionFields};

     #[test]
     fn test_bool_array_and() {
@@ -911,4 +913,65 @@ mod tests {
         assert_eq!(expected, res);
         assert!(res.nulls().is_none());
     }
+
+    #[test]
+    fn test_dense_union_is_null() {
+        // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+        let int_array = Int32Array::from(vec![Some(1), None]);
+        let float_array = Float64Array::from(vec![Some(3.2), None]);
+        let str_array = StringArray::from(vec![Some("a"), None]);
+        let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
+        let offsets = [0, 1, 0, 1, 0, 1]
+            .into_iter()
+            .collect::<ScalarBuffer<i32>>();
+
+        let children = vec![
+            Arc::new(int_array) as Arc<dyn Array>,
+            Arc::new(float_array),
+            Arc::new(str_array),
+        ];
+
+        let array =
+            UnionArray::try_new(union_fields(), type_ids, Some(offsets), children)
+                .unwrap();
+
+        let result = is_null(&array).unwrap();
+
+        let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
+        assert_eq!(expected, &result);
+    }
+
+    #[test]
+    fn test_sparse_union_is_null() {
+        // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+        let int_array = Int32Array::from(vec![Some(1), None, None, None, None, None]);
+        let float_array =
+            Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
+        let str_array = StringArray::from(vec![None, None, None, None, Some("a"), None]);
+        let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
+
+        let children = vec![
+            Arc::new(int_array) as Arc<dyn Array>,
+            Arc::new(float_array),
+            Arc::new(str_array),
+        ];
+
+        let array =
+            UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();
+
+        let result = is_null(&array).unwrap();
+
+        let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
+        assert_eq!(expected, &result);
+    }
+
+    fn union_fields() -> UnionFields {
+        [
+            (0, Arc::new(Field::new("A", DataType::Int32, true))),
+            (1, Arc::new(Field::new("B", DataType::Float64, true))),
+            (2, Arc::new(Field::new("C", DataType::Utf8, true))),
+        ]
+            .into_iter()
+            .collect()
+    }
 }

Expected behavior
The tests should pass

Instead they currently error as is_null always returns false and is_not_null always returns true:

assertion `left == right` failed
  left: BooleanArray
[
  false,
  true,
  false,
  true,
  false,
  true,
]
 right: BooleanArray
[
  false,
  false,
  false,
  false,
  false,
  false,
]

Additional context
This was found by @samuelcolvin on apache/datafusion#11162

The reproducers are from his PR on apache/datafusion#11321

Metadata

Metadata

Assignees

No one assigned

    Labels

    arrowChanges to the arrow cratebug

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions