From 2ff57e4f873a59805c8de57a3b72c18443cf751c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 14 Feb 2024 12:33:34 -0500 Subject: [PATCH 1/4] Improve documentation on how to build `ScalarValue::Struct` and add `ScalarStructBuilder` --- .../common/src/{scalar.rs => scalar/mod.rs} | 155 +++++++++++------- .../common/src/scalar/struct_builder.rs | 152 +++++++++++++++++ .../tests/cases/roundtrip_logical_plan.rs | 20 +-- 3 files changed, 259 insertions(+), 68 deletions(-) rename datafusion/common/src/{scalar.rs => scalar/mod.rs} (98%) create mode 100644 datafusion/common/src/scalar/struct_builder.rs diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar/mod.rs similarity index 98% rename from datafusion/common/src/scalar.rs rename to datafusion/common/src/scalar/mod.rs index 2395f8acc4d2..5eee78e58e2a 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! This module provides ScalarValue, an enum that can be used for storage of single elements +//! [`ScalarValue`]: stores single constant values + +mod struct_builder; use std::borrow::Borrow; use std::cmp::Ordering; @@ -43,19 +45,20 @@ use arrow::{ compute::kernels::cast::{cast_with_options, CastOptions}, datatypes::{ i256, ArrowDictionaryKeyType, ArrowNativeType, ArrowTimestampType, DataType, - Field, Fields, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, + Field, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, - IntervalYearMonthType, SchemaBuilder, TimeUnit, TimestampMicrosecondType, + IntervalYearMonthType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, DECIMAL128_MAX_PRECISION, }, }; use arrow_array::cast::as_list_array; use arrow_array::{ArrowNativeTypeOp, Scalar}; -use arrow_buffer::NullBuffer; + +pub use struct_builder::ScalarStructBuilder; /// A dynamically typed, nullable single value, (the single-valued counter-part -/// to arrow's [`Array`]) +/// to arrow [`Array`]) /// /// # Performance /// @@ -99,6 +102,66 @@ use arrow_buffer::NullBuffer; /// # } /// ``` /// +/// # Nested Types +/// +/// `List` / `LargeList` / `FixedSizeList` / `Struct` are represented as a +/// single element array of the corresponding type. +/// +/// ## Example: Creating [`ScalarValue::Struct`] using [`ScalarStructBuilder`] +/// ``` +/// # use std::sync::Arc; +/// # use arrow::datatypes::{DataType, Field}; +/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder}; +/// // Build a struct like: {a: 1, b: "foo"} +/// let field_a = Field::new("a", DataType::Int32, false); +/// let field_b = Field::new("b", DataType::Utf8, false); +/// +/// let s1 = ScalarStructBuilder::new() +/// .with_scalar(field_a, ScalarValue::from(1i32)) +/// .with_scalar(field_b, ScalarValue::from("foo")) +/// .build(); +/// ``` +/// +/// ## Example: Creating a null [`ScalarValue::Struct`] using [`ScalarStructBuilder`] +/// ``` +/// # use std::sync::Arc; +/// # use arrow::datatypes::{DataType, Field}; +/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder}; +/// // Build a struct representing a NULL value +/// let fields = vec![ +/// Field::new("a", DataType::Int32, false), +/// Field::new("b", DataType::Utf8, false), +/// ]; +/// +/// let s1 = ScalarStructBuilder::new_null(fields); +/// ``` +/// +/// ## Example: Creating [`ScalarValue::Struct`] directly +/// ``` +/// # use std::sync::Arc; +/// # use arrow::datatypes::{DataType, Field, Fields}; +/// # use arrow_array::{ArrayRef, Int32Array, StructArray, StringArray}; +/// # use datafusion_common::ScalarValue; +/// // Build a struct like: {a: 1, b: "foo"} +/// // Field description +/// let fields = Fields::from(vec![ +/// Field::new("a", DataType::Int32, false), +/// Field::new("b", DataType::Utf8, false), +/// ]); +/// // one row arrays for each field +/// let arrays: Vec = vec![ +/// Arc::new(Int32Array::from(vec![1])), +/// Arc::new(StringArray::from(vec!["foo"])), +/// ]; +/// // no nulls for this array +/// let nulls = None; +/// let arr = StructArray::new(fields, arrays, nulls); +/// +/// // Create a ScalarValue::Struct directly +/// let s1 = ScalarValue::Struct(Arc::new(arr)); +/// ``` +/// +/// /// # Further Reading /// See [datatypes](https://arrow.apache.org/docs/python/api/datatypes.html) for /// details on datatypes and the [format](https://github.com/apache/arrow/blob/master/format/Schema.fbs#L354-L375) @@ -153,7 +216,8 @@ pub enum ScalarValue { List(Arc), /// The array must be a LargeListArray with length 1. LargeList(Arc), - /// Represents a single element of a [`StructArray`] as an [`ArrayRef`] + /// Represents a single element [`StructArray`] as an [`ArrayRef`]. See + /// [`ScalarValue`] for examples of how to create instances of this type. Struct(Arc), /// Date stored as a signed 32bit int days since UNIX epoch 1970-01-01 Date32(Option), @@ -2679,20 +2743,13 @@ impl From> for ScalarValue { /// Wrapper to create ScalarValue::Struct for convenience impl From> for ScalarValue { fn from(value: Vec<(&str, ScalarValue)>) -> Self { - let (fields, scalars): (SchemaBuilder, Vec<_>) = value + value .into_iter() - .map(|(name, scalar)| (Field::new(name, scalar.data_type(), false), scalar)) - .unzip(); - - let arrays = scalars - .into_iter() - .map(|scalar| scalar.to_array().unwrap()) - .collect::>(); - - let fields = fields.finish().fields; - let struct_array = StructArray::try_new(fields, arrays, None).unwrap(); - - Self::Struct(Arc::new(struct_array)) + .fold(ScalarStructBuilder::new(), |builder, (name, value)| { + builder.with_name_and_scalar(name, value) + }) + .build() + .unwrap() } } @@ -2710,27 +2767,6 @@ impl From for ScalarValue { } } -// TODO: Remove this after changing to Scalar -// Wrapper for ScalarValue::Struct that checks the length of the arrays, without nulls -impl From<(Fields, Vec)> for ScalarValue { - fn from((fields, arrays): (Fields, Vec)) -> Self { - Self::from((fields, arrays, None)) - } -} - -// TODO: Remove this after changing to Scalar -// Wrapper for ScalarValue::Struct that checks the length of the arrays -impl From<(Fields, Vec, Option)> for ScalarValue { - fn from( - (fields, arrays, nulls): (Fields, Vec, Option), - ) -> Self { - for arr in arrays.iter() { - assert_eq!(arr.len(), 1); - } - Self::Struct(Arc::new(StructArray::new(fields, arrays, nulls))) - } -} - macro_rules! impl_try_from { ($SCALAR:ident, $NATIVE:ident) => { impl TryFrom for $NATIVE { @@ -3247,6 +3283,7 @@ mod tests { use arrow::datatypes::{ArrowNumericType, ArrowPrimitiveType}; use arrow::util::pretty::pretty_format_columns; use arrow_buffer::Buffer; + use arrow_schema::Fields; use chrono::NaiveDate; use rand::Rng; @@ -3266,31 +3303,33 @@ mod tests { ), ]); - let arrays = vec![boolean as ArrayRef, int as ArrayRef]; - let fields = Fields::from(vec![ - Field::new("b", DataType::Boolean, false), - Field::new("c", DataType::Int32, false), - ]); - let sv = ScalarValue::from((fields, arrays)); + let sv = ScalarStructBuilder::new() + .with_array(Field::new("b", DataType::Boolean, false), boolean) + .with_array(Field::new("c", DataType::Int32, false), int) + .build() + .unwrap(); + let struct_arr = sv.to_array().unwrap(); let actual = as_struct_array(&struct_arr).unwrap(); assert_eq!(actual, &expected); } #[test] - #[should_panic(expected = "assertion `left == right` failed")] + #[should_panic( + expected = "Error building SclarValue::Struct. Expected array with exactly one element, found array with 4 elements" + )] fn test_scalar_value_from_for_struct_should_panic() { - let fields = Fields::from(vec![ - Field::new("bool", DataType::Boolean, false), - Field::new("i32", DataType::Int32, false), - ]); - - let arrays = vec![ - Arc::new(BooleanArray::from(vec![false, true, false, false])) as ArrayRef, - Arc::new(Int32Array::from(vec![42, 28, 19, 31])), - ]; - - let _ = ScalarValue::from((fields, arrays)); + let _ = ScalarStructBuilder::new() + .with_array( + Field::new("bool", DataType::Boolean, false), + Arc::new(BooleanArray::from(vec![false, true, false, false])), + ) + .with_array( + Field::new("i32", DataType::Int32, false), + Arc::new(Int32Array::from(vec![42, 28, 19, 31])), + ) + .build() + .unwrap(); } #[test] diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs new file mode 100644 index 000000000000..d40f0c8b3b4f --- /dev/null +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`ScalarStructBuilder`] for building [`ScalarValue::Struct`] + +use crate::error::_internal_err; +use crate::{DataFusionError, Result, ScalarValue}; +use arrow::array::{ArrayRef, StructArray}; +use arrow::datatypes::{DataType, FieldRef, Fields}; +use arrow_schema::Field; +use std::sync::Arc; + +/// Builder for [`ScalarValue::Struct`]. +/// +/// See examples on [`ScalarValue`] +#[derive(Debug, Default)] +pub struct ScalarStructBuilder { + fields: Vec, + arrays: Vec, +} + +impl ScalarStructBuilder { + /// Create a new `ScalarStructBuilder` + pub fn new() -> Self { + Self::default() + } + + /// Return a new [`ScalarValue::Struct`] with the specified fields and a + /// single null value + pub fn new_null(fields: impl IntoFields) -> ScalarValue { + DataType::Struct(fields.into()).try_into().unwrap() + } + + /// Add the specified field and [`ArrayRef`] to the struct. + /// + /// Note the array should have a single row. + pub fn with_array(mut self, field: impl IntoFieldRef, value: ArrayRef) -> Self { + self.fields.push(field.into_field_ref()); + self.arrays.push(value); + self + } + + /// Add the specified field and `ScalarValue` to the struct. + pub fn with_scalar(self, field: impl IntoFieldRef, value: ScalarValue) -> Self { + // valid scalar value should not fail + let array = value.to_array().unwrap(); + self.with_array(field, array) + } + + /// Add a field with the specified name and value to the struct. + /// the field is created with the specified data type and as non nullable + pub fn with_name_and_scalar(self, name: &str, value: ScalarValue) -> Self { + let field = Field::new(name, value.data_type(), false); + self.with_scalar(field, value) + } + + /// Return a [`ScalarValue::Struct`] with the fields and values added so far + /// + /// # Errors + /// + /// If the [`StructArray`] cannot be created (for example if there is a + /// mismatch between field types and arrays) or the arrays do not have + /// exactly one element. + pub fn build(self) -> Result { + let Self { fields, arrays } = self; + + for array in &arrays { + if array.len() != 1 { + return _internal_err!( + "Error building SclarValue::Struct. \ + Expected array with exactly one element, found array with {} elements", + array.len() + ); + } + } + + let struct_array = StructArray::try_new(Fields::from(fields), arrays, None)?; + Ok(ScalarValue::Struct(Arc::new(struct_array))) + } +} + +/// Trait for converting a type into a [`FieldRef`] +/// +/// Used to avoid having to call `clone()` on a `FieldRef` when adding a field to +/// a `ScalarStructBuilder`. +/// +/// TODO potentially upstream this to arrow-rs so that we can +/// use impl `Into` instead +pub trait IntoFieldRef { + fn into_field_ref(self) -> FieldRef; +} + +impl IntoFieldRef for FieldRef { + fn into_field_ref(self) -> FieldRef { + self + } +} + +impl IntoFieldRef for &FieldRef { + fn into_field_ref(self) -> FieldRef { + self.clone() + } +} + +impl IntoFieldRef for Field { + fn into_field_ref(self) -> FieldRef { + FieldRef::new(self) + } +} + +/// Trait for converting a type into a [`Fields`] +/// +/// This avoids to avoid having to call clone() on an Arc'd `Fields` when adding +/// a field to a `ScalarStructBuilder` +/// +/// TODO potentially upstream this to arrow-rs so that we can +/// use impl `Into` instead +pub trait IntoFields { + fn into(self) -> Fields; +} + +impl IntoFields for Fields { + fn into(self) -> Fields { + self + } +} + +impl IntoFields for &Fields { + fn into(self) -> Fields { + self.clone() + } +} + +impl IntoFields for Vec { + fn into(self) -> Fields { + Fields::from(self) + } +} diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index b6d288da2c3e..68a318b5a6d5 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -21,7 +21,6 @@ use std::fmt::{self, Debug, Formatter}; use std::sync::Arc; use arrow::array::{ArrayRef, FixedSizeListArray}; -use arrow::array::{BooleanArray, Int32Array}; use arrow::csv::WriterBuilder; use arrow::datatypes::{ DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType, @@ -42,6 +41,7 @@ use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; use datafusion_common::file_options::StatementOptions; use datafusion_common::parsers::CompressionTypeVariant; +use datafusion_common::scalar::ScalarStructBuilder; use datafusion_common::{internal_err, not_impl_err, plan_err, FileTypeWriterOptions}; use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError, ScalarValue}; use datafusion_common::{FileType, Result}; @@ -932,17 +932,17 @@ fn round_trip_scalar_values() { ScalarValue::Binary(None), ScalarValue::LargeBinary(Some(b"bar".to_vec())), ScalarValue::LargeBinary(None), - ScalarValue::from(( - vec![ + ScalarStructBuilder::new() + .with_scalar( Field::new("a", DataType::Int32, true), + ScalarValue::from(23i32), + ) + .with_scalar( Field::new("b", DataType::Boolean, false), - ] - .into(), - vec![ - Arc::new(Int32Array::from(vec![Some(23)])) as ArrayRef, - Arc::new(BooleanArray::from(vec![Some(false)])) as ArrayRef, - ], - )), + ScalarValue::from(false), + ) + .build() + .unwrap(), ScalarValue::try_from(&DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Boolean, false), From 34c185aa168528dcce6e01aa6701b2ea81f58559 Mon Sep 17 00:00:00 2001 From: comphead Date: Wed, 14 Feb 2024 14:32:55 -0800 Subject: [PATCH 2/4] Update datafusion/common/src/scalar/struct_builder.rs --- datafusion/common/src/scalar/struct_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs index d40f0c8b3b4f..926e10041751 100644 --- a/datafusion/common/src/scalar/struct_builder.rs +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -81,7 +81,7 @@ impl ScalarStructBuilder { for array in &arrays { if array.len() != 1 { return _internal_err!( - "Error building SclarValue::Struct. \ + "Error building ScalarValue::Struct. \ Expected array with exactly one element, found array with {} elements", array.len() ); From 4ddf29a5b07bc5ddeaac99bc45ad55e627379617 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 14 Feb 2024 17:38:06 -0500 Subject: [PATCH 3/4] Improved docs --- datafusion/common/src/scalar/mod.rs | 30 ++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 5eee78e58e2a..23b4b2cb046e 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! [`ScalarValue`]: stores single constant values +//! [`ScalarValue`]: stores single values mod struct_builder; @@ -57,13 +57,33 @@ use arrow_array::{ArrowNativeTypeOp, Scalar}; pub use struct_builder::ScalarStructBuilder; -/// A dynamically typed, nullable single value, (the single-valued counter-part -/// to arrow [`Array`]) +/// A dynamically typed, nullable single value. +/// +/// While an arrow [`Array`]) stores one or more values of the same type, in a +/// single column, a `ScalarValue` stores a single value of a single type, the +/// equivalent of 1 row and one column. +/// +/// ```text +/// ┌────────┐ +/// │ value1 │ +/// │ value2 │ ┌────────┐ +/// │ value3 │ │ value2 │ +/// │ ... │ └────────┘ +/// │ valueN │ +/// └────────┘ +/// +/// Array ScalarValue +/// +/// stores multiple, stores a single, +/// possibly null, values of possible null, value +/// the same type +/// ``` /// /// # Performance /// -/// In general, please use arrow [`Array`]s rather than [`ScalarValue`] whenever -/// possible, as it is far more efficient for multiple values. +/// In general, performance will be better using arrow [`Array`]s rather than +/// [`ScalarValue`], as it is far more efficient to process multiple values at +/// once (vecctorized processing). /// /// # Example /// ``` From 9d8658eddf615edbc714e6f6466b27b7971cabbe Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 14 Feb 2024 17:38:46 -0500 Subject: [PATCH 4/4] update test --- datafusion/common/src/scalar/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 23b4b2cb046e..29107ab10e7e 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -3336,7 +3336,7 @@ mod tests { #[test] #[should_panic( - expected = "Error building SclarValue::Struct. Expected array with exactly one element, found array with 4 elements" + expected = "Error building ScalarValue::Struct. Expected array with exactly one element, found array with 4 elements" )] fn test_scalar_value_from_for_struct_should_panic() { let _ = ScalarStructBuilder::new()