-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Improve documentation on how to build ScalarValue::Struct and add ScalarStructBuilder
#9229
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2ff57e4
34c185a
4ddf29a
12f4afa
9d8658e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,9 @@ | |
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| //! This module provides ScalarValue, an enum that can be used for storage of single elements | ||
| //! [`ScalarValue`]: stores single values | ||
|
|
||
| mod struct_builder; | ||
|
|
||
| use std::borrow::Borrow; | ||
| use std::cmp::Ordering; | ||
|
|
@@ -43,24 +45,45 @@ use arrow::{ | |
| compute::kernels::cast::{cast_with_options, CastOptions}, | ||
| datatypes::{ | ||
| i256, ArrowDictionaryKeyType, ArrowNativeType, ArrowTimestampType, DataType, | ||
| Field, Fields, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, | ||
| Field, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, | ||
| IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, | ||
| IntervalYearMonthType, SchemaBuilder, TimeUnit, TimestampMicrosecondType, | ||
| IntervalYearMonthType, TimeUnit, TimestampMicrosecondType, | ||
| TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, | ||
| UInt16Type, UInt32Type, UInt64Type, UInt8Type, DECIMAL128_MAX_PRECISION, | ||
| }, | ||
| }; | ||
| use arrow_array::cast::as_list_array; | ||
| use arrow_array::{ArrowNativeTypeOp, Scalar}; | ||
| use arrow_buffer::NullBuffer; | ||
|
|
||
| /// A dynamically typed, nullable single value, (the single-valued counter-part | ||
| /// to arrow's [`Array`]) | ||
| pub use struct_builder::ScalarStructBuilder; | ||
|
|
||
| /// A dynamically typed, nullable single value. | ||
| /// | ||
| /// While an arrow [`Array`]) stores one or more values of the same type, in a | ||
| /// single column, a `ScalarValue` stores a single value of a single type, the | ||
| /// equivalent of 1 row and one column. | ||
| /// | ||
| /// ```text | ||
| /// ┌────────┐ | ||
| /// │ value1 │ | ||
| /// │ value2 │ ┌────────┐ | ||
| /// │ value3 │ │ value2 │ | ||
| /// │ ... │ └────────┘ | ||
| /// │ valueN │ | ||
| /// └────────┘ | ||
| /// | ||
| /// Array ScalarValue | ||
| /// | ||
| /// stores multiple, stores a single, | ||
| /// possibly null, values of possible null, value | ||
| /// the same type | ||
| /// ``` | ||
| /// | ||
| /// # Performance | ||
| /// | ||
| /// In general, please use arrow [`Array`]s rather than [`ScalarValue`] whenever | ||
| /// possible, as it is far more efficient for multiple values. | ||
| /// In general, performance will be better using arrow [`Array`]s rather than | ||
| /// [`ScalarValue`], as it is far more efficient to process multiple values at | ||
| /// once (vecctorized processing). | ||
| /// | ||
| /// # Example | ||
| /// ``` | ||
|
|
@@ -99,6 +122,66 @@ use arrow_buffer::NullBuffer; | |
| /// # } | ||
| /// ``` | ||
| /// | ||
| /// # Nested Types | ||
| /// | ||
| /// `List` / `LargeList` / `FixedSizeList` / `Struct` are represented as a | ||
| /// single element array of the corresponding type. | ||
| /// | ||
| /// ## Example: Creating [`ScalarValue::Struct`] using [`ScalarStructBuilder`] | ||
| /// ``` | ||
| /// # use std::sync::Arc; | ||
| /// # use arrow::datatypes::{DataType, Field}; | ||
| /// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder}; | ||
| /// // Build a struct like: {a: 1, b: "foo"} | ||
| /// let field_a = Field::new("a", DataType::Int32, false); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The use case for the |
||
| /// let field_b = Field::new("b", DataType::Utf8, false); | ||
| /// | ||
| /// let s1 = ScalarStructBuilder::new() | ||
| /// .with_scalar(field_a, ScalarValue::from(1i32)) | ||
| /// .with_scalar(field_b, ScalarValue::from("foo")) | ||
| /// .build(); | ||
| /// ``` | ||
| /// | ||
| /// ## Example: Creating a null [`ScalarValue::Struct`] using [`ScalarStructBuilder`] | ||
| /// ``` | ||
| /// # use std::sync::Arc; | ||
| /// # use arrow::datatypes::{DataType, Field}; | ||
| /// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder}; | ||
| /// // Build a struct representing a NULL value | ||
| /// let fields = vec![ | ||
| /// Field::new("a", DataType::Int32, false), | ||
| /// Field::new("b", DataType::Utf8, false), | ||
| /// ]; | ||
| /// | ||
| /// let s1 = ScalarStructBuilder::new_null(fields); | ||
| /// ``` | ||
| /// | ||
| /// ## Example: Creating [`ScalarValue::Struct`] directly | ||
| /// ``` | ||
| /// # use std::sync::Arc; | ||
| /// # use arrow::datatypes::{DataType, Field, Fields}; | ||
| /// # use arrow_array::{ArrayRef, Int32Array, StructArray, StringArray}; | ||
| /// # use datafusion_common::ScalarValue; | ||
| /// // Build a struct like: {a: 1, b: "foo"} | ||
| /// // Field description | ||
| /// let fields = Fields::from(vec![ | ||
| /// Field::new("a", DataType::Int32, false), | ||
| /// Field::new("b", DataType::Utf8, false), | ||
| /// ]); | ||
| /// // one row arrays for each field | ||
| /// let arrays: Vec<ArrayRef> = vec![ | ||
| /// Arc::new(Int32Array::from(vec![1])), | ||
| /// Arc::new(StringArray::from(vec!["foo"])), | ||
| /// ]; | ||
| /// // no nulls for this array | ||
| /// let nulls = None; | ||
| /// let arr = StructArray::new(fields, arrays, nulls); | ||
| /// | ||
| /// // Create a ScalarValue::Struct directly | ||
| /// let s1 = ScalarValue::Struct(Arc::new(arr)); | ||
| /// ``` | ||
| /// | ||
| /// | ||
| /// # Further Reading | ||
| /// See [datatypes](https://arrow.apache.org/docs/python/api/datatypes.html) for | ||
| /// details on datatypes and the [format](https://github.com/apache/arrow/blob/master/format/Schema.fbs#L354-L375) | ||
|
|
@@ -153,7 +236,8 @@ pub enum ScalarValue { | |
| List(Arc<ListArray>), | ||
| /// The array must be a LargeListArray with length 1. | ||
| LargeList(Arc<LargeListArray>), | ||
| /// Represents a single element of a [`StructArray`] as an [`ArrayRef`] | ||
| /// Represents a single element [`StructArray`] as an [`ArrayRef`]. See | ||
| /// [`ScalarValue`] for examples of how to create instances of this type. | ||
| Struct(Arc<StructArray>), | ||
| /// Date stored as a signed 32bit int days since UNIX epoch 1970-01-01 | ||
| Date32(Option<i32>), | ||
|
|
@@ -2679,20 +2763,13 @@ impl From<Option<&str>> for ScalarValue { | |
| /// Wrapper to create ScalarValue::Struct for convenience | ||
| impl From<Vec<(&str, ScalarValue)>> for ScalarValue { | ||
| fn from(value: Vec<(&str, ScalarValue)>) -> Self { | ||
| let (fields, scalars): (SchemaBuilder, Vec<_>) = value | ||
| .into_iter() | ||
| .map(|(name, scalar)| (Field::new(name, scalar.data_type(), false), scalar)) | ||
| .unzip(); | ||
|
|
||
| let arrays = scalars | ||
| value | ||
| .into_iter() | ||
| .map(|scalar| scalar.to_array().unwrap()) | ||
| .collect::<Vec<ArrayRef>>(); | ||
|
|
||
| let fields = fields.finish().fields; | ||
| let struct_array = StructArray::try_new(fields, arrays, None).unwrap(); | ||
|
|
||
| Self::Struct(Arc::new(struct_array)) | ||
| .fold(ScalarStructBuilder::new(), |builder, (name, value)| { | ||
| builder.with_name_and_scalar(name, value) | ||
| }) | ||
| .build() | ||
| .unwrap() | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -2710,27 +2787,6 @@ impl From<String> for ScalarValue { | |
| } | ||
| } | ||
|
|
||
| // TODO: Remove this after changing to Scalar<T> | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These were added in #7893 and not yet released so this is not a breaking API change |
||
| // Wrapper for ScalarValue::Struct that checks the length of the arrays, without nulls | ||
| impl From<(Fields, Vec<ArrayRef>)> for ScalarValue { | ||
| fn from((fields, arrays): (Fields, Vec<ArrayRef>)) -> Self { | ||
| Self::from((fields, arrays, None)) | ||
| } | ||
| } | ||
|
|
||
| // TODO: Remove this after changing to Scalar<T> | ||
| // Wrapper for ScalarValue::Struct that checks the length of the arrays | ||
| impl From<(Fields, Vec<ArrayRef>, Option<NullBuffer>)> for ScalarValue { | ||
| fn from( | ||
| (fields, arrays, nulls): (Fields, Vec<ArrayRef>, Option<NullBuffer>), | ||
| ) -> Self { | ||
| for arr in arrays.iter() { | ||
| assert_eq!(arr.len(), 1); | ||
| } | ||
| Self::Struct(Arc::new(StructArray::new(fields, arrays, nulls))) | ||
| } | ||
| } | ||
|
|
||
| macro_rules! impl_try_from { | ||
| ($SCALAR:ident, $NATIVE:ident) => { | ||
| impl TryFrom<ScalarValue> for $NATIVE { | ||
|
|
@@ -3247,6 +3303,7 @@ mod tests { | |
| use arrow::datatypes::{ArrowNumericType, ArrowPrimitiveType}; | ||
| use arrow::util::pretty::pretty_format_columns; | ||
| use arrow_buffer::Buffer; | ||
| use arrow_schema::Fields; | ||
| use chrono::NaiveDate; | ||
| use rand::Rng; | ||
|
|
||
|
|
@@ -3266,31 +3323,33 @@ mod tests { | |
| ), | ||
| ]); | ||
|
|
||
| let arrays = vec![boolean as ArrayRef, int as ArrayRef]; | ||
| let fields = Fields::from(vec![ | ||
| Field::new("b", DataType::Boolean, false), | ||
| Field::new("c", DataType::Int32, false), | ||
| ]); | ||
| let sv = ScalarValue::from((fields, arrays)); | ||
| let sv = ScalarStructBuilder::new() | ||
| .with_array(Field::new("b", DataType::Boolean, false), boolean) | ||
| .with_array(Field::new("c", DataType::Int32, false), int) | ||
| .build() | ||
| .unwrap(); | ||
|
|
||
| let struct_arr = sv.to_array().unwrap(); | ||
| let actual = as_struct_array(&struct_arr).unwrap(); | ||
| assert_eq!(actual, &expected); | ||
| } | ||
|
|
||
| #[test] | ||
| #[should_panic(expected = "assertion `left == right` failed")] | ||
| #[should_panic( | ||
| expected = "Error building ScalarValue::Struct. Expected array with exactly one element, found array with 4 elements" | ||
| )] | ||
| fn test_scalar_value_from_for_struct_should_panic() { | ||
| let fields = Fields::from(vec![ | ||
| Field::new("bool", DataType::Boolean, false), | ||
| Field::new("i32", DataType::Int32, false), | ||
| ]); | ||
|
|
||
| let arrays = vec![ | ||
| Arc::new(BooleanArray::from(vec![false, true, false, false])) as ArrayRef, | ||
| Arc::new(Int32Array::from(vec![42, 28, 19, 31])), | ||
| ]; | ||
|
|
||
| let _ = ScalarValue::from((fields, arrays)); | ||
| let _ = ScalarStructBuilder::new() | ||
| .with_array( | ||
| Field::new("bool", DataType::Boolean, false), | ||
| Arc::new(BooleanArray::from(vec![false, true, false, false])), | ||
| ) | ||
| .with_array( | ||
| Field::new("i32", DataType::Int32, false), | ||
| Arc::new(Int32Array::from(vec![42, 28, 19, 31])), | ||
| ) | ||
| .build() | ||
| .unwrap(); | ||
| } | ||
|
|
||
| #[test] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be nice to have some similar examples for creating
Struct::List