Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 120 additions & 61 deletions datafusion/common/src/scalar.rs → datafusion/common/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
// specific language governing permissions and limitations
// under the License.

//! This module provides ScalarValue, an enum that can be used for storage of single elements
//! [`ScalarValue`]: stores single values

mod struct_builder;

use std::borrow::Borrow;
use std::cmp::Ordering;
Expand Down Expand Up @@ -43,24 +45,45 @@ use arrow::{
compute::kernels::cast::{cast_with_options, CastOptions},
datatypes::{
i256, ArrowDictionaryKeyType, ArrowNativeType, ArrowTimestampType, DataType,
Field, Fields, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type,
Field, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type,
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit,
IntervalYearMonthType, SchemaBuilder, TimeUnit, TimestampMicrosecondType,
IntervalYearMonthType, TimeUnit, TimestampMicrosecondType,
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
UInt16Type, UInt32Type, UInt64Type, UInt8Type, DECIMAL128_MAX_PRECISION,
},
};
use arrow_array::cast::as_list_array;
use arrow_array::{ArrowNativeTypeOp, Scalar};
use arrow_buffer::NullBuffer;

/// A dynamically typed, nullable single value, (the single-valued counter-part
/// to arrow's [`Array`])
pub use struct_builder::ScalarStructBuilder;

/// A dynamically typed, nullable single value.
///
/// While an arrow [`Array`] stores one or more values of the same type, in a
/// single column, a `ScalarValue` stores a single value of a single type, the
/// equivalent of 1 row and one column.
///
/// ```text
/// ┌────────┐
/// │ value1 │
/// │ value2 │ ┌────────┐
/// │ value3 │ │ value2 │
/// │ ... │ └────────┘
/// │ valueN │
/// └────────┘
///
/// Array ScalarValue
///
/// stores multiple, stores a single,
/// possibly null, values of possibly null, value
/// the same type
/// ```
///
/// # Performance
///
/// In general, please use arrow [`Array`]s rather than [`ScalarValue`] whenever
/// possible, as it is far more efficient for multiple values.
/// In general, performance will be better using arrow [`Array`]s rather than
/// [`ScalarValue`], as it is far more efficient to process multiple values at
/// once (vectorized processing).
///
/// # Example
/// ```
Expand Down Expand Up @@ -99,6 +122,66 @@ use arrow_buffer::NullBuffer;
/// # }
/// ```
///
/// # Nested Types
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to have some similar examples for creating `ScalarValue::List`

///
/// `List` / `LargeList` / `FixedSizeList` / `Struct` are represented as a
/// single element array of the corresponding type.
///
/// ## Example: Creating [`ScalarValue::Struct`] using [`ScalarStructBuilder`]
/// ```
/// # use std::sync::Arc;
/// # use arrow::datatypes::{DataType, Field};
/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder};
/// // Build a struct like: {a: 1, b: "foo"}
/// let field_a = Field::new("a", DataType::Int32, false);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The use case for the ScalarStructBuilder is pretty nicely illustrated by this example compared to the one in

## Example: Creating [`ScalarValue::Struct`] directly

/// let field_b = Field::new("b", DataType::Utf8, false);
///
/// let s1 = ScalarStructBuilder::new()
/// .with_scalar(field_a, ScalarValue::from(1i32))
/// .with_scalar(field_b, ScalarValue::from("foo"))
/// .build();
/// ```
///
/// ## Example: Creating a null [`ScalarValue::Struct`] using [`ScalarStructBuilder`]
/// ```
/// # use std::sync::Arc;
/// # use arrow::datatypes::{DataType, Field};
/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder};
/// // Build a struct representing a NULL value
/// let fields = vec![
/// Field::new("a", DataType::Int32, false),
/// Field::new("b", DataType::Utf8, false),
/// ];
///
/// let s1 = ScalarStructBuilder::new_null(fields);
/// ```
///
/// ## Example: Creating [`ScalarValue::Struct`] directly
/// ```
/// # use std::sync::Arc;
/// # use arrow::datatypes::{DataType, Field, Fields};
/// # use arrow_array::{ArrayRef, Int32Array, StructArray, StringArray};
/// # use datafusion_common::ScalarValue;
/// // Build a struct like: {a: 1, b: "foo"}
/// // Field description
/// let fields = Fields::from(vec![
/// Field::new("a", DataType::Int32, false),
/// Field::new("b", DataType::Utf8, false),
/// ]);
/// // one row arrays for each field
/// let arrays: Vec<ArrayRef> = vec![
/// Arc::new(Int32Array::from(vec![1])),
/// Arc::new(StringArray::from(vec!["foo"])),
/// ];
/// // no nulls for this array
/// let nulls = None;
/// let arr = StructArray::new(fields, arrays, nulls);
///
/// // Create a ScalarValue::Struct directly
/// let s1 = ScalarValue::Struct(Arc::new(arr));
/// ```
///
///
/// # Further Reading
/// See [datatypes](https://arrow.apache.org/docs/python/api/datatypes.html) for
/// details on datatypes and the [format](https://github.com/apache/arrow/blob/master/format/Schema.fbs#L354-L375)
Expand Down Expand Up @@ -153,7 +236,8 @@ pub enum ScalarValue {
List(Arc<ListArray>),
/// The array must be a LargeListArray with length 1.
LargeList(Arc<LargeListArray>),
/// Represents a single element of a [`StructArray`] as an [`ArrayRef`]
/// Represents a single element [`StructArray`] as an [`ArrayRef`]. See
/// [`ScalarValue`] for examples of how to create instances of this type.
Struct(Arc<StructArray>),
/// Date stored as a signed 32bit int days since UNIX epoch 1970-01-01
Date32(Option<i32>),
Expand Down Expand Up @@ -2679,20 +2763,13 @@ impl From<Option<&str>> for ScalarValue {
/// Wrapper to create ScalarValue::Struct for convenience
impl From<Vec<(&str, ScalarValue)>> for ScalarValue {
fn from(value: Vec<(&str, ScalarValue)>) -> Self {
let (fields, scalars): (SchemaBuilder, Vec<_>) = value
.into_iter()
.map(|(name, scalar)| (Field::new(name, scalar.data_type(), false), scalar))
.unzip();

let arrays = scalars
value
.into_iter()
.map(|scalar| scalar.to_array().unwrap())
.collect::<Vec<ArrayRef>>();

let fields = fields.finish().fields;
let struct_array = StructArray::try_new(fields, arrays, None).unwrap();

Self::Struct(Arc::new(struct_array))
.fold(ScalarStructBuilder::new(), |builder, (name, value)| {
builder.with_name_and_scalar(name, value)
})
.build()
.unwrap()
}
}

Expand All @@ -2710,27 +2787,6 @@ impl From<String> for ScalarValue {
}
}

// TODO: Remove this after changing to Scalar<T>
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These were added in #7893 and not yet released so this is not a breaking API change

// Wrapper for ScalarValue::Struct that checks the length of the arrays, without nulls
impl From<(Fields, Vec<ArrayRef>)> for ScalarValue {
fn from((fields, arrays): (Fields, Vec<ArrayRef>)) -> Self {
Self::from((fields, arrays, None))
}
}

// TODO: Remove this after changing to Scalar<T>
// Wrapper for ScalarValue::Struct that checks the length of the arrays
impl From<(Fields, Vec<ArrayRef>, Option<NullBuffer>)> for ScalarValue {
fn from(
(fields, arrays, nulls): (Fields, Vec<ArrayRef>, Option<NullBuffer>),
) -> Self {
for arr in arrays.iter() {
assert_eq!(arr.len(), 1);
}
Self::Struct(Arc::new(StructArray::new(fields, arrays, nulls)))
}
}

macro_rules! impl_try_from {
($SCALAR:ident, $NATIVE:ident) => {
impl TryFrom<ScalarValue> for $NATIVE {
Expand Down Expand Up @@ -3247,6 +3303,7 @@ mod tests {
use arrow::datatypes::{ArrowNumericType, ArrowPrimitiveType};
use arrow::util::pretty::pretty_format_columns;
use arrow_buffer::Buffer;
use arrow_schema::Fields;
use chrono::NaiveDate;
use rand::Rng;

Expand All @@ -3266,31 +3323,33 @@ mod tests {
),
]);

let arrays = vec![boolean as ArrayRef, int as ArrayRef];
let fields = Fields::from(vec![
Field::new("b", DataType::Boolean, false),
Field::new("c", DataType::Int32, false),
]);
let sv = ScalarValue::from((fields, arrays));
let sv = ScalarStructBuilder::new()
.with_array(Field::new("b", DataType::Boolean, false), boolean)
.with_array(Field::new("c", DataType::Int32, false), int)
.build()
.unwrap();

let struct_arr = sv.to_array().unwrap();
let actual = as_struct_array(&struct_arr).unwrap();
assert_eq!(actual, &expected);
}

#[test]
#[should_panic(expected = "assertion `left == right` failed")]
#[should_panic(
expected = "Error building ScalarValue::Struct. Expected array with exactly one element, found array with 4 elements"
)]
fn test_scalar_value_from_for_struct_should_panic() {
let fields = Fields::from(vec![
Field::new("bool", DataType::Boolean, false),
Field::new("i32", DataType::Int32, false),
]);

let arrays = vec![
Arc::new(BooleanArray::from(vec![false, true, false, false])) as ArrayRef,
Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
];

let _ = ScalarValue::from((fields, arrays));
let _ = ScalarStructBuilder::new()
.with_array(
Field::new("bool", DataType::Boolean, false),
Arc::new(BooleanArray::from(vec![false, true, false, false])),
)
.with_array(
Field::new("i32", DataType::Int32, false),
Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
)
.build()
.unwrap();
}

#[test]
Expand Down
Loading