diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 10c53c549e2b..2ddc2d845b01 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -109,6 +109,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff buffer.push(0i64); [buffer, MutableBuffer::new(capacity * mem::size_of::())] } + DataType::BinaryView | DataType::Utf8View => [ + MutableBuffer::new(capacity * mem::size_of::()), + empty_buffer, + ], DataType::List(_) | DataType::Map(_, _) => { // offset buffer always starts with a zero let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); @@ -1541,6 +1545,9 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::LargeBinary => DataTypeLayout::new_binary::(), DataType::Utf8 => DataTypeLayout::new_binary::(), DataType::LargeUtf8 => DataTypeLayout::new_binary::(), + DataType::BinaryView | DataType::Utf8View => { + unimplemented!("BinaryView/Utf8View not implemented") + } DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data DataType::List(_) => DataTypeLayout::new_fixed_width::(), DataType::LargeList(_) => DataTypeLayout::new_fixed_width::(), diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index b279546474a0..1255ff39e097 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -96,6 +96,9 @@ fn equal_values( variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) } DataType::FixedSizeBinary(_) => fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::BinaryView | DataType::Utf8View => { + unimplemented!("BinaryView/Utf8View not yet implemented") + } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len), diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 268cf10f2326..ef53efac2373 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -224,6 +224,9 @@ fn build_extend(array: &ArrayData) -> Extend { DataType::Decimal256(_, _) => primitive::build_extend::(array), DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::(array), + DataType::BinaryView | DataType::Utf8View => { + unimplemented!("BinaryView/Utf8View not implemented") + } DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), DataType::LargeList(_) => list::build_extend::(array), DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), @@ -266,6 +269,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::Decimal256(_, _) => primitive::extend_nulls::, DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, + DataType::BinaryView | DataType::Utf8View => { + unimplemented!("BinaryView/Utf8View not implemented") + } DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, DataType::LargeList(_) => list::extend_nulls::, DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { @@ -419,6 +425,9 @@ impl<'a> MutableArrayData<'a> { | DataType::LargeBinary | DataType::Interval(_) | DataType::FixedSizeBinary(_) => vec![], + DataType::BinaryView | DataType::Utf8View => { + unimplemented!("BinaryView/Utf8View not implemented") + } DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { let children = arrays .iter() diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index 42ac71fbbd7e..a04db1cf3538 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -271,6 +271,9 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { DataType::LargeUtf8 => json!({"name": "largeutf8"}), DataType::Binary => json!({"name": "binary"}), DataType::LargeBinary => json!({"name": "largebinary"}), + DataType::BinaryView | DataType::Utf8View => { + unimplemented!("BinaryView/Utf8View not implemented") + } DataType::FixedSizeBinary(byte_width) => { json!({"name": "fixedsizebinary", "byteWidth": byte_width}) } diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 505541f7d50f..a2ffd4380203 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -543,6 +543,7 @@ pub(crate) fn get_fb_field_type<'a>( .as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), }, + BinaryView | Utf8View => unimplemented!("BinaryView/Utf8View not implemented"), Utf8 => FBFieldType { type_type: crate::Type::Utf8, type_: crate::Utf8Builder::new(fbb).finish().as_union_value(), diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 079b855ce990..b3d89b011e66 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -196,6 +196,15 @@ pub enum DataType { /// A single LargeBinary array can store up to [`i64::MAX`] bytes /// of binary data in total. LargeBinary, + /// (NOT YET FULLY SUPPORTED) Opaque binary data of variable length. + /// + /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. + /// + /// Logically the same as [`Self::Binary`], but the internal representation uses a view + /// struct that contains the string length and either the string's entire data + /// inline (for small strings) or an inlined prefix, an index of another buffer, + /// and an offset pointing to a slice in that buffer (for non-small strings). + BinaryView, /// A variable-length string in Unicode with UTF-8 encoding. /// /// A single Utf8 array can store up to [`i32::MAX`] bytes @@ -206,6 +215,15 @@ pub enum DataType { /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes /// of string data in total. LargeUtf8, + /// (NOT YET FULLY SUPPORTED) A variable-length string in Unicode with UTF-8 encoding + /// + /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. + /// + /// Logically the same as [`Self::Utf8`], but the internal representation uses a view + /// struct that contains the string length and either the string's entire data + /// inline (for small strings) or an inlined prefix, an index of another buffer, + /// and an offset pointing to a slice in that buffer (for non-small strings). + Utf8View, /// A list of some logical data type with variable length. /// /// A single List array can store up to [`i32::MAX`] elements in total. @@ -515,8 +533,8 @@ impl DataType { DataType::Interval(IntervalUnit::MonthDayNano) => Some(16), DataType::Decimal128(_, _) => Some(16), DataType::Decimal256(_, _) => Some(32), - DataType::Utf8 | DataType::LargeUtf8 => None, - DataType::Binary | DataType::LargeBinary => None, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None, + DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None, DataType::FixedSizeBinary(_) => None, DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) => None, DataType::FixedSizeList(_, _) => None, @@ -555,8 +573,10 @@ impl DataType { | DataType::Binary | DataType::FixedSizeBinary(_) | DataType::LargeBinary + | DataType::BinaryView | DataType::Utf8 | DataType::LargeUtf8 + | DataType::Utf8View | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => 0, DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(), diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index a5c84b107094..70a3e2b21a3c 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -507,6 +507,7 @@ impl Field { | DataType::Duration(_) | DataType::Binary | DataType::LargeBinary + | DataType::BinaryView | DataType::Interval(_) | DataType::LargeList(_) | DataType::List(_) @@ -517,6 +518,7 @@ impl Field { | DataType::FixedSizeBinary(_) | DataType::Utf8 | DataType::LargeUtf8 + | DataType::Utf8View | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { if from.data_type == DataType::Null { diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 4c350c4b1d8c..a8bef98d9e8c 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -481,6 +481,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_length(*length) .build() } + DataType::BinaryView | DataType::Utf8View => unimplemented!("BinaryView/Utf8View not implemented"), DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { // Decimal precision determines the Parquet physical type to use.