Skip to content

Commit 1ab356f

Browse files
authored
[Variant] Add variant to arrow for DataType::{Binary/LargeBinary/BinaryView} (#8768)
# Which issue does this PR close? We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. - Closes #8767. # What changes are included in this PR? - Add a struct `VariantToBinaryArrowRowBuilder<'a, B>` and a trait `BinaryLikeArrayBuilder` - Add three enum variants `Binary(VariantToBinaryArrowRowBuilder<'a, BinaryBuilder>)`, `LargeBinary(VariantToBinaryArrowRowBuilder<'a, LargeBinaryBuilder>)` and `BinaryView(VariantToBinaryArrowRowBuilder<'a, BinaryViewBuilder>)` to `PrimitiveVariantToArrowRowBuilder` - Add tests to cover the added logic # Are these changes tested? Yes, new tests were added. # Are there any user-facing changes? No existing public API changed; a new public trait `BinaryLikeArrayBuilder` is added.
1 parent e4ff9dc commit 1ab356f

File tree

5 files changed

+155
-12
lines changed

5 files changed

+155
-12
lines changed

arrow-array/src/builder/generic_bytes_builder.rs

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
357357
/// We will use the `AVERAGE_STRING_LENGTH` * row_count for `data_capacity`. \
358358
///
359359
/// These capacities are preallocation hints used to improve performance,
360-
/// but consuquences of passing a hint too large or too small should be negligible.
360+
/// but consequences of passing a hint too large or too small should be negligible.
361361
const AVERAGE_STRING_LENGTH: usize = 16;
362362
/// Trait for string-like array builders
363363
///
@@ -392,6 +392,50 @@ impl<O: OffsetSizeTrait> StringLikeArrayBuilder for GenericStringBuilder<O> {
392392
}
393393
}
394394

395+
/// A byte size value representing the number of bytes to allocate per binary value in [`GenericBinaryBuilder`]
396+
///
397+
/// To create a [`GenericBinaryBuilder`] using `.with_capacity` we are required to provide: \
398+
/// - `item_capacity` - the row count \
399+
/// - `data_capacity` - total binary byte count \
400+
///
401+
/// We will use the `AVERAGE_BINARY_LENGTH` * row_count for `data_capacity`. \
402+
///
403+
/// These capacities are preallocation hints used to improve performance,
404+
/// but consequences of passing a hint too large or too small should be negligible.
405+
const AVERAGE_BINARY_LENGTH: usize = 128;
406+
/// Trait for binary-like array builders
407+
///
408+
/// This trait provides a unified interface for builders that append binary-like data
409+
/// such as [`GenericBinaryBuilder<O>`] and [`crate::builder::BinaryViewBuilder`]
410+
pub trait BinaryLikeArrayBuilder: ArrayBuilder {
411+
/// Returns a human-readable type name for the builder.
412+
fn type_name() -> &'static str;
413+
414+
/// Creates a new builder with the given row capacity.
415+
fn with_capacity(capacity: usize) -> Self;
416+
417+
/// Appends a non-null binary value (a byte slice) to the builder.
418+
fn append_value(&mut self, value: &[u8]);
419+
420+
/// Appends a null value to the builder.
421+
fn append_null(&mut self);
422+
}
423+
424+
// `BinaryLikeArrayBuilder` for the offset-based binary builder. `with_capacity`,
// `append_value` and `append_null` forward to the same-named functions on `Self`.
impl<O: OffsetSizeTrait> BinaryLikeArrayBuilder for GenericBinaryBuilder<O> {
425+
fn type_name() -> &'static str {
426+
std::any::type_name::<Self>()
427+
}
428+
fn with_capacity(capacity: usize) -> Self {
429+
// `capacity` is the row count; the byte-capacity hint passed as the second argument is
// `capacity * AVERAGE_BINARY_LENGTH`.
// NOTE(review): the multiplication is unchecked — confirm callers never pass a row count
// large enough to overflow `usize`.
Self::with_capacity(capacity, capacity * AVERAGE_BINARY_LENGTH)
430+
}
431+
fn append_value(&mut self, value: &[u8]) {
432+
Self::append_value(self, value);
433+
}
434+
fn append_null(&mut self) {
435+
Self::append_null(self);
436+
}
437+
}
438+
395439
/// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
396440
///
397441
/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use arrow_schema::ArrowError;
2525
use hashbrown::HashTable;
2626
use hashbrown::hash_table::Entry;
2727

28-
use crate::builder::{ArrayBuilder, StringLikeArrayBuilder};
28+
use crate::builder::{ArrayBuilder, BinaryLikeArrayBuilder, StringLikeArrayBuilder};
2929
use crate::types::bytes::ByteArrayNativeType;
3030
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
3131
use crate::{Array, ArrayRef, GenericByteViewArray};
@@ -570,6 +570,21 @@ impl StringLikeArrayBuilder for StringViewBuilder {
570570
///
571571
pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>;
572572

573+
// `BinaryLikeArrayBuilder` for the view-based binary builder.
// NOTE(review): `with_capacity`, `append_value` and `append_null` call `Self::<same name>`;
// this presumably resolves to the builder's inherent methods rather than recursing into
// this trait impl — confirm those inherent methods exist with compatible signatures.
impl BinaryLikeArrayBuilder for BinaryViewBuilder {
574+
fn type_name() -> &'static str {
575+
std::any::type_name::<BinaryViewBuilder>()
576+
}
577+
fn with_capacity(capacity: usize) -> Self {
578+
// Only the row capacity is passed through; no byte-capacity estimate is computed here.
Self::with_capacity(capacity)
579+
}
580+
fn append_value(&mut self, value: &[u8]) {
581+
Self::append_value(self, value);
582+
}
583+
fn append_null(&mut self) {
584+
Self::append_null(self);
585+
}
586+
}
587+
573588
/// Creates a view from a fixed length input (the compiler can generate
574589
/// specialized code for this)
575590
fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {

parquet-variant-compute/src/variant_array.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,17 +1172,17 @@ fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, Dat
11721172
Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
11731173
Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
11741174

1175-
// Binary and string are allowed. Force Binary to BinaryView because that's what the parquet
1175+
// Binary and string are allowed. Force Binary/LargeBinary to BinaryView because that's what the parquet
11761176
// reader returns and what the rest of the variant code expects.
1177-
Binary => Cow::Owned(DataType::BinaryView),
1177+
Binary | LargeBinary => Cow::Owned(BinaryView),
11781178
BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
11791179

11801180
// UUID maps to 16-byte fixed-size binary; no other width is allowed
11811181
FixedSizeBinary(16) => borrow!(),
11821182
FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
11831183

11841184
// We can _possibly_ allow (some of) these some day?
1185-
LargeBinary | ListView(_) | LargeList(_) | LargeListView(_) => {
1185+
ListView(_) | LargeList(_) | LargeListView(_) => {
11861186
fail!()
11871187
}
11881188

parquet-variant-compute/src/variant_get.rs

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -309,10 +309,11 @@ mod test {
309309
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
310310
use crate::{VariantArray, VariantArrayBuilder, json_to_variant};
311311
use arrow::array::{
312-
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Decimal32Array,
313-
Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array,
314-
Int16Array, Int32Array, Int64Array, LargeStringArray, NullBuilder, StringArray,
315-
StringViewArray, StructArray, Time64MicrosecondArray,
312+
Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
313+
Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array,
314+
Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, LargeBinaryArray,
315+
LargeStringArray, NullBuilder, StringArray, StringViewArray, StructArray,
316+
Time64MicrosecondArray,
316317
};
317318
use arrow::buffer::NullBuffer;
318319
use arrow::compute::CastOptions;
@@ -1316,6 +1317,63 @@ mod test {
13161317
)
13171318
}
13181319

1320+
perfectly_shredded_variant_array_fn!(perfectly_shredded_binary_variant_array, || {
1321+
BinaryArray::from(vec![
1322+
Some(b"Apache" as &[u8]),
1323+
Some(b"Arrow-rs" as &[u8]),
1324+
Some(b"Parquet-variant" as &[u8]),
1325+
])
1326+
});
1327+
1328+
perfectly_shredded_to_arrow_primitive_test!(
1329+
get_variant_perfectly_shredded_binary_as_binary,
1330+
DataType::Binary,
1331+
perfectly_shredded_binary_variant_array,
1332+
BinaryArray::from(vec![
1333+
Some(b"Apache" as &[u8]),
1334+
Some(b"Arrow-rs" as &[u8]),
1335+
Some(b"Parquet-variant" as &[u8]),
1336+
])
1337+
);
1338+
1339+
perfectly_shredded_variant_array_fn!(perfectly_shredded_large_binary_variant_array, || {
1340+
LargeBinaryArray::from(vec![
1341+
Some(b"Apache" as &[u8]),
1342+
Some(b"Arrow-rs" as &[u8]),
1343+
Some(b"Parquet-variant" as &[u8]),
1344+
])
1345+
});
1346+
1347+
perfectly_shredded_to_arrow_primitive_test!(
1348+
get_variant_perfectly_shredded_large_binary_as_large_binary,
1349+
DataType::LargeBinary,
1350+
perfectly_shredded_large_binary_variant_array,
1351+
LargeBinaryArray::from(vec![
1352+
Some(b"Apache" as &[u8]),
1353+
Some(b"Arrow-rs" as &[u8]),
1354+
Some(b"Parquet-variant" as &[u8]),
1355+
])
1356+
);
1357+
1358+
perfectly_shredded_variant_array_fn!(perfectly_shredded_binary_view_variant_array, || {
1359+
BinaryViewArray::from(vec![
1360+
Some(b"Apache" as &[u8]),
1361+
Some(b"Arrow-rs" as &[u8]),
1362+
Some(b"Parquet-variant" as &[u8]),
1363+
])
1364+
});
1365+
1366+
perfectly_shredded_to_arrow_primitive_test!(
1367+
get_variant_perfectly_shredded_binary_view_as_binary_view,
1368+
DataType::BinaryView,
1369+
perfectly_shredded_binary_view_variant_array,
1370+
BinaryViewArray::from(vec![
1371+
Some(b"Apache" as &[u8]),
1372+
Some(b"Arrow-rs" as &[u8]),
1373+
Some(b"Parquet-variant" as &[u8]),
1374+
])
1375+
);
1376+
13191377
/// Return a VariantArray that represents a normal "shredded" variant
13201378
/// for the following example
13211379
///

parquet-variant-compute/src/variant_to_arrow.rs

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@
1616
// under the License.
1717

1818
use arrow::array::{
19-
ArrayRef, BinaryViewArray, BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder,
20-
NullArray, NullBufferBuilder, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder,
21-
StringViewBuilder,
19+
ArrayRef, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray, BinaryViewBuilder,
20+
BooleanBuilder, FixedSizeBinaryBuilder, LargeBinaryBuilder, LargeStringBuilder, NullArray,
21+
NullBufferBuilder, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder,
2222
};
2323
use arrow::compute::{CastOptions, DecimalCast};
2424
use arrow::datatypes::{self, DataType, DecimalType};
@@ -66,6 +66,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
6666
String(VariantToStringArrowBuilder<'a, StringBuilder>),
6767
LargeString(VariantToStringArrowBuilder<'a, LargeStringBuilder>),
6868
StringView(VariantToStringArrowBuilder<'a, StringViewBuilder>),
69+
Binary(VariantToBinaryArrowRowBuilder<'a, BinaryBuilder>),
70+
LargeBinary(VariantToBinaryArrowRowBuilder<'a, LargeBinaryBuilder>),
71+
BinaryView(VariantToBinaryArrowRowBuilder<'a, BinaryViewBuilder>),
6972
}
7073

7174
/// Builder for converting variant values into strongly typed Arrow arrays.
@@ -111,6 +114,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
111114
String(b) => b.append_null(),
112115
LargeString(b) => b.append_null(),
113116
StringView(b) => b.append_null(),
117+
Binary(b) => b.append_null(),
118+
LargeBinary(b) => b.append_null(),
119+
BinaryView(b) => b.append_null(),
114120
}
115121
}
116122

@@ -144,6 +150,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
144150
String(b) => b.append_value(value),
145151
LargeString(b) => b.append_value(value),
146152
StringView(b) => b.append_value(value),
153+
Binary(b) => b.append_value(value),
154+
LargeBinary(b) => b.append_value(value),
155+
BinaryView(b) => b.append_value(value),
147156
}
148157
}
149158

@@ -177,6 +186,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
177186
String(b) => b.finish(),
178187
LargeString(b) => b.finish(),
179188
StringView(b) => b.finish(),
189+
Binary(b) => b.finish(),
190+
LargeBinary(b) => b.finish(),
191+
BinaryView(b) => b.finish(),
180192
}
181193
}
182194
}
@@ -322,6 +334,13 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
322334
LargeString(VariantToStringArrowBuilder::new(cast_options, capacity))
323335
}
324336
DataType::Utf8View => StringView(VariantToStringArrowBuilder::new(cast_options, capacity)),
337+
DataType::Binary => Binary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)),
338+
DataType::LargeBinary => {
339+
LargeBinary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity))
340+
}
341+
DataType::BinaryView => {
342+
BinaryView(VariantToBinaryArrowRowBuilder::new(cast_options, capacity))
343+
}
325344
_ if data_type.is_primitive() => {
326345
return Err(ArrowError::NotYetImplemented(format!(
327346
"Primitive data_type {data_type:?} not yet implemented"
@@ -506,6 +525,13 @@ define_variant_to_primitive_builder!(
506525
type_name: T::DATA_TYPE
507526
);
508527

528+
// Declares `VariantToBinaryArrowRowBuilder`, generic over any `BinaryLikeArrayBuilder`.
// The arguments supply: how to create the inner builder from a row capacity, how to extract
// a value from a variant (`as_u8_slice()`), and the type name (`B::type_name()`).
// NOTE(review): the generated fields and methods, and how a failed `as_u8_slice()` extraction
// is handled, are defined by `define_variant_to_primitive_builder!`, which is outside this
// view — confirm against the macro definition.
define_variant_to_primitive_builder!(
529+
struct VariantToBinaryArrowRowBuilder<'a, B: BinaryLikeArrayBuilder>
530+
|capacity| -> B { B::with_capacity(capacity) },
531+
|value| value.as_u8_slice(),
532+
type_name: B::type_name()
533+
);
534+
509535
/// Builder for converting variant values to arrow Decimal values
510536
pub(crate) struct VariantToDecimalArrowRowBuilder<'a, T>
511537
where

0 commit comments

Comments
 (0)