Skip to content

Commit 2710f31

Browse files
committed
support full arrow u32 through parquet
This is identical to the solution we now have for u64.
1 parent d1a9e0b commit 2710f31

File tree

2 files changed

+78
-12
lines changed

2 files changed

+78
-12
lines changed

parquet/src/arrow/array_reader.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,16 @@ impl<T: DataType> ArrayReader for PrimitiveArrayReader<T> {
271271
let target_type = self.get_data_type().clone();
272272
let arrow_data_type = match T::get_physical_type() {
273273
PhysicalType::BOOLEAN => ArrowBooleanType::DATA_TYPE,
274-
PhysicalType::INT32 => ArrowInt32Type::DATA_TYPE,
274+
PhysicalType::INT32 => {
275+
match target_type {
276+
ArrowType::UInt32 => {
277+
// follow C++ implementation and use overflow/reinterpret cast from i32 to u32 which will map
278+
// `i32::MIN..0` to `((i32::MAX as u32) + 1)..=u32::MAX`
279+
ArrowUInt32Type::DATA_TYPE
280+
}
281+
_ => ArrowInt32Type::DATA_TYPE,
282+
}
283+
}
275284
PhysicalType::INT64 => {
276285
match target_type {
277286
ArrowType::UInt64 => {

parquet/src/arrow/arrow_writer.rs

Lines changed: 68 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -211,19 +211,45 @@ fn write_leaf(
211211
let indices = levels.filter_array_indices();
212212
let written = match writer {
213213
ColumnWriter::Int32ColumnWriter(ref mut typed) => {
214-
// If the column is a Date64, we cast it to a Date32, and then interpret that as Int32
215-
let array = if let ArrowDataType::Date64 = column.data_type() {
216-
let array = arrow::compute::cast(column, &ArrowDataType::Date32)?;
217-
arrow::compute::cast(&array, &ArrowDataType::Int32)?
218-
} else {
219-
arrow::compute::cast(column, &ArrowDataType::Int32)?
214+
let values = match column.data_type() {
215+
ArrowDataType::Date64 => {
216+
// If the column is a Date64, we cast it to a Date32, and then interpret that as Int32
217+
let array = if let ArrowDataType::Date64 = column.data_type() {
218+
let array = arrow::compute::cast(column, &ArrowDataType::Date32)?;
219+
arrow::compute::cast(&array, &ArrowDataType::Int32)?
220+
} else {
221+
arrow::compute::cast(column, &ArrowDataType::Int32)?
222+
};
223+
let array = array
224+
.as_any()
225+
.downcast_ref::<arrow_array::Int32Array>()
226+
.expect("Unable to get int32 array");
227+
get_numeric_array_slice::<Int32Type, _>(&array, &indices)
228+
}
229+
ArrowDataType::UInt32 => {
230+
// follow C++ implementation and use overflow/reinterpret cast from u32 to i32 which will map
231+
// `((i32::MAX as u32) + 1)..=u32::MAX` to `i32::MIN..0`
232+
let array = column
233+
.as_any()
234+
.downcast_ref::<arrow_array::UInt32Array>()
235+
.expect("Unable to get u32 array");
236+
let array = arrow::compute::unary::<_, _, arrow::datatypes::Int32Type>(
237+
array,
238+
|x| x as i32,
239+
);
240+
get_numeric_array_slice::<Int32Type, _>(&array, &indices)
241+
}
242+
_ => {
243+
let array = arrow::compute::cast(column, &ArrowDataType::Int32)?;
244+
let array = array
245+
.as_any()
246+
.downcast_ref::<arrow_array::Int32Array>()
247+
.expect("Unable to get i32 array");
248+
get_numeric_array_slice::<Int32Type, _>(&array, &indices)
249+
}
220250
};
221-
let array = array
222-
.as_any()
223-
.downcast_ref::<arrow_array::Int32Array>()
224-
.expect("Unable to get int32 array");
225251
typed.write_batch(
226-
get_numeric_array_slice::<Int32Type, _>(&array, &indices).as_slice(),
252+
values.as_slice(),
227253
Some(levels.definition.as_slice()),
228254
levels.repetition.as_deref(),
229255
)?
@@ -1469,6 +1495,37 @@ mod tests {
14691495
);
14701496
}
14711497

1498+
#[test]
1499+
fn u32_min_max() {
1500+
// check values roundtrip through parquet
1501+
let values = Arc::new(UInt32Array::from_iter_values(vec![
1502+
u32::MIN,
1503+
u32::MIN + 1,
1504+
(i32::MAX as u32) - 1,
1505+
i32::MAX as u32,
1506+
(i32::MAX as u32) + 1,
1507+
u32::MAX - 1,
1508+
u32::MAX,
1509+
]));
1510+
let file = one_column_roundtrip("u32_min_max_single_column", values, false);
1511+
1512+
// check statistics are valid
1513+
let reader = SerializedFileReader::new(file).unwrap();
1514+
let metadata = reader.metadata();
1515+
assert_eq!(metadata.num_row_groups(), 1);
1516+
let row_group = metadata.row_group(0);
1517+
assert_eq!(row_group.num_columns(), 1);
1518+
let column = row_group.column(0);
1519+
let stats = column.statistics().unwrap();
1520+
assert!(stats.has_min_max_set());
1521+
if let Statistics::Int32(stats) = stats {
1522+
assert_eq!(*stats.min() as u32, u32::MIN);
1523+
assert_eq!(*stats.max() as u32, u32::MAX);
1524+
} else {
1525+
panic!("Statistics::Int32 missing")
1526+
}
1527+
}
1528+
14721529
#[test]
14731530
fn u64_min_max() {
14741531
// check values roundtrip through parquet

0 commit comments

Comments
 (0)