Skip to content

Commit 9e575bd

Browse files
authored
feat: add row_group_is_[max/min]_value_exact to StatisticsConverter (#7574)
# Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. -->

# Rationale for this change

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

As described in apache/datafusion#15976 (comment), we can expose the `is_[max/min]_value_exact` flags in `StatisticsConverter` so that callers can determine whether the statistics are exact.

# What changes are included in this PR?

<!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. -->

Add `row_group_is_[max/min]_value_exact` to `StatisticsConverter`, along with some changes to the corresponding test files.

# Are there any user-facing changes?

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. -->

<!--- If there are any breaking changes to public APIs, please call them out. -->
1 parent 026356b commit 9e575bd

File tree

3 files changed

+383
-5
lines changed

3 files changed

+383
-5
lines changed

parquet/src/arrow/arrow_reader/statistics.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1403,6 +1403,48 @@ impl<'a> StatisticsConverter<'a> {
14031403
max_statistics(data_type, iter, self.physical_type)
14041404
}
14051405

1406+
/// Extract the `is_max_value_exact` flags from row group statistics in [`RowGroupMetaData`]
1407+
///
1408+
/// See docs on [`Self::row_group_maxes`] for details
1409+
pub fn row_group_is_max_value_exact<I>(&self, metadatas: I) -> Result<BooleanArray>
1410+
where
1411+
I: IntoIterator<Item = &'a RowGroupMetaData>,
1412+
{
1413+
let Some(parquet_index) = self.parquet_column_index else {
1414+
let num_row_groups = metadatas.into_iter().count();
1415+
return Ok(BooleanArray::from_iter(
1416+
std::iter::repeat(None).take(num_row_groups),
1417+
));
1418+
};
1419+
1420+
let is_max_value_exact = metadatas
1421+
.into_iter()
1422+
.map(|x| x.column(parquet_index).statistics())
1423+
.map(|s| s.map(|s| s.max_is_exact()));
1424+
Ok(BooleanArray::from_iter(is_max_value_exact))
1425+
}
1426+
1427+
/// Extract the `is_min_value_exact` flags from row group statistics in [`RowGroupMetaData`]
1428+
///
1429+
/// See docs on [`Self::row_group_mins`] for details
1430+
pub fn row_group_is_min_value_exact<I>(&self, metadatas: I) -> Result<BooleanArray>
1431+
where
1432+
I: IntoIterator<Item = &'a RowGroupMetaData>,
1433+
{
1434+
let Some(parquet_index) = self.parquet_column_index else {
1435+
let num_row_groups = metadatas.into_iter().count();
1436+
return Ok(BooleanArray::from_iter(
1437+
std::iter::repeat(None).take(num_row_groups),
1438+
));
1439+
};
1440+
1441+
let is_min_value_exact = metadatas
1442+
.into_iter()
1443+
.map(|x| x.column(parquet_index).statistics())
1444+
.map(|s| s.map(|s| s.min_is_exact()));
1445+
Ok(BooleanArray::from_iter(is_min_value_exact))
1446+
}
1447+
14061448
/// Extract the null counts from row group statistics in [`RowGroupMetaData`]
14071449
///
14081450
/// See docs on [`Self::row_group_mins`] for details

parquet/tests/arrow_reader/mod.rs

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ use chrono::Datelike;
3131
use chrono::{Duration, TimeDelta};
3232
use half::f16;
3333
use parquet::arrow::ArrowWriter;
34-
use parquet::file::properties::{EnabledStatistics, WriterProperties};
34+
use parquet::file::properties::{
35+
EnabledStatistics, WriterProperties, DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
36+
};
3537
use std::sync::Arc;
3638
use tempfile::NamedTempFile;
3739

@@ -91,10 +93,20 @@ enum Scenario {
9193
PeriodsInColumnNames,
9294
StructArray,
9395
UTF8,
96+
/// UTF8 with max and min values truncated
97+
TruncatedUTF8,
9498
UTF8View,
9599
BinaryView,
96100
}
97101

102+
impl Scenario {
103+
// If the test scenario needs to set `set_statistics_truncate_length` to test
104+
// statistics truncation.
105+
fn truncate_stats(&self) -> bool {
106+
matches!(self, Scenario::TruncatedUTF8)
107+
}
108+
}
109+
98110
fn make_boolean_batch(v: Vec<Option<bool>>) -> RecordBatch {
99111
let schema = Arc::new(Schema::new(vec![Field::new(
100112
"bool",
@@ -631,6 +643,8 @@ fn make_dict_batch() -> RecordBatch {
631643
.unwrap()
632644
}
633645

646+
/// Create data batches for the given scenario.
647+
/// `make_test_file_rg` uses the first batch to infer the schema of the file.
634648
fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
635649
match scenario {
636650
Scenario::Boolean => {
@@ -987,6 +1001,33 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
9871001
make_utf8_batch(vec![Some("e"), Some("f"), Some("g"), Some("h"), Some("i")]),
9881002
]
9891003
}
1004+
Scenario::TruncatedUTF8 => {
1005+
// Make utf8 batch with strings longer than 64 bytes
1006+
// to check truncation of row group statistics
1007+
vec![
1008+
make_utf8_batch(vec![
1009+
Some(&("a".repeat(64) + "1")),
1010+
Some(&("b".repeat(64) + "2")),
1011+
Some(&("c".repeat(64) + "3")),
1012+
None,
1013+
Some(&("d".repeat(64) + "4")),
1014+
]),
1015+
make_utf8_batch(vec![
1016+
Some(&("e".repeat(64) + "5")),
1017+
Some(&("f".repeat(64) + "6")),
1018+
Some(&("g".repeat(64) + "7")),
1019+
Some(&("h".repeat(64) + "8")),
1020+
Some(&("i".repeat(64) + "9")),
1021+
]),
1022+
make_utf8_batch(vec![
1023+
Some("j"),
1024+
Some("k"),
1025+
Some(&("l".repeat(64) + "12")),
1026+
Some(&("m".repeat(64) + "13")),
1027+
Some(&("n".repeat(64) + "14")),
1028+
]),
1029+
]
1030+
}
9901031
Scenario::UTF8View => {
9911032
// Make utf8_view batch including string length <12 and >12 bytes
9921033
// as the internal representation of StringView differs for strings
@@ -1027,11 +1068,15 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTem
10271068
.tempfile()
10281069
.expect("tempfile creation");
10291070

1030-
let props = WriterProperties::builder()
1071+
let mut builder = WriterProperties::builder()
10311072
.set_max_row_group_size(row_per_group)
10321073
.set_bloom_filter_enabled(true)
1033-
.set_statistics_enabled(EnabledStatistics::Page)
1034-
.build();
1074+
.set_statistics_enabled(EnabledStatistics::Page);
1075+
if scenario.truncate_stats() {
1076+
// Use the same length as the default `column_index_truncate_length` so that one value checks both stats
1077+
builder = builder.set_statistics_truncate_length(DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH);
1078+
}
1079+
let props = builder.build();
10351080

10361081
let batches = create_data_batch(scenario);
10371082
let schema = batches[0].schema();

0 commit comments

Comments
 (0)