Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions parquet/src/arrow/arrow_reader/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1403,6 +1403,48 @@ impl<'a> StatisticsConverter<'a> {
max_statistics(data_type, iter, self.physical_type)
}

/// Extract the `is_max_value_exact` flags from row group statistics in [`RowGroupMetaData`]
///
/// See docs on [`Self::row_group_maxes`] for details
pub fn row_group_is_max_value_exact<I>(&self, metadatas: I) -> Result<BooleanArray>
where
    I: IntoIterator<Item = &'a RowGroupMetaData>,
{
    match self.parquet_column_index {
        // Column present in the parquet file: pull the exactness flag out of
        // each row group's statistics (null when statistics are absent).
        Some(parquet_index) => {
            let flags = metadatas.into_iter().map(|rg| {
                rg.column(parquet_index)
                    .statistics()
                    .map(|s| s.max_is_exact())
            });
            Ok(BooleanArray::from_iter(flags))
        }
        // Column not present: emit one null per row group.
        None => {
            let nulls = metadatas.into_iter().map(|_| None::<bool>);
            Ok(BooleanArray::from_iter(nulls))
        }
    }
}

/// Extract the `is_min_value_exact` flags from row group statistics in [`RowGroupMetaData`]
///
/// See docs on [`Self::row_group_mins`] for details
pub fn row_group_is_min_value_exact<I>(&self, metadatas: I) -> Result<BooleanArray>
where
    I: IntoIterator<Item = &'a RowGroupMetaData>,
{
    match self.parquet_column_index {
        // Column present in the parquet file: pull the exactness flag out of
        // each row group's statistics (null when statistics are absent).
        Some(parquet_index) => {
            let flags = metadatas.into_iter().map(|rg| {
                rg.column(parquet_index)
                    .statistics()
                    .map(|s| s.min_is_exact())
            });
            Ok(BooleanArray::from_iter(flags))
        }
        // Column not present: emit one null per row group.
        None => {
            let nulls = metadatas.into_iter().map(|_| None::<bool>);
            Ok(BooleanArray::from_iter(nulls))
        }
    }
}

/// Extract the null counts from row group statistics in [`RowGroupMetaData`]
///
/// See docs on [`Self::row_group_mins`] for details
Expand Down
53 changes: 49 additions & 4 deletions parquet/tests/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ use chrono::Datelike;
use chrono::{Duration, TimeDelta};
use half::f16;
use parquet::arrow::ArrowWriter;
use parquet::file::properties::{EnabledStatistics, WriterProperties};
use parquet::file::properties::{
EnabledStatistics, WriterProperties, DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
};
use std::sync::Arc;
use tempfile::NamedTempFile;

Expand Down Expand Up @@ -91,10 +93,20 @@ enum Scenario {
PeriodsInColumnNames,
StructArray,
UTF8,
/// UTF8 with max and min values truncated
TruncatedUTF8,
UTF8View,
BinaryView,
}

impl Scenario {
    /// Returns true when this scenario should enable
    /// `set_statistics_truncate_length` on the writer properties, so that
    /// truncation of row group statistics can be tested.
    fn truncate_stats(&self) -> bool {
        matches!(self, Scenario::TruncatedUTF8)
    }
}

fn make_boolean_batch(v: Vec<Option<bool>>) -> RecordBatch {
let schema = Arc::new(Schema::new(vec![Field::new(
"bool",
Expand Down Expand Up @@ -631,6 +643,8 @@ fn make_dict_batch() -> RecordBatch {
.unwrap()
}

/// Create data batches for the given scenario.
/// `make_test_file_rg` uses the first batch to infer the schema of the file.
fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
match scenario {
Scenario::Boolean => {
Expand Down Expand Up @@ -987,6 +1001,33 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
make_utf8_batch(vec![Some("e"), Some("f"), Some("g"), Some("h"), Some("i")]),
]
}
Scenario::TruncatedUTF8 => {
// Make utf8 batch with strings longer than 64 bytes
// to check truncation of row group statistics
vec![
make_utf8_batch(vec![
Some(&("a".repeat(64) + "1")),
Some(&("b".repeat(64) + "2")),
Some(&("c".repeat(64) + "3")),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Some(&("c".repeat(64) + "3")),
None,

Try throwing a null in the first batch, that seems to clear up the issue with empty string showing up in the stats. It seems like the schema is inferred from the first batch, so all values are not nullable. Putting a null in the first batch fixes that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds reasonable to me. But I still wonder whether it is expected that a first batch without null values causes the Nones in subsequent batches to be converted into empty strings.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll have to study the API used here in more detail, but I would assume one would usually provide a schema prior to writing batches. The issue here AFAICT is that a null value is written, but the schema says values are not nullable, so the null is converted to an empty string. I think I'd prefer an error be thrown in this instance.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, with sleep I now see the schema is set in make_test_file_rg...the first batch is used.

let schema = batches[0].schema();
let mut writer = ArrowWriter::try_new(&mut output_file, schema, Some(props)).unwrap();

Perhaps a note should be added that if nulls are desired, they need to be included in the first batch.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah thanks! So it's not a bug then.

None,
Some(&("d".repeat(64) + "4")),
]),
make_utf8_batch(vec![
Some(&("e".repeat(64) + "5")),
Some(&("f".repeat(64) + "6")),
Some(&("g".repeat(64) + "7")),
Some(&("h".repeat(64) + "8")),
Some(&("i".repeat(64) + "9")),
]),
make_utf8_batch(vec![
Some("j"),
Some("k"),
Some(&("l".repeat(64) + "12")),
Some(&("m".repeat(64) + "13")),
Some(&("n".repeat(64) + "14")),
]),
]
}
Scenario::UTF8View => {
// Make utf8_view batch including string length <12 and >12 bytes
// as the internal representation of StringView is differed for strings
Expand Down Expand Up @@ -1027,11 +1068,15 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTem
.tempfile()
.expect("tempfile creation");

let props = WriterProperties::builder()
let mut builder = WriterProperties::builder()
.set_max_row_group_size(row_per_group)
.set_bloom_filter_enabled(true)
.set_statistics_enabled(EnabledStatistics::Page)
.build();
.set_statistics_enabled(EnabledStatistics::Page);
if scenario.truncate_stats() {
// Use the same value as the default `column_index_truncate_length` so both kinds of stats can be checked with one setting
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will not be needed after #7578
(but that won't be merged for a while)

builder = builder.set_statistics_truncate_length(DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH);
}
let props = builder.build();

let batches = create_data_batch(scenario);
let schema = batches[0].schema();
Expand Down
Loading
Loading