diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 09f8ec7cc274..cffa60e62e96 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -1403,6 +1403,48 @@ impl<'a> StatisticsConverter<'a> { max_statistics(data_type, iter, self.physical_type) } + /// Extract the `is_max_value_exact` flags from row group statistics in [`RowGroupMetaData`] + /// + /// See docs on [`Self::row_group_maxes`] for details + pub fn row_group_is_max_value_exact<I>(&self, metadatas: I) -> Result<BooleanArray> + where + I: IntoIterator<Item = &'a RowGroupMetaData>, + { + let Some(parquet_index) = self.parquet_column_index else { + let num_row_groups = metadatas.into_iter().count(); + return Ok(BooleanArray::from_iter( + std::iter::repeat(None).take(num_row_groups), + )); + }; + + let is_max_value_exact = metadatas + .into_iter() + .map(|x| x.column(parquet_index).statistics()) + .map(|s| s.map(|s| s.max_is_exact())); + Ok(BooleanArray::from_iter(is_max_value_exact)) + } + + /// Extract the `is_min_value_exact` flags from row group statistics in [`RowGroupMetaData`] + /// + /// See docs on [`Self::row_group_mins`] for details + pub fn row_group_is_min_value_exact<I>(&self, metadatas: I) -> Result<BooleanArray> + where + I: IntoIterator<Item = &'a RowGroupMetaData>, + { + let Some(parquet_index) = self.parquet_column_index else { + let num_row_groups = metadatas.into_iter().count(); + return Ok(BooleanArray::from_iter( + std::iter::repeat(None).take(num_row_groups), + )); + }; + + let is_min_value_exact = metadatas + .into_iter() + .map(|x| x.column(parquet_index).statistics()) + .map(|s| s.map(|s| s.min_is_exact())); + Ok(BooleanArray::from_iter(is_min_value_exact)) + } + /// Extract the null counts from row group statistics in [`RowGroupMetaData`] /// /// See docs on [`Self::row_group_mins`] for details diff --git a/parquet/tests/arrow_reader/mod.rs b/parquet/tests/arrow_reader/mod.rs index 0e6783583cd5..21aa1c3f26f0 100644 --- a/parquet/tests/arrow_reader/mod.rs +++ 
b/parquet/tests/arrow_reader/mod.rs @@ -31,7 +31,9 @@ use chrono::Datelike; use chrono::{Duration, TimeDelta}; use half::f16; use parquet::arrow::ArrowWriter; -use parquet::file::properties::{EnabledStatistics, WriterProperties}; +use parquet::file::properties::{ + EnabledStatistics, WriterProperties, DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, +}; use std::sync::Arc; use tempfile::NamedTempFile; @@ -91,10 +93,20 @@ enum Scenario { PeriodsInColumnNames, StructArray, UTF8, + /// UTF8 with max and min values truncated + TruncatedUTF8, UTF8View, BinaryView, } +impl Scenario { + // Whether the test scenario needs to set `set_statistics_truncate_length` to test + // statistics truncation. + fn truncate_stats(&self) -> bool { + matches!(self, Scenario::TruncatedUTF8) + } +} + fn make_boolean_batch(v: Vec<Option<bool>>) -> RecordBatch { let schema = Arc::new(Schema::new(vec![Field::new( "bool", @@ -631,6 +643,8 @@ fn make_dict_batch() -> RecordBatch { .unwrap() } +/// Create data batches for the given scenario. +/// `make_test_file_rg` uses the first batch to infer the schema of the file. 
fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> { match scenario { Scenario::Boolean => { @@ -987,6 +1001,33 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> { make_utf8_batch(vec![Some("e"), Some("f"), Some("g"), Some("h"), Some("i")]), ] } + Scenario::TruncatedUTF8 => { + // Make utf8 batch with strings longer than 64 bytes + // to check truncation of row group statistics + vec![ + make_utf8_batch(vec![ + Some(&("a".repeat(64) + "1")), + Some(&("b".repeat(64) + "2")), + Some(&("c".repeat(64) + "3")), + None, + Some(&("d".repeat(64) + "4")), + ]), + make_utf8_batch(vec![ + Some(&("e".repeat(64) + "5")), + Some(&("f".repeat(64) + "6")), + Some(&("g".repeat(64) + "7")), + Some(&("h".repeat(64) + "8")), + Some(&("i".repeat(64) + "9")), + ]), + make_utf8_batch(vec![ + Some("j"), + Some("k"), + Some(&("l".repeat(64) + "12")), + Some(&("m".repeat(64) + "13")), + Some(&("n".repeat(64) + "14")), + ]), + ] + } Scenario::UTF8View => { // Make utf8_view batch including string length <12 and >12 bytes // as the internal representation of StringView is differed for strings @@ -1027,11 +1068,15 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTem .tempfile() .expect("tempfile creation"); - let props = WriterProperties::builder() + let mut builder = WriterProperties::builder() .set_max_row_group_size(row_per_group) .set_bloom_filter_enabled(true) - .set_statistics_enabled(EnabledStatistics::Page) - .build(); + .set_statistics_enabled(EnabledStatistics::Page); + if scenario.truncate_stats() { + // The same as default `column_index_truncate_length` to check both stats with one value + builder = builder.set_statistics_truncate_length(DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH); + } + let props = builder.build(); let batches = create_data_batch(scenario); let schema = batches[0].schema(); diff --git a/parquet/tests/arrow_reader/statistics.rs b/parquet/tests/arrow_reader/statistics.rs index 0eb0fc2b277f..7a389fb5eb9a 100644 --- 
a/parquet/tests/arrow_reader/statistics.rs +++ b/parquet/tests/arrow_reader/statistics.rs @@ -212,6 +212,8 @@ struct Test<'a> { expected_max: ArrayRef, expected_null_counts: UInt64Array, expected_row_counts: Option<UInt64Array>, + expected_max_value_exact: BooleanArray, + expected_min_value_exact: BooleanArray, /// Which column to extract statistics from column_name: &'static str, /// What statistics should be checked? @@ -245,6 +247,8 @@ impl Test<'_> { expected_max, expected_null_counts, expected_row_counts, + expected_max_value_exact: expected_max_exact, + expected_min_value_exact: expected_min_exact, column_name, check, } = self; @@ -328,6 +332,24 @@ impl Test<'_> { "{column_name}: Mismatch with expected row counts. \ Actual: {row_counts:?}. Expected: {expected_row_counts:?}" ); + + let is_max_value_exact = converter + .row_group_is_max_value_exact(reader.metadata().row_groups().iter()) + .unwrap(); + assert_eq!( + is_max_value_exact, expected_max_exact, + "{column_name}: Mismatch with expected max value exactness. \ + Actual: {is_max_value_exact:?}. Expected: {expected_max_exact:?}" + ); + + let is_min_value_exact = converter + .row_group_is_min_value_exact(reader.metadata().row_groups().iter()) + .unwrap(); + assert_eq!( + is_min_value_exact, expected_min_exact, + "{column_name}: Mismatch with expected min value exactness. \ + Actual: {is_min_value_exact:?}. Expected: {expected_min_exact:?}" + ); } } @@ -354,7 +376,49 @@ impl Test<'_> { // // Remaining cases // f64::NAN -// - Using truncated statistics ("exact min value" and "exact max value" https://docs.rs/parquet/latest/parquet/file/statistics/enum.Statistics.html#method.max_is_exact) + +#[tokio::test] +async fn test_max_and_min_value_truncated() { + let reader = TestReader { + scenario: Scenario::TruncatedUTF8, + row_per_group: 5, + } + .build() + .await; + + Test { + reader: &reader, + // min is truncated to + // 1. `"a".repeat(64)`, original value is `"a".repeat(64) + "1"` + // 2. 
`"e".repeat(64)`, original value is `"e".repeat(64) + "5"` + // 3. "j", as expected with no truncation + expected_min: Arc::new(StringArray::from(vec![ + &("a".repeat(64)), + &("e".repeat(64)), + "j", + ])), + // max is truncated to + // 1. `"d".repeat(63) + "e"`, original value is `"d".repeat(64) + "4"` + // 2. `"i".repeat(63) + "j"`, original value is `"i".repeat(64) + "9"` + // 3. `"n".repeat(63) + "o"`, original value is `"n".repeat(64) + "14"` + expected_max: Arc::new(StringArray::from(vec![ + "d".repeat(63) + "e", + "i".repeat(63) + "j", + "n".repeat(63) + "o", + ])), + // one null in the first row group, none in the others + expected_null_counts: UInt64Array::from(vec![1, 0, 0]), + // 5 rows in each of the 3 row groups + expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // all max values are truncated + expected_max_value_exact: BooleanArray::from(vec![false, false, false]), + // min values are truncated in the first two row groups + expected_min_value_exact: BooleanArray::from(vec![false, false, true]), + column_name: "utf8", + check: Check::Both, + } + .run() +} #[tokio::test] async fn test_one_row_group_without_null() { @@ -377,6 +441,9 @@ async fn test_one_row_group_without_null() { expected_null_counts: UInt64Array::from(vec![0]), // 3 rows expected_row_counts: Some(UInt64Array::from(vec![3])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true]), + expected_min_value_exact: BooleanArray::from(vec![true]), column_name: "i64", check: Check::Both, } @@ -404,6 +471,9 @@ async fn test_one_row_group_with_null_and_negative() { expected_null_counts: UInt64Array::from(vec![2]), // 8 rows expected_row_counts: Some(UInt64Array::from(vec![8])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true]), + expected_min_value_exact: BooleanArray::from(vec![true]), column_name: "i64", check: Check::Both, } @@ -431,6 +501,9 @@ async fn test_two_row_group_with_null() { expected_null_counts: UInt64Array::from(vec![0, 2]), // row counts are [10, 5] expected_row_counts: 
Some(UInt64Array::from(vec![10, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "i64", check: Check::Both, } @@ -458,6 +531,8 @@ async fn test_two_row_groups_with_all_nulls_in_one() { expected_null_counts: UInt64Array::from(vec![1, 3]), // row counts are [5, 3] expected_row_counts: Some(UInt64Array::from(vec![5, 3])), + expected_max_value_exact: BooleanArray::from(vec![true, false]), + expected_min_value_exact: BooleanArray::from(vec![true, false]), column_name: "i64", check: Check::Both, } @@ -489,6 +564,8 @@ async fn test_multiple_data_pages_nulls_and_negatives() { expected_max: Arc::new(Int64Array::from(vec![Some(2), Some(6), Some(9), None])), expected_null_counts: UInt64Array::from(vec![0, 0, 1, 2]), expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 2])), + expected_max_value_exact: BooleanArray::from(vec![true, true, true, false]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, false]), column_name: "i64", check: Check::DataPage, } @@ -551,6 +628,8 @@ async fn test_data_page_stats_with_all_null_page() { expected_max: new_null_array(expected_data_type, 1), expected_null_counts: UInt64Array::from(vec![4]), expected_row_counts: Some(UInt64Array::from(vec![4])), + expected_max_value_exact: BooleanArray::from(vec![false]), + expected_min_value_exact: BooleanArray::from(vec![false]), column_name: "col", check: Check::DataPage, } @@ -585,6 +664,9 @@ async fn test_int_64() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "i64", check: Check::Both, } @@ -611,6 +693,9 @@ async fn test_int_32() { expected_null_counts: 
UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "i32", check: Check::Both, } @@ -637,6 +722,9 @@ async fn test_int_16() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "i16", check: Check::Both, } @@ -663,6 +751,9 @@ async fn test_int_8() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "i8", check: Check::Both, } @@ -699,6 +790,9 @@ async fn test_float_16() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "f", check: Check::Both, } @@ -725,6 +819,9 @@ async fn test_float_32() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "f", 
check: Check::Both, } @@ -751,6 +848,9 @@ async fn test_float_64() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "f", check: Check::Both, } @@ -801,6 +901,9 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "nanos", check: Check::Both, } @@ -830,6 +933,9 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "nanos_timezoned", check: Check::Both, } @@ -852,6 +958,9 @@ async fn test_timestamp() { ])), expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "micros", check: Check::Both, } @@ -881,6 +990,9 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + 
expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "micros_timezoned", check: Check::Both, } @@ -903,6 +1015,9 @@ async fn test_timestamp() { ])), expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "millis", check: Check::Both, } @@ -932,6 +1047,10 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), + column_name: "millis_timezoned", check: Check::Both, } @@ -954,6 +1073,10 @@ async fn test_timestamp() { ])), expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), + column_name: "seconds", check: Check::Both, } @@ -983,6 +1106,10 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), + column_name: "seconds_timezoned", check: Check::Both, } @@ -1029,6 +1156,9 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: 
Some(UInt64Array::from(vec![8, 8, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "nanos", check: Check::Both, } @@ -1056,6 +1186,9 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "nanos_timezoned", check: Check::Both, } @@ -1076,6 +1209,9 @@ async fn test_timestamp_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![1, 2, 1]), expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "micros", check: Check::Both, } @@ -1103,6 +1239,9 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "micros_timezoned", check: Check::Both, } @@ -1123,6 +1262,9 @@ async fn test_timestamp_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![1, 2, 1]), expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "millis", check: Check::Both, } @@ -1150,6 +1292,9 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: 
UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "millis_timezoned", check: Check::Both, } @@ -1170,6 +1315,9 @@ async fn test_timestamp_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![1, 2, 1]), expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "seconds", check: Check::Both, } @@ -1197,6 +1345,9 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "seconds_timezoned", check: Check::Both, } @@ -1235,6 +1386,9 @@ async fn test_dates_32_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![2, 2]), // row counts are [13, 7] expected_row_counts: Some(UInt64Array::from(vec![13, 7])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "date32", check: Check::Both, } @@ -1258,6 +1412,9 @@ async fn test_time32_second_diff_rg_sizes() { expected_max: Arc::new(Time32SecondArray::from(vec![18509, 18513, 18517, 18521])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + 
expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "second", check: Check::Both, } @@ -1285,6 +1442,9 @@ async fn test_time32_millisecond_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "millisecond", check: Check::Both, } @@ -1318,6 +1478,9 @@ async fn test_time64_microsecond_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "microsecond", check: Check::Both, } @@ -1351,6 +1514,9 @@ async fn test_time64_nanosecond_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), column_name: "nanosecond", check: Check::Both, } @@ -1378,6 +1544,9 @@ async fn test_dates_64_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![2, 2]), expected_row_counts: Some(UInt64Array::from(vec![13, 7])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "date64", check: Check::Both, } @@ -1406,6 +1575,9 @@ async fn test_uint() { expected_max: 
Arc::new(UInt8Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true, true]), column_name: "u8", check: Check::Both, } @@ -1417,6 +1589,9 @@ async fn test_uint() { expected_max: Arc::new(UInt16Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true, true]), column_name: "u16", check: Check::Both, } @@ -1428,6 +1603,9 @@ async fn test_uint() { expected_max: Arc::new(UInt32Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true, true]), column_name: "u32", check: Check::Both, } @@ -1439,6 +1617,9 @@ async fn test_uint() { expected_max: Arc::new(UInt64Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4, 4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true, true]), column_name: "u64", check: Check::Both, } @@ -1462,6 +1643,9 @@ async fn test_int32_range() { expected_max: Arc::new(Int32Array::from(vec![300000])), expected_null_counts: UInt64Array::from(vec![0]), 
expected_row_counts: Some(UInt64Array::from(vec![4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true]), + expected_min_value_exact: BooleanArray::from(vec![true]), column_name: "i", check: Check::Both, } @@ -1485,6 +1669,9 @@ async fn test_uint32_range() { expected_max: Arc::new(UInt32Array::from(vec![300000])), expected_null_counts: UInt64Array::from(vec![0]), expected_row_counts: Some(UInt64Array::from(vec![4])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true]), + expected_min_value_exact: BooleanArray::from(vec![true]), column_name: "u", check: Check::Both, } @@ -1507,6 +1694,9 @@ async fn test_numeric_limits_unsigned() { expected_max: Arc::new(UInt8Array::from(vec![100, u8::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "u8", check: Check::Both, } @@ -1518,6 +1708,9 @@ async fn test_numeric_limits_unsigned() { expected_max: Arc::new(UInt16Array::from(vec![100, u16::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "u16", check: Check::Both, } @@ -1529,6 +1722,9 @@ async fn test_numeric_limits_unsigned() { expected_max: Arc::new(UInt32Array::from(vec![100, u32::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "u32", check: Check::Both, } @@ -1540,6 +1736,9 @@ async fn test_numeric_limits_unsigned() { 
expected_max: Arc::new(UInt64Array::from(vec![100, u64::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "u64", check: Check::Both, } @@ -1562,6 +1761,9 @@ async fn test_numeric_limits_signed() { expected_max: Arc::new(Int8Array::from(vec![100, i8::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "i8", check: Check::Both, } @@ -1573,6 +1775,9 @@ async fn test_numeric_limits_signed() { expected_max: Arc::new(Int16Array::from(vec![100, i16::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "i16", check: Check::Both, } @@ -1584,6 +1789,9 @@ async fn test_numeric_limits_signed() { expected_max: Arc::new(Int32Array::from(vec![100, i32::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "i32", check: Check::Both, } @@ -1595,6 +1803,9 @@ async fn test_numeric_limits_signed() { expected_max: Arc::new(Int64Array::from(vec![100, i64::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + 
expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "i64", check: Check::Both, } @@ -1617,6 +1828,9 @@ async fn test_numeric_limits_float() { expected_max: Arc::new(Float32Array::from(vec![100.0, f32::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "f32", check: Check::Both, } @@ -1628,6 +1842,9 @@ async fn test_numeric_limits_float() { expected_max: Arc::new(Float64Array::from(vec![100.0, f64::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "f64", check: Check::Both, } @@ -1639,6 +1856,9 @@ async fn test_numeric_limits_float() { expected_max: Arc::new(Float32Array::from(vec![100.0, -100.0])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "f32_nan", check: Check::Both, } @@ -1650,6 +1870,9 @@ async fn test_numeric_limits_float() { expected_max: Arc::new(Float64Array::from(vec![100.0, -100.0])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "f64_nan", check: Check::Both, } @@ -1673,6 +1896,10 @@ async fn test_float64() { expected_max: Arc::new(Float64Array::from(vec![-1.0, 0.0, 4.0, 9.0])), expected_null_counts: 
UInt64Array::from(vec![0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), + column_name: "f", check: Check::Both, } @@ -1706,6 +1933,10 @@ async fn test_float16() { )), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true, true]), + column_name: "f", check: Check::Both, } @@ -1737,6 +1968,9 @@ async fn test_decimal() { ), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "decimal_col", check: Check::Both, } @@ -1767,6 +2001,9 @@ async fn test_decimal_256() { ), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "decimal256_col", check: Check::Both, } @@ -1787,6 +2024,9 @@ async fn test_dictionary() { expected_max: Arc::new(StringArray::from(vec!["def", "fffff"])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "string_dict_i8", check: Check::Both, } @@ -1798,6 +2038,9 @@ async fn test_dictionary() { expected_max: 
Arc::new(StringArray::from(vec!["def", "fffff"])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "string_dict_i32", check: Check::Both, } @@ -1809,6 +2052,9 @@ async fn test_dictionary() { expected_max: Arc::new(Int64Array::from(vec![0, 100])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "int_dict_i8", check: Check::Both, } @@ -1847,6 +2093,9 @@ async fn test_byte() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "name", check: Check::Both, } @@ -1867,6 +2116,9 @@ async fn test_byte() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "service_string", check: Check::Both, } @@ -1886,6 +2138,9 @@ async fn test_byte() { expected_max: Arc::new(BinaryArray::from(expected_service_binary_max_values)), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "service_binary", check: 
Check::Both, } @@ -1903,6 +2158,9 @@ async fn test_byte() { expected_max: Arc::new(FixedSizeBinaryArray::try_from_iter(max_input.into_iter()).unwrap()), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "service_fixedsize", check: Check::Both, } @@ -1924,6 +2182,9 @@ async fn test_byte() { )), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "service_large_binary", check: Check::Both, } @@ -1957,6 +2218,9 @@ async fn test_period_in_column_names() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "name", check: Check::Both, } @@ -1969,6 +2233,9 @@ async fn test_period_in_column_names() { expected_max: Arc::new(StringArray::from(vec!["frontend", "frontend", "backend"])), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "service.name", check: Check::Both, } @@ -1993,6 +2260,9 @@ async fn test_boolean() { expected_max: Arc::new(BooleanArray::from(vec![true, false])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5])), + // stats are exact + 
expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "bool", check: Check::Both, } @@ -2020,6 +2290,8 @@ async fn test_struct() { expected_max: Arc::new(struct_array(vec![(Some(2), Some(8.5), Some(14.0))])), expected_null_counts: UInt64Array::from(vec![0]), expected_row_counts: Some(UInt64Array::from(vec![3])), + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "struct", check: Check::RowGroup, } @@ -2043,6 +2315,9 @@ async fn test_utf8() { expected_max: Arc::new(StringArray::from(vec!["d", "i"])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "utf8", check: Check::Both, } @@ -2055,6 +2330,9 @@ async fn test_utf8() { expected_max: Arc::new(LargeStringArray::from(vec!["d", "i"])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "large_utf8", check: Check::Both, } @@ -2082,6 +2360,9 @@ async fn test_utf8_view() { ])), expected_null_counts: UInt64Array::from(vec![1, 3, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "utf8_view", check: Check::Both, } @@ -2109,6 +2390,9 @@ async fn test_binary_view() { expected_max: Arc::new(BinaryViewArray::from(expected_max)), expected_null_counts: UInt64Array::from(vec![1, 3, 0]), 
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true, true]), column_name: "binary_view", check: Check::Both, } @@ -2135,6 +2419,8 @@ async fn test_missing_statistics() { expected_max: Arc::new(Int64Array::from(vec![None])), expected_null_counts: UInt64Array::from(vec![None]), expected_row_counts: Some(UInt64Array::from(vec![3])), // still has row count statistics + expected_max_value_exact: BooleanArray::from(vec![None]), + expected_min_value_exact: BooleanArray::from(vec![None]), column_name: "i64", check: Check::Both, } @@ -2216,6 +2502,9 @@ async fn test_column_not_found() { expected_max: Arc::new(Int64Array::from(vec![18564, 21865])), expected_null_counts: UInt64Array::from(vec![2, 2]), expected_row_counts: Some(UInt64Array::from(vec![13, 7])), + // stats are exact + expected_max_value_exact: BooleanArray::from(vec![true, true]), + expected_min_value_exact: BooleanArray::from(vec![true, true]), column_name: "not_a_column", check: Check::Both, } @@ -2251,6 +2540,8 @@ async fn test_column_non_existent() { expected_null_counts: UInt64Array::from(vec![None, None, None, None]), // row counts are [5, 5, 5, 5] expected_row_counts: None, + expected_max_value_exact: BooleanArray::from(vec![None, None, None, None]), + expected_min_value_exact: BooleanArray::from(vec![None, None, None, None]), column_name: "i_do_not_exist", check: Check::Both, }