diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 8aac5d74391f..b43af1fbdda3 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -3018,7 +3018,10 @@ mod tests {
         // write data
         // and check the offset index and column index
         let page_writer = get_test_page_writer();
-        let props = Default::default();
+        let props = WriterProperties::builder()
+            .set_statistics_truncate_length(None) // disable column index truncation
+            .build()
+            .into();
         let mut writer = get_test_column_writer::<FixedLenByteArrayType>(page_writer, 0, 0, props);
 
         let mut data = vec![FixedLenByteArray::default(); 3];
@@ -3214,6 +3217,49 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_statistics_truncating_byte_array_default() {
+        let page_writer = get_test_page_writer();
+
+        // The default truncate length is 64 bytes
+        let props = WriterProperties::builder().build().into();
+        let mut writer = get_test_column_writer::<ByteArrayType>(page_writer, 0, 0, props);
+
+        let mut data = vec![ByteArray::default(); 1];
+        data[0].set_data(Bytes::from(String::from(
+            "This string is longer than 64 bytes, so it will almost certainly be truncated.",
+        )));
+        writer.write_batch(&data, None, None).unwrap();
+        writer.flush_data_pages().unwrap();
+
+        let r = writer.close().unwrap();
+
+        assert_eq!(1, r.rows_written);
+
+        let stats = r.metadata.statistics().expect("statistics");
+        if let Statistics::ByteArray(_stats) = stats {
+            let min_value = _stats.min_opt().unwrap();
+            let max_value = _stats.max_opt().unwrap();
+
+            assert!(!_stats.min_is_exact());
+            assert!(!_stats.max_is_exact());
+
+            let expected_len = 64;
+            assert_eq!(min_value.len(), expected_len);
+            assert_eq!(max_value.len(), expected_len);
+
+            let expected_min =
+                "This string is longer than 64 bytes, so it will almost certainly".as_bytes();
+            assert_eq!(expected_min, min_value.as_bytes());
+            // note the max value is different from the min value: the last byte is incremented
+            let expected_max =
+                "This string is longer than 64 bytes, so it will almost certainlz".as_bytes();
+            assert_eq!(expected_max, max_value.as_bytes());
+        } else {
+            panic!("expecting Statistics::ByteArray");
+        }
+    }
+
     #[test]
     fn test_statistics_truncating_byte_array() {
         let page_writer = get_test_page_writer();
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index a84d58bcce89..280661d2a2dc 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -58,7 +58,7 @@ pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
 /// Default value for [`BloomFilterProperties::ndv`]
 pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
 /// Default values for [`WriterProperties::statistics_truncate_length`]
-pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
+pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
 /// Default value for [`WriterProperties::offset_index_disabled`]
 pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
 /// Default values for [`WriterProperties::coerce_types`]
@@ -657,7 +657,7 @@ impl WriterPropertiesBuilder {
     }
 
     /// Sets the max length of min/max value fields in row group and data page header
-    /// [`Statistics`] (defaults to `None` (no limit) via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
+    /// [`Statistics`] (defaults to `Some(64)` via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
     ///
     /// # Notes
     /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is