Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parquet/src/arrow/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ mod test {
.unwrap();
assert_eq!(
err.to_string(),
"EOF: Parquet file too small. Page index range 82..115 overlaps with file metadata 0..341"
"EOF: Parquet file too small. Page index range 82..115 overlaps with file metadata 0..357"
);
}

Expand Down
57 changes: 57 additions & 0 deletions parquet/src/file/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,9 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> {
if let Some(statistics) = metadata.statistics() {
builder = builder.set_statistics(statistics.clone())
}
if let Some(page_encoding_stats) = metadata.page_encoding_stats() {
builder = builder.set_page_encoding_stats(page_encoding_stats.clone())
}
builder = self.set_column_crypto_metadata(builder, &metadata);
close.metadata = builder.build()?;

Expand Down Expand Up @@ -1004,6 +1007,7 @@ mod tests {
use crate::column::reader::get_typed_column_reader;
use crate::compression::{create_codec, Codec, CodecOptionsBuilder};
use crate::data_type::{BoolType, ByteArrayType, Int32Type};
use crate::file::page_encoding_stats::PageEncodingStats;
use crate::file::page_index::index::Index;
use crate::file::properties::EnabledStatistics;
use crate::file::serialized_reader::ReadOptionsBuilder;
Expand Down Expand Up @@ -2394,4 +2398,57 @@ mod tests {
start += 1;
}
}

#[test]
fn test_page_encoding_statistics_roundtrip() {
    // Single required UTF8 column; the parser ignores surrounding whitespace.
    let message_type = "
        message test_schema {
            REQUIRED BYTE_ARRAY a (UTF8);
        }
    ";
    let schema = Arc::new(parse_message_type(message_type).unwrap());

    // Deterministic byte-array data; enabling page-level statistics also
    // causes per-page encoding statistics to be recorded for the chunk.
    let values = ByteArrayType::gen_vec(32, 7);
    let temp: File = tempfile::tempfile().unwrap();
    let writer_props = Arc::new(
        WriterProperties::builder()
            .set_statistics_enabled(EnabledStatistics::Page)
            .build(),
    );

    // Write one row group containing a single column chunk, then close
    // everything so the footer metadata is finalized.
    let mut writer = SerializedFileWriter::new(&temp, schema, writer_props).unwrap();
    let mut rg_writer = writer.next_row_group().unwrap();
    let mut column = rg_writer.next_column().unwrap().unwrap();
    column
        .typed::<ByteArrayType>()
        .write_batch(&values, None, None)
        .unwrap();
    column.close().unwrap();
    rg_writer.close().unwrap();
    let file_metadata = writer.close().unwrap();

    // The thrift metadata returned by the writer must carry encoding stats.
    assert_eq!(file_metadata.row_groups.len(), 1);
    assert_eq!(file_metadata.row_groups[0].columns.len(), 1);
    let chunk_meta = file_metadata.row_groups[0].columns[0]
        .meta_data
        .as_ref()
        .expect("column metadata missing");
    assert!(chunk_meta.encoding_stats.is_some());
    let written_stats = chunk_meta.encoding_stats.as_ref().unwrap();

    // Re-open the file and confirm the stats survive a read round-trip.
    let options = ReadOptionsBuilder::new().with_page_index().build();
    let reader = SerializedFileReader::new_with_options(temp, options).unwrap();
    let row_group = reader.get_row_group(0).expect("row group missing");
    assert_eq!(row_group.num_columns(), 1);
    let column_meta = row_group.metadata().column(0);
    assert!(column_meta.page_encoding_stats().is_some());
    let read_stats = column_meta.page_encoding_stats().unwrap();

    // Decode the written thrift stats into the typed form and compare with
    // what the reader reconstructed.
    let decoded: Vec<PageEncodingStats> = written_stats
        .iter()
        .map(|s| crate::file::page_encoding_stats::try_from_thrift(s).unwrap())
        .collect();
    assert_eq!(&decoded, read_stats);
}
}
Loading