@@ -42,7 +42,7 @@ use arrow::datatypes::{
4242 Float32Type as ArrowFloat32Type , Float64Type as ArrowFloat64Type ,
4343 Int16Type as ArrowInt16Type , Int32Type as ArrowInt32Type ,
4444 Int64Type as ArrowInt64Type , Int8Type as ArrowInt8Type , IntervalUnit , Schema ,
45- Time32MillisecondType as ArrowTime32MillisecondType ,
45+ SchemaRef , Time32MillisecondType as ArrowTime32MillisecondType ,
4646 Time32SecondType as ArrowTime32SecondType ,
4747 Time64MicrosecondType as ArrowTime64MicrosecondType ,
4848 Time64NanosecondType as ArrowTime64NanosecondType , TimeUnit as ArrowTimeUnit ,
@@ -91,7 +91,7 @@ pub use byte_array::make_byte_array_reader;
9191pub use byte_array_dictionary:: make_byte_array_dictionary_reader;
9292
9393/// Array reader reads parquet data into arrow array.
94- pub trait ArrayReader {
94+ pub trait ArrayReader : Send {
9595 fn as_any ( & self ) -> & dyn Any ;
9696
9797 /// Returns the arrow type of this array reader.
@@ -117,6 +117,26 @@ pub trait ArrayReader {
117117 fn get_rep_levels ( & self ) -> Option < & [ i16 ] > ;
118118}
119119
120+ /// A collection of row groups
121+ pub trait RowGroupCollection {
122+ /// Get schema of parquet file.
123+ fn schema ( & self ) -> Result < SchemaDescPtr > ;
124+
125+ /// Returns an iterator over the column chunks for particular column
126+ fn column_chunks ( & self , i : usize ) -> Result < Box < dyn PageIterator > > ;
127+ }
128+
129+ impl RowGroupCollection for Arc < dyn FileReader > {
130+ fn schema ( & self ) -> Result < SchemaDescPtr > {
131+ Ok ( self . metadata ( ) . file_metadata ( ) . schema_descr_ptr ( ) )
132+ }
133+
134+ fn column_chunks ( & self , column_index : usize ) -> Result < Box < dyn PageIterator > > {
135+ let iterator = FilePageIterator :: new ( column_index, Arc :: clone ( self ) ) ?;
136+ Ok ( Box :: new ( iterator) )
137+ }
138+ }
139+
120140/// Uses `record_reader` to read up to `batch_size` records from `pages`
121141///
122142/// Returns the number of records read, which can be less than batch_size if
@@ -482,7 +502,7 @@ where
482502impl < T , C > ArrayReader for ComplexObjectArrayReader < T , C >
483503where
484504 T : DataType ,
485- C : Converter < Vec < Option < T :: T > > , ArrayRef > + ' static ,
505+ C : Converter < Vec < Option < T :: T > > , ArrayRef > + Send + ' static ,
486506{
487507 fn as_any ( & self ) -> & dyn Any {
488508 self
@@ -1315,9 +1335,9 @@ impl ArrayReader for StructArrayReader {
13151335/// Create array reader from parquet schema, arrow schema, column indices, and a collection of row groups.
13161336pub fn build_array_reader < T > (
13171337 parquet_schema : SchemaDescPtr ,
1318- arrow_schema : Schema ,
1338+ arrow_schema : SchemaRef ,
13191339 column_indices : T ,
1320- file_reader : Arc < dyn FileReader > ,
1340+ row_groups : Box < dyn RowGroupCollection > ,
13211341) -> Result < Box < dyn ArrayReader > >
13221342where
13231343 T : IntoIterator < Item = usize > ,
@@ -1355,13 +1375,8 @@ where
13551375 fields : filtered_root_fields,
13561376 } ;
13571377
1358- ArrayReaderBuilder :: new (
1359- Arc :: new ( proj) ,
1360- Arc :: new ( arrow_schema) ,
1361- Arc :: new ( leaves) ,
1362- file_reader,
1363- )
1364- . build_array_reader ( )
1378+ ArrayReaderBuilder :: new ( Arc :: new ( proj) , arrow_schema, Arc :: new ( leaves) , row_groups)
1379+ . build_array_reader ( )
13651380}
13661381
13671382/// Used to build array reader.
@@ -1371,7 +1386,7 @@ struct ArrayReaderBuilder {
13711386 // Key: columns that need to be included in final array builder
13721387 // Value: column index in schema
13731388 columns_included : Arc < HashMap < * const Type , usize > > ,
1374- file_reader : Arc < dyn FileReader > ,
1389+ row_groups : Box < dyn RowGroupCollection > ,
13751390}
13761391
13771392/// Used in type visitor.
@@ -1671,13 +1686,13 @@ impl<'a> ArrayReaderBuilder {
16711686 root_schema : TypePtr ,
16721687 arrow_schema : Arc < Schema > ,
16731688 columns_included : Arc < HashMap < * const Type , usize > > ,
1674- file_reader : Arc < dyn FileReader > ,
1689+ file_reader : Box < dyn RowGroupCollection > ,
16751690 ) -> Self {
16761691 Self {
16771692 root_schema,
16781693 arrow_schema,
16791694 columns_included,
1680- file_reader,
1695+ row_groups : file_reader,
16811696 }
16821697 }
16831698
@@ -1711,10 +1726,10 @@ impl<'a> ArrayReaderBuilder {
17111726 context. rep_level ,
17121727 context. path . clone ( ) ,
17131728 ) ) ;
1714- let page_iterator = Box :: new ( FilePageIterator :: new (
1715- self . columns_included [ & ( cur_type . as_ref ( ) as * const Type ) ] ,
1716- self . file_reader . clone ( ) ,
1717- ) ? ) ;
1729+
1730+ let page_iterator = self
1731+ . row_groups
1732+ . column_chunks ( self . columns_included [ & ( cur_type . as_ref ( ) as * const Type ) ] ) ? ;
17181733
17191734 let arrow_type: Option < ArrowType > = self
17201735 . get_arrow_field ( & cur_type, context)
@@ -2827,7 +2842,8 @@ mod tests {
28272842 #[ test]
28282843 fn test_create_array_reader ( ) {
28292844 let file = get_test_file ( "nulls.snappy.parquet" ) ;
2830- let file_reader = Arc :: new ( SerializedFileReader :: new ( file) . unwrap ( ) ) ;
2845+ let file_reader: Arc < dyn FileReader > =
2846+ Arc :: new ( SerializedFileReader :: new ( file) . unwrap ( ) ) ;
28312847
28322848 let file_metadata = file_reader. metadata ( ) . file_metadata ( ) ;
28332849 let arrow_schema = parquet_to_arrow_schema (
@@ -2838,9 +2854,9 @@ mod tests {
28382854
28392855 let array_reader = build_array_reader (
28402856 file_reader. metadata ( ) . file_metadata ( ) . schema_descr_ptr ( ) ,
2841- arrow_schema,
2857+ Arc :: new ( arrow_schema) ,
28422858 vec ! [ 0usize ] . into_iter ( ) ,
2843- file_reader,
2859+ Box :: new ( file_reader) ,
28442860 )
28452861 . unwrap ( ) ;
28462862
0 commit comments