1919package org .apache .parquet .hadoop ;
2020
2121import static java .util .Collections .emptyList ;
22+ import static java .util .stream .Collectors .toList ;
2223import static org .apache .parquet .filter2 .predicate .FilterApi .and ;
2324import static org .apache .parquet .filter2 .predicate .FilterApi .binaryColumn ;
2425import static org .apache .parquet .filter2 .predicate .FilterApi .doubleColumn ;
3334import static org .apache .parquet .filter2 .predicate .FilterApi .userDefined ;
3435import static org .apache .parquet .filter2 .predicate .LogicalInverter .invert ;
3536import static org .apache .parquet .hadoop .ParquetFileWriter .Mode .OVERWRITE ;
37+ import static org .apache .parquet .schema .LogicalTypeAnnotation .stringType ;
38+ import static org .apache .parquet .schema .PrimitiveType .PrimitiveTypeName .BINARY ;
39+ import static org .apache .parquet .schema .PrimitiveType .PrimitiveTypeName .DOUBLE ;
40+ import static org .apache .parquet .schema .PrimitiveType .PrimitiveTypeName .INT64 ;
41+ import static org .apache .parquet .schema .Types .optional ;
42+ import static org .apache .parquet .schema .Types .required ;
3643import static org .junit .Assert .assertEquals ;
3744import static org .junit .Assert .assertFalse ;
3845import static org .junit .Assert .assertTrue ;
6471import org .apache .parquet .filter2 .recordlevel .PhoneBookWriter .Location ;
6572import org .apache .parquet .filter2 .recordlevel .PhoneBookWriter .PhoneNumber ;
6673import org .apache .parquet .filter2 .recordlevel .PhoneBookWriter .User ;
74+ import org .apache .parquet .hadoop .api .ReadSupport ;
6775import org .apache .parquet .hadoop .example .ExampleParquetWriter ;
6876import org .apache .parquet .hadoop .example .GroupReadSupport ;
6977import org .apache .parquet .io .api .Binary ;
78+ import org .apache .parquet .schema .MessageType ;
79+ import org .apache .parquet .schema .Types ;
7080import org .junit .AfterClass ;
7181import org .junit .BeforeClass ;
7282import org .junit .Test ;
@@ -87,6 +97,19 @@ public class TestColumnIndexFiltering {
8797 private static final List <User > DATA = Collections .unmodifiableList (generateData (10000 ));
8898 private static final Path FILE_V1 = createTempFile ();
8999 private static final Path FILE_V2 = createTempFile ();
100+ private static final MessageType SCHEMA_WITHOUT_NAME = Types .buildMessage ()
101+ .required (INT64 ).named ("id" )
102+ .optionalGroup ()
103+ .addField (optional (DOUBLE ).named ("lon" ))
104+ .addField (optional (DOUBLE ).named ("lat" ))
105+ .named ("location" )
106+ .optionalGroup ()
107+ .repeatedGroup ()
108+ .addField (required (INT64 ).named ("number" ))
109+ .addField (optional (BINARY ).as (stringType ()).named ("kind" ))
110+ .named ("phone" )
111+ .named ("phoneNumbers" )
112+ .named ("user_without_name" );
90113
91114 @ Parameters
92115 public static Collection <Object []> params () {
@@ -199,6 +222,16 @@ private List<User> readUsers(Filter filter, boolean useOtherFiltering, boolean u
199222 .useColumnIndexFilter (useColumnIndexFilter ));
200223 }
201224
225+ private List <User > readUsersWithProjection (Filter filter , MessageType schema , boolean useOtherFiltering , boolean useColumnIndexFilter ) throws IOException {
226+ return PhoneBookWriter .readUsers (ParquetReader .builder (new GroupReadSupport (), file )
227+ .withFilter (filter )
228+ .useDictionaryFilter (useOtherFiltering )
229+ .useStatsFilter (useOtherFiltering )
230+ .useRecordFilter (useOtherFiltering )
231+ .useColumnIndexFilter (useColumnIndexFilter )
232+ .set (ReadSupport .PARQUET_READ_SCHEMA , schema .toString ()));
233+ }
234+
202235 // Assumes that both lists are in the same order
203236 private static void assertContains (Stream <User > expected , List <User > actual ) {
204237 Iterator <User > expIt = expected .iterator ();
@@ -441,4 +474,21 @@ record -> record.getId() == 1234,
441474 or (eq (longColumn ("id" ), 1234l ),
442475 userDefined (longColumn ("not-existing-long" ), new IsDivisibleBy (1 ))));
443476 }
477+
478+ @ Test
479+ public void testFilteringWithProjection () throws IOException {
480+ // All rows shall be retrieved because all values in column 'name' shall be handled as null values
481+ assertEquals (
482+ DATA .stream ().map (user -> user .cloneWithName (null )).collect (toList ()),
483+ readUsersWithProjection (FilterCompat .get (eq (binaryColumn ("name" ), null )), SCHEMA_WITHOUT_NAME , true , true ));
484+
485+ // Column index filter shall drop all pages because all values in column 'name' shall be handled as null values
486+ assertEquals (
487+ emptyList (),
488+ readUsersWithProjection (FilterCompat .get (notEq (binaryColumn ("name" ), null )), SCHEMA_WITHOUT_NAME , false , true ));
489+ assertEquals (
490+ emptyList (),
491+ readUsersWithProjection (FilterCompat .get (userDefined (binaryColumn ("name" ), NameStartsWithVowel .class )),
492+ SCHEMA_WITHOUT_NAME , false , true ));
493+ }
444494}
0 commit comments