@@ -85,11 +85,15 @@ public class ParquetMetadataConverter {
8585 private final boolean useSignedStringMinMax ;
8686
8787 public ParquetMetadataConverter () {
88- this . useSignedStringMinMax = false ;
88+ this ( false ) ;
8989 }
9090
9191 public ParquetMetadataConverter (Configuration conf ) {
92- this .useSignedStringMinMax = conf .getBoolean ("parquet.strings.use-signed-order" , false );
92+ this (conf .getBoolean ("parquet.strings.signed-min-max.enabled" , false ));
93+ }
94+
95+ private ParquetMetadataConverter (boolean useSignedStringMinMax ) {
96+ this .useSignedStringMinMax = useSignedStringMinMax ;
9397 }
9498
9599 // NOTE: this cache is for memory savings, not cpu savings, and is used to de-duplicate
@@ -330,14 +334,14 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
330334
331335 // Visible for testing
332336 static org .apache .parquet .column .statistics .Statistics fromParquetStatisticsInternal
333- (String createdBy , Statistics statistics , PrimitiveTypeName type , SortOrder expectedOrder ) {
337+ (String createdBy , Statistics statistics , PrimitiveTypeName type , SortOrder typeSortOrder ) {
334338 // create stats object based on the column type
335339 org .apache .parquet .column .statistics .Statistics stats = org .apache .parquet .column .statistics .Statistics .getStatsBasedOnType (type );
336340 // If there was no statistics written to the footer, create an empty Statistics object and return
337341
338342 // NOTE: See docs in CorruptStatistics for explanation of why this check is needed
339343 if (statistics != null && !CorruptStatistics .shouldIgnoreStatistics (createdBy , type ) &&
340- SortOrder .SIGNED == expectedOrder ) {
344+ SortOrder .SIGNED == typeSortOrder ) {
341345 if (statistics .isSetMax () && statistics .isSetMin ()) {
342346 stats .setMinMaxFromBytes (statistics .min .array (), statistics .max .array ());
343347 }
@@ -348,7 +352,8 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
348352
349353 public org .apache .parquet .column .statistics .Statistics fromParquetStatistics (
350354 String createdBy , Statistics statistics , PrimitiveType type ) {
351- SortOrder expectedOrder = isSignedOrderOkay (type ) ? SortOrder .SIGNED : sortOrder (type );
355+ SortOrder expectedOrder = overrideSortOrderToSigned (type ) ?
356+ SortOrder .SIGNED : sortOrder (type );
352357 return fromParquetStatisticsInternal (
353358 createdBy , statistics , type .getPrimitiveTypeName (), expectedOrder );
354359 }
@@ -359,45 +364,55 @@ enum SortOrder {
359364 UNKNOWN
360365 }
361366
362- private boolean isSignedOrderOkay ( PrimitiveType type ) {
363- if (! useSignedStringMinMax ) {
364- return false ;
365- }
367+ private static final Set < OriginalType > STRING_TYPES = Collections
368+ . unmodifiableSet ( new HashSet <>( Arrays . asList (
369+ OriginalType . UTF8 , OriginalType . ENUM , OriginalType . JSON
370+ )));
366371
372+ /**
373+ * Returns whether to use signed order min and max with a type. It is safe to
374+ * use signed min and max when the type is a string type and contains only
375+ * ASCII characters (where the sign bit was 0). This checks whether the type
376+ * is a string type and uses {@code useSignedStringMinMax} to determine if
377+ * only ASCII characters were written.
378+ *
379+ * @param type a primitive type with a logical type annotation
380+ * @return true if signed order min/max can be used with this type
381+ */
382+ private boolean overrideSortOrderToSigned (PrimitiveType type ) {
367383 // even if the override is set, only return stats for string-ish types
368- if (type .getPrimitiveTypeName () != PrimitiveTypeName .BINARY ) {
369- return false ;
370- }
371- if (type .getOriginalType () == null ) {
372- return true ; // plain binary is okay
373- }
374- switch (type .getOriginalType ()) {
375- case UTF8 :
376- case ENUM :
377- case BSON :
378- case JSON :
379- return true ;
380- default : // includes decimal
381- return false ;
382- }
384+ // a null type annotation is considered string-ish because some writers
385+ // failed to use the UTF8 annotation.
386+ OriginalType annotation = type .getOriginalType ();
387+ return useSignedStringMinMax &&
388+ PrimitiveTypeName .BINARY == type .getPrimitiveTypeName () &&
389+ (annotation == null || STRING_TYPES .contains (annotation ));
383390 }
384391
392+ /**
393+ * @param primitive a primitive physical type
394+ * @return the default sort order used when the logical type is not known
395+ */
385396 private static SortOrder defaultSortOrder (PrimitiveTypeName primitive ) {
386397 switch (primitive ) {
387398 case BOOLEAN :
388399 case INT32 :
389400 case INT64 :
390401 case FLOAT :
391402 case DOUBLE :
392- case BINARY : // without a logical type, signed is okay
393- case FIXED_LEN_BYTE_ARRAY :
394403 return SortOrder .SIGNED ;
404+ case BINARY :
405+ case FIXED_LEN_BYTE_ARRAY :
395406 case INT96 : // only used for timestamp, which uses unsigned values
396407 return SortOrder .UNSIGNED ;
397408 }
398409 return SortOrder .UNKNOWN ;
399410 }
400411
412+ /**
413+ * @param primitive a primitive type with a logical type annotation
414+ * @return the "correct" sort order of the type that applications assume
415+ */
401416 private static SortOrder sortOrder (PrimitiveType primitive ) {
402417 OriginalType annotation = primitive .getOriginalType ();
403418 if (annotation != null ) {
@@ -416,12 +431,12 @@ private static SortOrder sortOrder(PrimitiveType primitive) {
416431 case UINT_16 :
417432 case UINT_32 :
418433 case UINT_64 :
419- case DECIMAL :
420434 case ENUM :
421435 case UTF8 :
422436 case BSON :
423437 case JSON :
424438 return SortOrder .UNSIGNED ;
439+ case DECIMAL :
425440 case LIST :
426441 case MAP :
427442 case MAP_KEY_VALUE :
0 commit comments