Skip to content

Commit dbfb6a9

Browse files
committed
PARQUET-686: Address review comments.
1 parent 8cdab31 commit dbfb6a9

4 files changed

Lines changed: 51 additions & 37 deletions

File tree

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,15 @@ public class ParquetMetadataConverter {
8585
private final boolean useSignedStringMinMax;
8686

8787
public ParquetMetadataConverter() {
88-
this.useSignedStringMinMax = false;
88+
this(false);
8989
}
9090

9191
public ParquetMetadataConverter(Configuration conf) {
92-
this.useSignedStringMinMax = conf.getBoolean("parquet.strings.use-signed-order", false);
92+
this(conf.getBoolean("parquet.strings.signed-min-max.enabled", false));
93+
}
94+
95+
private ParquetMetadataConverter(boolean useSignedStringMinMax) {
96+
this.useSignedStringMinMax = useSignedStringMinMax;
9397
}
9498

9599
// NOTE: this cache is for memory savings, not cpu savings, and is used to de-duplicate
@@ -330,14 +334,14 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
330334

331335
// Visible for testing
332336
static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal
333-
(String createdBy, Statistics statistics, PrimitiveTypeName type, SortOrder expectedOrder) {
337+
(String createdBy, Statistics statistics, PrimitiveTypeName type, SortOrder typeSortOrder) {
334338
// create stats object based on the column type
335339
org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType(type);
336340
// If there was no statistics written to the footer, create an empty Statistics object and return
337341

338342
// NOTE: See docs in CorruptStatistics for explanation of why this check is needed
339343
if (statistics != null && !CorruptStatistics.shouldIgnoreStatistics(createdBy, type) &&
340-
SortOrder.SIGNED == expectedOrder) {
344+
SortOrder.SIGNED == typeSortOrder) {
341345
if (statistics.isSetMax() && statistics.isSetMin()) {
342346
stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array());
343347
}
@@ -348,7 +352,8 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
348352

349353
public org.apache.parquet.column.statistics.Statistics fromParquetStatistics(
350354
String createdBy, Statistics statistics, PrimitiveType type) {
351-
SortOrder expectedOrder = isSignedOrderOkay(type) ? SortOrder.SIGNED : sortOrder(type);
355+
SortOrder expectedOrder = overrideSortOrderToSigned(type) ?
356+
SortOrder.SIGNED : sortOrder(type);
352357
return fromParquetStatisticsInternal(
353358
createdBy, statistics, type.getPrimitiveTypeName(), expectedOrder);
354359
}
@@ -359,45 +364,55 @@ enum SortOrder {
359364
UNKNOWN
360365
}
361366

362-
private boolean isSignedOrderOkay(PrimitiveType type) {
363-
if (!useSignedStringMinMax) {
364-
return false;
365-
}
367+
private static final Set<OriginalType> STRING_TYPES = Collections
368+
.unmodifiableSet(new HashSet<>(Arrays.asList(
369+
OriginalType.UTF8, OriginalType.ENUM, OriginalType.JSON
370+
)));
366371

372+
/**
373+
* Returns whether to use signed order min and max with a type. It is safe to
374+
* use signed min and max when the type is a string type and contains only
375+
* ASCII characters (where the sign bit was 0). This checks whether the type
376+
* is a string type and uses {@code useSignedStringMinMax} to determine if
377+
* only ASCII characters were written.
378+
*
379+
* @param type a primitive type with a logical type annotation
380+
* @return true if signed order min/max can be used with this type
381+
*/
382+
private boolean overrideSortOrderToSigned(PrimitiveType type) {
367383
// even if the override is set, only return stats for string-ish types
368-
if (type.getPrimitiveTypeName() != PrimitiveTypeName.BINARY) {
369-
return false;
370-
}
371-
if (type.getOriginalType() == null) {
372-
return true; // plain binary is okay
373-
}
374-
switch (type.getOriginalType()) {
375-
case UTF8:
376-
case ENUM:
377-
case BSON:
378-
case JSON:
379-
return true;
380-
default: // includes decimal
381-
return false;
382-
}
384+
// a null type annotation is considered string-ish because some writers
385+
// failed to use the UTF8 annotation.
386+
OriginalType annotation = type.getOriginalType();
387+
return useSignedStringMinMax &&
388+
PrimitiveTypeName.BINARY == type.getPrimitiveTypeName() &&
389+
(annotation == null || STRING_TYPES.contains(annotation));
383390
}
384391

392+
/**
393+
* @param primitive a primitive physical type
394+
* @return the default sort order used when the logical type is not known
395+
*/
385396
private static SortOrder defaultSortOrder(PrimitiveTypeName primitive) {
386397
switch (primitive) {
387398
case BOOLEAN:
388399
case INT32:
389400
case INT64:
390401
case FLOAT:
391402
case DOUBLE:
392-
case BINARY: // without a logical type, signed is okay
393-
case FIXED_LEN_BYTE_ARRAY:
394403
return SortOrder.SIGNED;
404+
case BINARY:
405+
case FIXED_LEN_BYTE_ARRAY:
395406
case INT96: // only used for timestamp, which uses unsigned values
396407
return SortOrder.UNSIGNED;
397408
}
398409
return SortOrder.UNKNOWN;
399410
}
400411

412+
/**
413+
* @param primitive a primitive type with a logical type annotation
414+
* @return the "correct" sort order of the type that applications assume
415+
*/
401416
private static SortOrder sortOrder(PrimitiveType primitive) {
402417
OriginalType annotation = primitive.getOriginalType();
403418
if (annotation != null) {
@@ -416,12 +431,12 @@ private static SortOrder sortOrder(PrimitiveType primitive) {
416431
case UINT_16:
417432
case UINT_32:
418433
case UINT_64:
419-
case DECIMAL:
420434
case ENUM:
421435
case UTF8:
422436
case BSON:
423437
case JSON:
424438
return SortOrder.UNSIGNED;
439+
case DECIMAL:
425440
case LIST:
426441
case MAP:
427442
case MAP_KEY_VALUE:

parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -541,7 +541,7 @@ public void testIgnoreStatsWithSignedSortOrder() {
541541
public void testUseStatsWithSignedSortOrder() {
542542
// override defaults and use stats that were accumulated using signed order
543543
Configuration conf = new Configuration();
544-
conf.set("parquet.strings.use-signed-order", "true");
544+
conf.setBoolean("parquet.strings.signed-min-max.enabled", true);
545545

546546
ParquetMetadataConverter converter = new ParquetMetadataConverter(conf);
547547
BinaryStatistics stats = new BinaryStatistics();
@@ -554,7 +554,8 @@ public void testUseStatsWithSignedSortOrder() {
554554
Statistics convertedStats = converter.fromParquetStatistics(
555555
Version.FULL_VERSION,
556556
ParquetMetadataConverter.toParquetStatistics(stats),
557-
Types.required(PrimitiveTypeName.BINARY).named("b"));
557+
Types.required(PrimitiveTypeName.BINARY)
558+
.as(OriginalType.UTF8).named("b"));
558559

559560
Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
560561
Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@
2424
import org.apache.hadoop.fs.FileStatus;
2525
import org.apache.hadoop.fs.FileSystem;
2626
import org.apache.hadoop.fs.Path;
27-
import org.apache.parquet.CorruptStatistics;
2827
import org.apache.parquet.Version;
29-
import org.apache.parquet.VersionParser;
3028
import org.apache.parquet.bytes.BytesUtils;
3129
import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
3230
import org.junit.Assume;
@@ -452,9 +450,9 @@ public void testWriteReadStatistics() throws Exception {
452450

453451
Path path = new Path(testFile.toURI());
454452
Configuration configuration = new Configuration();
455-
configuration.set("parquet.strings.use-signed-order", "true");
453+
configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
456454

457-
MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;} required group c { required int64 d; }}");
455+
MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b (UTF8);} required group c { required int64 d; }}");
458456
String[] path1 = {"a", "b"};
459457
ColumnDescriptor c1 = schema.getColumnDescription(path1);
460458
String[] path2 = {"c", "d"};
@@ -585,14 +583,14 @@ public void testWriteReadStatisticsAllNulls() throws Exception {
585583
testFile.delete();
586584

587585
writeSchema = "message example {\n" +
588-
"required binary content;\n" +
586+
"required binary content (UTF8);\n" +
589587
"}";
590588

591589
Path path = new Path(testFile.toURI());
592590

593591
MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
594592
Configuration configuration = new Configuration();
595-
configuration.set("parquet.strings.use-signed-order", "true");
593+
configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
596594
GroupWriteSupport.setSchema(schema, configuration);
597595

598596
ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport());

parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestThriftToParquetFileWriter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ public void testWriteStatistics() throws Exception {
114114
new RequiredPrimitiveFixture(false, (byte)100, (short)100, 100, 287l, -9.0d, "world"),
115115
new RequiredPrimitiveFixture(true, (byte)2, (short)2, 9, -17l, 9.63d, "hello"));
116116
final Configuration configuration = new Configuration();
117-
configuration.set("parquet.strings.use-signed-order", "true");
117+
configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
118118
final FileSystem fs = p.getFileSystem(configuration);
119119
FileStatus fileStatus = fs.getFileStatus(p);
120120
ParquetMetadata footer = ParquetFileReader.readFooter(configuration, p);
@@ -161,7 +161,7 @@ public void testWriteStatistics() throws Exception {
161161

162162
// make new configuration and create file with new large stats
163163
final Configuration configuration_large = new Configuration();
164-
configuration_large.set("parquet.strings.use-signed-order", "true");
164+
configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
165165
final FileSystem fs_large = p_large.getFileSystem(configuration_large);
166166
FileStatus fileStatus_large = fs_large.getFileStatus(p_large);
167167
ParquetMetadata footer_large = ParquetFileReader.readFooter(configuration_large, p_large);

0 commit comments

Comments
 (0)