
Commit a53db54

Author: Andrei Ionescu (committed)
Fix Iceberg Reader for nested partitions
1 parent c344934 · commit a53db54

3 files changed

Lines changed: 76 additions & 4 deletions


core/src/main/java/org/apache/iceberg/avro/BuildAvroProjection.java

Lines changed: 4 additions & 3 deletions
@@ -81,13 +81,14 @@ public Schema record(Schema record, List<String> names, Iterable<Schema.Field> s
     List<Types.NestedField> expectedFields = struct.fields();
     for (int i = 0; i < expectedFields.size(); i += 1) {
       Types.NestedField field = expectedFields.get(i);
+      String sanitizedFieldName = AvroSchemaUtil.sanitize(field.name());
 
       // detect reordering
-      if (i < fields.size() && !field.name().equals(fields.get(i).name())) {
+      if (i < fields.size() && !sanitizedFieldName.equals(fields.get(i).name())) {
         hasChange = true;
       }
 
-      Schema.Field avroField = updateMap.get(field.name());
+      Schema.Field avroField = updateMap.get(sanitizedFieldName);
 
       if (avroField != null) {
         updatedFields.add(avroField);
@@ -123,7 +124,7 @@ public Schema.Field field(Schema.Field field, Supplier<Schema> fieldResult) {
       return null;
     }
 
-    String expectedName = expectedField.name();
+    String expectedName = AvroSchemaUtil.sanitize(expectedField.name());
 
     this.current = expectedField.type();
     try {
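
The projection visitor above matches expected Iceberg fields against the fields of the Avro file schema, and Avro field names are stored in sanitized form whenever the original name is not a legal Avro identifier. Looking up the raw Iceberg name therefore misses exactly those fields, which is presumably what broke reads once a nested column such as nestedData.moreData became a partition source. A minimal, self-contained sketch of the mismatch; the sanitize() below is a hypothetical stand-in for AvroSchemaUtil.sanitize(), not its real rules:

import java.util.HashMap;
import java.util.Map;

public class SanitizedNameLookupSketch {
  // Hypothetical sanitizer: replace characters that Avro identifiers cannot contain.
  static String sanitize(String name) {
    return name.replaceAll("[^A-Za-z0-9_]", "_");
  }

  public static void main(String[] args) {
    // The Avro-side map is keyed by sanitized names, like updateMap in the patch.
    Map<String, String> updateMap = new HashMap<>();
    updateMap.put(sanitize("nestedData.moreData"), "avro field");

    String icebergName = "nestedData.moreData";
    System.out.println(updateMap.get(icebergName));            // null: the raw name misses
    System.out.println(updateMap.get(sanitize(icebergName)));  // "avro field": the sanitized name matches
  }
}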

spark/src/main/java/org/apache/iceberg/spark/source/Reader.java

Lines changed: 10 additions & 1 deletion
@@ -24,6 +24,7 @@
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Iterators;
 import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 import java.io.Closeable;
 import java.io.IOException;
 import java.io.Serializable;
@@ -396,7 +397,15 @@ private Iterator<InternalRow> open(FileScanTask task) {
     // schema or rows returned by readers
     Schema finalSchema = expectedSchema;
     PartitionSpec spec = task.spec();
-    Set<Integer> idColumns = spec.identitySourceIds();
+
+    Set<Integer> idColumns = Sets.newHashSet();
+    for (Integer i : spec.identitySourceIds()) {
+      if (spec.schema().columns().stream()
+          .noneMatch(j -> j.type().isStructType() && j.type().asStructType().field(i) != null)
+      ) {
+        idColumns.add(i);
+      }
+    }
 
     // schema needed for the projection and filtering
     StructType sparkType = SparkSchemaUtil.convert(finalSchema);
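
Read against the test schema added below, spec.identitySourceIds() returns the source field ids of both identity partitions, 1 for id and 5 for nestedData.moreData. The new loop drops any id that belongs to a field nested inside a top-level struct column, so only id 1 remains in idColumns and the nested value is no longer handled as a top-level identity column by the reader. A standalone sketch of that filtering logic on plain collections, illustrative only and not the Iceberg API:

import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

public class NestedIdentityFilterSketch {
  public static void main(String[] args) {
    // Top-level columns of the test schema, as column id -> ids of fields nested in it.
    Map<Integer, Set<Integer>> structChildren = new LinkedHashMap<>();
    structChildren.put(1, Set.of());        // id
    structChildren.put(2, Set.of());        // data
    structChildren.put(3, Set.of(4, 5));    // nestedData struct (children: id=4, moreData=5)

    // Source field ids of the identity partitions: identity("id"), identity("nestedData.moreData").
    Set<Integer> identitySourceIds = Set.of(1, 5);

    // Same shape as the patched loop: keep only ids that are not children of a struct column.
    Set<Integer> idColumns = new LinkedHashSet<>();
    for (Integer i : identitySourceIds) {
      boolean nestedInStruct = structChildren.values().stream()
          .anyMatch(children -> children.contains(i));
      if (!nestedInStruct) {
        idColumns.add(i);
      }
    }

    System.out.println(idColumns); // prints [1]: only the top-level identity column remains
  }
}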

spark/src/test/java/org/apache/iceberg/spark/source/TestParquetWrite.java

Lines changed: 62 additions & 0 deletions
@@ -32,6 +32,7 @@
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.spark.SparkSchemaUtil;
 import org.apache.iceberg.types.Types;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
@@ -338,4 +339,65 @@ public void testPartitionedCreateWithTargetFileSizeViaOption() throws IOException {
     Assert.assertEquals("Should have 8 DataFiles", 8, files.size());
     Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000));
   }
+
+  @Test
+  public void testNestedPartitioning() throws IOException {
+    Schema nestedSchema = new Schema(
+        optional(1, "id", Types.IntegerType.get()),
+        optional(2, "data", Types.StringType.get()),
+        optional(3, "nestedData", Types.StructType.of(
+            optional(4, "id", Types.IntegerType.get()),
+            optional(5, "moreData", Types.StringType.get())))
+    );
+
+    File parent = temp.newFolder("parquet");
+    File location = new File(parent, "test");
+
+    HadoopTables tables = new HadoopTables(new Configuration());
+    PartitionSpec spec = PartitionSpec.builderFor(nestedSchema)
+        .identity("id")
+        .identity("nestedData.moreData")
+        .build();
+    Table table = tables.create(nestedSchema, spec, location.toString());
+
+    List<String> jsons = Lists.newArrayList(
+        "{ \"id\": 1, \"data\": \"a\", \"nestedData\": { \"id\": 100, \"moreData\": \"p1\"} }",
+        "{ \"id\": 2, \"data\": \"b\", \"nestedData\": { \"id\": 200, \"moreData\": \"p1\"} }",
+        "{ \"id\": 3, \"data\": \"c\", \"nestedData\": { \"id\": 300, \"moreData\": \"p2\"} }",
+        "{ \"id\": 4, \"data\": \"d\", \"nestedData\": { \"id\": 400, \"moreData\": \"p2\"} }"
+    );
+    Dataset<Row> df = spark.read().schema(SparkSchemaUtil.convert(nestedSchema))
+        .json(spark.createDataset(jsons, Encoders.STRING()));
+
+    // TODO: incoming columns must be ordered according to the table's schema
+    df.select("id", "data", "nestedData").write()
+        .format("iceberg")
+        .mode("append")
+        .save(location.toString());
+
+    table.refresh();
+
+    Dataset<Row> result = spark.read()
+        .format("iceberg")
+        .load(location.toString());
+
+    List<Row> actual = result.orderBy("id").collectAsList();
+    Assert.assertEquals("Number of rows should match", jsons.size(), actual.size());
+    Assert.assertEquals("Row 1 col 1 is 1", 1, actual.get(0).getInt(0));
+    Assert.assertEquals("Row 1 col 2 is a", "a", actual.get(0).getString(1));
+    Assert.assertEquals("Row 1 col 3,1 is 100", 100, actual.get(0).getStruct(2).getInt(0));
+    Assert.assertEquals("Row 1 col 3,2 is p1", "p1", actual.get(0).getStruct(2).getString(1));
+    Assert.assertEquals("Row 2 col 1 is 2", 2, actual.get(1).getInt(0));
+    Assert.assertEquals("Row 2 col 2 is b", "b", actual.get(1).getString(1));
+    Assert.assertEquals("Row 2 col 3,1 is 200", 200, actual.get(1).getStruct(2).getInt(0));
+    Assert.assertEquals("Row 2 col 3,2 is p1", "p1", actual.get(1).getStruct(2).getString(1));
+    Assert.assertEquals("Row 3 col 1 is 3", 3, actual.get(2).getInt(0));
+    Assert.assertEquals("Row 3 col 2 is c", "c", actual.get(2).getString(1));
+    Assert.assertEquals("Row 3 col 3,1 is 300", 300, actual.get(2).getStruct(2).getInt(0));
+    Assert.assertEquals("Row 3 col 3,2 is p2", "p2", actual.get(2).getStruct(2).getString(1));
+    Assert.assertEquals("Row 4 col 1 is 4", 4, actual.get(3).getInt(0));
+    Assert.assertEquals("Row 4 col 2 is d", "d", actual.get(3).getString(1));
+    Assert.assertEquals("Row 4 col 3,1 is 400", 400, actual.get(3).getStruct(2).getInt(0));
+    Assert.assertEquals("Row 4 col 3,2 is p2", "p2", actual.get(3).getStruct(2).getString(1));
+  }
 }
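
A possible follow-up check, not part of this commit: with the reader fix, the nested column backing the second identity partition can be filtered like any other column when the table is read back. The sketch below reuses spark and location from testNestedPartitioning() above:

// Hypothetical extra assertion, appended to testNestedPartitioning().
Dataset<Row> p1Rows = spark.read()
    .format("iceberg")
    .load(location.toString())
    .where("nestedData.moreData = 'p1'");
Assert.assertEquals("Two rows belong to partition p1", 2, p1Rows.count());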
