
Commit a1c8e6e

Added more test cases
1 parent 4068021 commit a1c8e6e


4 files changed: +178 / -39 lines


core/src/main/java/org/apache/iceberg/hadoop/Util.java

Lines changed: 0 additions & 1 deletion
@@ -33,7 +33,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-
 public class Util {
   private static final Logger LOG = LoggerFactory.getLogger(Util.class);
 

mr/src/main/java/org/apache/iceberg/mr/SerializationUtil.java

Lines changed: 2 additions & 6 deletions
@@ -26,8 +26,6 @@
 import java.io.ObjectOutputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.Base64;
-import java.util.zip.GZIPInputStream;
-import java.util.zip.GZIPOutputStream;
 import org.apache.iceberg.exceptions.RuntimeIOException;
 
 
@@ -38,8 +36,7 @@ private SerializationUtil() {
 
   public static byte[] serializeToBytes(Object obj) {
     try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
-         GZIPOutputStream gos = new GZIPOutputStream(baos);
-         ObjectOutputStream oos = new ObjectOutputStream(gos)) {
+         ObjectOutputStream oos = new ObjectOutputStream(baos)) {
       oos.writeObject(obj);
       return baos.toByteArray();
     } catch (IOException e) {
@@ -54,8 +51,7 @@ public static <T> T deserializeFromBytes(byte[] bytes) {
     }
 
     try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
-         GZIPInputStream gis = new GZIPInputStream(bais);
-         ObjectInputStream ois = new ObjectInputStream(gis)) {
+         ObjectInputStream ois = new ObjectInputStream(bais)) {
       return (T) ois.readObject();
     } catch (IOException e) {
       throw new RuntimeIOException("Failed to deserialize object", e);
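
Note: the change above drops the GZIP wrapping around plain Java serialization. Below is a minimal, self-contained sketch of the resulting round-trip; the method names mirror SerializationUtil, but this is an illustration rather than the file's full contents (the real methods wrap failures in RuntimeIOException instead of throwing checked exceptions).

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Arrays;

// Illustrative sketch only: plain Java serialization with no GZIP layer, as after this commit.
public class SerializationRoundTripSketch {

  static byte[] serializeToBytes(Object obj) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
      oos.writeObject(obj);
    }
    // toByteArray() is called after close(), so the object stream is fully flushed.
    return baos.toByteArray();
  }

  @SuppressWarnings("unchecked")
  static <T> T deserializeFromBytes(byte[] bytes) throws IOException, ClassNotFoundException {
    try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
         ObjectInputStream ois = new ObjectInputStream(bais)) {
      return (T) ois.readObject();
    }
  }

  public static void main(String[] args) throws Exception {
    String[] payload = {"iceberg", "mr", "split"};
    byte[] bytes = serializeToBytes(payload);
    String[] copy = deserializeFromBytes(bytes);
    System.out.println(Arrays.toString(copy)); // prints [iceberg, mr, split]
  }
}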

mr/src/main/java/org/apache/iceberg/mr/IcebergInputFormat.java renamed to mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java

Lines changed: 17 additions & 13 deletions
@@ -17,11 +17,12 @@
  * under the License.
  */
 
-package org.apache.iceberg.mr;
+package org.apache.iceberg.mr.mapreduce;
 
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Iterators;
 import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 import java.io.Closeable;
 import java.io.DataInput;
 import java.io.DataOutput;
@@ -65,6 +66,7 @@
 import org.apache.iceberg.hadoop.Util;
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.mr.SerializationUtil;
 import org.apache.iceberg.orc.ORC;
 import org.apache.iceberg.parquet.Parquet;
 import org.apache.iceberg.types.TypeUtil;
@@ -85,7 +87,7 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
   static final String FILTER_EXPRESSION = "iceberg.mr.filter.expression";
   static final String IN_MEMORY_DATA_MODEL = "iceberg.mr.in.memory.data.model";
   static final String READ_SCHEMA = "iceberg.mr.read.schema";
-  static final String REUSE_CONTAINERS = "iceberg.mr.case.sensitive";
+  static final String REUSE_CONTAINERS = "iceberg.mr.reuse.containers";
   static final String SNAPSHOT_ID = "iceberg.mr.snapshot.id";
   static final String SPLIT_SIZE = "iceberg.mr.split.size";
   static final String TABLE_PATH = "iceberg.mr.table.path";
@@ -197,7 +199,7 @@ public ConfigBuilder usePigTuples() {
    * can correctly apply the residual filters, then it
    * should call this api. Otherwise the current
    * api will throw an exception if the passed in
-   * filter is not completely satisfied. Note. This
+   * filter is not completely satisfied. Note: This
    * does not apply to standalone MR application
    */
   public ConfigBuilder platformAppliesFilterResiduals() {
@@ -264,7 +266,7 @@ private static void checkResiduals(Configuration conf, CombinedScanTask task) {
       if (residual != null && !residual.equals(Expressions.alwaysTrue())) {
         throw new RuntimeException(
             String.format(
-                "Filter expression %s is not completely satisfied . Additional rows " +
+                "Filter expression %s is not completely satisfied. Additional rows " +
                 "can be returned not satisfied by the filter expression", residual));
       }
     });
@@ -346,14 +348,15 @@ private Iterator<T> open(FileScanTask currentTask) {
     DataFile file = currentTask.file();
     // schema of rows returned by readers
     PartitionSpec spec = currentTask.spec();
-    Set<Integer> idColumns = spec.identitySourceIds();
     Schema readSchema = expectedSchema != null ? expectedSchema : tableSchema;
+    Set<Integer> idColumns = Sets.intersection(spec.identitySourceIds(), TypeUtil.getProjectedIds(readSchema));
     boolean hasJoinedPartitionColumns = !idColumns.isEmpty();
+
     if (hasJoinedPartitionColumns) {
-      readSchema = TypeUtil.selectNot(tableSchema, idColumns);
-      Schema identityPartitionSchema = TypeUtil.select(tableSchema, idColumns);
+      Schema readDataSchema = TypeUtil.selectNot(readSchema, idColumns);
+      Schema identityPartitionSchema = TypeUtil.select(readSchema, idColumns);
       return Iterators.transform(
-          open(currentTask, readSchema),
+          open(currentTask, readDataSchema),
           row -> withPartitionColumns(row, identityPartitionSchema, spec, file.partition()));
     } else {
       return open(currentTask, readSchema);
@@ -482,15 +485,15 @@ private CloseableIterable<T> newParquetIterable(InputFile inputFile, FileScanTas
 
   private CloseableIterable<T> newOrcIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
     ORC.ReadBuilder orcReadBuilder = ORC.read(inputFile)
-        .schema(readSchema)
+        .project(readSchema)
        .caseSensitive(caseSensitive)
        .split(task.start(), task.length());
     // ORC does not support reuse containers yet
     switch (inMemoryDataModel) {
       case PIG:
       case HIVE:
-        //TODO implement value readers for Pig and Hive
-        throw new UnsupportedOperationException();
+        //TODO: implement value readers for Pig and Hive
+        throw new UnsupportedOperationException("In memory representation not yet supported for Pig and Hive");
       case DEFAULT:
         //TODO: We do not have support for Iceberg generics for ORC
         throw new UnsupportedOperationException();
@@ -502,6 +505,7 @@ private CloseableIterable<T> newOrcIterable(InputFile inputFile, FileScanTask ta
 
   private static Table findTable(Configuration conf) {
     String path = conf.get(TABLE_PATH);
+    Preconditions.checkArgument(path != null, "Table path should not be null");
     String catalogFuncClass = conf.get(CATALOG);
     if (catalogFuncClass != null) {
       Function<Configuration, Catalog> catalogFunc
@@ -521,8 +525,8 @@ private static Table findTable(Configuration conf) {
     }
   }
 
-  private static class IcebergSplit extends InputSplit implements Writable {
-    private static final String[] ANYWHERE = new String[]{"*"};
+  static class IcebergSplit extends InputSplit implements Writable {
+    static final String[] ANYWHERE = new String[]{"*"};
     private CombinedScanTask task;
     private transient String[] locations;
     private transient Configuration conf;
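
Note: taken together, the package rename and the new Preconditions check mean a caller must always supply a table location (or a table identifier plus catalog function) through the ConfigBuilder. A hedged sketch of a standalone driver wiring this up follows; configure, readFrom and filter come from this diff and setInputFormatClass from the standard Hadoop API, while the driver class name and its argument handling are hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.mr.mapreduce.IcebergInputFormat;

// Hypothetical driver: illustrates the ConfigBuilder API exercised by the tests below.
public class IcebergInputFormatDriverSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());

    // Without readFrom(...), findTable() now fails fast with "Table path should not be null".
    IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
    configBuilder
        .readFrom(args[0])                                  // HadoopTables location or table identifier
        .filter(Expressions.equal("date", "2020-03-20"));   // pushed down; unsatisfied residuals throw

    job.setInputFormatClass(IcebergInputFormat.class);
    // ... set mapper, reducer and output as usual, then submit the job.
  }
}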

mr/src/test/java/org/apache/iceberg/mr/TestIcebergInputFormat.java renamed to mr/src/test/java/org/apache/iceberg/mr/mapreduce/TestIcebergInputFormat.java

Lines changed: 159 additions & 19 deletions
@@ -17,10 +17,11 @@
  * under the License.
  */
 
-package org.apache.iceberg.mr;
+package org.apache.iceberg.mr.mapreduce;
 
 import com.google.common.collect.FluentIterable;
 import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
@@ -34,6 +35,7 @@
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.TaskAttemptID;
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.apache.iceberg.AssertHelpers;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.DataFiles;
 import org.apache.iceberg.FileFormat;
@@ -51,10 +53,12 @@
 import org.apache.iceberg.data.Record;
 import org.apache.iceberg.data.avro.DataWriter;
 import org.apache.iceberg.data.parquet.GenericParquetWriter;
+import org.apache.iceberg.expressions.Expressions;
 import org.apache.iceberg.hadoop.HadoopCatalog;
 import org.apache.iceberg.hadoop.HadoopTables;
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.types.TypeUtil;
 import org.apache.iceberg.types.Types;
 import org.junit.Assert;
 import org.junit.Before;
@@ -69,15 +73,15 @@
 
 @RunWith(Parameterized.class)
 public class TestIcebergInputFormat {
-  private static final Schema SCHEMA = new Schema(
+  static final Schema SCHEMA = new Schema(
       required(1, "data", Types.StringType.get()),
-      required(3, "id", Types.LongType.get()),
-      required(2, "date", Types.StringType.get()));
+      required(2, "id", Types.LongType.get()),
+      required(3, "date", Types.StringType.get()));
 
-  private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA)
-      .identity("date")
-      .bucket("id", 1)
-      .build();
+  static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA)
+      .identity("date")
+      .bucket("id", 1)
+      .build();
 
   @Rule
   public TemporaryFolder temp = new TemporaryFolder();
@@ -116,7 +120,10 @@ public void testUnpartitionedTable() throws Exception {
     table.newAppend()
         .appendFile(dataFile)
         .commit();
-    validate(conf, location.toString(), null, expectedRecords);
+    Job job = Job.getInstance(conf);
+    IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
+    configBuilder.readFrom(location.toString());
+    validate(job, expectedRecords);
   }
 
   @Test
@@ -132,7 +139,136 @@ public void testPartitionedTable() throws Exception {
     table.newAppend()
        .appendFile(dataFile)
        .commit();
-    validate(conf, location.toString(), null, expectedRecords);
+
+    Job job = Job.getInstance(conf);
+    IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
+    configBuilder.readFrom(location.toString());
+    validate(job, expectedRecords);
+  }
+
+  @Test
+  public void testFilterExp() throws Exception {
+    File location = temp.newFolder(format.name());
+    Assert.assertTrue(location.delete());
+    Table table = tables.create(SCHEMA, SPEC,
+        ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
+        location.toString());
+    List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
+    expectedRecords.get(0).set(2, "2020-03-20");
+    expectedRecords.get(1).set(2, "2020-03-20");
+    DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
+    DataFile dataFile2 = writeFile(table, Row.of("2020-03-21", 0), format,
+        RandomGenericData.generate(table.schema(), 2, 0L));
+    table.newAppend()
+        .appendFile(dataFile1)
+        .appendFile(dataFile2)
+        .commit();
+    Job job = Job.getInstance(conf);
+    IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
+    configBuilder.readFrom(location.toString())
+        .filter(Expressions.equal("date", "2020-03-20"));
+    validate(job, expectedRecords);
+  }
+
+  @Test
+  public void testResiduals() throws Exception {
+    File location = temp.newFolder(format.name());
+    Assert.assertTrue(location.delete());
+    Table table = tables.create(SCHEMA, SPEC,
+        ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
+        location.toString());
+    List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
+    expectedRecords.get(0).set(2, "2020-03-20");
+    expectedRecords.get(1).set(2, "2020-03-20");
+    DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
+    table.newAppend()
+        .appendFile(dataFile)
+        .commit();
+    Job job = Job.getInstance(conf);
+    IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
+    configBuilder.readFrom(location.toString())
+        .filter(Expressions.and(
+            Expressions.equal("date", "2020-03-20"),
+            Expressions.equal("id", 0)));
+
+    AssertHelpers.assertThrows(
+        "Residuals are not evaluated today for Iceberg Generics In memory model",
+        RuntimeException.class, "Filter expression ref(name=\"id\") == 0 is not completely satisfied.",
+        () -> validate(job, expectedRecords));
+  }
+
+  @Test
+  public void testProjection() throws Exception {
+    File location = temp.newFolder(format.name());
+    Assert.assertTrue(location.delete());
+    Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
+    Table table = tables.create(SCHEMA, SPEC,
+        ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
+        location.toString());
+    List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
+    DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
+    table.newAppend()
+        .appendFile(dataFile)
+        .commit();
+
+    Job job = Job.getInstance(conf);
+    IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
+    configBuilder
+        .readFrom(location.toString())
+        .project(projectedSchema);
+    List<Record> outputRecords = readRecords(job.getConfiguration());
+    Assert.assertEquals(inputRecords.size(), outputRecords.size());
+    Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
+  }
+
+  @Test
+  public void testSnapshotReads() throws Exception {
+    File location = temp.newFolder(format.name());
+    Assert.assertTrue(location.delete());
+    Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
+        ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
+        location.toString());
+    List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
+    table.newAppend()
+        .appendFile(writeFile(table, null, format, expectedRecords))
+        .commit();
+    long snapshotId = table.currentSnapshot().snapshotId();
+    table.newAppend()
+        .appendFile(writeFile(table, null, format, RandomGenericData.generate(table.schema(), 1, 0L)))
+        .commit();
+
+    Job job = Job.getInstance(conf);
+    IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
+    configBuilder
+        .readFrom(location.toString())
+        .snapshotId(snapshotId);
+
+    validate(job, expectedRecords);
+  }
+
+  @Test
+  public void testLocality() throws Exception {
+    File location = temp.newFolder(format.name());
+    Assert.assertTrue(location.delete());
+    Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
+        ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
+        location.toString());
+    List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
+    table.newAppend()
+        .appendFile(writeFile(table, null, format, expectedRecords))
+        .commit();
+    Job job = Job.getInstance(conf);
+    IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
+    configBuilder.readFrom(location.toString());
+
+    for (InputSplit split : splits(job.getConfiguration())) {
+      Assert.assertArrayEquals(IcebergInputFormat.IcebergSplit.ANYWHERE, split.getLocations());
+    }
+
+    configBuilder.preferLocality();
+    for (InputSplit split : splits(job.getConfiguration())) {
+      Assert.assertArrayEquals(new String[]{"localhost"}, split.getLocations());
+    }
   }
 
   public static class HadoopCatalogFunc implements Function<Configuration, Catalog> {
@@ -157,22 +293,26 @@ public void testCustomCatalog() throws Exception {
     table.newAppend()
         .appendFile(dataFile)
         .commit();
-    validate(conf, tableIdentifier.toString(), HadoopCatalogFunc.class, expectedRecords);
-  }
 
-  private static void validate(
-      Configuration conf, String path, Class<? extends Function<Configuration, Catalog>> catalogFuncClass,
-      List<Record> expectedRecords) throws IOException {
     Job job = Job.getInstance(conf);
     IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
-    if (catalogFuncClass != null) {
-      configBuilder.catalogFunc(catalogFuncClass);
-    }
-    configBuilder.readFrom(path);
+    configBuilder
+        .catalogFunc(HadoopCatalogFunc.class)
+        .readFrom(tableIdentifier.toString());
+    validate(job, expectedRecords);
+  }
+
+  private static void validate(Job job, List<Record> expectedRecords) {
     List<Record> actualRecords = readRecords(job.getConfiguration());
     Assert.assertEquals(expectedRecords, actualRecords);
   }
 
+  private static <T> List<InputSplit> splits(Configuration conf) {
+    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
+    IcebergInputFormat<T> icebergInputFormat = new IcebergInputFormat<>();
+    return icebergInputFormat.getSplits(context);
+  }
+
   private static <T> List<T> readRecords(Configuration conf) {
     TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
     IcebergInputFormat<T> icebergInputFormat = new IcebergInputFormat<>();
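
Note: the diff is cut off before the body of readRecords, so the following is only a plausible, self-contained sketch of such a helper driving IcebergInputFormat through the standard mapreduce API (getSplits, createRecordReader, nextKeyValue); everything beyond those documented calls is an assumption, not the test's actual code.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.iceberg.mr.mapreduce.IcebergInputFormat;

// Plausible readRecords-style helper; the real test helper's body is truncated in the diff above.
public class ReadRecordsSketch {

  static <T> List<T> readRecords(Configuration conf) throws Exception {
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    IcebergInputFormat<T> inputFormat = new IcebergInputFormat<>();
    List<T> records = new ArrayList<>();
    // Iterate every split and drain its record reader, collecting the values.
    for (InputSplit split : inputFormat.getSplits(context)) {
      try (RecordReader<Void, T> reader = inputFormat.createRecordReader(split, context)) {
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
          records.add(reader.getCurrentValue());
        }
      }
    }
    return records;
  }
}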
