
Commit ea75c33

KE-41399 spark-42388 Avoid parquet footer reads twice in vectorized reader (apache#623) (apache#629)

1 parent 19fef37 · commit ea75c33 · 5 files changed · 102 additions & 13 deletions
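The idea of the change: open one `ParquetFileReader` per file split and reuse the footer it has already parsed for both row-group filtering and the scan itself, rather than parsing the footer once for pushdown and a second time when the record reader initializes. A minimal stand-alone sketch of that pattern against plain parquet-mr 1.12 (the path and split bounds are placeholders, not taken from this commit):

```java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FooterOnceSketch {
  // Open the reader once for a split [start, start + length); the footer it parsed
  // is then available via getFooter() and the same reader can drive the scan,
  // so the footer bytes are read from storage only once.
  public static ParquetFileReader openOnce(
      Configuration conf, Path path, long start, long length) throws IOException {
    ParquetReadOptions options = HadoopReadOptions.builder(conf, path)
        .withRange(start, start + length)
        .build();
    ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(path, conf), options);
    ParquetMetadata footer = reader.getFooter(); // cached; no extra footer I/O
    System.out.println("row groups in split: " + footer.getBlocks().size());
    return reader;
  }
}
```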


pom.xml

Lines changed: 1 addition & 1 deletion

```diff
@@ -138,7 +138,7 @@
     <kafka.version>2.8.2</kafka.version>
     <!-- After 10.15.1.3, the minimum required version is JDK9 -->
     <derby.version>10.14.2.0</derby.version>
-    <parquet.version>1.12.2-kylin-r5</parquet.version>
+    <parquet.version>1.12.2-kylin-r6</parquet.version>
     <orc.version>1.6.11</orc.version>
     <jetty.version>9.4.49.v20220914</jetty.version>
     <mortbay.jetty.version>7.0.0.pre5</mortbay.jetty.version>
```

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetFooterReader.java

Lines changed: 52 additions & 2 deletions

```diff
@@ -17,6 +17,10 @@
 
 package org.apache.spark.sql.execution.datasources.parquet;
 
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
@@ -26,14 +30,60 @@
 import org.apache.parquet.hadoop.ParquetFileReader;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.hadoop.util.HadoopInputFile;
-
-import java.io.IOException;
+import org.apache.spark.sql.execution.datasources.PartitionedFile;
 
 /**
  * `ParquetFooterReader` is a util class which encapsulates the helper
  * methods of reading parquet file footer
  */
 public class ParquetFooterReader {
+
+  public static final boolean SKIP_ROW_GROUPS = true;
+  public static final boolean WITH_ROW_GROUPS = false;
+
+  public static ParquetFileReader reader(
+      Configuration configuration,
+      PartitionedFile file) throws IOException, URISyntaxException {
+    long fileStart = file.start();
+    ParquetMetadataConverter.MetadataFilter filter;
+    Path path = new Path(new URI(file.filePath()));
+    filter = HadoopReadOptions.builder(configuration, path)
+        .withRange(fileStart, fileStart + file.length())
+        .build()
+        .getMetadataFilter();
+    HadoopInputFile inputFile = HadoopInputFile.fromPath(path, configuration);
+    ParquetReadOptions readOptions =
+        HadoopReadOptions.builder(inputFile.getConfiguration()).withMetadataFilter(filter).build();
+    return ParquetFileReader.open(inputFile, readOptions);
+  }
+
+  /**
+   * Reads footer for the input Parquet file 'split'. If 'skipRowGroup' is true,
+   * this will skip reading the Parquet row group metadata.
+   *
+   * @param file a part (i.e. "block") of a single file that should be read
+   * @param configuration hadoop configuration of file
+   * @param skipRowGroup If true, skip reading row groups;
+   *                     if false, read row groups according to the file split range
+   */
+  public static ParquetMetadata readFooter(
+      Configuration configuration,
+      PartitionedFile file,
+      boolean skipRowGroup) throws IOException, URISyntaxException {
+    long fileStart = file.start();
+    ParquetMetadataConverter.MetadataFilter filter;
+    Path path = new Path(new URI(file.filePath()));
+    if (skipRowGroup) {
+      filter = ParquetMetadataConverter.SKIP_ROW_GROUPS;
+    } else {
+      filter = HadoopReadOptions.builder(configuration, path)
+          .withRange(fileStart, fileStart + file.length())
+          .build()
+          .getMetadataFilter();
+    }
+    return readFooter(configuration, path, filter);
+  }
+
   public static ParquetMetadata readFooter(Configuration configuration,
       Path file, ParquetMetadataConverter.MetadataFilter filter) throws IOException {
     return readFooter(HadoopInputFile.fromPath(file, configuration), filter);
```
sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java

Lines changed: 19 additions & 7 deletions

```diff
@@ -44,6 +44,7 @@
 import org.apache.parquet.hadoop.ParquetInputFormat;
 import org.apache.parquet.hadoop.api.InitContext;
 import org.apache.parquet.hadoop.api.ReadSupport;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.hadoop.util.ConfigurationUtil;
 import org.apache.parquet.hadoop.util.HadoopInputFile;
 import org.apache.parquet.schema.MessageType;
@@ -80,16 +81,27 @@ public abstract class SpecificParquetRecordReaderBase<T> extends RecordReader<Vo
 
   @Override
   public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
-    throws IOException, InterruptedException {
+      throws IOException, InterruptedException {
+    initialize(inputSplit, taskAttemptContext, Option.empty());
+  }
+
+  public void initialize(InputSplit inputSplit,
+      TaskAttemptContext taskAttemptContext,
+      Option<ParquetFileReader> fileReader)
+      throws IOException {
     Configuration configuration = taskAttemptContext.getConfiguration();
     FileSplit split = (FileSplit) inputSplit;
     this.file = split.getPath();
-
-    ParquetReadOptions options = HadoopReadOptions
-      .builder(configuration)
-      .withRange(split.getStart(), split.getStart() + split.getLength())
-      .build();
-    this.reader = new ParquetFileReader(HadoopInputFile.fromPath(file, configuration), options);
+    if (fileReader.isDefined()) {
+      this.reader = fileReader.get();
+    } else {
+      ParquetReadOptions options = HadoopReadOptions
+        .builder(configuration, file)
+        .withRange(split.getStart(), split.getStart() + split.getLength())
+        .build();
+      this.reader = new ParquetFileReader(
+        HadoopInputFile.fromPath(file, configuration), options);
+    }
     this.fileSchema = reader.getFileMetaData().getSchema();
     Map<String, String> fileMetadata = reader.getFileMetaData().getKeyValueMetaData();
     ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
```
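The shape of this change is simple reader injection: the original two-argument `initialize` keeps its contract and delegates with `Option.empty()`, while the new overload accepts an already-open reader and skips opening (and footer-parsing) a second one. A compact sketch of just that pattern, using `scala.Option` from Java as the diff does (the `Resource` type is a stand-in, not from the commit):

```java
import scala.Option;

// Stand-in for ParquetFileReader in this sketch.
interface Resource extends AutoCloseable {
  @Override void close();
}

public class InjectOrOpen {
  private Resource resource;

  // Legacy entry point: behavior unchanged, now just delegates.
  public void initialize() {
    initialize(Option.empty());
  }

  // New entry point: reuse an injected resource if present, else open our own.
  public void initialize(Option<Resource> injected) {
    this.resource = injected.isDefined() ? injected.get() : open();
  }

  private Resource open() {
    return () -> { }; // trivial resource; the real code opens a ParquetFileReader
  }
}
```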

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java

Lines changed: 14 additions & 0 deletions

```diff
@@ -27,6 +27,8 @@
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.page.PageReadStore;
 import org.apache.parquet.filter2.compat.QueryMetrics;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.Type;
 
 import org.apache.spark.memory.MemoryMode;
@@ -39,6 +41,8 @@
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 
+import scala.Option;
+
 /**
  * A specialized RecordReader that reads into InternalRows or ColumnarBatches directly using the
  * Parquet column APIs. This is somewhat based on parquet-mr's ColumnReader.
@@ -155,6 +159,16 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont
     initializeInternal();
   }
 
+  @Override
+  public void initialize(
+      InputSplit inputSplit,
+      TaskAttemptContext taskAttemptContext,
+      Option<ParquetFileReader> fileReader)
+      throws IOException, UnsupportedOperationException {
+    super.initialize(inputSplit, taskAttemptContext, fileReader);
+    initializeInternal();
+  }
+
   /**
    * Utility API that will read all the data in path. This circumvents the need to create Hadoop
    * objects to use this class. `columns` can contain the list of columns to project.
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala

Lines changed: 16 additions & 3 deletions

```diff
@@ -268,8 +268,18 @@ class ParquetFileFormat
 
       S3FileUtils.tryOpenClose(sharedConf, filePath)
       val startTime = System.currentTimeMillis()
-      lazy val footerFileMetaData =
-        ParquetFooterReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData
+      var fileReader = Option.empty[ParquetFileReader]
+      val fileFooter = if (enableVectorizedReader) {
+        // When there are vectorized reads, we can avoid reading the footer twice by reading
+        // all row groups in advance and filter row groups according to filters that require
+        // push down (no need to read the footer metadata again).
+        fileReader = Option.apply(ParquetFooterReader.reader(sharedConf, file))
+        fileReader.get.getFooter
+      } else {
+        ParquetFooterReader.readFooter(sharedConf, file, ParquetFooterReader.SKIP_ROW_GROUPS)
+      }
+      val footerFileMetaData = fileFooter.getFileMetaData
+
       val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
         footerFileMetaData.getKeyValueMetaData.get,
         datetimeRebaseModeInRead)
@@ -322,6 +332,9 @@ class ParquetFileFormat
       // Notice: This push-down is RowGroups level, not individual records.
       if (pushed.isDefined) {
         ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, pushed.get)
+        if (fileReader.isDefined) {
+          fileReader.get.resetBlocks(hadoopAttemptContext.getConfiguration)
+        }
       }
       val taskContext = Option(TaskContext.get())
       val firstFooterEndTime = System.currentTimeMillis()
@@ -335,7 +348,7 @@ class ParquetFileFormat
           val iter = new RecordReaderIterator(vectorizedReader)
           // SPARK-23457 Register a task completion listener before `initialization`.
           taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close()))
-          vectorizedReader.initialize(split, hadoopAttemptContext)
+          vectorizedReader.initialize(split, hadoopAttemptContext, fileReader)
           logDebug(s"Appending $partitionSchema ${file.partitionValues}")
           vectorizedReader.initBatch(partitionSchema, file.partitionValues)
           if (returningBatch) {
```
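Taken together, the new task-side flow reads the footer once and threads the open reader through to the vectorized reader. A hedged, Java-ified sketch of that flow (the surrounding variables are assumed; note `resetBlocks(Configuration)` is not stock parquet-mr API — it presumably comes from the `1.12.2-kylin-r6` parquet fork bumped in pom.xml above):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.spark.sql.execution.datasources.PartitionedFile;
import org.apache.spark.sql.execution.datasources.parquet.ParquetFooterReader;
import org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader;

import scala.Option;

public class SingleFooterScanSketch {
  // pushed may be null when no predicate could be pushed down.
  static void scan(
      Configuration conf,
      PartitionedFile file,
      InputSplit split,
      TaskAttemptContext context,
      FilterPredicate pushed,
      VectorizedParquetRecordReader vectorizedReader) throws Exception {
    ParquetFileReader reader = ParquetFooterReader.reader(conf, file); // one footer read
    ParquetMetadata footer = reader.getFooter();                       // reused, not re-read
    // ... derive rebase modes and the pushdown predicate from footer.getFileMetaData() ...
    if (pushed != null) {
      ParquetInputFormat.setFilterPredicate(conf, pushed);
      reader.resetBlocks(conf); // kylin-fork API: re-filter loaded row groups in place
    }
    // Hand the open reader down so initialize() skips its own footer parse.
    vectorizedReader.initialize(split, context, Option.apply(reader));
  }
}
```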
