Changes from 13 commits
Commits (20 total):
7a8b41c [SPARK-37974][SQL] Vectorized implementation of DeltaLengthByteArray … (parthchandra, Jan 13, 2022)
2c73794 [SPARK-37974][SQL] Vectorized implementation of DeltaByteArray reader (parthchandra, Jan 14, 2022)
52df517 Addressing review comments (parthchandra, Jan 22, 2022)
0011bab More review comments addressed (parthchandra, Jan 24, 2022)
50ed815 One more review comment (parthchandra, Jan 25, 2022)
3dc340a Updated JDK 8 benchmark (parthchandra, Jan 26, 2022)
ca20068 Fix for off heap memory not being initialized. Added off heap mode to… (parthchandra, Jan 28, 2022)
6ad1dbe Remove use of OffHeap vectors for internal buffers. Skip writing to o… (parthchandra, Jan 31, 2022)
6f364d9 more review comments addressed (parthchandra, Feb 2, 2022)
80a4ceb Still more review comments addressed (parthchandra, Feb 9, 2022)
406d176 Remove unnecessary check for 'total' parameter in 'readValues' (parthchandra, Feb 11, 2022)
be62ad6 Remove check for zero length in DeltaLengthByteArrayReader, and add u… (parthchandra, Feb 14, 2022)
166afe1 Update benchmark (parthchandra, Feb 16, 2022)
0583e2f Evaluate suffix array lazily in VectorizedDeltaLengthByteArrayReader (sunchao, Mar 5, 2022)
31eee9f In DeltaLengthByteArrayReader avoid extra copy if memory mode is on_h… (sunchao, Mar 5, 2022)
9eaf387 Avoid unnecessary check for parameter in skipBytes (parthchandra, Mar 7, 2022)
1fc0060 Update benchmark results (parthchandra, Mar 7, 2022)
d95100a More review comments (parthchandra, Mar 15, 2022)
6d273f0 More review comments addressed (parthchandra, Mar 16, 2022)
1d15022 Cleaner naming for WritableColumnVector.getBytesUnsafe (parthchandra, Mar 16, 2022)
470 changes: 235 additions & 235 deletions sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt

Large diffs are not rendered by default.

470 changes: 235 additions & 235 deletions sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt

Large diffs are not rendered by default.

470 changes: 235 additions & 235 deletions sql/core/benchmarks/DataSourceReadBenchmark-results.txt

Large diffs are not rendered by default.

@@ -29,6 +29,8 @@
import java.util.Set;

import com.google.common.annotations.VisibleForTesting;
import org.apache.parquet.VersionParser;
import org.apache.parquet.VersionParser.ParsedVersion;
import org.apache.parquet.column.page.PageReadStore;
import scala.Option;

@@ -69,6 +71,9 @@ public abstract class SpecificParquetRecordReaderBase<T> extends RecordReader<Vo
protected MessageType fileSchema;
protected MessageType requestedSchema;
protected StructType sparkSchema;
// Keep track of the version of the parquet writer. An older version wrote
// corrupt delta byte arrays, and the version check is needed to detect that.
protected ParsedVersion writerVersion;

/**
* The total number of rows this RecordReader will eventually read. The sum of the
@@ -93,6 +98,12 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont
HadoopInputFile.fromPath(file, configuration), options);
this.reader = new ParquetRowGroupReaderImpl(fileReader);
this.fileSchema = fileReader.getFileMetaData().getSchema();
try {
this.writerVersion = VersionParser.parse(fileReader.getFileMetaData().getCreatedBy());
} catch (Exception e) {
Review comment (Contributor): Will other types of exceptions be thrown here, except VersionParseException?
Reply (Contributor Author): Well yes. I encountered at least one case where the version information was empty and the version check threw an NPE.

// Swallow any exception, if we cannot parse the version we will revert to a sequential read
// if the column is a delta byte array encoding (due to PARQUET-246).
}
Map<String, String> fileMetadata = fileReader.getFileMetaData().getKeyValueMetaData();
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
@@ -21,13 +21,16 @@
import java.time.ZoneId;
import java.util.PrimitiveIterator;

import org.apache.parquet.CorruptDeltaByteArrays;
import org.apache.parquet.VersionParser.ParsedVersion;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.page.*;
import org.apache.parquet.column.values.RequiresPreviousReader;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.DateLogicalTypeAnnotation;
@@ -86,6 +89,7 @@ public class VectorizedColumnReader {
private final ColumnDescriptor descriptor;
private final LogicalTypeAnnotation logicalTypeAnnotation;
private final String datetimeRebaseMode;
private final ParsedVersion writerVersion;

public VectorizedColumnReader(
ColumnDescriptor descriptor,
@@ -96,7 +100,8 @@ public VectorizedColumnReader(
String datetimeRebaseMode,
String datetimeRebaseTz,
String int96RebaseMode,
String int96RebaseTz) throws IOException {
String int96RebaseTz,
ParsedVersion writerVersion) throws IOException {
this.descriptor = descriptor;
this.pageReader = pageReader;
this.readState = new ParquetReadState(descriptor.getMaxDefinitionLevel(), rowIndexes);
@@ -129,6 +134,7 @@ public VectorizedColumnReader(
this.datetimeRebaseMode = datetimeRebaseMode;
assert "LEGACY".equals(int96RebaseMode) || "EXCEPTION".equals(int96RebaseMode) ||
"CORRECTED".equals(int96RebaseMode);
this.writerVersion = writerVersion;
}

private boolean isLazyDecodingSupported(PrimitiveType.PrimitiveTypeName typeName) {
@@ -259,6 +265,7 @@ private void initDataReader(
int pageValueCount,
Encoding dataEncoding,
ByteBufferInputStream in) throws IOException {
ValuesReader previousReader = this.dataColumn;
if (dataEncoding.usesDictionary()) {
this.dataColumn = null;
if (dictionary == null) {
Expand All @@ -283,6 +290,11 @@ private void initDataReader(
} catch (IOException e) {
throw new IOException("could not read page in col " + descriptor, e);
}
if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) &&
Review comment (Contributor): When does this happen? Can you add a comment on why we need this?
Reply (Contributor Author): Added comment. Detailed explanation is in the comment in VectorizedDeltaByteArrayReader.setPreviousValue

previousReader instanceof RequiresPreviousReader) {
// previous reader can only be set if reading sequentially
Review comment (Contributor): nit: [P]revious.

((RequiresPreviousReader) dataColumn).setPreviousReader(previousReader);
}
}

private ValuesReader getValuesReader(Encoding encoding) {
@@ -90,13 +90,18 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce
Preconditions.checkArgument(miniSize % 8 == 0,
"miniBlockSize must be multiple of 8, but it's " + miniSize);
this.miniBlockSizeInValues = (int) miniSize;
// True value count. May be less than valueCount because of nulls
Review comment (Contributor): I think it would be more useful to annotate the method getTotalValueCount instead of here.
Reply (Contributor Author): Added the comment to getTotalValueCount as well.

this.totalValueCount = BytesUtils.readUnsignedVarInt(in);
this.bitWidths = new int[miniBlockNumInABlock];
this.unpackedValuesBuffer = new long[miniBlockSizeInValues];
// read the first value
firstValue = BytesUtils.readZigZagVarLong(in);
}

int getTotalValueCount() {
return totalValueCount;
}

@Override
public byte readByte() {
readValues(1, null, 0, (w, r, v) -> byteVal = (byte) v);
@@ -16,51 +16,115 @@
*/
package org.apache.spark.sql.execution.datasources.parquet;

import static org.apache.spark.sql.types.DataTypes.BinaryType;
Review comment (Contributor): Do we have a clear import order definition for static import? @sunchao @dongjoon-hyun

import static org.apache.spark.sql.types.DataTypes.IntegerType;

import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader;
import org.apache.parquet.column.values.RequiresPreviousReader;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.io.api.Binary;
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.execution.vectorized.WritableColumnVector;

import java.io.IOException;
import java.nio.ByteBuffer;

/**
* An implementation of the Parquet DELTA_BYTE_ARRAY decoder that supports the vectorized interface.
* An implementation of the Parquet DELTA_BYTE_ARRAY decoder that supports the vectorized
* interface.
*/
public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase {
private final DeltaByteArrayReader deltaByteArrayReader = new DeltaByteArrayReader();
public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase
implements VectorizedValuesReader, RequiresPreviousReader {

private final VectorizedDeltaBinaryPackedReader prefixLengthReader;
private final VectorizedDeltaLengthByteArrayReader suffixReader;
private WritableColumnVector prefixLengthVector;
private WritableColumnVector suffixVector;
private byte[] previous = new byte[0];
private int currentRow = 0;

// temporary variable used by getBinary
Review comment (Member): nit: getBinary -> readBinary. Also can we add some comments on what tempBinaryValVector is for?

private final WritableColumnVector binaryValVector;

VectorizedDeltaByteArrayReader() {
this.prefixLengthReader = new VectorizedDeltaBinaryPackedReader();
this.suffixReader = new VectorizedDeltaLengthByteArrayReader();
binaryValVector = new OnHeapColumnVector(1, BinaryType);
}

@Override
public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException {
deltaByteArrayReader.initFromPage(valueCount, in);
prefixLengthVector = new OnHeapColumnVector(valueCount, IntegerType);
suffixVector = new OnHeapColumnVector(valueCount, BinaryType);
prefixLengthReader.initFromPage(valueCount, in);
prefixLengthReader.readIntegers(prefixLengthReader.getTotalValueCount(),
prefixLengthVector, 0);
suffixReader.initFromPage(valueCount, in);
suffixReader.readBinary(prefixLengthReader.getTotalValueCount(), suffixVector, 0);
Review comment (Member): Instead of eagerly reading the suffixes, we can have a method in VectorizedDeltaLengthByteArrayReader that just returns the suffix at rowId:

  public ByteBuffer getBytes(int rowId) {
    int length = lengthsVector.getInt(rowId);
    try {
      return in.slice(length);
    } catch (EOFException e) {
      throw new ParquetDecodingException("Failed to read " + length + " bytes");
    }
  }

I tried this approach here, and it can improve the benchmark.

}

@Override
public Binary readBinary(int len) {
return deltaByteArrayReader.readBytes();
readValues(1, binaryValVector, 0, ByteBufferOutputWriter::writeArrayByteBuffer);
return Binary.fromConstantByteArray(binaryValVector.getBinary(0));
}

@Override
public void readBinary(int total, WritableColumnVector c, int rowId) {
private void readValues(int total, WritableColumnVector c, int rowId,
ByteBufferOutputWriter outputWriter) {
for (int i = 0; i < total; i++) {
Binary binary = deltaByteArrayReader.readBytes();
ByteBuffer buffer = binary.toByteBuffer();
if (buffer.hasArray()) {
c.putByteArray(rowId + i, buffer.array(), buffer.arrayOffset() + buffer.position(),
binary.length());
// NOTE: due to PARQUET-246, it is important that we
// respect prefixLength which was read from prefixLengthReader,
// even for the *first* value of a page. Even though the first
// value of the page should have an empty prefix, it may not
// because of PARQUET-246.
int prefixLength = prefixLengthVector.getInt(currentRow);
byte[] suffix = suffixVector.getBinary(currentRow);
int length = prefixLength + suffix.length;

// We have to do this to materialize the output
if (prefixLength != 0) {
// We could do
// c.putByteArray(rowId + i, previous, 0, prefixLength);
// c.putByteArray(rowId+i, suffix, prefixLength, suffix.length);
// previous = c.getBinary(rowId+1);
// but it incurs the same cost of copying the values twice _and_ c.getBinary
// is a _slow_ byte by byte copy
// The following always uses the faster system arraycopy method
byte[] out = new byte[length];
Review comment (Member): We can also potentially skip this copying at least for OnHeapColumnVector. I tried it and it gives some extra performance improvements.

[info] OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Mac OS X 10.16
[info] Intel(R) Core(TM) i9-10910 CPU @ 3.60GHz
[info] String with Nulls Scan (0.0%):            Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] SQL CSV                                            5721           5727           8          1.8         545.6       1.0X
[info] SQL Json                                           6289           6295           9          1.7         599.7       0.9X
[info] SQL Parquet Vectorized: DataPageV1                  700            800          87         15.0          66.7       8.2X
[info] SQL Parquet Vectorized: DataPageV2                  994           1031          52         10.5          94.8       5.8X
[info] SQL Parquet MR: DataPageV1                         2035           2051          23          5.2         194.1       2.8X
[info] SQL Parquet MR: DataPageV2                         2289           2454         232          4.6         218.3       2.5X
[info] ParquetReader Vectorized: DataPageV1                472            482          15         22.2          45.0      12.1X
[info] ParquetReader Vectorized: DataPageV2                640            645           4         16.4          61.0       8.9X
[info] SQL ORC Vectorized                                  670            694          35         15.7          63.9       8.5X
[info] SQL ORC MR                                         1846           2047         284          5.7         176.0       3.1X

[info] OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Mac OS X 10.16
[info] Intel(R) Core(TM) i9-10910 CPU @ 3.60GHz
[info] String with Nulls Scan (50.0%):           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] SQL CSV                                            4825           4890          91          2.2         460.2       1.0X
[info] SQL Json                                           5298           7385        2951          2.0         505.3       0.9X
[info] SQL Parquet Vectorized: DataPageV1                  701            889         169         14.9          66.9       6.9X
[info] SQL Parquet Vectorized: DataPageV2                  684            737          58         15.3          65.2       7.1X
[info] SQL Parquet MR: DataPageV1                         1857           1869          17          5.6         177.1       2.6X
[info] SQL Parquet MR: DataPageV2                         2034           2146         159          5.2         193.9       2.4X
[info] ParquetReader Vectorized: DataPageV1                474            493          11         22.1          45.2      10.2X
[info] ParquetReader Vectorized: DataPageV2                585            586           1         17.9          55.8       8.2X
[info] SQL ORC Vectorized                                  810            845          53         12.9          77.3       6.0X
[info] SQL ORC MR                                         1854           1935         114          5.7         176.8       2.6X

[info] OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Mac OS X 10.16
[info] Intel(R) Core(TM) i9-10910 CPU @ 3.60GHz
[info] String with Nulls Scan (95.0%):           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] SQL CSV                                            3212           3256          63          3.3         306.3       1.0X
[info] SQL Json                                           3693           3695           3          2.8         352.2       0.9X
[info] SQL Parquet Vectorized: DataPageV1                  147            203          46         71.2          14.0      21.8X
[info] SQL Parquet Vectorized: DataPageV2                  160            286         144         65.4          15.3      20.0X
[info] SQL Parquet MR: DataPageV1                         1229           1351         172          8.5         117.2       2.6X
[info] SQL Parquet MR: DataPageV2                         1074           1099          36          9.8         102.4       3.0X
[info] ParquetReader Vectorized: DataPageV1                107            109           2         97.9          10.2      30.0X
[info] ParquetReader Vectorized: DataPageV2                124            127           2         84.7          11.8      25.9X
[info] SQL ORC Vectorized                                  262            308          86         40.0          25.0      12.3X
[info] SQL ORC MR                                         1002           1070          96         10.5          95.5       3.2X

System.arraycopy(previous, 0, out, 0, prefixLength);
System.arraycopy(suffix, 0, out, prefixLength, suffix.length);
previous = out;
} else {
byte[] bytes = new byte[binary.length()];
buffer.get(bytes);
c.putByteArray(rowId + i, bytes);
previous = suffix;
}
outputWriter.write(c, rowId + i, ByteBuffer.wrap(previous), previous.length);
currentRow++;
}
}

@Override
public void skipBinary(int total) {
for (int i = 0; i < total; i++) {
deltaByteArrayReader.skip();
public void readBinary(int total, WritableColumnVector c, int rowId) {
readValues(total, c, rowId, ByteBufferOutputWriter::writeArrayByteBuffer);
}

/**
* There was a bug (PARQUET-246) in which DeltaByteArrayWriter's reset() method did not clear the
* previous value state that it tracks internally. This resulted in the first value of all pages
* (except for the first page) to be a delta from the last value of the previous page. In order to
* read corrupted files written with this bug, when reading a new page we need to recover the
* previous page's last value to use it (if needed) to read the first value.
*/
public void setPreviousReader(ValuesReader reader) {
if (reader != null) {
this.previous = ((VectorizedDeltaByteArrayReader) reader).previous;
}
}

@Override
public void skipBinary(int total) {
// we have to read all the values so that we always have the correct 'previous'
// we just don't write it to the output vector
readValues(total, null, currentRow, ByteBufferOutputWriter::skipWrite);
}

}
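Editor's note: the readValues loop above rebuilds each value from a prefix of the previous value plus a suffix read from the page (DELTA_BYTE_ARRAY encoding). The following is a standalone illustrative sketch of that reconstruction with made-up data and class names; it is not code from this PR.

  // Illustrative sketch only: recombine prefix lengths and suffixes the way
  // readValues() does, using System.arraycopy for the two copies.
  public class DeltaByteArrayDecodeSketch {
    public static void main(String[] args) {
      int[] prefixLengths = {0, 5, 4};  // bytes shared with the previous value
      byte[][] suffixes = {"apple".getBytes(), "t".getBytes(), "y".getBytes()};

      byte[] previous = new byte[0];
      for (int i = 0; i < prefixLengths.length; i++) {
        byte[] out = new byte[prefixLengths[i] + suffixes[i].length];
        // prefix comes from the previously decoded value, suffix from the page
        System.arraycopy(previous, 0, out, 0, prefixLengths[i]);
        System.arraycopy(suffixes[i], 0, out, prefixLengths[i], suffixes[i].length);
        previous = out;
        System.out.println(new String(out));  // prints: apple, applet, apply
      }
    }
  }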
@@ -0,0 +1,86 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.datasources.parquet;

import static org.apache.spark.sql.types.DataTypes.IntegerType;

import java.io.EOFException;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.execution.vectorized.WritableColumnVector;

/**
* An implementation of the Parquet DELTA_LENGTH_BYTE_ARRAY decoder that supports the vectorized
* interface.
*/
public class VectorizedDeltaLengthByteArrayReader extends VectorizedReaderBase implements
VectorizedValuesReader {

private final VectorizedDeltaBinaryPackedReader lengthReader;
private ByteBufferInputStream in;
private WritableColumnVector lengthsVector;
private int currentRow = 0;

VectorizedDeltaLengthByteArrayReader() {
lengthReader = new VectorizedDeltaBinaryPackedReader();
}

@Override
public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException {
lengthsVector = new OnHeapColumnVector(valueCount, IntegerType);
lengthReader.initFromPage(valueCount, in);
lengthReader.readIntegers(lengthReader.getTotalValueCount(), lengthsVector, 0);
this.in = in.remainingStream();
}

@Override
public void readBinary(int total, WritableColumnVector c, int rowId) {
ByteBuffer buffer;
ByteBufferOutputWriter outputWriter = ByteBufferOutputWriter::writeArrayByteBuffer;
int length;
for (int i = 0; i < total; i++) {
length = lengthsVector.getInt(rowId + i);
try {
buffer = in.slice(length);
} catch (EOFException e) {
throw new ParquetDecodingException("Failed to read " + length + " bytes");
}
outputWriter.write(c, rowId + i, buffer, length);
}
currentRow += total;
}

@Override
public void skipBinary(int total) {
if (total == 0) {
Review comment (Member): We can remove this too
Reply (Contributor Author): removed

return;
}
int length;
for (int i = 0; i < total; i++) {
length = lengthsVector.getInt(currentRow + i);
int remaining = length;
while (remaining > 0) {
remaining -= in.skip(remaining);
Review comment (Contributor): Did I miss anything? Do we really need length here?

}
}
currentRow += total;
}

Review comment (Contributor): nit: new line.
Reply (Contributor Author): OK

}
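Editor's note: for context on the layout this new class consumes, DELTA_LENGTH_BYTE_ARRAY stores a delta-binary-packed block of value lengths followed by the concatenated value bytes; each value is recovered by reading the next 'length' bytes in order, which is what in.slice(length) does above. A standalone sketch with made-up data (not code from this PR):

  // Illustrative sketch only: slice the concatenated byte stream using the
  // decoded lengths block, one value at a time.
  import java.nio.ByteBuffer;
  import java.nio.charset.StandardCharsets;

  public class DeltaLengthByteArrayDecodeSketch {
    public static void main(String[] args) {
      int[] lengths = {3, 5, 2};  // pretend this is the decoded lengths block
      ByteBuffer data = ByteBuffer.wrap("foohello!!".getBytes(StandardCharsets.UTF_8));

      for (int length : lengths) {
        byte[] value = new byte[length];
        data.get(value);  // sequential slice, analogous to in.slice(length)
        System.out.println(new String(value, StandardCharsets.UTF_8));  // foo, hello, !!
      }
    }
  }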
@@ -367,7 +367,9 @@ private void checkEndOfRowGroup() throws IOException {
datetimeRebaseMode,
datetimeRebaseTz,
int96RebaseMode,
int96RebaseTz);
int96RebaseTz,
writerVersion
);
Review comment (Contributor): nit: should be on the previous line.
Reply (Contributor Author): OK

}
totalCountLoadedSoFar += pages.getRowCount();
}
@@ -17,6 +17,7 @@

package org.apache.spark.sql.execution.datasources.parquet;

import java.nio.ByteBuffer;
Review comment (Contributor): nit: new line after the import.
Reply (Contributor Author): OK

import org.apache.spark.sql.execution.vectorized.WritableColumnVector;

import org.apache.parquet.io.api.Binary;
@@ -86,4 +87,20 @@ interface IntegerOutputWriter {
void write(WritableColumnVector outputColumnVector, int rowId, long val);
}

@FunctionalInterface
interface ByteBufferOutputWriter {
void write(WritableColumnVector c, int rowId, ByteBuffer val, int length);

static void writeArrayByteBuffer(WritableColumnVector c, int rowId, ByteBuffer val,
Review comment (Contributor, LuciferYang, Jan 21, 2022): Is it a good practice to add static methods to an interface? I'm not sure.
Reply (Contributor Author): I don't know if it is frowned upon. In this case, not including it in the interface only leads to some code bloat.

int length) {
c.putByteArray(rowId,
val.array(),
val.arrayOffset() + val.position(),
length);
}

static void skipWrite(WritableColumnVector c, int rowId, ByteBuffer val, int length) { }

}

Review comment (Contributor): nit: new line.
Reply (Contributor Author): Ok

}
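Editor's note on the static-methods-on-an-interface question above: the point of the static helpers is that a single read loop can either materialize or skip values simply by passing a different method reference. The following standalone sketch shows the pattern with simplified, hypothetical names; it is not the PR's actual classes or signatures.

  // Minimal sketch of the pattern: static helpers on a functional interface,
  // selected at the call site via method references. Names are stand-ins only.
  import java.nio.ByteBuffer;
  import java.util.ArrayList;
  import java.util.List;

  public class OutputWriterPatternSketch {
    @FunctionalInterface
    interface Output {
      void write(List<byte[]> sink, ByteBuffer val, int length);

      static void materialize(List<byte[]> sink, ByteBuffer val, int length) {
        byte[] copy = new byte[length];
        val.get(copy);
        sink.add(copy);  // write the value out
      }

      static void skip(List<byte[]> sink, ByteBuffer val, int length) {
        val.position(val.position() + length);  // consume without writing
      }
    }

    static void readValues(ByteBuffer data, int[] lengths, List<byte[]> sink, Output out) {
      for (int length : lengths) {
        out.write(sink, data, length);  // same loop body for both behaviors
      }
    }

    public static void main(String[] args) {
      int[] lengths = {3, 3};
      List<byte[]> sink = new ArrayList<>();
      readValues(ByteBuffer.wrap("foobar".getBytes()), lengths, sink, Output::materialize);
      readValues(ByteBuffer.wrap("bazqux".getBytes()), lengths, sink, Output::skip);
      System.out.println(sink.size());  // 2: only the materialized values were kept
    }
  }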