Commit 55d238f

[SPARK-30946][SS] Serde entry with UnsafeRow on FileStream(Source/Sink)Log with LZ4 compression

1 parent 5866bc7 · commit 55d238f
8 files changed: 246 additions & 97 deletions

core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
4 additions & 1 deletion

```diff
@@ -135,8 +135,11 @@ class LZ4CompressionCodec(conf: SparkConf) extends CompressionCodec {
   private[this] val defaultSeed: Int = 0x9747b28c // LZ4BlockOutputStream.DEFAULT_SEED
 
   override def compressedOutputStream(s: OutputStream): OutputStream = {
+    compressedOutputStream(s, syncFlush = false)
+  }
+
+  def compressedOutputStream(s: OutputStream, syncFlush: Boolean): OutputStream = {
     val blockSize = conf.get(IO_COMPRESSION_LZ4_BLOCKSIZE).toInt
-    val syncFlush = false
     new LZ4BlockOutputStream(
       s,
       blockSize,
```
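The new flag exists because the log serializers below flush, but deliberately never close, the compressed stream (the caller owns and closes the underlying stream). A minimal sketch of the behavior — not part of the commit, assuming a default `SparkConf`:

```scala
import java.io.ByteArrayOutputStream

import org.apache.spark.SparkConf
import org.apache.spark.io.LZ4CompressionCodec

val codec = new LZ4CompressionCodec(new SparkConf())
val sink = new ByteArrayOutputStream()

// With syncFlush = true, flush() forces the partially filled LZ4 block out to
// `sink`, so a caller that flushes but never closes still persists its data.
val out = codec.compressedOutputStream(sink, syncFlush = true)
out.write("some metadata".getBytes("UTF-8"))
out.flush()
assert(sink.size() > 0) // compressed bytes are visible without close()
```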

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala
102 additions & 4 deletions

```diff
@@ -17,17 +17,21 @@
 
 package org.apache.spark.sql.execution.streaming
 
-import java.io.{InputStream, IOException, OutputStream}
+import java.io.{DataInputStream, DataOutputStream, InputStream, IOException, OutputStream}
 import java.nio.charset.StandardCharsets.UTF_8
 
+import scala.collection.mutable
 import scala.io.{Source => IOSource}
 import scala.reflect.ClassTag
 
+import com.google.common.io.ByteStreams
 import org.apache.hadoop.fs.Path
 import org.json4s.NoTypeHints
 import org.json4s.jackson.Serialization
 
+import org.apache.spark.io.LZ4CompressionCodec
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 
 /**
  * An abstract class for compactible metadata logs. It will write one log file for each batch.
@@ -105,6 +109,8 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : ClassTag](
     interval
   }
 
+  private val sparkConf = sparkSession.sparkContext.getConf
+
   /**
    * Filter out the obsolete logs.
    */
@@ -131,24 +137,89 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : ClassTag](
     }
   }
 
+  def dataToUnsafeRow(data: T): UnsafeRow
+  def unsafeRowToData(row: UnsafeRow): T
+  def numFieldsForUnsafeRow: Int
+
   override def serialize(logData: Array[T], out: OutputStream): Unit = {
     // called inside a try-finally where the underlying stream is closed in the caller
     out.write(("v" + metadataLogVersion).getBytes(UTF_8))
+    metadataLogVersion match {
+      case 1 => serializeToV1(out, logData)
+      case 2 => serializeToV2(out, logData)
+      case _ =>
+        throw new IllegalStateException(s"UnsupportedLogVersion: unknown log version is provided" +
+          s", v$metadataLogVersion.")
+    }
+  }
+
+  private def serializeToV1(out: OutputStream, logData: Array[T]): Unit = {
     logData.foreach { data =>
       out.write('\n')
       out.write(Serialization.write(data).getBytes(UTF_8))
     }
   }
 
+  private def serializeToV2(out: OutputStream, logData: Array[T]): Unit = {
+    out.write('\n')
+    val dos = compressStream(out)
+    logData.foreach { data =>
+      val row = dataToUnsafeRow(data)
+      val rowBytes = row.getBytes
+      dos.writeInt(rowBytes.size)
+      dos.write(rowBytes)
+    }
+    dos.writeInt(-1)
+    dos.flush()
+  }
+
   override def deserialize(in: InputStream): Array[T] = {
-    val lines = IOSource.fromInputStream(in, UTF_8.name()).getLines()
-    if (!lines.hasNext) {
+    val line = readLine(in)
+    if (line == null || line.isEmpty) {
       throw new IllegalStateException("Incomplete log file")
     }
-    validateVersion(lines.next(), metadataLogVersion)
+
+    val version = parseVersion(line)
+    version match {
+      case 1 if version <= metadataLogVersion => deserializeFromV1(in)
+      case 2 if version <= metadataLogVersion => deserializeFromV2(in)
+      case version =>
+        throw new IllegalStateException(s"UnsupportedLogVersion: maximum supported log version " +
+          s"is v${metadataLogVersion}, but encountered v$version. The log file was produced " +
+          s"by a newer version of Spark and cannot be read by this version. Please upgrade.")
+    }
+  }
+
+  private def deserializeFromV1(in: InputStream): Array[T] = {
+    val lines = IOSource.fromInputStream(in, UTF_8.name()).getLines()
     lines.map(Serialization.read[T]).toArray
   }
 
+  private def deserializeFromV2(in: InputStream): Array[T] = {
+    val list = new scala.collection.mutable.ArrayBuffer[T]
+
+    val dis = decompressStream(in)
+    var eof = false
+
+    while (!eof) {
+      val size = dis.readInt()
+      if (size == -1) {
+        eof = true
+      } else if (size < 0) {
+        throw new IOException(
+          s"Error to deserialize file: size cannot be $size")
+      } else {
+        val rowBuffer = new Array[Byte](size)
+        ByteStreams.readFully(dis, rowBuffer, 0, size)
+        val row = new UnsafeRow(numFieldsForUnsafeRow)
+        row.pointTo(rowBuffer, size)
+        list += unsafeRowToData(row)
+      }
+    }
+
+    list.toArray
+  }
+
   override def add(batchId: Long, logs: Array[T]): Boolean = {
     val batchAdded =
       if (isCompactionBatch(batchId, compactInterval)) {
@@ -264,6 +335,33 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : ClassTag](
       }
     }
   }
+
+  private def readLine(in: InputStream): String = {
+    val line = new mutable.ArrayBuffer[Byte]()
+    var eol = false
+    while (!eol) {
+      val b = in.read()
+      if (b == -1 || b == '\n') {
+        eol = true
+      } else {
+        line += b.toByte
+      }
+    }
+
+    new String(line.toArray, UTF_8)
+  }
+
+  private def compressStream(outputStream: OutputStream): DataOutputStream = {
+    // set syncFlush to true since we don't call close for compressed stream but call flush instead
+    val compressed = new LZ4CompressionCodec(sparkConf)
+      .compressedOutputStream(outputStream, syncFlush = true)
+    new DataOutputStream(compressed)
+  }
+
+  private def decompressStream(inputStream: InputStream): DataInputStream = {
+    val compressed = new LZ4CompressionCodec(sparkConf).compressedInputStream(inputStream)
+    new DataInputStream(compressed)
+  }
 }
 
 object CompactibleFileStreamLog {
```
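Taken together, the V2 layout is the version header line (`v2`), a newline, then an LZ4-compressed stream of `[int length][entry bytes]` records terminated by a length of `-1`. A standalone sketch of that framing, using string payloads as stand-ins for `UnsafeRow` bytes — illustrative only, not the commit's code:

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

import org.apache.spark.SparkConf
import org.apache.spark.io.LZ4CompressionCodec

val codec = new LZ4CompressionCodec(new SparkConf())

// Write: "v2", newline, then compressed length-prefixed records plus a -1 sentinel.
val bytes = new ByteArrayOutputStream()
bytes.write("v2".getBytes("UTF-8"))
bytes.write('\n')
val dos = new DataOutputStream(codec.compressedOutputStream(bytes, syncFlush = true))
Seq("entry-1", "entry-2").foreach { payload => // stand-ins for UnsafeRow bytes
  val b = payload.getBytes("UTF-8")
  dos.writeInt(b.length)
  dos.write(b)
}
dos.writeInt(-1)
dos.flush() // flush, not close, mirroring serializeToV2

// Read: consume the header line byte-wise (as readLine does), then decompress
// and iterate until the -1 sentinel.
val in = new ByteArrayInputStream(bytes.toByteArray)
while (in.read() != '\n') {}
val dis = new DataInputStream(codec.compressedInputStream(in))
var size = dis.readInt()
while (size != -1) {
  val buf = new Array[Byte](size)
  dis.readFully(buf)
  println(new String(buf, "UTF-8")) // prints entry-1, then entry-2
  size = dis.readInt()
}
```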

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala
54 additions & 1 deletion

```diff
@@ -17,14 +17,24 @@
 
 package org.apache.spark.sql.execution.streaming
 
+import java.io.{DataInputStream, DataOutputStream, InputStream, IOException, OutputStream}
 import java.net.URI
+import java.nio.charset.StandardCharsets.UTF_8
 
+import scala.collection.mutable
+import scala.io.{Source => IOSource}
+
+import com.google.common.io.ByteStreams
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.json4s.NoTypeHints
 import org.json4s.jackson.Serialization
 
+import org.apache.spark.io.LZ4CompressionCodec
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{BooleanType, IntegerType, LongType, StringType, StructField, StructType}
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * The status of a file outputted by [[FileStreamSink]]. A file is visible only if it appears in
@@ -66,6 +76,41 @@ object SinkFileStatus {
   }
 }
 
+object SinkFileStatusV2 {
+  val SCHEMA = new StructType(
+    Array(
+      StructField("path", StringType),
+      StructField("size", LongType),
+      StructField("isDir", BooleanType),
+      StructField("modificationTime", LongType),
+      StructField("blockReplication", IntegerType),
+      StructField("blockSize", LongType),
+      StructField("action", StringType)
+    )
+  )
+
+  val PROJ_UNSAFE_ROW = UnsafeProjection.create(SCHEMA.fields.map(_.dataType))
+
+  def fromRow(row: UnsafeRow): SinkFileStatus = {
+    SinkFileStatus(
+      row.getString(0),
+      row.getLong(1),
+      row.getBoolean(2),
+      row.getLong(3),
+      row.getInt(4),
+      row.getLong(5),
+      row.getString(6)
+    )
+  }
+
+  def toRow(entry: SinkFileStatus): UnsafeRow = {
+    val row = new GenericInternalRow(Array[Any](
+      UTF8String.fromString(entry.path), entry.size, entry.isDir, entry.modificationTime,
+      entry.blockReplication, entry.blockSize, UTF8String.fromString(entry.action)))
+    PROJ_UNSAFE_ROW.apply(row).copy()
+  }
+}
+
 /**
  * A special log for [[FileStreamSink]]. It will write one log file for each batch. The first line
  * of the log file is the version number, and there are multiple JSON lines following. Each JSON
@@ -93,6 +138,8 @@ class FileStreamSinkLog(
   protected override val defaultCompactInterval =
     sparkSession.sessionState.conf.fileSinkLogCompactInterval
 
+  private val sparkConf = sparkSession.sparkContext.getConf
+
   require(defaultCompactInterval > 0,
     s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
       "to a positive value.")
@@ -105,10 +152,16 @@ class FileStreamSinkLog(
       logs.filter(f => !deletedFiles.contains(f.path))
     }
   }
+
+  override def dataToUnsafeRow(data: SinkFileStatus): UnsafeRow = SinkFileStatusV2.toRow(data)
+
+  override def unsafeRowToData(row: UnsafeRow): SinkFileStatus = SinkFileStatusV2.fromRow(row)
+
+  override def numFieldsForUnsafeRow: Int = SinkFileStatusV2.SCHEMA.fields.length
 }
 
 object FileStreamSinkLog {
-  val VERSION = 1
+  val VERSION = 2
   val DELETE_ACTION = "delete"
   val ADD_ACTION = "add"
 }
```
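A hypothetical round trip through the new converters; the field values below are made up, and `ADD_ACTION` comes from the `FileStreamSinkLog` companion:

```scala
import org.apache.spark.sql.execution.streaming.{FileStreamSinkLog, SinkFileStatus, SinkFileStatusV2}

val status = SinkFileStatus(
  "/output/part-00000",         // path (values are illustrative)
  1024L,                        // size
  false,                        // isDir
  1583020800000L,               // modificationTime
  3,                            // blockReplication
  134217728L,                   // blockSize
  FileStreamSinkLog.ADD_ACTION) // action

val row = SinkFileStatusV2.toRow(status)        // 7-field UnsafeRow per SCHEMA
assert(SinkFileStatusV2.fromRow(row) == status) // lossless round trip
```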

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala
26 additions & 1 deletion

```diff
@@ -29,12 +29,14 @@ import org.apache.hadoop.fs.{FileStatus, FileSystem, GlobFilter, Path}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
+import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.connector.read.streaming
 import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit, ReadMaxFiles, SupportsAdmissionControl}
 import org.apache.spark.sql.execution.datasources.{DataSource, InMemoryFileIndex, LogicalRelation}
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
+import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.ThreadUtils
 
 /**
@@ -313,6 +315,29 @@ object FileStreamSource {
 
   case class FileEntry(path: String, timestamp: Timestamp, batchId: Long) extends Serializable
 
+  object FileEntryV2 {
+    val SCHEMA = new StructType(
+      Array(
+        StructField("path", StringType),
+        StructField("timestamp", LongType),
+        StructField("batchId", LongType)
+      )
+    )
+
+    val PROJ_UNSAFE_ROW = UnsafeProjection.create(SCHEMA.fields.map(_.dataType))
+
+    def fromRow(row: UnsafeRow): FileEntry = {
+      FileEntry(row.getString(0), row.getLong(1), row.getLong(2))
+    }
+
+    def toRow(entry: FileEntry): UnsafeRow = {
+      val row = new GenericInternalRow(Array[Any](
+        UTF8String.fromString(entry.path), entry.timestamp, entry.batchId
+      ))
+      PROJ_UNSAFE_ROW.apply(row).copy()
+    }
+  }
+
   /**
    * A custom hash map used to track the list of files seen. This map is not thread-safe.
    *
```
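A matching hypothetical round trip for `FileEntry` (`timestamp` is `FileStreamSource.Timestamp`, an alias for `Long`; values are illustrative):

```scala
import org.apache.spark.sql.execution.streaming.FileStreamSource.{FileEntry, FileEntryV2}

val entry = FileEntry("/input/events-001.json", 1583020800000L, 7L)

val row = FileEntryV2.toRow(entry)        // 3-field UnsafeRow per FileEntryV2.SCHEMA
assert(FileEntryV2.fromRow(row) == entry) // lossless round trip
```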

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala
16 additions & 2 deletions

```diff
@@ -17,16 +17,22 @@
 
 package org.apache.spark.sql.execution.streaming
 
+import java.io.{DataInputStream, DataOutputStream, InputStream, IOException, OutputStream}
+import java.nio.charset.StandardCharsets.UTF_8
 import java.util.{LinkedHashMap => JLinkedHashMap}
 import java.util.Map.Entry
 
 import scala.collection.mutable
+import scala.io.{Source => IOSource}
 
+import com.google.common.io.ByteStreams
 import org.json4s.NoTypeHints
 import org.json4s.jackson.Serialization
 
+import org.apache.spark.io.LZ4CompressionCodec
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.execution.streaming.FileStreamSource.FileEntry
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.sql.execution.streaming.FileStreamSource.{FileEntry, FileEntryV2}
 import org.apache.spark.sql.internal.SQLConf
 
 class FileStreamSourceLog(
@@ -52,6 +58,8 @@ class FileStreamSourceLog(
 
   private implicit val formats = Serialization.formats(NoTypeHints)
 
+  private val sparkConf = sparkSession.sparkContext.getConf
+
   // A fixed size log entry cache to cache the file entries belong to the compaction batch. It is
   // used to avoid scanning the compacted log file to retrieve it's own batch data.
   private val cacheSize = compactInterval
@@ -122,8 +130,14 @@ class FileStreamSourceLog(
     }
     batches
   }
+
+  override def dataToUnsafeRow(data: FileEntry): UnsafeRow = FileEntryV2.toRow(data)
+
+  override def unsafeRowToData(row: UnsafeRow): FileEntry = FileEntryV2.fromRow(row)
+
+  override def numFieldsForUnsafeRow: Int = FileEntryV2.SCHEMA.fields.length
 }
 
 object FileStreamSourceLog {
-  val VERSION = 1
+  val VERSION = 2
 }
```

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala
12 additions & 9 deletions

```diff
@@ -240,6 +240,16 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
    * "v123xyz" etc.)
    */
   private[sql] def validateVersion(text: String, maxSupportedVersion: Int): Int = {
+    val version = parseVersion(text)
+    if (version > maxSupportedVersion) {
+      throw new IllegalStateException(s"UnsupportedLogVersion: maximum supported log version " +
+        s"is v${maxSupportedVersion}, but encountered v$version. The log file was produced " +
+        s"by a newer version of Spark and cannot be read by this version. Please upgrade.")
+    }
+    version
+  }
+
+  private[sql] def parseVersion(text: String): Int = {
     if (text.length > 0 && text(0) == 'v') {
       val version =
         try {
@@ -249,15 +259,8 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
           throw new IllegalStateException(s"Log file was malformed: failed to read correct log " +
             s"version from $text.")
         }
-      if (version > 0) {
-        if (version > maxSupportedVersion) {
-          throw new IllegalStateException(s"UnsupportedLogVersion: maximum supported log version " +
-            s"is v${maxSupportedVersion}, but encountered v$version. The log file was produced " +
-            s"by a newer version of Spark and cannot be read by this version. Please upgrade.")
-        } else {
-          return version
-        }
-      }
+
+      if (version > 0) return version
     }
 
     // reaching here means we failed to read the correct log version
```