Commit 22feb41

Merge branch 'ESPARK-116' into 'spark_2.1'
[ESPARK-116] Fast small-file merging, supporting `ORC`, `RC`, `Parquet`, `TextFile`, and other file formats

1. For ORC and RC files, Hive's existing merge methods are called directly.
2. Parquet `1.8.2` already ships the fast-merge code, so the Parquet merge is implemented following the ORC file-merge approach.
3. For TextFile, the data blocks are read out and written to new files as-is, without any transformation.
4. For files in formats other than the four above, a `distribute by rand()` stage is appended during the Optimize phase to perform the merge (see the sketch after the commit summary below).
5. Both dynamic-partition and non-dynamic-partition writes are supported.
6. Files are merged sequentially, so their original order is preserved.
7. If the merge fails, the error is ignored and the pre-merge files are used as the result.
8. Merge throughput is close to raw disk read/write speed.

resolve apache#116 See merge request !77
2 parents 22e568f + ffab86a commit 22feb41

9 files changed

Lines changed: 728 additions & 15 deletions
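
To make point 4 of the commit message concrete, here is a minimal DataFrame-level sketch of the fallback. The session and table names are illustrative assumptions; in the patch itself the rewrite is applied to the logical plan by the `MergeSmallFiles` rule shown in the `HiveOptimizerRules.scala` diff below.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.rand

// Hypothetical session and table names, for illustration only.
val spark = SparkSession.builder()
  .enableHiveSupport()
  .getOrCreate()

// Redistributing rows by rand() before the insert is the DataFrame equivalent
// of appending DISTRIBUTE BY rand(), letting the post-shuffle coordinator
// decide how many output files get written.
spark.table("src_db.src_table")
  .repartition(rand())
  .write
  .mode("overwrite")
  .insertInto("dst_db.dst_table")
```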


sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 12 additions & 0 deletions
@@ -145,6 +145,16 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)

+  val MERGE_FILE_PER_TASK = SQLConfigBuilder("spark.sql.hive.merge.size.per.task")
+    .doc("The size of one file")
+    .bytesConf(ByteUnit.BYTE)
+    .createWithDefault(240 * 1024 * 1024)
+
+  val MERGE_SMALLFILE_SIZE = SQLConfigBuilder("spark.sql.hive.merge.smallfile.size")
+    .doc("The average size of smallfile")
+    .bytesConf(ByteUnit.BYTE)
+    .createWithDefault(80 * 1024 * 1024)
+
   val SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE =
     SQLConfigBuilder("spark.sql.adaptive.shuffle.targetPostShuffleInputSize")
       .doc("The target post-shuffle input size in bytes of a task.")
@@ -790,6 +800,8 @@ class SQLConf extends Serializable with Logging {

   def mergeHiveFiles: Boolean = getConf(MERGE_HIVEFILES)

+  def mergeFileSize: Long = getConf(MERGE_FILE_PER_TASK)
+
   def targetPostShuffleInputSize: Long =
     getConf(SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE)
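
These two settings drive the merge thresholds read later in `InsertIntoHiveTable`. A minimal sketch of setting them on a session follows; the values are illustrative, not recommendations.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .enableHiveSupport()
  .getOrCreate()

// Target size, in bytes, of each merged output file (default 240 MB).
spark.conf.set("spark.sql.hive.merge.size.per.task", 256L * 1024 * 1024)
// Files whose average size falls below this threshold are treated as small
// files eligible for merging (default 80 MB).
spark.conf.set("spark.sql.hive.merge.smallfile.size", 64L * 1024 * 1024)
```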

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala

Lines changed: 7 additions & 5 deletions
@@ -52,7 +52,7 @@ import org.apache.spark.sql.types._
  * When set to false, use standard format defined in parquet-format spec. This argument only
  * affects Parquet write path.
  */
-private[parquet] class ParquetSchemaConverter(
+private[spark] class ParquetSchemaConverter(
     assumeBinaryIsString: Boolean = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get,
     assumeInt96IsTimestamp: Boolean = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get,
     writeLegacyParquetFormat: Boolean = SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get) {
@@ -63,10 +63,12 @@ private[parquet] class ParquetSchemaConverter(
     writeLegacyParquetFormat = conf.writeLegacyParquetFormat)

   def this(conf: Configuration) = this(
-    assumeBinaryIsString = conf.get(SQLConf.PARQUET_BINARY_AS_STRING.key).toBoolean,
-    assumeInt96IsTimestamp = conf.get(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key).toBoolean,
-    writeLegacyParquetFormat = conf.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key,
-      SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get.toString).toBoolean)
+    assumeBinaryIsString = conf.getBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key,
+      SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get),
+    assumeInt96IsTimestamp = conf.getBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key,
+      SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get),
+    writeLegacyParquetFormat = conf.getBoolean(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key,
+      SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get))

   /**
    * Converts Parquet [[MessageType]] `parquetSchema` to a Spark SQL [[StructType]].

sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala

Lines changed: 14 additions & 2 deletions
@@ -35,6 +35,8 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {

   private def targetPostShuffleInputSize: Long = conf.targetPostShuffleInputSize

+  private def mergeFileSize: Long = conf.mergeFileSize * 4
+
   private def adaptiveExecutionEnabled: Boolean = conf.adaptiveExecutionEnabled

   private def minNumPostShufflePartitions: Option[Int] = {
@@ -95,8 +97,18 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
           minNumPostShufflePartitions)
       children.zip(requiredChildDistributions).map {
         case (e: ShuffleExchange, _) =>
-          // This child is an Exchange, we need to add the coordinator.
-          e.copy(coordinator = Some(coordinator))
+          if (e.newPartitioning.asInstanceOf[HashPartitioning]
+            .toString.contains("SparkMergeTask")) {
+            val mergeCoordinator =
+              new ExchangeCoordinator(
+                children.length,
+                mergeFileSize,
+                minNumPostShufflePartitions)
+            e.copy(coordinator = Some(mergeCoordinator))
+          } else {
+            // This child is an Exchange, we need to add the coordinator.
+            e.copy(coordinator = Some(coordinator))
+          }
         case (child, distribution) =>
           // If this child is not an Exchange, we need to add an Exchange for now.
           // Ideally, we can try to avoid this Exchange. However, when we reach here,

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveOptimizerRules.scala

Lines changed: 11 additions & 5 deletions
@@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.CreateTable
+import org.apache.spark.sql.hive.merge.MergeUtils

 case class DeterminePartitionedTableStats(sparkSession: SparkSession)
   extends Rule[LogicalPlan] with PredicateHelper {
@@ -107,18 +108,23 @@ case class MergeSmallFiles(sparkSession: SparkSession) extends Rule[LogicalPlan]
     plan transformDown {
       case InsertIntoTable(table: MetastoreRelation, partition,
         child, overwrite, ifNotExists) if !child.isInstanceOf[Sort] &&
-        !child.children.exists(a => a.isInstanceOf[RepartitionByExpression] &&
-        !a.isInstanceOf[Repartition]) && !table.databaseName.contains("temp") =>
-        val rand = Alias(new Rand(), "_nondeterministic")()
+        !child.children.exists(a => a.isInstanceOf[RepartitionByExpression] ||
+        a.isInstanceOf[Repartition] ||
+        a.isInstanceOf[Sort]) && !table.databaseName.contains("temp") &&
+        !MergeUtils.SUPPORTED_FORMAT.contains(table.tableDesc.getOutputFileFormatClassName) =>
+        val rand = Alias(new Rand(), "SparkMergeTask")()
         val newProjected = Project(child.output :+ rand, child)
         val mergeFileStage = RepartitionByExpression(Seq(rand.toAttribute), newProjected, None)
         val finalOutput = Project(child.output, mergeFileStage)
         InsertIntoTable(table: MetastoreRelation, partition, finalOutput, overwrite, ifNotExists)
       case CreateTable(tableDesc, mode, Some(query)) if tableDesc.provider.get == "hive" &&
         !query.isInstanceOf[Sort] &&
         !query.children.exists(a => a.isInstanceOf[RepartitionByExpression]
-        && !a.isInstanceOf[Repartition]) && !tableDesc.database.contains("temp") =>
-        CreateTable(tableDesc, mode, Some(RepartitionByExpression(Seq(new Rand()), query, None)))
+        || a.isInstanceOf[Repartition] ||
+        a.isInstanceOf[Sort]) && !tableDesc.database.contains("temp") &&
+        !MergeUtils.SUPPORTED_FORMAT.contains(tableDesc.storage.outputFormat.get) =>
+        CreateTable(tableDesc, mode, Some(RepartitionByExpression(
+          Seq(Alias(new Rand(), "SparkMergeTask")()), query, None)))
     }
   }
 }

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala

Lines changed: 112 additions & 3 deletions
@@ -23,17 +23,21 @@ import java.security.PrivilegedExceptionAction
 import java.text.SimpleDateFormat
 import java.util.{Date, Locale, Random}

+import scala.collection.JavaConverters._
 import scala.util.control.NonFatal

 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.hdfs.DistributedFileSystem
 import org.apache.hadoop.hive.common.FileUtils
-import org.apache.hadoop.hive.ql.exec.TaskRunner
 import org.apache.hadoop.hive.ql.ErrorMsg
+import org.apache.hadoop.hive.ql.exec.{TaskRunner, Utilities}
+import org.apache.hadoop.hive.ql.io.HiveOutputFormat
+import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapred.{FileOutputFormat, JobConf}
 import org.apache.hadoop.security.UserGroupInformation

+import org.apache.spark.SparkException
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.InternalRow
@@ -42,8 +46,9 @@ import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
 import org.apache.spark.sql.hive._
 import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
-import org.apache.spark.SparkException
-import org.apache.spark.util.SerializableJobConf
+import org.apache.spark.sql.hive.merge.MergeUtils
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.util.{RpcUtils, SerializableJobConf}


 /**
@@ -94,6 +99,14 @@ case class InsertIntoHiveTable(
   val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging")
   val scratchDir = hadoopConf.get("hive.exec.scratchdir", "/tmp/hive")

+  private val avgConditionSize = sqlContext.sparkSession.conf
+    .get(SQLConf.MERGE_SMALLFILE_SIZE)
+  private val outputAverageSize = sqlContext.sparkSession
+    .conf.get(SQLConf.MERGE_FILE_PER_TASK)
+  private val mergeHiveFiles = sqlContext.sparkSession.sessionState.conf.mergeHiveFiles
+  private val targetFileSize = Math.max(avgConditionSize, outputAverageSize)
+  private val retryWaitMs = RpcUtils.retryWaitMs(sqlContext.sparkContext.conf)
+
   private def executionId: String = {
     val rand: Random = new Random
     val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
@@ -222,6 +235,69 @@ case class InsertIntoHiveTable(
     new Path(getStagingDir(path), "-ext-10000") // Hive uses 10000
   }

+  private def mergeFile(
+      path: Path,
+      fs: FileSystem,
+      fileSinkConf: FileSinkDesc,
+      conf: SerializableJobConf,
+      directRenamePathList: java.util.List[String],
+      speculationEnabled: Boolean): Unit = {
+    val hiveOutputFormat = conf.value.getOutputFormat
+      .asInstanceOf[HiveOutputFormat[AnyRef, Writable]]
+    val extension = Utilities.getFileExtension(conf.value,
+      fileSinkConf.getCompressed, hiveOutputFormat)
+    val outputClassName = fileSinkConf.getTableInfo.getOutputFileFormatClassName
+    val outputDir = path.toString
+    val tmpMergeLocation = MergeUtils.getExternalMergeTmpPath(path, conf.value)
+    val tmpMergeLocationDir = tmpMergeLocation.toString
+    fileSinkConf.dir = tmpMergeLocation.toString
+    val waitTime = retryWaitMs
+    val numDynamicPartitions = partition.values.count(_.isEmpty)
+    if (numDynamicPartitions > 0) {
+      val mergeRules = MergeUtils.generateDynamicMergeRule(fs, path,
+        conf.value, avgConditionSize, targetFileSize, directRenamePathList)
+      sparkContext.union(mergeRules.map { r =>
+        val groupSize = Math.ceil(r.files.size * 1d / r.numFiles).toInt
+        val groupedFiles = r.files.toArray.grouped(groupSize).map(x => (r.path, x)).toArray
+        MergeUtils.mergePathRDD(sparkContext, groupedFiles, groupedFiles.size)
+      }).foreach { case (partOutputDir, files) =>
+        val tmpPartMergeLocationDir = partOutputDir.replace("-ext-10000", MergeUtils.TEMP_DIR)
+        MergeUtils.mergeAction(conf, outputClassName, files, partOutputDir, tmpPartMergeLocationDir,
+          extension, waitTime)
+      }
+      if (speculationEnabled) {
+        mergeRules.foreach { r =>
+          val specFiles = fs.listStatus(
+            new Path(r.path.toString.replace("-ext-10000", MergeUtils.TEMP_DIR)))
+            .filter(!_.getPath.getName.startsWith("part"))
+          specFiles.foreach(f => fs.delete(f.getPath))
+        }
+      }
+    } else {
+      val numFiles = MergeUtils.getTargetFileNum(path, conf.value,
+        avgConditionSize, targetFileSize)
+      if (numFiles > 0) {
+        val files = fs.listStatus(path).filter(_.getLen > 0).map(_.getPath.toString)
+        val groupSize = Math.ceil(files.size * 1d / numFiles).toInt
+        val groupedFiles = files.grouped(groupSize).toArray
+        fileSinkConf.dir = tmpMergeLocation.toString
+        sparkContext.parallelize(groupedFiles, groupedFiles.size).foreach { files =>
+          MergeUtils.mergeAction(conf, outputClassName, files, outputDir, tmpMergeLocationDir,
+            extension, waitTime)
+        }
+        if (speculationEnabled) {
+          val specFiles = fs.listStatus(tmpMergeLocation)
+            .filter(!_.getPath.getName.startsWith("part"))
+          specFiles.foreach(f => fs.delete(f.getPath))
+        }
+        if (conf.value.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
+          fs.createNewFile(new Path(tmpMergeLocationDir + "/_SUCCESS"))
+        }
+      }
+    }
+    FileOutputFormat.setOutputPath(conf.value, tmpMergeLocation)
+  }
+
   private def saveAsHiveFile(
       rdd: RDD[InternalRow],
       valueClass: Class[_],
@@ -310,6 +386,7 @@ case class InsertIntoHiveTable(
     }

     val jobConf = new JobConf(hadoopConf)
+    jobConf.set(MergeUtils.SCHEMA, table.attributes.toStructType.json)
     val jobConfSer = new SerializableJobConf(jobConf)

     // When speculation is on and output committer class name contains "Direct", we should warn
@@ -342,6 +419,38 @@ case class InsertIntoHiveTable(
     @transient val outputClass = writerContainer.newSerializer(table.tableDesc).getSerializedClass
     saveAsHiveFile(child.execute(), outputClass, fileSinkConf, jobConfSer, writerContainer)

+    val outputFormatClass = fileSinkConf.getTableInfo.getOutputFileFormatClassName
+    if (mergeHiveFiles && targetFileSize > 0 && !table.databaseName.contains("temp") &&
+      MergeUtils.SUPPORTED_FORMAT.contains(outputFormatClass)) {
+      val directRenamePathList = new java.util.ArrayList[String]()
+      val rollbackPathList = new java.util.ArrayList[String]()
+      val fs = tmpLocation.getFileSystem(jobConf)
+      try {
+        mergeFile(tmpLocation, fs, fileSinkConf, jobConfSer,
+          directRenamePathList, speculationEnabled)
+        if (!directRenamePathList.isEmpty) {
+          directRenamePathList.asScala.foreach { path =>
+            val destPath = path.replace("-ext-10000", MergeUtils.TEMP_DIR)
+            rollbackPathList.add(path)
+            logInfo("rename [" + path + " to " + destPath + "]")
+            fs.rename(new Path(path), new Path(destPath))
+          }
+        }
+      } catch {
+        case ex: Exception =>
+          logInfo("Merge file of " + tmpLocation + " failed!", ex)
+          fileSinkConf.dir = tmpLocation.toString
+          if (!rollbackPathList.isEmpty) {
+            rollbackPathList.asScala.foreach { path =>
+              val srcPath = path.replace("-ext-10000", MergeUtils.TEMP_DIR)
+              logInfo("rename [" + srcPath + " to "
+                + path + "]")
+              fs.rename(new Path(srcPath), new Path(path))
+            }
+          }
+      }
+    }
+
     val outputPath = FileOutputFormat.getOutputPath(jobConf)
     // TODO: Correctly set holdDDLTime.
     // In most of the time, we should have holdDDLTime = false.
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.merge
+
+import org.apache.hadoop.mapred.RecordReader
+
+case class StringWrapper(var file: String)
+
+class FileRecordReader(file: String) extends RecordReader[StringWrapper, StringWrapper] {
+  var isClose = false
+
+  override def next(k: StringWrapper, v: StringWrapper): Boolean = {
+    if (!isClose) {
+      k.file = file
+      v.file = file
+      isClose = true
+      true
+    } else {
+      false
+    }
+  }
+
+  override def getProgress: Float = if (isClose) 1 else 0
+
+  override def getPos: Long = if (isClose) 1 else 0
+
+  override def createKey(): StringWrapper = StringWrapper(file)
+
+  override def close(): Unit = {}
+
+  override def createValue(): StringWrapper = StringWrapper(file)
+}
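
A small usage sketch of the reader's one-record-per-file contract, assuming the class above is on the classpath; the path string is purely illustrative.

```scala
import org.apache.spark.sql.hive.merge.FileRecordReader

// Each FileRecordReader yields exactly one key/value pair, both wrapping the
// same file path, and then reports itself finished.
val reader = new FileRecordReader("/warehouse/db.db/tbl/part-00000")
val key = reader.createKey()
val value = reader.createValue()
assert(reader.next(key, value))     // first call returns true; key.file == value.file
assert(!reader.next(key, value))    // subsequent calls return false
assert(reader.getProgress == 1.0f)  // progress is reported as complete
reader.close()
```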
