@@ -18,6 +18,7 @@
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.execution.command.{DataWritingCommand, DataWritingCommandExec}
Member

Do we need DataWritingCommand here, too?

Contributor Author

Will remove it
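For reference, a minimal sketch of the trimmed import, assuming only the physical node ends up being referenced in the match below (a hypothetical follow-up, not part of this diff):

```scala
// Only DataWritingCommandExec is pattern-matched in SparkPlanInfo.fromSparkPlan,
// so the logical DataWritingCommand import can be dropped.
import org.apache.spark.sql.execution.command.DataWritingCommandExec
```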

import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.metric.SQLMetricInfo

@@ -60,6 +61,7 @@ private[execution] object SparkPlanInfo {
// dump the file scan metadata (e.g file path) to event log
val metadata = plan match {
case fileScan: FileSourceScanExec => fileScan.metadata
case writing: DataWritingCommandExec => writing.metadata
case _ => Map[String, String]()
}
new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan),
@@ -18,6 +18,7 @@
package org.apache.spark.sql.execution.command

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.Attribute
@@ -48,6 +49,8 @@ trait DataWritingCommand extends Command {
def outputColumns: Seq[Attribute] =
DataWritingCommand.logicalPlanOutputWithNames(query, outputColumnNames)

def outputPath: Option[Path]

lazy val metrics: Map[String, SQLMetric] = BasicWriteJobStatsTracker.metrics

def basicWriteJobStatsTracker(hadoopConf: Configuration): BasicWriteJobStatsTracker = {
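To illustrate the new abstract member, here is a minimal sketch of how a concrete command could satisfy it; the command name and constructor are hypothetical and only show the shape of an implementation:

```scala
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.catalyst.catalog.CatalogTable

// Hypothetical command writing into a catalog table: the output path is derived
// from the table's optional storage location. A real implementation would also
// extend DataWritingCommand and mark this as `override def outputPath`.
case class ExampleWriteCommand(table: CatalogTable) {
  def outputPath: Option[Path] = table.storage.locationUri.map(new Path(_))
}
```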
@@ -121,6 +121,22 @@ case class DataWritingCommandExec(cmd: DataWritingCommand, child: SparkPlan)
protected override def doExecute(): RDD[InternalRow] = {
sqlContext.sparkContext.parallelize(sideEffectResult, 1)
}

// Metadata that describes more details of this writing.
lazy val metadata: Map[String, String] = {
def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]")
val outputPath = cmd.outputPath match {
case Some(path) if path != null => path.toString
case _ => ""
}
val columnNames = cmd.outputColumnNames
val metadata =
Map(
"OutputColumnNames" -> seqToString(columnNames),
"OutputPath" -> outputPath
)
metadata
}
}

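As a rough illustration of what this exposes (table, column and path below are made up), test code inside the org.apache.spark.sql.execution package — SparkPlanInfo is private[execution] — could observe something like:

```scala
// Hypothetical values; the exact path depends on the warehouse location.
val qe = spark.sql("INSERT OVERWRITE TABLE t SELECT 1").queryExecution
val info = SparkPlanInfo.fromSparkPlan(qe.sparkPlan)
// info.metadata would then contain entries such as:
//   "OutputColumnNames" -> "[col_I]"
//   "OutputPath"        -> "file:/user/hive/warehouse/t"
```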
@@ -19,9 +19,10 @@ package org.apache.spark.sql.execution.command

import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.catalog._
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.datasources._
@@ -221,4 +222,6 @@ case class CreateDataSourceTableAsSelectCommand(
throw ex
}
}

override def outputPath: Option[Path] = table.storage.locationUri.map(new Path(_))
}
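For context, the locationUri-to-Path mapping used here (and in the Hive commands below) relies on Hadoop's Path(URI) constructor; a small self-contained sketch with a made-up location:

```scala
import java.net.URI
import org.apache.hadoop.fs.Path

// storage.locationUri is an Option[URI]; mapping through new Path(_) keeps it
// optional, yielding None for tables that have no explicit location.
val locationUri: Option[URI] = Some(new URI("hdfs://nn:8020/warehouse/db.db/t"))
val outputPath: Option[Path] = locationUri.map(new Path(_))
// outputPath == Some(new Path("hdfs://nn:8020/warehouse/db.db/t"))
```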
@@ -440,7 +440,7 @@ case class DataSource(
// ordering of data.logicalPlan (partition columns are all moved after data column). This
// will be adjusted within InsertIntoHadoopFsRelation.
InsertIntoHadoopFsRelationCommand(
outputPath = outputPath,
outputFsPath = outputPath,
Member

Could you undo this redundant change, @LantaoJin?

Contributor Author

This field would override outputPath in DataWritingCommand, but the return type is different (Path vs Option[Path]), so I renamed it.
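Roughly the clash being described, in a simplified sketch (the trait here is a stand-in, not the real DataWritingCommand definition):

```scala
import org.apache.hadoop.fs.Path

trait WritesToPath {
  def outputPath: Option[Path] // the new trait member, simplified
}

// A constructor parameter named `outputPath: Path` would be taken as the
// implementation of the trait member, but with an incompatible type, so it
// does not compile:
// case class BadCommand(outputPath: Path) extends WritesToPath

// Renaming the parameter avoids the clash and lets the command implement the
// member explicitly:
case class GoodCommand(outputFsPath: Path) extends WritesToPath {
  override def outputPath: Option[Path] = Some(outputFsPath)
}
```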

staticPartitions = Map.empty,
ifPartitionNotExists = false,
partitionColumns = partitionColumns.map(UnresolvedAttribute.quoted),
@@ -45,7 +45,7 @@ import org.apache.spark.sql.util.SchemaUtils
* Only valid for static partitions.
*/
case class InsertIntoHadoopFsRelationCommand(
outputPath: Path,
outputFsPath: Path,
staticPartitions: TablePartitionSpec,
ifPartitionNotExists: Boolean,
partitionColumns: Seq[Attribute],
@@ -64,12 +64,12 @@ case class InsertIntoHadoopFsRelationCommand(
// Most formats don't do well with duplicate columns, so lets not allow that
SchemaUtils.checkColumnNameDuplication(
outputColumnNames,
s"when inserting into $outputPath",
s"when inserting into $outputFsPath",
sparkSession.sessionState.conf.caseSensitiveAnalysis)

val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(options)
val fs = outputPath.getFileSystem(hadoopConf)
val qualifiedOutputPath = outputPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
val fs = outputFsPath.getFileSystem(hadoopConf)
val qualifiedOutputPath = outputFsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)

val partitionsTrackedByCatalog = sparkSession.sessionState.conf.manageFilesourcePartitions &&
catalogTable.isDefined &&
@@ -106,7 +106,7 @@ case class InsertIntoHadoopFsRelationCommand(
val committer = FileCommitProtocol.instantiate(
sparkSession.sessionState.conf.fileCommitProtocolClass,
jobId = java.util.UUID.randomUUID().toString,
outputPath = outputPath.toString,
outputPath = outputFsPath.toString,
dynamicPartitionOverwrite = dynamicPartitionOverwrite)

val doInsertion = (mode, pathExists) match {
@@ -184,7 +184,7 @@ case class InsertIntoHadoopFsRelationCommand(
// refresh cached files in FileIndex
fileIndex.foreach(_.refresh())
// refresh data cache if table is cached
sparkSession.catalog.refreshByPath(outputPath.toString)
sparkSession.catalog.refreshByPath(outputFsPath.toString)

if (catalogTable.nonEmpty) {
CommandUtils.updateTableStats(sparkSession, catalogTable.get)
@@ -261,4 +261,6 @@ case class InsertIntoHadoopFsRelationCommand(
}
}.toMap
}

override def outputPath: Option[Path] = Some(this.outputFsPath)
}
@@ -58,4 +58,13 @@ class SparkPlanSuite extends QueryTest with SharedSQLContext {
assert(SparkPlanInfo.fromSparkPlan(f.queryExecution.sparkPlan).metadata.nonEmpty)
}
}

test("SPARK-25421 DataWritingCommandExec should contains 'OutputPath' metadata") {
withTable("t") {
sql("CREATE TABLE t(col_I int) USING PARQUET")
val f = sql("INSERT OVERWRITE TABLE t SELECT 1")
assert(SparkPlanInfo.fromSparkPlan(f.queryExecution.sparkPlan).metadata
.contains("OutputPath"))
}
}
}
@@ -17,11 +17,11 @@

package org.apache.spark.sql.hive.execution

import org.apache.hadoop.fs.Path
import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand
@@ -100,4 +100,6 @@ case class CreateHiveTableAsSelectCommand(
s"TableName: ${tableDesc.identifier.table}, " +
s"InsertIntoHiveTable]"
}

override def outputPath: Option[Path] = tableDesc.storage.locationUri.map(new Path(_))
}
@@ -30,7 +30,6 @@ import org.apache.spark.SparkException
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.hive.client.HiveClientImpl
@@ -131,5 +130,7 @@ case class InsertIntoHiveDirCommand(

Seq.empty[Row]
}

override def outputPath: Option[Path] = storage.locationUri.map(new Path(_))
}

@@ -264,4 +264,6 @@ case class InsertIntoHiveTable(
isSrcLocal = false)
}
}

override def outputPath: Option[Path] = table.storage.locationUri.map(new Path(_))
}