Revert "[CARMEL-7385][CARMEL-6381] Remove unnecessary sql metrics for UnionExec" (apache#129)

wangyum · GitHub Enterprise · commit b8dccff99dc8 · 2024-01-15T23:38:29.000-06:00
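Reverts carmel/ebay-spark#119

Summary of what this revert changes:
- Removes the `spark.executor.metrics.send.updated.exceptFistPart` config entry
  (`EXECUTOR_METRICS_SENT_UPDATED_EXCEPT_FIRST_PART_ENABLED`), so `Task` again
  reports every external accumulator, filtered only by
  `!taskFailed || a.countFailedValues` (zero-valued ones included).
- Restores the `accumUpdates` field on `CompletionEvent`; `DAGScheduler.updateAccumulators`
  takes the whole event again instead of `(task, accumUpdates, taskInfo)`.
- Inlines the `serializeTaskBinaries` helper back into its call site in `DAGScheduler`.
- Drops the `TaskContextSuite` test "Only first partition updated external accumulators
  will be sent back to driver" and updates `DAGSchedulerSuite` for the restored
  `CompletionEvent` signature.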
diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -296,18 +296,6 @@ package object config {
       .stringConf
       .createWithDefaultString("file,hdfs")
 
-  private[spark] val EXECUTOR_METRICS_SENT_UPDATED_EXCEPT_FIRST_PART_ENABLED =
-    ConfigBuilder("spark.executor.metrics.send.updated.exceptFistPart")
-      .doc("Only sent updated metrics to driver side for all tasks except the first " +
-        "partition, the first partition will send back all metrics, because some metrics " +
-        "like sql related metrics is needed from driver side even it is zero, but only " +
-        "one partition send back the zero metrics is good enough, that will save lots " +
-        "of driver memory especially for union rdds, which contains lots of unused metrics " +
-        "for each task.")
-      .version("3.5.0")
-      .booleanConf
-      .createWithDefault(true)
-
   private[spark] val EXECUTOR_JAVA_OPTIONS =
     ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS)
       .withPrepended(SparkLauncher.EXECUTOR_DEFAULT_JAVA_OPTIONS)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -455,7 +455,8 @@ private[spark] class DAGScheduler(
       prevShuffleSize.getAndAdd(currTaskShuffleSize)
     }
 
-    val event = CompletionEvent(task, reason, result, lightTaskMetrics, metricPeaks, taskInfo)
+    val event = CompletionEvent(task, reason, result,
+      lightAccumUpdates, lightTaskMetrics, metricPeaks, taskInfo)
     val stageOpt = stageIdToStage.get(task.stageId)
     if (stageOpt.isEmpty) {
       // The stage may have already finished when we get this event -- eg. maybe it was a
@@ -482,15 +483,14 @@ private[spark] class DAGScheduler(
               case Some(job) =>
                 // Only update the accumulator once for each result task.
                 if (!job.finished(rt.outputId)) {
-                  updateAccumulators(event.task, lightAccumUpdates, event.taskInfo)
+                  updateAccumulators(event)
                 }
               case None => // Ignore update if task's job has finished.
             }
           case _ =>
-            updateAccumulators(event.task, lightAccumUpdates, event.taskInfo)
+            updateAccumulators(event)
         }
-      case _: ExceptionFailure | _: TaskKilled =>
-        updateAccumulators(event.task, lightAccumUpdates, event.taskInfo)
+      case _: ExceptionFailure | _: TaskKilled => updateAccumulators(event)
       case _ =>
     }
 
@@ -504,7 +504,7 @@ private[spark] class DAGScheduler(
 
     val taskMetricsForDAG: TaskMetrics = taskMetricsFromAccumulators(accumUpdatesForDAG)
     val eventForDAGScheduler = CompletionEvent(task, reason, result,
-      taskMetricsForDAG, metricPeaks, taskInfo)
+      accumUpdatesForDAG, taskMetricsForDAG, metricPeaks, taskInfo)
     eventProcessLoop.post(eventForDAGScheduler)
   }
 
@@ -1844,7 +1844,14 @@ private[spark] class DAGScheduler(
       // this synchronization in case another concurrent job is checkpointing this RDD, so we get a
       // consistent view of both variables.
       RDDCheckpointData.synchronized {
-        taskBinaryBytes = serializeTaskBinaries(stage)
+        taskBinaryBytes = stage match {
+          case stage: ShuffleMapStage =>
+            JavaUtils.bufferToArray(
+              closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
+          case stage: ResultStage =>
+            JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
+        }
+
         partitions = stage.rdd.partitions
       }
     } catch {
@@ -1905,17 +1912,6 @@ private[spark] class DAGScheduler(
     }
   }
 
-  private[scheduler] def serializeTaskBinaries(stage: Stage): Array[Byte] = {
-    val taskBinaries = stage match {
-      case stage: ShuffleMapStage =>
-        JavaUtils.bufferToArray(
-          closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
-      case stage: ResultStage =>
-        JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
-    }
-    taskBinaries
-  }
-
   private[scheduler] def handleSubmitMissingTask(missingTask: SubmitMissingTask): Unit = {
     logDebug("submitMissingTasks(" + missingTask.stage + ")")
     if (missingTask.taskBinary == null) {
@@ -2017,13 +2013,11 @@ private[spark] class DAGScheduler(
    * This still doesn't stop the caller from updating the accumulator outside the scheduler,
    * but that's not our problem since there's nothing we can do about that.
    */
-  private def updateAccumulators(
-      task: Task[_],
-      accumUpdates: Seq[AccumulatorV2[_, _]],
-      taskInfo: TaskInfo): Unit = {
+  private def updateAccumulators(event: CompletionEvent): Unit = {
+    val task = event.task
     val stage = stageIdToStage(task.stageId)
 
-   accumUpdates.foreach { updates =>
+    event.accumUpdates.foreach { updates =>
       val id = updates.id
       try {
         // Find the corresponding accumulator on the driver and update it
@@ -2038,8 +2032,8 @@ private[spark] class DAGScheduler(
         // To avoid UI cruft, ignore cases where value wasn't updated
         if (acc.name.isDefined && !updates.isZero) {
           stage.latestInfo.accumulables(id) = acc.toInfo(None, Some(acc.value))
-          taskInfo.setAccumulables(
-            acc.toInfo(Some(updates.value), Some(acc.value)) +: taskInfo.accumulables)
+          event.taskInfo.setAccumulables(
+            acc.toInfo(Some(updates.value), Some(acc.value)) +: event.taskInfo.accumulables)
         }
       } catch {
         case NonFatal(e) =>
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala
@@ -23,7 +23,7 @@ import org.apache.spark._
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.rdd.RDD
-import org.apache.spark.util.CallSite
+import org.apache.spark.util.{AccumulatorV2, CallSite}
 
 /**
  * Types of events that can be handled by the DAGScheduler. The DAGScheduler uses an event queue
@@ -84,6 +84,7 @@ private[scheduler] case class CompletionEvent(
     task: Task[_],
     reason: TaskEndReason,
     result: Any,
+    accumUpdates: Seq[AccumulatorV2[_, _]],
     taskMetrics: TaskMetrics,
     metricPeaks: Array[Long],
     taskInfo: TaskInfo)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -19,9 +19,10 @@ package org.apache.spark.scheduler
 
 import java.nio.ByteBuffer
 import java.util.Properties
+
 import org.apache.spark._
 import org.apache.spark.executor.TaskMetrics
-import org.apache.spark.internal.config.{APP_CALLER_CONTEXT, EXECUTOR_METRICS_SENT_UPDATED_EXCEPT_FIRST_PART_ENABLED}
+import org.apache.spark.internal.config.APP_CALLER_CONTEXT
 import org.apache.spark.internal.plugin.PluginContainer
 import org.apache.spark.memory.{MemoryMode, TaskMemoryManager}
 import org.apache.spark.metrics.MetricsSystem
@@ -209,27 +210,12 @@ private[spark] abstract class Task[T](
       context.taskMetrics.nonZeroInternalAccums() ++
         // zero value external accumulators may still be useful, e.g. SQLMetrics, we should not
         // filter them out.
-      collectExternalAccumUpdates(context.taskMetrics.externalAccums, taskFailed)
+        context.taskMetrics.externalAccums.filter(a => !taskFailed || a.countFailedValues)
     } else {
       Seq.empty
     }
   }
 
-  private def collectExternalAccumUpdates(
-      extAccumUpdates: Seq[AccumulatorV2[_, _]], taskFailed: Boolean): Seq[AccumulatorV2[_, _]] = {
-    val sentOnlyUpdatedMetricsExceptFirstPart =
-      SparkEnv.get.conf.get(EXECUTOR_METRICS_SENT_UPDATED_EXCEPT_FIRST_PART_ENABLED)
-    extAccumUpdates.filter { a =>
-      var filter = !taskFailed || a.countFailedValues
-      // only send all metrics for the first part
-      // and send only updated metrics for other partitions
-      if (sentOnlyUpdatedMetricsExceptFirstPart && partitionId != 0) {
-        filter = filter && !a.isZero
-      }
-      filter
-    }
-  }
-
   /**
    * Kills a task by setting the interrupted flag to true. This relies on the upper level Spark
    * code and user code to properly handle the flag. This function should be idempotent so it can
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -4974,7 +4974,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
       } else {
         null
       }
-    CompletionEvent(task, reason, result, taskMetrics, metricPeaks, taskInfo)
+    CompletionEvent(task, reason, result, allAccumUpdates, taskMetrics, metricPeaks, taskInfo)
   }
 }
 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
@@ -669,48 +669,6 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark
     assert(invocationOrder === Seq("C", "B", "A", "D"))
   }
 
-  test("Only first partition updated external accumulators will be sent back to driver") {
-    sc = new SparkContext("local", "test")
-    // Create a dummy task. We won't end up running this; we just want to collect
-    // accumulator updates from it.
-    val taskMetrics1 = TaskMetrics.registered
-    val ext1 = new LongAccumulator
-    ext1.register(sc, Some("extAccum1"))
-    taskMetrics1.registerAccumulator(ext1)
-    val task1 = new Task[Int](0, 0, 0, 1, JobArtifactSet.getActiveOrDefault(sc)) {
-      context = new TaskContextImpl(0, 0, 0, 0L, 0, 1,
-        new TaskMemoryManager(SparkEnv.get.memoryManager, 0L),
-        new Properties,
-        SparkEnv.get.metricsSystem,
-        taskMetrics1)
-
-      override def runTask(tc: TaskContext): Int = 0
-    }
-    val updatedAccums = task1.collectAccumulatorUpdates()
-    assert(updatedAccums.length == 2)
-    assert(updatedAccums(0).name == Some(InternalAccumulator.RESULT_SIZE))
-    assert(updatedAccums(0).value == 0)
-    assert(updatedAccums(1).name == Some("extAccum1"))
-    assert(updatedAccums(1).value == 0)
-
-    val taskMetrics2 = TaskMetrics.registered
-    val ext2 = new LongAccumulator
-    ext2.register(sc, Some("extAccum2"))
-    taskMetrics2.registerAccumulator(ext2)
-    val task2 = new Task[Int](0, 0, 1, 1, JobArtifactSet.getActiveOrDefault(sc)) {
-      context = new TaskContextImpl(0, 0, 1, 0L, 0, 1,
-        new TaskMemoryManager(SparkEnv.get.memoryManager, 0L),
-        new Properties,
-        SparkEnv.get.metricsSystem,
-        taskMetrics2)
-
-      override def runTask(tc: TaskContext): Int = 0
-    }
-    val updatedAccums2 = task2.collectAccumulatorUpdates()
-    // external accumulators won't be send back for the second partition
-    // when it is not updated
-    assert(updatedAccums2.length == 1)
-  }
 }
 
 private object TaskContextSuite {

Original file line number	Diff line number	Diff line change
`@@ -4974,7 +4974,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti`
`4974`	`4974`	`} else {`
`4975`	`4975`	`null`
`4976`	`4976`	`}`
`4977`		`- CompletionEvent(task, reason, result, taskMetrics, metricPeaks, taskInfo)`
	`4977`	`+ CompletionEvent(task, reason, result, allAccumUpdates, taskMetrics, metricPeaks, taskInfo)`
`4978`	`4978`	`}`
`4979`	`4979`	`}`
`4980`	`4980`