Commit 3f6a2bb

JoshRosen authored and hvanhovell committed
[SPARK-17515] CollectLimit.execute() should perform per-partition limits
## What changes were proposed in this pull request?

CollectLimit.execute() incorrectly omits per-partition limits, leading to performance regressions when this code path is hit (which should not happen in normal operation, but can occur in some cases; see #15068 for one example).

## How was this patch tested?

Regression test in SQLQuerySuite that asserts the number of records scanned from the input RDD.

Author: Josh Rosen <[email protected]>

Closes #15070 from JoshRosen/SPARK-17515.
1 parent 46f5c20 commit 3f6a2bb

File tree

2 files changed: 11 additions, 1 deletion


sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala (2 additions, 1 deletion)

@@ -39,9 +39,10 @@ case class CollectLimitExec(limit: Int, child: SparkPlan) extends UnaryExecNode
   override def executeCollect(): Array[InternalRow] = child.executeTake(limit)
   private val serializer: Serializer = new UnsafeRowSerializer(child.output.size)
   protected override def doExecute(): RDD[InternalRow] = {
+    val locallyLimited = child.execute().mapPartitionsInternal(_.take(limit))
     val shuffled = new ShuffledRowRDD(
       ShuffleExchange.prepareShuffleDependency(
-        child.execute(), child.output, SinglePartition, serializer))
+        locallyLimited, child.output, SinglePartition, serializer))
     shuffled.mapPartitionsInternal(_.take(limit))
   }
 }
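For readers less familiar with the SQL execution internals, the same idea can be sketched with plain RDD operations. This is only an illustrative analog under assumed names (collectLimitSketch is hypothetical, and coalesce stands in for the internal single-partition ShuffledRowRDD), not code from this patch:

import scala.reflect.ClassTag
import org.apache.spark.rdd.RDD

// Illustrative sketch of the fix: take at most `limit` rows inside each
// partition first, so the shuffle to a single partition moves at most
// (numPartitions * limit) rows instead of the whole dataset, then apply
// the limit once more after the shuffle to get the final global result.
def collectLimitSketch[T: ClassTag](input: RDD[T], limit: Int): Array[T] = {
  val locallyLimited = input.mapPartitions(_.take(limit))
  locallyLimited
    .coalesce(numPartitions = 1, shuffle = true)
    .mapPartitions(_.take(limit))
    .collect()
}

With a 10-partition input and limit = 1, only about 10 rows reach the shuffle, which is exactly the behavior the regression test below asserts on.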

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala (9 additions, 0 deletions)

@@ -2661,4 +2661,13 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
         data.selectExpr("`part.col1`", "`col.1`"))
     }
   }
+
+  test("SPARK-17515: CollectLimit.execute() should perform per-partition limits") {
+    val numRecordsRead = spark.sparkContext.longAccumulator
+    spark.range(1, 100, 1, numPartitions = 10).map { x =>
+      numRecordsRead.add(1)
+      x
+    }.limit(1).queryExecution.toRdd.count()
+    assert(numRecordsRead.value === 10)
+  }
 }
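A note on the expected value: spark.range(1, 100, 1, numPartitions = 10) produces 99 rows spread across 10 partitions. With the per-partition limit in place, each partition's task pulls at most take(1) = one row through the map closure before the single-partition shuffle, so the accumulator should observe 10 records rather than all 99; without the fix, every row would have passed through the closure.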
