Skip to content
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,19 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
numPartitionsSet.headOption
}

val targetNumPartitions = requiredNumPartitions.getOrElse(childrenNumPartitions.max)
// Reading bucketed tables should always obey numShufflePartitions, because
// maxNumPostShufflePartitions is usually much larger than numShufflePartitions,
// which makes some bucket map joins lose their benefit once adaptive execution is enabled.
Copy link
Contributor

@cloud-fan cloud-fan Nov 14, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment is hard to understand. How about

If there are non-shuffle children that satisfy the required distribution, we have some tradeoffs
when picking the expected number of shuffle partitions:
1. we should avoid shuffling these children
2. we should have a reasonable parallelism

Here we pick the max number of partitions among these non-shuffle children as the expected number
of shuffle partitions. However, if it's smaller than `conf.numShufflePartitions`, we pick 
`conf.numShufflePartitions` as the expected number of shuffle partitions.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

// Partition counts of the selected children that are not shuffle exchanges.
val nonShuffleChildrenNumPartitions = childrenIndexes
  .map(children)
  .filter {
    case _: ShuffleExchangeExec => false
    case _ => true
  }
  .map(_.outputPartitioning.numPartitions)

// With at least one non-shuffle child, take the largest of their partition counts but never
// go below `conf.numShufflePartitions`; with none, fall back to the largest partition count
// among all the selected children.
val expectedChildrenNumPartitions =
  if (nonShuffleChildrenNumPartitions.isEmpty) {
    childrenNumPartitions.max
  } else {
    math.max(nonShuffleChildrenNumPartitions.max, conf.numShufflePartitions)
  }

// An explicitly required partition count, when present, takes precedence.
val targetNumPartitions = requiredNumPartitions.getOrElse(expectedChildrenNumPartitions)

children = children.zip(requiredChildDistributions).zipWithIndex.map {
case ((child, distribution), index) if childrenIndexes.contains(index) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning
import org.apache.spark.sql.execution.{DataSourceScanExec, SortExec}
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
import org.apache.spark.sql.execution.datasources.BucketingUtils
import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
import org.apache.spark.sql.execution.joins.SortMergeJoinExec
Expand Down Expand Up @@ -382,8 +383,16 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils {
joined.sort("bucketed_table1.k", "bucketed_table2.k"),
df1.join(df2, joinCondition(df1, df2), joinType).sort("df1.k", "df2.k"))

assert(joined.queryExecution.executedPlan.isInstanceOf[SortMergeJoinExec])
val joinOperator = joined.queryExecution.executedPlan.asInstanceOf[SortMergeJoinExec]
// Resolve the join operator to inspect. When adaptive execution is enabled the executed plan
// is cast to AdaptiveSparkPlanExec and its inner `executedPlan` is used; otherwise the
// executed plan is used directly. Either way it must be a SortMergeJoinExec.
val joinOperator = {
  val adaptiveEnabled = joined.sqlContext.conf.adaptiveExecutionEnabled
  val topLevelPlan = joined.queryExecution.executedPlan
  val executedPlan =
    if (adaptiveEnabled) {
      topLevelPlan.asInstanceOf[AdaptiveSparkPlanExec].executedPlan
    } else {
      topLevelPlan
    }
  assert(executedPlan.isInstanceOf[SortMergeJoinExec])
  executedPlan.asInstanceOf[SortMergeJoinExec]
}

// check existence of shuffle
assert(
Expand Down Expand Up @@ -795,4 +804,22 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils {
}
}

test("SPARK-29655 Read bucketed tables obeys spark.sql.shuffle.partitions") {
  // Fixed settings for the whole test: 5 shuffle partitions and at most 7 post-shuffle
  // partitions.
  val shuffleSettings = Seq(
    SQLConf.SHUFFLE_PARTITIONS.key -> "5",
    SQLConf.SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS.key -> "7")

  withSQLConf(shuffleSettings: _*) {
    // The left table is bucketed into 6 buckets on (i, j); the right table is not bucketed.
    val bucketSpec = Some(BucketSpec(6, Seq("i", "j"), Nil))

    // Run the same scenario with adaptive execution disabled and then enabled.
    for (adaptiveEnabled <- Seq(false, true)) {
      withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> adaptiveEnabled.toString) {
        // Only the non-bucketed (right) side is expected to be shuffled.
        val leftSpec = BucketedTableTestSpec(bucketSpec, expectedShuffle = false)
        val rightSpec = BucketedTableTestSpec(None, expectedShuffle = true)
        testBucketing(
          bucketedTableTestSpecLeft = leftSpec,
          bucketedTableTestSpecRight = rightSpec,
          joinCondition = joinCondition(Seq("i", "j"))
        )
      }
    }
  }
}
}