@@ -21,6 +21,7 @@ import org.apache.spark.internal.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.optimizer.JoinSelectionHelper
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._

@@ -173,7 +174,7 @@ object ScanOperation extends OperationHelper with PredicateHelper {
* Null-safe equality will be transformed into equality as joining key (replace null with default
* value).
*/
object ExtractEquiJoinKeys extends Logging with PredicateHelper {
object ExtractEquiJoinKeys extends Logging with PredicateHelper with JoinSelectionHelper {
/** (joinType, leftKeys, rightKeys, condition, leftChild, rightChild, joinHint) */
type ReturnType =
(JoinType, Seq[Expression], Seq[Expression],
@@ -205,7 +206,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper {
}
val otherPredicates = predicates.filterNot {
case EqualTo(l, r) if l.references.isEmpty || r.references.isEmpty => false
case Equality(l, r) =>
case Equality(l, r) if !hintToShuffleReplicateNL(hint) =>
canEvaluate(l, left) && canEvaluate(r, right) ||
canEvaluate(l, right) && canEvaluate(r, left)
case _ => false
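For context (this note is not part of the diff): with hintToShuffleReplicateNL(hint) returning true, an equality such as t1.key = t2.key is no longer extracted into leftKeys/rightKeys; it stays in otherPredicates, so it survives as the condition of the cartesian-product plan that the hint forces. Below is a minimal standalone sketch of that split, using made-up EqualityPred/OtherPred case classes in place of Catalyst expressions and a plain boolean in place of the hint check; names here are hypothetical, not Spark APIs.

object PredicateSplitSketch {
  // Stand-ins for Catalyst expressions; these types exist only for the sketch.
  sealed trait Pred
  case class EqualityPred(left: String, right: String) extends Pred
  case class OtherPred(sql: String) extends Pred

  // Mirrors the filterNot above with simplified types: an equality becomes a
  // join key only when the query is NOT hinted to use shuffle-replicate-NL.
  def otherPredicates(predicates: Seq[Pred], hintedReplicateNL: Boolean): Seq[Pred] =
    predicates.filterNot {
      case EqualityPred(_, _) if !hintedReplicateNL => true
      case _ => false
    }

  def main(args: Array[String]): Unit = {
    val preds = Seq(EqualityPred("t1.key", "t2.key"), OtherPred("t1.key < 2"))
    // Unhinted: the equality is pulled out as join keys; only the range predicate remains.
    assert(otherPredicates(preds, hintedReplicateNL = false) == Seq(OtherPred("t1.key < 2")))
    // Hinted: the equality stays with the other predicates, so the cartesian
    // product built for the hint still filters on t1.key = t2.key.
    assert(otherPredicates(preds, hintedReplicateNL = true) == preds)
  }
}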
sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala (27 additions, 0 deletions)
@@ -570,4 +570,31 @@ class JoinHintSuite extends PlanTest with SharedSparkSession with AdaptiveSparkP
assert(joinHints == expectedHints)
}
}

test("SPARK-32220: Non Cartesian Product Join Result Correct with SHUFFLE_REPLICATE_NL hint") {
Member: So, is this a correctness issue, @AngersZhuuuu?

Member: Yea, I think so. Nice catch, @AngersZhuuuu.

Contributor Author: Yea, when I tried the new join hint, I found the result was incorrect.
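To make the correctness point concrete (an illustration based on the test data below, not output quoted from the PR): t1 has 2 rows and t2 has 3, so the equi-join on key should return 3 rows. If the hinted plan drops the ON t1.key = t2.key condition while falling back to a cartesian product, it instead returns the full 2 × 3 = 6-row cross product, which is why each assertion compares the hinted and unhinted row counts.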

withTempView("t1", "t2") {
Seq((1, "4"), (2, "2")).toDF("key", "value").createTempView("t1")
Seq((1, "1"), (2, "12.3"), (2, "123")).toDF("key", "value").createTempView("t2")
val df1 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key = t2.key")
val df2 = sql("SELECT * from t1 join t2 ON t1.key = t2.key")
assert(df1.collect().size == df2.collect().size)

val df3 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2")
val df4 = sql("SELECT * from t1 join t2")
assert(df3.collect().size == df4.collect().size)

val df5 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key < t2.key")
val df6 = sql("SELECT * from t1 join t2 ON t1.key < t2.key")
assert(df5.collect().size == df6.collect().size)

val df7 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key < 2")
val df8 = sql("SELECT * from t1 join t2 ON t1.key < 2")
assert(df7.collect().size == df8.collect().size)


val df9 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t2.key < 2")
val df10 = sql("SELECT * from t1 join t2 ON t2.key < 2")
assert(df9.collect().size == df10.collect().size)
}
}
}