@@ -21,6 +21,7 @@ import org.apache.spark.internal.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.optimizer.JoinSelectionHelper
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._

@@ -173,7 +174,7 @@ object ScanOperation extends OperationHelper with PredicateHelper {
* Null-safe equality will be transformed into equality as joining key (replace null with default
* value).
*/
object ExtractEquiJoinKeys extends Logging with PredicateHelper {
object ExtractEquiJoinKeys extends Logging with PredicateHelper with JoinSelectionHelper {
/** (joinType, leftKeys, rightKeys, condition, leftChild, rightChild, joinHint) */
type ReturnType =
(JoinType, Seq[Expression], Seq[Expression],
@@ -205,7 +206,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper {
}
val otherPredicates = predicates.filterNot {
case EqualTo(l, r) if l.references.isEmpty || r.references.isEmpty => false
case Equality(l, r) =>
case Equality(l, r) if !hintToShuffleReplicateNL(hint) =>
canEvaluate(l, left) && canEvaluate(r, right) ||
canEvaluate(l, right) && canEvaluate(r, left)
case _ => false
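For context (this note is not part of the diff): with hintToShuffleReplicateNL(hint) returning true, an equality such as t1.key = t2.key is no longer extracted into leftKeys/rightKeys; it stays in otherPredicates, so it survives as the condition of the cartesian-product plan that the hint forces. Below is a minimal standalone sketch of that split, using made-up EqualityPred/OtherPred case classes in place of Catalyst expressions and a plain boolean in place of the hint check; names here are hypothetical, not Spark APIs.

object PredicateSplitSketch {
  // Stand-ins for Catalyst expressions; these types exist only for the sketch.
  sealed trait Pred
  case class EqualityPred(left: String, right: String) extends Pred
  case class OtherPred(sql: String) extends Pred

  // Mirrors the filterNot above with simplified types: an equality becomes a
  // join key only when the query is NOT hinted to use shuffle-replicate-NL.
  def otherPredicates(predicates: Seq[Pred], hintedReplicateNL: Boolean): Seq[Pred] =
    predicates.filterNot {
      case EqualityPred(_, _) if !hintedReplicateNL => true
      case _ => false
    }

  def main(args: Array[String]): Unit = {
    val preds = Seq(EqualityPred("t1.key", "t2.key"), OtherPred("t1.key < 2"))
    // Unhinted: the equality is pulled out as join keys; only the range predicate remains.
    assert(otherPredicates(preds, hintedReplicateNL = false) == Seq(OtherPred("t1.key < 2")))
    // Hinted: the equality stays with the other predicates, so the cartesian
    // product built for the hint still filters on t1.key = t2.key.
    assert(otherPredicates(preds, hintedReplicateNL = true) == preds)
  }
}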
sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala (27 additions, 0 deletions)
@@ -570,4 +570,31 @@ class JoinHintSuite extends PlanTest with SharedSparkSession with AdaptiveSparkP
assert(joinHints == expectedHints)
}
}

test("SPARK-32220: Non Cartesian Product Join Result Correct with SHUFFLE_REPLICATE_NL hint") {
Member: So, is this a correctness issue, @AngersZhuuuu?

Member: Yea, I think so. Nice catch, @AngersZhuuuu.

Contributor Author: Yea, when I tried the new join hint, I found the result was incorrect.
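To make the correctness point concrete (an illustration based on the test data below, not output quoted from the PR): t1 has 2 rows and t2 has 3, so the equi-join on key should return 3 rows. If the hinted plan drops the ON t1.key = t2.key condition while falling back to a cartesian product, it instead returns the full 2 × 3 = 6-row cross product, which is why each assertion compares the hinted and unhinted row counts.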

withTempView("t1", "t2") {
Seq((1, "4"), (2, "2")).toDF("key", "value").createTempView("t1")
Seq((1, "1"), (2, "12.3"), (2, "123")).toDF("key", "value").createTempView("t2")
val df1 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key = t2.key")
val df2 = sql("SELECT * from t1 join t2 ON t1.key = t2.key")
assert(df1.collect().size == df2.collect().size)

val df3 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2")
val df4 = sql("SELECT * from t1 join t2")
assert(df3.collect().size == df4.collect().size)

val df5 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key < t2.key")
val df6 = sql("SELECT * from t1 join t2 ON t1.key < t2.key")
assert(df5.collect().size == df6.collect().size)

val df7 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key < 2")
val df8 = sql("SELECT * from t1 join t2 ON t1.key < 2")
assert(df7.collect().size == df8.collect().size)


val df9 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t2.key < 2")
val df10 = sql("SELECT * from t1 join t2 ON t2.key < 2")
assert(df9.collect().size == df10.collect().size)
}
}
}