Skip to content

Commit 1beb40c

Browse files
Committed with message: "address comments"
1 parent 65fca4f commit 1beb40c

File tree

2 files changed

+9
-14
lines changed

2 files changed

+9
-14
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -114,18 +114,20 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper {
114114
// Deduplicate conflicting attributes if any.
115115
dedupJoin(Join(outerPlan, sub, LeftAnti, joinCond))
116116
case (p, InSubquery(values, ListQuery(sub, conditions, _, _))) =>
117+
// Deduplicate conflicting attributes if any.
117118
val newSub = dedupSubqueryOnSelfJoin(values, sub)
118119
val inConditions = values.zip(newSub.output).map(EqualTo.tupled)
119120
val (joinCond, outerPlan) = rewriteExistentialExpr(inConditions ++ conditions, p)
120-
// Deduplicate conflicting attributes if any.
121-
dedupJoin(Join(outerPlan, newSub, LeftSemi, joinCond))
121+
Join(outerPlan, newSub, LeftSemi, joinCond)
122122
case (p, Not(InSubquery(values, ListQuery(sub, conditions, _, _)))) =>
123123
// This is a NULL-aware (left) anti join (NAAJ) e.g. col NOT IN expr
124124
// Construct the condition. A NULL in one of the conditions is regarded as a positive
125125
// result; such a row will be filtered out by the Anti-Join operator.
126126

127127
// Note that will almost certainly be planned as a Broadcast Nested Loop join.
128128
// Use EXISTS if performance matters to you.
129+
130+
// Deduplicate conflicting attributes if any.
129131
val newSub = dedupSubqueryOnSelfJoin(values, sub)
130132
val inConditions = values.zip(newSub.output).map(EqualTo.tupled)
131133
val (joinCond, outerPlan) = rewriteExistentialExpr(inConditions, p)
@@ -142,8 +144,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper {
142144
// will have the final conditions in the LEFT ANTI as
143145
// (A.A1 = B.B1 OR ISNULL(A.A1 = B.B1)) AND (B.B2 = A.A2) AND B.B3 > 1
144146
val finalJoinCond = (nullAwareJoinConds ++ conditions).reduceLeft(And)
145-
// Deduplicate conflicting attributes if any.
146-
dedupJoin(Join(outerPlan, newSub, LeftAnti, Option(finalJoinCond)))
147+
Join(outerPlan, newSub, LeftAnti, Option(finalJoinCond))
147148
case (p, predicate) =>
148149
val (newCond, inputPlan) = rewriteExistentialExpr(Seq(predicate), p)
149150
Project(p.output, Filter(newCond.get, inputPlan))
@@ -170,11 +171,11 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper {
170171
exists
171172
case InSubquery(values, ListQuery(sub, conditions, _, _)) =>
172173
val exists = AttributeReference("exists", BooleanType, nullable = false)()
174+
// Deduplicate conflicting attributes if any.
173175
val newSub = dedupSubqueryOnSelfJoin(values, sub)
174176
val inConditions = values.zip(newSub.output).map(EqualTo.tupled)
175177
val newConditions = (inConditions ++ conditions).reduceLeftOption(And)
176-
// Deduplicate conflicting attributes if any.
177-
newPlan = dedupJoin(Join(newPlan, newSub, ExistenceJoin(exists), newConditions))
178+
newPlan = Join(newPlan, newSub, ExistenceJoin(exists), newConditions)
178179
exists
179180
}
180181
}

sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1284,14 +1284,8 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
12841284

12851285
test("SPARK-26078: deduplicate fake self joins for IN subqueries") {
12861286
withTempView("a", "b") {
1287-
def genTestViewWithName(name: String): Unit = {
1288-
val df = spark.createDataFrame(
1289-
spark.sparkContext.parallelize(Seq(Row("a", 2), Row("b", 1))),
1290-
StructType(Seq(StructField("id", StringType), StructField("num", IntegerType))))
1291-
df.createOrReplaceTempView(name)
1292-
}
1293-
genTestViewWithName("a")
1294-
genTestViewWithName("b")
1287+
Seq("a" -> 2, "b" -> 1).toDF("id", "num").createTempView("a")
1288+
Seq("a" -> 2, "b" -> 1).toDF("id", "num").createTempView("b")
12951289

12961290
val df1 = spark.sql(
12971291
"""

0 commit comments

Comments (0)