
Commit 2340afe

review

1 parent b0b5531 commit 2340afe

3 files changed: +39 -58 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala

Lines changed: 27 additions & 51 deletions

@@ -326,48 +326,29 @@ object TypeCoercion {
    *
    * This rule is only applied to Union/Except/Intersect
    */
-  object WidenSetOperationTypes extends Rule[LogicalPlan] {
-
-    def apply(plan: LogicalPlan): LogicalPlan = {
-      val exprIdMapArray = mutable.ArrayBuffer[(ExprId, Attribute)]()
-      val newPlan = plan resolveOperatorsUp {
-        case s @ Except(left, right, isAll) if s.childrenResolved &&
-            left.output.length == right.output.length && !s.resolved =>
-          val (newChildren, newExprIds) = buildNewChildrenWithWiderTypes(left :: right :: Nil)
-          exprIdMapArray ++= newExprIds
-          assert(newChildren.length == 2)
-          Except(newChildren.head, newChildren.last, isAll)
-
-        case s @ Intersect(left, right, isAll) if s.childrenResolved &&
-            left.output.length == right.output.length && !s.resolved =>
-          val (newChildren, newExprIds) = buildNewChildrenWithWiderTypes(left :: right :: Nil)
-          exprIdMapArray ++= newExprIds
-          assert(newChildren.length == 2)
-          Intersect(newChildren.head, newChildren.last, isAll)
-
-        case s: Union if s.childrenResolved && !s.byName &&
+  object WidenSetOperationTypes extends TypeCoercionRule {
+
+    override def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp {
+      case s @ Except(left, right, isAll) if s.childrenResolved &&
+          left.output.length == right.output.length && !s.resolved =>
+        val newChildren: Seq[LogicalPlan] = buildNewChildrenWithWiderTypes(left :: right :: Nil)
+        assert(newChildren.length == 2)
+        Except(newChildren.head, newChildren.last, isAll)
+
+      case s @ Intersect(left, right, isAll) if s.childrenResolved &&
+          left.output.length == right.output.length && !s.resolved =>
+        val newChildren: Seq[LogicalPlan] = buildNewChildrenWithWiderTypes(left :: right :: Nil)
+        assert(newChildren.length == 2)
+        Intersect(newChildren.head, newChildren.last, isAll)
+
+      case s: Union if s.childrenResolved && !s.byName &&
          s.children.forall(_.output.length == s.children.head.output.length) && !s.resolved =>
-          val (newChildren, newExprIds) = buildNewChildrenWithWiderTypes(s.children)
-          exprIdMapArray ++= newExprIds
-          s.copy(children = newChildren)
-      }
-
-      // Re-maps existing references to the new ones (exprId and dataType)
-      // for aliases added when widening columns' data types.
-      val exprIdMap = exprIdMapArray.toMap
-      newPlan resolveOperatorsUp {
-        case p if p.childrenResolved && p.missingInput.nonEmpty =>
-          p.mapExpressions { _.transform {
-            case a: AttributeReference if p.missingInput.contains(a) &&
-                exprIdMap.contains(a.exprId) => exprIdMap(a.exprId)
-          }
-        }
-      }
+        val newChildren: Seq[LogicalPlan] = buildNewChildrenWithWiderTypes(s.children)
+        s.copy(children = newChildren)
     }

     /** Build new children with the widest types for each attribute among all the children */
-    private def buildNewChildrenWithWiderTypes(children: Seq[LogicalPlan])
-      : (Seq[LogicalPlan], Seq[(ExprId, Attribute)]) = {
+    private def buildNewChildrenWithWiderTypes(children: Seq[LogicalPlan]): Seq[LogicalPlan] = {
       require(children.forall(_.output.length == children.head.output.length))

       // Get a sequence of data types, each of which is the widest type of this specific attribute

@@ -377,11 +358,10 @@ object TypeCoercion {

       if (targetTypes.nonEmpty) {
         // Add an extra Project if the targetTypes are different from the original types.
-        val (newChildren, newExprIds) = children.map(widenTypes(_, targetTypes)).unzip
-        (newChildren, newExprIds.flatten)
+        children.map(widenTypes(_, targetTypes))
       } else {
         // Unable to find a target type to widen, then just return the original set.
-        (children, Nil)
+        children
       }
     }

@@ -405,16 +385,12 @@ object TypeCoercion {
     }

     /** Given a plan, add an extra project on top to widen some columns' data types. */
-    private def widenTypes(plan: LogicalPlan, targetTypes: Seq[DataType])
-      : (LogicalPlan, Seq[(ExprId, Attribute)]) = {
-      val (casted, newExprIds) = plan.output.zip(targetTypes).map {
-        case (e, dt) if e.dataType != dt =>
-          val alias = Alias(Cast(e, dt), e.name)()
-          (alias, Some(e.exprId -> alias.toAttribute))
-        case (e, _) =>
-          (e, None)
-      }.unzip
-      (Project(casted, plan), newExprIds.flatten)
+    private def widenTypes(plan: LogicalPlan, targetTypes: Seq[DataType]): LogicalPlan = {
+      val casted = plan.output.zip(targetTypes).map {
+        case (e, dt) if e.dataType != dt => Alias(Cast(e, dt), e.name)(exprId = e.exprId)
+        case (e, _) => e
+      }
+      Project(casted, plan)
     }
   }
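The substantive change is in widenTypes: the widening Alias now reuses the child attribute's exprId via Alias(Cast(e, dt), e.name)(exprId = e.exprId) instead of minting a fresh one, so references above the set operation stay valid by construction. That is what makes the whole exprId-remapping pass (exprIdMapArray and the second resolveOperatorsUp) unnecessary and lets the rule shrink to a plain TypeCoercionRule. To see the widening itself, here is a minimal sketch, assuming a spark-shell session with spark.implicits._ in scope; the names intSide and doubleSide are illustrative:

// UNION of an IntegerType column with a DoubleType column: the rule
// inserts a Project with a Cast above the narrower side.
val intSide = Seq(1, 2).toDF("v")    // v: int
val doubleSide = Seq(3.5).toDF("v")  // v: double

val unioned = intSide.union(doubleSide)
unioned.printSchema()  // v: double -- the integer column was widened
unioned.explain(true)  // analyzed plan shows Project [cast(v as double) AS v]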

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 2 additions & 1 deletion

@@ -477,7 +477,8 @@ object RemoveRedundantAliases extends Rule[LogicalPlan] {
 object RemoveNoopOperators extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     // Eliminate no-op Projects
-    case p @ Project(_, child) if child.sameOutput(p) => child
+    case Project(projList, child) if projList.length == child.output.length &&
+      projList.zip(child.output).forall { case (e1, e2) => e1.semanticEquals(e2) } => child

     // Eliminate no-op Window
     case w: Window if w.windowExpressions.isEmpty => w.child
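Why the stricter guard: once WidenSetOperationTypes reuses child exprIds, a Project can expose exactly the same exprIds as its child while still doing real work (a Cast). The old check, child.sameOutput(p), looks only at the output attributes the Project exposes; the new one compares every project-list expression against the corresponding child attribute with semanticEquals, position by position. A hedged sketch of the distinction, built directly against the Catalyst expression classes:

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Cast}
import org.apache.spark.sql.types.{DoubleType, IntegerType}

val a = AttributeReference("a", IntegerType)()
// A widening alias as now produced by WidenSetOperationTypes: it reuses a's exprId.
val widened = Alias(Cast(a, DoubleType), "a")(exprId = a.exprId)

widened.toAttribute.exprId == a.exprId  // true: exprId-level identity is preserved
widened.semanticEquals(a)               // false: the expression level still sees the Cast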

sql/core/src/main/scala/org/apache/spark/sql/execution/RemoveRedundantProjects.scala

Lines changed: 10 additions & 6 deletions

@@ -17,12 +17,11 @@

 package org.apache.spark.sql.execution

-import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
 import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, PartialMerge}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution.aggregate.BaseAggregateExec
 import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanExecBase
-import org.apache.spark.sql.execution.window.WindowExec
 import org.apache.spark.sql.internal.SQLConf

 /**

@@ -85,14 +84,19 @@ case class RemoveRedundantProjects(conf: SQLConf) extends Rule[SparkPlan] {
       // to convert the rows to UnsafeRow. See DataSourceV2Strategy for more details.
       case d: DataSourceV2ScanExecBase if !d.supportsColumnar => false
       case _ =>
+        def semanticEquals(exprs1: Seq[Expression], exprs2: Seq[Expression]): Boolean = {
+          exprs1.length == exprs2.length && exprs1.zip(exprs2).forall {
+            case (e1, e2) => e1.semanticEquals(e2)
+          }
+        }
         if (requireOrdering) {
-          project.output.map(_.exprId.id) == child.output.map(_.exprId.id) &&
+          semanticEquals(project.projectList, child.output) &&
            checkNullability(project.output, child.output)
         } else {
-          val orderedProjectOutput = project.output.sortBy(_.exprId.id)
+          val orderedProjectList = project.projectList.sortBy(_.exprId.id)
           val orderedChildOutput = child.output.sortBy(_.exprId.id)
-          orderedProjectOutput.map(_.exprId.id) == orderedChildOutput.map(_.exprId.id) &&
-            checkNullability(orderedProjectOutput, orderedChildOutput)
+          semanticEquals(orderedProjectList, orderedChildOutput) &&
+            checkNullability(orderedProjectList.map(_.toAttribute), orderedChildOutput)
         }
     }
   }
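This is the physical-plan counterpart of the same hardening. The old test matched projects by exprId alone, and a casting ProjectExec that reuses its child's exprIds would now pass that test and be removed, losing the Cast; comparing the project list to the child output semantically keeps it. A small sketch of the false positive the old check would admit (projectList and childOutput are illustrative stand-ins for project.projectList and child.output):

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Cast, Expression}
import org.apache.spark.sql.types.{DoubleType, IntegerType}

// The helper, as defined in the patch.
def semanticEquals(exprs1: Seq[Expression], exprs2: Seq[Expression]): Boolean = {
  exprs1.length == exprs2.length && exprs1.zip(exprs2).forall {
    case (e1, e2) => e1.semanticEquals(e2)
  }
}

val a = AttributeReference("a", IntegerType)()
val childOutput = Seq(a)
val projectList = Seq(Alias(Cast(a, DoubleType), "a")(exprId = a.exprId))

// Old check: the exprIds line up, so the casting project looked redundant.
projectList.map(_.exprId.id) == childOutput.map(_.exprId.id)  // true (false positive)
// New check: semantic comparison sees the Cast and keeps the project.
semanticEquals(projectList, childOutput)                      // false (correct)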
