Commit 72bde20

Use the already-resolved Attribute instead of creating a new one.
1 parent 5e4d738 commit 72bde20

8 files changed
Lines changed: 56 additions & 98 deletions

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala

Lines changed: 1 addition & 1 deletion
@@ -177,7 +177,7 @@ package object expressions {
   // Collect matching attributes given a name and a lookup.
   def collectMatches(name: String, candidates: Option[Seq[Attribute]]): Seq[Attribute] = {
     candidates.toSeq.flatMap(_.collect {
-      case a if resolver(a.name, name) => a.withName(name)
+      case a if resolver(a.name, name) => a
     })
   }
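
The effect of this one-line change is easiest to see on a single attribute:
with a case-insensitive resolver, a.withName(name) rebuilt the matched
attribute using the lookup's case, whereas returning a keeps the case the
relation was resolved with. A minimal standalone sketch (assuming
spark-catalyst on the classpath; the values are illustrative, not part of
the commit):

import org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.LongType

// The attribute as resolved from the relation: lower-case "col1".
val col1 = AttributeReference("col1", LongType)()

// A query referencing it as "COL1" still matches under the resolver...
assert(caseInsensitiveResolution(col1.name, "COL1"))

// ...but the old code rewrote the match to the lookup's case, while the
// new code hands back the already-resolved attribute untouched.
val renamed = col1.withName("COL1") // old behavior: name becomes "COL1"
val kept = col1                     // new behavior: name stays "col1"
assert(renamed.name == "COL1" && kept.name == "col1")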

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 22 additions & 40 deletions
@@ -326,33 +326,6 @@ object RemoveRedundantAliases extends Rule[LogicalPlan] {
     case a => a
   }
 
-  private def removeSpecialRedundantAliases(
-      plan: LogicalPlan,
-      currentNextAttrPairs: mutable.Buffer[(Attribute, Attribute)],
-      newNode: LogicalPlan,
-      blacklist: AttributeSet): LogicalPlan = {
-    // Create the attribute mapping. Note that the currentNextAttrPairs can contain duplicate
-    // keys in case of Union (this is caused by the PushProjectionThroughUnion rule); in this
-    // case we use the the first mapping (which should be provided by the first child).
-    val mapping = AttributeMap(currentNextAttrPairs)
-
-    // Create a an expression cleaning function for nodes that can actually produce redundant
-    // aliases, use identity otherwise.
-    val clean: Expression => Expression = plan match {
-      case _: Project => removeRedundantAlias(_, blacklist)
-      case _: Aggregate => removeRedundantAlias(_, blacklist)
-      case _: Window => removeRedundantAlias(_, blacklist)
-      case _ => identity[Expression]
-    }
-
-    // Transform the expressions.
-    newNode.mapExpressions { expr =>
-      clean(expr.transform {
-        case a: Attribute => mapping.getOrElse(a, a)
-      })
-    }
-  }
-
   /**
    * Remove redundant alias expression from a LogicalPlan and its subtree. A blacklist is used to
    * prevent the removal of seemingly redundant aliases used to deduplicate the input for a (self)
@@ -374,23 +347,12 @@ object RemoveRedundantAliases extends Rule[LogicalPlan] {
         val newRight = removeRedundantAliases(right, blacklist ++ newLeft.outputSet)
         val mapping = AttributeMap(
           createAttributeMapping(left, newLeft) ++
-          createAttributeMapping(right, newRight))
+            createAttributeMapping(right, newRight))
         val newCondition = condition.map(_.transform {
           case a: Attribute => mapping.getOrElse(a, a)
         })
         Join(newLeft, newRight, joinType, newCondition)
 
-      case command: Command =>
-        // Add child.outputSet to blacklist otherwise
-        // the schema written in the file may not match the schema of the table.
-        val currentNextAttrPairs = mutable.Buffer.empty[(Attribute, Attribute)]
-        val newNode = command.mapChildren { child =>
-          val newChild = removeRedundantAliases(child, blacklist ++ child.outputSet)
-          currentNextAttrPairs ++= createAttributeMapping(child, newChild)
-          newChild
-        }
-        removeSpecialRedundantAliases(plan, currentNextAttrPairs, newNode, blacklist)
-
       case _ =>
         // Remove redundant aliases in the subtree(s).
         val currentNextAttrPairs = mutable.Buffer.empty[(Attribute, Attribute)]
@@ -399,7 +361,27 @@ object RemoveRedundantAliases extends Rule[LogicalPlan] {
           currentNextAttrPairs ++= createAttributeMapping(child, newChild)
           newChild
         }
-        removeSpecialRedundantAliases(plan, currentNextAttrPairs, newNode, blacklist)
+
+        // Create the attribute mapping. Note that the currentNextAttrPairs can contain duplicate
+        // keys in case of Union (this is caused by the PushProjectionThroughUnion rule); in this
+        // case we use the the first mapping (which should be provided by the first child).
+        val mapping = AttributeMap(currentNextAttrPairs)
+
+        // Create a an expression cleaning function for nodes that can actually produce redundant
+        // aliases, use identity otherwise.
+        val clean: Expression => Expression = plan match {
+          case _: Project => removeRedundantAlias(_, blacklist)
+          case _: Aggregate => removeRedundantAlias(_, blacklist)
+          case _: Window => removeRedundantAlias(_, blacklist)
+          case _ => identity[Expression]
+        }
+
+        // Transform the expressions.
+        newNode.mapExpressions { expr =>
+          clean(expr.transform {
+            case a: Attribute => mapping.getOrElse(a, a)
+          })
+        }
     }
   }
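
For context, removeRedundantAlias, which the rule applies above but which sits
outside these hunks, strips aliases that merely rename an attribute to its own
name. A hedged sketch of that idea (a standalone re-implementation under
assumed imports, not the rule's private method copied verbatim):

import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeSet, Expression}
import org.apache.spark.sql.types.Metadata

// An alias is redundant when it carries no metadata, renames an attribute to
// its own name, and the attribute is not in the blacklist used to keep
// (self-)join inputs distinguishable.
def stripRedundantAlias(e: Expression, blacklist: AttributeSet): Expression = e match {
  case a @ Alias(attr: Attribute, name)
      if a.metadata == Metadata.empty && name == attr.name && !blacklist.contains(attr) =>
    attr
  case other => other
}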

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DataSourceV2AnalysisSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ class DataSourceV2AnalysisSuite extends AnalysisTest {
     val parsedPlan = AppendData.byName(table, query)
     val expectedPlan = AppendData.byName(table,
       Project(Seq(
-        Alias(Cast(toLower(X), FloatType, Some(conf.sessionLocalTimeZone)), "x")(),
+        Alias(Cast(X, FloatType, Some(conf.sessionLocalTimeZone)), "x")(),
         Alias(Cast(y, FloatType, Some(conf.sessionLocalTimeZone)), "y")()),
       query))
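
This expected-plan update follows from the resolver change above: matching no
longer rewrites an attribute to the lookup's case, so byName resolution casts
X as it was resolved and the suite's toLower wrapper drops out of the expected
plan.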

sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker
 import org.apache.spark.sql.execution.datasources.FileFormatWriter
-import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
+import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.util.SerializableConfiguration
 
 /**

sql/core/src/test/resources/sql-tests/results/order-by-nulls-ordering.sql.out

Lines changed: 4 additions & 4 deletions
@@ -102,7 +102,7 @@ struct<col1:int,col2:int,col3:int,sum_col2:bigint>
 -- !query 6
 SELECT COL1, COL2, COL3 FROM spark_10747 ORDER BY COL3 ASC NULLS FIRST, COL2
 -- !query 6 schema
-struct<COL1:int,COL2:int,COL3:int>
+struct<col1:int,col2:int,col3:int>
 -- !query 6 output
 6 10 NULL
 6 13 NULL
@@ -118,7 +118,7 @@ struct<COL1:int,COL2:int,COL3:int>
 -- !query 7
 SELECT COL1, COL2, COL3 FROM spark_10747 ORDER BY COL3 NULLS LAST, COL2
 -- !query 7 schema
-struct<COL1:int,COL2:int,COL3:int>
+struct<col1:int,col2:int,col3:int>
 -- !query 7 output
 6 7 4
 6 11 4
@@ -134,7 +134,7 @@ struct<COL1:int,COL2:int,COL3:int>
 -- !query 8
 SELECT COL1, COL2, COL3 FROM spark_10747 ORDER BY COL3 DESC NULLS FIRST, COL2
 -- !query 8 schema
-struct<COL1:int,COL2:int,COL3:int>
+struct<col1:int,col2:int,col3:int>
 -- !query 8 output
 6 10 NULL
 6 13 NULL
@@ -150,7 +150,7 @@ struct<COL1:int,COL2:int,COL3:int>
 -- !query 9
 SELECT COL1, COL2, COL3 FROM spark_10747 ORDER BY COL3 DESC NULLS LAST, COL2
 -- !query 9 schema
-struct<COL1:int,COL2:int,COL3:int>
+struct<col1:int,col2:int,col3:int>
 -- !query 9 output
 6 9 10
 6 12 10
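
These golden-file updates, like the ones in query_regex_column.sql.out below,
show the user-visible effect of the resolver change: the result schema now
reports each column with the case it carries in the resolved relation rather
than the case written in the SELECT list.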

sql/core/src/test/resources/sql-tests/results/query_regex_column.sql.out

Lines changed: 2 additions & 2 deletions
@@ -183,7 +183,7 @@ struct<>
 -- !query 20
 SELECT p.`(KEY)?+.+`, b, testdata2.`(b)?+.+` FROM testData p join testData2 ON p.key = testData2.a WHERE key < 3
 -- !query 20 schema
-struct<value1:string,value2:string,b:int,A:int,c:int,d:int>
+struct<value1:string,value2:string,B:int,A:int,c:int,d:int>
 -- !query 20 output
 1 11 1 1 1 2
 1 11 2 1 1 2
@@ -194,7 +194,7 @@ struct<value1:string,value2:string,b:int,A:int,c:int,d:int>
 -- !query 21
 SELECT p.`(key)?+.+`, b, testdata2.`(b)?+.+` FROM testData p join testData2 ON p.key = testData2.a WHERE key < 3
 -- !query 21 schema
-struct<value1:string,value2:string,b:int,A:int,c:int,d:int>
+struct<value1:string,value2:string,B:int,A:int,c:int,d:int>
 -- !query 21 output
 1 11 1 1 1 2
 1 11 2 1 1 2

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 0 additions & 49 deletions
@@ -2853,55 +2853,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
     }
   }
 
-  test("SPARK-25135: insert table may all null when select from view") {
-    withTempDir { dir =>
-      val path = dir.getCanonicalPath
-      val cnt = 30
-      spark.range(cnt).selectExpr("cast(id as bigint) as col1", "cast(id as bigint) as col2")
-        .write.mode(SaveMode.Overwrite).parquet(path)
-      withTable("table1", "table2", "table3", "table4") {
-        spark.sql(s"CREATE TABLE table1(col1 bigint, col2 bigint) using parquet location '$path'")
-
-        withView("view1", "view2") {
-          spark.sql("CREATE VIEW view1 as select col1, col2 from table1 where col1 > -20")
-
-          spark.sql("CREATE TABLE table2 (COL1 BIGINT, COL2 BIGINT) using parquet")
-          spark.sql("INSERT OVERWRITE TABLE table2 select COL1, COL2 from view1")
-          assert(spark.table("table2").count() === cnt)
-          checkAnswer(spark.table("table1"), spark.table("table2"))
-
-          spark.sql("CREATE TABLE table3 (COL1 BIGINT) using parquet")
-          spark.sql("INSERT OVERWRITE TABLE table3 select COL1 from view1")
-          assert(spark.table("table3").count() === cnt)
-          checkAnswer(spark.table("table1").select("COL1"), spark.table("table3"))
-
-          spark.sql("CREATE TABLE table4 (COL1 BIGINT, COL2 BIGINT, COL3 BIGINT) using parquet")
-          spark.sql("INSERT OVERWRITE TABLE table4 select COL1, COL1, COL2 from view1")
-          assert(spark.table("table4").count() === cnt)
-          checkAnswer(spark.table("table1").select("col1", "col1", "col2"), spark.table("table4"))
-
-          spark.sql("INSERT OVERWRITE TABLE table4 select 1, COL1, COL2 from view1")
-          assert(spark.table("table4").count() === cnt)
-          checkAnswer(spark.table("table1").selectExpr("1", "col1", "col2"), spark.table("table4"))
-
-          assertThrows[AnalysisException] {
-            spark.sql("INSERT OVERWRITE TABLE table4 select COL1, COL3, COL2 from view1")
-          }
-
-          spark.sql("CREATE TEMP VIEW view2 as select col1, 1 as col2 from view1")
-
-          spark.sql("INSERT OVERWRITE TABLE table2 select COL1, COL2 from view2")
-          assert(spark.table("table2").count() === cnt)
-          checkAnswer(spark.table("table1").selectExpr("col1", "1"), spark.table("table2"))
-
-          spark.sql("INSERT OVERWRITE TABLE table2 select col1, COL2 from view2")
-          assert(spark.table("table2").count() === cnt)
-          checkAnswer(spark.table("table1").selectExpr("col1", "1"), spark.table("table2"))
-        }
-      }
-    }
-  }
-
   test("SPARK-25144 'distinct' causes memory leak") {
     val ds = List(Foo(Some("bar"))).toDS
     val result = ds.flatMap(_.bar).distinct
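
The broad SPARK-25135 regression test is dropped here in favor of the narrower
version added to ParquetQuerySuite below, which checks the written Parquet
schema directly.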

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala

Lines changed: 25 additions & 0 deletions
@@ -891,6 +891,31 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
       }
     }
   }
+
+  test("SPARK-25135: insert parquet table may all null when select from view") {
+    withTempDir { dir =>
+      val path = dir.getCanonicalPath
+      val cnt = 30
+      val table1Path = s"$path/table1"
+      val table2Path = s"$path/table2"
+      spark.range(cnt).selectExpr("cast(id as bigint) as col1")
+        .write.mode(SaveMode.Overwrite).parquet(table1Path)
+      withTable("table1", "table2") {
+        spark.sql(s"CREATE TABLE table1(col1 bigint) using parquet location '$table1Path/'")
+        spark.sql(s"CREATE TABLE table2(COL1 bigint) using parquet location '$table2Path/'")
+
+        withView("view1") {
+          spark.sql("CREATE VIEW view1 as select col1 from table1 where col1 > -20")
+          spark.sql("INSERT OVERWRITE TABLE table2 select COL1 from view1")
+          assert(spark.table("table2").count() === cnt)
+          spark.read.parquet(table2Path).schema.zip(
+            spark.table("table2").schema).foreach { case (actual, table) =>
+            assert(actual.name.equals(table.name))
+          }
+        }
+      }
+    }
+  }
 }
 
 object TestingUDT {

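The new test's final assertion compares the on-disk Parquet field names with
the catalog schema because Parquet field lookup can be case-sensitive. A
minimal sketch of the symptom being guarded against (the local session and
temp path are assumptions, and the all-null read applies only on Spark
versions where Parquet field resolution is case-sensitive):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").appName("case-demo").getOrCreate()

// Write a file whose Parquet schema names the column "col1"...
spark.range(3).selectExpr("id as col1").write.mode("overwrite").parquet("/tmp/case_demo")

// ...then read it back asking for "COL1". Where the reader matches fields
// case-sensitively, the column comes back all null, which is the
// "may all null" symptom named in the test title.
val df = spark.read.schema("COL1 bigint").parquet("/tmp/case_demo")
df.show()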