
Commit c400519

cloud-fan authored and dongjoon-hyun committed
[SPARK-31956][SQL] Do not fail if there is no ambiguous self join
### What changes were proposed in this pull request?

This is a follow-up of #28695, to fix the problem completely. The root cause is that `df("col").as("name")` is no longer a column reference and should not carry the special column metadata. However, this was broken in ba7adc4#diff-ac415c903887e49486ba542a65eec980L1050-L1053. This PR fixes the regression by stripping the special column metadata in `Column.name`, which restores the behavior before #28326.

### Why are the changes needed?

Fix a regression. We shouldn't fail if there is no ambiguous self-join.

### Does this PR introduce _any_ user-facing change?

Yes, the query in the test can run now.

### How was this patch tested?

Updated test.

Closes #28783 from cloud-fan/self-join.

Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
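For illustration, here is a minimal sketch of the kind of query this fix unblocks (assuming a local SparkSession; the object and app names are made up for the example). Aliasing `df("a")` and using `df("a")` under a window involves no self join at all, so the ambiguous-self-join check should not reject it:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.sum

object NoAmbiguousSelfJoinExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("no-ambiguous-self-join")
      .getOrCreate()
    import spark.implicits._

    val df = Seq(1, 1, 2, 2).toDF("a")
    val w = Window.partitionBy(df("a"))

    // `df("a").alias("x")` is an Alias, not a plain column reference, so after this
    // fix it no longer carries the special column metadata that the
    // ambiguous-self-join check looks for, and the query runs instead of failing.
    df.select(df("a").alias("x"), sum(df("a")).over(w)).show()

    spark.stop()
  }
}
```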
1 parent: 43063e2

2 files changed, 7 additions and 2 deletions


sql/core/src/main/scala/org/apache/spark/sql/Column.scala

Lines changed: 1 addition & 1 deletion
@@ -1042,7 +1042,7 @@ class Column(val expr: Expression) extends Logging {
    * @since 2.0.0
    */
   def name(alias: String): Column = withExpr {
-    Alias(expr, alias)()
+    Alias(normalizedExpr(), alias)()
   }
 
   /**
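For context, metadata on Catalyst attributes can be removed with `MetadataBuilder`. The sketch below is a hypothetical illustration of stripping one metadata key from attribute references in an expression tree; it is not Spark's actual `normalizedExpr()` implementation, and the helper and object names are made up here.

```scala
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
import org.apache.spark.sql.types.MetadataBuilder

object MetadataStripSketch {
  // Hypothetical helper: drop one metadata key from every attribute reference in an
  // expression tree, so an aliased column no longer looks like a plain column reference.
  def stripMetadataKey(e: Expression, key: String): Expression = e.transform {
    case a: AttributeReference if a.metadata.contains(key) =>
      a.withMetadata(new MetadataBuilder().withMetadata(a.metadata).remove(key).build())
  }
}
```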

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala

Lines changed: 6 additions & 1 deletion
@@ -204,14 +204,19 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
     }
   }
 
-  test("SPARK-28344: don't fail as ambiguous self join when there is no join") {
+  test("SPARK-28344: don't fail if there is no ambiguous self join") {
     withSQLConf(
       SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true") {
       val df = Seq(1, 1, 2, 2).toDF("a")
       val w = Window.partitionBy(df("a"))
       checkAnswer(
         df.select(df("a").alias("x"), sum(df("a")).over(w)),
         Seq((1, 2), (1, 2), (2, 4), (2, 4)).map(Row.fromTuple))
+
+      val joined = df.join(spark.range(1)).select($"a")
+      checkAnswer(
+        joined.select(joined("a").alias("x"), sum(joined("a")).over(w)),
+        Seq((1, 2), (1, 2), (2, 4), (2, 4)).map(Row.fromTuple))
     }
   }
 }
