Skip to content

Commit f44ead8

Browse files
aokolnychyi authored and gatorsmile committed
[SPARK-21538][SQL] Attribute resolution inconsistency in the Dataset API
## What changes were proposed in this pull request?

This PR contains a tiny update that removes an attribute resolution inconsistency in the Dataset API. The following example is taken from the ticket description:

```
spark.range(1).withColumnRenamed("id", "x").sort(col("id"))  // works
spark.range(1).withColumnRenamed("id", "x").sort($"id")      // works
spark.range(1).withColumnRenamed("id", "x").sort('id)        // works
spark.range(1).withColumnRenamed("id", "x").sort("id")       // fails with:
  org.apache.spark.sql.AnalysisException: Cannot resolve column name "id" among (x);
```

The above `AnalysisException` happens because the last case calls `Dataset.apply()` to convert strings into columns, which triggers attribute resolution. To make the API consistent between overloaded methods, this PR defers the resolution and constructs columns directly.

Author: aokolnychyi <anton.okolnychyi@sap.com>

Closes #18740 from aokolnychyi/spark-21538.
1 parent 9f5647d commit f44ead8

2 files changed

Lines changed: 14 additions & 1 deletion

File tree

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1108,7 +1108,7 @@ class Dataset[T] private[sql](
11081108
*/
11091109
@scala.annotation.varargs
11101110
def sort(sortCol: String, sortCols: String*): Dataset[T] = {
1111-
sort((sortCol +: sortCols).map(apply) : _*)
1111+
sort((sortCol +: sortCols).map(Column(_)) : _*)
11121112
}
11131113

11141114
/**

sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1304,6 +1304,19 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
13041304
assert(rlike3.count() == 0)
13051305
}
13061306
}
1307+
1308+
test("SPARK-21538: Attribute resolution inconsistency in Dataset API") {
1309+
val df = spark.range(3).withColumnRenamed("id", "x")
1310+
val expected = Row(0) :: Row(1) :: Row (2) :: Nil
1311+
checkAnswer(df.sort("id"), expected)
1312+
checkAnswer(df.sort(col("id")), expected)
1313+
checkAnswer(df.sort($"id"), expected)
1314+
checkAnswer(df.sort('id), expected)
1315+
checkAnswer(df.orderBy("id"), expected)
1316+
checkAnswer(df.orderBy(col("id")), expected)
1317+
checkAnswer(df.orderBy($"id"), expected)
1318+
checkAnswer(df.orderBy('id), expected)
1319+
}
13071320
}
13081321

13091322
case class WithImmutableMap(id: String, map_test: scala.collection.immutable.Map[Long, String])

0 commit comments

Comments (0)