Skip to content

Commit dbcf855

Browse files
MaxGekk authored and dongjoon-hyun committed
[SPARK-31563][SQL] Fix failure of InSet.sql for collections of Catalyst's internal types
### What changes were proposed in this pull request? In this PR, I propose to fix the `InSet.sql` method for the cases when the input collection contains values of Catalyst's internal types, for instance `UTF8String`. Elements of the input set `hset` are converted to Scala types and wrapped in `Literal` to properly form the SQL representation of the input collection. ### Why are the changes needed? The changes fix a bug in `InSet.sql`, which makes a wrong assumption about the types of the collection elements. See more details in SPARK-31563. ### Does this PR introduce any user-facing change? Most likely not. ### How was this patch tested? Added a test to `ColumnExpressionSuite`. Closes #28343 from MaxGekk/fix-InSet-sql. Authored-by: Max Gekk <max.gekk@gmail.com> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org> (cherry picked from commit 7d8216a) Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
1 parent b1d9088 commit dbcf855

2 files changed

Lines changed: 11 additions & 2 deletions

File tree

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
1919

2020
import scala.collection.immutable.TreeSet
2121

22+
import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
2223
import org.apache.spark.sql.catalyst.InternalRow
2324
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
2425
import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReference
@@ -519,7 +520,9 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
519520

520521
override def sql: String = {
521522
val valueSQL = child.sql
522-
val listSQL = hset.toSeq.map(Literal(_).sql).mkString(", ")
523+
val listSQL = hset.toSeq
524+
.map(elem => Literal(convertToScala(elem, child.dataType)).sql)
525+
.mkString(", ")
523526
s"($valueSQL IN ($listSQL))"
524527
}
525528
}

sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,13 @@ import org.apache.hadoop.io.{LongWritable, Text}
2626
import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat}
2727
import org.scalatest.Matchers._
2828

29-
import org.apache.spark.sql.catalyst.expressions.{In, InSet, NamedExpression}
29+
import org.apache.spark.sql.catalyst.expressions.{In, InSet, Literal, NamedExpression}
3030
import org.apache.spark.sql.execution.ProjectExec
3131
import org.apache.spark.sql.functions._
3232
import org.apache.spark.sql.internal.SQLConf
3333
import org.apache.spark.sql.test.SharedSparkSession
3434
import org.apache.spark.sql.types._
35+
import org.apache.spark.unsafe.types.UTF8String
3536

3637
class ColumnExpressionSuite extends QueryTest with SharedSparkSession {
3738
import testImplicits._
@@ -869,4 +870,9 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession {
869870
df.select(typedLit(("a", 2, 1.0))),
870871
Row(Row("a", 2, 1.0)) :: Nil)
871872
}
873+
874+
test("SPARK-31563: sql of InSet for UTF8String collection") {
875+
val inSet = InSet(Literal("a"), Set("a", "b").map(UTF8String.fromString))
876+
assert(inSet.sql === "('a' IN ('a', 'b'))")
877+
}
872878
}

0 commit comments

Comments
 (0)