-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-23922][SQL] Add arrays_overlap function #21028
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
e5ebdad
682bc73
876cd93
88e09b3
c895707
65b7d6d
f9a1ecf
1dbcd0c
076fc69
eafca0f
5925104
2a1121c
bf81e4a
4a18ba8
566946a
710433e
3cf410a
9d086f9
964f7af
41ef6c6
3dd724b
f7089f5
e36a5d7
49d9372
227437b
2e9e024
92730a1
56c59ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,15 +18,45 @@ package org.apache.spark.sql.catalyst.expressions | |
|
|
||
| import java.util.Comparator | ||
|
|
||
| import scala.collection.mutable | ||
|
|
||
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.catalyst.analysis.TypeCheckResult | ||
| import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} | ||
| import org.apache.spark.sql.catalyst.expressions.ArraySortLike.NullOrder | ||
| import org.apache.spark.sql.catalyst.expressions.codegen._ | ||
| import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} | ||
| import org.apache.spark.sql.types._ | ||
| import org.apache.spark.unsafe.array.ByteArrayMethods | ||
| import org.apache.spark.unsafe.types.{ByteArray, UTF8String} | ||
|
|
||
| /** | ||
| * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit | ||
| * casting. | ||
| * | ||
| * `inputTypes` asks `TypeCoercion.findWiderTypeForTwo` for a common type of the two children and | ||
| * requests that type for both sides; it is empty when no common type is found. | ||
| * `checkInputDataTypes` succeeds only when both children are [[ArrayType]]s whose element types | ||
| * satisfy `sameType`; otherwise it reports a failure naming the function and both actual types. | ||
| * `elementType` is read lazily from `inputTypes.head`, so it should only be forced when | ||
| * `inputTypes` is non-empty (presumably after type checking has passed - TODO confirm). | ||
| */ | ||
| trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression | ||
| with ImplicitCastInputTypes { | ||
|
|
||
| protected lazy val elementType: DataType = inputTypes.head.asInstanceOf[ArrayType].elementType | ||
|
||
|
|
||
| override def inputTypes: Seq[AbstractDataType] = { | ||
| TypeCoercion.findWiderTypeForTwo(left.dataType, right.dataType) match { | ||
|
||
| case Some(arrayType) => Seq(arrayType, arrayType) | ||
| case _ => Seq.empty | ||
| } | ||
| } | ||
|
|
||
| override def checkInputDataTypes(): TypeCheckResult = { | ||
| (left.dataType, right.dataType) match { | ||
| case (ArrayType(e1, _), ArrayType(e2, _)) if e1.sameType(e2) => | ||
| TypeCheckResult.TypeCheckSuccess | ||
| case _ => TypeCheckResult.TypeCheckFailure(s"input to function $prettyName should have " + | ||
| s"been two ${ArrayType.simpleString}s with same element type, but it's " + | ||
| s"[${left.dataType.simpleString}, ${right.dataType.simpleString}]") | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Given an array or map, returns its size. Returns -1 if null. | ||
| */ | ||
|
|
@@ -529,6 +559,157 @@ case class ArrayContains(left: Expression, right: Expression) | |
| override def prettyName: String = "array_contains" | ||
| } | ||
|
|
||
| /** | ||
| * Checks if the two arrays contain at least one common element. | ||
| */ | ||
| // scalastyle:off line.size.limit | ||
| @ExpressionDescription( | ||
| usage = "_FUNC_(a1, a2) - Returns true if a1 contains at least an element present also in a2. If the arrays have no common element and either of them contains a null element null is returned, false otherwise.", | ||
| examples = """ | ||
| Examples: | ||
| > SELECT _FUNC_(array(1, 2, 3), array(3, 4, 5)); | ||
| true | ||
| """, since = "2.4.0") | ||
| // scalastyle:on line.size.limit | ||
| case class ArraysOverlap(left: Expression, right: Expression) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't you override
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done, thanks! |
||
| extends BinaryArrayExpressionWithImplicitCast { | ||
|
|
||
| override def dataType: DataType = BooleanType | ||
|
|
||
| override def nullable: Boolean = { | ||
| left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull || | ||
| right.dataType.asInstanceOf[ArrayType].containsNull | ||
| } | ||
|
|
||
| override def nullSafeEval(a1: Any, a2: Any): Any = { | ||
| var hasNull = false | ||
| val arr1 = a1.asInstanceOf[ArrayData] | ||
| val arr2 = a2.asInstanceOf[ArrayData] | ||
| val (bigger, smaller, biggerDt) = if (arr1.numElements() > arr2.numElements()) { | ||
|
||
| (arr1, arr2, left.dataType.asInstanceOf[ArrayType]) | ||
| } else { | ||
|
||
| (arr2, arr1, right.dataType.asInstanceOf[ArrayType]) | ||
| } | ||
| if (smaller.numElements() > 0) { | ||
| val smallestSet = new mutable.HashSet[Any] | ||
| smaller.foreach(elementType, (_, v) => | ||
| if (v == null) { | ||
| hasNull = true | ||
| } else { | ||
| smallestSet += v | ||
| }) | ||
| bigger.foreach(elementType, (_, v1) => | ||
| if (v1 == null) { | ||
| hasNull = true | ||
| } else if (smallestSet.contains(v1)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this doesn't work with
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. actually it was not working also with |
||
| return true | ||
| } | ||
| ) | ||
| } else if (containsNull(bigger, biggerDt)) { | ||
| hasNull = true | ||
| } | ||
| if (hasNull) { | ||
| null | ||
| } else { | ||
| false | ||
| } | ||
| } | ||
|
|
||
| def containsNull(arr: ArrayData, dt: ArrayType): Boolean = { | ||
| if (dt.containsNull) { | ||
| var i = 0 | ||
| var hasNull = false | ||
| while (i < arr.numElements && !hasNull) { | ||
| hasNull = arr.isNullAt(i) | ||
| i += 1 | ||
| } | ||
| hasNull | ||
| } else { | ||
| false | ||
| } | ||
| } | ||
|
|
||
| override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
| nullSafeCodeGen(ctx, ev, (a1, a2) => { | ||
| val i = ctx.freshName("i") | ||
| val smaller = ctx.freshName("smallerArray") | ||
| val bigger = ctx.freshName("biggerArray") | ||
| val getFromSmaller = CodeGenerator.getValue(smaller, elementType, i) | ||
| val getFromBigger = CodeGenerator.getValue(bigger, elementType, i) | ||
| val smallerEmptyCode = if (inputTypes.exists(_.asInstanceOf[ArrayType].containsNull)) { | ||
| s""" | ||
| |else { | ||
| | for (int $i = 0; $i < $bigger.numElements(); $i ++) { | ||
| | if ($bigger.isNullAt($i)) { | ||
| | ${ev.isNull} = true; | ||
| | break; | ||
| | } | ||
| | } | ||
| |} | ||
| """.stripMargin | ||
| } else { | ||
| "" | ||
| } | ||
| val javaElementClass = CodeGenerator.boxedType(elementType) | ||
| val javaSet = classOf[java.util.HashSet[_]].getName | ||
| val set2 = ctx.freshName("set") | ||
| val addToSetFromSmallerCode = nullSafeElementCodegen( | ||
| smaller, i, s"$set2.add($getFromSmaller);", s"${ev.isNull} = true;") | ||
| val elementIsInSetCode = nullSafeElementCodegen( | ||
| bigger, | ||
| i, | ||
| s""" | ||
| |if ($set2.contains($getFromBigger)) { | ||
| | ${ev.isNull} = false; | ||
| | ${ev.value} = true; | ||
| | break; | ||
| |} | ||
| |""".stripMargin, | ||
| s"${ev.isNull} = true;") | ||
| s""" | ||
| |ArrayData $smaller; | ||
| |ArrayData $bigger; | ||
| |if ($a1.numElements() > $a2.numElements()) { | ||
| | $bigger = $a1; | ||
| | $smaller = $a2; | ||
| |} else { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto. |
||
| | $smaller = $a1; | ||
| | $bigger = $a2; | ||
| |} | ||
| |if ($smaller.numElements() > 0) { | ||
| | $javaSet<$javaElementClass> $set2 = new $javaSet<$javaElementClass>(); | ||
| | for (int $i = 0; $i < $smaller.numElements(); $i ++) { | ||
| | $addToSetFromSmallerCode | ||
| | } | ||
| | for (int $i = 0; $i < $bigger.numElements(); $i ++) { | ||
| | $elementIsInSetCode | ||
| | } | ||
| |} $smallerEmptyCode | ||
| |""".stripMargin | ||
| }) | ||
| } | ||
|
|
||
| def nullSafeElementCodegen( | ||
| arrayVar: String, | ||
| index: String, | ||
| code: String, | ||
| isNullCode: String): String = { | ||
| if (inputTypes.exists(_.asInstanceOf[ArrayType].containsNull)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldn't this depend on whether the input array
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unfortunately we don't know which one we have here (the left or the right) as
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i see, makes sense! |
||
| s""" | ||
| |if ($arrayVar.isNullAt($index)) { | ||
| | $isNullCode | ||
| |} else { | ||
| | $code | ||
| |} | ||
| |""".stripMargin | ||
| } else { | ||
| code | ||
| } | ||
| } | ||
|
|
||
| override def prettyName: String = "arrays_overlap" | ||
| } | ||
|
|
||
| /** | ||
| * Slices an array according to the requested start index and length | ||
| */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -136,6 +136,37 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper | |
| checkEvaluation(ArrayContains(a3, Literal.create(null, StringType)), null) | ||
| } | ||
|
|
||
| test("ArraysOverlap") { | ||
| val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)) | ||
| val a1 = Literal.create(Seq(4, 5, 3), ArrayType(IntegerType)) | ||
| val a2 = Literal.create(Seq(null, 5, 6), ArrayType(IntegerType)) | ||
| val a3 = Literal.create(Seq(7, 8), ArrayType(IntegerType)) | ||
| val a4 = Literal.create(Seq.empty[Int], ArrayType(IntegerType)) | ||
|
|
||
| val a5 = Literal.create(Seq[String](null, ""), ArrayType(StringType)) | ||
| val a6 = Literal.create(Seq[String]("", "abc"), ArrayType(StringType)) | ||
| val a7 = Literal.create(Seq[String]("def", "ghi"), ArrayType(StringType)) | ||
|
|
||
| checkEvaluation(ArraysOverlap(a0, a1), true) | ||
| checkEvaluation(ArraysOverlap(a0, a2), null) | ||
| checkEvaluation(ArraysOverlap(a1, a2), true) | ||
| checkEvaluation(ArraysOverlap(a1, a3), false) | ||
| checkEvaluation(ArraysOverlap(a0, a4), false) | ||
| checkEvaluation(ArraysOverlap(a2, a4), null) | ||
| checkEvaluation(ArraysOverlap(a4, a2), null) | ||
|
|
||
| checkEvaluation(ArraysOverlap(a5, a6), true) | ||
| checkEvaluation(ArraysOverlap(a5, a7), null) | ||
| checkEvaluation(ArraysOverlap(a6, a7), false) | ||
|
|
||
| // null handling | ||
| checkEvaluation(ArraysOverlap(Literal.create(null, ArrayType(IntegerType)), a0), null) | ||
| checkEvaluation(ArraysOverlap(a0, Literal.create(null, ArrayType(IntegerType))), null) | ||
| checkEvaluation(ArraysOverlap( | ||
| Literal.create(Seq(null), ArrayType(IntegerType)), | ||
| Literal.create(Seq(null), ArrayType(IntegerType))), null) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am returning
I will add a sentence to clarify the behavior in our docs. Thanks for this nice catch!
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we have a test case for it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This case is covered by https://github.com/apache/spark/pull/21028/files#diff-d31eca9f1c4c33104dc2cb8950486910R163 for instance. Anyway, I am adding another one which is exactly this one. |
||
| } | ||
|
|
||
| test("Slice") { | ||
| val a0 = Literal.create(Seq(1, 2, 3, 4, 5, 6), ArrayType(IntegerType)) | ||
| val a1 = Literal.create(Seq[String]("a", "b", "c", "d"), ArrayType(StringType)) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -442,6 +442,29 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { | |
| ) | ||
| } | ||
|
|
||
| test("arrays_overlap function") { | ||
| val df = Seq( | ||
| (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))), | ||
| (Seq.empty[Option[Int]], Seq[Option[Int]](Some(-1), None)), | ||
| (Seq[Option[Int]](Some(3), Some(2)), Seq[Option[Int]](Some(1), Some(2))) | ||
| ).toDF("a", "b") | ||
|
|
||
| val answer = Seq(Row(false), Row(null), Row(true)) | ||
|
|
||
| checkAnswer(df.select(arrays_overlap(df("a"), df("b"))), answer) | ||
| checkAnswer(df.selectExpr("arrays_overlap(a, b)"), answer) | ||
|
|
||
| checkAnswer(sql("select arrays_overlap(array(1, 2, 3), array('a', 'b', 'c'))"), Row(false)) | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add a test like this for Do we expect the result is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this case is already covered here: https://github.com/apache/spark/pull/21028/files#diff-d31eca9f1c4c33104dc2cb8950486910R136, am I right?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC, I think no. It is good to have test cases with
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, now I see what you mean. I can add it, but it seems useless to me. This function accepts only
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that to add this makes sense to explicitly ensure
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Honestly I don't see its utility but I see also no harm in introducing it, so if you think it is an added value, I think it is fine to add it. So I just added it, thanks.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you. |
||
| intercept[AnalysisException] { | ||
| sql("select arrays_overlap(array(array(1)), array('a'))") | ||
| } | ||
|
|
||
| intercept[AnalysisException] { | ||
| sql("select arrays_overlap(null, null)") | ||
| } | ||
| } | ||
|
|
||
| test("slice function") { | ||
| val df = Seq( | ||
| Seq(1, 2, 3), | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `ImplicitCastInputTypes` trait is able to work with any number of children. Would it be possible to implement this trait to behave in the same way?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's possible indeed. Though, as far as I know there is no use case for a function with a different number of children, so I am not sure if it makes sense to generalize it. @cloud-fan @kiszk @ueshin WDYT?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As @ueshin pointed out here, `concat` is also a use case that has a different number of children. Am I wrong?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@kiszk you are not wrong, but `Concat` is a very specific case, since it also supports `String`s and `Binary`s, so it would anyway require a specific implementation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, I would like to hear other opinions