Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
4a6f903
Reuse completeNextStageWithFetchFailure
beliefer Jun 19, 2020
96456e2
Merge remote-tracking branch 'upstream/master'
beliefer Jul 1, 2020
4314005
Merge remote-tracking branch 'upstream/master'
beliefer Jul 3, 2020
d6af4a7
Merge remote-tracking branch 'upstream/master'
beliefer Jul 9, 2020
f69094f
Merge remote-tracking branch 'upstream/master'
beliefer Jul 16, 2020
b86a42d
Merge remote-tracking branch 'upstream/master'
beliefer Jul 25, 2020
2ac5159
Merge branch 'master' of github.com:beliefer/spark
beliefer Jul 25, 2020
9021d6c
Merge remote-tracking branch 'upstream/master'
beliefer Jul 28, 2020
74a2ef4
Merge branch 'master' of github.com:beliefer/spark
beliefer Jul 28, 2020
9828158
Merge remote-tracking branch 'upstream/master'
beliefer Jul 31, 2020
9cd1aaf
Merge remote-tracking branch 'upstream/master'
beliefer Aug 5, 2020
abfcbb9
Merge remote-tracking branch 'upstream/master'
beliefer Aug 26, 2020
07c6c81
Merge remote-tracking branch 'upstream/master'
beliefer Sep 1, 2020
580130b
Merge remote-tracking branch 'upstream/master'
beliefer Sep 2, 2020
3712808
Merge branch 'master' of github.com:beliefer/spark
beliefer Sep 11, 2020
6107413
Merge remote-tracking branch 'upstream/master'
beliefer Sep 11, 2020
4b799b4
Merge remote-tracking branch 'upstream/master'
beliefer Sep 14, 2020
ee0ecbf
Merge remote-tracking branch 'upstream/master'
beliefer Sep 18, 2020
596bc61
Merge remote-tracking branch 'upstream/master'
beliefer Sep 24, 2020
0164e2f
Merge remote-tracking branch 'upstream/master'
beliefer Sep 27, 2020
90b79fc
Merge remote-tracking branch 'upstream/master'
beliefer Sep 29, 2020
2cef3a9
Merge remote-tracking branch 'upstream/master'
beliefer Oct 13, 2020
c26b64f
Merge remote-tracking branch 'upstream/master'
beliefer Oct 19, 2020
2e02cd2
Merge remote-tracking branch 'upstream/master'
beliefer Oct 22, 2020
a6d0741
Merge remote-tracking branch 'upstream/master'
beliefer Oct 28, 2020
82e5b2c
Merge remote-tracking branch 'upstream/master'
beliefer Nov 4, 2020
70bbf5d
Merge remote-tracking branch 'upstream/master'
beliefer Nov 6, 2020
126a51e
Merge remote-tracking branch 'upstream/master'
beliefer Nov 13, 2020
f2ceacd
Merge remote-tracking branch 'upstream/master'
beliefer Nov 19, 2020
5ad208f
Merge remote-tracking branch 'upstream/master'
beliefer Nov 23, 2020
970917e
Merge remote-tracking branch 'upstream/master'
beliefer Dec 1, 2020
ddc1b8b
Merge remote-tracking branch 'upstream/master'
beliefer Dec 3, 2020
2b1ed0b
Merge remote-tracking branch 'upstream/master'
beliefer Dec 4, 2020
a7d3729
Merge remote-tracking branch 'upstream/master'
beliefer Dec 7, 2020
17ef8fc
Merge remote-tracking branch 'upstream/master'
beliefer Dec 10, 2020
f7a2902
Merge remote-tracking branch 'upstream/master'
beliefer Dec 11, 2020
a803c9b
Merge remote-tracking branch 'upstream/master'
beliefer Dec 21, 2020
9d79697
Merge remote-tracking branch 'upstream/master'
beliefer Dec 21, 2020
7127f5e
Merge remote-tracking branch 'upstream/master'
beliefer Dec 21, 2020
69bd51b
Merge remote-tracking branch 'upstream/master'
beliefer Dec 25, 2020
a7ced5c
Merge remote-tracking branch 'upstream/master'
beliefer Dec 30, 2020
31749bc
Simplify logical operation for multiple like
beliefer Dec 30, 2020
de5c166
Update code
beliefer Dec 31, 2020
20acb26
Update code
beliefer Dec 31, 2020
bcc0b90
Update code
beliefer Dec 31, 2020
e4934f1
Optimize code
beliefer Jan 4, 2021
2003a2b
Optimize code
beliefer Jan 4, 2021
7bc9813
Optimize code
beliefer Jan 4, 2021
b6ba902
Optimize code
beliefer Jan 4, 2021
fec4fe5
Merge branch 'master' into SPARK-33938
beliefer Jan 5, 2021
52ecedd
Remove Whitespace
beliefer Jan 5, 2021
454a68b
Optimize code
beliefer Jan 5, 2021
8f26d1b
Adjust code
beliefer Jan 5, 2021
e2c7dfe
Update code
beliefer Jan 5, 2021
6acf1c1
Update code
beliefer Jan 5, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ case class Like(left: Expression, right: Expression, escapeChar: Char)
}
}

abstract class MultiLikeBase
sealed abstract class MultiLikeBase
extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant {

protected def patterns: Seq[UTF8String]
Expand Down Expand Up @@ -220,7 +220,7 @@ abstract class MultiLikeBase
/**
* Optimized version of LIKE ALL, when all pattern values are literal.
*/
abstract class LikeAllBase extends MultiLikeBase {
sealed abstract class LikeAllBase extends MultiLikeBase {

override def matches(exprValue: String): Any = {
if (cache.forall(matchFunc(_, exprValue))) {
Expand Down Expand Up @@ -276,7 +276,7 @@ case class NotLikeAll(child: Expression, patterns: Seq[UTF8String]) extends Like
/**
* Optimized version of LIKE ANY, when all pattern values are literal.
*/
abstract class LikeAnyBase extends MultiLikeBase {
sealed abstract class LikeAnyBase extends MultiLikeBase {

override def matches(exprValue: String): Any = {
if (cache.exists(matchFunc(_, exprValue))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import scala.collection.immutable.HashSet
import scala.collection.mutable.{ArrayBuffer, Stack}

import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, _}
import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, MultiLikeBase, _}
import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral}
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull
Expand All @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

/*
* Optimization rules defined in this file should not affect the structure of the logical plan.
Expand Down Expand Up @@ -634,36 +635,68 @@ object LikeSimplification extends Rule[LogicalPlan] {
private val contains = "%([^_%]+)%".r
private val equalTo = "([^_%]*)".r

/**
 * Rewrites a single literal LIKE pattern into an equivalent, cheaper string predicate
 * (StartsWith / EndsWith / Contains / EqualTo) when the pattern has a recognizable shape.
 *
 * @param input the expression being matched against the pattern
 * @param pattern the literal LIKE pattern string
 * @param escapeChar the escape character in effect for the pattern (defaults to backslash)
 * @return Some(simplified predicate), or None when no safe rewrite applies
 */
private def simplifyLike(
    input: Expression, pattern: String, escapeChar: Char = '\\'): Option[Expression] = {
  pattern match {
    // There are three different situations when the pattern contains escapeChar:
    // 1. an invalid escape sequence, e.g. 'm\aca'
    // 2. an escaped wildcard character, e.g. 'ma\%ca'
    // 3. an escaped escape character, e.g. 'ma\\ca'
    // Some of these could still be optimized after resolving the escapes, but for
    // simplicity we skip the rewrite whenever the escape char appears at all.
    case p if p.contains(escapeChar) => None
    case startsWith(prefix) =>
      Some(StartsWith(input, Literal(prefix)))
    case endsWith(postfix) =>
      Some(EndsWith(input, Literal(postfix)))
    // The 'a%a' pattern is essentially 'a%' && '%a'. The additional `Length` guard is
    // required so that the single string 'a' does not wrongly satisfy 'a%a'.
    case startsAndEndsWith(prefix, postfix) =>
      Some(And(GreaterThanOrEqual(Length(input), Literal(prefix.length + postfix.length)),
        And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix)))))
    case contains(infix) =>
      Some(Contains(input, Literal(infix)))
    case equalTo(str) =>
      Some(EqualTo(input, Literal(str)))
    case _ => None
  }
}

/**
 * Simplifies a multi-pattern LIKE ([NOT] LIKE ALL / [NOT] LIKE ANY) by rewriting every
 * literal pattern that `simplifyLike` can handle into a cheaper predicate, and keeping
 * only the remaining (non-simplifiable) patterns inside the original multi-LIKE node.
 *
 * @param child the expression being matched
 * @param patterns the literal pattern values; individual entries may be null
 * @param multi the original multi-LIKE expression, returned unchanged when nothing applies
 */
private def simplifyMultiLike(
    child: Expression, patterns: Seq[UTF8String], multi: MultiLikeBase): Expression = {
  val (remainPatternMap, replacementMap) = patterns.map { p =>
    // A pattern value may be null (e.g. `x LIKE ALL ('a%', NULL)`). Guard with Option so
    // we don't NPE on `p.toString`; null patterns stay in `remainPatterns` and keep their
    // three-valued-logic semantics inside the residual multi-LIKE expression.
    p -> Option(p).flatMap(x => simplifyLike(child, x.toString))
  }.partition(_._2.isEmpty)
  val remainPatterns = remainPatternMap.map(_._1)
  val replacements = replacementMap.map(_._2.get)
  if (replacements.isEmpty) {
    multi
  } else {
    // NOTE(review): `reduceLeft` builds a left-nested And/Or chain, so a query with very
    // many simplifiable patterns produces a deep expression tree that can overflow the
    // stack in later tree transformations (reported with ~10k patterns). Consider a
    // conversion threshold or a balanced reduction if that becomes an issue.
    multi match {
      case l: LikeAll =>
        And(replacements.reduceLeft(And), l.copy(patterns = remainPatterns))
      case l: NotLikeAll =>
        And(replacements.map(Not(_)).reduceLeft(And), l.copy(patterns = remainPatterns))
      case l: LikeAny =>
        Or(replacements.reduceLeft(Or), l.copy(patterns = remainPatterns))
      case l: NotLikeAny =>
        Or(replacements.map(Not(_)).reduceLeft(Or), l.copy(patterns = remainPatterns))
    }
  }
}

/**
 * Replaces simple LIKE / LIKE ALL / LIKE ANY predicates built from literal patterns with
 * cheaper string predicates. Non-literal or non-simplifiable patterns are left untouched.
 */
def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
  case l @ Like(input, Literal(pattern, StringType), escapeChar) =>
    if (pattern == null) {
      // If pattern is null, return null value directly, since "col like null" == null.
      Literal(null, BooleanType)
    } else {
      // Stale pre-refactor inline `pattern.toString match { ... }` removed: the rewrite
      // logic now lives in `simplifyLike`; keep the original expression when it declines.
      simplifyLike(input, pattern.toString, escapeChar).getOrElse(l)
    }
  case l @ LikeAll(child, patterns) => simplifyMultiLike(child, patterns, l)
  case l @ NotLikeAll(child, patterns) => simplifyMultiLike(child, patterns, l)
  case l @ LikeAny(child, patterns) => simplifyMultiLike(child, patterns, l)
  case l @ NotLikeAny(child, patterns) => simplifyMultiLike(child, patterns, l)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,72 @@ class LikeSimplificationSuite extends PlanTest {
.analyze
comparePlans(optimized5, correctAnswer5)
}

test("simplify LikeAll") {
// Mix of simplifiable patterns and escaped ones ("abc\\%" etc.); the escaped patterns
// cannot be rewritten and must remain inside the residual `likeAll`.
val originalQuery =
testRelation
.where(('a likeAll(
"abc%", "abc\\%", "%xyz", "abc\\%def", "abc%def", "%mn%", "%mn\\%", "", "abc")))

val optimized = Optimize.execute(originalQuery.analyze)
// Simplifiable patterns become StartsWith/EndsWith/Contains/EqualTo predicates AND-ed
// together (left-nested, in pattern order), followed by the residual likeAll.
val correctAnswer = testRelation
.where((((((StartsWith('a, "abc") && EndsWith('a, "xyz")) &&
(Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def")))) &&
Contains('a, "mn")) && ('a === "")) && ('a === "abc")) &&
('a likeAll("abc\\%", "abc\\%def", "%mn\\%")))
.analyze

comparePlans(optimized, correctAnswer)
}

test("simplify NotLikeAll") {
// Same pattern mix as the LikeAll test; escaped patterns stay in the residual
// `notLikeAll`, simplifiable ones are negated individually.
val originalQuery =
testRelation
.where(('a notLikeAll(
"abc%", "abc\\%", "%xyz", "abc\\%def", "abc%def", "%mn%", "%mn\\%", "", "abc")))

val optimized = Optimize.execute(originalQuery.analyze)
// NOT LIKE ALL becomes Not(...) predicates AND-ed together, plus the residual notLikeAll.
val correctAnswer = testRelation
.where((((((Not(StartsWith('a, "abc")) && Not(EndsWith('a, "xyz"))) &&
Not(Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def")))) &&
Not(Contains('a, "mn"))) && Not('a === "")) && Not('a === "abc")) &&
('a notLikeAll("abc\\%", "abc\\%def", "%mn\\%")))
.analyze

comparePlans(optimized, correctAnswer)
}

test("simplify LikeAny") {
// Mix of simplifiable patterns and escaped ones; escaped patterns must remain inside
// the residual `likeAny`.
val originalQuery =
testRelation
.where(('a likeAny(
"abc%", "abc\\%", "%xyz", "abc\\%def", "abc%def", "%mn%", "%mn\\%", "", "abc")))

val optimized = Optimize.execute(originalQuery.analyze)
// LIKE ANY combines the rewritten predicates with OR (left-nested, in pattern order),
// followed by the residual likeAny.
val correctAnswer = testRelation
.where((((((StartsWith('a, "abc") || EndsWith('a, "xyz")) ||
(Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def")))) ||
Contains('a, "mn")) || ('a === "")) || ('a === "abc")) ||
('a likeAny("abc\\%", "abc\\%def", "%mn\\%")))
.analyze

comparePlans(optimized, correctAnswer)
}

test("simplify NotLikeAny") {
// Same pattern mix; escaped patterns stay in the residual `notLikeAny`, simplifiable
// ones are negated individually.
val originalQuery =
testRelation
.where(('a notLikeAny(
"abc%", "abc\\%", "%xyz", "abc\\%def", "abc%def", "%mn%", "%mn\\%", "", "abc")))

val optimized = Optimize.execute(originalQuery.analyze)
// NOT LIKE ANY combines Not(...) predicates with OR, plus the residual notLikeAny.
val correctAnswer = testRelation
.where((((((Not(StartsWith('a, "abc")) || Not(EndsWith('a, "xyz"))) ||
Not(Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def")))) ||
Not(Contains('a, "mn"))) || Not('a === "")) || Not('a === "abc")) ||
('a notLikeAny("abc\\%", "abc\\%def", "%mn\\%")))
.analyze

comparePlans(optimized, correctAnswer)
}
}