Skip to content

Commit 5685a05

Browse files
beliefer authored and cloud-fan committed
[SPARK-33938][SQL][3.1] Optimize Like Any/All by LikeSimplification
### What changes were proposed in this pull request?
We should optimize Like Any/All with LikeSimplification to improve performance.

### Why are the changes needed?
To optimize Like Any/All.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Jenkins test.

Closes #31063 from beliefer/SPARK-33938_backport-3.1.

Authored-by: gengjiaan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 78d29fe commit 5685a05

3 files changed

Lines changed: 127 additions & 26 deletions

File tree

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ case class Like(left: Expression, right: Expression, escapeChar: Char)
180180
}
181181
}
182182

183-
abstract class MultiLikeBase
183+
sealed abstract class MultiLikeBase
184184
extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant {
185185

186186
protected def patterns: Seq[UTF8String]
@@ -219,7 +219,7 @@ abstract class MultiLikeBase
219219
/**
220220
* Optimized version of LIKE ALL, when all pattern values are literal.
221221
*/
222-
abstract class LikeAllBase extends MultiLikeBase {
222+
sealed abstract class LikeAllBase extends MultiLikeBase {
223223

224224
override def matches(exprValue: String): Any = {
225225
if (cache.forall(matchFunc(_, exprValue))) {
@@ -275,7 +275,7 @@ case class NotLikeAll(child: Expression, patterns: Seq[UTF8String]) extends Like
275275
/**
276276
* Optimized version of LIKE ANY, when all pattern values are literal.
277277
*/
278-
abstract class LikeAnyBase extends MultiLikeBase {
278+
sealed abstract class LikeAnyBase extends MultiLikeBase {
279279

280280
override def matches(exprValue: String): Any = {
281281
if (cache.exists(matchFunc(_, exprValue))) {

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala

Lines changed: 56 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
3030
import org.apache.spark.sql.catalyst.rules._
3131
import org.apache.spark.sql.internal.SQLConf
3232
import org.apache.spark.sql.types._
33+
import org.apache.spark.unsafe.types.UTF8String
3334

3435
/*
3536
* Optimization rules defined in this file should not affect the structure of the logical plan.
@@ -542,36 +543,68 @@ object LikeSimplification extends Rule[LogicalPlan] {
542543
private val contains = "%([^_%]+)%".r
543544
private val equalTo = "([^_%]*)".r
544545

546+
/**
 * Tries to rewrite a single LIKE pattern over `input` as a cheaper string predicate.
 *
 * @param input the expression the pattern is matched against
 * @param pattern the LIKE pattern text
 * @param escapeChar the escape character of the LIKE expression, backslash by default
 * @return the replacement predicate, or None when the pattern is left untouched
 */
private def simplifyLike(
    input: Expression, pattern: String, escapeChar: Char = '\\'): Option[Expression] = {
  pattern match {
    // A pattern holding the escape character is one of:
    //   1. an invalid escape sequence, e.g. 'm\aca'
    //   2. an escaped wildcard character, e.g. 'ma\%ca'
    //   3. an escaped escape character, e.g. 'ma\\ca'
    // Some of these could be optimized after resolving the escapes, but for simplicity every
    // pattern containing the escape character is skipped.
    case escaped if escaped.contains(escapeChar) =>
      None
    case startsWith(prefix) =>
      Some(StartsWith(input, Literal(prefix)))
    case endsWith(postfix) =>
      Some(EndsWith(input, Literal(postfix)))
    // 'a%a' is basically 'a%' && '%a', but a `Length` condition is also required so that the
    // input 'a' does not match the pattern 'a%a'.
    case startsAndEndsWith(prefix, postfix) =>
      val longEnough =
        GreaterThanOrEqual(Length(input), Literal(prefix.length + postfix.length))
      val matchesBothEnds =
        And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix)))
      Some(And(longEnough, matchesBothEnds))
    case contains(infix) =>
      Some(Contains(input, Literal(infix)))
    case equalTo(str) =>
      Some(EqualTo(input, Literal(str)))
    case _ =>
      None
  }
}
575+
576+
/**
 * Rewrites a multi-pattern LIKE expression (LikeAll, NotLikeAll, LikeAny, NotLikeAny) by
 * replacing every pattern that `simplifyLike` can handle with its cheaper predicate.
 *
 * The replacements are combined with And (ALL variants) or Or (ANY variants), negated for the
 * NOT variants. Patterns that cannot be simplified are kept in a copy of the original
 * multi-LIKE expression, which is combined with the replacements using the same operator.
 *
 * @param child the expression all patterns are matched against
 * @param patterns the literal patterns of the multi-LIKE expression; entries may be null
 * @param multi the original multi-LIKE expression
 * @return the rewritten expression, or `multi` itself when no pattern can be simplified
 */
private def simplifyMultiLike(
    child: Expression, patterns: Seq[UTF8String], multi: MultiLikeBase): Expression = {
  // A null pattern cannot be simplified, and calling `toString` on it would throw a
  // NullPointerException, so it is wrapped in Option and stays among the remaining patterns.
  val (remainPatternMap, replacementMap) =
    patterns.map { p =>
      p -> Option(p).flatMap(nonNull => simplifyLike(child, nonNull.toString))
    }.partition(_._2.isEmpty)
  val remainPatterns = remainPatternMap.map(_._1)
  val replacements = replacementMap.collect { case (_, Some(replacement)) => replacement }

  // Combines the simplified predicates with the multi-LIKE over the remaining patterns. When
  // every pattern was simplified, the multi-LIKE would have no patterns left and adds nothing
  // to the conjunction/disjunction, so it is dropped.
  def withRemaining(
      simplified: Expression,
      combine: (Expression, Expression) => Expression,
      remaining: => Expression): Expression = {
    if (remainPatterns.nonEmpty) combine(simplified, remaining) else simplified
  }

  if (replacements.isEmpty) {
    multi
  } else {
    multi match {
      case l: LikeAll =>
        withRemaining(replacements.reduceLeft(And), And, l.copy(patterns = remainPatterns))
      case l: NotLikeAll =>
        withRemaining(
          replacements.map(Not(_)).reduceLeft(And), And, l.copy(patterns = remainPatterns))
      case l: LikeAny =>
        withRemaining(replacements.reduceLeft(Or), Or, l.copy(patterns = remainPatterns))
      case l: NotLikeAny =>
        withRemaining(
          replacements.map(Not(_)).reduceLeft(Or), Or, l.copy(patterns = remainPatterns))
    }
  }
}
595+
545596
def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
546597
case l @ Like(input, Literal(pattern, StringType), escapeChar) =>
547598
if (pattern == null) {
548599
// If pattern is null, return null value directly, since "col like null" == null.
549600
Literal(null, BooleanType)
550601
} else {
551-
pattern.toString match {
552-
// There are three different situations when pattern containing escapeChar:
553-
// 1. pattern contains invalid escape sequence, e.g. 'm\aca'
554-
// 2. pattern contains escaped wildcard character, e.g. 'ma\%ca'
555-
// 3. pattern contains escaped escape character, e.g. 'ma\\ca'
556-
// Although there are patterns can be optimized if we handle the escape first, we just
557-
// skip this rule if pattern contains any escapeChar for simplicity.
558-
case p if p.contains(escapeChar) => l
559-
case startsWith(prefix) =>
560-
StartsWith(input, Literal(prefix))
561-
case endsWith(postfix) =>
562-
EndsWith(input, Literal(postfix))
563-
// 'a%a' pattern is basically same with 'a%' && '%a'.
564-
// However, the additional `Length` condition is required to prevent 'a' match 'a%a'.
565-
case startsAndEndsWith(prefix, postfix) =>
566-
And(GreaterThanOrEqual(Length(input), Literal(prefix.length + postfix.length)),
567-
And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix))))
568-
case contains(infix) =>
569-
Contains(input, Literal(infix))
570-
case equalTo(str) =>
571-
EqualTo(input, Literal(str))
572-
case _ => l
573-
}
602+
simplifyLike(input, pattern.toString, escapeChar).getOrElse(l)
574603
}
604+
case l @ LikeAll(child, patterns) => simplifyMultiLike(child, patterns, l)
605+
case l @ NotLikeAll(child, patterns) => simplifyMultiLike(child, patterns, l)
606+
case l @ LikeAny(child, patterns) => simplifyMultiLike(child, patterns, l)
607+
case l @ NotLikeAny(child, patterns) => simplifyMultiLike(child, patterns, l)
575608
}
576609
}
577610

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,4 +164,72 @@ class LikeSimplificationSuite extends PlanTest {
164164
.analyze
165165
comparePlans(optimized5, correctAnswer5)
166166
}
167+
168+
// LIKE ALL: simplifiable patterns become a conjunction of string predicates, while patterns
// containing the escape character stay in a residual LIKE ALL.
test("simplify LikeAll") {
  val query = testRelation.where('a.likeAll(
    "abc%", "abc\\%", "%xyz", "abc\\%def", "abc%def", "%mn%", "%mn\\%", "", "abc"))

  val actual = Optimize.execute(query.analyze)

  // Replacement for the 'abc%def' pattern.
  val prefixAndSuffix =
    Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def"))
  // Patterns with an escape character are not simplified.
  val residual = 'a.likeAll("abc\\%", "abc\\%def", "%mn\\%")

  val expected = testRelation
    .where(
      StartsWith('a, "abc") && EndsWith('a, "xyz") && prefixAndSuffix &&
        Contains('a, "mn") && ('a === "") && ('a === "abc") && residual)
    .analyze

  comparePlans(actual, expected)
}
184+
185+
// NOT LIKE ALL: simplifiable patterns become a conjunction of negated string predicates, while
// patterns containing the escape character stay in a residual NOT LIKE ALL.
test("simplify NotLikeAll") {
  val query = testRelation.where('a.notLikeAll(
    "abc%", "abc\\%", "%xyz", "abc\\%def", "abc%def", "%mn%", "%mn\\%", "", "abc"))

  val actual = Optimize.execute(query.analyze)

  // Replacement for the 'abc%def' pattern, before negation.
  val prefixAndSuffix =
    Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def"))
  // Patterns with an escape character are not simplified.
  val residual = 'a.notLikeAll("abc\\%", "abc\\%def", "%mn\\%")

  val expected = testRelation
    .where(
      Not(StartsWith('a, "abc")) && Not(EndsWith('a, "xyz")) && Not(prefixAndSuffix) &&
        Not(Contains('a, "mn")) && Not('a === "") && Not('a === "abc") && residual)
    .analyze

  comparePlans(actual, expected)
}
201+
202+
// LIKE ANY: simplifiable patterns become a disjunction of string predicates, while patterns
// containing the escape character stay in a residual LIKE ANY.
test("simplify LikeAny") {
  val query = testRelation.where('a.likeAny(
    "abc%", "abc\\%", "%xyz", "abc\\%def", "abc%def", "%mn%", "%mn\\%", "", "abc"))

  val actual = Optimize.execute(query.analyze)

  // Replacement for the 'abc%def' pattern.
  val prefixAndSuffix =
    Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def"))
  // Patterns with an escape character are not simplified.
  val residual = 'a.likeAny("abc\\%", "abc\\%def", "%mn\\%")

  val expected = testRelation
    .where(
      StartsWith('a, "abc") || EndsWith('a, "xyz") || prefixAndSuffix ||
        Contains('a, "mn") || ('a === "") || ('a === "abc") || residual)
    .analyze

  comparePlans(actual, expected)
}
218+
219+
// NOT LIKE ANY: simplifiable patterns become a disjunction of negated string predicates, while
// patterns containing the escape character stay in a residual NOT LIKE ANY.
test("simplify NotLikeAny") {
  val query = testRelation.where('a.notLikeAny(
    "abc%", "abc\\%", "%xyz", "abc\\%def", "abc%def", "%mn%", "%mn\\%", "", "abc"))

  val actual = Optimize.execute(query.analyze)

  // Replacement for the 'abc%def' pattern, before negation.
  val prefixAndSuffix =
    Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def"))
  // Patterns with an escape character are not simplified.
  val residual = 'a.notLikeAny("abc\\%", "abc\\%def", "%mn\\%")

  val expected = testRelation
    .where(
      Not(StartsWith('a, "abc")) || Not(EndsWith('a, "xyz")) || Not(prefixAndSuffix) ||
        Not(Contains('a, "mn")) || Not('a === "") || Not('a === "abc") || residual)
    .analyze

  comparePlans(actual, expected)
}
167235
}

0 commit comments

Comments
 (0)