Skip to content

Commit 685dff3

Browse files
author
wangzhenhua
committed
convert compound Not conditions
1 parent 895662d commit 685dff3

2 files changed

Lines changed: 19 additions & 35 deletions

File tree

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -104,36 +104,20 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
104104
val percent2 = calculateFilterSelectivity(cond2, update = false).getOrElse(1.0)
105105
Some(percent1 + percent2 - (percent1 * percent2))
106106

107-
// For AND and OR conditions, we will estimate conservatively if one of two
108-
// components is not supported, e.g. suppose c1 is not supported,
109-
// then p(And(c1, c2)) = p(c2), and p(Or(c1, c2)) = 1.0.
110-
// But once they are wrapped in NOT condition, then after 1 - p, it becomes
111-
// under-estimation. So in these cases, we consider them as unsupported.
112107
case Not(And(cond1, cond2)) =>
113-
val p1 = calculateFilterSelectivity(cond1, update = false)
114-
val p2 = calculateFilterSelectivity(cond2, update = false)
115-
if (p1.isDefined && p2.isDefined) {
116-
Some(1 - p1.get * p2.get)
117-
} else {
118-
None
119-
}
108+
calculateFilterSelectivity(Or(Not(cond1), Not(cond2)), update = false)
120109

121110
case Not(Or(cond1, cond2)) =>
122-
val p1 = calculateFilterSelectivity(cond1, update = false)
123-
val p2 = calculateFilterSelectivity(cond2, update = false)
124-
if (p1.isDefined && p2.isDefined) {
125-
Some(1 - (p1.get + p2.get - (p1.get * p2.get)))
126-
} else {
127-
None
128-
}
111+
calculateFilterSelectivity(And(Not(cond1), Not(cond2)), update = false)
129112

130113
case Not(cond) =>
131114
calculateFilterSelectivity(cond, update = false) match {
132115
case Some(percent) => Some(1.0 - percent)
133116
case None => None
134117
}
135118

136-
case _ => calculateSingleCondition(condition, update)
119+
case _ =>
120+
calculateSingleCondition(condition, update)
137121
}
138122
}
139123

@@ -472,12 +456,15 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
472456
percent = op match {
473457
case _: LessThan =>
474458
if (numericLiteral == max) {
459+
// If the literal value is right on the boundary, we can subtract the fraction
460+
// that corresponds to the boundary value (1/ndv).
475461
1.0 - 1.0 / ndv
476462
} else {
477463
(numericLiteral - min) / (max - min)
478464
}
479465
case _: LessThanOrEqual =>
480466
if (numericLiteral == min) {
467+
// The boundary value is the only satisfying value.
481468
1.0 / ndv
482469
} else {
483470
(numericLiteral - min) / (max - min)
@@ -505,14 +492,11 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
505492
if (newNdv < 1) newNdv = 1
506493

507494
op match {
508-
case _: GreaterThan =>
509-
if (newNdv == 1) newMin = newMax else newMin = newValue
510-
case _: GreaterThanOrEqual =>
511-
newMin = newValue
512-
case _: LessThan =>
513-
if (newNdv == 1) newMax = newMin else newMax = newValue
514-
case _: LessThanOrEqual =>
515-
newMax = newValue
495+
case _: GreaterThan | _: GreaterThanOrEqual =>
496+
// If new ndv is 1, then new max must be equal to new min.
497+
newMin = if (newNdv == 1) newMax else newValue
498+
case _: LessThan | _: LessThanOrEqual =>
499+
newMax = if (newNdv == 1) newMin else newValue
516500
}
517501

518502
val newStats =

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -188,28 +188,28 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
188188
expectedRowCount = 6)
189189
}
190190

191-
test("Not(cint = 3 OR cint = 6)") {
192-
val condition = Not(Or(EqualTo(attrInt, Literal(3)), EqualTo(attrInt, Literal(6))))
191+
test("Not(cint <= 3 OR cint > 6)") {
192+
val condition = Not(Or(LessThanOrEqual(attrInt, Literal(3)), GreaterThan(attrInt, Literal(6))))
193193
validateEstimatedStats(
194194
Filter(condition, childStatsTestPlan(Seq(attrInt), 10L)),
195195
Seq(attrInt -> colStatInt),
196-
expectedRowCount = 9)
196+
expectedRowCount = 5)
197197
}
198198

199-
test("Not(cint > 3 AND cstring < 'A8') - unsupported") {
200-
val condition = Not(And(GreaterThan(attrInt, Literal(3)), LessThan(attrString, Literal("A8"))))
199+
test("Not(cint = 3 AND cstring < 'A8')") {
200+
val condition = Not(And(EqualTo(attrInt, Literal(3)), LessThan(attrString, Literal("A8"))))
201201
validateEstimatedStats(
202202
Filter(condition, childStatsTestPlan(Seq(attrInt, attrString), 10L)),
203203
Seq(attrInt -> colStatInt, attrString -> colStatString),
204204
expectedRowCount = 10)
205205
}
206206

207-
test("Not(cint = 3 OR cstring < 'A8') - unsupported") {
207+
test("Not(cint = 3 OR cstring < 'A8')") {
208208
val condition = Not(Or(EqualTo(attrInt, Literal(3)), LessThan(attrString, Literal("A8"))))
209209
validateEstimatedStats(
210210
Filter(condition, childStatsTestPlan(Seq(attrInt, attrString), 10L)),
211211
Seq(attrInt -> colStatInt, attrString -> colStatString),
212-
expectedRowCount = 10)
212+
expectedRowCount = 9)
213213
}
214214

215215
test("cint IN (3, 4, 5)") {

0 commit comments

Comments
 (0)