Skip to content

Commit 341f7a8

Browse files
committed
simplify orc filters
1 parent 02ca045 commit 341f7a8

2 files changed

Lines changed: 68 additions & 113 deletions

File tree

  • sql/core
    • v1.2.1/src/main/scala/org/apache/spark/sql/execution/datasources/orc
    • v2.3.5/src/main/scala/org/apache/spark/sql/execution/datasources/orc

sql/core/v1.2.1/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala

Lines changed: 34 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ private[sql] object OrcFilters extends OrcFiltersBase {
6666
for {
6767
// Combines all convertible filters using `And` to produce a single conjunction
6868
conjunction <- buildTree(convertibleFilters(schema, dataTypeMap, filters))
69-
// Then tries to build a single ORC `SearchArgument` for the conjunction predicate
69+
// Then tries to build a single ORC `SearchArgument` for the conjunction predicate.
70+
// The input predicate is fully convertible. There should not be any empty result in the
71+
// following recursive method call `buildSearchArgument`.
7072
builder <- buildSearchArgument(dataTypeMap, conjunction, newBuilder)
7173
} yield builder.build()
7274
}
@@ -80,6 +82,17 @@ private[sql] object OrcFilters extends OrcFiltersBase {
8082
def convertibleFiltersHelper(
8183
filter: Filter,
8284
canPartialPushDown: Boolean): Option[Filter] = filter match {
85+
// At here, it is not safe to just convert one side and remove the other side
86+
// if we do not understand what the parent filters are.
87+
//
88+
// Here is an example used to explain the reason.
89+
// Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to
90+
// convert b in ('1'). If we only convert a = 2, we will end up with a filter
91+
// NOT(a = 2), which will generate wrong results.
92+
//
93+
// Pushing one side of AND down is only safe to do at the top level or in the child
94+
// AND before hitting NOT or OR conditions, and in this case, the unsupported predicate
95+
// can be safely removed.
8396
case And(left, right) =>
8497
val leftResultOptional = convertibleFiltersHelper(left, canPartialPushDown)
8598
val rightResultOptional = convertibleFiltersHelper(right, canPartialPushDown)
@@ -90,6 +103,17 @@ private[sql] object OrcFilters extends OrcFiltersBase {
90103
case _ => None
91104
}
92105

106+
// The Or predicate is convertible when both of its children can be pushed down.
107+
// That is to say, if one/both of the children can be partially pushed down, the Or
108+
// predicate can be partially pushed down as well.
109+
//
110+
// Here is an example used to explain the reason.
111+
// Let's say we have
112+
// (a1 AND a2) OR (b1 AND b2),
113+
// a1 and b1 is convertible, while a2 and b2 is not.
114+
// The predicate can be converted as
115+
// (a1 OR b1) AND (a1 OR b2) AND (a2 OR b1) AND (a2 OR b2)
116+
// As per the logical in And predicate, we can push down (a1 OR b1).
93117
case Or(left, right) =>
94118
val leftResultOptional = convertibleFiltersHelper(left, canPartialPushDown)
95119
val rightResultOptional = convertibleFiltersHelper(right, canPartialPushDown)
@@ -150,87 +174,40 @@ private[sql] object OrcFilters extends OrcFiltersBase {
150174
dataTypeMap: Map[String, DataType],
151175
expression: Filter,
152176
builder: Builder): Option[Builder] = {
153-
createBuilder(dataTypeMap, expression, builder, canPartialPushDownConjuncts = true)
177+
createBuilder(dataTypeMap, expression, builder)
154178
}
155179

156180
/**
157181
* @param dataTypeMap a map from the attribute name to its data type.
158182
* @param expression the input filter predicates.
159183
* @param builder the input SearchArgument.Builder.
160-
* @param canPartialPushDownConjuncts whether a subset of conjuncts of predicates can be pushed
161-
* down safely. Pushing ONLY one side of AND down is safe to
162-
* do at the top level or none of its ancestors is NOT and OR.
163184
* @return the builder so far.
164185
*/
165186
private def createBuilder(
166187
dataTypeMap: Map[String, DataType],
167188
expression: Filter,
168-
builder: Builder,
169-
canPartialPushDownConjuncts: Boolean): Option[Builder] = {
189+
builder: Builder): Option[Builder] = {
170190
def getType(attribute: String): PredicateLeaf.Type =
171191
getPredicateLeafType(dataTypeMap(attribute))
172192

173193
import org.apache.spark.sql.sources._
174194

175195
expression match {
176196
case And(left, right) =>
177-
// At here, it is not safe to just convert one side and remove the other side
178-
// if we do not understand what the parent filters are.
179-
//
180-
// Here is an example used to explain the reason.
181-
// Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to
182-
// convert b in ('1'). If we only convert a = 2, we will end up with a filter
183-
// NOT(a = 2), which will generate wrong results.
184-
//
185-
// Pushing one side of AND down is only safe to do at the top level or in the child
186-
// AND before hitting NOT or OR conditions, and in this case, the unsupported predicate
187-
// can be safely removed.
188-
val leftBuilderOption =
189-
createBuilder(dataTypeMap, left, newBuilder, canPartialPushDownConjuncts)
190-
val rightBuilderOption =
191-
createBuilder(dataTypeMap, right, newBuilder, canPartialPushDownConjuncts)
192-
(leftBuilderOption, rightBuilderOption) match {
193-
case (Some(_), Some(_)) =>
194-
for {
195-
lhs <- createBuilder(dataTypeMap, left,
196-
builder.startAnd(), canPartialPushDownConjuncts)
197-
rhs <- createBuilder(dataTypeMap, right, lhs, canPartialPushDownConjuncts)
198-
} yield rhs.end()
199-
200-
case (Some(_), None) if canPartialPushDownConjuncts =>
201-
createBuilder(dataTypeMap, left, builder, canPartialPushDownConjuncts)
202-
203-
case (None, Some(_)) if canPartialPushDownConjuncts =>
204-
createBuilder(dataTypeMap, right, builder, canPartialPushDownConjuncts)
205-
206-
case _ => None
207-
}
197+
for {
198+
lhs <- createBuilder(dataTypeMap, left, builder.startAnd())
199+
rhs <- createBuilder(dataTypeMap, right, lhs)
200+
} yield rhs.end()
208201

209202
case Or(left, right) =>
210-
// The Or predicate is convertible when both of its children can be pushed down.
211-
// That is to say, if one/both of the children can be partially pushed down, the Or
212-
// predicate can be partially pushed down as well.
213-
//
214-
// Here is an example used to explain the reason.
215-
// Let's say we have
216-
// (a1 AND a2) OR (b1 AND b2),
217-
// a1 and b1 is convertible, while a2 and b2 is not.
218-
// The predicate can be converted as
219-
// (a1 OR b1) AND (a1 OR b2) AND (a2 OR b1) AND (a2 OR b2)
220-
// As per the logical in And predicate, we can push down (a1 OR b1).
221203
for {
222-
_ <- createBuilder(dataTypeMap, left, newBuilder, canPartialPushDownConjuncts)
223-
_ <- createBuilder(dataTypeMap, right, newBuilder, canPartialPushDownConjuncts)
224-
lhs <- createBuilder(dataTypeMap, left,
225-
builder.startOr(), canPartialPushDownConjuncts)
226-
rhs <- createBuilder(dataTypeMap, right, lhs, canPartialPushDownConjuncts)
204+
lhs <- createBuilder(dataTypeMap, left, builder.startOr())
205+
rhs <- createBuilder(dataTypeMap, right, lhs)
227206
} yield rhs.end()
228207

229208
case Not(child) =>
230209
for {
231-
_ <- createBuilder(dataTypeMap, child, newBuilder, canPartialPushDownConjuncts = false)
232-
negate <- createBuilder(dataTypeMap,
233-
child, builder.startNot(), canPartialPushDownConjuncts = false)
210+
negate <- createBuilder(dataTypeMap, child, builder.startNot())
234211
} yield negate.end()
235212

236213
// NOTE: For all case branches dealing with leaf predicates below, the additional `startAnd()`

sql/core/v2.3.5/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala

Lines changed: 34 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ private[sql] object OrcFilters extends OrcFiltersBase {
6666
for {
6767
// Combines all convertible filters using `And` to produce a single conjunction
6868
conjunction <- buildTree(convertibleFilters(schema, dataTypeMap, filters))
69-
// Then tries to build a single ORC `SearchArgument` for the conjunction predicate
69+
// Then tries to build a single ORC `SearchArgument` for the conjunction predicate.
70+
// The input predicate is fully convertible. There should not be any empty result in the
71+
// following recursive method call `buildSearchArgument`.
7072
builder <- buildSearchArgument(dataTypeMap, conjunction, newBuilder)
7173
} yield builder.build()
7274
}
@@ -80,6 +82,17 @@ private[sql] object OrcFilters extends OrcFiltersBase {
8082
def convertibleFiltersHelper(
8183
filter: Filter,
8284
canPartialPushDown: Boolean): Option[Filter] = filter match {
85+
// At here, it is not safe to just convert one side and remove the other side
86+
// if we do not understand what the parent filters are.
87+
//
88+
// Here is an example used to explain the reason.
89+
// Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to
90+
// convert b in ('1'). If we only convert a = 2, we will end up with a filter
91+
// NOT(a = 2), which will generate wrong results.
92+
//
93+
// Pushing one side of AND down is only safe to do at the top level or in the child
94+
// AND before hitting NOT or OR conditions, and in this case, the unsupported predicate
95+
// can be safely removed.
8396
case And(left, right) =>
8497
val leftResultOptional = convertibleFiltersHelper(left, canPartialPushDown)
8598
val rightResultOptional = convertibleFiltersHelper(right, canPartialPushDown)
@@ -90,6 +103,17 @@ private[sql] object OrcFilters extends OrcFiltersBase {
90103
case _ => None
91104
}
92105

106+
// The Or predicate is convertible when both of its children can be pushed down.
107+
// That is to say, if one/both of the children can be partially pushed down, the Or
108+
// predicate can be partially pushed down as well.
109+
//
110+
// Here is an example used to explain the reason.
111+
// Let's say we have
112+
// (a1 AND a2) OR (b1 AND b2),
113+
// a1 and b1 is convertible, while a2 and b2 is not.
114+
// The predicate can be converted as
115+
// (a1 OR b1) AND (a1 OR b2) AND (a2 OR b1) AND (a2 OR b2)
116+
// As per the logical in And predicate, we can push down (a1 OR b1).
93117
case Or(left, right) =>
94118
val leftResultOptional = convertibleFiltersHelper(left, canPartialPushDown)
95119
val rightResultOptional = convertibleFiltersHelper(right, canPartialPushDown)
@@ -150,86 +174,40 @@ private[sql] object OrcFilters extends OrcFiltersBase {
150174
dataTypeMap: Map[String, DataType],
151175
expression: Filter,
152176
builder: Builder): Option[Builder] = {
153-
createBuilder(dataTypeMap, expression, builder, canPartialPushDownConjuncts = true)
177+
createBuilder(dataTypeMap, expression, builder)
154178
}
155179

156180
/**
157181
* @param dataTypeMap a map from the attribute name to its data type.
158182
* @param expression the input filter predicates.
159183
* @param builder the input SearchArgument.Builder.
160-
* @param canPartialPushDownConjuncts whether a subset of conjuncts of predicates can be pushed
161-
* down safely. Pushing ONLY one side of AND down is safe to
162-
* do at the top level or none of its ancestors is NOT and OR.
163184
* @return the builder so far.
164185
*/
165186
private def createBuilder(
166187
dataTypeMap: Map[String, DataType],
167188
expression: Filter,
168-
builder: Builder,
169-
canPartialPushDownConjuncts: Boolean): Option[Builder] = {
189+
builder: Builder): Option[Builder] = {
170190
def getType(attribute: String): PredicateLeaf.Type =
171191
getPredicateLeafType(dataTypeMap(attribute))
172192

173193
import org.apache.spark.sql.sources._
174194

175195
expression match {
176196
case And(left, right) =>
177-
// At here, it is not safe to just convert one side and remove the other side
178-
// if we do not understand what the parent filters are.
179-
//
180-
// Here is an example used to explain the reason.
181-
// Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to
182-
// convert b in ('1'). If we only convert a = 2, we will end up with a filter
183-
// NOT(a = 2), which will generate wrong results.
184-
//
185-
// Pushing one side of AND down is only safe to do at the top level or in the child
186-
// AND before hitting NOT or OR conditions, and in this case, the unsupported predicate
187-
// can be safely removed.
188-
val leftBuilderOption =
189-
createBuilder(dataTypeMap, left, newBuilder, canPartialPushDownConjuncts)
190-
val rightBuilderOption =
191-
createBuilder(dataTypeMap, right, newBuilder, canPartialPushDownConjuncts)
192-
(leftBuilderOption, rightBuilderOption) match {
193-
case (Some(_), Some(_)) =>
194-
for {
195-
lhs <- createBuilder(dataTypeMap, left,
196-
builder.startAnd(), canPartialPushDownConjuncts)
197-
rhs <- createBuilder(dataTypeMap, right, lhs, canPartialPushDownConjuncts)
198-
} yield rhs.end()
199-
200-
case (Some(_), None) if canPartialPushDownConjuncts =>
201-
createBuilder(dataTypeMap, left, builder, canPartialPushDownConjuncts)
202-
203-
case (None, Some(_)) if canPartialPushDownConjuncts =>
204-
createBuilder(dataTypeMap, right, builder, canPartialPushDownConjuncts)
205-
206-
case _ => None
207-
}
197+
for {
198+
lhs <- createBuilder(dataTypeMap, left, builder.startAnd())
199+
rhs <- createBuilder(dataTypeMap, right, lhs)
200+
} yield rhs.end()
208201

209202
case Or(left, right) =>
210-
// The Or predicate is convertible when both of its children can be pushed down.
211-
// That is to say, if one/both of the children can be partially pushed down, the Or
212-
// predicate can be partially pushed down as well.
213-
//
214-
// Here is an example used to explain the reason.
215-
// Let's say we have
216-
// (a1 AND a2) OR (b1 AND b2),
217-
// a1 and b1 is convertible, while a2 and b2 is not.
218-
// The predicate can be converted as
219-
// (a1 OR b1) AND (a1 OR b2) AND (a2 OR b1) AND (a2 OR b2)
220-
// As per the logical in And predicate, we can push down (a1 OR b1).
221203
for {
222-
_ <- createBuilder(dataTypeMap, left, newBuilder, canPartialPushDownConjuncts)
223-
_ <- createBuilder(dataTypeMap, right, newBuilder, canPartialPushDownConjuncts)
224-
lhs <- createBuilder(dataTypeMap, left, builder.startOr(), canPartialPushDownConjuncts)
225-
rhs <- createBuilder(dataTypeMap, right, lhs, canPartialPushDownConjuncts)
204+
lhs <- createBuilder(dataTypeMap, left, builder.startOr())
205+
rhs <- createBuilder(dataTypeMap, right, lhs)
226206
} yield rhs.end()
227207

228208
case Not(child) =>
229209
for {
230-
_ <- createBuilder(dataTypeMap, child, newBuilder, canPartialPushDownConjuncts = false)
231-
negate <- createBuilder(dataTypeMap,
232-
child, builder.startNot(), canPartialPushDownConjuncts = false)
210+
negate <- createBuilder(dataTypeMap, child, builder.startNot())
233211
} yield negate.end()
234212

235213
// NOTE: For all case branches dealing with leaf predicates below, the additional `startAnd()`

0 commit comments

Comments
 (0)