Skip to content

Commit dc7ce09

Browse files
committed
DRILL-5796 : implement ROWS_MATCH enum to keep inside rowgroup the filter result information, used to prune the filter if all rows match.
1 parent f1a3bd1 commit dc7ce09

17 files changed

Lines changed: 393 additions & 195 deletions

exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetBooleanPredicate.java

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,29 @@ private static <C extends Comparable<C>> LogicalExpression createAndPredicate(
4646
ExpressionPosition pos
4747
) {
4848
return new ParquetBooleanPredicate<C>(name, args, pos) {
49+
/**
50+
* Evaluates a compound "AND" filter on the statistics of a RowGroup (the filter reads "filterA and filterB").
51+
* Return value :<ul>
52+
* <li>ALL : only if all filters return ALL
53+
* <li>NONE : if one filter at least returns NONE
54+
* <li>SOME : all other cases
55+
* </ul>
56+
*/
4957
@Override
50-
public boolean canDrop(RangeExprEvaluator<C> evaluator) {
51-
// "and" : as long as one branch is OK to drop, we can drop it.
58+
public RowsMatch matches(RangeExprEvaluator<C> evaluator) {
59+
RowsMatch resultMatch = RowsMatch.ALL;
5260
for (LogicalExpression child : this) {
53-
if (child instanceof ParquetFilterPredicate && ((ParquetFilterPredicate)child).canDrop(evaluator)) {
54-
return true;
61+
if (child instanceof ParquetFilterPredicate) {
62+
switch (((ParquetFilterPredicate) child).matches(evaluator)) {
63+
case NONE:
64+
return RowsMatch.NONE; // No row comply to 1 filter part => can drop RG
65+
case SOME:
66+
resultMatch = RowsMatch.SOME;
67+
default: // Do nothing
68+
}
5569
}
5670
}
57-
return false;
71+
return resultMatch;
5872
}
5973
};
6074
}
@@ -66,15 +80,29 @@ private static <C extends Comparable<C>> LogicalExpression createOrPredicate(
6680
ExpressionPosition pos
6781
) {
6882
return new ParquetBooleanPredicate<C>(name, args, pos) {
83+
/**
84+
* Evaluates a compound "OR" filter on the statistics of a RowGroup (the filter reads "filterA or filterB").
85+
* Return value :<ul>
86+
* <li>NONE : only if all filters return NONE
87+
* <li>ALL : if one filter at least returns ALL
88+
* <li>SOME : all other cases
89+
* </ul>
90+
*/
6991
@Override
70-
public boolean canDrop(RangeExprEvaluator<C> evaluator) {
92+
public RowsMatch matches(RangeExprEvaluator<C> evaluator) {
93+
RowsMatch resultMatch = RowsMatch.NONE;
7194
for (LogicalExpression child : this) {
72-
// "or" : as long as one branch is NOT ok to drop, we can NOT drop it.
73-
if (!(child instanceof ParquetFilterPredicate) || !((ParquetFilterPredicate)child).canDrop(evaluator)) {
74-
return false;
95+
if (child instanceof ParquetFilterPredicate) {
96+
switch (((ParquetFilterPredicate) child).matches(evaluator)) {
97+
case ALL:
98+
return RowsMatch.ALL; // One at least is ALL => can drop filter but not RG
99+
case SOME:
100+
resultMatch = RowsMatch.SOME;
101+
default: // Do nothing
102+
}
75103
}
76104
}
77-
return true;
105+
return resultMatch;
78106
}
79107
};
80108
}

exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetComparisonPredicate.java

Lines changed: 43 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,9 @@
2626
import java.util.ArrayList;
2727
import java.util.Iterator;
2828
import java.util.List;
29-
import java.util.function.BiPredicate;
29+
import java.util.function.BiFunction;
3030

31+
import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.hasNoNulls;
3132
import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isNullOrEmpty;
3233
import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isAllNulls;
3334

@@ -38,12 +39,13 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical
3839
implements ParquetFilterPredicate<C> {
3940
private final LogicalExpression left;
4041
private final LogicalExpression right;
41-
private final BiPredicate<Statistics<C>, Statistics<C>> predicate;
42+
43+
private final BiFunction<Statistics<C>, Statistics<C>, RowsMatch> predicate;
4244

4345
private ParquetComparisonPredicate(
4446
LogicalExpression left,
4547
LogicalExpression right,
46-
BiPredicate<Statistics<C>, Statistics<C>> predicate
48+
BiFunction<Statistics<C>, Statistics<C>, RowsMatch> predicate
4749
) {
4850
super(left.getPosition());
4951
this.left = left;
@@ -65,7 +67,7 @@ public <T, V, E extends Exception> T accept(ExprVisitor<T, V, E> visitor, V valu
6567
}
6668

6769
/**
68-
* Semantics of canDrop() is very similar to what is implemented in Parquet library's
70+
* Semantics of matches() is very similar to what is implemented in Parquet library's
6971
* {@link org.apache.parquet.filter2.statisticslevel.StatisticsFilter} and
7072
* {@link org.apache.parquet.filter2.predicate.FilterPredicate}
7173
*
@@ -83,23 +85,29 @@ public <T, V, E extends Exception> T accept(ExprVisitor<T, V, E> visitor, V valu
8385
* where Column1 and Column2 are from same parquet table.
8486
*/
8587
@Override
86-
public boolean canDrop(RangeExprEvaluator<C> evaluator) {
88+
public RowsMatch matches(RangeExprEvaluator<C> evaluator) {
8789
Statistics<C> leftStat = left.accept(evaluator, null);
8890
if (isNullOrEmpty(leftStat)) {
89-
return false;
91+
return RowsMatch.SOME;
9092
}
91-
9293
Statistics<C> rightStat = right.accept(evaluator, null);
9394
if (isNullOrEmpty(rightStat)) {
94-
return false;
95+
return RowsMatch.SOME;
9596
}
96-
97-
// if either side is ALL null, = is evaluated to UNKNOWN -> canDrop
9897
if (isAllNulls(leftStat, evaluator.getRowCount()) || isAllNulls(rightStat, evaluator.getRowCount())) {
99-
return true;
98+
return RowsMatch.NONE;
99+
}
100+
if (!leftStat.hasNonNullValue() || !rightStat.hasNonNullValue()) {
101+
return RowsMatch.SOME;
100102
}
103+
return predicate.apply(leftStat, rightStat);
104+
}
101105

102-
return (leftStat.hasNonNullValue() && rightStat.hasNonNullValue()) && predicate.test(leftStat, rightStat);
106+
/**
107+
* If one rowgroup contains some null values, change the RowsMatch.ALL into RowsMatch.SOME (null values should be discarded by filter)
108+
*/
109+
private static RowsMatch checkNull(Statistics leftStat, Statistics rightStat) {
110+
return !hasNoNulls(leftStat) || !hasNoNulls(rightStat) ? RowsMatch.SOME : RowsMatch.ALL;
103111
}
104112

105113
/**
@@ -109,12 +117,9 @@ private static <C extends Comparable<C>> LogicalExpression createEqualPredicate(
109117
LogicalExpression left,
110118
LogicalExpression right
111119
) {
112-
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
113-
// can drop when left's max < right's min, or right's max < left's min
114-
final C leftMin = leftStat.genericGetMin();
115-
final C rightMin = rightStat.genericGetMin();
116-
return (leftStat.compareMaxToValue(rightMin) < 0) || (rightStat.compareMaxToValue(leftMin) < 0);
117-
}) {
120+
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) ->
121+
leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0 || rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0 ? RowsMatch.NONE : RowsMatch.SOME
122+
) {
118123
@Override
119124
public String toString() {
120125
return left + " = " + right;
@@ -130,9 +135,10 @@ private static <C extends Comparable<C>> LogicalExpression createGTPredicate(
130135
LogicalExpression right
131136
) {
132137
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
133-
// can drop when left's max <= right's min.
134-
final C rightMin = rightStat.genericGetMin();
135-
return leftStat.compareMaxToValue(rightMin) <= 0;
138+
if (leftStat.compareMaxToValue(rightStat.genericGetMin()) <= 0) {
139+
return RowsMatch.NONE;
140+
}
141+
return leftStat.compareMinToValue(rightStat.genericGetMax()) > 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME;
136142
});
137143
}
138144

@@ -144,9 +150,10 @@ private static <C extends Comparable<C>> LogicalExpression createGEPredicate(
144150
LogicalExpression right
145151
) {
146152
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
147-
// can drop when left's max < right's min.
148-
final C rightMin = rightStat.genericGetMin();
149-
return leftStat.compareMaxToValue(rightMin) < 0;
153+
if (leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0) {
154+
return RowsMatch.NONE;
155+
}
156+
return leftStat.compareMinToValue(rightStat.genericGetMax()) >= 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME;
150157
});
151158
}
152159

@@ -158,9 +165,10 @@ private static <C extends Comparable<C>> LogicalExpression createLTPredicate(
158165
LogicalExpression right
159166
) {
160167
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
161-
// can drop when right's max <= left's min.
162-
final C leftMin = leftStat.genericGetMin();
163-
return rightStat.compareMaxToValue(leftMin) <= 0;
168+
if (rightStat.compareMaxToValue(leftStat.genericGetMin()) <= 0) {
169+
return RowsMatch.NONE;
170+
}
171+
return leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME;
164172
});
165173
}
166174

@@ -171,9 +179,10 @@ private static <C extends Comparable<C>> LogicalExpression createLEPredicate(
171179
LogicalExpression left, LogicalExpression right
172180
) {
173181
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
174-
// can drop when right's max < left's min.
175-
final C leftMin = leftStat.genericGetMin();
176-
return rightStat.compareMaxToValue(leftMin) < 0;
182+
if (rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0) {
183+
return RowsMatch.NONE;
184+
}
185+
return leftStat.compareMaxToValue(rightStat.genericGetMin()) <= 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME;
177186
});
178187
}
179188

@@ -185,11 +194,10 @@ private static <C extends Comparable<C>> LogicalExpression createNEPredicate(
185194
LogicalExpression right
186195
) {
187196
return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
188-
// can drop when there is only one unique value.
189-
final C leftMax = leftStat.genericGetMax();
190-
final C rightMax = rightStat.genericGetMax();
191-
return leftStat.compareMinToValue(leftMax) == 0 && rightStat.compareMinToValue(rightMax) == 0 &&
192-
leftStat.compareMaxToValue(rightMax) == 0;
197+
if (leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0 || rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0) {
198+
return checkNull(leftStat, rightStat);
199+
}
200+
return leftStat.compareMaxToValue(rightStat.genericGetMax()) == 0 && leftStat.compareMinToValue(rightStat.genericGetMin()) == 0 ? RowsMatch.NONE : RowsMatch.SOME;
193201
});
194202
}
195203

exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetFilterPredicate.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,16 @@
1818
package org.apache.drill.exec.expr.stat;
1919

2020
public interface ParquetFilterPredicate<T extends Comparable<T>> {
21-
boolean canDrop(RangeExprEvaluator<T> evaluator);
21+
22+
/**
23+
* Define the validity of a row group against a filter
24+
* <ul>
25+
* <li>ALL : all rows match the filter (can not drop the row group and can prune the filter)
26+
* <li>NONE : no row matches the filter (can drop the row group)
27+
* <li>SOME : some rows only match the filter or the filter can not be applied (can not drop the row group nor the filter)
28+
* </ul>
29+
*/
30+
enum RowsMatch {ALL, NONE, SOME}
31+
32+
RowsMatch matches(RangeExprEvaluator<T> evaluator);
2233
}

exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java

Lines changed: 35 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
import org.apache.drill.common.expression.LogicalExpression;
2121
import org.apache.drill.common.expression.LogicalExpressionBase;
22-
import org.apache.drill.common.expression.SchemaPath;
2322
import org.apache.drill.common.expression.TypedFieldExpr;
2423
import org.apache.drill.common.expression.visitors.ExprVisitor;
2524
import org.apache.drill.exec.expr.fn.FunctionGenerationHelper;
@@ -29,7 +28,7 @@
2928
import java.util.ArrayList;
3029
import java.util.Iterator;
3130
import java.util.List;
32-
import java.util.function.BiPredicate;
31+
import java.util.function.BiFunction;
3332

3433
import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.hasNoNulls;
3534
import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isAllNulls;
@@ -42,9 +41,10 @@ public class ParquetIsPredicate<C extends Comparable<C>> extends LogicalExpressi
4241
implements ParquetFilterPredicate<C> {
4342

4443
private final LogicalExpression expr;
45-
private final BiPredicate<Statistics<C>, RangeExprEvaluator<C>> predicate;
4644

47-
private ParquetIsPredicate(LogicalExpression expr, BiPredicate<Statistics<C>, RangeExprEvaluator<C>> predicate) {
45+
private final BiFunction<Statistics<C>, RangeExprEvaluator<C>, RowsMatch> predicate;
46+
47+
private ParquetIsPredicate(LogicalExpression expr, BiFunction<Statistics<C>, RangeExprEvaluator<C>, RowsMatch> predicate) {
4848
super(expr.getPosition());
4949
this.expr = expr;
5050
this.predicate = predicate;
@@ -62,50 +62,51 @@ public <T, V, E extends Exception> T accept(ExprVisitor<T, V, E> visitor, V valu
6262
return visitor.visitUnknown(this, value);
6363
}
6464

65-
@Override
66-
public boolean canDrop(RangeExprEvaluator<C> evaluator) {
65+
/**
66+
* Apply the filter condition against the meta of the rowgroup.
67+
*/
68+
public RowsMatch matches(RangeExprEvaluator<C> evaluator) {
6769
Statistics<C> exprStat = expr.accept(evaluator, null);
68-
if (isNullOrEmpty(exprStat)) {
69-
return false;
70-
}
70+
return isNullOrEmpty(exprStat) ? RowsMatch.SOME : predicate.apply(exprStat, evaluator);
71+
}
7172

72-
return predicate.test(exprStat, evaluator);
73+
/**
74+
* After the applying of the filter against the statistics of the rowgroup, if the result is RowsMatch.ALL,
75+
* then we still must know if the rowgroup contains some null values, because it can change the filter result.
76+
* If it contains some null values, then we change the RowsMatch.ALL into RowsMatch.SOME, which sya that maybe
77+
* some values (the null ones) should be disgarded.
78+
*/
79+
private static RowsMatch checkNull(Statistics exprStat) {
80+
return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
7381
}
7482

7583
/**
7684
* IS NULL predicate.
7785
*/
7886
private static <C extends Comparable<C>> LogicalExpression createIsNullPredicate(LogicalExpression expr) {
7987
return new ParquetIsPredicate<C>(expr,
80-
//if there are no nulls -> canDrop
81-
(exprStat, evaluator) -> hasNoNulls(exprStat)) {
82-
private final boolean isArray = isArray(expr);
83-
84-
private boolean isArray(LogicalExpression expression) {
85-
if (expression instanceof TypedFieldExpr) {
86-
TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
87-
SchemaPath schemaPath = typedFieldExpr.getPath();
88-
return schemaPath.isArray();
89-
}
90-
return false;
91-
}
92-
93-
@Override
94-
public boolean canDrop(RangeExprEvaluator<C> evaluator) {
88+
(exprStat, evaluator) -> {
9589
// for arrays we are not able to define exact number of nulls
9690
// [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null but statistics shows no nulls
97-
return !isArray && super.canDrop(evaluator);
98-
}
99-
};
91+
if (expr instanceof TypedFieldExpr) {
92+
TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
93+
if (typedFieldExpr.getPath().isArray()) {
94+
return RowsMatch.SOME;
95+
}
96+
}
97+
if (hasNoNulls(exprStat)) {
98+
return RowsMatch.NONE;
99+
}
100+
return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : RowsMatch.SOME;
101+
});
100102
}
101103

102104
/**
103105
* IS NOT NULL predicate.
104106
*/
105107
private static <C extends Comparable<C>> LogicalExpression createIsNotNullPredicate(LogicalExpression expr) {
106108
return new ParquetIsPredicate<C>(expr,
107-
//if there are all nulls -> canDrop
108-
(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
109+
(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : checkNull(exprStat)
109110
);
110111
}
111112

@@ -114,8 +115,7 @@ private static <C extends Comparable<C>> LogicalExpression createIsNotNullPredic
114115
*/
115116
private static LogicalExpression createIsTruePredicate(LogicalExpression expr) {
116117
return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) ->
117-
//if max value is not true or if there are all nulls -> canDrop
118-
isAllNulls(exprStat, evaluator.getRowCount()) || exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
118+
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax() || isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : checkNull(exprStat)
119119
);
120120
}
121121

@@ -124,8 +124,7 @@ private static LogicalExpression createIsTruePredicate(LogicalExpression expr) {
124124
*/
125125
private static LogicalExpression createIsFalsePredicate(LogicalExpression expr) {
126126
return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) ->
127-
//if min value is not false or if there are all nulls -> canDrop
128-
isAllNulls(exprStat, evaluator.getRowCount()) || exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
127+
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() || isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : checkNull(exprStat)
129128
);
130129
}
131130

@@ -134,8 +133,7 @@ private static LogicalExpression createIsFalsePredicate(LogicalExpression expr)
134133
*/
135134
private static LogicalExpression createIsNotTruePredicate(LogicalExpression expr) {
136135
return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) ->
137-
//if min value is not false or if there are no nulls -> canDrop
138-
hasNoNulls(exprStat) && exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
136+
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() && hasNoNulls(exprStat) ? RowsMatch.NONE : checkNull(exprStat)
139137
);
140138
}
141139

@@ -144,8 +142,7 @@ private static LogicalExpression createIsNotTruePredicate(LogicalExpression expr
144142
*/
145143
private static LogicalExpression createIsNotFalsePredicate(LogicalExpression expr) {
146144
return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) ->
147-
//if max value is not true or if there are no nulls -> canDrop
148-
hasNoNulls(exprStat) && exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
145+
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax() && hasNoNulls(exprStat) ? RowsMatch.NONE : checkNull(exprStat)
149146
);
150147
}
151148

0 commit comments

Comments
 (0)