-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-2554][SQL] CountDistinct partial aggregation and object allocation improvements #1935
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
213ada8
bd08239
050bb97
41fbd1d
d494598
9153652
38c7449
f31b8ad
c1f7114
b3d0f64
57ae3b1
27984d0
abee26d
87d101d
58d15f1
8ff6402
2b46c4b
c9e67de
3868f6c
db44a30
93d0f64
fdca896
fae38f4
b2e8ef3
c122cca
32d216f
8074a80
5c7848d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -85,3 +85,21 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet | |
|
|
||
| override def eval(input: Row): Any = i2(input, left, right, _.rem(_, _)) | ||
| } | ||
|
|
||
| case class MaxOf(left: Expression, right: Expression) extends Expression { | ||
| type EvaluatedType = Any | ||
|
|
||
| override def nullable = left.nullable && right.nullable | ||
|
|
||
| override def children = left :: right :: Nil | ||
|
|
||
| override def references = (left.flatMap(_.references) ++ right.flatMap(_.references)).toSet | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should be |
||
|
|
||
| override def dataType = left.dataType | ||
|
|
||
| override def eval(input: Row): Any = { | ||
| val leftEval = left.eval(input) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Code is missing.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what do u mean?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The code was missing, I just added it.
|
||
| } | ||
|
|
||
| override def toString = s"MaxOf($left, $right)" | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,9 @@ import org.apache.spark.sql.catalyst.expressions | |
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.types._ | ||
|
|
||
| class IntegerHashSet extends org.apache.spark.util.collection.OpenHashSet[Int] | ||
| class LongHashSet extends org.apache.spark.util.collection.OpenHashSet[Long] | ||
|
|
||
| /** | ||
| * A base class for generators of byte code to perform expression evaluation. Includes a set of | ||
| * helpers for referring to Catalyst types and building trees that perform evaluation of individual | ||
|
|
@@ -71,7 +74,8 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| * From the Guava Docs: A Cache is similar to ConcurrentMap, but not quite the same. The most | ||
| * fundamental difference is that a ConcurrentMap persists all elements that are added to it until | ||
| * they are explicitly removed. A Cache on the other hand is generally configured to evict entries | ||
| * automatically, in order to constrain its memory footprint | ||
| * automatically, in order to constrain its memory footprint. Note that this cache does not use | ||
| * weak keys/values and thus does not respond to memory pressure. | ||
| */ | ||
| protected val cache = CacheBuilder.newBuilder() | ||
| .maximumSize(1000) | ||
|
|
@@ -398,6 +402,75 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| $primitiveTerm = ${falseEval.primitiveTerm} | ||
| } | ||
| """.children | ||
|
|
||
| case NewSet(elementType) => | ||
| q""" | ||
| val $nullTerm = false | ||
| val $primitiveTerm = new ${hashSetForType(elementType)}() | ||
| """.children | ||
|
|
||
| case AddItemToSet(item, set) => | ||
| val itemEval = expressionEvaluator(item) | ||
| val setEval = expressionEvaluator(set) | ||
|
|
||
| val ArrayType(elementType, _) = set.dataType | ||
|
|
||
| itemEval.code ++ setEval.code ++ | ||
| q""" | ||
| if (!${itemEval.nullTerm}) { | ||
| ${setEval.primitiveTerm} | ||
| .asInstanceOf[${hashSetForType(elementType)}] | ||
| .add(${itemEval.primitiveTerm}) | ||
| } | ||
|
|
||
| val $nullTerm = false | ||
| val $primitiveTerm = ${setEval.primitiveTerm} | ||
| """.children | ||
|
|
||
| case CombineSets(left, right) => | ||
| val leftEval = expressionEvaluator(left) | ||
| val rightEval = expressionEvaluator(right) | ||
|
|
||
| val ArrayType(elementType, _) = left.dataType | ||
|
|
||
| leftEval.code ++ rightEval.code ++ | ||
| q""" | ||
| val leftSet = ${leftEval.primitiveTerm}.asInstanceOf[${hashSetForType(elementType)}] | ||
| val rightSet = ${rightEval.primitiveTerm}.asInstanceOf[${hashSetForType(elementType)}] | ||
| val iterator = rightSet.iterator | ||
| while (iterator.hasNext) { | ||
| leftSet.add(iterator.next()) | ||
| } | ||
|
|
||
| val $nullTerm = false | ||
| val $primitiveTerm = leftSet | ||
| """.children | ||
|
|
||
| case MaxOf(e1, e2) => | ||
| val eval1 = expressionEvaluator(e1) | ||
| val eval2 = expressionEvaluator(e2) | ||
|
|
||
| eval1.code ++ eval2.code ++ | ||
| q""" | ||
| var $nullTerm = false | ||
| var $primitiveTerm: ${termForType(e1.dataType)} = ${defaultPrimitive(e1.dataType)} | ||
|
|
||
| if (${eval1.nullTerm}) { | ||
| $nullTerm = ${eval2.nullTerm} | ||
| $primitiveTerm = ${eval2.primitiveTerm} | ||
| } else if (${eval2.nullTerm}) { | ||
| $nullTerm = ${eval1.nullTerm} | ||
| $primitiveTerm = ${eval1.primitiveTerm} | ||
| } else { | ||
| $nullTerm = false | ||
| if (${eval1.primitiveTerm} > ${eval2.primitiveTerm}) { | ||
| $primitiveTerm = ${eval1.primitiveTerm} | ||
| } else { | ||
| $primitiveTerm = ${eval2.primitiveTerm} | ||
| } | ||
| } | ||
| """.children | ||
|
|
||
| } | ||
|
|
||
| // If there was no match in the partial function above, we fall back on calling the interpreted | ||
|
|
@@ -437,6 +510,11 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin | |
| protected def accessorForType(dt: DataType) = newTermName(s"get${primitiveForType(dt)}") | ||
| protected def mutatorForType(dt: DataType) = newTermName(s"set${primitiveForType(dt)}") | ||
|
|
||
| protected def hashSetForType(dt: DataType) = dt match { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this throw an exception for count distinct of non int/longs?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i guess it is disabled in the higher level how about adding a default case to throw an exception so it shows that you've controlled this earlier?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is correct, the planner will not use code generation unless the type of the count distinct is a single int or long value. Good idea though, I've added the check. |
||
| case IntegerType => typeOf[IntegerHashSet] | ||
| case LongType => typeOf[LongHashSet] | ||
| } | ||
|
|
||
| protected def primitiveForType(dt: DataType) = dt match { | ||
| case IntegerType => "Int" | ||
| case LongType => "Long" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
does this support null?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure, we will never put null into it though (we always put rows in, and furthermore count distinct semantics don't count null).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe add a line there explaining that we never put null into it. I think the open hash set doesn't support null.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually I think most HashSets don't support `null`. `scala.collection.mutable.HashSet` throws an exception if you try to add `null`.