-
Notifications
You must be signed in to change notification settings - Fork 87
Add bounded unique count aggregation #781
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
dac9f91
d5746da
03a3a3b
fd27c7a
0ca15cd
5f1ff09
996d050
941a060
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -25,6 +25,7 @@ import com.yahoo.sketches.kll.KllFloatsSketch | |||||
| import com.yahoo.sketches.{ArrayOfDoublesSerDe, ArrayOfItemsSerDe, ArrayOfLongsSerDe, ArrayOfStringsSerDe} | ||||||
|
|
||||||
| import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream} | ||||||
| import java.security.MessageDigest | ||||||
| import java.util | ||||||
| import scala.collection.mutable | ||||||
| import scala.jdk.CollectionConverters._ | ||||||
|
|
@@ -599,6 +600,59 @@ class ApproxHistogram[T: FrequentItemsFriendly](mapSize: Int, errorType: ErrorTy | |||||
| } | ||||||
| } | ||||||
|
|
||||||
| class BoundedUniqueCount[T](inputType: DataType, k: Int = 8) extends SimpleAggregator[T, util.Set[String], Long] { | ||||||
| private def toBytes(input: T): Array[Byte] = { | ||||||
| val bos = new ByteArrayOutputStream() | ||||||
| val out = new ObjectOutputStream(bos) | ||||||
| out.writeObject(input) | ||||||
| out.flush() | ||||||
| bos.toByteArray | ||||||
| } | ||||||
|
|
||||||
| private def md5Hex(bytes: Array[Byte]): String = | ||||||
| MessageDigest.getInstance("MD5").digest(bytes).map("%02x".format(_)).mkString | ||||||
|
|
||||||
| private def hashInput(input: T): String = | ||||||
| md5Hex(toBytes(input)) | ||||||
|
|
||||||
| override def prepare(input: T): util.Set[String] = { | ||||||
| val result = new util.HashSet[String](k) | ||||||
| result.add(hashInput(input)) | ||||||
| result | ||||||
| } | ||||||
|
|
||||||
| override def update(ir: util.Set[String], input: T): util.Set[String] = { | ||||||
| if (ir.size() >= k) { | ||||||
| return ir | ||||||
|
||||||
| return ir | |
| if(ir == Constants.SentinelSet || ir.size() >= k) return Constants.SentinelSet |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, I don't think we have a sentinel set in the OSS branch yet.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| ir2.asScala.foreach(v => | |
| ir2.iterator().asScala.foreach(v => |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Otherwise, it will create intermediate collections.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good call out!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| override def finalize(ir: util.Set[String]): Long = ir.size() | |
| override def finalize(ir: util.Set[String]): Long = if(ir == Constants.SentinelSet) k else ir.size() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
| package ai.chronon.aggregator.test | ||
|
|
||
| import ai.chronon.aggregator.base.BoundedUniqueCount | ||
| import ai.chronon.api.StringType | ||
| import junit.framework.TestCase | ||
| import org.junit.Assert._ | ||
|
|
||
| import java.util | ||
| import scala.jdk.CollectionConverters._ | ||
|
|
||
| class BoundedUniqueCountTest extends TestCase { | ||
| def testHappyCase(): Unit = { | ||
| val boundedDistinctCount = new BoundedUniqueCount[String](StringType, 5) | ||
| var ir = boundedDistinctCount.prepare("1") | ||
| ir = boundedDistinctCount.update(ir, "1") | ||
| ir = boundedDistinctCount.update(ir, "2") | ||
|
|
||
| val result = boundedDistinctCount.finalize(ir) | ||
| assertEquals(2, result) | ||
| } | ||
|
|
||
| def testExceedSize(): Unit = { | ||
| val boundedDistinctCount = new BoundedUniqueCount[String](StringType, 5) | ||
| var ir = boundedDistinctCount.prepare("1") | ||
| ir = boundedDistinctCount.update(ir, "2") | ||
| ir = boundedDistinctCount.update(ir, "3") | ||
| ir = boundedDistinctCount.update(ir, "4") | ||
| ir = boundedDistinctCount.update(ir, "5") | ||
| ir = boundedDistinctCount.update(ir, "6") | ||
| ir = boundedDistinctCount.update(ir, "7") | ||
|
|
||
| val result = boundedDistinctCount.finalize(ir) | ||
| assertEquals(5, result) | ||
| } | ||
|
|
||
| def testMerge(): Unit = { | ||
| val boundedDistinctCount = new BoundedUniqueCount[String](StringType, 5) | ||
| val ir1 = new util.HashSet[String](Seq("1", "2", "3").asJava) | ||
| val ir2 = new util.HashSet[String](Seq("4", "5", "6").asJava) | ||
|
|
||
| val merged = boundedDistinctCount.merge(ir1, ir2) | ||
| assertEquals(merged.size(), 5) | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's say I want to unique-count a bunch of user/merchant IDs (long values) - won't this be less efficient than simply keeping a set of longs?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Made the code change to keep numeric types as-is.