Merged
Changes from 4 commits
@@ -25,6 +25,7 @@ import com.yahoo.sketches.kll.KllFloatsSketch
import com.yahoo.sketches.{ArrayOfDoublesSerDe, ArrayOfItemsSerDe, ArrayOfLongsSerDe, ArrayOfStringsSerDe}

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import java.security.MessageDigest
import java.util
import scala.collection.mutable
import scala.jdk.CollectionConverters._
@@ -599,6 +600,59 @@ class ApproxHistogram[T: FrequentItemsFriendly](mapSize: Int, errorType: ErrorType
}
}

class BoundedUniqueCount[T](inputType: DataType, k: Int = 8) extends SimpleAggregator[T, util.Set[String], Long] {
private def toBytes(input: T): Array[Byte] = {
val bos = new ByteArrayOutputStream()
val out = new ObjectOutputStream(bos)
out.writeObject(input)
out.flush()
bos.toByteArray
}

private def md5Hex(bytes: Array[Byte]): String =
MessageDigest.getInstance("MD5").digest(bytes).map("%02x".format(_)).mkString
Collaborator: Let's say I want to unique-count a bunch of user / merchant ids (long values) - won't this be less efficient than simply keeping the set of longs?

Collaborator: Made the code change to keep the numeric type as is.

(A sketch of what that variant could look like follows the class below.)

private def hashInput(input: T): String =
md5Hex(toBytes(input))

override def prepare(input: T): util.Set[String] = {
val result = new util.HashSet[String](k)
result.add(hashInput(input))
result
}

override def update(ir: util.Set[String], input: T): util.Set[String] = {
if (ir.size() >= k) {
return ir
Collaborator: memory optimization: we can use a sentinel set when k is reached.

Suggested change:
-      return ir
+      if (ir == Constants.SentinelSet || ir.size() >= k) return Constants.SentinelSet

Collaborator: Hm.. I don't think we have sentinel set yet in OSS branch.

}

ir.add(hashInput(input))
ir
}

override def outputType: DataType = LongType

override def irType: DataType = ListType(StringType)

override def merge(ir1: util.Set[String], ir2: util.Set[String]): util.Set[String] = {
ir2.asScala.foreach(v =>
Collaborator (suggested change):
-    ir2.asScala.foreach(v =>
+    ir2.iterator().asScala.foreach(v =>

Collaborator: otherwise it will create intermediate collections

Collaborator: Good call out!

Collaborator: done

if (ir1.size() < k) {
ir1.add(v)
})

ir1
}

override def finalize(ir: util.Set[String]): Long = ir.size()
Collaborator (suggested change):
-  override def finalize(ir: util.Set[String]): Long = ir.size()
+  override def finalize(ir: util.Set[String]): Long = if (ir == Constants.SentinelSet) k else ir.size()


override def clone(ir: util.Set[String]): util.Set[String] = new util.HashSet[String](ir)

override def normalize(ir: util.Set[String]): Any = new util.ArrayList[String](ir)

override def denormalize(ir: Any): util.Set[String] =
new util.HashSet[String](ir.asInstanceOf[util.ArrayList[String]])
}
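The two review threads above suggest (a) keeping numeric inputs as-is rather than MD5-hashing them, and (b) iterating without building intermediate collections in merge. Below is a minimal sketch of what such a variant could look like. It is not the code merged in this PR: the class name BoundedUniqueCountRaw is hypothetical, and it assumes the SimpleAggregator contract is exactly the set of methods BoundedUniqueCount implements above.

```scala
// Hypothetical sketch only (not part of this PR): stores raw input values of type T
// in the IR set instead of MD5 hex strings, so counting Long ids keeps a set of longs.
class BoundedUniqueCountRaw[T](inputType: DataType, k: Int = 8)
    extends SimpleAggregator[T, util.Set[T], Long] {

  override def prepare(input: T): util.Set[T] = {
    val result = new util.HashSet[T](k)
    result.add(input)
    result
  }

  override def update(ir: util.Set[T], input: T): util.Set[T] = {
    // The set stops growing once the bound is reached; the count saturates at k.
    if (ir.size() < k) ir.add(input)
    ir
  }

  override def merge(ir1: util.Set[T], ir2: util.Set[T]): util.Set[T] = {
    // Walk the Java iterator directly so no intermediate Scala collection is built,
    // and stop early once ir1 has reached the bound.
    val it = ir2.iterator()
    while (it.hasNext && ir1.size() < k) {
      ir1.add(it.next())
    }
    ir1
  }

  override def finalize(ir: util.Set[T]): Long = ir.size()

  override def clone(ir: util.Set[T]): util.Set[T] = new util.HashSet[T](ir)

  override def outputType: DataType = LongType

  // Assumes the IR can be encoded as a list of the raw input type.
  override def irType: DataType = ListType(inputType)

  override def normalize(ir: util.Set[T]): Any = new util.ArrayList[T](ir)

  override def denormalize(ir: Any): util.Set[T] =
    new util.HashSet[T](ir.asInstanceOf[util.ArrayList[T]])
}
```

With this shape, a bounded count over Long ids holds at most k boxed longs per window instead of k 32-character hex strings, at the cost of the IR schema depending on the input type.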

// Based on CPC sketch (a faster, smaller and more accurate version of HLL)
// See: Back to the future: an even more nearly optimal cardinality estimation algorithm, 2017
// https://arxiv.org/abs/1708.06839
@@ -307,7 +307,19 @@ object ColumnAggregator {
case BinaryType => simple(new ApproxDistinctCount[Array[Byte]](aggregationPart.getInt("k", Some(8))))
case _ => mismatchException
}
case Operation.BOUNDED_UNIQUE_COUNT =>
val k = aggregationPart.getInt("k", Some(8))

inputType match {
case IntType => simple(new BoundedUniqueCount[Int](inputType, k))
case LongType => simple(new BoundedUniqueCount[Long](inputType, k))
case ShortType => simple(new BoundedUniqueCount[Short](inputType, k))
case DoubleType => simple(new BoundedUniqueCount[Double](inputType, k))
case FloatType => simple(new BoundedUniqueCount[Float](inputType, k))
case StringType => simple(new BoundedUniqueCount[String](inputType, k))
case BinaryType => simple(new BoundedUniqueCount[Array[Byte]](inputType, k))
case _ => mismatchException
}
case Operation.APPROX_PERCENTILE =>
val k = aggregationPart.getInt("k", Some(128))
val mapper = new ObjectMapper()
@@ -0,0 +1,44 @@
package ai.chronon.aggregator.test

import ai.chronon.aggregator.base.BoundedUniqueCount
import ai.chronon.api.StringType
import junit.framework.TestCase
import org.junit.Assert._

import java.util
import scala.jdk.CollectionConverters._

class BoundedUniqueCountTest extends TestCase {
def testHappyCase(): Unit = {
val boundedDistinctCount = new BoundedUniqueCount[String](StringType, 5)
var ir = boundedDistinctCount.prepare("1")
ir = boundedDistinctCount.update(ir, "1")
ir = boundedDistinctCount.update(ir, "2")

val result = boundedDistinctCount.finalize(ir)
assertEquals(2, result)
}

def testExceedSize(): Unit = {
val boundedDistinctCount = new BoundedUniqueCount[String](StringType, 5)
var ir = boundedDistinctCount.prepare("1")
ir = boundedDistinctCount.update(ir, "2")
ir = boundedDistinctCount.update(ir, "3")
ir = boundedDistinctCount.update(ir, "4")
ir = boundedDistinctCount.update(ir, "5")
ir = boundedDistinctCount.update(ir, "6")
ir = boundedDistinctCount.update(ir, "7")

val result = boundedDistinctCount.finalize(ir)
assertEquals(5, result)
}

def testMerge(): Unit = {
val boundedDistinctCount = new BoundedUniqueCount[String](StringType, 5)
val ir1 = new util.HashSet[String](Seq("1", "2", "3").asJava)
val ir2 = new util.HashSet[String](Seq("4", "5", "6").asJava)

val merged = boundedDistinctCount.merge(ir1, ir2)
assertEquals(merged.size(), 5)
}
}
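The tests above cover prepare/update/merge/finalize but not the normalize/denormalize pair, which presumably runs whenever the IR is persisted. A minimal sketch of such a test, under the same harness as above (the test name is hypothetical):

```scala
def testNormalizeRoundTrip(): Unit = {
  val boundedDistinctCount = new BoundedUniqueCount[String](StringType, 5)
  var ir = boundedDistinctCount.prepare("1")
  ir = boundedDistinctCount.update(ir, "2")

  // normalize returns a util.ArrayList[String]; denormalize should rebuild an equivalent set.
  val restored = boundedDistinctCount.denormalize(boundedDistinctCount.normalize(ir))
  assertEquals(2, boundedDistinctCount.finalize(restored))
}
```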
1 change: 1 addition & 0 deletions api/py/ai/chronon/group_by.py
@@ -65,6 +65,7 @@ class Operation:
# https://github.com/apache/incubator-datasketches-java/blob/master/src/main/java/org/apache/datasketches/cpc/CpcSketch.java#L180
APPROX_UNIQUE_COUNT_LGK = collector(ttypes.Operation.APPROX_UNIQUE_COUNT)
UNIQUE_COUNT = ttypes.Operation.UNIQUE_COUNT
BOUNDED_UNIQUE_COUNT_K = collector(ttypes.Operation.BOUNDED_UNIQUE_COUNT)
COUNT = ttypes.Operation.COUNT
SUM = ttypes.Operation.SUM
AVERAGE = ttypes.Operation.AVERAGE
3 changes: 2 additions & 1 deletion api/thrift/api.thrift
@@ -161,7 +161,8 @@ enum Operation {
BOTTOM_K = 16

HISTOGRAM = 17, // use this only if you know the set of inputs is bounded
APPROX_HISTOGRAM_K = 18
APPROX_HISTOGRAM_K = 18,
BOUNDED_UNIQUE_COUNT = 19
}

// integers map to milliseconds in the timeunit
1 change: 1 addition & 0 deletions docs/source/authoring_features/GroupBy.md
@@ -147,6 +147,7 @@ Limitations:
| approx_unique_count | primitive types | list, map | long | no | k=8 | yes |
| approx_percentile | primitive types | list, map | list<input,> | no | k=128, percentiles | yes |
| unique_count | primitive types | list, map | long | no | | no |
| bounded_unique_count | primitive types | list, map | long | no | k=inf | yes |
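For reference, a GroupBy aggregation matching this row can be declared as sketched below; it mirrors the pattern used in the FetcherTest changes later in this diff, and the column name is illustrative.

```scala
// Assumes the usual Chronon API imports (Builders, Operation, Window, TimeUnit)
// as used in FetcherTest below; "user" is an illustrative input column.
val boundedUniqueUsers = Builders.Aggregation(
  operation = Operation.BOUNDED_UNIQUE_COUNT,
  inputColumn = "user",
  argMap = Map("k" -> "5"),              // bound on distinct values tracked per window
  windows = Seq(new Window(2, TimeUnit.DAYS))
)
```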


## Accuracy
12 changes: 10 additions & 2 deletions spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala
@@ -273,8 +273,11 @@ class FetcherTest extends TestCase {
Builders.Aggregation(operation = Operation.LAST_K,
argMap = Map("k" -> "300"),
inputColumn = "user",
windows = Seq(new Window(2, TimeUnit.DAYS), new Window(30, TimeUnit.DAYS)))
),
windows = Seq(new Window(2, TimeUnit.DAYS), new Window(30, TimeUnit.DAYS))),
Builders.Aggregation(operation = Operation.BOUNDED_UNIQUE_COUNT,
argMap = Map("k" -> "5"),
inputColumn = "user",
windows = Seq(new Window(2, TimeUnit.DAYS), new Window(30, TimeUnit.DAYS)))),
metaData = Builders.MetaData(name = "unit_test/vendor_ratings", namespace = namespace),
accuracy = Accuracy.SNAPSHOT
)
@@ -503,6 +506,11 @@
operation = Operation.APPROX_HISTOGRAM_K,
inputColumn = "rating",
windows = Seq(new Window(1, TimeUnit.DAYS))
),
Builders.Aggregation(
operation = Operation.BOUNDED_UNIQUE_COUNT,
inputColumn = "rating",
windows = Seq(new Window(1, TimeUnit.DAYS))
)
),
accuracy = Accuracy.TEMPORAL,
43 changes: 43 additions & 0 deletions spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala
@@ -724,4 +724,47 @@ class GroupByTest {
assert(count > 0, s"Found a count value that is not greater than zero: $count")
}
}

@Test
def testBoundedUniqueCounts(): Unit = {
lazy val spark: SparkSession = SparkSessionBuilder.build("GroupByTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
val (source, endPartition) = createTestSource(suffix = "_bounded_counts")
val tableUtils = TableUtils(spark)
val namespace = "test_bounded_counts"
val aggs = Seq(
Builders.Aggregation(
operation = Operation.BOUNDED_UNIQUE_COUNT,
inputColumn = "item",
windows = Seq(
new Window(15, TimeUnit.DAYS),
new Window(60, TimeUnit.DAYS)
),
argMap = Map("k" -> "5")
),
Builders.Aggregation(
operation = Operation.BOUNDED_UNIQUE_COUNT,
inputColumn = "price",
windows = Seq(
new Window(15, TimeUnit.DAYS),
new Window(60, TimeUnit.DAYS)
),
argMap = Map("k" -> "5")
),
)
backfill(name = "unit_test_group_by_bounded_counts",
source = source,
endPartition = endPartition,
namespace = namespace,
tableUtils = tableUtils,
additionalAgg = aggs)

val result = spark.sql(
"""
|select *
|from test_bounded_counts.unit_test_group_by_bounded_counts
|where item_bounded_unique_count_60d > 5 or price_bounded_unique_count_60d > 5
|""".stripMargin)

assertTrue(result.isEmpty)
}
}