Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 29 additions & 18 deletions snuba/web/rpc/v1/endpoint_trace_item_attribute_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,16 @@ def get_co_occurring_attributes(

The query at the end looks something like this:

SELECT distinct(arrayJoin(arrayFilter(attr -> ((NOT ((attr.2) IN ['test_tag_1_0'])) AND startsWith(attr.2, 'test_')), arrayMap(x -> ('TYPE_STRING', x), attributes_string)))) AS attr_key
SELECT arrayJoin(arrayFilter(attr -> ((NOT ((attr.2) IN ['test_tag_1_0'])) AND startsWith(attr.2, 'test_')), arrayMap(x -> ('TYPE_STRING', x), attributes_string))) AS attr_key, count() AS count
FROM eap_item_co_occurring_attrs_1_local
WHERE (item_type = 1) AND (project_id IN [1]) AND (organization_id = 1) AND (date < toDateTime(toDate('2025-03-17', 'Universal'))) AND (date >= toDateTime(toDate('2025-03-10', 'Universal')))

-- This is a faster way of looking up whether all attributes co-exist, it uses an array of hashes. This avoids string equality comparisons
AND hasAll(attribute_keys_hash, [cityHash64('test_tag_1_0')])
--

ORDER BY attr_key ASC
GROUP BY attr_key
ORDER BY count DESC, attr_key ASC
LIMIT 10000

**Explanation:**
Expand All @@ -148,7 +149,7 @@ def get_co_occurring_attributes(
)
) AS attr_key
```
4. . The outer query deduplicates the attributes sent by the inner query to return to the user distinct co-occurring attributes
4. The outer query aggregates and counts the attributes sent by the inner query, returning co-occurring attributes sorted by frequency (most common first)

**The following things make this query more performant than searching the source table:**

Expand Down Expand Up @@ -258,19 +259,23 @@ def get_co_occurring_attributes(
selected_columns=[
SelectedExpression(
name="attr_key",
expression=f.distinct(
f.arrayJoin(
f.arrayFilter(
Lambda(None, ("attr",), attr_filter),
array_func,
),
alias="attr_key",
)
expression=f.arrayJoin(
f.arrayFilter(
Lambda(None, ("attr",), attr_filter),
array_func,
),
alias="attr_key",
),
),
SelectedExpression(
name="count",
expression=f.count(),
),
],
groupby=[column("attr_key")],
condition=condition,
order_by=[
OrderBy(direction=OrderByDirection.DESC, expression=f.count()),
OrderBy(direction=OrderByDirection.ASC, expression=column("attr_key")),
],
# chosen arbitrarily to be a high number
Expand Down Expand Up @@ -305,26 +310,32 @@ def convert_co_occurring_results_to_attributes(
query_res: QueryResult,
) -> list[TraceItemAttributeNamesResponse.Attribute]:
def t(row: Row) -> TraceItemAttributeNamesResponse.Attribute:
# our query to snuba only selected 1 column, attr_key
# so the result should only have 1 item per row
vals = row.values()
assert len(vals) == 1
attr_type, attr_name = list(vals)[0]
# our query to snuba selected 2 columns: attr_key and count
# we only need the attr_key for the response
vals = list(row.values())
assert len(vals) == 2
attr_type, attr_name = vals[0] # first column is attr_key
# second column is count, which we don't need in the response
assert isinstance(attr_type, str)
return TraceItemAttributeNamesResponse.Attribute(
name=attr_name, type=getattr(AttributeKey.Type, attr_type)
)

data = query_res.result.get("data", [])
if request.type in (AttributeKey.TYPE_UNSPECIFIED, AttributeKey.TYPE_STRING):
# Add non-stored attributes with a very high count so they appear first
# (since these are typically important service-level attributes)
data.extend(
[
{"attr_key": ("TYPE_STRING", key_name)}
{"attr_key": ("TYPE_STRING", key_name), "count": float("inf")}
for key_name in NON_STORED_ATTRIBUTE_KEYS
if request.value_substring_match in key_name
]
)
data.sort(key=lambda row: tuple(row.get("attr_key", ("TYPE_STRING", ""))))
# Re-sort with the new attributes included
data.sort(
key=lambda row: (-row.get("count", 0), tuple(row.get("attr_key", ("TYPE_STRING", ""))))
)

return list(map(t, data))

Expand Down
66 changes: 66 additions & 0 deletions tests/web/rpc/v1/test_endpoint_trace_item_attribute_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,3 +260,69 @@ def test_simple_boolean(self) -> None:
)
)
assert res.attributes == expected

def test_sorted_by_frequency(self) -> None:
    """Test that attributes are returned and sorted (by frequency DESC, then name ASC).

    All generated ``a_tag_*`` attributes appear on exactly one span each, so they
    share the same frequency and the secondary sort key (name, ascending) fully
    determines the order.
    """
    # Request string attributes with a filter to get a smaller result set
    req = TraceItemAttributeNamesRequest(
        meta=RequestMeta(
            project_ids=[1, 2, 3],
            organization_id=1,
            cogs_category="something",
            referrer="something",
            start_timestamp=Timestamp(seconds=int((BASE_TIME - timedelta(days=1)).timestamp())),
            end_timestamp=Timestamp(seconds=int((BASE_TIME + timedelta(days=1)).timestamp())),
        ),
        limit=100,
        type=AttributeKey.Type.TYPE_STRING,
        value_substring_match="a_tag",
    )
    res = EndpointTraceItemAttributeNames().execute(req)

    # Verify we got results
    assert len(res.attributes) > 0

    # Get just the attribute names
    attr_names = [attr.name for attr in res.attributes]

    # Since all a_tag_* attributes appear on only one span each (same frequency),
    # they should be sorted alphabetically
    assert attr_names == sorted(attr_names), (
        "Attributes with the same frequency should be sorted alphabetically"
    )

    # Verify the expected attributes are present and in order.
    # f"{i:03d}" zero-pads to three digits, equivalent to str(i).zfill(3).
    expected = [f"a_tag_{i:03d}" for i in range(TOTAL_GENERATED_ATTR_PER_TYPE)]

    assert attr_names == expected

def test_frequency_ordering_with_specific_attributes(self) -> None:
    """Test that specific attributes appear in order based on their frequency."""
    window_start = BASE_TIME - timedelta(days=1)
    window_end = BASE_TIME + timedelta(days=1)
    request = TraceItemAttributeNamesRequest(
        meta=RequestMeta(
            project_ids=[1, 2, 3],
            organization_id=1,
            cogs_category="something",
            referrer="something",
            start_timestamp=Timestamp(seconds=int(window_start.timestamp())),
            end_timestamp=Timestamp(seconds=int(window_end.timestamp())),
        ),
        limit=1000,
        type=AttributeKey.Type.TYPE_DOUBLE,
        value_substring_match="b_measurement_",
    )
    response = EndpointTraceItemAttributeNames().execute(request)

    # All b_measurement_* attributes appear on individual spans (frequency = 1),
    # so the frequency sort key ties everywhere and the name fallback decides.
    names = [attribute.name for attribute in response.attributes]

    # Every attribute returned must match the substring filter.
    assert all(name.startswith("b_measurement_") for name in names)

    # With equal frequencies the order must be purely alphabetical.
    assert names == sorted(names), (
        "Attributes with equal frequency should be sorted alphabetically"
    )
Loading