diff --git a/snuba/web/rpc/v1/endpoint_trace_item_attribute_names.py b/snuba/web/rpc/v1/endpoint_trace_item_attribute_names.py index 45dfa451e8..581d4420e3 100644 --- a/snuba/web/rpc/v1/endpoint_trace_item_attribute_names.py +++ b/snuba/web/rpc/v1/endpoint_trace_item_attribute_names.py @@ -113,7 +113,7 @@ def get_co_occurring_attributes( The query at the end looks something like this: - SELECT distinct(arrayJoin(arrayFilter(attr -> ((NOT ((attr.2) IN ['test_tag_1_0'])) AND startsWith(attr.2, 'test_')), arrayMap(x -> ('TYPE_STRING', x), attributes_string)))) AS attr_key + SELECT arrayJoin(arrayFilter(attr -> ((NOT ((attr.2) IN ['test_tag_1_0'])) AND startsWith(attr.2, 'test_')), arrayMap(x -> ('TYPE_STRING', x), attributes_string))) AS attr_key, count() AS count FROM eap_item_co_occurring_attrs_1_local WHERE (item_type = 1) AND (project_id IN [1]) AND (organization_id = 1) AND (date < toDateTime(toDate('2025-03-17', 'Universal'))) AND (date >= toDateTime(toDate('2025-03-10', 'Universal'))) @@ -121,7 +121,8 @@ def get_co_occurring_attributes( AND hasAll(attribute_keys_hash, [cityHash64('test_tag_1_0')]) -- - ORDER BY attr_key ASC + GROUP BY attr_key + ORDER BY count DESC, attr_key ASC LIMIT 10000 **Explanation:** @@ -148,7 +149,7 @@ def get_co_occurring_attributes( ) ) AS attr_key ``` - 4. . The outer query deduplicates the attributes sent by the inner query to return to the user distinct co-occurring attributes + 4. . The outer query aggregates and counts the attributes sent by the inner query, returning co-occurring attributes sorted by frequency (most common first) **The following things make this query more performant than searching the source table:** @@ -258,19 +259,23 @@ def get_co_occurring_attributes( selected_columns=[ SelectedExpression( name="attr_key", - expression=f.distinct( - f.arrayJoin( - f.arrayFilter( - Lambda(None, ("attr",), attr_filter), - array_func, - ), - alias="attr_key", - ) + expression=f.arrayJoin( + f.arrayFilter( + Lambda(None, ("attr",), attr_filter), + array_func, + ), + alias="attr_key", ), ), + SelectedExpression( + name="count", + expression=f.count(), + ), ], + groupby=[column("attr_key")], condition=condition, order_by=[ + OrderBy(direction=OrderByDirection.DESC, expression=f.count()), OrderBy(direction=OrderByDirection.ASC, expression=column("attr_key")), ], # chosen arbitrarily to be a high number @@ -305,11 +310,12 @@ def convert_co_occurring_results_to_attributes( query_res: QueryResult, ) -> list[TraceItemAttributeNamesResponse.Attribute]: def t(row: Row) -> TraceItemAttributeNamesResponse.Attribute: - # our query to snuba only selected 1 column, attr_key - # so the result should only have 1 item per row - vals = row.values() - assert len(vals) == 1 - attr_type, attr_name = list(vals)[0] + # our query to snuba selected 2 columns: attr_key and count + # we only need the attr_key for the response + vals = list(row.values()) + assert len(vals) == 2 + attr_type, attr_name = vals[0] # first column is attr_key + # second column is count, which we don't need in the response assert isinstance(attr_type, str) return TraceItemAttributeNamesResponse.Attribute( name=attr_name, type=getattr(AttributeKey.Type, attr_type) @@ -317,14 +323,19 @@ def t(row: Row) -> TraceItemAttributeNamesResponse.Attribute: data = query_res.result.get("data", []) if request.type in (AttributeKey.TYPE_UNSPECIFIED, AttributeKey.TYPE_STRING): + # Add non-stored attributes with a very high count so they appear first + # (since these are typically important service-level attributes) data.extend( [ - {"attr_key": ("TYPE_STRING", key_name)} + {"attr_key": ("TYPE_STRING", key_name), "count": float("inf")} for key_name in NON_STORED_ATTRIBUTE_KEYS if request.value_substring_match in key_name ] ) - data.sort(key=lambda row: tuple(row.get("attr_key", ("TYPE_STRING", "")))) + # Re-sort with the new attributes included + data.sort( + key=lambda row: (-row.get("count", 0), tuple(row.get("attr_key", ("TYPE_STRING", "")))) + ) return list(map(t, data)) diff --git a/tests/web/rpc/v1/test_endpoint_trace_item_attribute_names.py b/tests/web/rpc/v1/test_endpoint_trace_item_attribute_names.py index 513a9fe13a..261ac0cbcb 100644 --- a/tests/web/rpc/v1/test_endpoint_trace_item_attribute_names.py +++ b/tests/web/rpc/v1/test_endpoint_trace_item_attribute_names.py @@ -260,3 +260,69 @@ def test_simple_boolean(self) -> None: ) ) assert res.attributes == expected + + def test_sorted_by_frequency(self) -> None: + """Test that attributes are returned and sorted (by frequency DESC, then name ASC).""" + # Request string attributes with a filter to get a smaller result set + req = TraceItemAttributeNamesRequest( + meta=RequestMeta( + project_ids=[1, 2, 3], + organization_id=1, + cogs_category="something", + referrer="something", + start_timestamp=Timestamp(seconds=int((BASE_TIME - timedelta(days=1)).timestamp())), + end_timestamp=Timestamp(seconds=int((BASE_TIME + timedelta(days=1)).timestamp())), + ), + limit=100, + type=AttributeKey.Type.TYPE_STRING, + value_substring_match="a_tag", + ) + res = EndpointTraceItemAttributeNames().execute(req) + + # Verify we got results + assert len(res.attributes) > 0 + + # Get just the attribute names + attr_names = [attr.name for attr in res.attributes] + + # Since all a_tag_* attributes appear on only one span each (same frequency), + # they should be sorted alphabetically + assert attr_names == sorted(attr_names), ( + "Attributes with the same frequency should be sorted alphabetically" + ) + + # Verify the expected attributes are present and in order + expected = [] + for i in range(TOTAL_GENERATED_ATTR_PER_TYPE): + expected.append(f"a_tag_{str(i).zfill(3)}") + + assert attr_names == expected + + def test_frequency_ordering_with_specific_attributes(self) -> None: + """Test that specific attributes appear in order based on their frequency.""" + req = TraceItemAttributeNamesRequest( + meta=RequestMeta( + project_ids=[1, 2, 3], + organization_id=1, + cogs_category="something", + referrer="something", + start_timestamp=Timestamp(seconds=int((BASE_TIME - timedelta(days=1)).timestamp())), + end_timestamp=Timestamp(seconds=int((BASE_TIME + timedelta(days=1)).timestamp())), + ), + limit=1000, + type=AttributeKey.Type.TYPE_DOUBLE, + value_substring_match="b_measurement_", + ) + res = EndpointTraceItemAttributeNames().execute(req) + + # All b_measurement_* attributes appear on individual spans (frequency = 1) + # So they should all be sorted alphabetically since they have the same frequency + attr_names = [attr.name for attr in res.attributes] + + # Verify all returned attributes are b_measurement_* attributes + assert all(name.startswith("b_measurement_") for name in attr_names) + + # Verify they are sorted alphabetically (since all have same frequency) + assert attr_names == sorted(attr_names), ( + "Attributes with equal frequency should be sorted alphabetically" + )