Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 29 additions & 18 deletions snuba/web/rpc/v1/endpoint_trace_item_attribute_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,16 @@ def get_co_occurring_attributes(

The query at the end looks something like this:

SELECT distinct(arrayJoin(arrayFilter(attr -> ((NOT ((attr.2) IN ['test_tag_1_0'])) AND startsWith(attr.2, 'test_')), arrayMap(x -> ('TYPE_STRING', x), attributes_string)))) AS attr_key
SELECT arrayJoin(arrayFilter(attr -> ((NOT ((attr.2) IN ['test_tag_1_0'])) AND startsWith(attr.2, 'test_')), arrayMap(x -> ('TYPE_STRING', x), attributes_string))) AS attr_key, count() AS count
FROM eap_item_co_occurring_attrs_1_local
WHERE (item_type = 1) AND (project_id IN [1]) AND (organization_id = 1) AND (date < toDateTime(toDate('2025-03-17', 'Universal'))) AND (date >= toDateTime(toDate('2025-03-10', 'Universal')))

-- This is a faster way of looking up whether all attributes co-exist, it uses an array of hashes. This avoids string equality comparisons
AND hasAll(attribute_keys_hash, [cityHash64('test_tag_1_0')])
--

ORDER BY attr_key ASC
GROUP BY attr_key
ORDER BY count DESC, attr_key ASC
LIMIT 10000

**Explanation:**
Expand All @@ -148,7 +149,7 @@ def get_co_occurring_attributes(
)
) AS attr_key
```
4. . The outer query deduplicates the attributes sent by the inner query to return to the user distinct co-occurring attributes
4. The outer query aggregates and counts the attributes sent by the inner query, returning co-occurring attributes sorted by frequency (most common first)

**The following things make this query more performant than searching the source table:**

Expand Down Expand Up @@ -258,19 +259,23 @@ def get_co_occurring_attributes(
selected_columns=[
SelectedExpression(
name="attr_key",
expression=f.distinct(
f.arrayJoin(
f.arrayFilter(
Lambda(None, ("attr",), attr_filter),
array_func,
),
alias="attr_key",
)
expression=f.arrayJoin(
f.arrayFilter(
Lambda(None, ("attr",), attr_filter),
array_func,
),
alias="attr_key",
),
),
SelectedExpression(
name="count",
expression=f.count(),
),
],
groupby=[column("attr_key")],
condition=condition,
order_by=[
OrderBy(direction=OrderByDirection.DESC, expression=f.count()),
OrderBy(direction=OrderByDirection.ASC, expression=column("attr_key")),
],
# chosen arbitrarily to be a high number
Expand Down Expand Up @@ -305,26 +310,32 @@ def convert_co_occurring_results_to_attributes(
query_res: QueryResult,
) -> list[TraceItemAttributeNamesResponse.Attribute]:
def t(row: Row) -> TraceItemAttributeNamesResponse.Attribute:
# our query to snuba only selected 1 column, attr_key
# so the result should only have 1 item per row
vals = row.values()
assert len(vals) == 1
attr_type, attr_name = list(vals)[0]
# our query to snuba selected 2 columns: attr_key and count
# we only need the attr_key for the response
vals = list(row.values())
assert len(vals) == 2
attr_type, attr_name = vals[0] # first column is attr_key
# second column is count, which we don't need in the response
assert isinstance(attr_type, str)
return TraceItemAttributeNamesResponse.Attribute(
name=attr_name, type=getattr(AttributeKey.Type, attr_type)
)

data = query_res.result.get("data", [])
if request.type in (AttributeKey.TYPE_UNSPECIFIED, AttributeKey.TYPE_STRING):
# Add non-stored attributes with a very high count so they appear first
# (since these are typically important service-level attributes)
data.extend(
[
{"attr_key": ("TYPE_STRING", key_name)}
{"attr_key": ("TYPE_STRING", key_name), "count": float("inf")}
for key_name in NON_STORED_ATTRIBUTE_KEYS
if request.value_substring_match in key_name
]
)
data.sort(key=lambda row: tuple(row.get("attr_key", ("TYPE_STRING", ""))))
# Re-sort with the new attributes included
data.sort(
key=lambda row: (-row.get("count", 0), tuple(row.get("attr_key", ("TYPE_STRING", ""))))
)

return list(map(t, data))

Expand Down
66 changes: 66 additions & 0 deletions tests/web/rpc/v1/test_endpoint_trace_item_attribute_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,3 +260,69 @@ def test_simple_boolean(self) -> None:
)
)
assert res.attributes == expected

def test_sorted_by_frequency(self) -> None:
    """Test that attributes are returned and sorted (by frequency DESC, then name ASC).

    All generated ``a_tag_*`` attributes appear on exactly one span each, so they
    share the same frequency and the secondary sort key (name, ascending) fully
    determines the order.
    """
    # Request string attributes with a filter to get a smaller result set
    req = TraceItemAttributeNamesRequest(
        meta=RequestMeta(
            project_ids=[1, 2, 3],
            organization_id=1,
            cogs_category="something",
            referrer="something",
            start_timestamp=Timestamp(seconds=int((BASE_TIME - timedelta(days=1)).timestamp())),
            end_timestamp=Timestamp(seconds=int((BASE_TIME + timedelta(days=1)).timestamp())),
        ),
        limit=100,
        type=AttributeKey.Type.TYPE_STRING,
        value_substring_match="a_tag",
    )
    res = EndpointTraceItemAttributeNames().execute(req)

    # Verify we got results
    assert len(res.attributes) > 0

    # Get just the attribute names
    attr_names = [attr.name for attr in res.attributes]

    # Since all a_tag_* attributes appear on only one span each (same frequency),
    # they should be sorted alphabetically
    assert attr_names == sorted(attr_names), (
        "Attributes with the same frequency should be sorted alphabetically"
    )

    # Verify the expected attributes are present and in order.
    # f"{i:03d}" zero-pads to three digits, equivalent to str(i).zfill(3).
    expected = [f"a_tag_{i:03d}" for i in range(TOTAL_GENERATED_ATTR_PER_TYPE)]

    assert attr_names == expected

def test_frequency_ordering_with_specific_attributes(self) -> None:
    """Test that specific attributes appear in order based on their frequency."""
    window_start = BASE_TIME - timedelta(days=1)
    window_end = BASE_TIME + timedelta(days=1)
    request = TraceItemAttributeNamesRequest(
        meta=RequestMeta(
            project_ids=[1, 2, 3],
            organization_id=1,
            cogs_category="something",
            referrer="something",
            start_timestamp=Timestamp(seconds=int(window_start.timestamp())),
            end_timestamp=Timestamp(seconds=int(window_end.timestamp())),
        ),
        limit=1000,
        type=AttributeKey.Type.TYPE_DOUBLE,
        value_substring_match="b_measurement_",
    )
    response = EndpointTraceItemAttributeNames().execute(request)

    # All b_measurement_* attributes appear on individual spans (frequency = 1),
    # so the frequency sort key ties everywhere and the name fallback decides.
    names = [attribute.name for attribute in response.attributes]

    # Every attribute returned must match the substring filter.
    assert all(name.startswith("b_measurement_") for name in names)

    # With equal frequencies the order must be purely alphabetical.
    assert names == sorted(names), (
        "Attributes with equal frequency should be sorted alphabetically"
    )
Loading