You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
("Record Count", "SELECT count() FROM bluesky_minimal_variant.bluesky_data"),
41
+
("Variant Type", "SELECT variantType(data), count() FROM bluesky_minimal_variant.bluesky_data GROUP BY variantType(data)"),
42
+
("Data Size", "SELECT formatReadableSize(sum(data_compressed_bytes)) as compressed_size, formatReadableSize(sum(data_uncompressed_bytes)) as uncompressed_size FROM system.columns WHERE database = 'bluesky_minimal_variant'"),
43
+
]
44
+
45
+
forname, queryinqueries:
46
+
print(f"\n{name}:")
47
+
avg_time, result=run_clickhouse_query(query)
48
+
ifavg_time>0:
49
+
print(f" Time: {avg_time:.4f}s")
50
+
print(f" Result: {result}")
51
+
else:
52
+
print(f" Error: {result}")
53
+
54
+
def test_json_extraction():
    """Test JSON field extraction patterns.

    Runs a fixed list of labelled SQL queries against
    ``bluesky_minimal_variant.bluesky_data`` via ``run_clickhouse_query``
    and prints, for each one, either the reported time and result or the
    error text. Returns nothing; all output goes to stdout.
    """
    print("\n" + "=" * 60)
    print("JSON FIELD EXTRACTION")
    print("=" * 60)

    # Test different JSON extraction methods
    extraction_queries = [
        ("JSON Extract - kind", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'kind') as kind, count() as cnt FROM bluesky_minimal_variant.bluesky_data GROUP BY kind ORDER BY cnt DESC LIMIT 5"),
        ("JSON Extract - did", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'did') as did FROM bluesky_minimal_variant.bluesky_data LIMIT 3"),
        ("JSON Extract - time_us", "SELECT JSONExtractUInt(variantElement(data, 'JSON'), 'time_us') as time_us FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractUInt(variantElement(data, 'JSON'), 'time_us') > 0 LIMIT 5"),
        ("JSON Extract - collection", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') as collection, count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') != '' GROUP BY collection ORDER BY count() DESC LIMIT 5"),
    ]

    for name, query in extraction_queries:
        print(f"\n{name}:")
        avg_time, result = run_clickhouse_query(query)
        # A positive time is treated as success; any other value takes the
        # error branch. NOTE(review): presumably run_clickhouse_query signals
        # failure with a non-positive time and the message in `result` --
        # confirm against its definition.
        if avg_time > 0:
            print(f" Time: {avg_time:.4f}s")
            print(f" Result: {result}")
        else:
            print(f" Error: {result}")
76
+
77
+
def test_filtering_queries():
    """Test filtering performance on variant data.

    Runs a fixed list of labelled ``SELECT count()`` queries with ``WHERE``
    filters against ``bluesky_minimal_variant.bluesky_data`` via
    ``run_clickhouse_query`` and prints, for each one, either the reported
    time and result or the error text. Returns nothing.
    """
    print("\n" + "=" * 60)
    print("FILTERING PERFORMANCE")
    print("=" * 60)

    filter_queries = [
        ("Filter by kind", "SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'kind') = 'commit'"),
        ("Filter by collection", "SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') = 'app.bsky.feed.post'"),
        ("Complex filter", "SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'kind') = 'commit' AND JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') LIKE '%post%'"),
        ("Time range filter", "SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractUInt(variantElement(data, 'JSON'), 'time_us') > 1600000000000000"),
    ]

    for name, query in filter_queries:
        print(f"\n{name}:")
        avg_time, result = run_clickhouse_query(query)
        # A positive time is treated as success; any other value takes the
        # error branch. NOTE(review): presumably run_clickhouse_query signals
        # failure with a non-positive time and the message in `result` --
        # confirm against its definition.
        if avg_time > 0:
            print(f" Time: {avg_time:.4f}s")
            print(f" Result: {result}")
        else:
            print(f" Error: {result}")
98
+
99
+
def test_aggregation_queries():
    """Test aggregation performance.

    Runs a fixed list of labelled aggregation queries (group-by counts and
    min/max/avg over ``time_us``) against
    ``bluesky_minimal_variant.bluesky_data`` via ``run_clickhouse_query``
    and prints, for each one, either the reported time and result or the
    error text. Returns nothing.
    """
    print("\n" + "=" * 60)
    print("AGGREGATION PERFORMANCE")
    print("=" * 60)

    agg_queries = [
        ("Count by kind", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'kind') as kind, count() FROM bluesky_minimal_variant.bluesky_data GROUP BY kind ORDER BY count() DESC"),
        ("Count by collection", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') as collection, count() FROM bluesky_minimal_variant.bluesky_data WHERE collection != '' GROUP BY collection ORDER BY count() DESC LIMIT 10"),
        ("Time stats", "SELECT min(JSONExtractUInt(variantElement(data, 'JSON'), 'time_us')), max(JSONExtractUInt(variantElement(data, 'JSON'), 'time_us')), avg(JSONExtractUInt(variantElement(data, 'JSON'), 'time_us')) FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractUInt(variantElement(data, 'JSON'), 'time_us') > 0"),
    ]

    for name, query in agg_queries:
        print(f"\n{name}:")
        avg_time, result = run_clickhouse_query(query)
        # A positive time is treated as success; any other value takes the
        # error branch. NOTE(review): presumably run_clickhouse_query signals
        # failure with a non-positive time and the message in `result` --
        # confirm against its definition.
        if avg_time > 0:
            print(f" Time: {avg_time:.4f}s")
            print(f" Result: {result}")
        else:
            print(f" Error: {result}")
119
+
120
+
defcompare_with_json_table():
121
+
"""Compare minimal variant performance with regular JSON table."""
122
+
print("\n"+"="*60)
123
+
print("COMPARISON: MINIMAL VARIANT vs REGULAR JSON")
124
+
print("="*60)
125
+
126
+
# Test same queries on both tables
127
+
test_queries= [
128
+
("Count records",
129
+
"SELECT count() FROM bluesky_minimal_variant.bluesky_data",
130
+
"SELECT count() FROM bluesky_sample.bluesky"),
131
+
132
+
("Extract kind field",
133
+
"SELECT JSONExtractString(variantElement(data, 'JSON'), 'kind') as kind FROM bluesky_minimal_variant.bluesky_data LIMIT 1000",
134
+
"SELECT data.kind FROM bluesky_sample.bluesky LIMIT 1000"),
135
+
136
+
("Filter by kind",
137
+
"SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'kind') = 'commit'",
138
+
"SELECT count() FROM bluesky_sample.bluesky WHERE data.kind = 'commit'"),
139
+
140
+
("Group by collection",
141
+
"SELECT JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') as collection, count() FROM bluesky_minimal_variant.bluesky_data WHERE collection != '' GROUP BY collection ORDER BY count() DESC LIMIT 5",
142
+
"SELECT data.commit.collection as collection, count() FROM bluesky_sample.bluesky WHERE collection != '' GROUP BY collection ORDER BY count() DESC LIMIT 5"),
("Minimal Variant Table Size", "SELECT formatReadableSize(sum(bytes_on_disk)) as size_on_disk, count() as rows FROM system.parts WHERE database = 'bluesky_minimal_variant' AND table = 'bluesky_data' AND active = 1"),
172
+
("Regular JSON Table Size", "SELECT formatReadableSize(sum(bytes_on_disk)) as size_on_disk, count() as rows FROM system.parts WHERE database = 'bluesky_sample' AND table = 'bluesky' AND active = 1"),
173
+
("Column Details", "SELECT column, formatReadableSize(data_compressed_bytes) as compressed, formatReadableSize(data_uncompressed_bytes) as uncompressed FROM system.columns WHERE database = 'bluesky_minimal_variant' AND table = 'bluesky_data'"),
174
+
]
175
+
176
+
forname, queryinstorage_queries:
177
+
print(f"\n{name}:")
178
+
avg_time, result=run_clickhouse_query(query)
179
+
ifavg_time>0:
180
+
print(f" {result}")
181
+
else:
182
+
print(f" Error: {result}")
183
+
184
+
def main():
    """Run all benchmarks.

    Prints a banner, calls each benchmark category function in a fixed
    order, then prints a static summary. Returns nothing.
    """
    print("MINIMAL VARIANT TABLE BENCHMARKS")
    print("=" * 60)
    print("Testing ultra-simple single Variant(JSON) column performance")
    print("")

    # Run all benchmark categories
    test_basic_queries()
    test_json_extraction()
    test_filtering_queries()
    test_aggregation_queries()
    compare_with_json_table()
    show_storage_stats()

    # The summary lines are fixed text; they are not derived from the
    # benchmark results printed above.
    print("\n" + "=" * 60)
    print("BENCHMARK SUMMARY")
    print("=" * 60)
    print("✓ Minimal variant table uses only 1 column: data Variant(JSON)")
    print("✓ All field access requires JSONExtract functions")
    print("✓ Schema-on-read: can query any field without predefinition")
    print("✓ Compare results above to see performance vs regular JSON")
0 commit comments