Skip to content

Commit e8af5f0

Browse files
committed
[ADD] add implementation for minimal variants
1 parent 6002d3c commit e8af5f0

18 files changed

+204855
-153
lines changed

clickhouse/benchmark_all_approaches_1m.py

Lines changed: 413 additions & 0 deletions
Large diffs are not rendered by default.

clickhouse/benchmark_all_variants_comprehensive.sh

Lines changed: 408 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Benchmark script for minimal variant table.
4+
Tests performance of various query patterns on the ultra-simple single variant column.
5+
"""
6+
7+
import subprocess
8+
import time
9+
import sys
10+
from typing import List, Tuple
11+
12+
def run_clickhouse_query(query: str, iterations: int = 3) -> Tuple[float, str]:
    """Run a ClickHouse query several times and return the mean wall time.

    Every iteration spawns ``clickhouse client --query <query>`` as a
    subprocess, so the measured time covers the whole client invocation
    (process start-up included), not only the query itself.

    Args:
        query: SQL text handed verbatim to ``clickhouse client --query``.
        iterations: Number of timed runs to average. Must be at least 1.

    Returns:
        ``(avg_seconds, stdout)`` on success, where ``stdout`` is the stripped
        output of the first run. On failure returns ``(-1.0, "Error: ...")``;
        callers recognise failure by the negative time.

    Raises:
        ValueError: If ``iterations`` is less than 1 (there would be nothing
            to average).
    """
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    cmd = ['clickhouse', 'client', '--query', query]
    times = []
    result = ""

    for i in range(iterations):
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted, which would skew a benchmark.
        start_time = time.perf_counter()
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True)
        except OSError as exc:
            # E.g. the `clickhouse` binary is not installed or not executable.
            # Report it through the same sentinel the callers already handle.
            return -1.0, f"Error: {exc}"
        elapsed = time.perf_counter() - start_time

        if proc.returncode != 0:
            return -1.0, f"Error: {proc.stderr}"

        times.append(elapsed)
        if i == 0:  # Store result from first run
            result = proc.stdout.strip()

    return sum(times) / len(times), result
32+
33+
def test_basic_queries():
    """Run the basic sanity queries against the minimal variant table.

    Executes three queries through ``run_clickhouse_query`` (record count,
    breakdown by ``variantType(data)``, and column sizes from
    ``system.columns``) and prints each query's average time and result, or
    the failure message when the query could not be run.
    """
    print("=" * 60)
    print("BASIC VARIANT QUERIES")
    print("=" * 60)

    queries = [
        ("Record Count", "SELECT count() FROM bluesky_minimal_variant.bluesky_data"),
        ("Variant Type", "SELECT variantType(data), count() FROM bluesky_minimal_variant.bluesky_data GROUP BY variantType(data)"),
        ("Data Size", "SELECT formatReadableSize(sum(data_compressed_bytes)) as compressed_size, formatReadableSize(sum(data_uncompressed_bytes)) as uncompressed_size FROM system.columns WHERE database = 'bluesky_minimal_variant'"),
    ]

    for name, query in queries:
        print(f"\n{name}:")
        avg_time, result = run_clickhouse_query(query)
        if avg_time < 0:
            # A negative time is the failure sentinel; the accompanying
            # message already starts with "Error: ", so print it as-is
            # instead of prefixing it a second time.
            print(f" {result}")
            continue
        print(f" Time: {avg_time:.4f}s")
        print(f" Result: {result}")
53+
54+
def test_json_extraction():
    """Run queries that extract individual JSON fields from the variant column.

    Each query applies ``JSONExtractString`` / ``JSONExtractUInt`` to
    ``variantElement(data, 'JSON')``. For every query the average time and
    result are printed, or the failure message when the query could not be
    run.
    """
    print("\n" + "=" * 60)
    print("JSON FIELD EXTRACTION")
    print("=" * 60)

    # Test different JSON extraction methods
    extraction_queries = [
        ("JSON Extract - kind", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'kind') as kind, count() as cnt FROM bluesky_minimal_variant.bluesky_data GROUP BY kind ORDER BY cnt DESC LIMIT 5"),
        ("JSON Extract - did", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'did') as did FROM bluesky_minimal_variant.bluesky_data LIMIT 3"),
        ("JSON Extract - time_us", "SELECT JSONExtractUInt(variantElement(data, 'JSON'), 'time_us') as time_us FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractUInt(variantElement(data, 'JSON'), 'time_us') > 0 LIMIT 5"),
        ("JSON Extract - collection", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') as collection, count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') != '' GROUP BY collection ORDER BY count() DESC LIMIT 5"),
    ]

    for name, query in extraction_queries:
        print(f"\n{name}:")
        avg_time, result = run_clickhouse_query(query)
        if avg_time < 0:
            # A negative time is the failure sentinel; the accompanying
            # message already starts with "Error: ", so print it as-is
            # instead of prefixing it a second time.
            print(f" {result}")
            continue
        print(f" Time: {avg_time:.4f}s")
        print(f" Result: {result}")
76+
77+
def test_filtering_queries():
    """Run ``count()`` queries that filter on fields extracted from the variant column.

    Covers an equality filter on ``kind``, an equality filter on
    ``commit.collection``, a combined equality + ``LIKE`` filter, and a
    numeric lower bound on ``time_us``. For every query the average time and
    result are printed, or the failure message when the query could not be
    run.
    """
    print("\n" + "=" * 60)
    print("FILTERING PERFORMANCE")
    print("=" * 60)

    filter_queries = [
        ("Filter by kind", "SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'kind') = 'commit'"),
        ("Filter by collection", "SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') = 'app.bsky.feed.post'"),
        ("Complex filter", "SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'kind') = 'commit' AND JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') LIKE '%post%'"),
        ("Time range filter", "SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractUInt(variantElement(data, 'JSON'), 'time_us') > 1600000000000000"),
    ]

    for name, query in filter_queries:
        print(f"\n{name}:")
        avg_time, result = run_clickhouse_query(query)
        if avg_time < 0:
            # A negative time is the failure sentinel; the accompanying
            # message already starts with "Error: ", so print it as-is
            # instead of prefixing it a second time.
            print(f" {result}")
            continue
        print(f" Time: {avg_time:.4f}s")
        print(f" Result: {result}")
98+
99+
def test_aggregation_queries():
    """Run aggregation queries over fields extracted from the variant column.

    Covers a group-by on ``kind``, a group-by on ``commit.collection``
    (top 10), and min/max/avg of ``time_us``. For every query the average
    time and result are printed, or the failure message when the query could
    not be run.
    """
    print("\n" + "=" * 60)
    print("AGGREGATION PERFORMANCE")
    print("=" * 60)

    agg_queries = [
        ("Count by kind", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'kind') as kind, count() FROM bluesky_minimal_variant.bluesky_data GROUP BY kind ORDER BY count() DESC"),
        ("Count by collection", "SELECT JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') as collection, count() FROM bluesky_minimal_variant.bluesky_data WHERE collection != '' GROUP BY collection ORDER BY count() DESC LIMIT 10"),
        ("Time stats", "SELECT min(JSONExtractUInt(variantElement(data, 'JSON'), 'time_us')), max(JSONExtractUInt(variantElement(data, 'JSON'), 'time_us')), avg(JSONExtractUInt(variantElement(data, 'JSON'), 'time_us')) FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractUInt(variantElement(data, 'JSON'), 'time_us') > 0"),
    ]

    for name, query in agg_queries:
        print(f"\n{name}:")
        avg_time, result = run_clickhouse_query(query)
        if avg_time < 0:
            # A negative time is the failure sentinel; the accompanying
            # message already starts with "Error: ", so print it as-is
            # instead of prefixing it a second time.
            print(f" {result}")
            continue
        print(f" Time: {avg_time:.4f}s")
        print(f" Result: {result}")
119+
120+
def compare_with_json_table():
    """Time equivalent queries on the minimal variant table and the JSON table.

    For each pair, the variant-table query (against
    ``bluesky_minimal_variant.bluesky_data``) and the JSON-table query
    (against ``bluesky_sample.bluesky``) are run through
    ``run_clickhouse_query`` and their average times printed. When both
    succeed, the variant/JSON time ratio and which side was faster are
    printed as well. When a query fails, its error message is printed in
    place of a time and no ratio is reported for that pair.
    """
    print("\n" + "=" * 60)
    print("COMPARISON: MINIMAL VARIANT vs REGULAR JSON")
    print("=" * 60)

    # Test same queries on both tables
    test_queries = [
        ("Count records",
         "SELECT count() FROM bluesky_minimal_variant.bluesky_data",
         "SELECT count() FROM bluesky_sample.bluesky"),

        ("Extract kind field",
         "SELECT JSONExtractString(variantElement(data, 'JSON'), 'kind') as kind FROM bluesky_minimal_variant.bluesky_data LIMIT 1000",
         "SELECT data.kind FROM bluesky_sample.bluesky LIMIT 1000"),

        ("Filter by kind",
         "SELECT count() FROM bluesky_minimal_variant.bluesky_data WHERE JSONExtractString(variantElement(data, 'JSON'), 'kind') = 'commit'",
         "SELECT count() FROM bluesky_sample.bluesky WHERE data.kind = 'commit'"),

        ("Group by collection",
         "SELECT JSONExtractString(variantElement(data, 'JSON'), 'commit', 'collection') as collection, count() FROM bluesky_minimal_variant.bluesky_data WHERE collection != '' GROUP BY collection ORDER BY count() DESC LIMIT 5",
         "SELECT data.commit.collection as collection, count() FROM bluesky_sample.bluesky WHERE collection != '' GROUP BY collection ORDER BY count() DESC LIMIT 5"),
    ]

    for name, variant_query, json_query in test_queries:
        print(f"\n{name}:")

        # Test variant query
        variant_time, variant_result = run_clickhouse_query(variant_query)
        if variant_time < 0:
            # Negative time is the failure sentinel; show the error text
            # rather than presenting "-1.0000s" as if it were a measurement.
            print(f" Minimal Variant: {variant_result}")
        else:
            print(f" Minimal Variant: {variant_time:.4f}s")

        # Test JSON query
        json_time, json_result = run_clickhouse_query(json_query)
        if json_time < 0:
            print(f" Regular JSON: {json_result}")
        else:
            print(f" Regular JSON: {json_time:.4f}s")

        # A ratio is only meaningful when both sides produced a real timing.
        if variant_time > 0 and json_time > 0:
            ratio = variant_time / json_time
            print(f" Ratio (V/J): {ratio:.2f}x")
            if ratio > 1:
                print(f" → JSON is {ratio:.1f}x faster")
            else:
                print(f" → Variant is {1/ratio:.1f}x faster")
163+
164+
def show_storage_stats():
    """Print on-disk size and row counts for both tables, plus per-column sizes.

    Table sizes come from the active parts in ``system.parts``; per-column
    compressed/uncompressed sizes come from ``system.columns``. Each query is
    run through ``run_clickhouse_query`` and its output (or failure message)
    is printed.
    """
    print("\n" + "=" * 60)
    print("STORAGE STATISTICS")
    print("=" * 60)

    # NOTE: system.parts has one record per data part, so the row total is
    # sum(rows); count() would report the number of parts instead. The alias
    # is deliberately not `rows` to avoid clashing with the column name.
    # system.columns exposes the column name as `name` (there is no `column`
    # field in that table).
    storage_queries = [
        ("Minimal Variant Table Size", "SELECT formatReadableSize(sum(bytes_on_disk)) as size_on_disk, sum(rows) as total_rows FROM system.parts WHERE database = 'bluesky_minimal_variant' AND table = 'bluesky_data' AND active = 1"),
        ("Regular JSON Table Size", "SELECT formatReadableSize(sum(bytes_on_disk)) as size_on_disk, sum(rows) as total_rows FROM system.parts WHERE database = 'bluesky_sample' AND table = 'bluesky' AND active = 1"),
        ("Column Details", "SELECT name, formatReadableSize(data_compressed_bytes) as compressed, formatReadableSize(data_uncompressed_bytes) as uncompressed FROM system.columns WHERE database = 'bluesky_minimal_variant' AND table = 'bluesky_data'"),
    ]

    for name, query in storage_queries:
        print(f"\n{name}:")
        # Only the result matters here; the timing is used solely as the
        # success/failure indicator (negative means the query failed).
        avg_time, result = run_clickhouse_query(query)
        # On failure `result` already starts with "Error: ", so both the
        # success and failure cases print it unchanged.
        print(f" {result}")
183+
184+
def main():
    """Print the banner, run every benchmark section in order, then print the summary."""
    rule = "=" * 60

    banner = (
        "MINIMAL VARIANT TABLE BENCHMARKS",
        rule,
        "Testing ultra-simple single Variant(JSON) column performance",
        "",
    )
    for line in banner:
        print(line)

    # Run all benchmark categories
    stages = (
        test_basic_queries,
        test_json_extraction,
        test_filtering_queries,
        test_aggregation_queries,
        compare_with_json_table,
        show_storage_stats,
    )
    for stage in stages:
        stage()

    summary = (
        "\n" + rule,
        "BENCHMARK SUMMARY",
        rule,
        "✓ Minimal variant table uses only 1 column: data Variant(JSON)",
        "✓ All field access requires JSONExtract functions",
        "✓ Schema-on-read: can query any field without predefinition",
        "✓ Compare results above to see performance vs regular JSON",
        "",
        "Key takeaways:",
        "- Simpler schema, more complex queries",
        "- True flexibility: query any JSON field",
        "- Performance trade-off for simplicity",
    )
    for line in summary:
        print(line)
211+
212+
# Run the benchmark suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)