Commit 9632860

changed to match repo structure
1 parent ba8f800 commit 9632860

File tree

10 files changed: +152 −332 lines

benchmarks/000.microbenchmarks/050.gpu-cache-latency/Dockerfile

Lines changed: 0 additions & 18 deletions
This file was deleted.

benchmarks/000.microbenchmarks/050.gpu-cache-latency/configs.json renamed to benchmarks/000.microbenchmarks/050.gpu-cache-latency/config.json

Lines changed: 5 additions & 1 deletion
@@ -1,7 +1,11 @@
 {
+    "timeout": 300,
+    "memory": 1024,
+    "languages": ["python"],
+    "modules": [],
     "name": "050.gpu-cache-latency",
     "runtime": "python3",
-    "handler": "handler.handler",
+    "handler": "python/function.handler",
     "dockerfile": "Dockerfile",
     "data_dir": "../../benchmarks-data/050.gpu-cache-latency",
     "datasets": ["tiny", "small", "large"]

benchmarks/000.microbenchmarks/050.gpu-cache-latency/gpu_cache_bench.cu

Lines changed: 0 additions & 124 deletions
This file was deleted.

benchmarks/000.microbenchmarks/050.gpu-cache-latency/handler.py

Lines changed: 0 additions & 31 deletions
This file was deleted.
benchmarks/000.microbenchmarks/050.gpu-cache-latency/input.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
# benchmarks/000.microbenchmarks/050.gpu-cache-latency/input.py

# You can tune these as you like later
size_generators = {
    "test": {"working_set_bytes": 1 << 16, "iterations": 10_000},
    "small": {"working_set_bytes": 1 << 20, "iterations": 100_000},
    "large": {"working_set_bytes": 1 << 24, "iterations": 1_000_000},
}

def generate_input(
    data_dir,           # path to benchmark data dir (unused here)
    size,               # "test" | "small" | "large"
    benchmarks_bucket,  # storage bucket (unused locally)
    input_paths,        # list of input paths (unused here)
    output_paths,       # list of output paths (unused here)
    upload_func,        # function to upload data (unused here)
    nosql_func,         # function to access NoSQL (unused here)
):
    """
    SeBS calls this to get the JSON-like dict that becomes event['input']
    for the function.
    """
    cfg = size_generators[size]

    return {
        "working_set_bytes": cfg["working_set_bytes"],
        "pattern": "random",  # or "sequential", "stride_4", etc.
        "iterations": cfg["iterations"],
        "seed": 42
    }
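
For reference, the "small" size yields the event input below. The direct call is a sketch; in deployment SeBS supplies the real arguments, and the None placeholders are illustrative only:

# Hypothetical direct call; SeBS normally provides these arguments.
event_input = generate_input(
    data_dir=None, size="small", benchmarks_bucket=None,
    input_paths=[], output_paths=[], upload_func=None, nosql_func=None,
)
# event_input == {
#     "working_set_bytes": 1048576,  # 1 << 20
#     "pattern": "random",
#     "iterations": 100000,
#     "seed": 42,
# }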
benchmarks/000.microbenchmarks/050.gpu-cache-latency/python/function.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
# benchmarks/000.microbenchmarks/050.gpu-cache-latency/python/function.py

import time
import torch


def build_next_indices(n: int, pattern: str, device: torch.device, seed: int = 42):
    """
    Build the 'next' array with the given pattern, mirroring the original
    CUDA implementation (gpu_cache_bench.cu, removed in this commit).
    """
    if n <= 0:
        n = 1

    idx = torch.empty(n, dtype=torch.long)

    if pattern == "sequential":
        idx = (torch.arange(n, dtype=torch.long) + 1) % n
    elif pattern.startswith("stride_"):
        stride = int(pattern.split("_", 1)[1])
        idx = (torch.arange(n, dtype=torch.long) + stride) % n
    elif pattern == "random":
        # Deterministic permutation: next[perm[i]] = perm[i+1], so the
        # indices form a single cycle covering every slot.
        g = torch.Generator()
        g.manual_seed(seed)
        perm = torch.randperm(n, generator=g)
        idx[perm] = perm.roll(-1)
    else:
        raise ValueError(f"Unknown pattern '{pattern}'")

    return idx.to(device)


def pointer_chase(working_set_bytes: int, pattern: str, iterations: int, seed: int = 42):
    """
    Pointer-chase microbenchmark, implemented in PyTorch.
    Uses the GPU if available; otherwise falls back to the CPU.
    """

    # Number of elements in the working set. The index tensor uses
    # torch.long (8 bytes per element), so divide by 8 so that the
    # tensor's footprint matches the requested byte size.
    n = max(1, working_set_bytes // 8)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    next_idx = build_next_indices(n, pattern, device, seed=seed)

    cur = torch.tensor(0, dtype=torch.long, device=device)
    acc = torch.tensor(0, dtype=torch.long, device=device)

    # Warmup (mirrors the original CUDA version)
    warmup_iters = min(iterations, 1024)
    for _ in range(warmup_iters):
        cur = next_idx[cur]
        acc = acc + cur

    # Measure time. Note that on the GPU each dependent load is issued as
    # separate kernel launches, so per-step timings include launch overhead
    # on top of memory latency.
    if device.type == "cuda":
        torch.cuda.synchronize()
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        start_event.record()
        for _ in range(iterations):
            cur = next_idx[cur]
            acc = acc + cur
        end_event.record()
        torch.cuda.synchronize()

        elapsed_ms = start_event.elapsed_time(end_event)  # ms
        total_seconds = elapsed_ms / 1000.0
    else:
        start_time = time.perf_counter()
        for _ in range(iterations):
            cur = next_idx[cur]
            acc = acc + cur
        total_seconds = time.perf_counter() - start_time

    avg_ns = (total_seconds * 1e9 / iterations) if iterations > 0 else 0.0

    return {
        "working_set_bytes": int(working_set_bytes),
        "pattern": pattern,
        "iterations": int(iterations),
        "device": str(device),
        "total_seconds": total_seconds,
        "avg_ns_per_step": avg_ns,
        "sink": int(acc.item()),  # keeps the chase's result live, as in the CUDA version
    }


def handler(event, context=None):
    """
    Entry point for SeBS.

    For Python benchmarks, SeBS passes:
        event = {
            "input": { ...whatever generate_input returned... },
            ...
        }
    We must return: { "result": <anything JSON-serializable> }
    """

    params = event.get("input", {})

    working_set_bytes = int(params.get("working_set_bytes", 1 << 20))
    pattern = params.get("pattern", "random")
    iterations = int(params.get("iterations", 100_000))
    seed = int(params.get("seed", 42))

    result = pointer_chase(working_set_bytes, pattern, iterations, seed=seed)

    # SeBS expects this shape
    return {
        "result": result
    }
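
As a sanity check on the "random" pattern: since next[perm[i]] = perm[i+1], the indices form a single cycle, so a chase starting at slot 0 returns to 0 only after visiting all n slots. A minimal sketch, not part of this commit (CPU only):

# Hypothetical check, not part of this commit.
import torch

n = 1024
nxt = build_next_indices(n, "random", torch.device("cpu"), seed=42)
cur, steps = 0, 0
while True:
    cur = int(nxt[cur])
    steps += 1
    if cur == 0:
        break
assert steps == n  # the cycle covers every slot exactly once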
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
torch
numpy
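
With those two dependencies installed, an end-to-end local smoke test might look like the sketch below. The bare import assumes the script runs from the benchmark's python/ directory, and no GPU is required since pointer_chase falls back to the CPU:

# Hypothetical local run, not part of this commit.
from function import handler

out = handler({"input": {
    "working_set_bytes": 1 << 16,  # 64 KiB
    "pattern": "stride_4",
    "iterations": 1_000,
    "seed": 42,
}})
res = out["result"]
print(res["device"], round(res["avg_ns_per_step"], 1), "ns/step")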
