Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions scripts/eval_from_generations.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
app = modal.App("eval_from_generations_modal")
gpu_arch_mapping = {"L40S": ["Ada"], "H100": ["Hopper"], "A100": ["Ampere"], "L4": ["Ada"], "T4": ["Turing"], "A10G": ["Ampere"]}

cuda_version = "12.8.0" # should be no greater than host CUDA version
cuda_version = "13.0.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
Expand All @@ -71,7 +71,7 @@
)

.uv_sync(uv_project_dir=REPO_TOP_DIR)
.run_commands("git clone -b tk-v2 https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
.run_commands("git clone -b main https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
.env({
"THUNDERKITTENS_ROOT": "/root/ThunderKittens",
"PYTHONPATH": "/root/src:/root"
Expand Down
8 changes: 4 additions & 4 deletions scripts/generate_and_eval_single_sample_modal.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
'''
Example Usage:
python scripts/generate_and_eval_single_sample_modal.py dataset_src=huggingfac level=1 problem_id=1 eval_mode=modal gpu=L40S
server_type=deepseek model_name=deepseek-coder max_tokens=4096 temperature=0.0
uv run python scripts/generate_and_eval_single_sample_modal.py dataset_src=huggingface level=1 problem_id=1 eval_mode=modal gpu=L40S
server_type=gemini model_name=gemini-2.5-flash max_tokens=4096 temperature=0.0
'''

import pydra
Expand Down Expand Up @@ -89,7 +89,7 @@ def verbose_logging(self):
def __repr__(self):
    """Return a debug string of the form ``EvalConfig(<to_dict() output>)``."""
    config_fields = self.to_dict()
    return "EvalConfig({})".format(config_fields)

cuda_version = "12.8.0" # should be no greater than host CUDA version
cuda_version = "13.0.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
Expand All @@ -105,7 +105,7 @@ def __repr__(self):
)

.uv_sync(uv_project_dir=REPO_TOP_DIR, extras=["gpu"])
.run_commands("git clone -b tk-v2 https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
.run_commands("git clone -b main https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
.env({
"THUNDERKITTENS_ROOT": "/root/ThunderKittens",
"PYTHONPATH": "/root:/root/src"
Expand Down
74 changes: 3 additions & 71 deletions scripts/generate_baseline_time.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,8 @@
import torch
import numpy as np
from kernelbench.eval import (
load_original_model_and_inputs,
set_seed,
fetch_ref_arch_from_problem_id,
)
from kernelbench.timing import (
get_timing_function,
get_timing_stats,
)
from kernelbench.eval import fetch_ref_arch_from_problem_id
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Import of 'fetch_ref_arch_from_problem_id' is not used.

Suggested change
from kernelbench.eval import fetch_ref_arch_from_problem_id

Copilot uses AI. Check for mistakes.
from kernelbench.dataset import construct_kernelbench_dataset, fetch_ref_arch_from_dataset
from kernelbench.timing import measure_ref_program_time
from kernelbench.utils import read_file
import os
import json
Expand Down Expand Up @@ -48,67 +41,6 @@
TIMING_DIR = os.path.join(REPO_TOP_PATH, "results", "timing")


def measure_program_time(
    ref_arch_name: str,
    ref_arch_src: str,
    num_trials: int = 100,
    use_torch_compile: bool = False,
    torch_compile_backend: str="inductor",
    torch_compile_options: str="default",
    device: torch.device="cuda:0",
    verbose: bool = False,
    timing_method: str = "cuda_event",
) -> dict:
    """
    Time a KernelBench reference architecture and return its runtime statistics.

    The reference source is loaded through ``load_original_model_and_inputs``;
    the model is then built, optionally wrapped with ``torch.compile``, moved
    to ``device`` and timed with the function selected by ``timing_method``.

    Args:
        ref_arch_name: Name used only in the printed log messages.
        ref_arch_src: Source code of the reference architecture.
        num_trials: Number of trials forwarded to the timing function.
        use_torch_compile: Wrap the model with ``torch.compile`` when True,
            otherwise run it as constructed.
        torch_compile_backend: ``backend`` argument given to ``torch.compile``.
        torch_compile_options: ``mode`` argument given to ``torch.compile``.
        device: CUDA device used for tensors, the model and synchronization.
        verbose: Forwarded to the timing function; also prints the stats.
        timing_method: Key passed to ``get_timing_function``.

    Returns:
        The value produced by ``get_timing_stats``. If an exception is raised
        during measurement, the error is printed and ``None`` is returned.
        Errors raised while loading the reference source are not caught.
    """

    def _on_device(values):
        # Only tensors are moved; every other value is passed through as-is.
        return [
            item.cuda(device=device) if isinstance(item, torch.Tensor) else item
            for item in values
        ]

    exec_context = {}
    Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
        ref_arch_src, exec_context
    )
    try:
        with torch.no_grad():
            torch.cuda.synchronize(device=device)

            # Re-seed before each generator call so both draws are reproducible.
            set_seed(42)
            raw_inputs = get_inputs()
            set_seed(42)
            raw_init_inputs = get_init_inputs()
            inputs = _on_device(raw_inputs)
            init_inputs = _on_device(raw_init_inputs)

            # Construct the reference model; this instance is what runs in eager mode.
            model = Model(*init_inputs)

            if not use_torch_compile:
                print(f"Using PyTorch Eager Execution on {ref_arch_name}")
            else:
                print(f"Using torch.compile to compile model {ref_arch_name} with {torch_compile_backend} backend and {torch_compile_options} mode")
                model = torch.compile(
                    model,
                    backend=torch_compile_backend,
                    mode=torch_compile_options,
                )

            model = model.cuda(device=device)
            torch.cuda.synchronize(device=device)

            # Look up the requested timing routine and collect per-trial timings.
            run_timing = get_timing_function(timing_method)
            elapsed_times = run_timing(
                model,
                inputs,
                num_trials=num_trials,
                verbose=verbose,
                device=device,
            )
            runtime_stats = get_timing_stats(elapsed_times, device=device)

            if verbose:
                print(f"{ref_arch_name} {runtime_stats}")

            return runtime_stats
    except Exception as e:
        # Best-effort: report the failure and let the caller receive None.
        print(f"[Eval] Error in Measuring Performance: {e}")
        return None



def record_baseline_times(use_torch_compile: bool = False,
torch_compile_backend: str="inductor",
Expand All @@ -129,7 +61,7 @@ def record_baseline_times(use_torch_compile: bool = False,
num_problems = len(dataset)
for problem_id in tqdm(dataset.get_problem_ids()):
ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset(dataset, problem_id)
runtime_stats = measure_program_time(
runtime_stats = measure_ref_program_time(
ref_arch_name=ref_arch_name,
ref_arch_src=ref_arch_src,
use_torch_compile=use_torch_compile,
Expand Down
66 changes: 14 additions & 52 deletions scripts/generate_baseline_time_modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def __init__(self):
import modal
app = modal.App("generate_baseline_modal")
gpu_arch_mapping = {"L40S": ["Ada"], "H100": ["Hopper"], "A100": ["Ampere"], "A100-80GB": ["Ampere"], "L4": ["Ada"], "T4": ["Turing"], "A10G": ["Ampere"]}
cuda_version = "12.8.0" # should be no greater than host CUDA version
cuda_version = "13.0.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
Expand Down Expand Up @@ -141,58 +141,20 @@ def measure_program_time(
device: torch.device = torch.cuda.current_device() if torch.cuda.is_available() else None,
verbose: bool = False,
):
"""
Measure the time of a KernelBench reference architecture
"""
context = {}
Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
ref_arch_src, context
from kernelbench.timing import measure_ref_program_time
return measure_ref_program_time(
ref_arch_name=ref_arch_name,
ref_arch_src=ref_arch_src,
num_trials=num_trials,
num_warmup=3,
discard_first=1,
timing_method=timing_method,
use_torch_compile=use_torch_compile,
torch_compile_backend=torch_compile_backend,
torch_compile_options=torch_compile_options,
device=device,
verbose=verbose,
)
try:
with torch.no_grad():
torch.cuda.synchronize(device=device)
set_seed(42)
inputs = get_inputs()
set_seed(42)
init_inputs = get_init_inputs()
inputs = [
x.cuda(device=device) if isinstance(x, torch.Tensor) else x
for x in inputs
]
init_inputs = [
x.cuda(device=device) if isinstance(x, torch.Tensor) else x
for x in init_inputs
]

# Initialize PyTorch model, use this for eager mode execution
model = Model(*init_inputs)

if use_torch_compile:
print(f"Using torch.compile to compile model {ref_arch_name} with {torch_compile_backend} backend and {torch_compile_options} mode")
model = torch.compile(model, backend=torch_compile_backend, mode=torch_compile_options)
else:
print(f"Using PyTorch Eager Execution on {ref_arch_name}")

model = model.cuda(device=device)
timing_func = get_timing_function(timing_method)
torch.cuda.synchronize(device=device)
elapsed_times = timing_func(
model,
inputs,
num_warmup=3, # or any default you prefer
num_trials=num_trials,
discard_first=1, # or 0 to include first trial
verbose=verbose,
device=device,
)
runtime_stats = get_timing_stats(elapsed_times, device=device)

if verbose:
print(f"{ref_arch_name} {runtime_stats}")

return runtime_stats
except Exception as e:
print(f"[Eval] Error in Measuring Performance: {e}")

def record_baseline_times(config: BaselineConfig,
use_torch_compile: bool = False,
Expand Down
68 changes: 2 additions & 66 deletions scripts/get_baseline_time_single_problem.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,6 @@
import torch
import numpy as np
from kernelbench.eval import (
load_original_model_and_inputs,
set_seed,
fetch_ref_arch_from_problem_id,
)

from src.timing import get_timing_function, get_timing_stats

def measure_program_time(
    ref_arch_name: str,
    ref_arch_src: str,
    num_trials: int = 100,
    timing_method: str="cuda_event",
    use_torch_compile: bool = False,
    torch_compile_backend: str="inductor",
    torch_compile_options: str="default",
    device: torch.device="cuda:0",
    verbose: bool = False,
) -> dict:
    """
    Time a single KernelBench reference architecture.

    Loads the reference source via ``load_original_model_and_inputs``, builds
    the model (optionally wrapped with ``torch.compile``), moves it to
    ``device`` and times it with 3 warmup runs and the first trial discarded.

    Args:
        ref_arch_name: Name used only in the printed log messages.
        ref_arch_src: Source code of the reference architecture.
        num_trials: Number of trials forwarded to the timing function.
        timing_method: Key passed to ``get_timing_function``.
        use_torch_compile: Wrap the model with ``torch.compile`` when True.
        torch_compile_backend: ``backend`` argument given to ``torch.compile``.
        torch_compile_options: ``mode`` argument given to ``torch.compile``.
        device: CUDA device used for tensors, the model and synchronization.
        verbose: Forwarded to the timing function; also prints the stats.

    Returns:
        The value produced by ``get_timing_stats``. If an exception is raised
        during measurement, the error is printed and ``None`` is returned.
        Errors raised while loading the reference source are not caught.
    """

    def _place(values):
        # Tensors go to the target device; non-tensor values are left untouched.
        return [
            entry.cuda(device=device) if isinstance(entry, torch.Tensor) else entry
            for entry in values
        ]

    load_context = {}
    Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
        ref_arch_src, load_context
    )
    try:
        with torch.no_grad():
            torch.cuda.synchronize(device=device)

            # Same seed before each generator call keeps both draws reproducible.
            set_seed(42)
            generated_inputs = get_inputs()
            set_seed(42)
            generated_init_inputs = get_init_inputs()
            inputs = _place(generated_inputs)
            init_inputs = _place(generated_init_inputs)

            # Build the reference model from its init inputs.
            model = Model(*init_inputs)

            if not use_torch_compile:
                print(f"Using PyTorch Eager Execution on {ref_arch_name}")
            else:
                print(f"Using torch.compile to compile model {ref_arch_name} with {torch_compile_backend} backend and {torch_compile_options} mode")
                model = torch.compile(
                    model,
                    backend=torch_compile_backend,
                    mode=torch_compile_options,
                )

            model = model.cuda(device=device)
            torch.cuda.synchronize(device=device)

            timing_func = get_timing_function(timing_method)
            elapsed_times = timing_func(
                model,
                inputs,
                num_warmup=3,
                num_trials=num_trials,
                discard_first=1,
                verbose=verbose,
                device=device,
            )
            runtime_stats = get_timing_stats(elapsed_times, device=device)

            if verbose:
                print(f"{ref_arch_name} {runtime_stats}")

            return runtime_stats
    except Exception as e:
        # Best-effort: report the failure and let the caller receive None.
        print(f"[Eval] Error in Measuring Performance: {e}")
        return None
from kernelbench.timing import measure_ref_program_time

if __name__ == "__main__":
ref_arch_name = "softmax"
Expand All @@ -89,4 +25,4 @@ def get_inputs():
def get_init_inputs():
return [] # No special initialization inputs needed
"""
print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False, timing_method="cuda_event"))
print(measure_ref_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False, timing_method="cuda_event"))
35 changes: 18 additions & 17 deletions scripts/run_and_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from kernelbench import eval as kernel_eval
from kernelbench import utils as kernel_utils
from scripts.generate_baseline_time import measure_program_time
from kernelbench.timing import measure_ref_program_time
from kernelbench.utils import read_file
from kernelbench.kernel_static_checker import validate_kernel_static

Expand All @@ -26,7 +26,7 @@

REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))

cuda_version = "12.8.0"
cuda_version = "13.0.0"
flavor = "devel"
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
Expand All @@ -38,8 +38,8 @@
image = (
modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
.apt_install("git", "gcc-10", "g++-10", "clang")
.uv_sync(uv_project_dir=REPO_TOP_PATH)
.run_commands("git clone -b tk-v2 https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
.uv_sync(uv_project_dir=REPO_TOP_PATH, extras=["gpu"])
.run_commands("git clone -b main https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
.env({
"THUNDERKITTENS_ROOT": "/root/ThunderKittens",
"PYTHONPATH": "/root:/root/src:/root/scripts"
Expand Down Expand Up @@ -223,25 +223,22 @@ def measure_program_time_modal(
ref_arch_src: str,
num_trials: int,
use_torch_compile: bool,
torch_compile_backend: str,
torch_compile_options: str,
gpu_arch: list
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function signature is missing required parameters. Based on the function call at line 242 and usage patterns shown elsewhere, the function should include parameters like precision (or a configs dict), and possibly torch_compile_backend and torch_compile_options to match how the function is called from outside the diff context. The current signature will cause errors when the function is invoked with these additional parameters.

Suggested change
gpu_arch: list
gpu_arch: list,
configs: dict,

Copilot uses AI. Check for mistakes.
):
"""Measure the execution time of a reference program on Modal"""
from scripts.generate_baseline_time import measure_program_time
from kernelbench.timing import measure_ref_program_time
from kernelbench.utils import set_gpu_arch

set_gpu_arch(gpu_arch)
device = torch.device("cuda:0")

return measure_program_time(
return measure_ref_program_time(
ref_arch_name="Reference Program",
ref_arch_src=ref_arch_src,
num_trials=num_trials,
use_torch_compile=use_torch_compile,
torch_compile_backend=torch_compile_backend,
torch_compile_options=torch_compile_options,
device=device
verbose=False,
device=device,
)


Expand Down Expand Up @@ -311,21 +308,25 @@ def main(config: ScriptConfig):
# Measure baseline time
print("[INFO] Measuring reference program time")
# Default using PyTorch Eager here
ref_time_eager_result = measure_program_time(ref_arch_name="Reference Program",
ref_time_eager_result = measure_ref_program_time(ref_arch_name="Reference Program",
ref_arch_src=ref_arch_src,
num_trials=config.num_perf_trials,
use_torch_compile=False,
device=device)
timing_method=config.timing_method,
device=device,
verbose=False,
)
ref_exec_eager_time = ref_time_eager_result.get("mean", None)

# Measure Torch Compile time
ref_time_compile_result = measure_program_time(ref_arch_name="Reference Program",
ref_time_compile_result = measure_ref_program_time(ref_arch_name="Reference Program",
ref_arch_src=ref_arch_src,
num_trials=config.num_perf_trials,
use_torch_compile=True,
torch_compile_backend="inductor",
torch_compile_options="default",
device=device)
timing_method=config.timing_method,
device=device,
verbose=False,
)
ref_exec_compile_time = ref_time_compile_result.get("mean", None)

elif config.eval_mode == "modal":
Expand Down
Loading