ScalingIntelligence · simonguozirui · Jan 20, 2026 · Jan 19, 2026 · Jan 19, 2026 · Jan 19, 2026
diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py
@@ -53,7 +53,7 @@
 app = modal.App("eval_from_generations_modal")
 gpu_arch_mapping = {"L40S": ["Ada"], "H100": ["Hopper"], "A100": ["Ampere"], "L4": ["Ada"], "T4": ["Turing"], "A10G": ["Ampere"]}
 
-cuda_version = "12.8.0"  # should be no greater than host CUDA version
+cuda_version = "13.0.0"  # should be no greater than host CUDA version
 flavor = "devel"  #  includes full CUDA toolkit
 operating_sys = "ubuntu22.04"
 tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -71,7 +71,7 @@
                 )
 
     .uv_sync(uv_project_dir=REPO_TOP_DIR)
-    .run_commands("git clone -b tk-v2 https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
+    .run_commands("git clone -b main https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
     .env({
         "THUNDERKITTENS_ROOT": "/root/ThunderKittens",
         "PYTHONPATH": "/root/src:/root"

diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py
@@ -1,7 +1,7 @@
 '''
 Example Usage:
-python scripts/generate_and_eval_single_sample_modal.py dataset_src=huggingfac level=1 problem_id=1 eval_mode=modal gpu=L40S 
-    server_type=deepseek model_name=deepseek-coder max_tokens=4096 temperature=0.0
+uv run python scripts/generate_and_eval_single_sample_modal.py dataset_src=huggingface level=1 problem_id=1 eval_mode=modal gpu=L40S 
+    server_type=gemini model_name=gemini-2.5-flash max_tokens=4096 temperature=0.0
 '''
 
 import pydra
@@ -89,7 +89,7 @@ def verbose_logging(self):
     def __repr__(self):
         return f"EvalConfig({self.to_dict()})"
 
-cuda_version = "12.8.0"  # should be no greater than host CUDA version
+cuda_version = "13.0.0"  # should be no greater than host CUDA version
 flavor = "devel"  #  includes full CUDA toolkit
 operating_sys = "ubuntu22.04"
 tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -105,7 +105,7 @@ def __repr__(self):
                 )
 
     .uv_sync(uv_project_dir=REPO_TOP_DIR, extras=["gpu"])
-    .run_commands("git clone -b tk-v2 https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
+    .run_commands("git clone -b main https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
     .env({
         "THUNDERKITTENS_ROOT": "/root/ThunderKittens",
         "PYTHONPATH": "/root:/root/src"

diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py
@@ -1,15 +1,8 @@
 import torch
 import numpy as np
-from kernelbench.eval import (
-    load_original_model_and_inputs,
-    set_seed,
-    fetch_ref_arch_from_problem_id,
-)
-from kernelbench.timing import (
-    get_timing_function,
-    get_timing_stats,
-)
+from kernelbench.eval import fetch_ref_arch_from_problem_id
 from kernelbench.dataset import construct_kernelbench_dataset, fetch_ref_arch_from_dataset
+from kernelbench.timing import measure_ref_program_time
 from kernelbench.utils import read_file
 import os
 import json
@@ -48,67 +41,6 @@
 TIMING_DIR = os.path.join(REPO_TOP_PATH, "results", "timing")
 
 
-def measure_program_time(
-        ref_arch_name: str,
-        ref_arch_src: str, 
-        num_trials: int = 100,
-        use_torch_compile: bool = False,
-        torch_compile_backend: str="inductor", 
-        torch_compile_options: str="default",
-        device: torch.device="cuda:0",
-        verbose: bool = False,
-        timing_method: str = "cuda_event",
-) -> dict:
-    """
-    Measure the time of a KernelBench reference architecture
-    """
-    context = {}
-    Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
-        ref_arch_src, context
-    )
-    try:
-        with torch.no_grad():
-            torch.cuda.synchronize(device=device)
-            set_seed(42)
-            inputs = get_inputs()
-            set_seed(42)
-            init_inputs = get_init_inputs()
-            inputs = [
-                x.cuda(device=device) if isinstance(x, torch.Tensor) else x
-                for x in inputs
-            ]
-            init_inputs = [
-                x.cuda(device=device) if isinstance(x, torch.Tensor) else x
-                for x in init_inputs
-            ]
-
-            # Initialize PyTorch model, use this for eager mode execution
-            model = Model(*init_inputs)
-
-            if use_torch_compile:
-                print(f"Using torch.compile to compile model {ref_arch_name} with {torch_compile_backend} backend and {torch_compile_options} mode")
-                model = torch.compile(model, backend=torch_compile_backend, mode=torch_compile_options)
-            else:
-                print(f"Using PyTorch Eager Execution on {ref_arch_name}")
-
-            model = model.cuda(device=device)
-            torch.cuda.synchronize(device=device)
-
-            # run chosen timing function
-            timing_fn = get_timing_function(timing_method)
-            elapsed_times = timing_fn(
-                model, inputs, num_trials=num_trials, verbose=verbose, device=device
-            )
-            runtime_stats = get_timing_stats(elapsed_times, device=device)
-
-            if verbose:
-                print(f"{ref_arch_name} {runtime_stats}")
-
-            return runtime_stats
-    except Exception as e:
-        print(f"[Eval] Error in Measuring Performance: {e}")
-
-
 
 def record_baseline_times(use_torch_compile: bool = False, 
                           torch_compile_backend: str="inductor", 
@@ -129,7 +61,7 @@ def record_baseline_times(use_torch_compile: bool = False,
         num_problems = len(dataset)
         for problem_id in tqdm(dataset.get_problem_ids()):
             ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset(dataset, problem_id)
-            runtime_stats = measure_program_time(
+            runtime_stats = measure_ref_program_time(
                 ref_arch_name=ref_arch_name,
                 ref_arch_src=ref_arch_src,
                 use_torch_compile=use_torch_compile,

diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py
@@ -77,7 +77,7 @@ def __init__(self):
 import modal
 app = modal.App("generate_baseline_modal")
 gpu_arch_mapping = {"L40S": ["Ada"], "H100": ["Hopper"], "A100": ["Ampere"], "A100-80GB": ["Ampere"], "L4": ["Ada"], "T4": ["Turing"], "A10G": ["Ampere"]}
-cuda_version = "12.8.0"  # should be no greater than host CUDA version
+cuda_version = "13.0.0"  # should be no greater than host CUDA version
 flavor = "devel"  #  includes full CUDA toolkit
 operating_sys = "ubuntu22.04"
 tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -141,58 +141,20 @@ def measure_program_time(
             device: torch.device = torch.cuda.current_device() if torch.cuda.is_available() else None,
             verbose: bool = False,
     ):
-        """
-        Measure the time of a KernelBench reference architecture
-        """
-        context = {}
-        Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
-            ref_arch_src, context
+        from kernelbench.timing import measure_ref_program_time
+        return measure_ref_program_time(
+            ref_arch_name=ref_arch_name,
+            ref_arch_src=ref_arch_src,
+            num_trials=num_trials,
+            num_warmup=3,
+            discard_first=1,
+            timing_method=timing_method,
+            use_torch_compile=use_torch_compile,
+            torch_compile_backend=torch_compile_backend,
+            torch_compile_options=torch_compile_options,
+            device=device,
+            verbose=verbose,
         )
-        try:
-            with torch.no_grad():
-                torch.cuda.synchronize(device=device)
-                set_seed(42)
-                inputs = get_inputs()
-                set_seed(42)
-                init_inputs = get_init_inputs()
-                inputs = [
-                    x.cuda(device=device) if isinstance(x, torch.Tensor) else x
-                    for x in inputs
-                ]
-                init_inputs = [
-                    x.cuda(device=device) if isinstance(x, torch.Tensor) else x
-                    for x in init_inputs
-                ]
-
-                # Initialize PyTorch model, use this for eager mode execution
-                model = Model(*init_inputs)
-
-                if use_torch_compile:
-                    print(f"Using torch.compile to compile model {ref_arch_name} with {torch_compile_backend} backend and {torch_compile_options} mode")
-                    model = torch.compile(model, backend=torch_compile_backend, mode=torch_compile_options)
-                else:
-                    print(f"Using PyTorch Eager Execution on {ref_arch_name}")
-
-                model = model.cuda(device=device)
-                timing_func = get_timing_function(timing_method)
-                torch.cuda.synchronize(device=device)
-                elapsed_times = timing_func(
-                    model,
-                    inputs,
-                    num_warmup=3,  # or any default you prefer
-                    num_trials=num_trials,
-                    discard_first=1,  # or 0 to include first trial
-                    verbose=verbose,
-                    device=device,
-                )
-                runtime_stats = get_timing_stats(elapsed_times, device=device)
-
-                if verbose:
-                    print(f"{ref_arch_name} {runtime_stats}")
-
-                return runtime_stats
-        except Exception as e:
-            print(f"[Eval] Error in Measuring Performance: {e}")
 
 def record_baseline_times(config: BaselineConfig,
                           use_torch_compile: bool = False,

diff --git a/scripts/get_baseline_time_single_problem.py b/scripts/get_baseline_time_single_problem.py
@@ -1,70 +1,6 @@
 import torch
 import numpy as np
-from kernelbench.eval import (
-    load_original_model_and_inputs,
-    set_seed,
-    fetch_ref_arch_from_problem_id,
-)
-
-from src.timing import get_timing_function, get_timing_stats
-
-def measure_program_time(
-        ref_arch_name: str,
-        ref_arch_src: str, 
-        num_trials: int = 100,
-        timing_method: str="cuda_event",        
-        use_torch_compile: bool = False,
-        torch_compile_backend: str="inductor", 
-        torch_compile_options: str="default",
-        device: torch.device="cuda:0",
-        verbose: bool = False,
-) -> dict:
-    """
-    Measure the time of a KernelBench reference architecture
-    """
-    context = {}
-    Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
-        ref_arch_src, context
-    )
-    try:
-        with torch.no_grad():
-            torch.cuda.synchronize(device=device)
-            set_seed(42)
-            inputs = get_inputs()
-            set_seed(42)
-            init_inputs = get_init_inputs()
-            inputs = [
-                x.cuda(device=device) if isinstance(x, torch.Tensor) else x
-                for x in inputs
-            ]
-            init_inputs = [
-                x.cuda(device=device) if isinstance(x, torch.Tensor) else x
-                for x in init_inputs
-            ]
-
-            # Initialize PyTorch model, use this for eager mode execution
-            model = Model(*init_inputs)
-
-            if use_torch_compile:
-                print(f"Using torch.compile to compile model {ref_arch_name} with {torch_compile_backend} backend and {torch_compile_options} mode")
-                model = torch.compile(model, backend=torch_compile_backend, mode=torch_compile_options)
-            else:
-                print(f"Using PyTorch Eager Execution on {ref_arch_name}")
-
-            model = model.cuda(device=device)
-            torch.cuda.synchronize(device=device)
-            timing_func = get_timing_function(timing_method )
-            elapsed_times = timing_func(
-                model, inputs, num_warmup=3, num_trials=num_trials, discard_first=1, verbose=verbose, device=device
-            )
-            runtime_stats = get_timing_stats(elapsed_times, device=device)
-
-            if verbose:
-                print(f"{ref_arch_name} {runtime_stats}")
-
-            return runtime_stats
-    except Exception as e:
-        print(f"[Eval] Error in Measuring Performance: {e}")
+from kernelbench.timing import measure_ref_program_time
 
 if __name__ == "__main__":
     ref_arch_name = "softmax"
@@ -89,4 +25,4 @@ def get_inputs():
 def get_init_inputs():
     return []  # No special initialization inputs needed
     """
-    print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False, timing_method="cuda_event"))
+    print(measure_ref_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False, timing_method="cuda_event"))
diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py
@@ -7,7 +7,7 @@
 
 from kernelbench import eval as kernel_eval
 from kernelbench import utils as kernel_utils
-from scripts.generate_baseline_time import measure_program_time
+from kernelbench.timing import measure_ref_program_time
 from kernelbench.utils import read_file
 from kernelbench.kernel_static_checker import validate_kernel_static
 
@@ -26,7 +26,7 @@
 
 REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 
-cuda_version = "12.8.0"
+cuda_version = "13.0.0"
 flavor = "devel"
 operating_sys = "ubuntu22.04"
 tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -38,8 +38,8 @@
 image = (
     modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
     .apt_install("git", "gcc-10", "g++-10", "clang")
-    .uv_sync(uv_project_dir=REPO_TOP_PATH)
-    .run_commands("git clone -b tk-v2 https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
+    .uv_sync(uv_project_dir=REPO_TOP_PATH, extras=["gpu"])
+    .run_commands("git clone -b main https://github.com/HazyResearch/ThunderKittens.git /root/ThunderKittens")
     .env({
         "THUNDERKITTENS_ROOT": "/root/ThunderKittens",
         "PYTHONPATH": "/root:/root/src:/root/scripts"
@@ -223,25 +223,22 @@ def measure_program_time_modal(
         ref_arch_src: str,
         num_trials: int,
         use_torch_compile: bool,
-        torch_compile_backend: str,
-        torch_compile_options: str,
         gpu_arch: list
     ):
         """Measure the execution time of a reference program on Modal"""
-        from scripts.generate_baseline_time import measure_program_time
+        from kernelbench.timing import measure_ref_program_time
         from kernelbench.utils import set_gpu_arch
 
         set_gpu_arch(gpu_arch)
         device = torch.device("cuda:0")
 
-        return measure_program_time(
+        return measure_ref_program_time(
             ref_arch_name="Reference Program",
             ref_arch_src=ref_arch_src,
             num_trials=num_trials,
             use_torch_compile=use_torch_compile,
-            torch_compile_backend=torch_compile_backend,
-            torch_compile_options=torch_compile_options,
-            device=device
+            verbose=False,
+            device=device,
         )
 
 
@@ -311,21 +308,25 @@ def main(config: ScriptConfig):
         # Measure baseline time
         print("[INFO] Measuring reference program time")
         # Default using PyTorch Eager here
-        ref_time_eager_result = measure_program_time(ref_arch_name="Reference Program",
+        ref_time_eager_result = measure_ref_program_time(ref_arch_name="Reference Program",
                                                     ref_arch_src=ref_arch_src,
                                                     num_trials=config.num_perf_trials,
                                                     use_torch_compile=False,
-                                                    device=device)
+                                                    timing_method=config.timing_method,
+                                                    device=device,
+                                                    verbose=False,
+                                                    )
         ref_exec_eager_time = ref_time_eager_result.get("mean", None)
 
         # Measure Torch Compile time
-        ref_time_compile_result = measure_program_time(ref_arch_name="Reference Program",
+        ref_time_compile_result = measure_ref_program_time(ref_arch_name="Reference Program",
                                                     ref_arch_src=ref_arch_src,
                                                     num_trials=config.num_perf_trials,
                                                     use_torch_compile=True,
-                                                    torch_compile_backend="inductor",
-                                                    torch_compile_options="default",
-                                                    device=device)
+                                                    timing_method=config.timing_method,
+                                                    device=device,
+                                                    verbose=False,
+                                                    )
         ref_exec_compile_time = ref_time_compile_result.get("mean", None)
 
     elif config.eval_mode == "modal":