Merged
Changes from 9 commits
2 changes: 1 addition & 1 deletion .env.example
@@ -9,7 +9,7 @@ OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-api03-...

# Google Gemini
-GEMINI_API_KEY=...
+GEMINI_API_KEY=AIzaSyC2rq0xg1Ucpr7cXF5jh82RVBLi8MdnPcU
Collaborator:
I'll do a proper review later! This looks exciting!

However, please fix this!!!!

Collaborator:
@ethanbc Key is disabled now; please DO NOT post API keys anywhere in public.

Collaborator:
@pythonomar22 A decent rule of thumb is to modify the .env file only when you're experimenting or plugging in keys, and to keep .env in the .gitignore. When you're done, update .env.example if you added or removed env variables.

This prevents this kind of thing from happening :)
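For illustration, a minimal version of that setup (a sketch only; the ignore pattern is an assumption about this repo's .gitignore, and the committed template holds placeholders, never real keys):

```
# .gitignore: keep the real secrets file out of version control
.env

# .env.example: committed template with placeholder values only
GEMINI_API_KEY=...
DEEPSEEK_API_KEY=sk-...
```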


# DeepSeek
DEEPSEEK_API_KEY=sk-...
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -13,7 +13,9 @@ dependencies = [
"torch==2.9.0",

"transformers",
"datasets",
"datasets>=3.0.0",
"pyarrow",
"numpy<2",
"modal",

# helper
45 changes: 33 additions & 12 deletions scripts/benchmark_eval_analysis.py
@@ -53,7 +53,7 @@ def patch(eval_results, dataset):
"""
Patch the eval results with the dataset
"""
-    for pid in range(1, len(dataset) + 1):
+    for pid in dataset.get_problem_ids():
if str(pid) not in eval_results:
eval_results[str(pid)] = {
"sample_id": 0,
@@ -161,19 +161,40 @@ def analyze_greedy_eval(run_name, hardware, baseline, level,
)

# Extract the speedup values
-    is_correct = np.array([entry["correctness"] for entry in eval_results.values()])
-    baseline_speed = np.array(
-        [entry["mean"] for entry in baseline_results[f"level{level}"].values()]
-    )
-    actual_speed = np.array([entry["runtime"] for entry in eval_results.values()])
+    is_correct_list = []
+    baseline_speed_list = []
+    actual_speed_list = []
+
+    # Sort problem IDs to ensure consistent order
+    sorted_pids = sorted(dataset.get_problem_ids())
+
+    for pid in sorted_pids:
+        # Get eval result
+        if str(pid) not in eval_results:
+            print(f"Warning: Problem {pid} not found in eval results")
+            continue
+        eval_entry = eval_results[str(pid)]
+
+        # Get baseline result
+        problem = dataset.get_problem_by_id(pid)
+        problem_name = problem.name
+
+        if problem_name not in baseline_results[f"level{level}"]:
+            print(f"Warning: Problem {problem_name} not found in baseline results")
+            continue
+
+        baseline_entry = baseline_results[f"level{level}"][problem_name]
+
+        is_correct_list.append(eval_entry["correctness"])
+        actual_speed_list.append(eval_entry["runtime"])
+        baseline_speed_list.append(baseline_entry["mean"])
+
+    is_correct = np.array(is_correct_list)
+    baseline_speed = np.array(baseline_speed_list)
+    actual_speed = np.array(actual_speed_list)
n = len(is_correct)

-    assert (
-        len(baseline_speed) == n
-    ), "Baseline speedup values do not match the number of eval results"
-    assert (
-        len(actual_speed) == n
-    ), "Actual speedup values do not match the number of eval results"
print(f"Aligned {n} problems for analysis")

# Calculate the metrics
gmsr_correct = geometric_mean_speed_ratio_correct_only(
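An aside for readers: geometric_mean_speed_ratio_correct_only is only named in this diff, not defined. A plausible reading, consistent with the aligned arrays built above, is the geometric mean of baseline/actual runtime ratios over problems marked correct. The sketch below is an assumption about that function, not the repo's implementation, and the signature is guessed from the call site:

```python
import numpy as np

def geometric_mean_speed_ratio_correct_only(is_correct, baseline_speed, actual_speed, n):
    """Hypothetical sketch: geometric mean of speedups (baseline / actual) over correct problems."""
    ratios = [
        b / a
        for ok, b, a in zip(is_correct[:n], baseline_speed[:n], actual_speed[:n])
        if ok and a > 0
    ]
    if not ratios:
        return 0.0  # no correct problems; a real implementation may handle this differently
    # Geometric mean computed in log space for numerical stability
    return float(np.exp(np.mean(np.log(ratios))))
```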
85 changes: 28 additions & 57 deletions scripts/eval_from_generations.py
@@ -12,7 +12,6 @@
import pydra
import torch

-from datasets import load_dataset
from pydra import Config, REQUIRED

# Import only what we need
@@ -255,36 +254,17 @@ def evaluate_single_sample_modal(


def fetch_ref_arch_from_problem_id(
-    dataset, problem_id: int, dataset_src: str
+    dataset, problem_id: int, dataset_src: str = None
) -> str | None:
    """
-    Fetch reference architecture from problem directory
-    Either from Hugging Face or Local Dataset
+    Fetch reference architecture from problem directory.
+    Uses the unified dataset interface.
+
+    Note: dataset_src parameter is kept for backward compatibility but ignored
+    since the dataset object already handles both sources.
    """
-    if dataset_src == "huggingface":
-        curr_problem_row = dataset.filter(
-            lambda x: x["problem_id"] == problem_id, num_proc=None, desc=None
-        )
-        ref_arch_src = curr_problem_row["code"][0]
-        problem_name = curr_problem_row["name"][0]
-
-    elif dataset_src == "local":
-        problem_idx_in_dataset = (
-            problem_id - 1
-        )  # due to dataset list being 0-indexed locally
-        ref_arch_path = dataset[problem_idx_in_dataset]
-
-        problem_name = os.path.basename(ref_arch_path)
-        ref_arch_src = read_file(ref_arch_path)
-
-        # verify
-        # Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py")
-        problem_number = int(problem_name.split("_")[0])
-        assert (
-            problem_number == problem_id
-        ), f"Problem number in filename ({problem_number}) does not match config problem_id ({problem_id})"
-
-    return ref_arch_src
+    problem = dataset.get_problem_by_id(problem_id)
+    return problem.code


def fetch_kernel_from_disk(
@@ -822,57 +802,48 @@ def main(config: EvalConfig):
if mp.get_start_method(allow_none=True) is None:
mp.set_start_method("spawn")

-    # Dataset Configurations
-    if config.dataset_src == "huggingface":
-        dataset = load_dataset(config.dataset_name)
-        curr_level_dataset = dataset[f"level_{config.level}"]
-    elif config.dataset_src == "local":
-        curr_level_dataset = construct_kernelbench_dataset(config.level)
-
-    num_problems_in_level = len(curr_level_dataset)
-
-    # Determine which problem IDs to evaluate
-    # you can either specify a list of problem IDs (prioritize) or a subset range
-    # NOTE: later once the dataset PR is in we will link the representative subset as a built-in preset too
-    if config.problem_ids is not None:
-        # Use specific problem IDs if provided
-        problem_id_list = config.problem_ids
-        for pid in problem_id_list:
-            assert 1 <= pid <= num_problems_in_level, f"Problem ID {pid} out of range for Level {config.level}"
-    elif config.subset == (None, None):
-        problem_id_list = list(range(1, num_problems_in_level + 1))
+    # Dataset Configurations - Unified loading
+    dataset = construct_kernelbench_dataset(
+        level=config.level,
+        source=config.dataset_src,
+        dataset_name=config.dataset_name,
+    )
+
+    all_problem_ids = dataset.get_problem_ids()
+
+    if config.subset == (None, None):
+        problem_ids_to_run = all_problem_ids
else:
-        assert (
-            config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level
-        ), f"Subset range {config.subset} out of range for Level {config.level}"
-        problem_id_list = list(range(config.subset[0], config.subset[1] + 1))
+        start, end = config.subset
+        problem_ids_to_run = [pid for pid in all_problem_ids if start <= pid <= end]
+        if not problem_ids_to_run:
+            print(f"Warning: No problems found in subset range {config.subset}")
Copilot AI (Jan 6, 2026):

Inconsistent indentation: this line uses extra spaces while the rest of the file uses standard indentation. Should align with line 818 above.

Suggested change:
-             print(f"Warning: No problems found in subset range {config.subset}")
+            print(f"Warning: No problems found in subset range {config.subset}")

print(
f"Evaluating {config.num_samples_per_problem} sample(s) each for level {config.level} problems: {problem_id_list}"
f"Evaluating {config.num_samples_per_problem} sample(s) each for level {config.level} problems: {problem_ids_to_run}"
)

run_dir = os.path.join(config.runs_dir, config.run_name)
eval_file_path = os.path.join(run_dir, f"eval_results.json")

# To Debug
-    # single_eval_example(config, curr_level_dataset, run_dir, eval_file_path)
+    # single_eval_example(config, dataset, run_dir, eval_file_path)

total_work = []
-    for problem_id in problem_id_list:
+    for problem_id in problem_ids_to_run:
for sample_id in range(config.num_samples_per_problem):
if not check_if_eval_exists_local(problem_id, sample_id, eval_file_path):
total_work.append((problem_id, sample_id))

print(
f"Start evaluation on {len(total_work)} unevaluated samples"
f" for problems: {problem_id_list}"
f" in range: {problem_ids_to_run}"
)
# Build Cache on CPU as that is faster (only for local mode)
if config.build_cache and config.eval_mode == "local":
compile.batch_compile(total_work, config.to_dict())

# Batch Eval on multiple GPUs in parallel
-    batch_eval(total_work, config, curr_level_dataset, run_dir, eval_file_path)
+    batch_eval(total_work, config, dataset, run_dir, eval_file_path)

# Calculate pass@k metrics if multiple samples per problem were evaluated
if config.num_samples_per_problem > 1:
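A note for readers following the refactor: the unified interface these scripts now rely on (construct_kernelbench_dataset, get_problem_ids, get_problem_by_id, and problem objects exposing .name and .code) lives in kernelbench.dataset, which is not part of this diff. Below is a minimal sketch consistent with the call sites above, assuming local problem files named `<id>_<name>.py`; every structural detail, including the directory layout, is inferred rather than taken from the repo:

```python
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Problem:
    problem_id: int
    name: str  # e.g. "1_Square_matrix_multiplication_.py"
    code: str  # reference architecture source

class KernelBenchDataset:
    """Sketch of the interface the scripts in this PR call into."""

    def __init__(self, problems: dict[int, Problem]):
        self._problems = problems

    def __len__(self) -> int:
        return len(self._problems)

    def get_problem_ids(self) -> list[int]:
        return sorted(self._problems)

    def get_problem_by_id(self, problem_id: int) -> Problem:
        return self._problems[problem_id]

def construct_kernelbench_dataset(level, source="local", dataset_name=None):
    # Local-mode sketch only: the problem id is the leading integer in the filename.
    # The directory layout here is an assumption for illustration.
    problems = {}
    for path in Path(f"KernelBench/level{level}").glob("*.py"):
        pid = int(path.name.split("_")[0])
        problems[pid] = Problem(pid, path.name, path.read_text())
    return KernelBenchDataset(problems)
```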
54 changes: 13 additions & 41 deletions scripts/generate_and_eval_single_sample.py
@@ -5,16 +5,12 @@
import json
import modal

-from datasets import load_dataset
-
from kernelbench.dataset import construct_kernelbench_dataset
from kernelbench.eval import eval_kernel_against_ref
from kernelbench.prompt_constructor_toml import get_prompt_for_backend, get_custom_prompt
from kernelbench.utils import (
create_inference_server_from_presets,
extract_first_code,
-    query_server,
-    read_file,
set_gpu_arch,
)
from kernelbench.eval import get_torch_dtype_from_string
@@ -116,13 +112,14 @@ def main(config: EvalConfig):

print(f"Starting Eval with config: {config}")

-    # Configurations
-
-    if config.dataset_src == "huggingface":
-        dataset = load_dataset(config.dataset_name)
-        curr_level_dataset = dataset[f"level_{config.level}"]
-    elif config.dataset_src == "local":
-        curr_level_dataset = construct_kernelbench_dataset(config.level)
+    # Configurations - Unified dataset loading (works for both HF and local)
+    from kernelbench.dataset import construct_kernelbench_dataset
+
+    dataset = construct_kernelbench_dataset(
+        level=config.level,
+        source=config.dataset_src,
+        dataset_name=config.dataset_name,
+    )

if config.gpu_arch:
set_gpu_arch(config.gpu_arch) # otherwise build for all architectures
@@ -131,41 +128,16 @@ def main(config: EvalConfig):
os.makedirs(config.logdir, exist_ok=True)

# Problem Checks
-    num_problems = len(curr_level_dataset)
+    num_problems = len(dataset)
print(f"Number of problems in Level {config.level}: {num_problems}")
print(
f"Start Generation + Evaluation for Level {config.level} Problem {config.problem_id}"
)

-    assert (
-        config.problem_id <= num_problems
-    ), f"Problem ID {config.problem_id} out of range for Level {config.level}"
-
-    # TODO: refactor dataset fetching logic to be as clean as posisble.
-    # 1. Fetch Problem
-    if config.dataset_src == "huggingface":
-
-        curr_problem_row = curr_level_dataset.filter(
-            lambda x: x["problem_id"] == config.problem_id
-        )
-        ref_arch_src = curr_problem_row["code"][0]
-        problem_name = curr_problem_row["name"][0]
-
-    elif config.dataset_src == "local":
-        problem_idx_in_dataset = (
-            config.problem_id - 1
-        )  # due to dataset list being 0-indexed locally
-        ref_arch_path = curr_level_dataset[problem_idx_in_dataset]
-
-        problem_name = os.path.basename(ref_arch_path)
-        ref_arch_src = read_file(ref_arch_path)
-        # import pdb; pdb.set_trace()
-
-        # Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py")
-        problem_number = int(problem_name.split("_")[0])
-        assert (
-            problem_number == config.problem_id
-        ), f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})"
+    # Fetch problem - unified interface, no branching needed
+    problem = dataset.get_problem_by_id(config.problem_id)
+    ref_arch_src = problem.code
+    problem_name = problem.name

# 2. Generate Sample
# Create inference function with config parameters
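With that interface, the fetch logic in both single-sample scripts reduces to the same three calls. A hypothetical usage, assuming the sketch given earlier (the level, id, and file name are illustrative only):

```python
# Hypothetical usage of the unified dataset interface
dataset = construct_kernelbench_dataset(level=1, source="local")
print(f"Number of problems in Level 1: {len(dataset)}")

problem = dataset.get_problem_by_id(1)
ref_arch_src = problem.code  # reference architecture source
problem_name = problem.name  # e.g. "1_Square_matrix_multiplication_.py"
```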
44 changes: 13 additions & 31 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -11,10 +11,8 @@
import json
import modal

-from datasets import load_dataset
-
-#from src.dataset import construct_kernelbench_dataset
-from kernelbench.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets
+from kernelbench.dataset import construct_kernelbench_dataset
+from kernelbench.utils import extract_first_code, query_server, set_gpu_arch, create_inference_server_from_presets
Copilot AI (Jan 6, 2026):

Import of 'query_server' is not used.
Import of 'set_gpu_arch' is not used.

Suggested change:
-from kernelbench.utils import extract_first_code, query_server, set_gpu_arch, create_inference_server_from_presets
+from kernelbench.utils import extract_first_code, create_inference_server_from_presets

app = modal.App("eval_single_sample")

@@ -155,41 +153,25 @@ def main(config: EvalConfig):

print(f"Starting Eval with config: {config}")

-    # Configurations
-
-    if config.dataset_src == "huggingface":
-        dataset = load_dataset(config.dataset_name)
-        curr_level_dataset = dataset[f"level_{config.level}"]
+    # Configurations - Unified dataset loading (works for both HF and local)
+    dataset = construct_kernelbench_dataset(
+        level=config.level,
+        source=config.dataset_src,
+        dataset_name=config.dataset_name,
+    )

if config.log:
os.makedirs(config.logdir, exist_ok=True)

# Problem Checks
-    num_problems = len(curr_level_dataset)
+    num_problems = len(dataset)
print(f"Number of problems in Level {config.level}: {num_problems}")
print(f"Start Generation + Evaluation for Level {config.level} Problem {config.problem_id}")

-    assert config.problem_id <= num_problems, f"Problem ID {config.problem_id} out of range for Level {config.level}"
-
-
-    # 1. Fetch Problem
-    if config.dataset_src == "huggingface":
-
-        curr_problem_row = curr_level_dataset.filter(lambda x: x["problem_id"] == config.problem_id)
-        ref_arch_src = curr_problem_row["code"][0]
-        problem_name = curr_problem_row["name"][0]
-
-    elif config.dataset_src == "local":
-        problem_idx_in_dataset = config.problem_id - 1  # due to dataset list being 0-indexed locally
-        ref_arch_path = curr_level_dataset[problem_idx_in_dataset]
-
-        problem_name = os.path.basename(ref_arch_path)
-        ref_arch_src = read_file(ref_arch_path)
-        # import pdb; pdb.set_trace()
-
-        # Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py")
-        problem_number = int(problem_name.split("_")[0])
-        assert problem_number == config.problem_id, f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})"
+    # Fetch problem - unified interface, no branching needed
+    problem = dataset.get_problem_by_id(config.problem_id)
+    ref_arch_src = problem.code
+    problem_name = problem.name


# 2. Generate Sample
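For context, the modal variant of this script runs the evaluation on a remote GPU via Modal (the file defines app = modal.App("eval_single_sample")). A minimal sketch of that pattern using Modal's public API; the GPU type, image contents, and function body below are assumptions for illustration, not this repo's code:

```python
import modal

app = modal.App("eval_single_sample")
image = modal.Image.debian_slim().pip_install("torch")

@app.function(gpu="L40S", image=image)
def eval_remote(ref_arch_src: str, kernel_src: str) -> dict:
    # Placeholder body: the real script evaluates the generated kernel
    # against the reference architecture (eval_kernel_against_ref).
    return {"correctness": True, "runtime_ms": 1.0}

@app.local_entrypoint()
def main():
    # Runs eval_remote on Modal's infrastructure and prints the result locally.
    result = eval_remote.remote("<reference source>", "<generated kernel source>")
    print(result)
```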