-
Notifications
You must be signed in to change notification settings - Fork 153
Dataset Object #95
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Dataset Object #95
Changes from 9 commits
509ed99
c77955f
660a47f
dfbef11
3e31530
601b980
1058878
7bc4a2f
a2fb76a
0cf6f19
10892cb
97894b9
7a86a5b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -12,7 +12,6 @@ | |||||
| import pydra | ||||||
| import torch | ||||||
|
|
||||||
| from datasets import load_dataset | ||||||
| from pydra import Config, REQUIRED | ||||||
|
|
||||||
| # Import only what we need | ||||||
|
|
@@ -255,36 +254,17 @@ def evaluate_single_sample_modal( | |||||
|
|
||||||
|
|
||||||
| def fetch_ref_arch_from_problem_id( | ||||||
| dataset, problem_id: int, dataset_src: str | ||||||
| dataset, problem_id: int, dataset_src: str = None | ||||||
| ) -> str | None: | ||||||
| """ | ||||||
| Fetch reference architecture from problem directory | ||||||
| Either from Hugging Face or Local Dataset | ||||||
| Fetch reference architecture from problem directory. | ||||||
| Uses the unified dataset interface. | ||||||
|
|
||||||
| Note: dataset_src parameter is kept for backward compatibility but ignored | ||||||
| since the dataset object already handles both sources. | ||||||
| """ | ||||||
| if dataset_src == "huggingface": | ||||||
| curr_problem_row = dataset.filter( | ||||||
| lambda x: x["problem_id"] == problem_id, num_proc=None, desc=None | ||||||
| ) | ||||||
| ref_arch_src = curr_problem_row["code"][0] | ||||||
| problem_name = curr_problem_row["name"][0] | ||||||
|
|
||||||
| elif dataset_src == "local": | ||||||
| problem_idx_in_dataset = ( | ||||||
| problem_id - 1 | ||||||
| ) # due to dataset list being 0-indexed locally | ||||||
| ref_arch_path = dataset[problem_idx_in_dataset] | ||||||
|
|
||||||
| problem_name = os.path.basename(ref_arch_path) | ||||||
| ref_arch_src = read_file(ref_arch_path) | ||||||
|
|
||||||
| # verify | ||||||
| # Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py") | ||||||
| problem_number = int(problem_name.split("_")[0]) | ||||||
| assert ( | ||||||
| problem_number == problem_id | ||||||
| ), f"Problem number in filename ({problem_number}) does not match config problem_id ({problem_id})" | ||||||
|
|
||||||
| return ref_arch_src | ||||||
| problem = dataset.get_problem_by_id(problem_id) | ||||||
| return problem.code | ||||||
|
|
||||||
|
|
||||||
| def fetch_kernel_from_disk( | ||||||
|
|
@@ -822,57 +802,48 @@ def main(config: EvalConfig): | |||||
| if mp.get_start_method(allow_none=True) is None: | ||||||
| mp.set_start_method("spawn") | ||||||
|
|
||||||
| # Dataset Configurations | ||||||
| if config.dataset_src == "huggingface": | ||||||
| dataset = load_dataset(config.dataset_name) | ||||||
| curr_level_dataset = dataset[f"level_{config.level}"] | ||||||
| elif config.dataset_src == "local": | ||||||
| curr_level_dataset = construct_kernelbench_dataset(config.level) | ||||||
|
|
||||||
| num_problems_in_level = len(curr_level_dataset) | ||||||
|
|
||||||
| # Determine which problem IDs to evaluate | ||||||
| # you can either specify a list of problem IDs (prioritize) or a subset range | ||||||
| # NOTE: later once the dataset PR is in we will link the representative subset as a built-in preset too | ||||||
| if config.problem_ids is not None: | ||||||
| # Use specific problem IDs if provided | ||||||
| problem_id_list = config.problem_ids | ||||||
| for pid in problem_id_list: | ||||||
| assert 1 <= pid <= num_problems_in_level, f"Problem ID {pid} out of range for Level {config.level}" | ||||||
| elif config.subset == (None, None): | ||||||
| problem_id_list = list(range(1, num_problems_in_level + 1)) | ||||||
| # Dataset Configurations - Unified loading | ||||||
| dataset = construct_kernelbench_dataset( | ||||||
| level=config.level, | ||||||
| source=config.dataset_src, | ||||||
| dataset_name=config.dataset_name, | ||||||
| ) | ||||||
|
|
||||||
| all_problem_ids = dataset.get_problem_ids() | ||||||
|
|
||||||
| if config.subset == (None, None): | ||||||
| problem_ids_to_run = all_problem_ids | ||||||
| else: | ||||||
| assert ( | ||||||
| config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level | ||||||
| ), f"Subset range {config.subset} out of range for Level {config.level}" | ||||||
| problem_id_list = list(range(config.subset[0], config.subset[1] + 1)) | ||||||
| start, end = config.subset | ||||||
| problem_ids_to_run = [pid for pid in all_problem_ids if start <= pid <= end] | ||||||
| if not problem_ids_to_run: | ||||||
| print(f"Warning: No problems found in subset range {config.subset}") | ||||||
|
||||||
| print(f"Warning: No problems found in subset range {config.subset}") | |
| print(f"Warning: No problems found in subset range {config.subset}") |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -11,10 +11,8 @@ | |||||
| import json | ||||||
| import modal | ||||||
|
|
||||||
| from datasets import load_dataset | ||||||
|
|
||||||
| #from src.dataset import construct_kernelbench_dataset | ||||||
| from kernelbench.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets | ||||||
| from kernelbench.dataset import construct_kernelbench_dataset | ||||||
| from kernelbench.utils import extract_first_code, query_server, set_gpu_arch, create_inference_server_from_presets | ||||||
|
||||||
| from kernelbench.utils import extract_first_code, query_server, set_gpu_arch, create_inference_server_from_presets | |
| from kernelbench.utils import extract_first_code, create_inference_server_from_presets |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll do a proper review later! This looks exciting!
However, please fix this!!!!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ethanbc key disabled now, please DO NOT post API key anywhere in public
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@pythonomar22 A decent rule of thumb is to only modify the .env file when you're experimenting / plugging in keys, and to add .env to the .gitignore. When you're done, update the .env.example if you are adding or removing env variables.
That prevents this kind of leak from happening :)