Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion datasets/mbpp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,14 @@ DatasetDict({
- `challenge_test_list`: list of more challenging tests to further probe the solution

### Data Splits
There are two version of the dataset (full and sanitized) which only one split each (test).
There are two versions of the dataset (full and sanitized), each with four splits:
- train
- validation
- test
- prompt

The `prompt` split corresponds to samples used for few-shot prompting and not for training.

## Dataset Creation
See section 2.1 of original [paper](https://arxiv.org/abs/2108.07732).

Expand Down
2 changes: 1 addition & 1 deletion datasets/mbpp/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"full": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_setup_code": {"dtype": "string", "id": null, "_type": "Value"}, "challenge_test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "full", "version": {"version_str": "1.0.1", "description": null, "major": 1, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 468088, "num_examples": 974, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl": {"num_bytes": 563743, "checksum": "ccf64ceae9c5403bf50a044cb6d505bfd2a2963ee58338ba268fd65beab92a9f"}}, "download_size": 563743, "post_processing_size": null, "dataset_size": 468088, "size_in_bytes": 1031831}, 
"sanitized": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"source_file": {"dtype": "string", "id": null, "_type": "Value"}, "task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "prompt": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_imports": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "sanitized", "version": {"version_str": "1.0.1", "description": null, "major": 1, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 219712, "num_examples": 427, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json": {"num_bytes": 255053, "checksum": "ca95deaa9a01ef0a6f439f88bcf0dd3db3563d22f22aad6cae04ebb9a8d8c8e9"}}, "download_size": 255053, "post_processing_size": null, "dataset_size": 219712, "size_in_bytes": 
474765}}
{"full": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been \nhand-verified by the authors.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_setup_code": {"dtype": "string", "id": null, "_type": "Value"}, "challenge_test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "full", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"train": {"name": "train", "num_bytes": 176879, "num_examples": 374, "dataset_name": "mbpp"}, "test": {"name": "test", "num_bytes": 244104, "num_examples": 500, "dataset_name": "mbpp"}, "validation": {"name": "validation", "num_bytes": 42405, "num_examples": 90, "dataset_name": "mbpp"}, "prompt": {"name": "prompt", "num_bytes": 4550, 
"num_examples": 10, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl": {"num_bytes": 563743, "checksum": "ccf64ceae9c5403bf50a044cb6d505bfd2a2963ee58338ba268fd65beab92a9f"}}, "download_size": 563743, "post_processing_size": null, "dataset_size": 467938, "size_in_bytes": 1031681}, "sanitized": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been \nhand-verified by the authors.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"source_file": {"dtype": "string", "id": null, "_type": "Value"}, "task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "prompt": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_imports": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "sanitized", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, 
"patch": 2}, "splits": {"train": {"name": "train", "num_bytes": 63453, "num_examples": 120, "dataset_name": "mbpp"}, "test": {"name": "test", "num_bytes": 132720, "num_examples": 257, "dataset_name": "mbpp"}, "validation": {"name": "validation", "num_bytes": 20050, "num_examples": 43, "dataset_name": "mbpp"}, "prompt": {"name": "prompt", "num_bytes": 3407, "num_examples": 7, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json": {"num_bytes": 255053, "checksum": "ca95deaa9a01ef0a6f439f88bcf0dd3db3563d22f22aad6cae04ebb9a8d8c8e9"}}, "download_size": 255053, "post_processing_size": null, "dataset_size": 219630, "size_in_bytes": 474683}}
Binary file removed datasets/mbpp/dummy/full/1.0.1/dummy_data.zip
Binary file not shown.
Binary file added datasets/mbpp/dummy/full/1.0.2/dummy_data.zip
Binary file not shown.
Binary file removed datasets/mbpp/dummy/sanitized/1.0.1/dummy_data.zip
Binary file not shown.
Binary file not shown.
83 changes: 59 additions & 24 deletions datasets/mbpp/mbpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,15 @@
The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python
programming problems, designed to be solvable by entry level programmers, covering programming
fundamentals, standard library functionality, and so on. Each problem consists of a task
description, code solution and 3 automated test cases.
description, code solution and 3 automated test cases. The sanitized subset of the data has been
hand-verified by the authors.
"""

_URLs = {
"full": "https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl",
"sanitized": "https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json",
}

_SPLITS = ["full", "sanitized"]

_CITATION = """\
@article{austin2021program,
title={Program Synthesis with Large Language Models},
Expand All @@ -33,15 +32,15 @@
class MBPP(datasets.GeneratorBasedBuilder):
"""MBPP: Mostly Basic Python Problems Dataset"""

VERSION = datasets.Version("1.0.1")
VERSION = datasets.Version("1.0.2")

BUILDER_CONFIGS = [
datasets.BuilderConfig(
name=f"{split}",
version=datasets.Version("1.0.1"),
name="full",
version=datasets.Version("1.0.2"),
description=_DESCRIPTION,
)
for split in _SPLITS
),
datasets.BuilderConfig(name="sanitized", version=datasets.Version("1.0.2"), description=_DESCRIPTION),
]

DEFAULT_CONFIG_NAME = "full"
Expand All @@ -58,7 +57,7 @@ def _info(self):
"challenge_test_list": datasets.Sequence(datasets.Value("string")),
}
)
else:
elif self.config.name == "sanitized":
features = datasets.Features(
{
"source_file": datasets.Value("string"),
Expand All @@ -83,22 +82,58 @@ def _split_generators(self, dl_manager):
config_urls = _URLs[self.config.name]
data_dir = dl_manager.download_and_extract(config_urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filepath": data_dir, "split": "train"},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": data_dir,
},
)
gen_kwargs={"filepath": data_dir, "split": "test"},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"filepath": data_dir, "split": "validation"},
),
datasets.SplitGenerator(
name=datasets.Split("prompt"),
gen_kwargs={"filepath": data_dir, "split": "prompt"},
),
]

def _generate_examples(self, filepath):
    """Yield ``(index, sample)`` pairs for every record stored in ``filepath``.

    Args:
        filepath: Path to the downloaded data file for the active config.

    Yields:
        Tuples ``(id_, sample)`` where ``id_`` is a running 0-based index and
        ``sample`` is the decoded JSON record.
    """
    with open(filepath, encoding="utf-8") as file:
        if self.config.name == "full":
            # The "full" file is parsed line by line: one JSON object per line.
            data = [json.loads(line) for line in file]
        else:
            # Every other config is parsed as one JSON document; it is then
            # iterated directly (presumably a top-level list of records).
            data = json.load(file)
    # Everything is in memory at this point, so the file is already closed
    # before the first example is handed to the caller.
    yield from enumerate(data)
def _generate_examples(self, filepath, split):
    """Yield ``(index, sample)`` pairs for one split of the active config.

    Both configs draw every split from the same downloaded file; a split is
    the subset of records whose ``task_id`` falls in a fixed inclusive range.

    Args:
        filepath: Path to the downloaded data file for the active config.
        split: One of ``"train"``, ``"test"``, ``"validation"`` or
            ``"prompt"``.

    Yields:
        Tuples ``(id_, sample)`` where ``id_`` is a 0-based index within the
        split and ``sample`` is the decoded JSON record.

    Raises:
        ValueError: If ``split`` or the config name is not recognised.
    """
    # Inclusive task_id bounds per split, shared by both configs.
    task_id_ranges = {
        "prompt": (1, 10),
        "test": (11, 510),
        "validation": (511, 600),
        "train": (601, 974),
    }
    if split not in task_id_ranges:
        raise ValueError(f"Unknown split {split!r}; expected one of {sorted(task_id_ranges)}")
    start, end = task_id_ranges[split]

    if self.config.name == "full":
        data = []
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                sample = json.loads(line)
                if start <= sample["task_id"] <= end:
                    data.append(sample)
                elif sample["task_id"] > end:
                    # Early exit: relies on the file being ordered by
                    # ascending task_id, so nothing further can match.
                    break
    elif self.config.name == "sanitized":
        with open(filepath, encoding="utf-8") as f:
            data = [sample for sample in json.load(f) if start <= sample["task_id"] <= end]
    else:
        raise ValueError(f"Unknown config name {self.config.name!r}")

    yield from enumerate(data)