Skip to content

Commit e195bc1

Browse files
authored
Add splits to MBPP dataset (#4943)
* first commit * added dummy data * removed unnecessary import * removed unnecessary lines * style reformat * removed trailing whitespace
1 parent 0b1dca2 commit e195bc1

File tree

7 files changed

+68
-26
lines changed

7 files changed

+68
-26
lines changed

datasets/mbpp/README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,14 @@ DatasetDict({
130130
- `challenge_test_list`: list of more challenging tests to further probe the solution
131131

132132
### Data Splits
133-
There are two versions of the dataset (full and sanitized), with only one split each (test).
133+
There are two versions of the dataset (full and sanitized), each with four splits:
134+
- train
135+
- validation
136+
- test
137+
- prompt
138+
139+
The `prompt` split corresponds to samples used for few-shot prompting and not for training.
140+
134141
## Dataset Creation
135142
See section 2.1 of original [paper](https://arxiv.org/abs/2108.07732).
136143

datasets/mbpp/dataset_infos.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"full": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_setup_code": {"dtype": "string", "id": null, "_type": "Value"}, "challenge_test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "full", "version": {"version_str": "1.0.1", "description": null, "major": 1, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 468088, "num_examples": 974, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl": {"num_bytes": 563743, "checksum": "ccf64ceae9c5403bf50a044cb6d505bfd2a2963ee58338ba268fd65beab92a9f"}}, "download_size": 563743, "post_processing_size": null, "dataset_size": 468088, "size_in_bytes": 1031831}, 
"sanitized": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"source_file": {"dtype": "string", "id": null, "_type": "Value"}, "task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "prompt": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_imports": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "sanitized", "version": {"version_str": "1.0.1", "description": null, "major": 1, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 219712, "num_examples": 427, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json": {"num_bytes": 255053, "checksum": "ca95deaa9a01ef0a6f439f88bcf0dd3db3563d22f22aad6cae04ebb9a8d8c8e9"}}, "download_size": 255053, "post_processing_size": null, "dataset_size": 219712, "size_in_bytes": 
474765}}
1+
{"full": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been \nhand-verified by the authors.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_setup_code": {"dtype": "string", "id": null, "_type": "Value"}, "challenge_test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "full", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"train": {"name": "train", "num_bytes": 176879, "num_examples": 374, "dataset_name": "mbpp"}, "test": {"name": "test", "num_bytes": 244104, "num_examples": 500, "dataset_name": "mbpp"}, "validation": {"name": "validation", "num_bytes": 42405, "num_examples": 90, "dataset_name": "mbpp"}, "prompt": {"name": "prompt", "num_bytes": 4550, 
"num_examples": 10, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl": {"num_bytes": 563743, "checksum": "ccf64ceae9c5403bf50a044cb6d505bfd2a2963ee58338ba268fd65beab92a9f"}}, "download_size": 563743, "post_processing_size": null, "dataset_size": 467938, "size_in_bytes": 1031681}, "sanitized": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been \nhand-verified by the authors.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"source_file": {"dtype": "string", "id": null, "_type": "Value"}, "task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "prompt": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_imports": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "sanitized", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, 
"patch": 2}, "splits": {"train": {"name": "train", "num_bytes": 63453, "num_examples": 120, "dataset_name": "mbpp"}, "test": {"name": "test", "num_bytes": 132720, "num_examples": 257, "dataset_name": "mbpp"}, "validation": {"name": "validation", "num_bytes": 20050, "num_examples": 43, "dataset_name": "mbpp"}, "prompt": {"name": "prompt", "num_bytes": 3407, "num_examples": 7, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json": {"num_bytes": 255053, "checksum": "ca95deaa9a01ef0a6f439f88bcf0dd3db3563d22f22aad6cae04ebb9a8d8c8e9"}}, "download_size": 255053, "post_processing_size": null, "dataset_size": 219630, "size_in_bytes": 474683}}
-1.17 KB
Binary file not shown.
1.31 KB
Binary file not shown.
-1.16 KB
Binary file not shown.
1.09 KB
Binary file not shown.

datasets/mbpp/mbpp.py

Lines changed: 59 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,15 @@
77
The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python
88
programming problems, designed to be solvable by entry level programmers, covering programming
99
fundamentals, standard library functionality, and so on. Each problem consists of a task
10-
description, code solution and 3 automated test cases.
10+
description, code solution and 3 automated test cases. The sanitized subset of the data has been
11+
hand-verified by the authors.
1112
"""
1213

1314
_URLs = {
1415
"full": "https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl",
1516
"sanitized": "https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json",
1617
}
1718

18-
_SPLITS = ["full", "sanitized"]
19-
2019
_CITATION = """\
2120
@article{austin2021program,
2221
title={Program Synthesis with Large Language Models},
@@ -33,15 +32,15 @@
3332
class MBPP(datasets.GeneratorBasedBuilder):
3433
"""MBPP: Mostly Basic Python Problems Dataset"""
3534

36-
VERSION = datasets.Version("1.0.1")
35+
VERSION = datasets.Version("1.0.2")
3736

3837
BUILDER_CONFIGS = [
3938
datasets.BuilderConfig(
40-
name=f"{split}",
41-
version=datasets.Version("1.0.1"),
39+
name="full",
40+
version=datasets.Version("1.0.2"),
4241
description=_DESCRIPTION,
43-
)
44-
for split in _SPLITS
42+
),
43+
datasets.BuilderConfig(name="sanitized", version=datasets.Version("1.0.2"), description=_DESCRIPTION),
4544
]
4645

4746
DEFAULT_CONFIG_NAME = "full"
@@ -58,7 +57,7 @@ def _info(self):
5857
"challenge_test_list": datasets.Sequence(datasets.Value("string")),
5958
}
6059
)
61-
else:
60+
elif self.config.name == "sanitized":
6261
features = datasets.Features(
6362
{
6463
"source_file": datasets.Value("string"),
@@ -83,22 +82,58 @@ def _split_generators(self, dl_manager):
8382
config_urls = _URLs[self.config.name]
8483
data_dir = dl_manager.download_and_extract(config_urls)
8584
return [
85+
datasets.SplitGenerator(
86+
name=datasets.Split.TRAIN,
87+
gen_kwargs={"filepath": data_dir, "split": "train"},
88+
),
8689
datasets.SplitGenerator(
8790
name=datasets.Split.TEST,
88-
gen_kwargs={
89-
"filepath": data_dir,
90-
},
91-
)
91+
gen_kwargs={"filepath": data_dir, "split": "test"},
92+
),
93+
datasets.SplitGenerator(
94+
name=datasets.Split.VALIDATION,
95+
gen_kwargs={"filepath": data_dir, "split": "validation"},
96+
),
97+
datasets.SplitGenerator(
98+
name=datasets.Split("prompt"),
99+
gen_kwargs={"filepath": data_dir, "split": "prompt"},
100+
),
92101
]
93102

94-
def _generate_examples(self, filepath):
95-
"""Yields examples."""
96-
with open(filepath, encoding="utf-8") as file:
97-
if self.config.name == "full":
98-
data = [json.loads(line) for line in file]
99-
else:
100-
data = json.load(file)
101-
id_ = 0
102-
for sample in data:
103-
yield id_, sample
104-
id_ += 1
103+
def _generate_examples(self, filepath, split):
104+
if self.config.name == "full":
105+
106+
def _read_lines(fn, start, end):
107+
data = []
108+
with open(fn, encoding="utf-8") as f:
109+
for line in f:
110+
sample = json.loads(line)
111+
if start <= sample["task_id"] <= end:
112+
data.append(sample)
113+
elif sample["task_id"] > end:
114+
break
115+
return data
116+
117+
if split == "test":
118+
data = _read_lines(filepath, 11, 510)
119+
elif split == "train":
120+
data = _read_lines(filepath, 601, 974)
121+
elif split == "validation":
122+
data = _read_lines(filepath, 511, 600)
123+
elif split == "prompt":
124+
data = _read_lines(filepath, 1, 10)
125+
elif self.config.name == "sanitized":
126+
with open(filepath, encoding="utf-8") as f:
127+
data = json.load(f)
128+
if split == "test":
129+
data = [sample for sample in data if 11 <= sample["task_id"] <= 510]
130+
elif split == "train":
131+
data = [sample for sample in data if 601 <= sample["task_id"] <= 974]
132+
elif split == "validation":
133+
data = [sample for sample in data if 511 <= sample["task_id"] <= 600]
134+
elif split == "prompt":
135+
data = [sample for sample in data if 1 <= sample["task_id"] <= 10]
136+
id_ = 0
137+
for sample in data:
138+
yield id_, sample
139+
id_ += 1

0 commit comments

Comments
 (0)