diff --git a/datasets/mbpp/README.md b/datasets/mbpp/README.md index 6879ab722af..558d65cf3cc 100644 --- a/datasets/mbpp/README.md +++ b/datasets/mbpp/README.md @@ -130,7 +130,14 @@ DatasetDict({ - `challenge_test_list`: list of more challenging test to further probe solution ### Data Splits -There are two version of the dataset (full and sanitized) which only one split each (test). +There are two versions of the dataset (full and sanitized), each with four splits: +- train +- validation +- test +- prompt + +The `prompt` split corresponds to samples used for few-shot prompting and not for training. + ## Dataset Creation See section 2.1 of original [paper](https://arxiv.org/abs/2108.07732). diff --git a/datasets/mbpp/dataset_infos.json b/datasets/mbpp/dataset_infos.json index fc07315dbfd..2ccb2e6fe54 100644 --- a/datasets/mbpp/dataset_infos.json +++ b/datasets/mbpp/dataset_infos.json @@ -1 +1 @@ -{"full": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. 
Each problem consists of a task\ndescription, code solution and 3 automated test cases.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_setup_code": {"dtype": "string", "id": null, "_type": "Value"}, "challenge_test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "full", "version": {"version_str": "1.0.1", "description": null, "major": 1, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 468088, "num_examples": 974, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl": {"num_bytes": 563743, "checksum": "ccf64ceae9c5403bf50a044cb6d505bfd2a2963ee58338ba268fd65beab92a9f"}}, "download_size": 563743, "post_processing_size": null, "dataset_size": 468088, "size_in_bytes": 1031831}, "sanitized": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so 
on. Each problem consists of a task\ndescription, code solution and 3 automated test cases.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"source_file": {"dtype": "string", "id": null, "_type": "Value"}, "task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "prompt": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_imports": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "sanitized", "version": {"version_str": "1.0.1", "description": null, "major": 1, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 219712, "num_examples": 427, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json": {"num_bytes": 255053, "checksum": "ca95deaa9a01ef0a6f439f88bcf0dd3db3563d22f22aad6cae04ebb9a8d8c8e9"}}, "download_size": 255053, "post_processing_size": null, "dataset_size": 219712, "size_in_bytes": 474765}} \ No newline at end of file +{"full": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard 
library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been \nhand-verified by the authors.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_setup_code": {"dtype": "string", "id": null, "_type": "Value"}, "challenge_test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "full", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"train": {"name": "train", "num_bytes": 176879, "num_examples": 374, "dataset_name": "mbpp"}, "test": {"name": "test", "num_bytes": 244104, "num_examples": 500, "dataset_name": "mbpp"}, "validation": {"name": "validation", "num_bytes": 42405, "num_examples": 90, "dataset_name": "mbpp"}, "prompt": {"name": "prompt", "num_bytes": 4550, "num_examples": 10, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl": {"num_bytes": 563743, "checksum": 
"ccf64ceae9c5403bf50a044cb6d505bfd2a2963ee58338ba268fd65beab92a9f"}}, "download_size": 563743, "post_processing_size": null, "dataset_size": 467938, "size_in_bytes": 1031681}, "sanitized": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been \nhand-verified by the authors.\n", "citation": "@article{austin2021program,\n title={Program Synthesis with Large Language Models},\n author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n journal={arXiv preprint arXiv:2108.07732},\n year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"source_file": {"dtype": "string", "id": null, "_type": "Value"}, "task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "prompt": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_imports": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "sanitized", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"train": {"name": "train", "num_bytes": 63453, "num_examples": 120, "dataset_name": "mbpp"}, "test": {"name": "test", "num_bytes": 132720, "num_examples": 257, 
"dataset_name": "mbpp"}, "validation": {"name": "validation", "num_bytes": 20050, "num_examples": 43, "dataset_name": "mbpp"}, "prompt": {"name": "prompt", "num_bytes": 3407, "num_examples": 7, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json": {"num_bytes": 255053, "checksum": "ca95deaa9a01ef0a6f439f88bcf0dd3db3563d22f22aad6cae04ebb9a8d8c8e9"}}, "download_size": 255053, "post_processing_size": null, "dataset_size": 219630, "size_in_bytes": 474683}} \ No newline at end of file diff --git a/datasets/mbpp/dummy/full/1.0.1/dummy_data.zip b/datasets/mbpp/dummy/full/1.0.1/dummy_data.zip deleted file mode 100644 index 9974d8eae78..00000000000 Binary files a/datasets/mbpp/dummy/full/1.0.1/dummy_data.zip and /dev/null differ diff --git a/datasets/mbpp/dummy/full/1.0.2/dummy_data.zip b/datasets/mbpp/dummy/full/1.0.2/dummy_data.zip new file mode 100644 index 00000000000..c2ae95e2e0a Binary files /dev/null and b/datasets/mbpp/dummy/full/1.0.2/dummy_data.zip differ diff --git a/datasets/mbpp/dummy/sanitized/1.0.1/dummy_data.zip b/datasets/mbpp/dummy/sanitized/1.0.1/dummy_data.zip deleted file mode 100644 index 1a7354c0798..00000000000 Binary files a/datasets/mbpp/dummy/sanitized/1.0.1/dummy_data.zip and /dev/null differ diff --git a/datasets/mbpp/dummy/sanitized/1.0.2/dummy_data.zip b/datasets/mbpp/dummy/sanitized/1.0.2/dummy_data.zip new file mode 100644 index 00000000000..c61ad9fd2c5 Binary files /dev/null and b/datasets/mbpp/dummy/sanitized/1.0.2/dummy_data.zip differ diff --git a/datasets/mbpp/mbpp.py b/datasets/mbpp/mbpp.py index d1649a8f53a..7a18bc39f65 100644 --- a/datasets/mbpp/mbpp.py +++ b/datasets/mbpp/mbpp.py @@ -7,7 +7,8 @@ The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python programming problems, designed to be solvable by entry level programmers, covering programming fundamentals, standard library functionality, and so on. 
Each problem consists of a task -description, code solution and 3 automated test cases. +description, code solution and 3 automated test cases. The sanitized subset of the data has been +hand-verified by the authors. """ _URLs = { @@ -15,8 +16,6 @@ "sanitized": "https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json", } -_SPLITS = ["full", "sanitized"] - _CITATION = """\ @article{austin2021program, title={Program Synthesis with Large Language Models}, @@ -33,15 +32,15 @@ class MBPP(datasets.GeneratorBasedBuilder): """MBPP: Mostly Basic Python Problems Dataset""" - VERSION = datasets.Version("1.0.1") + VERSION = datasets.Version("1.0.2") BUILDER_CONFIGS = [ datasets.BuilderConfig( - name=f"{split}", - version=datasets.Version("1.0.1"), + name="full", + version=datasets.Version("1.0.2"), description=_DESCRIPTION, - ) - for split in _SPLITS + ), + datasets.BuilderConfig(name="sanitized", version=datasets.Version("1.0.2"), description=_DESCRIPTION), ] DEFAULT_CONFIG_NAME = "full" @@ -58,7 +57,7 @@ def _info(self): "challenge_test_list": datasets.Sequence(datasets.Value("string")), } ) - else: + elif self.config.name == "sanitized": features = datasets.Features( { "source_file": datasets.Value("string"), @@ -83,22 +82,58 @@ def _split_generators(self, dl_manager): config_urls = _URLs[self.config.name] data_dir = dl_manager.download_and_extract(config_urls) return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_dir, "split": "train"}, + ), datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_dir, - }, - ) + gen_kwargs={"filepath": data_dir, "split": "test"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": data_dir, "split": "validation"}, + ), + datasets.SplitGenerator( + name=datasets.Split("prompt"), + gen_kwargs={"filepath": data_dir, "split": "prompt"}, + ), ] - def _generate_examples(self, filepath): - 
"""Yields examples.""" - with open(filepath, encoding="utf-8") as file: - if self.config.name == "full": - data = [json.loads(line) for line in file] - else: - data = json.load(file) - id_ = 0 - for sample in data: - yield id_, sample - id_ += 1 + def _generate_examples(self, filepath, split): + if self.config.name == "full": + + def _read_lines(fn, start, end): + data = [] + with open(fn, encoding="utf-8") as f: + for line in f: + sample = json.loads(line) + if start <= sample["task_id"] <= end: + data.append(sample) + elif sample["task_id"] > end: + break + return data + + if split == "test": + data = _read_lines(filepath, 11, 510) + elif split == "train": + data = _read_lines(filepath, 601, 974) + elif split == "validation": + data = _read_lines(filepath, 511, 600) + elif split == "prompt": + data = _read_lines(filepath, 1, 10) + elif self.config.name == "sanitized": + with open(filepath, encoding="utf-8") as f: + data = json.load(f) + if split == "test": + data = [sample for sample in data if 11 <= sample["task_id"] <= 510] + elif split == "train": + data = [sample for sample in data if 601 <= sample["task_id"] <= 974] + elif split == "validation": + data = [sample for sample in data if 511 <= sample["task_id"] <= 600] + elif split == "prompt": + data = [sample for sample in data if 1 <= sample["task_id"] <= 10] + id_ = 0 + for sample in data: + yield id_, sample + id_ += 1