diff --git a/datasets/mbpp/README.md b/datasets/mbpp/README.md
index 6879ab722af..558d65cf3cc 100644
--- a/datasets/mbpp/README.md
+++ b/datasets/mbpp/README.md
@@ -130,7 +130,14 @@ DatasetDict({
 - `challenge_test_list`: list of more challenging test to further probe solution
 
 ### Data Splits
-There are two version of the dataset (full and sanitized) which only one split each (test).
+There are two versions of the dataset (full and sanitized), each with four splits:
+- train
+- validation
+- test
+- prompt
+
+The `prompt` split corresponds to samples used for few-shot prompting and not for training.
+
 ## Dataset Creation
 See section 2.1 of original [paper](https://arxiv.org/abs/2108.07732).
 
diff --git a/datasets/mbpp/dataset_infos.json b/datasets/mbpp/dataset_infos.json
index fc07315dbfd..2ccb2e6fe54 100644
--- a/datasets/mbpp/dataset_infos.json
+++ b/datasets/mbpp/dataset_infos.json
@@ -1 +1 @@
-{"full": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases.\n", "citation": "@article{austin2021program,\n  title={Program Synthesis with Large Language Models},\n  author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n  journal={arXiv preprint arXiv:2108.07732},\n  year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_setup_code": {"dtype": "string", "id": null, "_type": "Value"}, "challenge_test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "full", "version": {"version_str": "1.0.1", "description": null, "major": 1, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 468088, "num_examples": 974, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl": {"num_bytes": 563743, "checksum": "ccf64ceae9c5403bf50a044cb6d505bfd2a2963ee58338ba268fd65beab92a9f"}}, "download_size": 563743, "post_processing_size": null, "dataset_size": 468088, "size_in_bytes": 
1031831}, "sanitized": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases.\n", "citation": "@article{austin2021program,\n  title={Program Synthesis with Large Language Models},\n  author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n  journal={arXiv preprint arXiv:2108.07732},\n  year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"source_file": {"dtype": "string", "id": null, "_type": "Value"}, "task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "prompt": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_imports": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "sanitized", "version": {"version_str": "1.0.1", "description": null, "major": 1, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 219712, "num_examples": 427, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json": {"num_bytes": 255053, "checksum": "ca95deaa9a01ef0a6f439f88bcf0dd3db3563d22f22aad6cae04ebb9a8d8c8e9"}}, "download_size": 255053, "post_processing_size": null, "dataset_size": 219712, 
"size_in_bytes": 474765}}
\ No newline at end of file
+{"full": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been \nhand-verified by the authors.\n", "citation": "@article{austin2021program,\n  title={Program Synthesis with Large Language Models},\n  author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n  journal={arXiv preprint arXiv:2108.07732},\n  year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_setup_code": {"dtype": "string", "id": null, "_type": "Value"}, "challenge_test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "full", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"train": {"name": "train", "num_bytes": 176879, "num_examples": 374, "dataset_name": "mbpp"}, "test": {"name": "test", "num_bytes": 244104, "num_examples": 500, "dataset_name": "mbpp"}, "validation": {"name": "validation", "num_bytes": 42405, "num_examples": 90, "dataset_name": "mbpp"}, "prompt": {"name": "prompt", "num_bytes": 4550, 
"num_examples": 10, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl": {"num_bytes": 563743, "checksum": "ccf64ceae9c5403bf50a044cb6d505bfd2a2963ee58338ba268fd65beab92a9f"}}, "download_size": 563743, "post_processing_size": null, "dataset_size": 467938, "size_in_bytes": 1031681}, "sanitized": {"description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been \nhand-verified by the authors.\n", "citation": "@article{austin2021program,\n  title={Program Synthesis with Large Language Models},\n  author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n  journal={arXiv preprint arXiv:2108.07732},\n  year={2021}\n}", "homepage": "https://github.com/google-research/google-research/tree/master/mbpp", "license": "CC-BY-4.0", "features": {"source_file": {"dtype": "string", "id": null, "_type": "Value"}, "task_id": {"dtype": "int32", "id": null, "_type": "Value"}, "prompt": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "test_imports": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "test_list": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mbpp", "config_name": "sanitized", "version": {"version_str": "1.0.2", "description": null, "major": 1, 
"minor": 0, "patch": 2}, "splits": {"train": {"name": "train", "num_bytes": 63453, "num_examples": 120, "dataset_name": "mbpp"}, "test": {"name": "test", "num_bytes": 132720, "num_examples": 257, "dataset_name": "mbpp"}, "validation": {"name": "validation", "num_bytes": 20050, "num_examples": 43, "dataset_name": "mbpp"}, "prompt": {"name": "prompt", "num_bytes": 3407, "num_examples": 7, "dataset_name": "mbpp"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json": {"num_bytes": 255053, "checksum": "ca95deaa9a01ef0a6f439f88bcf0dd3db3563d22f22aad6cae04ebb9a8d8c8e9"}}, "download_size": 255053, "post_processing_size": null, "dataset_size": 219630, "size_in_bytes": 474683}}
\ No newline at end of file
diff --git a/datasets/mbpp/dummy/full/1.0.1/dummy_data.zip b/datasets/mbpp/dummy/full/1.0.1/dummy_data.zip
deleted file mode 100644
index 9974d8eae78..00000000000
Binary files a/datasets/mbpp/dummy/full/1.0.1/dummy_data.zip and /dev/null differ
diff --git a/datasets/mbpp/dummy/full/1.0.2/dummy_data.zip b/datasets/mbpp/dummy/full/1.0.2/dummy_data.zip
new file mode 100644
index 00000000000..c2ae95e2e0a
Binary files /dev/null and b/datasets/mbpp/dummy/full/1.0.2/dummy_data.zip differ
diff --git a/datasets/mbpp/dummy/sanitized/1.0.1/dummy_data.zip b/datasets/mbpp/dummy/sanitized/1.0.1/dummy_data.zip
deleted file mode 100644
index 1a7354c0798..00000000000
Binary files a/datasets/mbpp/dummy/sanitized/1.0.1/dummy_data.zip and /dev/null differ
diff --git a/datasets/mbpp/dummy/sanitized/1.0.2/dummy_data.zip b/datasets/mbpp/dummy/sanitized/1.0.2/dummy_data.zip
new file mode 100644
index 00000000000..c61ad9fd2c5
Binary files /dev/null and b/datasets/mbpp/dummy/sanitized/1.0.2/dummy_data.zip differ
diff --git a/datasets/mbpp/mbpp.py b/datasets/mbpp/mbpp.py
index d1649a8f53a..7a18bc39f65 100644
--- a/datasets/mbpp/mbpp.py
+++ b/datasets/mbpp/mbpp.py
@@ -7,7 +7,8 @@
 The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python
 programming problems, designed to be solvable by entry level programmers, covering programming
 fundamentals, standard library functionality, and so on. Each problem consists of a task
-description, code solution and 3 automated test cases.
+description, code solution and 3 automated test cases. The sanitized subset of the data has been
+hand-verified by the authors.
 """
 
 _URLs = {
@@ -15,8 +16,6 @@
     "sanitized": "https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json",
 }
 
-_SPLITS = ["full", "sanitized"]
-
 _CITATION = """\
 @article{austin2021program,
   title={Program Synthesis with Large Language Models},
@@ -33,15 +32,15 @@
 class MBPP(datasets.GeneratorBasedBuilder):
     """MBPP: Mostly Basic Python Problems Dataset"""
 
-    VERSION = datasets.Version("1.0.1")
+    VERSION = datasets.Version("1.0.2")
 
     BUILDER_CONFIGS = [
         datasets.BuilderConfig(
-            name=f"{split}",
-            version=datasets.Version("1.0.1"),
+            name="full",
+            version=datasets.Version("1.0.2"),
             description=_DESCRIPTION,
-        )
-        for split in _SPLITS
+        ),
+        datasets.BuilderConfig(name="sanitized", version=datasets.Version("1.0.2"), description=_DESCRIPTION),
     ]
 
     DEFAULT_CONFIG_NAME = "full"
@@ -58,7 +57,7 @@ def _info(self):
                     "challenge_test_list": datasets.Sequence(datasets.Value("string")),
                 }
             )
-        else:
+        elif self.config.name == "sanitized":
             features = datasets.Features(
                 {
                     "source_file": datasets.Value("string"),
@@ -83,22 +82,58 @@ def _split_generators(self, dl_manager):
         config_urls = _URLs[self.config.name]
         data_dir = dl_manager.download_and_extract(config_urls)
         return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_dir, "split": "train"},
+            ),
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
-                gen_kwargs={
-                    "filepath": data_dir,
-                },
-            )
+                gen_kwargs={"filepath": data_dir, "split": "test"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"filepath": data_dir, "split": "validation"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split("prompt"),
+                gen_kwargs={"filepath": data_dir, "split": "prompt"},
+            ),
         ]
 
-    def _generate_examples(self, filepath):
-        """Yields examples."""
-        with open(filepath, encoding="utf-8") as file:
-            if self.config.name == "full":
-                data = [json.loads(line) for line in file]
-            else:
-                data = json.load(file)
-            id_ = 0
-            for sample in data:
-                yield id_, sample
-                id_ += 1
+    def _generate_examples(self, filepath, split):
+        """Yield ``(key, example)`` pairs for one split of the selected config.
+
+        Splits are inclusive ``task_id`` ranges. The ``full`` config is read as
+        JSON Lines, the ``sanitized`` config as a single JSON array.
+
+        Args:
+            filepath: path of the downloaded data file.
+            split: "train", "test", "validation" or "prompt".
+
+        Raises:
+            ValueError: if the split or the config name is not recognized.
+        """
+        # Inclusive (first, last) task_id of each split.
+        task_id_ranges = {
+            "prompt": (1, 10),
+            "test": (11, 510),
+            "validation": (511, 600),
+            "train": (601, 974),
+        }
+        if split not in task_id_ranges:
+            raise ValueError(f"Unknown split: {split!r}")
+        start, end = task_id_ranges[split]
+        with open(filepath, encoding="utf-8") as f:
+            if self.config.name == "full":
+                # One JSON object per line; skip blank lines.
+                samples = [json.loads(line) for line in f if line.strip()]
+            elif self.config.name == "sanitized":
+                samples = json.load(f)
+            else:
+                raise ValueError(f"Unknown config: {self.config.name!r}")
+        # Filter on the range alone (no early exit) so the result does not
+        # depend on the records being sorted by task_id.
+        data = [sample for sample in samples if start <= sample["task_id"] <= end]
+        # Keys are 0-based positions within the split.
+        for id_, sample in enumerate(data):
+            yield id_, sample