huggingface · lhoestq · Aug 27, 2021 · Aug 12, 2021 · Aug 12, 2021 · Aug 12, 2021
diff --git a/datasets/gooaq/README.md b/datasets/gooaq/README.md
@@ -18,6 +18,7 @@ task_categories:
 task_ids:
 - open-domain-qa
 paperswithcode_id: gooaq
+pretty_name: 'GooAQ: Open Question Answering with Diverse Answer Types'
 ---
 
 # Dataset Card for GooAQ
@@ -108,11 +109,14 @@ Here is the dominant types in the current dataset:
 
 ### Data Splits
 
-This dataset is split into train set. Number of samples in train set is given below:
+The number of samples in each of the train/validation/test splits is given below:
+
+| Split      | Number of samples |
+|------------|-------------------|
+| Train      | 3112679           |
+| Validation | 2500              |
+| Test       | 2500              |
 
-|                            | Train  |
-| -----                      | ------ |
-| Gooaq                      | 5030530|
 
 ## Dataset Creation
 

diff --git a/datasets/gooaq/dataset_infos.json b/datasets/gooaq/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n  title={GooAQ: Open Question Answering with Diverse Answer Types},\n  author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n  journal={arXiv preprint},\n  year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1102827066, "num_examples": 5030530, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/qoogle.jsonl": {"num_bytes": 1467162788, "checksum": 
"7c57029dbac90db21c7abcb3dcdbf9cd9f83f9a1d24815a2d8c0663fe13e4a17"}}, "download_size": 1467162788, "post_processing_size": null, "dataset_size": 1102827066, "size_in_bytes": 2569989854}}
+{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n  title={GooAQ: Open Question Answering with Diverse Answer Types},\n  author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n  journal={arXiv preprint},\n  year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.2.0", "description": null, "major": 1, "minor": 2, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 974320061, "num_examples": 3112679, "dataset_name": "gooaq"}, "validation": {"name": "validation", "num_bytes": 444553, "num_examples": 2500, "dataset_name": "gooaq"}, "test": {"name": 
"test", "num_bytes": 445810, "num_examples": 2500, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/gooaq.jsonl": {"num_bytes": 1920133810, "checksum": "d68007293be8740a7a7388efa8ea30ae5a3232d18a340a63f0190e07942d9da2"}, "https://github.com/allenai/gooaq/raw/main/data/split.json": {"num_bytes": 191225091, "checksum": "728921af66afb7b2c04466795e595586ad1f92bbd15a879b47fc59aaca8826db"}}, "download_size": 2111358901, "post_processing_size": null, "dataset_size": 975210424, "size_in_bytes": 3086569325}}
diff --git a/datasets/gooaq/dummy/1.1.0/dummy_data.zip b/datasets/gooaq/dummy/1.1.0/dummy_data.zip
diff --git a/datasets/gooaq/dummy/1.2.0/dummy_data.zip b/datasets/gooaq/dummy/1.2.0/dummy_data.zip
diff --git a/datasets/gooaq/gooaq.py b/datasets/gooaq/gooaq.py
@@ -17,6 +17,8 @@
 
 import json
 
+import numpy as np
+
 import datasets
 
 
@@ -43,13 +45,15 @@
 
 _LICENSE = "Licensed under the Apache License, Version 2.0"
 
-_URL = "https://github.com/allenai/gooaq/raw/main/data/qoogle.jsonl"
+_URL = "https://github.com/allenai/gooaq/raw/main/data/gooaq.jsonl"
+
+_SPLITS_URL = "https://github.com/allenai/gooaq/raw/main/data/split.json"
 
 
 class Gooaq(datasets.GeneratorBasedBuilder):
     """GooAQ - Question-answers, collected from Google"""
 
-    VERSION = datasets.Version("1.1.0")
+    VERSION = datasets.Version("1.2.0")
 
     def _info(self):
         features = datasets.Features(
@@ -83,39 +87,70 @@ def _info(self):
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
 
-        data_dir = dl_manager.download(_URL)
+        data = dl_manager.download(_URL)
+        splits = dl_manager.download(_SPLITS_URL)
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    "filepath": data_dir,
+                    "filepath": data,
                     "split": "train",
+                    "split_file": splits,
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": data,
+                    "split": "dev",
+                    "split_file": splits,
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": data,
+                    "split": "test",
+                    "split_file": splits,
                 },
             ),
         ]
 
     def _generate_examples(
-        self, filepath, split  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
+        self,
+        filepath,
+        split,
+        split_file,  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
     ):
         dominant_classes = ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"]
 
+        with open(split_file, encoding="utf-8") as f_split:
+            if split == "train":
+                split_ids = json.load(f_split)[split]
+                split_ids = np.array(split_ids)[:, 0]
+            else:
+                split_ids = json.load(f_split)[split]
+
+        split_ids = set(split_ids)
+
         with open(filepath, encoding="utf-8") as f:
             for id_, row in enumerate(f):
                 data = json.loads(row)
 
-                if data["answer_type"] not in dominant_classes:
-                    yield id_, {
-                        "id": data["id"],
-                        "question": data["question"],
-                        "short_answer": data["short_answer"],
-                        "answer": data["answer"],
-                        "answer_type": -1,
-                    }
-                else:
-                    yield id_, {
-                        "id": data["id"],
-                        "question": data["question"],
-                        "short_answer": data["short_answer"],
-                        "answer": data["answer"],
-                        "answer_type": data["answer_type"],
-                    }
+                if data["id"] in split_ids:
+                    if data["answer_type"] not in dominant_classes:
+                        yield id_, {
+                            "id": data["id"],
+                            "question": data["question"],
+                            "short_answer": data["short_answer"],
+                            "answer": data["answer"],
+                            "answer_type": -1,
+                        }
+                    else:
+                        yield id_, {
+                            "id": data["id"],
+                            "question": data["question"],
+                            "short_answer": data["short_answer"],
+                            "answer": data["answer"],
+                            "answer_type": data["answer_type"],
+                        }
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n title={GooAQ: Open Question Answering with Diverse Answer Types},\n author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n journal={arXiv preprint},\n year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1102827066, "num_examples": 5030530, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/qoogle.jsonl": {"num_bytes": 1467162788, "checksum": 
"7c57029dbac90db21c7abcb3dcdbf9cd9f83f9a1d24815a2d8c0663fe13e4a17"}}, "download_size": 1467162788, "post_processing_size": null, "dataset_size": 1102827066, "size_in_bytes": 2569989854}}
		{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n title={GooAQ: Open Question Answering with Diverse Answer Types},\n author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n journal={arXiv preprint},\n year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.2.0", "description": null, "major": 1, "minor": 2, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 974320061, "num_examples": 3112679, "dataset_name": "gooaq"}, "validation": {"name": "validation", "num_bytes": 444553, "num_examples": 2500, "dataset_name": "gooaq"}, "test": {"name": 
"test", "num_bytes": 445810, "num_examples": 2500, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/gooaq.jsonl": {"num_bytes": 1920133810, "checksum": "d68007293be8740a7a7388efa8ea30ae5a3232d18a340a63f0190e07942d9da2"}, "https://github.com/allenai/gooaq/raw/main/data/split.json": {"num_bytes": 191225091, "checksum": "728921af66afb7b2c04466795e595586ad1f92bbd15a879b47fc59aaca8826db"}}, "download_size": 2111358901, "post_processing_size": null, "dataset_size": 975210424, "size_in_bytes": 3086569325}}