diff --git a/datasets/gooaq/README.md b/datasets/gooaq/README.md index 8ab2df06e23..54d80c25333 100644 --- a/datasets/gooaq/README.md +++ b/datasets/gooaq/README.md @@ -18,6 +18,7 @@ task_categories: task_ids: - open-domain-qa paperswithcode_id: gooaq +pretty_name: 'GooAQ: Open Question Answering with Diverse Answer Types' --- # Dataset Card for GooAQ @@ -108,11 +109,14 @@ Here is the dominant types in the current dataset: ### Data Splits -This dataset is split into train set. Number of samples in train set is given below: +The number of samples in the train/validation/test splits is given below: + +| Split | Number of samples | +|------------|-------------------| +| Train | 3112679 | +| Validation | 2500 | +| Test | 2500 | -| | Train | -| ----- | ------ | -| Gooaq | 5030530| ## Dataset Creation diff --git a/datasets/gooaq/dataset_infos.json b/datasets/gooaq/dataset_infos.json index 24fa07dad47..0fda560fed3 100644 --- a/datasets/gooaq/dataset_infos.json +++ b/datasets/gooaq/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. 
This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n title={GooAQ: Open Question Answering with Diverse Answer Types},\n author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n journal={arXiv preprint},\n year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1102827066, "num_examples": 5030530, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/qoogle.jsonl": {"num_bytes": 1467162788, "checksum": "7c57029dbac90db21c7abcb3dcdbf9cd9f83f9a1d24815a2d8c0663fe13e4a17"}}, "download_size": 1467162788, "post_processing_size": null, "dataset_size": 1102827066, "size_in_bytes": 2569989854}} \ No newline at end of file +{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. 
This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n title={GooAQ: Open Question Answering with Diverse Answer Types},\n author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n journal={arXiv preprint},\n year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.2.0", "description": null, "major": 1, "minor": 2, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 974320061, "num_examples": 3112679, "dataset_name": "gooaq"}, "validation": {"name": "validation", "num_bytes": 444553, "num_examples": 2500, "dataset_name": "gooaq"}, "test": {"name": "test", "num_bytes": 445810, "num_examples": 2500, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/gooaq.jsonl": {"num_bytes": 1920133810, "checksum": 
"d68007293be8740a7a7388efa8ea30ae5a3232d18a340a63f0190e07942d9da2"}, "https://github.com/allenai/gooaq/raw/main/data/split.json": {"num_bytes": 191225091, "checksum": "728921af66afb7b2c04466795e595586ad1f92bbd15a879b47fc59aaca8826db"}}, "download_size": 2111358901, "post_processing_size": null, "dataset_size": 975210424, "size_in_bytes": 3086569325}} \ No newline at end of file diff --git a/datasets/gooaq/dummy/1.1.0/dummy_data.zip b/datasets/gooaq/dummy/1.1.0/dummy_data.zip deleted file mode 100644 index 60306d9e014..00000000000 Binary files a/datasets/gooaq/dummy/1.1.0/dummy_data.zip and /dev/null differ diff --git a/datasets/gooaq/dummy/1.2.0/dummy_data.zip b/datasets/gooaq/dummy/1.2.0/dummy_data.zip new file mode 100644 index 00000000000..b2ceeea51e4 Binary files /dev/null and b/datasets/gooaq/dummy/1.2.0/dummy_data.zip differ diff --git a/datasets/gooaq/gooaq.py b/datasets/gooaq/gooaq.py index a098cde26cf..53e116d64d9 100644 --- a/datasets/gooaq/gooaq.py +++ b/datasets/gooaq/gooaq.py @@ -17,6 +17,8 @@ import json +import numpy as np + import datasets @@ -43,13 +45,15 @@ _LICENSE = "Licensed under the Apache License, Version 2.0" -_URL = "https://github.com/allenai/gooaq/raw/main/data/qoogle.jsonl" +_URL = "https://github.com/allenai/gooaq/raw/main/data/gooaq.jsonl" + +_SPLITS_URL = "https://github.com/allenai/gooaq/raw/main/data/split.json" class Gooaq(datasets.GeneratorBasedBuilder): """GooAQ - Question-answers, collected from Google""" - VERSION = datasets.Version("1.1.0") + VERSION = datasets.Version("1.2.0") def _info(self): features = datasets.Features( @@ -83,39 +87,70 @@ def _info(self): def _split_generators(self, dl_manager): """Returns SplitGenerators.""" - data_dir = dl_manager.download(_URL) + data = dl_manager.download(_URL) + splits = dl_manager.download(_SPLITS_URL) return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ - "filepath": data_dir, + "filepath": 
data, "split": "train", + "split_file": splits, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data, + "split": "dev", + "split_file": splits, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data, + "split": "test", + "split_file": splits, }, ), ] def _generate_examples( - self, filepath, split # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + self, + filepath, + split, + split_file, # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` ): dominant_classes = ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"] + with open(split_file, encoding="utf-8") as f_split: + if split == "train": + split_ids = json.load(f_split)[split] + split_ids = np.array(split_ids)[:, 0] + else: + split_ids = json.load(f_split)[split] + + split_ids = set(split_ids) + with open(filepath, encoding="utf-8") as f: for id_, row in enumerate(f): data = json.loads(row) - if data["answer_type"] not in dominant_classes: - yield id_, { - "id": data["id"], - "question": data["question"], - "short_answer": data["short_answer"], - "answer": data["answer"], - "answer_type": -1, - } - else: - yield id_, { - "id": data["id"], - "question": data["question"], - "short_answer": data["short_answer"], - "answer": data["answer"], - "answer_type": data["answer_type"], - } + if data["id"] in split_ids: + if data["answer_type"] not in dominant_classes: + yield id_, { + "id": data["id"], + "question": data["question"], + "short_answer": data["short_answer"], + "answer": data["answer"], + "answer_type": -1, + } + else: + yield id_, { + "id": data["id"], + "question": data["question"], + "short_answer": data["short_answer"], + "answer": data["answer"], + "answer_type": data["answer_type"], + }