Skip to content

Commit e34e5cd

Browse files
Update GooAQ (#2792)
* update gooaq * update README * add pretty name * dummy data changed * update version * remove 1.1.0 dummy_data
1 parent b153141 commit e34e5cd

File tree

5 files changed

+65
-26
lines changed

5 files changed

+65
-26
lines changed

datasets/gooaq/README.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ task_categories:
1818
task_ids:
1919
- open-domain-qa
2020
paperswithcode_id: gooaq
21+
pretty_name: 'GooAQ: Open Question Answering with Diverse Answer Types'
2122
---
2223

2324
# Dataset Card for GooAQ
@@ -108,11 +109,14 @@ Here is the dominant types in the current dataset:
108109

109110
### Data Splits
110111

111-
This dataset is split into train set. Number of samples in train set is given below:
112+
The number of samples in each of the train, validation, and test sets is given below:
113+
114+
| Split | Number of samples |
115+
|------------|-------------------|
116+
| Train | 3112679 |
117+
| Validation | 2500 |
118+
| Test | 2500 |
112119

113-
| | Train |
114-
| ----- | ------ |
115-
| Gooaq | 5030530|
116120

117121
## Dataset Creation
118122

datasets/gooaq/dataset_infos.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n title={GooAQ: Open Question Answering with Diverse Answer Types},\n author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n journal={arXiv preprint},\n year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1102827066, "num_examples": 5030530, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/qoogle.jsonl": {"num_bytes": 1467162788, "checksum": 
"7c57029dbac90db21c7abcb3dcdbf9cd9f83f9a1d24815a2d8c0663fe13e4a17"}}, "download_size": 1467162788, "post_processing_size": null, "dataset_size": 1102827066, "size_in_bytes": 2569989854}}
1+
{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n title={GooAQ: Open Question Answering with Diverse Answer Types},\n author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n journal={arXiv preprint},\n year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.2.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 974320061, "num_examples": 3112679, "dataset_name": "gooaq"}, "validation": {"name": "validation", "num_bytes": 444553, "num_examples": 2500, "dataset_name": "gooaq"}, "test": {"name": 
"test", "num_bytes": 445810, "num_examples": 2500, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/gooaq.jsonl": {"num_bytes": 1920133810, "checksum": "d68007293be8740a7a7388efa8ea30ae5a3232d18a340a63f0190e07942d9da2"}, "https://github.com/allenai/gooaq/raw/main/data/split.json": {"num_bytes": 191225091, "checksum": "728921af66afb7b2c04466795e595586ad1f92bbd15a879b47fc59aaca8826db"}}, "download_size": 2111358901, "post_processing_size": null, "dataset_size": 975210424, "size_in_bytes": 3086569325}}
-428 Bytes
Binary file not shown.
2.24 KB
Binary file not shown.

datasets/gooaq/gooaq.py

Lines changed: 56 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
import json
1919

20+
import numpy as np
21+
2022
import datasets
2123

2224

@@ -43,13 +45,15 @@
4345

4446
_LICENSE = "Licensed under the Apache License, Version 2.0"
4547

46-
_URL = "https://github.com/allenai/gooaq/raw/main/data/qoogle.jsonl"
48+
_URL = "https://github.com/allenai/gooaq/raw/main/data/gooaq.jsonl"
49+
50+
_SPLITS_URL = "https://github.com/allenai/gooaq/raw/main/data/split.json"
4751

4852

4953
class Gooaq(datasets.GeneratorBasedBuilder):
5054
"""GooAQ - Question-answers, collected from Google"""
5155

52-
VERSION = datasets.Version("1.1.0")
56+
VERSION = datasets.Version("1.2.0")
5357

5458
def _info(self):
5559
features = datasets.Features(
@@ -83,39 +87,70 @@ def _info(self):
8387
def _split_generators(self, dl_manager):
8488
"""Returns SplitGenerators."""
8589

86-
data_dir = dl_manager.download(_URL)
90+
data = dl_manager.download(_URL)
91+
splits = dl_manager.download(_SPLITS_URL)
8792
return [
8893
datasets.SplitGenerator(
8994
name=datasets.Split.TRAIN,
9095
gen_kwargs={
91-
"filepath": data_dir,
96+
"filepath": data,
9297
"split": "train",
98+
"split_file": splits,
99+
},
100+
),
101+
datasets.SplitGenerator(
102+
name=datasets.Split.VALIDATION,
103+
gen_kwargs={
104+
"filepath": data,
105+
"split": "dev",
106+
"split_file": splits,
107+
},
108+
),
109+
datasets.SplitGenerator(
110+
name=datasets.Split.TEST,
111+
gen_kwargs={
112+
"filepath": data,
113+
"split": "test",
114+
"split_file": splits,
93115
},
94116
),
95117
]
96118

97119
def _generate_examples(
98-
self, filepath, split # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
120+
self,
121+
filepath,
122+
split,
123+
split_file, # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
99124
):
100125
dominant_classes = ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"]
101126

127+
with open(split_file, encoding="utf-8") as f_split:
128+
if split == "train":
129+
split_ids = json.load(f_split)[split]
130+
split_ids = np.array(split_ids)[:, 0]
131+
else:
132+
split_ids = json.load(f_split)[split]
133+
134+
split_ids = set(split_ids)
135+
102136
with open(filepath, encoding="utf-8") as f:
103137
for id_, row in enumerate(f):
104138
data = json.loads(row)
105139

106-
if data["answer_type"] not in dominant_classes:
107-
yield id_, {
108-
"id": data["id"],
109-
"question": data["question"],
110-
"short_answer": data["short_answer"],
111-
"answer": data["answer"],
112-
"answer_type": -1,
113-
}
114-
else:
115-
yield id_, {
116-
"id": data["id"],
117-
"question": data["question"],
118-
"short_answer": data["short_answer"],
119-
"answer": data["answer"],
120-
"answer_type": data["answer_type"],
121-
}
140+
if data["id"] in split_ids:
141+
if data["answer_type"] not in dominant_classes:
142+
yield id_, {
143+
"id": data["id"],
144+
"question": data["question"],
145+
"short_answer": data["short_answer"],
146+
"answer": data["answer"],
147+
"answer_type": -1,
148+
}
149+
else:
150+
yield id_, {
151+
"id": data["id"],
152+
"question": data["question"],
153+
"short_answer": data["short_answer"],
154+
"answer": data["answer"],
155+
"answer_type": data["answer_type"],
156+
}

0 commit comments

Comments
 (0)