Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions datasets/gooaq/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ task_categories:
task_ids:
- open-domain-qa
paperswithcode_id: gooaq
pretty_name: 'GooAQ: Open Question Answering with Diverse Answer Types'
---

# Dataset Card for GooAQ
Expand Down Expand Up @@ -108,11 +109,14 @@ Here is the dominant types in the current dataset:

### Data Splits

This dataset is split into train set. Number of samples in train set is given below:
The number of samples in the train/validation/test sets is given below:

| Split | Number of samples |
|------------|-------------------|
| Train | 3112679 |
| Validation | 2500 |
| Test | 2500 |

| | Train |
| ----- | ------ |
| Gooaq | 5030530|

## Dataset Creation

Expand Down
2 changes: 1 addition & 1 deletion datasets/gooaq/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n title={GooAQ: Open Question Answering with Diverse Answer Types},\n author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n journal={arXiv preprint},\n year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1102827066, "num_examples": 5030530, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/qoogle.jsonl": {"num_bytes": 1467162788, "checksum": 
"7c57029dbac90db21c7abcb3dcdbf9cd9f83f9a1d24815a2d8c0663fe13e4a17"}}, "download_size": 1467162788, "post_processing_size": null, "dataset_size": 1102827066, "size_in_bytes": 2569989854}}
{"default": {"description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.\n", "citation": "@article{gooaq2021,\n title={GooAQ: Open Question Answering with Diverse Answer Types},\n author={Khashabi, Daniel and Ng, Amos and Khot, Tushar and Sabharwal, Ashish and Hajishirzi, Hannaneh and Callison-Burch, Chris},\n journal={arXiv preprint},\n year={2021}\n}\n", "homepage": "https://github.com/allenai/gooaq", "license": "Licensed under the Apache License, Version 2.0", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "short_answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "answer_type": {"num_classes": 6, "names": ["feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gooaq", "config_name": "default", "version": {"version_str": "1.2.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 974320061, "num_examples": 3112679, "dataset_name": "gooaq"}, "validation": {"name": "validation", "num_bytes": 444553, "num_examples": 2500, "dataset_name": "gooaq"}, "test": {"name": 
"test", "num_bytes": 445810, "num_examples": 2500, "dataset_name": "gooaq"}}, "download_checksums": {"https://github.com/allenai/gooaq/raw/main/data/gooaq.jsonl": {"num_bytes": 1920133810, "checksum": "d68007293be8740a7a7388efa8ea30ae5a3232d18a340a63f0190e07942d9da2"}, "https://github.com/allenai/gooaq/raw/main/data/split.json": {"num_bytes": 191225091, "checksum": "728921af66afb7b2c04466795e595586ad1f92bbd15a879b47fc59aaca8826db"}}, "download_size": 2111358901, "post_processing_size": null, "dataset_size": 975210424, "size_in_bytes": 3086569325}}
Binary file removed datasets/gooaq/dummy/1.1.0/dummy_data.zip
Binary file not shown.
Binary file added datasets/gooaq/dummy/1.2.0/dummy_data.zip
Binary file not shown.
77 changes: 56 additions & 21 deletions datasets/gooaq/gooaq.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

import json

import numpy as np

import datasets


Expand All @@ -43,13 +45,15 @@

_LICENSE = "Licensed under the Apache License, Version 2.0"

# JSON-lines data file downloaded by `_split_generators`.
# (The earlier `qoogle.jsonl` assignment was immediately overwritten, so only
# the effective value is kept.)
_URL = "https://github.com/allenai/gooaq/raw/main/data/gooaq.jsonl"

# JSON split file downloaded by `_split_generators` and passed on as `split_file`.
_SPLITS_URL = "https://github.com/allenai/gooaq/raw/main/data/split.json"


class Gooaq(datasets.GeneratorBasedBuilder):
"""GooAQ - Question-answers, collected from Google"""

VERSION = datasets.Version("1.1.0")
VERSION = datasets.Version("1.2.0")

def _info(self):
features = datasets.Features(
Expand Down Expand Up @@ -83,39 +87,70 @@ def _info(self):
def _split_generators(self, dl_manager):
    """Returns SplitGenerators.

    Downloads the data file (`_URL`) and the split file (`_SPLITS_URL`) once
    each and builds one `SplitGenerator` per split. Every generator receives
    the same two downloaded paths; only the `split` key differs.

    Args:
        dl_manager: download manager; only its `download` method is used here.

    Returns:
        A list with the train, validation and test `datasets.SplitGenerator`s.
    """

    data = dl_manager.download(_URL)
    splits = dl_manager.download(_SPLITS_URL)

    # (split name, key handed to `_generate_examples` as `split`).
    # Note that the validation split is looked up under the key "dev".
    split_keys = (
        (datasets.Split.TRAIN, "train"),
        (datasets.Split.VALIDATION, "dev"),
        (datasets.Split.TEST, "test"),
    )
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={
                "filepath": data,
                "split": split_key,
                "split_file": splits,
            },
        )
        for split_name, split_key in split_keys
    ]

def _generate_examples(
    self,
    filepath,
    split,
    split_file,  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
):
    """Yield `(key, example)` pairs for the rows of `filepath` that belong to `split`.

    Args:
        filepath: path to a JSON-lines file; every line is parsed with `json.loads`.
        split: key looked up in the split file ("train", "dev" or "test" as
            passed by `_split_generators`).
        split_file: path to a JSON file mapping split keys to lists of ids.

    Yields:
        `(line_index, example)` where `example` holds the row's `id`,
        `question`, `short_answer`, `answer` and `answer_type`. An
        `answer_type` outside the dominant classes is replaced by -1.
    """
    dominant_classes = ("feat_snip", "collection", "knowledge", "unit_conv", "time_conv", "curr_conv")

    with open(split_file, encoding="utf-8") as f_split:
        split_entries = json.load(f_split)[split]

    if split == "train":
        # Train entries are sequences whose first element is the id (the
        # original selected column 0). Picking element 0 directly avoids the
        # dtype coercion of a NumPy round-trip and also works for an empty list.
        split_ids = {entry[0] for entry in split_entries}
    else:
        split_ids = set(split_entries)

    with open(filepath, encoding="utf-8") as f:
        for id_, row in enumerate(f):
            data = json.loads(row)

            # Only rows whose id is listed for the requested split are emitted.
            if data["id"] not in split_ids:
                continue

            answer_type = data["answer_type"]
            if answer_type not in dominant_classes:
                answer_type = -1

            yield id_, {
                "id": data["id"],
                "question": data["question"],
                "short_answer": data["short_answer"],
                "answer": data["answer"],
                "answer_type": answer_type,
            }