diff --git a/datasets/qasper/README.md b/datasets/qasper/README.md new file mode 100644 index 00000000000..0a28a9fd724 --- /dev/null +++ b/datasets/qasper/README.md @@ -0,0 +1,233 @@ +--- +annotations_creators: +- expert-generated +source_datasets: +- original +language_creators: +- expert-generated +languages: +- en-US +licenses: +- cc-by-4.0 +multilinguality: +- monolingual +size_categories: +- 10K 5 years of experience). The field may be empty as well, indicating the writer has chosen not to share this information. + + - "topic_background" shows how familiar the question writer was with the topic of the paper. The values are "unfamiliar", "familiar", "research" (meaning that the topic is the research area of the writer), or null. + + - "paper_read", when specified, shows whether the question writer has read the paper. + + - "search_query", if not empty, is the query the question writer used to find the abstract of the paper from a large pool of abstracts we made available to them. + +#### Fields specific to answers + +Unanswerable answers have "unanswerable" set to true. The remaining answers have exactly one of the following fields being non-empty. + + - "extractive_spans" are spans in the paper which serve as the answer. + - "free_form_answer" is a written-out answer. + - "yes_no" is true iff the answer is Yes, and false iff the answer is No. + +"evidence" is the set of paragraphs, figures or tables used to arrive at the answer. Tables or figures start with the string "FLOAT SELECTED". + +"highlighted_evidence" is the set of sentences the answer providers selected as evidence if they chose textual evidence. The text in the "evidence" field is a mapping from these sentences to the paragraph level. That is, if you see textual evidence in the "evidence" field, it is guaranteed to be entire paragraphs, while that is not the case with "highlighted_evidence". 
+ + +### Data Splits + +| | Train | Valid | +| ----- | ------ | ----- | +| Number of papers | 888 | 281 | +| Number of questions | 2593 | 1005 | +| Number of answers | 2675 | 1764 | + +## Dataset Creation + +### Curation Rationale + +[More Information Needed] + +### Source Data + +NLP papers: The full text of the papers is extracted from [S2ORC](https://huggingface.co/datasets/s2orc) (Lo et al., 2020) + +#### Initial Data Collection and Normalization + +[More Information Needed] + +#### Who are the source language producers? + +[More Information Needed] + +### Annotations + +[More Information Needed] + +#### Annotation process + +[More Information Needed] + +#### Who are the annotators? + +"The annotators are NLP practitioners, not +expert researchers, and it is likely that an expert +would score higher" + +### Personal and Sensitive Information + +[More Information Needed] + +## Considerations for Using the Data + +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases + +[More Information Needed] + +### Other Known Limitations + +[More Information Needed] + +## Additional Information + +### Dataset Curators + +Crowdsourced NLP practitioners + +### Licensing Information + +[CC BY 4.0](https://creativecommons.org/licenses/by/4.0) + +### Citation Information + +``` +@inproceedings{Dasigi2021ADO, + title={A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers}, + author={Pradeep Dasigi and Kyle Lo and Iz Beltagy and Arman Cohan and Noah A. Smith and Matt Gardner}, + year={2021} +} +``` + +### Contributions + +Thanks to [@cceyda](https://github.com/cceyda) for adding this dataset. 
diff --git a/datasets/qasper/dataset_infos.json b/datasets/qasper/dataset_infos.json new file mode 100644 index 00000000000..95cdd4a161e --- /dev/null +++ b/datasets/qasper/dataset_infos.json @@ -0,0 +1 @@ +{"qasper": {"description": "A dataset containing 1585 papers with 5049 information-seeking questions asked by regular readers of NLP papers, and answered by a separate set of NLP practitioners.\n", "citation": "@inproceedings{Dasigi2021ADO,\n title={A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers},\n author={Pradeep Dasigi and Kyle Lo and Iz Beltagy and Arman Cohan and Noah A. Smith and Matt Gardner},\n year={2021}\n}\n", "homepage": "https://allenai.org/data/qasper", "license": "CC BY 4.0", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "abstract": {"dtype": "string", "id": null, "_type": "Value"}, "full_text": {"feature": {"section_name": {"dtype": "string", "id": null, "_type": "Value"}, "paragraphs": [{"dtype": "string", "id": null, "_type": "Value"}]}, "length": -1, "id": null, "_type": "Sequence"}, "qas": {"feature": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "question_id": {"dtype": "string", "id": null, "_type": "Value"}, "nlp_background": {"dtype": "string", "id": null, "_type": "Value"}, "topic_background": {"dtype": "string", "id": null, "_type": "Value"}, "paper_read": {"dtype": "string", "id": null, "_type": "Value"}, "search_query": {"dtype": "string", "id": null, "_type": "Value"}, "question_writer": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"answer": {"unanswerable": {"dtype": "bool", "id": null, "_type": "Value"}, "extractive_spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "yes_no": {"dtype": "bool", "id": null, "_type": "Value"}, "free_form_answer": {"dtype": "string", "id": null, "_type": "Value"}, 
"evidence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "highlighted_evidence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "annotation_id": {"dtype": "string", "id": null, "_type": "Value"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "qasper", "config_name": "qasper", "version": {"version_str": "0.1.0", "description": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 27277970, "num_examples": 888, "dataset_name": "qasper"}, "validation": {"name": "validation", "num_bytes": 9535330, "num_examples": 281, "dataset_name": "qasper"}}, "download_checksums": {"https://qasper-dataset.s3-us-west-2.amazonaws.com/qasper-train-dev-v0.1.tgz": {"num_bytes": 10359737, "checksum": "cd0cb8911342966fcc3eb91947af149cb7cf80b4f253ff9a6f0333f4752080dd"}}, "download_size": 10359737, "post_processing_size": null, "dataset_size": 36813300, "size_in_bytes": 47173037}} \ No newline at end of file diff --git a/datasets/qasper/dummy/qasper/0.1.0/dummy_data.zip b/datasets/qasper/dummy/qasper/0.1.0/dummy_data.zip new file mode 100644 index 00000000000..9226e01422c Binary files /dev/null and b/datasets/qasper/dummy/qasper/0.1.0/dummy_data.zip differ diff --git a/datasets/qasper/qasper.py b/datasets/qasper/qasper.py new file mode 100644 index 00000000000..5ae27ca827f --- /dev/null +++ b/datasets/qasper/qasper.py @@ -0,0 +1,130 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Qasper: A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers."""


import json
import os

import datasets


logger = datasets.logging.get_logger(__name__)


_CITATION = """\
@inproceedings{Dasigi2021ADO,
  title={A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers},
  author={Pradeep Dasigi and Kyle Lo and Iz Beltagy and Arman Cohan and Noah A. Smith and Matt Gardner},
  year={2021}
}
"""
_LICENSE = "CC BY 4.0"
_DESCRIPTION = """\
A dataset containing 1585 papers with 5049 information-seeking questions asked by regular readers of NLP papers, and answered by a separate set of NLP practitioners.
"""

_HOMEPAGE = "https://allenai.org/data/qasper"
# Single archive; the key "data" is reused to look up the extracted directory.
_DOWNLOAD_URLS = {"data": "https://qasper-dataset.s3-us-west-2.amazonaws.com/qasper-train-dev-v0.1.tgz"}
# File names joined onto the extracted archive directory, one per split.
data_files = {"train": "qasper-train-v0.1.json", "dev": "qasper-dev-v0.1.json"}

_VERSION = "0.1.0"


class Qasper(datasets.GeneratorBasedBuilder):
    """Dataset builder for Qasper.

    Exposes a single configuration named "qasper" and produces a train and a
    validation split, each read from one JSON file of the downloaded archive.
    """

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="qasper",
            version=datasets.Version(_VERSION),
            description=_DESCRIPTION,
        )
    ]

    def _info(self):
        """Return the ``DatasetInfo`` (feature schema plus metadata) for Qasper."""

        # Small factories so every schema slot gets its own feature object,
        # exactly as if each one had been spelled out inline.
        def text():
            return datasets.Value("string")

        def flag():
            return datasets.Value("bool")

        def text_list():
            return datasets.features.Sequence(text())

        # One section of the paper body.
        section = {
            "section_name": text(),
            "paragraphs": [text()],
        }

        # Payload of a single answer.
        answer = {
            "unanswerable": flag(),
            "extractive_spans": text_list(),
            "yes_no": flag(),
            "free_form_answer": text(),
            "evidence": text_list(),
            "highlighted_evidence": text_list(),
        }

        # An answer together with its annotation bookkeeping.
        annotation = {
            "answer": answer,
            "annotation_id": text(),
            "worker_id": text(),
        }

        # A question, metadata about its writer, and all of its answers.
        question = {
            "question": text(),
            "question_id": text(),
            "nlp_background": text(),
            "topic_background": text(),
            "paper_read": text(),
            "search_query": text(),
            "question_writer": text(),
            "answers": datasets.features.Sequence(annotation),
        }

        paper = datasets.Features(
            {
                "id": text(),
                "title": text(),
                "abstract": text(),
                "full_text": datasets.features.Sequence(section),
                "qas": datasets.features.Sequence(question),
            }
        )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=paper,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then describe the two splits.

        Args:
            dl_manager: download manager whose ``download_and_extract`` is
                called with ``_DOWNLOAD_URLS``.

        Returns:
            A list with the TRAIN and VALIDATION ``SplitGenerator`` objects,
            each carrying the ``filepath`` passed to ``_generate_examples``.
        """
        archive_root = dl_manager.download_and_extract(_DOWNLOAD_URLS)["data"]
        split_to_file = (
            (datasets.Split.TRAIN, data_files["train"]),
            (datasets.Split.VALIDATION, data_files["dev"]),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={"filepath": os.path.join(archive_root, file_name)},
            )
            for split, file_name in split_to_file
        ]

    def _generate_examples(self, filepath):
        """Yield ``(paper_id, paper)`` pairs from one Qasper JSON file.

        The file is loaded in one go; each top-level key is used both as the
        example key and as the value of the "id" field written into the
        corresponding entry before it is yielded.
        """
        logger.info("generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as handle:
            papers = json.load(handle)
            for paper_id in papers:
                paper = papers[paper_id]
                paper["id"] = paper_id
                yield paper_id, paper