diff --git a/datasets/subjqa/README.md b/datasets/subjqa/README.md new file mode 100644 index 00000000000..5c896afb47a --- /dev/null +++ b/datasets/subjqa/README.md @@ -0,0 +1,288 @@ +--- +annotations_creators: +- expert-generated +language_creators: +- found +languages: +- en +licenses: +- unknown +multilinguality: +- monolingual +size_categories: +- 1K The platform provides quality control by showing the workers 5 questions at a time, out of which one is labeled by the experts. A worker who fails to maintain 70% accuracy is kicked out by the platform and his judgements are ignored ... To ensure good quality labels, we paid each worker 5 cents per annotation. + +The instructions for generating a question are shown in the following figure: + +ques_gen + +Similarly, the interface for the answer span and subjectivity labelling tasks is shown below: + +![span_collection](https://user-images.githubusercontent.com/26859204/117259223-1fda1480-ae4e-11eb-9305-658ee6e3971d.png) + +As described in the SubjQA paper, the workers assign subjectivity scores (1-5) to each question and the selected answer span. They can also indicate if a question cannot be answered from the given review. + + +#### Who are the annotators? + +Workers on the Appen platform. + +### Personal and Sensitive Information + +[Needs More Information] + +## Considerations for Using the Data + +### Social Impact of Dataset + +The SubjQA dataset can be used to develop question-answering systems that can provide better on-demand answers to e-commerce customers who are interested in subjective questions about products and services. 
+ +### Discussion of Biases + +[Needs More Information] + +### Other Known Limitations + +[Needs More Information] + +## Additional Information + +### Dataset Curators + +The people involved in creating the SubjQA dataset are the authors of the accompanying paper: + +* Johannes Bjerva, Department of Computer Science, University of Copenhagen, and Department of Computer Science, Aalborg University +* Nikita Bhutani, Megagon Labs, Mountain View +* Behzad Golshan, Megagon Labs, Mountain View +* Wang-Chiew Tan, Megagon Labs, Mountain View +* Isabelle Augenstein, Department of Computer Science, University of Copenhagen + +### Licensing Information + +The SubjQA dataset is provided "as-is", and its creators make no representation as to its accuracy. + +The SubjQA dataset is constructed based on the following datasets and thus contains subsets of their data: +* [Amazon Review Dataset](http://jmcauley.ucsd.edu/data/amazon/links.html) from UCSD + * Used for _books_, _movies_, _grocery_, and _electronics_ domains +* [The TripAdvisor Dataset](http://times.cs.uiuc.edu/~wang296/Data/) from UIUC's Database and Information Systems Laboratory + * Used for the _TripAdvisor_ domain +* [The Yelp Dataset](https://www.yelp.com/dataset) + * Used for the _restaurants_ domain + +Consequently, the data within each domain of the SubjQA dataset should be considered under the same license as the dataset it was built upon.
+ +### Citation Information + +If you are using the dataset, please cite the following in your work: +``` +@inproceedings{bjerva20subjqa, + title = "SubjQA: A Dataset for Subjectivity and Review Comprehension", + author = "Bjerva, Johannes and + Bhutani, Nikita and + Golshan, Behzad and + Tan, Wang-Chiew and + Augenstein, Isabelle", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing", + month = November, + year = "2020", + publisher = "Association for Computational Linguistics", +} +``` + +### Contributions + +Thanks to [@lewtun](https://github.com/lewtun) for adding this dataset. diff --git a/datasets/subjqa/dataset_infos.json b/datasets/subjqa/dataset_infos.json new file mode 100644 index 00000000000..8ef13ca17bb --- /dev/null +++ b/datasets/subjqa/dataset_infos.json @@ -0,0 +1 @@ +{"books": {"description": "SubjQA is a question answering dataset that focuses on subjective questions and answers.\nThe dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,\nelectronics, TripAdvisor (i.e.
hotels), and restaurants.", "citation": "@inproceedings{bjerva20subjqa,\n title = \"SubjQA: A Dataset for Subjectivity and Review Comprehension\",\n author = \"Bjerva, Johannes and\n Bhutani, Nikita and\n Golahn, Behzad and\n Tan, Wang-Chiew and\n Augenstein, Isabelle\",\n booktitle = \"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing\",\n month = November,\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "nn_mod": {"dtype": "string", "id": null, "_type": "Value"}, "nn_asp": {"dtype": "string", "id": null, "_type": "Value"}, "query_mod": {"dtype": "string", "id": null, "_type": "Value"}, "query_asp": {"dtype": "string", "id": null, "_type": "Value"}, "q_reviews_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ques_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ques_subjective": {"dtype": "bool", "id": null, "_type": "Value"}, "review_id": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "answer_start": {"dtype": "int32", "id": null, "_type": "Value"}, "answer_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ans_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ans_subjective": {"dtype": "bool", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "subjqa", "config_name": "books", "version": {"version_str": "1.1.0", "description": null, 
"major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2473128, "num_examples": 1314, "dataset_name": "subjqa"}, "test": {"name": "test", "num_bytes": 649413, "num_examples": 345, "dataset_name": "subjqa"}, "validation": {"name": "validation", "num_bytes": 460214, "num_examples": 256, "dataset_name": "subjqa"}}, "download_checksums": {"https://github.com/lewtun/SubjQA/archive/refs/heads/master.zip": {"num_bytes": 11384657, "checksum": "f3d58fd04c698fccb326b7ea4ea93098cc2186a3925f4bbad9b538ed7acc72db"}}, "download_size": 11384657, "post_processing_size": null, "dataset_size": 3582755, "size_in_bytes": 14967412}, "electronics": {"description": "SubjQA is a question answering dataset that focuses on subjective questions and answers.\nThe dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,\nelectronics, TripAdvisor (i.e. hotels), and restaurants.", "citation": "@inproceedings{bjerva20subjqa,\n title = \"SubjQA: A Dataset for Subjectivity and Review Comprehension\",\n author = \"Bjerva, Johannes and\n Bhutani, Nikita and\n Golahn, Behzad and\n Tan, Wang-Chiew and\n Augenstein, Isabelle\",\n booktitle = \"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing\",\n month = November,\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "nn_mod": {"dtype": "string", "id": null, "_type": "Value"}, "nn_asp": {"dtype": "string", "id": null, "_type": "Value"}, "query_mod": {"dtype": "string", "id": null, "_type": "Value"}, "query_asp": {"dtype": "string", "id": null, "_type": "Value"}, "q_reviews_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ques_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ques_subjective": 
{"dtype": "bool", "id": null, "_type": "Value"}, "review_id": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "answer_start": {"dtype": "int32", "id": null, "_type": "Value"}, "answer_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ans_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ans_subjective": {"dtype": "bool", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "subjqa", "config_name": "electronics", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2123648, "num_examples": 1295, "dataset_name": "subjqa"}, "test": {"name": "test", "num_bytes": 608899, "num_examples": 358, "dataset_name": "subjqa"}, "validation": {"name": "validation", "num_bytes": 419042, "num_examples": 255, "dataset_name": "subjqa"}}, "download_checksums": {"https://github.com/lewtun/SubjQA/archive/refs/heads/master.zip": {"num_bytes": 11384657, "checksum": "f3d58fd04c698fccb326b7ea4ea93098cc2186a3925f4bbad9b538ed7acc72db"}}, "download_size": 11384657, "post_processing_size": null, "dataset_size": 3151589, "size_in_bytes": 14536246}, "grocery": {"description": "SubjQA is a question answering dataset that focuses on subjective questions and answers.\nThe dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,\nelectronics, TripAdvisor (i.e. 
hotels), and restaurants.", "citation": "@inproceedings{bjerva20subjqa,\n title = \"SubjQA: A Dataset for Subjectivity and Review Comprehension\",\n author = \"Bjerva, Johannes and\n Bhutani, Nikita and\n Golahn, Behzad and\n Tan, Wang-Chiew and\n Augenstein, Isabelle\",\n booktitle = \"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing\",\n month = November,\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "nn_mod": {"dtype": "string", "id": null, "_type": "Value"}, "nn_asp": {"dtype": "string", "id": null, "_type": "Value"}, "query_mod": {"dtype": "string", "id": null, "_type": "Value"}, "query_asp": {"dtype": "string", "id": null, "_type": "Value"}, "q_reviews_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ques_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ques_subjective": {"dtype": "bool", "id": null, "_type": "Value"}, "review_id": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "answer_start": {"dtype": "int32", "id": null, "_type": "Value"}, "answer_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ans_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ans_subjective": {"dtype": "bool", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "subjqa", "config_name": "grocery", "version": {"version_str": "1.1.0", "description": null, 
"major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1317488, "num_examples": 1124, "dataset_name": "subjqa"}, "test": {"name": "test", "num_bytes": 721827, "num_examples": 591, "dataset_name": "subjqa"}, "validation": {"name": "validation", "num_bytes": 254432, "num_examples": 218, "dataset_name": "subjqa"}}, "download_checksums": {"https://github.com/lewtun/SubjQA/archive/refs/heads/master.zip": {"num_bytes": 11384657, "checksum": "f3d58fd04c698fccb326b7ea4ea93098cc2186a3925f4bbad9b538ed7acc72db"}}, "download_size": 11384657, "post_processing_size": null, "dataset_size": 2293747, "size_in_bytes": 13678404}, "movies": {"description": "SubjQA is a question answering dataset that focuses on subjective questions and answers.\nThe dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,\nelectronics, TripAdvisor (i.e. hotels), and restaurants.", "citation": "@inproceedings{bjerva20subjqa,\n title = \"SubjQA: A Dataset for Subjectivity and Review Comprehension\",\n author = \"Bjerva, Johannes and\n Bhutani, Nikita and\n Golahn, Behzad and\n Tan, Wang-Chiew and\n Augenstein, Isabelle\",\n booktitle = \"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing\",\n month = November,\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "nn_mod": {"dtype": "string", "id": null, "_type": "Value"}, "nn_asp": {"dtype": "string", "id": null, "_type": "Value"}, "query_mod": {"dtype": "string", "id": null, "_type": "Value"}, "query_asp": {"dtype": "string", "id": null, "_type": "Value"}, "q_reviews_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ques_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ques_subjective": {"dtype": 
"bool", "id": null, "_type": "Value"}, "review_id": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "answer_start": {"dtype": "int32", "id": null, "_type": "Value"}, "answer_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ans_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ans_subjective": {"dtype": "bool", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "subjqa", "config_name": "movies", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2986348, "num_examples": 1369, "dataset_name": "subjqa"}, "test": {"name": "test", "num_bytes": 620513, "num_examples": 291, "dataset_name": "subjqa"}, "validation": {"name": "validation", "num_bytes": 589663, "num_examples": 261, "dataset_name": "subjqa"}}, "download_checksums": {"https://github.com/lewtun/SubjQA/archive/refs/heads/master.zip": {"num_bytes": 11384657, "checksum": "f3d58fd04c698fccb326b7ea4ea93098cc2186a3925f4bbad9b538ed7acc72db"}}, "download_size": 11384657, "post_processing_size": null, "dataset_size": 4196524, "size_in_bytes": 15581181}, "restaurants": {"description": "SubjQA is a question answering dataset that focuses on subjective questions and answers.\nThe dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,\nelectronics, TripAdvisor (i.e. 
hotels), and restaurants.", "citation": "@inproceedings{bjerva20subjqa,\n title = \"SubjQA: A Dataset for Subjectivity and Review Comprehension\",\n author = \"Bjerva, Johannes and\n Bhutani, Nikita and\n Golahn, Behzad and\n Tan, Wang-Chiew and\n Augenstein, Isabelle\",\n booktitle = \"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing\",\n month = November,\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "nn_mod": {"dtype": "string", "id": null, "_type": "Value"}, "nn_asp": {"dtype": "string", "id": null, "_type": "Value"}, "query_mod": {"dtype": "string", "id": null, "_type": "Value"}, "query_asp": {"dtype": "string", "id": null, "_type": "Value"}, "q_reviews_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ques_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ques_subjective": {"dtype": "bool", "id": null, "_type": "Value"}, "review_id": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "answer_start": {"dtype": "int32", "id": null, "_type": "Value"}, "answer_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ans_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ans_subjective": {"dtype": "bool", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "subjqa", "config_name": "restaurants", "version": {"version_str": "1.1.0", "description": null, 
"major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1823331, "num_examples": 1400, "dataset_name": "subjqa"}, "test": {"name": "test", "num_bytes": 335453, "num_examples": 266, "dataset_name": "subjqa"}, "validation": {"name": "validation", "num_bytes": 349354, "num_examples": 267, "dataset_name": "subjqa"}}, "download_checksums": {"https://github.com/lewtun/SubjQA/archive/refs/heads/master.zip": {"num_bytes": 11384657, "checksum": "f3d58fd04c698fccb326b7ea4ea93098cc2186a3925f4bbad9b538ed7acc72db"}}, "download_size": 11384657, "post_processing_size": null, "dataset_size": 2508138, "size_in_bytes": 13892795}, "tripadvisor": {"description": "SubjQA is a question answering dataset that focuses on subjective questions and answers.\nThe dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,\nelectronics, TripAdvisor (i.e. hotels), and restaurants.", "citation": "@inproceedings{bjerva20subjqa,\n title = \"SubjQA: A Dataset for Subjectivity and Review Comprehension\",\n author = \"Bjerva, Johannes and\n Bhutani, Nikita and\n Golahn, Behzad and\n Tan, Wang-Chiew and\n Augenstein, Isabelle\",\n booktitle = \"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing\",\n month = November,\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "nn_mod": {"dtype": "string", "id": null, "_type": "Value"}, "nn_asp": {"dtype": "string", "id": null, "_type": "Value"}, "query_mod": {"dtype": "string", "id": null, "_type": "Value"}, "query_asp": {"dtype": "string", "id": null, "_type": "Value"}, "q_reviews_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ques_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ques_subjective": 
{"dtype": "bool", "id": null, "_type": "Value"}, "review_id": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "answer_start": {"dtype": "int32", "id": null, "_type": "Value"}, "answer_subj_level": {"dtype": "int64", "id": null, "_type": "Value"}, "ans_subj_score": {"dtype": "float32", "id": null, "_type": "Value"}, "is_ans_subjective": {"dtype": "bool", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "subjqa", "config_name": "tripadvisor", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1575021, "num_examples": 1165, "dataset_name": "subjqa"}, "test": {"name": "test", "num_bytes": 689508, "num_examples": 512, "dataset_name": "subjqa"}, "validation": {"name": "validation", "num_bytes": 312645, "num_examples": 230, "dataset_name": "subjqa"}}, "download_checksums": {"https://github.com/lewtun/SubjQA/archive/refs/heads/master.zip": {"num_bytes": 11384657, "checksum": "f3d58fd04c698fccb326b7ea4ea93098cc2186a3925f4bbad9b538ed7acc72db"}}, "download_size": 11384657, "post_processing_size": null, "dataset_size": 2577174, "size_in_bytes": 13961831}} \ No newline at end of file diff --git a/datasets/subjqa/dummy/books/1.1.0/dummy_data.zip b/datasets/subjqa/dummy/books/1.1.0/dummy_data.zip new file mode 100644 index 00000000000..6135c4d1f69 Binary files /dev/null and b/datasets/subjqa/dummy/books/1.1.0/dummy_data.zip differ diff --git a/datasets/subjqa/dummy/electronics/1.1.0/dummy_data.zip b/datasets/subjqa/dummy/electronics/1.1.0/dummy_data.zip new file mode 
100644 index 00000000000..10364ccb87f Binary files /dev/null and b/datasets/subjqa/dummy/electronics/1.1.0/dummy_data.zip differ diff --git a/datasets/subjqa/dummy/grocery/1.1.0/dummy_data.zip b/datasets/subjqa/dummy/grocery/1.1.0/dummy_data.zip new file mode 100644 index 00000000000..77e2a73d123 Binary files /dev/null and b/datasets/subjqa/dummy/grocery/1.1.0/dummy_data.zip differ diff --git a/datasets/subjqa/dummy/movies/1.1.0/dummy_data.zip b/datasets/subjqa/dummy/movies/1.1.0/dummy_data.zip new file mode 100644 index 00000000000..d6f6346f4d9 Binary files /dev/null and b/datasets/subjqa/dummy/movies/1.1.0/dummy_data.zip differ diff --git a/datasets/subjqa/dummy/restaurants/1.1.0/dummy_data.zip b/datasets/subjqa/dummy/restaurants/1.1.0/dummy_data.zip new file mode 100644 index 00000000000..48a0d552275 Binary files /dev/null and b/datasets/subjqa/dummy/restaurants/1.1.0/dummy_data.zip differ diff --git a/datasets/subjqa/dummy/tripadvisor/1.1.0/dummy_data.zip b/datasets/subjqa/dummy/tripadvisor/1.1.0/dummy_data.zip new file mode 100644 index 00000000000..3faea760c3e Binary files /dev/null and b/datasets/subjqa/dummy/tripadvisor/1.1.0/dummy_data.zip differ diff --git a/datasets/subjqa/subjqa.py b/datasets/subjqa/subjqa.py new file mode 100644 index 00000000000..642759c4046 --- /dev/null +++ b/datasets/subjqa/subjqa.py @@ -0,0 +1,211 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
"""SubjQA is a question answering dataset that focuses on subjective questions and answers.
The dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,
electronics, TripAdvisor (i.e. hotels), and restaurants."""


import ast
import os

import pandas as pd

import datasets


_CITATION = """\
@inproceedings{bjerva20subjqa,
    title = "SubjQA: A Dataset for Subjectivity and Review Comprehension",
    author = "Bjerva, Johannes  and
      Bhutani, Nikita  and
      Golshan, Behzad  and
      Tan, Wang-Chiew  and
      Augenstein, Isabelle",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
    month = November,
    year = "2020",
    publisher = "Association for Computational Linguistics",
}
"""

_DESCRIPTION = """SubjQA is a question answering dataset that focuses on subjective questions and answers.
The dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,
electronics, TripAdvisor (i.e. hotels), and restaurants."""

_HOMEPAGE = ""

_LICENSE = ""

_URLs = {"default": "https://github.com/lewtun/SubjQA/archive/refs/heads/master.zip"}

# Sentinel found in the ``human_ans_spans`` column for questions without an answer.
# NOTE(review): the reviews are presumably suffixed with the same token (the
# original code sliced it off every context unconditionally) -- confirm against the CSVs.
_NO_ANSWER = "ANSWERNOTFOUND"


class Subjqa(datasets.GeneratorBasedBuilder):
    """SubjQA is a question answering dataset that focuses on subjective questions and answers."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="books", version=VERSION, description="Amazon book reviews"),
        datasets.BuilderConfig(name="electronics", version=VERSION, description="Amazon electronics reviews"),
        datasets.BuilderConfig(name="grocery", version=VERSION, description="Amazon grocery reviews"),
        datasets.BuilderConfig(name="movies", version=VERSION, description="Amazon movie reviews"),
        datasets.BuilderConfig(name="restaurants", version=VERSION, description="Yelp restaurant reviews"),
        datasets.BuilderConfig(name="tripadvisor", version=VERSION, description="TripAdvisor hotel reviews"),
    ]

    # CSV columns copied verbatim onto every question / answer.  They live on the
    # class (rather than being assigned inside ``_create_paragraphs``) so that
    # ``_generate_examples`` can read them regardless of call order.
    question_meta_columns = [
        "domain",
        "nn_mod",
        "nn_asp",
        "query_mod",
        "query_asp",
        "q_reviews_id",
        "question_subj_level",
        "ques_subj_score",
        "is_ques_subjective",
        "review_id",
    ]
    answer_meta_columns = ["answer_subj_level", "ans_subj_score", "is_ans_subjective"]

    def _info(self):
        """Return the ``DatasetInfo`` describing the SQuAD-like feature schema."""
        features = datasets.Features(
            {
                "domain": datasets.Value("string"),
                "nn_mod": datasets.Value("string"),
                "nn_asp": datasets.Value("string"),
                "query_mod": datasets.Value("string"),
                "query_asp": datasets.Value("string"),
                "q_reviews_id": datasets.Value("string"),
                "question_subj_level": datasets.Value("int64"),
                "ques_subj_score": datasets.Value("float"),
                "is_ques_subjective": datasets.Value("bool"),
                "review_id": datasets.Value("string"),
                "id": datasets.Value("string"),
                "title": datasets.Value("string"),
                "context": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answers": datasets.features.Sequence(
                    {
                        "text": datasets.Value("string"),
                        "answer_start": datasets.Value("int32"),
                        "answer_subj_level": datasets.Value("int64"),
                        "ans_subj_score": datasets.Value("float"),
                        "is_ans_subjective": datasets.Value("bool"),
                    }
                ),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and return the train / test / validation split generators.

        Each generator receives the path of the CSV for the selected config
        (``self.config.name``); the validation split is read from ``dev.csv``.
        """
        data_dir = dl_manager.download_and_extract(_URLs["default"])
        split_files = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "dev"),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir, f"SubjQA-master/SubjQA/{self.config.name}/splits/{file_stem}.csv"
                    )
                },
            )
            for split, file_stem in split_files
        ]

    def _generate_examples(self, filepath):
        """Yield ``(id, example)`` pairs read from the CSV at ``filepath``.

        The CSV is first regrouped into a SQuAD-style nested structure by
        ``_convert_to_squad``; every question of every paragraph then becomes
        one flat example keyed by the question's ``id``.
        """
        df = pd.read_csv(filepath)
        squad_format = self._convert_to_squad(df)
        for example in squad_format["data"]:
            title = example.get("title", "").strip()
            for paragraph in example["paragraphs"]:
                context = paragraph["context"].strip()
                for qa in paragraph["qas"]:
                    id_ = qa["id"]
                    question_meta = {
                        column: value for column, value in qa.items() if column in self.question_meta_columns
                    }
                    # Going through a DataFrame yields one (possibly empty) list
                    # per metadata column, also when the question has no answers.
                    answer_meta = pd.DataFrame(qa["answers"], columns=self.answer_meta_columns).to_dict("list")
                    yield id_, {
                        "title": title,
                        "context": context,
                        "question": qa["question"].strip(),
                        "id": id_,
                        "answers": {
                            "answer_start": [answer["answer_start"] for answer in qa["answers"]],
                            "text": [answer["text"].strip() for answer in qa["answers"]],
                            **answer_meta,
                        },
                        **question_meta,
                    }

    def _create_paragraphs(self, df):
        """Convert the rows of ``df`` into a list of SQuAD paragraphs.

        Args:
            df: ``pandas.DataFrame`` of (question, review, answer) rows; the
                columns ``review_id``, ``review``, ``q_review_id``, ``question``,
                ``human_ans_spans`` and ``human_ans_indices`` are read.

        Returns:
            A list with one ``{"qas": [...], "context": str}`` dict per distinct
            ``review_id``, in order of first appearance.
        """
        context_suffix = " " + _NO_ANSWER
        id2review = dict(zip(df["review_id"], df["review"]))
        pars = []
        for review_id, review in id2review.items():
            # Boolean masks instead of ``df.query(f"...'{id}'")``: an id that
            # contains a quote character would corrupt a string-built expression.
            review_df = df[df["review_id"] == review_id]
            id2question = dict(zip(review_df["q_review_id"], review_df["question"]))

            qas = []
            for q_review_id, question in id2question.items():
                rows = df[df["q_review_id"] == q_review_id].to_dict(orient="list")
                # ``human_ans_indices`` holds a stringified tuple; its first
                # element is used as the answer start offset.
                answer_starts = [ast.literal_eval(indices)[0] for indices in rows["human_ans_indices"]]
                # Metadata is taken from the first matching row only.
                answer_meta = {
                    column: values[0] for column, values in rows.items() if column in self.answer_meta_columns
                }
                question_meta = {
                    column: values[0] for column, values in rows.items() if column in self.question_meta_columns
                }
                # Only fill answerable questions; the first span decides.
                if rows["human_ans_spans"][0] != _NO_ANSWER:
                    answers = [
                        {"text": text, "answer_start": answer_start, **answer_meta}
                        for text, answer_start in zip(rows["human_ans_spans"], answer_starts)
                        if text != _NO_ANSWER
                    ]
                else:
                    answers = []
                qas.append({"question": question, "id": q_review_id, "answers": answers, **question_meta})

            # Slice the trailing marker off the context, but only when it is
            # really there, so that a review without it is never truncated.
            context = review[: -len(context_suffix)] if review.endswith(context_suffix) else review
            pars.append({"qas": qas, "context": context})
        return pars

    def _convert_to_squad(self, df):
        """Convert a product-based QA ``pandas.DataFrame`` into SQuAD format.

        Rows are grouped by ``item_id``; each group becomes one record whose
        ``title`` is the item id and whose ``paragraphs`` come from
        ``_create_paragraphs``.  An empty frame yields ``{"data": []}``.
        """
        # Plain iteration over the groups replaces ``groupby(...).apply(...)``,
        # which fails on ``.to_frame`` for an empty frame and is subject to the
        # pandas deprecation of operating on the grouping column in ``apply``.
        return {
            "data": [
                {"title": item_id, "paragraphs": self._create_paragraphs(item_df)}
                for item_id, item_df in df.groupby("item_id")
            ]
        }