Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datasets/msr_sqa/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"default": {"description": "Recent work in semantic parsing for question answering has focused on long and complicated questions, many of which would seem unnatural if asked in a normal conversation between two humans. In an effort to explore a conversational QA setting, we present a more realistic task: answering sequences of simple but inter-related questions. We created SQA by asking crowdsourced workers to decompose 2,022 questions from WikiTableQuestions (WTQ), which contains highly-compositional questions about tables from Wikipedia. We had three workers decompose each WTQ question, resulting in a dataset of 6,066 sequences that contain 17,553 questions in total. Each question is also associated with answers in the form of cell locations in the tables.\n", "citation": "@inproceedings{iyyer2017search,\n title={Search-based neural structured learning for sequential question answering},\n author={Iyyer, Mohit and Yih, Wen-tau and Chang, Ming-Wei},\n booktitle={Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},\n pages={1821--1831},\n year={2017}\n}\n", "homepage": "https://msropendata.com/datasets/b25190ed-0f59-47b1-9211-5962858142c2", "license": "Microsoft Research Data License Agreement", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "annotator": {"dtype": "int32", "id": null, "_type": "Value"}, "position": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "table_file": {"dtype": "string", "id": null, "_type": "Value"}, "table_header": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "table_data": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "answer_coordinates": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, 
"length": -1, "id": null, "_type": "Sequence"}, "answer_text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "msr_sqa", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 22605449, "num_examples": 14541, "dataset_name": "msr_sqa"}, "test": {"name": "test", "num_bytes": 4924516, "num_examples": 3012, "dataset_name": "msr_sqa"}}, "download_checksums": {"https://download.microsoft.com/download/1/D/C/1DC270D2-1B53-4A61-A2E3-88AB3E4E6E1F/SQA%20Release%201.0.zip": {"num_bytes": 4796932, "checksum": "791a07ef90d6e736c186b25009d3c10cb38624b879bb668033445a3ab8892f64"}}, "download_size": 4796932, "post_processing_size": null, "dataset_size": 27529965, "size_in_bytes": 32326897}}
{"default": {"description": "Recent work in semantic parsing for question answering has focused on long and complicated questions, many of which would seem unnatural if asked in a normal conversation between two humans. In an effort to explore a conversational QA setting, we present a more realistic task: answering sequences of simple but inter-related questions. We created SQA by asking crowdsourced workers to decompose 2,022 questions from WikiTableQuestions (WTQ), which contains highly-compositional questions about tables from Wikipedia. We had three workers decompose each WTQ question, resulting in a dataset of 6,066 sequences that contain 17,553 questions in total. Each question is also associated with answers in the form of cell locations in the tables.\n", "citation": "@inproceedings{iyyer2017search,\n title={Search-based neural structured learning for sequential question answering},\n author={Iyyer, Mohit and Yih, Wen-tau and Chang, Ming-Wei},\n booktitle={Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},\n pages={1821--1831},\n year={2017}\n}\n", "homepage": "https://msropendata.com/datasets/b25190ed-0f59-47b1-9211-5962858142c2", "license": "Microsoft Research Data License Agreement", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "annotator": {"dtype": "int32", "id": null, "_type": "Value"}, "position": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "question_and_history": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "table_file": {"dtype": "string", "id": null, "_type": "Value"}, "table_header": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "table_data": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, 
"length": -1, "id": null, "_type": "Sequence"}, "answer_coordinates": {"feature": {"row_index": {"dtype": "int32", "id": null, "_type": "Value"}, "column_index": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answer_text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "msr_sqa", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 19732499, "num_examples": 12276, "dataset_name": "msr_sqa"}, "validation": {"name": "validation", "num_bytes": 3738331, "num_examples": 2265, "dataset_name": "msr_sqa"}, "test": {"name": "test", "num_bytes": 5105873, "num_examples": 3012, "dataset_name": "msr_sqa"}}, "download_checksums": {"https://download.microsoft.com/download/1/D/C/1DC270D2-1B53-4A61-A2E3-88AB3E4E6E1F/SQA%20Release%201.0.zip": {"num_bytes": 4796932, "checksum": "791a07ef90d6e736c186b25009d3c10cb38624b879bb668033445a3ab8892f64"}}, "download_size": 4796932, "post_processing_size": null, "dataset_size": 28576703, "size_in_bytes": 33373635}}
Binary file removed datasets/msr_sqa/dummy/0.0.0/dummy_data.zip
Binary file not shown.
Binary file added datasets/msr_sqa/dummy/1.0.0/dummy_data.zip
Binary file not shown.
32 changes: 24 additions & 8 deletions datasets/msr_sqa/msr_sqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@
import csv
import os

import pandas as pd

import datasets


# TODO: Add BibTeX citation
# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """\
@inproceedings{iyyer2017search,
Expand Down Expand Up @@ -60,13 +61,16 @@ def _load_table_data(table_file):

Returns:
header: a list of headers in the table.
data: 2d array of data in the table.
rows: 2d array of data in the table.
"""
with open(table_file, encoding="utf-8") as f:
lines = f.readlines()
header = lines[0].strip().split(",")
data = [line.strip().split(",") for line in lines[1:]]
return header, data
rows = []
table_data = pd.read_csv(table_file)
# the first line is header
header = list(table_data.columns)
for row_data in table_data.values:
rows.append([str(_) for _ in list(row_data)])

return header, rows


def _parse_answer_coordinates(answer_coordinate_str):
Expand Down Expand Up @@ -113,6 +117,8 @@ def _parse_answer_text(answer_text_str):
class MsrSQA(datasets.GeneratorBasedBuilder):
"""Microsoft Research Sequential Question Answering (SQA) Dataset"""

VERSION = datasets.Version("1.0.0")

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
Expand All @@ -122,6 +128,7 @@ def _info(self):
"annotator": datasets.Value("int32"),
"position": datasets.Value("int32"),
"question": datasets.Value("string"),
"question_and_history": datasets.Sequence(datasets.Value("string")),
"table_file": datasets.Value("string"),
"table_header": datasets.features.Sequence(datasets.Value("string")),
"table_data": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("string"))),
Expand All @@ -143,7 +150,11 @@ def _split_generators(self, dl_manager):
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filepath": os.path.join(data_dir, "train.tsv"), "data_dir": data_dir},
gen_kwargs={"filepath": os.path.join(data_dir, "random-split-1-train.tsv"), "data_dir": data_dir},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"filepath": os.path.join(data_dir, "random-split-1-dev.tsv"), "data_dir": data_dir},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
def _generate_examples(self, filepath, data_dir):
    """Yields examples for one split.

    Args:
        filepath: path to the split's TSV annotation file.
        data_dir: root directory containing the referenced table CSV files.

    Yields:
        (idx, item) pairs where item holds the question, its conversation
        history, the parsed answers, and the referenced table's content.
    """
    with open(filepath, encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="\t")
        question_and_history = []
        for idx, item in enumerate(reader):
            item["answer_text"] = _parse_answer_text(item["answer_text"])
            item["answer_coordinates"] = _parse_answer_coordinates(item["answer_coordinates"])
            header, table_data = _load_table_data(os.path.join(data_dir, item["table_file"]))
            item["table_header"] = header
            item["table_data"] = table_data
            # position "0" marks the first question of a new sequence.
            if item["position"] == "0":
                question_and_history = []  # reset history
            question_and_history.append(item["question"])
            # Store a COPY: the history list is reused across rows, so handing
            # out the live object would let later appends/resets mutate dicts
            # that were already yielded to the consumer.
            item["question_and_history"] = list(question_and_history)
            yield idx, item