Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datasets/msr_sqa/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"default": {"description": "Recent work in semantic parsing for question answering has focused on long and complicated questions, many of which would seem unnatural if asked in a normal conversation between two humans. In an effort to explore a conversational QA setting, we present a more realistic task: answering sequences of simple but inter-related questions. We created SQA by asking crowdsourced workers to decompose 2,022 questions from WikiTableQuestions (WTQ), which contains highly-compositional questions about tables from Wikipedia. We had three workers decompose each WTQ question, resulting in a dataset of 6,066 sequences that contain 17,553 questions in total. Each question is also associated with answers in the form of cell locations in the tables.\n", "citation": "@inproceedings{iyyer2017search,\n title={Search-based neural structured learning for sequential question answering},\n author={Iyyer, Mohit and Yih, Wen-tau and Chang, Ming-Wei},\n booktitle={Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},\n pages={1821--1831},\n year={2017}\n}\n", "homepage": "https://msropendata.com/datasets/b25190ed-0f59-47b1-9211-5962858142c2", "license": "Microsoft Research Data License Agreement", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "annotator": {"dtype": "int32", "id": null, "_type": "Value"}, "position": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "table_file": {"dtype": "string", "id": null, "_type": "Value"}, "table_header": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "table_data": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "answer_coordinates": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, 
"length": -1, "id": null, "_type": "Sequence"}, "answer_text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "msr_sqa", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 22605449, "num_examples": 14541, "dataset_name": "msr_sqa"}, "test": {"name": "test", "num_bytes": 4924516, "num_examples": 3012, "dataset_name": "msr_sqa"}}, "download_checksums": {"https://download.microsoft.com/download/1/D/C/1DC270D2-1B53-4A61-A2E3-88AB3E4E6E1F/SQA%20Release%201.0.zip": {"num_bytes": 4796932, "checksum": "791a07ef90d6e736c186b25009d3c10cb38624b879bb668033445a3ab8892f64"}}, "download_size": 4796932, "post_processing_size": null, "dataset_size": 27529965, "size_in_bytes": 32326897}}
{"default": {"description": "Recent work in semantic parsing for question answering has focused on long and complicated questions, many of which would seem unnatural if asked in a normal conversation between two humans. In an effort to explore a conversational QA setting, we present a more realistic task: answering sequences of simple but inter-related questions. We created SQA by asking crowdsourced workers to decompose 2,022 questions from WikiTableQuestions (WTQ), which contains highly-compositional questions about tables from Wikipedia. We had three workers decompose each WTQ question, resulting in a dataset of 6,066 sequences that contain 17,553 questions in total. Each question is also associated with answers in the form of cell locations in the tables.\n", "citation": "@inproceedings{iyyer2017search,\n title={Search-based neural structured learning for sequential question answering},\n author={Iyyer, Mohit and Yih, Wen-tau and Chang, Ming-Wei},\n booktitle={Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},\n pages={1821--1831},\n year={2017}\n}\n", "homepage": "https://msropendata.com/datasets/b25190ed-0f59-47b1-9211-5962858142c2", "license": "Microsoft Research Data License Agreement", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "annotator": {"dtype": "int32", "id": null, "_type": "Value"}, "position": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "question_and_history": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "table_file": {"dtype": "string", "id": null, "_type": "Value"}, "table_header": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "table_data": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, 
"length": -1, "id": null, "_type": "Sequence"}, "answer_coordinates": {"feature": {"row_index": {"dtype": "int32", "id": null, "_type": "Value"}, "column_index": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answer_text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "msr_sqa", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 19732499, "num_examples": 12276, "dataset_name": "msr_sqa"}, "validation": {"name": "validation", "num_bytes": 3738331, "num_examples": 2265, "dataset_name": "msr_sqa"}, "test": {"name": "test", "num_bytes": 5105873, "num_examples": 3012, "dataset_name": "msr_sqa"}}, "download_checksums": {"https://download.microsoft.com/download/1/D/C/1DC270D2-1B53-4A61-A2E3-88AB3E4E6E1F/SQA%20Release%201.0.zip": {"num_bytes": 4796932, "checksum": "791a07ef90d6e736c186b25009d3c10cb38624b879bb668033445a3ab8892f64"}}, "download_size": 4796932, "post_processing_size": null, "dataset_size": 28576703, "size_in_bytes": 33373635}}
Binary file removed datasets/msr_sqa/dummy/0.0.0/dummy_data.zip
Binary file not shown.
Binary file added datasets/msr_sqa/dummy/1.0.0/dummy_data.zip
Binary file not shown.
32 changes: 24 additions & 8 deletions datasets/msr_sqa/msr_sqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@
import csv
import os

import pandas as pd

import datasets


# TODO: Add BibTeX citation
# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """\
@inproceedings{iyyer2017search,
Expand Down Expand Up @@ -60,13 +61,16 @@ def _load_table_data(table_file):

Returns:
header: a list of headers in the table.
data: 2d array of data in the table.
rows: 2d array of data in the table.
"""
with open(table_file, encoding="utf-8") as f:
lines = f.readlines()
header = lines[0].strip().split(",")
data = [line.strip().split(",") for line in lines[1:]]
return header, data
rows = []
table_data = pd.read_csv(table_file)
# the first line is header
header = list(table_data.columns)
for row_data in table_data.values:
rows.append([str(_) for _ in list(row_data)])

return header, rows


def _parse_answer_coordinates(answer_coordinate_str):
Expand Down Expand Up @@ -113,6 +117,8 @@ def _parse_answer_text(answer_text_str):
class MsrSQA(datasets.GeneratorBasedBuilder):
"""Microsoft Research Sequential Question Answering (SQA) Dataset"""

VERSION = datasets.Version("1.0.0")

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
Expand All @@ -122,6 +128,7 @@ def _info(self):
"annotator": datasets.Value("int32"),
"position": datasets.Value("int32"),
"question": datasets.Value("string"),
"question_and_history": datasets.Sequence(datasets.Value("string")),
"table_file": datasets.Value("string"),
"table_header": datasets.features.Sequence(datasets.Value("string")),
"table_data": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("string"))),
Expand All @@ -143,7 +150,11 @@ def _split_generators(self, dl_manager):
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filepath": os.path.join(data_dir, "train.tsv"), "data_dir": data_dir},
gen_kwargs={"filepath": os.path.join(data_dir, "random-split-1-train.tsv"), "data_dir": data_dir},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"filepath": os.path.join(data_dir, "random-split-1-dev.tsv"), "data_dir": data_dir},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
def _generate_examples(self, filepath, data_dir):
    """Yields examples for one split.

    Args:
        filepath: path to the split's TSV annotation file.
        data_dir: root directory containing the referenced table CSV files.

    Yields:
        (idx, item) pairs where item holds the question, its conversation
        history, the parsed answers, and the referenced table's content.
    """
    with open(filepath, encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="\t")
        question_and_history = []
        for idx, item in enumerate(reader):
            item["answer_text"] = _parse_answer_text(item["answer_text"])
            item["answer_coordinates"] = _parse_answer_coordinates(item["answer_coordinates"])
            header, table_data = _load_table_data(os.path.join(data_dir, item["table_file"]))
            item["table_header"] = header
            item["table_data"] = table_data
            # position "0" marks the first question of a new sequence.
            if item["position"] == "0":
                question_and_history = []  # reset history
            question_and_history.append(item["question"])
            # Store a COPY: the history list is reused across rows, so handing
            # out the live object would let later appends/resets mutate dicts
            # that were already yielded to the consumer.
            item["question_and_history"] = list(question_and_history)
            yield idx, item