diff --git a/datasets/squad/README.md b/datasets/squad/README.md index 45618cb3985..c22de2e9f5a 100644 --- a/datasets/squad/README.md +++ b/datasets/squad/README.md @@ -1,4 +1,5 @@ --- +pretty_name: SQuAD annotations_creators: - crowdsourced language_creators: diff --git a/datasets/squad/squad.py b/datasets/squad/squad.py index 8f91dc18de6..6880da1e80c 100644 --- a/datasets/squad/squad.py +++ b/datasets/squad/squad.py @@ -120,15 +120,15 @@ def _generate_examples(self, filepath): with open(filepath, encoding="utf-8") as f: squad = json.load(f) for article in squad["data"]: - title = article.get("title", "").strip() + title = article.get("title", "") for paragraph in article["paragraphs"]: - context = paragraph["context"].strip() + context = paragraph["context"] # do not strip leading blank spaces GH-2585 for qa in paragraph["qas"]: - question = qa["question"].strip() + question = qa["question"] id_ = qa["id"] answer_starts = [answer["answer_start"] for answer in qa["answers"]] - answers = [answer["text"].strip() for answer in qa["answers"]] + answers = [answer["text"] for answer in qa["answers"]] # Features currently used are "context", "question", and "answers". # Others are extracted here for the ease of future expansions. diff --git a/datasets/squad_v2/README.md b/datasets/squad_v2/README.md index 58b7ed2b2f5..47ccdf8a794 100644 --- a/datasets/squad_v2/README.md +++ b/datasets/squad_v2/README.md @@ -1,4 +1,5 @@ --- +pretty_name: SQuAD2.0 annotations_creators: - crowdsourced language_creators: diff --git a/datasets/squad_v2/squad_v2.py b/datasets/squad_v2/squad_v2.py index e1091fca7e7..a754d5160d9 100644 --- a/datasets/squad_v2/squad_v2.py +++ b/datasets/squad_v2/squad_v2.py @@ -109,15 +109,15 @@ def _generate_examples(self, filepath): with open(filepath, encoding="utf-8") as f: squad = json.load(f) for example in squad["data"]: - title = example.get("title", "").strip() + title = example.get("title", "") for paragraph in example["paragraphs"]: - context = paragraph["context"].strip() + context = paragraph["context"] # do not strip leading blank spaces GH-2585 for qa in paragraph["qas"]: - question = qa["question"].strip() + question = qa["question"] id_ = qa["id"] answer_starts = [answer["answer_start"] for answer in qa["answers"]] - answers = [answer["text"].strip() for answer in qa["answers"]] + answers = [answer["text"] for answer in qa["answers"]] # Features currently used are "context", "question", and "answers". # Others are extracted here for the ease of future expansions.