Fix kor nli csv reader (#855)

lhoestq · web-flow · commit 3d1564eff6cd · 2020-11-16T14:59:12.000+01:00
* iterate over the file line by line instead of using csv.DictReader

* update dataset_infos.json

* style
diff --git a/datasets/kor_nli/dataset_infos.json b/datasets/kor_nli/dataset_infos.json
@@ -1 +1 @@
-{"multi_nli": {"description": " Korean Natural  Language Inference datasets\n", "citation": "@article{ham2020kornli,\n  title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n  author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n  journal={arXiv preprint arXiv:2004.03289},\n  year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "gold_label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "multi_nli", "version": {"version_str": "1.0.0", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 87169599, "num_examples": 385494, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": "b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "dataset_size": 87169599, "size_in_bytes": 129282831}}
+{"multi_nli": {"description": " Korean Natural  Language Inference datasets\n", "citation": "@article{ham2020kornli,\n  title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n  author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n  journal={arXiv preprint arXiv:2004.03289},\n  year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "neutral", "contradiction"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "multi_nli", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 84729207, "num_examples": 392702, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": "b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "post_processing_size": null, "dataset_size": 84729207, "size_in_bytes": 126842439}, "snli": {"description": " Korean Natural  Language Inference datasets\n", "citation": "@article{ham2020kornli,\n  title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n  author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n  journal={arXiv preprint arXiv:2004.03289},\n  year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", 
"neutral", "contradiction"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "snli", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 80137097, "num_examples": 550152, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": "b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "post_processing_size": null, "dataset_size": 80137097, "size_in_bytes": 122250329}, "xnli": {"description": " Korean Natural  Language Inference datasets\n", "citation": "@article{ham2020kornli,\n  title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n  author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n  journal={arXiv preprint arXiv:2004.03289},\n  year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "neutral", "contradiction"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "xnli", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 518830, "num_examples": 2490, "dataset_name": "kor_nli"}, "test": {"name": "test", "num_bytes": 1047437, "num_examples": 5010, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": 
"b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "post_processing_size": null, "dataset_size": 1566267, "size_in_bytes": 43679499}}
diff --git a/datasets/kor_nli/kor_nli.py b/datasets/kor_nli/kor_nli.py
@@ -2,7 +2,6 @@
 
 from __future__ import absolute_import, division, print_function
 
-import csv
 import os
 
 import datasets
@@ -58,9 +57,9 @@ def _info(self):
             features=datasets.Features(
                 {
                     # These are the features of your dataset like images, labels ...
-                    "sentence1": datasets.Value("string"),
-                    "sentence2": datasets.Value("string"),
-                    "gold_label": datasets.Value("string"),
+                    "premise": datasets.Value("string"),
+                    "hypothesis": datasets.Value("string"),
+                    "label": datasets.ClassLabel(names=["entailment", "neutral", "contradiction"]),
                 }
             ),
             # If there's a common (input, target) tuple from the features,
@@ -113,9 +112,11 @@ def _generate_examples(self, filepath):
         """Yields examples."""
         # TODO(kor_nli): Yields (key, example) tuples from the dataset
         with open(filepath, encoding="utf-8") as f:
-            data = csv.DictReader(f, dialect="excel-tab")
-            for id_, row in enumerate(data):
-
+            next(f)  # skip headers
+            columns = ("premise", "hypothesis", "label")
+            for id_, row in enumerate(f):
+                row = row.strip().split("\t")
                 if len(row) != 3:
                     continue
+                row = dict(zip(columns, row))
                 yield id_, row

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"multi_nli": {"description": " Korean Natural Language Inference datasets\n", "citation": "@article{ham2020kornli,\n title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n journal={arXiv preprint arXiv:2004.03289},\n year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "gold_label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "multi_nli", "version": {"version_str": "1.0.0", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 87169599, "num_examples": 385494, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": "b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "dataset_size": 87169599, "size_in_bytes": 129282831}}
	`1`	+{"multi_nli": {"description": " Korean Natural Language Inference datasets\n", "citation": "@article{ham2020kornli,\n title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n journal={arXiv preprint arXiv:2004.03289},\n year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "neutral", "contradiction"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "multi_nli", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 84729207, "num_examples": 392702, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": "b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "post_processing_size": null, "dataset_size": 84729207, "size_in_bytes": 126842439}, "snli": {"description": " Korean Natural Language Inference datasets\n", "citation": "@article{ham2020kornli,\n title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n journal={arXiv preprint arXiv:2004.03289},\n year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "neutral", 
"contradiction"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "snli", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 80137097, "num_examples": 550152, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": "b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "post_processing_size": null, "dataset_size": 80137097, "size_in_bytes": 122250329}, "xnli": {"description": " Korean Natural Language Inference datasets\n", "citation": "@article{ham2020kornli,\n title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n journal={arXiv preprint arXiv:2004.03289},\n year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "neutral", "contradiction"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "xnli", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 518830, "num_examples": 2490, "dataset_name": "kor_nli"}, "test": {"name": "test", "num_bytes": 1047437, "num_examples": 5010, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": 
"b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "post_processing_size": null, "dataset_size": 1566267, "size_in_bytes": 43679499}}