Skip to content

Commit 3d1564e

Browse files
authored
Fix kor nli csv reader (#855)
* iter by line instead of csv reader
* update infos
* style
1 parent d21457e commit 3d1564e

File tree

2 files changed, with 9 additions and 8 deletions.

2 files changed

+9
-8
lines changed
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
{"multi_nli": {"description": " Korean Natural Language Inference datasets\n", "citation": "@article{ham2020kornli,\n title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n journal={arXiv preprint arXiv:2004.03289},\n year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "gold_label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "multi_nli", "version": {"version_str": "1.0.0", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 87169599, "num_examples": 385494, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": "b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "dataset_size": 87169599, "size_in_bytes": 129282831}}
1+
{"multi_nli": {"description": " Korean Natural Language Inference datasets\n", "citation": "@article{ham2020kornli,\n title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n journal={arXiv preprint arXiv:2004.03289},\n year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "neutral", "contradiction"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "multi_nli", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 84729207, "num_examples": 392702, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": "b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "post_processing_size": null, "dataset_size": 84729207, "size_in_bytes": 126842439}, "snli": {"description": " Korean Natural Language Inference datasets\n", "citation": "@article{ham2020kornli,\n title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n journal={arXiv preprint arXiv:2004.03289},\n year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "neutral", 
"contradiction"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "snli", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 80137097, "num_examples": 550152, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": "b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "post_processing_size": null, "dataset_size": 80137097, "size_in_bytes": 122250329}, "xnli": {"description": " Korean Natural Language Inference datasets\n", "citation": "@article{ham2020kornli,\n title={KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding},\n author={Ham, Jiyeon and Choe, Yo Joong and Park, Kyubyong and Choi, Ilji and Soh, Hyungjoon},\n journal={arXiv preprint arXiv:2004.03289},\n year={2020}\n}\n", "homepage": "https://github.com/kakaobrain/KorNLUDatasets", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "neutral", "contradiction"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "kor_nli", "config_name": "xnli", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 518830, "num_examples": 2490, "dataset_name": "kor_nli"}, "test": {"name": "test", "num_bytes": 1047437, "num_examples": 5010, "dataset_name": "kor_nli"}}, "download_checksums": {"https://github.com/kakaobrain/KorNLUDatasets/archive/master.zip": {"num_bytes": 42113232, "checksum": 
"b1184d5e78a7d988400eabe3374b8a7e2abf182896f54e6e311c5173bb2c9bf5"}}, "download_size": 42113232, "post_processing_size": null, "dataset_size": 1566267, "size_in_bytes": 43679499}}

datasets/kor_nli/kor_nli.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22

33
from __future__ import absolute_import, division, print_function
44

5-
import csv
65
import os
76

87
import datasets
@@ -58,9 +57,9 @@ def _info(self):
5857
features=datasets.Features(
5958
{
6059
# These are the features of your dataset like images, labels ...
61-
"sentence1": datasets.Value("string"),
62-
"sentence2": datasets.Value("string"),
63-
"gold_label": datasets.Value("string"),
60+
"premise": datasets.Value("string"),
61+
"hypothesis": datasets.Value("string"),
62+
"label": datasets.ClassLabel(names=["entailment", "neutral", "contradiction"]),
6463
}
6564
),
6665
# If there's a common (input, target) tuple from the features,
@@ -113,9 +112,11 @@ def _generate_examples(self, filepath):
113112
"""Yields examples."""
114113
# TODO(kor_nli): Yields (key, example) tuples from the dataset
115114
with open(filepath, encoding="utf-8") as f:
116-
data = csv.DictReader(f, dialect="excel-tab")
117-
for id_, row in enumerate(data):
118-
115+
next(f) # skip headers
116+
columns = ("premise", "hypothesis", "label")
117+
for id_, row in enumerate(f):
118+
row = row.strip().split("\t")
119119
if len(row) != 3:
120120
continue
121+
row = dict(zip(columns, row))
121122
yield id_, row

0 commit comments

Comments
 (0)