test data added, dataset_infos updated (#2263)

bhavitvyamalik · web-flow · commit 74f87cc78845 · 2021-04-29T11:30:20.000+02:00
* test data added, dataset_infos updated

* make style

* make style again
diff --git a/datasets/newsph_nli/dataset_infos.json b/datasets/newsph_nli/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "    First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": "    @article{cruz2020investigating,\n      title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n      author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n      journal={arXiv preprint arXiv:2010.11574},\n      year={2020}\n    }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 342036728, "size_in_bytes": 418602015}}
+{"default": {"description": "    First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": "    @article{cruz2020investigating,\n      title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n      author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n      journal={arXiv preprint arXiv:2010.11574},\n      year={2020}\n    }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 3283665, "num_examples": 9000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 190809794, "size_in_bytes": 267375081}}
diff --git a/datasets/newsph_nli/dummy/1.0.0/dummy_data.zip b/datasets/newsph_nli/dummy/1.0.0/dummy_data.zip
diff --git a/datasets/newsph_nli/newsph_nli.py b/datasets/newsph_nli/newsph_nli.py
@@ -21,22 +21,24 @@
 
 
 _DESCRIPTION = """\
-    First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.
+First benchmark dataset for sentence entailment in the low-resource Filipino language.
+Constructed through exploiting the structure of news articles. Contains 600,000 premise-hypothesis pairs,
+in 70-15-15 split for training, validation, and testing.
 """
 
 _CITATION = """\
-    @article{cruz2020investigating,
-      title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
-      author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
-      journal={arXiv preprint arXiv:2010.11574},
-      year={2020}
-    }
+@article{cruz2020investigating,
+    title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
+    author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
+    journal={arXiv preprint arXiv:2010.11574},
+    year={2020}
+}
 """
 
 _HOMEPAGE = "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks"
 
 # TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
+_LICENSE = "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0"
 
 _URL = "https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip"
 
@@ -68,7 +70,7 @@ def _split_generators(self, dl_manager):
         data_dir = dl_manager.download_and_extract(_URL)
         download_path = os.path.join(data_dir, "newsph-nli")
         train_path = os.path.join(download_path, "train.csv")
-        test_path = os.path.join(download_path, "train.csv")
+        test_path = os.path.join(download_path, "test.csv")
         validation_path = os.path.join(download_path, "valid.csv")
 
         return [

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 342036728, "size_in_bytes": 418602015}}
	`1`	+{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 3283665, "num_examples": 9000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 190809794, "size_in_bytes": 267375081}}