Skip to content

Commit 74f87cc

Browse files
test data added, dataset_infos updated (#2263)
* test data added, dataset_infos updated * make style * make style again
1 parent 80e59ef commit 74f87cc

File tree

3 files changed

+12
-10
lines changed

3 files changed

+12
-10
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 342036728, "size_in_bytes": 418602015}}
1+
{"default": {"description": " First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.\n", "citation": " @article{cruz2020investigating,\n title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},\n author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},\n journal={arXiv preprint arXiv:2010.11574},\n year={2020}\n }\n", "homepage": "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks", "license": "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "newsph_nli", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 154510599, "num_examples": 420000, "dataset_name": "newsph_nli"}, "test": {"name": "test", "num_bytes": 3283665, "num_examples": 9000, "dataset_name": "newsph_nli"}, "validation": {"name": "validation", "num_bytes": 33015530, "num_examples": 90000, "dataset_name": "newsph_nli"}}, "download_checksums": {"https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip": {"num_bytes": 76565287, "checksum": "544823dffe5b253718746ecc66d34116d918deb9886a58077447aeafe9538374"}}, "download_size": 76565287, "post_processing_size": null, "dataset_size": 190809794, "size_in_bytes": 267375081}}
0 Bytes
Binary file not shown.

datasets/newsph_nli/newsph_nli.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,22 +21,24 @@
2121

2222

2323
_DESCRIPTION = """\
24-
First benchmark dataset for sentence entailment in the low-resource Filipino language. Constructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs, in 70-15-15 split for training, validation, and testing.
24+
First benchmark dataset for sentence entailment in the low-resource Filipino language.
25+
Constructed through exploiting the structure of news articles. Contains 600,000 premise-hypothesis pairs,
26+
in 70-15-15 split for training, validation, and testing.
2527
"""
2628

2729
_CITATION = """\
28-
@article{cruz2020investigating,
29-
title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
30-
author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
31-
journal={arXiv preprint arXiv:2010.11574},
32-
year={2020}
33-
}
30+
@article{cruz2020investigating,
31+
title={Investigating the True Performance of Transformers in Low-Resource Languages: A Case Study in Automatic Corpus Creation},
32+
author={Jan Christian Blaise Cruz and Jose Kristian Resabal and James Lin and Dan John Velasco and Charibeth Cheng},
33+
journal={arXiv preprint arXiv:2010.11574},
34+
year={2020}
35+
}
3436
"""
3537

3638
_HOMEPAGE = "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks"
3739

3840
# TODO: Add the licence for the dataset here if you can find it
39-
_LICENSE = ""
41+
_LICENSE = "Filipino-Text-Benchmarks is licensed under the GNU General Public License v3.0"
4042

4143
_URL = "https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph-nli.zip"
4244

@@ -68,7 +70,7 @@ def _split_generators(self, dl_manager):
6870
data_dir = dl_manager.download_and_extract(_URL)
6971
download_path = os.path.join(data_dir, "newsph-nli")
7072
train_path = os.path.join(download_path, "train.csv")
71-
test_path = os.path.join(download_path, "train.csv")
73+
test_path = os.path.join(download_path, "test.csv")
7274
validation_path = os.path.join(download_path, "valid.csv")
7375

7476
return [

0 commit comments

Comments
 (0)