huggingface · lhoestq · Jun 7, 2021 · Jun 5, 2021 · Jun 5, 2021 · Jun 5, 2021
diff --git a/datasets/flores/dataset_infos.json b/datasets/flores/dataset_infos.json
@@ -1 +1 @@
-{"neen": {"description": "Evaluation datasets for low-resource machine translation: Nepali-English and Sinhala-English.\n", "citation": "@misc{guzmn2019new,\n    title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English},\n    author={Francisco Guzman and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},\n    year={2019},\n    eprint={1902.01382},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/facebookresearch/flores/", "license": "", "features": {"translation": {"languages": ["ne", "en"], "id": null, "_type": "Translation"}}, "supervised_keys": {"input": "ne", "output": "en"}, "builder_name": "flores", "config_name": "neen", "version": {"version_str": "1.1.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1000483, "num_examples": 2836, "dataset_name": "flores"}, "validation": {"name": "validation", "num_bytes": 850660, "num_examples": 2560, "dataset_name": "flores"}}, "download_checksums": {"https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz": {"num_bytes": 1542781, "checksum": "7a0245bb29fd03b46a1129831c183dfba0efc8452a9739d962759f25141aa648"}}, "download_size": 1542781, "dataset_size": 1851143, "size_in_bytes": 3393924}, "sien": {"description": "Evaluation datasets for low-resource machine translation: Nepali-English and Sinhala-English.\n", "citation": "@misc{guzmn2019new,\n    title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English},\n    author={Francisco Guzman and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},\n    year={2019},\n    eprint={1902.01382},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/facebookresearch/flores/", "license": "", "features": {"translation": {"languages": ["si", "en"], "id": null, "_type": "Translation"}}, "supervised_keys": {"input": "si", "output": "en"}, "builder_name": "flores", "config_name": "sien", "version": {"version_str": "1.1.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 984947, "num_examples": 2767, "dataset_name": "flores"}, "validation": {"name": "validation", "num_bytes": 1032610, "num_examples": 2899, "dataset_name": "flores"}}, "download_checksums": {"https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz": {"num_bytes": 1542781, "checksum": "7a0245bb29fd03b46a1129831c183dfba0efc8452a9739d962759f25141aa648"}}, "download_size": 1542781, "dataset_size": 2017557, "size_in_bytes": 3560338}}
+{"neen": {"description": "Evaluation datasets for low-resource machine translation: Nepali-English and Sinhala-English.\n", "citation": "@misc{guzmn2019new,\n    title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English},\n    author={Francisco Guzman and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},\n    year={2019},\n    eprint={1902.01382},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/facebookresearch/flores/", "license": "", "features": {"translation": {"languages": ["ne", "en"], "id": null, "_type": "Translation"}}, "post_processed": null, "supervised_keys": {"input": "ne", "output": "en"}, "task_templates": null, "builder_name": "flores", "config_name": "neen", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 849380, "num_examples": 2560, "dataset_name": "flores"}, "test": {"name": "test", "num_bytes": 999063, "num_examples": 2836, "dataset_name": "flores"}}, "download_checksums": {"https://github.com/facebookresearch/flores/raw/master/floresv1/data/wikipedia_en_ne_si_test_sets.tgz": {"num_bytes": 1542781, "checksum": "7a0245bb29fd03b46a1129831c183dfba0efc8452a9739d962759f25141aa648"}}, "download_size": 1542781, "post_processing_size": null, "dataset_size": 1848443, "size_in_bytes": 3391224}, "sien": {"description": "Evaluation datasets for low-resource machine translation: Nepali-English and Sinhala-English.\n", "citation": "@misc{guzmn2019new,\n    title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English},\n    author={Francisco Guzman and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},\n    year={2019},\n    eprint={1902.01382},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/facebookresearch/flores/", "license": "", "features": {"translation": {"languages": ["si", "en"], "id": null, "_type": "Translation"}}, "post_processed": null, "supervised_keys": {"input": "si", "output": "en"}, "task_templates": null, "builder_name": "flores", "config_name": "sien", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 1031158, "num_examples": 2899, "dataset_name": "flores"}, "test": {"name": "test", "num_bytes": 983563, "num_examples": 2767, "dataset_name": "flores"}}, "download_checksums": {"https://github.com/facebookresearch/flores/raw/master/floresv1/data/wikipedia_en_ne_si_test_sets.tgz": {"num_bytes": 1542781, "checksum": "7a0245bb29fd03b46a1129831c183dfba0efc8452a9739d962759f25141aa648"}}, "download_size": 1542781, "post_processing_size": null, "dataset_size": 2014721, "size_in_bytes": 3557502}}
diff --git a/datasets/flores/flores.py b/datasets/flores/flores.py
@@ -37,7 +37,7 @@
 }
 """
 
-_DATA_URL = "https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz"
+_DATA_URL = "https://github.com/facebookresearch/flores/raw/master/floresv1/data/wikipedia_en_ne_si_test_sets.tgz"
 
 # Tuple that describes a single pair of files with matching translations.
 # language_to_file is the map from language (2 letter string: example 'en')
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"neen": {"description": "Evaluation datasets for low-resource machine translation: Nepali-English and Sinhala-English.\n", "citation": "@misc{guzmn2019new,\n title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English},\n author={Francisco Guzman and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},\n year={2019},\n eprint={1902.01382},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/facebookresearch/flores/", "license": "", "features": {"translation": {"languages": ["ne", "en"], "id": null, "_type": "Translation"}}, "supervised_keys": {"input": "ne", "output": "en"}, "builder_name": "flores", "config_name": "neen", "version": {"version_str": "1.1.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1000483, "num_examples": 2836, "dataset_name": "flores"}, "validation": {"name": "validation", "num_bytes": 850660, "num_examples": 2560, "dataset_name": "flores"}}, "download_checksums": {"https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz": {"num_bytes": 1542781, "checksum": "7a0245bb29fd03b46a1129831c183dfba0efc8452a9739d962759f25141aa648"}}, "download_size": 1542781, "dataset_size": 1851143, "size_in_bytes": 3393924}, "sien": {"description": "Evaluation datasets for low-resource machine translation: Nepali-English and Sinhala-English.\n", "citation": "@misc{guzmn2019new,\n title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English},\n author={Francisco Guzman and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},\n year={2019},\n eprint={1902.01382},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/facebookresearch/flores/", "license": "", "features": {"translation": {"languages": ["si", "en"], "id": null, "_type": "Translation"}}, "supervised_keys": {"input": "si", "output": "en"}, "builder_name": "flores", "config_name": "sien", "version": {"version_str": "1.1.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 984947, "num_examples": 2767, "dataset_name": "flores"}, "validation": {"name": "validation", "num_bytes": 1032610, "num_examples": 2899, "dataset_name": "flores"}}, "download_checksums": {"https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz": {"num_bytes": 1542781, "checksum": "7a0245bb29fd03b46a1129831c183dfba0efc8452a9739d962759f25141aa648"}}, "download_size": 1542781, "dataset_size": 2017557, "size_in_bytes": 3560338}}
		{"neen": {"description": "Evaluation datasets for low-resource machine translation: Nepali-English and Sinhala-English.\n", "citation": "@misc{guzmn2019new,\n title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English},\n author={Francisco Guzman and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},\n year={2019},\n eprint={1902.01382},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/facebookresearch/flores/", "license": "", "features": {"translation": {"languages": ["ne", "en"], "id": null, "_type": "Translation"}}, "post_processed": null, "supervised_keys": {"input": "ne", "output": "en"}, "task_templates": null, "builder_name": "flores", "config_name": "neen", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 849380, "num_examples": 2560, "dataset_name": "flores"}, "test": {"name": "test", "num_bytes": 999063, "num_examples": 2836, "dataset_name": "flores"}}, "download_checksums": {"https://github.com/facebookresearch/flores/raw/master/floresv1/data/wikipedia_en_ne_si_test_sets.tgz": {"num_bytes": 1542781, "checksum": "7a0245bb29fd03b46a1129831c183dfba0efc8452a9739d962759f25141aa648"}}, "download_size": 1542781, "post_processing_size": null, "dataset_size": 1848443, "size_in_bytes": 3391224}, "sien": {"description": "Evaluation datasets for low-resource machine translation: Nepali-English and Sinhala-English.\n", "citation": "@misc{guzmn2019new,\n title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English},\n author={Francisco Guzman and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},\n year={2019},\n eprint={1902.01382},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/facebookresearch/flores/", "license": "", "features": {"translation": {"languages": ["si", "en"], "id": null, "_type": "Translation"}}, "post_processed": null, "supervised_keys": {"input": "si", "output": "en"}, "task_templates": null, "builder_name": "flores", "config_name": "sien", "version": {"version_str": "1.1.0", "description": "", "major": 1, "minor": 1, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 1031158, "num_examples": 2899, "dataset_name": "flores"}, "test": {"name": "test", "num_bytes": 983563, "num_examples": 2767, "dataset_name": "flores"}}, "download_checksums": {"https://github.com/facebookresearch/flores/raw/master/floresv1/data/wikipedia_en_ne_si_test_sets.tgz": {"num_bytes": 1542781, "checksum": "7a0245bb29fd03b46a1129831c183dfba0efc8452a9739d962759f25141aa648"}}, "download_size": 1542781, "post_processing_size": null, "dataset_size": 2014721, "size_in_bytes": 3557502}}