huggingface · albertvillanova · Jul 13, 2021 · Jul 13, 2021
diff --git a/datasets/lj_speech/README.md b/datasets/lj_speech/README.md
@@ -9,16 +9,16 @@ licenses:
 - other-public-domain
 multilinguality:
 - monolingual
+paperswithcode_id: ljspeech
+pretty_name: LJ Speech
 size_categories:
 - 10K<n<100K
 source_datasets:
 - original
 task_categories:
-- other
+- speech-processing
 task_ids:
-- other-other-automatic-speech-recognition
-- other-other-text-to-speech
-paperswithcode_id: ljspeech
+- automatic-speech-recognition
 ---
 
 # Dataset Card for lj_speech

diff --git a/datasets/lj_speech/dataset_infos.json b/datasets/lj_speech/dataset_infos.json
@@ -1 +1 @@
-{"main": {"description": "This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading \npassages from 7 non-fiction books in English. A transcription is provided for each clip. Clips vary in length \nfrom 1 to 10 seconds and have a total length of approximately 24 hours.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .wav format and is not converted to a float32 array. To convert the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@misc{ljspeech17,\n  author       = {Keith Ito and Linda Johnson},\n  title        = {The LJ Speech Dataset},\n  howpublished = {\\url{https://keithito.com/LJ-Speech-Dataset/}},\n  year         = 2017\n}\n", "homepage": "https://keithito.com/LJ-Speech-Dataset/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "normalized_text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "builder_name": "lj_speech", "config_name": "main", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4667022, "num_examples": 13100, "dataset_name": "lj_speech"}}, "download_checksums": {"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2": {"num_bytes": 2748572632, "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5"}}, "download_size": 2748572632, "post_processing_size": null, "dataset_size": 4667022, "size_in_bytes": 2753239654}}
+{"main": {"description": "This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading \npassages from 7 non-fiction books in English. A transcription is provided for each clip. Clips vary in length \nfrom 1 to 10 seconds and have a total length of approximately 24 hours.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .wav format and is not converted to a float32 array. To convert the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@misc{ljspeech17,\n  author       = {Keith Ito and Linda Johnson},\n  title        = {The LJ Speech Dataset},\n  howpublished = {\\url{https://keithito.com/LJ-Speech-Dataset/}},\n  year         = 2017\n}\n", "homepage": "https://keithito.com/LJ-Speech-Dataset/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "normalized_text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "lj_speech", "config_name": "main", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4667022, "num_examples": 13100, "dataset_name": "lj_speech"}}, "download_checksums": {"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2": {"num_bytes": 2748572632, "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5"}}, "download_size": 2748572632, "post_processing_size": null, "dataset_size": 4667022, "size_in_bytes": 2753239654}}
diff --git a/datasets/lj_speech/lj_speech.py b/datasets/lj_speech/lj_speech.py
@@ -21,6 +21,7 @@
 import os
 
 import datasets
+from datasets.tasks import AutomaticSpeechRecognition
 
 
 _CITATION = """\
@@ -81,6 +82,7 @@ def _info(self):
             supervised_keys=("file", "text"),
             homepage=_URL,
             citation=_CITATION,
+            task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
         )
 
     def _split_generators(self, dl_manager):
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"main": {"description": "This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading \npassages from 7 non-fiction books in English. A transcription is provided for each clip. Clips vary in length \nfrom 1 to 10 seconds and have a total length of approximately 24 hours.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .wav format and is not converted to a float32 array. To convert the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@misc{ljspeech17,\n author = {Keith Ito and Linda Johnson},\n title = {The LJ Speech Dataset},\n howpublished = {\\url{https://keithito.com/LJ-Speech-Dataset/}},\n year = 2017\n}\n", "homepage": "https://keithito.com/LJ-Speech-Dataset/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "normalized_text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "builder_name": "lj_speech", "config_name": "main", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4667022, "num_examples": 13100, "dataset_name": "lj_speech"}}, "download_checksums": {"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2": {"num_bytes": 2748572632, "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5"}}, "download_size": 2748572632, "post_processing_size": null, "dataset_size": 4667022, "size_in_bytes": 2753239654}}
		{"main": {"description": "This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading \npassages from 7 non-fiction books in English. A transcription is provided for each clip. Clips vary in length \nfrom 1 to 10 seconds and have a total length of approximately 24 hours.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .wav format and is not converted to a float32 array. To convert the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@misc{ljspeech17,\n author = {Keith Ito and Linda Johnson},\n title = {The LJ Speech Dataset},\n howpublished = {\\url{https://keithito.com/LJ-Speech-Dataset/}},\n year = 2017\n}\n", "homepage": "https://keithito.com/LJ-Speech-Dataset/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "normalized_text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "lj_speech", "config_name": "main", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4667022, "num_examples": 13100, "dataset_name": "lj_speech"}}, "download_checksums": {"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2": {"num_bytes": 2748572632, "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5"}}, "download_size": 2748572632, "post_processing_size": null, "dataset_size": 4667022, "size_in_bytes": 2753239654}}