huggingface · lhoestq · Apr 23, 2021 · Apr 14, 2021 · Apr 14, 2021 · Apr 20, 2021
diff --git a/datasets/openslr/README.md b/datasets/openslr/README.md
@@ -4,6 +4,11 @@ annotations_creators:
 language_creators:
 - found
 languages:
+  SLR32:
+  - af
+  - st
+  - tn
+  - xh
   SLR35:
   - jv
   SLR36:
@@ -103,6 +108,20 @@ task_ids:
 OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition, 
 and software related to speech recognition. Currently, following resources are available: 
 
+#### SLR32: High quality TTS data for four South African languages (af, st, tn, xh).
+This data set contains multi-speaker high quality transcribed audio data for four languages of South Africa.
+The data set consists of wave files, and a TSV file transcribing the audio. In each folder, the file line_index.tsv
+contains a FileID, which in turn contains the UserID and the Transcription of audio in the file.
+
+The data set has had some quality checks, but there might still be errors.
+
+This data set was collected as a collaboration between North West University and Google.
+
+The dataset is distributed under Creative Commons Attribution-ShareAlike 4.0 International Public License.
+See https://github.com/google/language-resources#license for license information.
+
+Copyright 2017 Google, Inc.
+
 #### SLR35: Large Javanese ASR training data set.
 This data set contains transcribed audio data for Javanese (~185K utterances). The data set consists of wave files,
 and a TSV file. The file utt_spk_text.tsv contains a FileID, UserID and the transcription of audio in the file.
@@ -452,15 +471,17 @@ Copyright 2018, 2019, 2020 Google, Inc.
 
 ### Languages
 
-Javanese, Khmer, Nepali, Sundanese, Malayalam, Marathi, Tamil, Telugu, Catalan
+Javanese, Khmer, Nepali, Sundanese, Malayalam, Marathi, Tamil, Telugu, Catalan, Nigerian English, Chilean Spanish,
+Colombian Spanish, Peruvian Spanish, Puerto Rico Spanish, Venezuelan Spanish, Basque, Galician, Gujarati, Kannada,
+Afrikaans, Sesotho, Setswana and isiXhosa.
 
 ## Dataset Structure
 
 ### Data Instances
 
 A typical data point comprises the path to the audio file, called path and its sentence. 
 
-#### SLR35, SLR36, SLR41, SLR42, SLR43, SLR44, SLR63, SLR64, SLR65, SLR66, SLR69, SLR70, SLR71, SLR72, SLR73, SLR74, SLR75, SLR76, SLR77, SLR78, SLR79, SLR80, SLR86
+#### SLR32, SLR35, SLR36, SLR41, SLR42, SLR43, SLR44, SLR63, SLR64, SLR65, SLR66, SLR69, SLR70, SLR71, SLR72, SLR73, SLR74, SLR75, SLR76, SLR77, SLR78, SLR79, SLR80, SLR86
 ```
 {
   'path': '/home/cahya/.cache/huggingface/datasets/downloads/extracted/4d9cf915efc21110199074da4d492566dee6097068b07a680f670fcec9176e62/su_id_female/wavs/suf_00297_00037352660.wav'
@@ -533,6 +554,19 @@ The speech material has only train dataset.
 [More Information Needed] 
 
 ### Citation Information
+#### SLR32
+```
+@inproceedings{van-niekerk-etal-2017,
+    title = {{Rapid development of TTS corpora for four South African languages}},
+    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson and Martin Jansche and Linne Ha},
+    booktitle = {Proc. Interspeech 2017},
+    pages = {2178--2182},
+    address = {Stockholm, Sweden},
+    month = aug,
+    year  = {2017},
+    URL   = {https://dx.doi.org/10.21437/Interspeech.2017-1139}
+}
+```
 
 #### SLR35, SLR36
 ```
@@ -594,7 +628,7 @@ The speech material has only train dataset.
 }
 ```
 
-#### SLR71, SLR71, SLR72, SLR73, SLR74, SLR75
+#### SLR70, SLR71, SLR72, SLR73, SLR74, SLR75
 ```
 @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
     title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},

diff --git a/datasets/openslr/dataset_infos.json b/datasets/openslr/dataset_infos.json
diff --git a/datasets/openslr/dummy/SLR32/0.0.0/dummy_data.zip b/datasets/openslr/dummy/SLR32/0.0.0/dummy_data.zip
diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py
@@ -26,6 +26,19 @@
 _DATA_URL = "https://openslr.org/resources/{}"
 
 _CITATION = """\
+SLR32:
+@inproceedings{van-niekerk-etal-2017,
+    title = {{Rapid development of TTS corpora for four South African languages}},
+    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson
+    and Martin Jansche and Linne Ha},
+    booktitle = {Proc. Interspeech 2017},
+    pages = {2178--2182},
+    address = {Stockholm, Sweden},
+    month = aug,
+    year  = {2017},
+    URL   = {https://dx.doi.org/10.21437/Interspeech.2017-1139}
+}
+
 SLR35, SLR36:
 @inproceedings{kjartansson-etal-sltu2018,
     title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},
@@ -35,25 +48,29 @@
     address = {Gurugram, India},
     month = aug,
     pages = {52--55},
-    URL   = {http://dx.doi.org/10.21437/SLTU.2018-11},
+    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},
 }
 
 SLR41, SLR42, SLR43, SLR44:
 @inproceedings{kjartansson-etal-tts-sltu2018,
-    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
-    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
+    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,
+    Khmer, Nepali, Sinhala, and Sundanese}},
+    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu
+    De Silva and Supheakmungkol Sarin},
     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
     year  = {2018},
     address = {Gurugram, India},
     month = aug,
     pages = {66--70},
-    URL   = {http://dx.doi.org/10.21437/SLTU.2018-14}
+    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}
 }
 
 SLR63, SLR64, SLR65, SLR66, SLR78, SLR79:
 @inproceedings{he-etal-2020-open,
-  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
-  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
+  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and
+  Telugu Speech Synthesis Systems}},
+  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,
+  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
   booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
   month = may,
   year = {2020},
@@ -68,7 +85,8 @@
 @inproceedings{kjartansson-etal-2020-open,
     title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},
     author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},
-    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
+    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages
+    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
     year = {2020},
     pages = {21--27},
     month = may,
@@ -81,7 +99,8 @@
 SLR71, SLR71, SLR72, SLR73, SLR74, SLR75:
 @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
     title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
-    author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
+    author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,
+    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
     year = {2020},
     month = may,
@@ -94,8 +113,10 @@
 
 SLR80
 @inproceedings{oo-etal-2020-burmese,
-    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application to Text-to-Speech}},
-    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},
+    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application
+    to Text-to-Speech}},
+    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,
+    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},
     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
     month = may,
     year = {2020},
@@ -132,6 +153,21 @@
 _LICENSE = ""
 
 _RESOURCES = {
+    "SLR32": {
+        "Language": "South African",
+        "LongName": "High quality TTS data for four South African languages (af, st, tn, xh)",
+        "Category": "Speech",
+        "Summary": "Multi-speaker TTS data for four South African languages, Afrikaans, Sesotho, "
+        "Setswana and isiXhosa.",
+        "Files": ["af_za.tar.gz", "st_za.tar.gz", "tn_za.tar.gz", "xh_za.tar.gz"],
+        "IndexFiles": [
+            "af_za/za/afr/line_index.tsv",
+            "st_za/za/sso/line_index.tsv",
+            "tn_za/za/tsn/line_index.tsv",
+            "xh_za/za/xho/line_index.tsv",
+        ],
+        "DataDirs": ["af_za/za/afr/wavs", "st_za/za/sso/wavs", "tn_za/za/tsn/wavs", "xh_za/za/xho/wavs"],
+    },
     "SLR35": {
         "Language": "Javanese",
         "LongName": "Large Javanese ASR training data set",
@@ -395,7 +431,10 @@ def __init__(self, name, **kwargs):
         self.files = kwargs.pop("files", None)
         self.index_files = kwargs.pop("index_files", None)
         self.data_dirs = kwargs.pop("data_dirs", None)
-        description = f"Open Speech and Language Resources dataset in {self.language}. Name: {self.name}, Summary: {self.summary}."
+        description = (
+            f"Open Speech and Language Resources dataset in {self.language}. Name: {self.name}, "
+            f"Summary: {self.summary}."
+        )
         super(OpenSlrConfig, self).__init__(name=name, description=description, **kwargs)