huggingface · lhoestq · May 7, 2021 · May 3, 2021 · May 4, 2021
diff --git a/datasets/openslr/README.md b/datasets/openslr/README.md
@@ -21,6 +21,12 @@ languages:
   - ne
   SLR44:
   - su
+  SLR52:
+  - si
+  SLR53:
+  - bn
+  SLR54:
+  - ne
   SLR63:
   - ml
   SLR64:
@@ -211,6 +217,42 @@ https://github.com/google/language-resources#license for license information.
 
 Copyright 2016, 2017, 2018 Google LLC
 
+#### SLR52: Large Sinhala ASR training data set.
+This data set contains transcribed audio data for Sinhala (~185K utterances). The data set consists of wave files,
+and a TSV file. The file utt_spk_text.tsv contains a FileID, UserID and the transcription of audio in the file.
+
+The data set has been manually quality checked, but there might still be errors.
+
+The data set is distributed under the Creative Commons Attribution-ShareAlike 4.0 International Public License.
+See the [LICENSE](https://www.openslr.org/resources/52/LICENSE) file and
+https://github.com/google/language-resources#license for license information.
+
+Copyright 2016, 2017, 2018 Google, Inc.
+
+#### SLR53: Large Bengali ASR training data set.
+This data set contains transcribed audio data for Bengali (~196K utterances). The data set consists of wave files,
+and a TSV file. The file utt_spk_text.tsv contains a FileID, UserID and the transcription of audio in the file.
+
+The data set has been manually quality checked, but there might still be errors.
+
+The data set is distributed under the Creative Commons Attribution-ShareAlike 4.0 International Public License.
+See the [LICENSE](https://www.openslr.org/resources/53/LICENSE) file and
+https://github.com/google/language-resources#license for license information.
+
+Copyright 2016, 2017, 2018 Google, Inc.
+
+#### SLR54: Large Nepali ASR training data set.
+This data set contains transcribed audio data for Nepali (~157K utterances). The data set consists of wave files,
+and a TSV file. The file utt_spk_text.tsv contains a FileID, UserID and the transcription of audio in the file.
+
+The data set has been manually quality checked, but there might still be errors.
+
+The data set is distributed under the Creative Commons Attribution-ShareAlike 4.0 International Public License.
+See the [LICENSE](https://www.openslr.org/resources/54/LICENSE) file and
+https://github.com/google/language-resources#license for license information.
+
+Copyright 2016, 2017, 2018 Google, Inc.
+
 #### SLR63: Crowdsourced high-quality Malayalam multi-speaker speech data set
 This data set contains transcribed high-quality audio of Malayalam sentences recorded by volunteers. The data set 
 consists of wave files, and a TSV file (line_index.tsv). The file line_index.tsv contains a anonymized FileID and 
@@ -481,7 +523,7 @@ Afrikaans, Sesotho, Setswana and isiXhosa.
 
 A typical data point comprises the path to the audio file, called path and its sentence. 
 
-#### SLR32, SLR35, SLR36, SLR41, SLR42, SLR43, SLR44, SLR63, SLR64, SLR65, SLR66, SLR69, SLR70, SLR71, SLR72, SLR73, SLR74, SLR75, SLR76, SLR77, SLR78, SLR79, SLR80, SLR86
+#### SLR32, SLR35, SLR36, SLR41, SLR42, SLR43, SLR44, SLR52, SLR53, SLR54, SLR63, SLR64, SLR65, SLR66, SLR69, SLR70, SLR71, SLR72, SLR73, SLR74, SLR75, SLR76, SLR77, SLR78, SLR79, SLR80, SLR86
 ```
 {
   'path': '/home/cahya/.cache/huggingface/datasets/downloads/extracted/4d9cf915efc21110199074da4d492566dee6097068b07a680f670fcec9176e62/su_id_female/wavs/suf_00297_00037352660.wav'
@@ -568,7 +610,7 @@ The speech material has only train dataset.
 }
 ```
 
-#### SLR35, SLR36
+#### SLR35, SLR36, SLR52, SLR53, SLR54
 ```
 @inproceedings{kjartansson-etal-sltu2018,
     title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},

diff --git a/datasets/openslr/dataset_infos.json b/datasets/openslr/dataset_infos.json
diff --git a/datasets/openslr/dummy/SLR52/0.0.0/dummy_data.zip b/datasets/openslr/dummy/SLR52/0.0.0/dummy_data.zip
diff --git a/datasets/openslr/dummy/SLR53/0.0.0/dummy_data.zip b/datasets/openslr/dummy/SLR53/0.0.0/dummy_data.zip
diff --git a/datasets/openslr/dummy/SLR54/0.0.0/dummy_data.zip b/datasets/openslr/dummy/SLR54/0.0.0/dummy_data.zip
diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py
@@ -39,7 +39,7 @@
     URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}
 }
 
-SLR35, SLR36:
+SLR35, SLR36, SLR52, SLR53, SLR54:
 @inproceedings{kjartansson-etal-sltu2018,
     title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},
     author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
@@ -256,6 +256,84 @@
         "IndexFiles": ["su_id_female/line_index.tsv", "su_id_male/line_index.tsv"],
         "DataDirs": ["su_id_female/wavs", "su_id_male/wavs"],
     },
+    "SLR52": {
+        "Language": "Sinhala",
+        "LongName": "Large Sinhala ASR training data set",
+        "Category": "Speech",
+        "Summary": "Sinhala ASR training data set containing ~185K utterances",
+        "Files": [
+            "asr_sinhala_0.zip",
+            "asr_sinhala_1.zip",
+            "asr_sinhala_2.zip",
+            "asr_sinhala_3.zip",
+            "asr_sinhala_4.zip",
+            "asr_sinhala_5.zip",
+            "asr_sinhala_6.zip",
+            "asr_sinhala_7.zip",
+            "asr_sinhala_8.zip",
+            "asr_sinhala_9.zip",
+            "asr_sinhala_a.zip",
+            "asr_sinhala_b.zip",
+            "asr_sinhala_c.zip",
+            "asr_sinhala_d.zip",
+            "asr_sinhala_e.zip",
+            "asr_sinhala_f.zip",
+        ],
+        "IndexFiles": ["asr_sinhala/utt_spk_text.tsv"] * 16,
+        "DataDirs": ["asr_sinhala/data"] * 16,
+    },
+    "SLR53": {
+        "Language": "Bengali",
+        "LongName": "Large Bengali ASR training data set",
+        "Category": "Speech",
+        "Summary": "Bengali ASR training data set containing ~196K utterances",
+        "Files": [
+            "asr_bengali_0.zip",
+            "asr_bengali_1.zip",
+            "asr_bengali_2.zip",
+            "asr_bengali_3.zip",
+            "asr_bengali_4.zip",
+            "asr_bengali_5.zip",
+            "asr_bengali_6.zip",
+            "asr_bengali_7.zip",
+            "asr_bengali_8.zip",
+            "asr_bengali_9.zip",
+            "asr_bengali_a.zip",
+            "asr_bengali_b.zip",
+            "asr_bengali_c.zip",
+            "asr_bengali_d.zip",
+            "asr_bengali_e.zip",
+            "asr_bengali_f.zip",
+        ],
+        "IndexFiles": ["asr_bengali/utt_spk_text.tsv"] * 16,
+        "DataDirs": ["asr_bengali/data"] * 16,
+    },
+    "SLR54": {
+        "Language": "Nepali",
+        "LongName": "Large Nepali ASR training data set",
+        "Category": "Speech",
+        "Summary": "Nepali ASR training data set containing ~157K utterances",
+        "Files": [
+            "asr_nepali_0.zip",
+            "asr_nepali_1.zip",
+            "asr_nepali_2.zip",
+            "asr_nepali_3.zip",
+            "asr_nepali_4.zip",
+            "asr_nepali_5.zip",
+            "asr_nepali_6.zip",
+            "asr_nepali_7.zip",
+            "asr_nepali_8.zip",
+            "asr_nepali_9.zip",
+            "asr_nepali_a.zip",
+            "asr_nepali_b.zip",
+            "asr_nepali_c.zip",
+            "asr_nepali_d.zip",
+            "asr_nepali_e.zip",
+            "asr_nepali_f.zip",
+        ],
+        "IndexFiles": ["asr_nepali/utt_spk_text.tsv"] * 16,
+        "DataDirs": ["asr_nepali/data"] * 16,
+    },
     "SLR63": {
         "Language": "Malayalam",
         "LongName": "Crowdsourced high-quality Malayalam multi-speaker speech data set",
@@ -493,7 +571,7 @@ def _generate_examples(self, path_to_indexs, path_to_datas):
         """Yields examples."""
 
         counter = -1
-        if self.config.name in ["SLR35", "SLR36"]:
+        if self.config.name in ["SLR35", "SLR36", "SLR52", "SLR53", "SLR54"]:
             sentence_index = {}
             for i, path_to_index in enumerate(path_to_indexs):
                 with open(path_to_index, encoding="utf-8") as f: