diff --git a/datasets/superb/README.md b/datasets/superb/README.md index 74c3e1f38f8..a58c15d41da 100644 --- a/datasets/superb/README.md +++ b/datasets/superb/README.md @@ -16,6 +16,7 @@ source_datasets: - original - extended|librispeech_asr - extended|other-librimix +- extended|other-speech_commands task_categories: - speech-processing task_ids: @@ -183,8 +184,14 @@ An example from each split looks like: #### ks -[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) +An example from each split looks like: +```python +{ + 'file': '/path/yes/af7a8296_nohash_1.wav', + 'label': 0 # 'yes' +} +``` #### qbe @@ -252,8 +259,8 @@ An example from each split looks like: #### ks -[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) - +- `file` (`string`): Path to the WAV audio file. +- `label` (`ClassLabel`): Label of the spoken command, stored as an integer class index. Possible values: `0: "yes", 1: "no", 2: "up", 3: "down", 4: "left", 5: "right", 6: "on", 7: "off", 8: "stop", 9: "go", 10: "_silence_", 11: "_unknown_"`. #### qbe @@ -311,8 +318,9 @@ The data fields in all splits are: #### ks -[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) - +| | train | validation | test | +|----|------:|-----------:|-----:| +| ks | 51094 | 6798 | 3081 | #### qbe @@ -447,4 +455,4 @@ the correct citation for each contained dataset. ### Contributions -Thanks to [@lewtun](https://github.com/lewtun) and [@albertvillanova](https://github.com/albertvillanova) for adding this dataset. +Thanks to [@lewtun](https://github.com/lewtun), [@albertvillanova](https://github.com/albertvillanova) and [@anton-l](https://github.com/anton-l) for adding this dataset. 
diff --git a/datasets/superb/dataset_infos.json b/datasets/superb/dataset_infos.json index 8bc72c50993..1bc50411835 100644 --- a/datasets/superb/dataset_infos.json +++ b/datasets/superb/dataset_infos.json @@ -1 +1 @@ -{"asr": {"description": "Self-supervised learning (SSL) has proven vital for advancing research in\nnatural language processing (NLP) and computer vision (CV). The paradigm\npretrains a shared model on large volumes of unlabeled data and achieves\nstate-of-the-art (SOTA) for various tasks with minimal adaptation. However, the\nspeech processing community lacks a similar setup to systematically explore the\nparadigm. To bridge this gap, we introduce Speech processing Universal\nPERformance Benchmark (SUPERB). SUPERB is a leaderboard to benchmark the\nperformance of a shared model across a wide range of speech processing tasks\nwith minimal architecture changes and labeled data. Among multiple usages of the\nshared model, we especially focus on extracting the representation learned from\nSSL due to its preferable re-usability. We present a simple framework to solve\nSUPERB tasks by learning task-specialized lightweight prediction heads on top of\nthe frozen shared model. Our results demonstrate that the framework is promising\nas SSL representations show competitive generalizability and accessibility\nacross SUPERB tasks. We release SUPERB as a challenge with a leaderboard and a\nbenchmark toolkit to fuel the research in representation learning and general\nspeech processing.\n\nNote that in order to limit the required storage for preparing this dataset, the\naudio is stored in the .flac format and is not converted to a float32 array. 
To\nconvert, the audio file to a float32 array, please make use of the `.map()`\nfunction as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@article{DBLP:journals/corr/abs-2105-01051,\n author = {Shu{-}Wen Yang and\n Po{-}Han Chi and\n Yung{-}Sung Chuang and\n Cheng{-}I Jeff Lai and\n Kushal Lakhotia and\n Yist Y. Lin and\n Andy T. Liu and\n Jiatong Shi and\n Xuankai Chang and\n Guan{-}Ting Lin and\n Tzu{-}Hsien Huang and\n Wei{-}Cheng Tseng and\n Ko{-}tik Lee and\n Da{-}Rong Liu and\n Zili Huang and\n Shuyan Dong and\n Shang{-}Wen Li and\n Shinji Watanabe and\n Abdelrahman Mohamed and\n Hung{-}yi Lee},\n title = {{SUPERB:} Speech processing Universal PERformance Benchmark},\n journal = {CoRR},\n volume = {abs/2105.01051},\n year = {2021},\n url = {https://arxiv.org/abs/2105.01051},\n archivePrefix = {arXiv},\n eprint = {2105.01051},\n timestamp = {Thu, 01 Jul 2021 13:30:22 +0200},\n biburl = {https://dblp.org/rec/journals/corr/abs-2105-01051.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "superb", "config_name": "asr", "version": {"version_str": "1.9.0", "description": "", "major": 1, "minor": 9, "patch": 0}, "splits": {"train": 
{"name": "train", "num_bytes": 11823891, "num_examples": 28539, "dataset_name": "superb"}, "validation": {"name": "validation", "num_bytes": 894510, "num_examples": 2703, "dataset_name": "superb"}, "test": {"name": "test", "num_bytes": 868614, "num_examples": 2620, "dataset_name": "superb"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}}, "download_size": 7071899769, "post_processing_size": null, "dataset_size": 13587015, "size_in_bytes": 7085486784}, "sd": {"description": "Self-supervised learning (SSL) has proven vital for advancing research in\nnatural language processing (NLP) and computer vision (CV). The paradigm\npretrains a shared model on large volumes of unlabeled data and achieves\nstate-of-the-art (SOTA) for various tasks with minimal adaptation. However, the\nspeech processing community lacks a similar setup to systematically explore the\nparadigm. To bridge this gap, we introduce Speech processing Universal\nPERformance Benchmark (SUPERB). SUPERB is a leaderboard to benchmark the\nperformance of a shared model across a wide range of speech processing tasks\nwith minimal architecture changes and labeled data. Among multiple usages of the\nshared model, we especially focus on extracting the representation learned from\nSSL due to its preferable re-usability. We present a simple framework to solve\nSUPERB tasks by learning task-specialized lightweight prediction heads on top of\nthe frozen shared model. 
Our results demonstrate that the framework is promising\nas SSL representations show competitive generalizability and accessibility\nacross SUPERB tasks. We release SUPERB as a challenge with a leaderboard and a\nbenchmark toolkit to fuel the research in representation learning and general\nspeech processing.\n\nNote that in order to limit the required storage for preparing this dataset, the\naudio is stored in the .flac format and is not converted to a float32 array. To\nconvert, the audio file to a float32 array, please make use of the `.map()`\nfunction as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@article{DBLP:journals/corr/abs-2105-01051,\n author = {Shu{-}Wen Yang and\n Po{-}Han Chi and\n Yung{-}Sung Chuang and\n Cheng{-}I Jeff Lai and\n Kushal Lakhotia and\n Yist Y. Lin and\n Andy T. 
Liu and\n Jiatong Shi and\n Xuankai Chang and\n Guan{-}Ting Lin and\n Tzu{-}Hsien Huang and\n Wei{-}Cheng Tseng and\n Ko{-}tik Lee and\n Da{-}Rong Liu and\n Zili Huang and\n Shuyan Dong and\n Shang{-}Wen Li and\n Shinji Watanabe and\n Abdelrahman Mohamed and\n Hung{-}yi Lee},\n title = {{SUPERB:} Speech processing Universal PERformance Benchmark},\n journal = {CoRR},\n volume = {abs/2105.01051},\n year = {2021},\n url = {https://arxiv.org/abs/2105.01051},\n archivePrefix = {arXiv},\n eprint = {2105.01051},\n timestamp = {Thu, 01 Jul 2021 13:30:22 +0200},\n biburl = {https://dblp.org/rec/journals/corr/abs-2105-01051.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "homepage": "https://github.com/ftshijt/LibriMix", "license": "", "features": {"record_id": {"dtype": "string", "id": null, "_type": "Value"}, "file": {"dtype": "string", "id": null, "_type": "Value"}, "start": {"dtype": "int64", "id": null, "_type": "Value"}, "end": {"dtype": "int64", "id": null, "_type": "Value"}, "speakers": [{"speaker_id": {"dtype": "string", "id": null, "_type": "Value"}, "start": {"dtype": "int64", "id": null, "_type": "Value"}, "end": {"dtype": "int64", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "superb", "config_name": "sd", "version": {"version_str": "1.9.0", "description": "", "major": 1, "minor": 9, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4302290, "num_examples": 13901, "dataset_name": "superb"}, "dev": {"name": "dev", "num_bytes": 791150, "num_examples": 3014, "dataset_name": "superb"}, "test": {"name": "test", "num_bytes": 778757, "num_examples": 3002, "dataset_name": "superb"}}, "download_checksums": {"https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/train/reco2dur": {"num_bytes": 540906, "checksum": "879dca4b1108c93bd86df879463fca15a4de42a0f95a7e6987138dc6029b5554"}, 
"https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/train/segments": {"num_bytes": 5723993, "checksum": "f19cb0ecc342f8d2cd855118879a111822d7cf55fcd078ef156f5147233a8e11"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/train/utt2spk": {"num_bytes": 3165995, "checksum": "a4295726caf05d72f5ad24706180b9dbccffe6c0c2fc0128ca4b02b7b828a28a"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/train/wav.zip": {"num_bytes": 5706733518, "checksum": "4231070427ffbc9b3bddae874dba32f3985a0db0b0feb4dfa29ed4d1d11bf41b"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/dev/reco2dur": {"num_bytes": 115918, "checksum": "a30fd59ad01db0315a82cad7a64baea009e6c2bcdfb6b2501bc8873ede72de06"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/dev/segments": {"num_bytes": 673006, "checksum": "2b977917e7ab9feec03afb4fd6a4662df90e48dbcc42977a4b9c89c8d40432ee"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/dev/utt2spk": {"num_bytes": 374794, "checksum": "9f47a7bed76e7a03e57d66ba9cc5f57d85d91f748d0b1eb20301d09e6c24cd20"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/dev/wav.zip": {"num_bytes": 765594100, "checksum": "e28b3422ce59e2a5273be924e6ed6b8f115c0983db1997e56441973c27ee1cd8"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/test/reco2dur": {"num_bytes": 113357, "checksum": "6e013d917015031e2f1383871b52dfc1122e7b16cdee53bd8e5e0a7fbc57e406"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/test/segments": {"num_bytes": 650742, "checksum": "92f8de0f56c55a34e9111542c24ea13f2d2efaf9ebe64af31250cadab020f987"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/test/utt2spk": {"num_bytes": 361548, "checksum": "19dcb558aa886f0d553d8d9b8735ea1998b83e96d5245e5511cb732c84625ffd"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/test/wav.zip": {"num_bytes": 706322334, "checksum": 
"9c8ee97d3068759c0101bf88684abab77183374dbb3bb40f7c0b25d385992ea6"}}, "download_size": 7190370211, "post_processing_size": null, "dataset_size": 5872197, "size_in_bytes": 7196242408}} \ No newline at end of file +{"asr": {"description": "Self-supervised learning (SSL) has proven vital for advancing research in\nnatural language processing (NLP) and computer vision (CV). The paradigm\npretrains a shared model on large volumes of unlabeled data and achieves\nstate-of-the-art (SOTA) for various tasks with minimal adaptation. However, the\nspeech processing community lacks a similar setup to systematically explore the\nparadigm. To bridge this gap, we introduce Speech processing Universal\nPERformance Benchmark (SUPERB). SUPERB is a leaderboard to benchmark the\nperformance of a shared model across a wide range of speech processing tasks\nwith minimal architecture changes and labeled data. Among multiple usages of the\nshared model, we especially focus on extracting the representation learned from\nSSL due to its preferable re-usability. We present a simple framework to solve\nSUPERB tasks by learning task-specialized lightweight prediction heads on top of\nthe frozen shared model. Our results demonstrate that the framework is promising\nas SSL representations show competitive generalizability and accessibility\nacross SUPERB tasks. We release SUPERB as a challenge with a leaderboard and a\nbenchmark toolkit to fuel the research in representation learning and general\nspeech processing.\n\nNote that in order to limit the required storage for preparing this dataset, the\naudio is stored in the .flac format and is not converted to a float32 array. 
To\nconvert, the audio file to a float32 array, please make use of the `.map()`\nfunction as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@article{DBLP:journals/corr/abs-2105-01051,\n author = {Shu{-}Wen Yang and\n Po{-}Han Chi and\n Yung{-}Sung Chuang and\n Cheng{-}I Jeff Lai and\n Kushal Lakhotia and\n Yist Y. Lin and\n Andy T. Liu and\n Jiatong Shi and\n Xuankai Chang and\n Guan{-}Ting Lin and\n Tzu{-}Hsien Huang and\n Wei{-}Cheng Tseng and\n Ko{-}tik Lee and\n Da{-}Rong Liu and\n Zili Huang and\n Shuyan Dong and\n Shang{-}Wen Li and\n Shinji Watanabe and\n Abdelrahman Mohamed and\n Hung{-}yi Lee},\n title = {{SUPERB:} Speech processing Universal PERformance Benchmark},\n journal = {CoRR},\n volume = {abs/2105.01051},\n year = {2021},\n url = {https://arxiv.org/abs/2105.01051},\n archivePrefix = {arXiv},\n eprint = {2105.01051},\n timestamp = {Thu, 01 Jul 2021 13:30:22 +0200},\n biburl = {https://dblp.org/rec/journals/corr/abs-2105-01051.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "superb", "config_name": "asr", "version": {"version_str": "1.9.0", "description": "", "major": 1, "minor": 9, "patch": 0}, "splits": {"train": 
{"name": "train", "num_bytes": 11852430, "num_examples": 28539, "dataset_name": "superb"}, "validation": {"name": "validation", "num_bytes": 897213, "num_examples": 2703, "dataset_name": "superb"}, "test": {"name": "test", "num_bytes": 871234, "num_examples": 2620, "dataset_name": "superb"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}}, "download_size": 7071899769, "post_processing_size": null, "dataset_size": 13620877, "size_in_bytes": 7085520646}, "sd": {"description": "Self-supervised learning (SSL) has proven vital for advancing research in\nnatural language processing (NLP) and computer vision (CV). The paradigm\npretrains a shared model on large volumes of unlabeled data and achieves\nstate-of-the-art (SOTA) for various tasks with minimal adaptation. However, the\nspeech processing community lacks a similar setup to systematically explore the\nparadigm. To bridge this gap, we introduce Speech processing Universal\nPERformance Benchmark (SUPERB). SUPERB is a leaderboard to benchmark the\nperformance of a shared model across a wide range of speech processing tasks\nwith minimal architecture changes and labeled data. Among multiple usages of the\nshared model, we especially focus on extracting the representation learned from\nSSL due to its preferable re-usability. We present a simple framework to solve\nSUPERB tasks by learning task-specialized lightweight prediction heads on top of\nthe frozen shared model. 
Our results demonstrate that the framework is promising\nas SSL representations show competitive generalizability and accessibility\nacross SUPERB tasks. We release SUPERB as a challenge with a leaderboard and a\nbenchmark toolkit to fuel the research in representation learning and general\nspeech processing.\n\nNote that in order to limit the required storage for preparing this dataset, the\naudio is stored in the .flac format and is not converted to a float32 array. To\nconvert, the audio file to a float32 array, please make use of the `.map()`\nfunction as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@article{DBLP:journals/corr/abs-2105-01051,\n author = {Shu{-}Wen Yang and\n Po{-}Han Chi and\n Yung{-}Sung Chuang and\n Cheng{-}I Jeff Lai and\n Kushal Lakhotia and\n Yist Y. Lin and\n Andy T. 
Liu and\n Jiatong Shi and\n Xuankai Chang and\n Guan{-}Ting Lin and\n Tzu{-}Hsien Huang and\n Wei{-}Cheng Tseng and\n Ko{-}tik Lee and\n Da{-}Rong Liu and\n Zili Huang and\n Shuyan Dong and\n Shang{-}Wen Li and\n Shinji Watanabe and\n Abdelrahman Mohamed and\n Hung{-}yi Lee},\n title = {{SUPERB:} Speech processing Universal PERformance Benchmark},\n journal = {CoRR},\n volume = {abs/2105.01051},\n year = {2021},\n url = {https://arxiv.org/abs/2105.01051},\n archivePrefix = {arXiv},\n eprint = {2105.01051},\n timestamp = {Thu, 01 Jul 2021 13:30:22 +0200},\n biburl = {https://dblp.org/rec/journals/corr/abs-2105-01051.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "homepage": "https://github.com/ftshijt/LibriMix", "license": "", "features": {"record_id": {"dtype": "string", "id": null, "_type": "Value"}, "file": {"dtype": "string", "id": null, "_type": "Value"}, "start": {"dtype": "int64", "id": null, "_type": "Value"}, "end": {"dtype": "int64", "id": null, "_type": "Value"}, "speakers": [{"speaker_id": {"dtype": "string", "id": null, "_type": "Value"}, "start": {"dtype": "int64", "id": null, "_type": "Value"}, "end": {"dtype": "int64", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "superb", "config_name": "sd", "version": {"version_str": "1.9.0", "description": "", "major": 1, "minor": 9, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4622013, "num_examples": 13901, "dataset_name": "superb"}, "dev": {"name": "dev", "num_bytes": 860472, "num_examples": 3014, "dataset_name": "superb"}, "test": {"name": "test", "num_bytes": 847803, "num_examples": 3002, "dataset_name": "superb"}}, "download_checksums": {"https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/train/reco2dur": {"num_bytes": 540906, "checksum": "879dca4b1108c93bd86df879463fca15a4de42a0f95a7e6987138dc6029b5554"}, 
"https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/train/segments": {"num_bytes": 5723993, "checksum": "f19cb0ecc342f8d2cd855118879a111822d7cf55fcd078ef156f5147233a8e11"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/train/utt2spk": {"num_bytes": 3165995, "checksum": "a4295726caf05d72f5ad24706180b9dbccffe6c0c2fc0128ca4b02b7b828a28a"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/train/wav.zip": {"num_bytes": 5706733518, "checksum": "4231070427ffbc9b3bddae874dba32f3985a0db0b0feb4dfa29ed4d1d11bf41b"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/dev/reco2dur": {"num_bytes": 115918, "checksum": "a30fd59ad01db0315a82cad7a64baea009e6c2bcdfb6b2501bc8873ede72de06"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/dev/segments": {"num_bytes": 673006, "checksum": "2b977917e7ab9feec03afb4fd6a4662df90e48dbcc42977a4b9c89c8d40432ee"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/dev/utt2spk": {"num_bytes": 374794, "checksum": "9f47a7bed76e7a03e57d66ba9cc5f57d85d91f748d0b1eb20301d09e6c24cd20"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/dev/wav.zip": {"num_bytes": 765594100, "checksum": "e28b3422ce59e2a5273be924e6ed6b8f115c0983db1997e56441973c27ee1cd8"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/test/reco2dur": {"num_bytes": 113357, "checksum": "6e013d917015031e2f1383871b52dfc1122e7b16cdee53bd8e5e0a7fbc57e406"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/test/segments": {"num_bytes": 650742, "checksum": "92f8de0f56c55a34e9111542c24ea13f2d2efaf9ebe64af31250cadab020f987"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/test/utt2spk": {"num_bytes": 361548, "checksum": "19dcb558aa886f0d553d8d9b8735ea1998b83e96d5245e5511cb732c84625ffd"}, "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/test/wav.zip": {"num_bytes": 706322334, "checksum": 
"9c8ee97d3068759c0101bf88684abab77183374dbb3bb40f7c0b25d385992ea6"}}, "download_size": 7190370211, "post_processing_size": null, "dataset_size": 6330288, "size_in_bytes": 7196700499}, "ks": {"description": "Self-supervised learning (SSL) has proven vital for advancing research in\nnatural language processing (NLP) and computer vision (CV). The paradigm\npretrains a shared model on large volumes of unlabeled data and achieves\nstate-of-the-art (SOTA) for various tasks with minimal adaptation. However, the\nspeech processing community lacks a similar setup to systematically explore the\nparadigm. To bridge this gap, we introduce Speech processing Universal\nPERformance Benchmark (SUPERB). SUPERB is a leaderboard to benchmark the\nperformance of a shared model across a wide range of speech processing tasks\nwith minimal architecture changes and labeled data. Among multiple usages of the\nshared model, we especially focus on extracting the representation learned from\nSSL due to its preferable re-usability. We present a simple framework to solve\nSUPERB tasks by learning task-specialized lightweight prediction heads on top of\nthe frozen shared model. Our results demonstrate that the framework is promising\nas SSL representations show competitive generalizability and accessibility\nacross SUPERB tasks. We release SUPERB as a challenge with a leaderboard and a\nbenchmark toolkit to fuel the research in representation learning and general\nspeech processing.\n\nNote that in order to limit the required storage for preparing this dataset, the\naudio is stored in the .flac format and is not converted to a float32 array. 
To\nconvert, the audio file to a float32 array, please make use of the `.map()`\nfunction as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@article{DBLP:journals/corr/abs-2105-01051,\n author = {Shu{-}Wen Yang and\n Po{-}Han Chi and\n Yung{-}Sung Chuang and\n Cheng{-}I Jeff Lai and\n Kushal Lakhotia and\n Yist Y. Lin and\n Andy T. Liu and\n Jiatong Shi and\n Xuankai Chang and\n Guan{-}Ting Lin and\n Tzu{-}Hsien Huang and\n Wei{-}Cheng Tseng and\n Ko{-}tik Lee and\n Da{-}Rong Liu and\n Zili Huang and\n Shuyan Dong and\n Shang{-}Wen Li and\n Shinji Watanabe and\n Abdelrahman Mohamed and\n Hung{-}yi Lee},\n title = {{SUPERB:} Speech processing Universal PERformance Benchmark},\n journal = {CoRR},\n volume = {abs/2105.01051},\n year = {2021},\n url = {https://arxiv.org/abs/2105.01051},\n archivePrefix = {arXiv},\n eprint = {2105.01051},\n timestamp = {Thu, 01 Jul 2021 13:30:22 +0200},\n biburl = {https://dblp.org/rec/journals/corr/abs-2105-01051.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "homepage": "https://www.tensorflow.org/datasets/catalog/speech_commands", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 12, "names": ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "_silence_", "_unknown_"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "label"}, "task_templates": null, "builder_name": "superb", "config_name": "ks", "version": {"version_str": "1.9.0", "description": "", "major": 1, "minor": 9, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8467781, "num_examples": 51094, "dataset_name": "superb"}, "validation": {"name": "validation", 
"num_bytes": 1126476, "num_examples": 6798, "dataset_name": "superb"}, "test": {"name": "test", "num_bytes": 510619, "num_examples": 3081, "dataset_name": "superb"}}, "download_checksums": {"http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz": {"num_bytes": 1489096277, "checksum": "743935421bb51cccdb6bdd152e04c5c70274e935c82119ad7faeec31780d811d"}, "http://download.tensorflow.org/data/speech_commands_test_set_v0.01.tar.gz": {"num_bytes": 71271436, "checksum": "baa084f6b62c91de660ff0588ae4dfc4e4d534aa99ac0e5f406cba75836cbd00"}}, "download_size": 1560367713, "post_processing_size": null, "dataset_size": 10104876, "size_in_bytes": 1570472589}} \ No newline at end of file diff --git a/datasets/superb/dummy/ks/1.9.0/dummy_data.zip b/datasets/superb/dummy/ks/1.9.0/dummy_data.zip new file mode 100644 index 00000000000..fcf5e1dc882 Binary files /dev/null and b/datasets/superb/dummy/ks/1.9.0/dummy_data.zip differ diff --git a/datasets/superb/superb.py b/datasets/superb/superb.py index cd50c62d441..3f9bb7c5777 100644 --- a/datasets/superb/superb.py +++ b/datasets/superb/superb.py @@ -148,6 +148,41 @@ class Superb(datasets.GeneratorBasedBuilder): data_url="http://www.openslr.org/resources/12/", task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")], ), + SuperbConfig( + name="ks", + description=textwrap.dedent( + """\ + Keyword Spotting (KS) detects preregistered keywords by classifying utterances into a predefined set of + words. The task is usually performed on-device for the fast response time. Thus, accuracy, model size, and + inference time are all crucial. SUPERB uses the widely used [Speech Commands dataset v1.0] for the task. + The dataset consists of ten classes of keywords, a class for silence, and an unknown class to include the + false positive. 
The evaluation metric is accuracy (ACC)""" + ), + features=datasets.Features( + { + "file": datasets.Value("string"), + "label": datasets.ClassLabel( + names=[ + "yes", + "no", + "up", + "down", + "left", + "right", + "on", + "off", + "stop", + "go", + "_silence_", + "_unknown_", + ] + ), + } + ), + supervised_keys=("file", "label"), + url="https://www.tensorflow.org/datasets/catalog/speech_commands", + data_url="http://download.tensorflow.org/data/{filename}", + ), SuperbConfig( name="sd", description=textwrap.dedent( @@ -206,6 +241,25 @@ def _split_generators(self, dl_manager): ), datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"]}), ] + elif self.config.name == "ks": + _DL_URLS = { + "train_val_test": self.config.data_url.format(filename="speech_commands_v0.01.tar.gz"), + "test": self.config.data_url.format(filename="speech_commands_test_set_v0.01.tar.gz"), + } + archive_path = dl_manager.download_and_extract(_DL_URLS) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "val"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"], "split": "test"} + ), + ] elif self.config.name == "sd": splits = ["train", "dev", "test"] _DL_URLS = { @@ -244,6 +298,19 @@ def _generate_examples(self, archive_path, split=None): "text": transcript, } key += 1 + elif self.config.name == "ks": + words = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"] + splits = _split_ks_files(archive_path, split) + for key, audio_file in enumerate(sorted(splits[split])): + base_dir, file_name = os.path.split(audio_file) + _, word = os.path.split(base_dir) + if word in words: + label = word + elif word == "_silence_" or word == 
"_background_noise_": + label = "_silence_" + else: + label = "_unknown_" + yield key, {"file": audio_file, "label": label} elif self.config.name == "sd": data = SdData(archive_path) args = SdArgs() @@ -383,3 +450,26 @@ def _get_speakers(rec, data, args): } for segment in data.segments[rec] ] + + +def _split_ks_files(archive_path, split): + audio_path = os.path.join(archive_path, "**/*.wav") + audio_paths = glob.glob(audio_path) + if split == "test": + # use all available files for the test archive + return {"test": audio_paths} + + val_list_file = os.path.join(archive_path, "validation_list.txt") + test_list_file = os.path.join(archive_path, "testing_list.txt") + with open(val_list_file, encoding="utf-8") as f: + val_paths = f.read().strip().splitlines() + val_paths = [os.path.join(archive_path, p) for p in val_paths] + with open(test_list_file, encoding="utf-8") as f: + test_paths = f.read().strip().splitlines() + test_paths = [os.path.join(archive_path, p) for p in test_paths] + + # the paths for the train set is just whichever paths that do not exist in + # either the test or validation splits + train_paths = list(set(audio_paths) - set(val_paths) - set(test_paths)) + + return {"train": train_paths, "val": val_paths}