From c5fa638db79c4fb7d0d595f634ef6597f36ab747 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 31 May 2022 10:46:59 +0200 Subject: [PATCH 1/3] Fix DuplicatedKeysError in timit_asr dataset --- datasets/timit_asr/timit_asr.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datasets/timit_asr/timit_asr.py b/datasets/timit_asr/timit_asr.py index 0336cdb952e..d92664afe07 100644 --- a/datasets/timit_asr/timit_asr.py +++ b/datasets/timit_asr/timit_asr.py @@ -129,7 +129,7 @@ def _split_generators(self, dl_manager): def _generate_examples(self, split, data_dir): """Generate examples from TIMIT archive_path based on the test/train csv information.""" # Iterating the contents of the data to extract the relevant information - for wav_path in sorted(Path(data_dir).glob(f"**/{split.upper()}/**/*.WAV")): + for id_, wav_path in enumerate(sorted(Path(data_dir).glob(f"**/{split.upper()}/**/*.WAV"))): # extract transcript with open(wav_path.with_suffix(".TXT"), encoding="utf-8") as op: @@ -160,7 +160,6 @@ def _generate_examples(self, split, data_dir): dialect_region = wav_path.parents[1].name sentence_type = wav_path.name[0:2] speaker_id = wav_path.parents[0].name[1:] - id_ = wav_path.stem example = { "file": str(wav_path), @@ -171,7 +170,7 @@ def _generate_examples(self, split, data_dir): "dialect_region": dialect_region, "sentence_type": sentence_type, "speaker_id": speaker_id, - "id": id_, + "id": wav_path.stem, } yield id_, example From 79e80b3b806263dbbc753dcd48f70134c8bd5b02 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 31 May 2022 12:12:03 +0200 Subject: [PATCH 2/3] Refactor --- datasets/timit_asr/timit_asr.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datasets/timit_asr/timit_asr.py b/datasets/timit_asr/timit_asr.py index d92664afe07..25e86bf08a9 100644 --- 
a/datasets/timit_asr/timit_asr.py +++ b/datasets/timit_asr/timit_asr.py @@ -129,7 +129,7 @@ def _split_generators(self, dl_manager): def _generate_examples(self, split, data_dir): """Generate examples from TIMIT archive_path based on the test/train csv information.""" # Iterating the contents of the data to extract the relevant information - for id_, wav_path in enumerate(sorted(Path(data_dir).glob(f"**/{split.upper()}/**/*.WAV"))): + for key, wav_path in enumerate(sorted(Path(data_dir).glob(f"**/{split.upper()}/**/*.WAV"))): # extract transcript with open(wav_path.with_suffix(".TXT"), encoding="utf-8") as op: @@ -160,6 +160,7 @@ def _generate_examples(self, split, data_dir): dialect_region = wav_path.parents[1].name sentence_type = wav_path.name[0:2] speaker_id = wav_path.parents[0].name[1:] + id_ = wav_path.stem example = { "file": str(wav_path), @@ -170,7 +171,7 @@ def _generate_examples(self, split, data_dir): "dialect_region": dialect_region, "sentence_type": sentence_type, "speaker_id": speaker_id, - "id": wav_path.stem, + "id": id_, } - yield id_, example + yield key, example From 09ec67ad1c1b20683e55106f8d1081d233089e61 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 31 May 2022 12:12:16 +0200 Subject: [PATCH 3/3] Update dataset card --- datasets/timit_asr/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/timit_asr/README.md b/datasets/timit_asr/README.md index 03b9eec559e..eba7cd2b75e 100644 --- a/datasets/timit_asr/README.md +++ b/datasets/timit_asr/README.md @@ -170,7 +170,7 @@ A typical data point comprises the path to the audio file, usually called `file` - speaker_id: Unique id of the speaker. The same speaker id can be found for multiple data samples. -- id: Unique id of the data sample. Contains the <SENTENCE_TYPE><SENTENCE_NUMBER>. +- id: ID of the data sample. Contains the <SENTENCE_TYPE><SENTENCE_NUMBER>. ### Data Splits