From c5fa638db79c4fb7d0d595f634ef6597f36ab747 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 31 May 2022 10:46:59 +0200
Subject: [PATCH 1/3] Fix DuplicatedKeysError in timit_asr dataset

---
 datasets/timit_asr/timit_asr.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/datasets/timit_asr/timit_asr.py b/datasets/timit_asr/timit_asr.py
index 0336cdb952e..d92664afe07 100644
--- a/datasets/timit_asr/timit_asr.py
+++ b/datasets/timit_asr/timit_asr.py
@@ -129,7 +129,7 @@ def _split_generators(self, dl_manager):
     def _generate_examples(self, split, data_dir):
         """Generate examples from TIMIT archive_path based on the test/train csv information."""
         # Iterating the contents of the data to extract the relevant information
-        for wav_path in sorted(Path(data_dir).glob(f"**/{split.upper()}/**/*.WAV")):
+        for id_, wav_path in enumerate(sorted(Path(data_dir).glob(f"**/{split.upper()}/**/*.WAV"))):
 
             # extract transcript
             with open(wav_path.with_suffix(".TXT"), encoding="utf-8") as op:
@@ -160,7 +160,6 @@ def _generate_examples(self, split, data_dir):
             dialect_region = wav_path.parents[1].name
             sentence_type = wav_path.name[0:2]
             speaker_id = wav_path.parents[0].name[1:]
-            id_ = wav_path.stem
 
             example = {
                 "file": str(wav_path),
@@ -171,7 +170,7 @@ def _generate_examples(self, split, data_dir):
                 "dialect_region": dialect_region,
                 "sentence_type": sentence_type,
                 "speaker_id": speaker_id,
-                "id": id_,
+                "id": wav_path.stem,
             }
 
             yield id_, example

From 79e80b3b806263dbbc753dcd48f70134c8bd5b02 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 31 May 2022 12:12:03 +0200
Subject: [PATCH 2/3] Refactor: name the example key `key` and keep `id_` for the sample id

---
 datasets/timit_asr/timit_asr.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/datasets/timit_asr/timit_asr.py b/datasets/timit_asr/timit_asr.py
index d92664afe07..25e86bf08a9 100644
--- a/datasets/timit_asr/timit_asr.py
+++ b/datasets/timit_asr/timit_asr.py
@@ -129,7 +129,7 @@ def _split_generators(self, dl_manager):
     def _generate_examples(self, split, data_dir):
         """Generate examples from TIMIT archive_path based on the test/train csv information."""
         # Iterating the contents of the data to extract the relevant information
-        for id_, wav_path in enumerate(sorted(Path(data_dir).glob(f"**/{split.upper()}/**/*.WAV"))):
+        for key, wav_path in enumerate(sorted(Path(data_dir).glob(f"**/{split.upper()}/**/*.WAV"))):
 
             # extract transcript
             with open(wav_path.with_suffix(".TXT"), encoding="utf-8") as op:
@@ -160,6 +160,7 @@ def _generate_examples(self, split, data_dir):
             dialect_region = wav_path.parents[1].name
             sentence_type = wav_path.name[0:2]
             speaker_id = wav_path.parents[0].name[1:]
+            id_ = wav_path.stem
 
             example = {
                 "file": str(wav_path),
@@ -170,7 +171,7 @@ def _generate_examples(self, split, data_dir):
                 "dialect_region": dialect_region,
                 "sentence_type": sentence_type,
                 "speaker_id": speaker_id,
-                "id": wav_path.stem,
+                "id": id_,
             }
 
-            yield id_, example
+            yield key, example

From 09ec67ad1c1b20683e55106f8d1081d233089e61 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 31 May 2022 12:12:16 +0200
Subject: [PATCH 3/3] Update dataset card

---
 datasets/timit_asr/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/timit_asr/README.md b/datasets/timit_asr/README.md
index 03b9eec559e..eba7cd2b75e 100644
--- a/datasets/timit_asr/README.md
+++ b/datasets/timit_asr/README.md
@@ -170,7 +170,7 @@ A typical data point comprises the path to the audio file, usually called `file`
 
 - speaker_id: Unique id of the speaker. The same speaker id can be found for multiple data samples.
 
-- id: Unique id of the data sample. Contains the <SENTENCE_TYPE><SENTENCE_NUMBER>.  
+- id: ID of the data sample. Contains the <SENTENCE_TYPE><SENTENCE_NUMBER>. Not unique: the same id can be found for multiple data samples.
 
 
 ### Data Splits