Fix dataset_dict.shuffle with single seed (#1626)

lhoestq · thomwolf · web-flow · commit 6a3037c98787 · 2021-01-04T11:00:02.000+01:00
* fix dataset_dict.shuffle with single seed

* add seed alias

* missing test

* Update src/datasets/dataset_dict.py

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
@@ -420,7 +420,8 @@ def sort(
 
     def shuffle(
         self,
-        seeds: Optional[Dict[str, int]] = None,
+        seeds: Optional[Union[int, Dict[str, int]]] = None,
+        seed: Optional[int] = None,
         generators: Optional[Dict[str, np.random.Generator]] = None,
         keep_in_memory: bool = False,
         load_from_cache_file: bool = True,
@@ -434,10 +435,11 @@ def shuffle(
         You can either supply a NumPy BitGenerator to use, or a seed to initiate NumPy's default random generator (PCG64).
 
         Args:
-            seeds (Optional `Dict[str, int]`): A seed to initialize the default BitGenerator if ``generator=None``.
+            seeds (Optional `Dict[str, int]` or `int`): A seed to initialize the default BitGenerator if ``generator=None``.
                 If None, then fresh, unpredictable entropy will be pulled from the OS.
                 If an int or array_like[ints] is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state.
-                You have to provide one :obj:`seed` per dataset in the dataset dictionary.
+                You can provide one :obj:`seed` per dataset in the dataset dictionary.
+            seed (Optional `int`): A seed to initialize the default BitGenerator if ``generator=None``. Alias for seeds (a ValueError is raised if both seed and seeds are provided).
             generators (Optional `Dict[str, np.random.Generator]`): Numpy random Generator to use to compute the permutation of the dataset rows.
                 If ``generator=None`` (default), uses np.random.default_rng (the default BitGenerator (PCG64) of NumPy).
                 You have to provide one :obj:`generator` per dataset in the dataset dictionary.
@@ -451,8 +453,13 @@ def shuffle(
                 Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`.
         """
         self._check_values_type()
+        if seed is not None and seeds is not None:
+            raise ValueError("Please specify seed or seeds, but not both")
+        seeds = seed if seed is not None else seeds
         if seeds is None:
             seeds = {k: None for k in self}
+        elif not isinstance(seeds, dict):
+            seeds = {k: seeds for k in self}
         if generators is None:
             generators = {k: None for k in self}
         if indices_cache_file_names is None:
diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py
@@ -283,7 +283,17 @@ def test_shuffle(self):
                 seeds=seeds, indices_cache_file_names=indices_cache_file_names_3, load_from_cache_file=False
             )
             self.assertNotEqual(dsets_shuffled_3["train"]["filename"], dsets_shuffled_3["test"]["filename"])
+
+            # other input types
+            dsets_shuffled_int = dsets.shuffle(42)
+            dsets_shuffled_alias = dsets.shuffle(seed=42)
+            dsets_shuffled_none = dsets.shuffle()
+            self.assertEqual(len(dsets_shuffled_int["train"]), 30)
+            self.assertEqual(len(dsets_shuffled_alias["train"]), 30)
+            self.assertEqual(len(dsets_shuffled_none["train"]), 30)
+
             del dsets, dsets_shuffled, dsets_shuffled_2, dsets_shuffled_3
+            del dsets_shuffled_int, dsets_shuffled_alias, dsets_shuffled_none
 
     def test_check_values_type(self):
         dsets = self._create_dummy_dataset_dict()