From 9274ff657b58bd72068e4a901dda7739f8324e8e Mon Sep 17 00:00:00 2001
From: Petru Radulescu
Date: Wed, 24 Sep 2025 11:18:59 +0300
Subject: [PATCH 1/8] Sample without replacement option

---
 src/datasets/combine.py          |  9 ++++++++-
 src/datasets/iterable_dataset.py | 30 ++++++++++++++++++++++--------
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/src/datasets/combine.py b/src/datasets/combine.py
index 3b253e240c5..787d36561e8 100644
--- a/src/datasets/combine.py
+++ b/src/datasets/combine.py
@@ -22,6 +22,7 @@ def interleave_datasets(
     info: Optional[DatasetInfo] = None,
     split: Optional[NamedSplit] = None,
     stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
+    sample_with_replacement: bool = True,
 ) -> DatasetType:
     """
     Interleave several datasets (sources) into a single dataset.
@@ -151,7 +152,13 @@ def interleave_datasets(
         )
     else:
         return _interleave_iterable_datasets(
-            datasets, probabilities, seed, info=info, split=split, stopping_strategy=stopping_strategy
+            datasets,
+            probabilities,
+            seed,
+            info=info,
+            split=split,
+            stopping_strategy=stopping_strategy,
+            sample_with_replacement=sample_with_replacement,
         )
 
 
diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
index d5e8c6e0a91..0575c19d2d7 100644
--- a/src/datasets/iterable_dataset.py
+++ b/src/datasets/iterable_dataset.py
@@ -675,10 +675,12 @@ def __init__(
         self,
         ex_iterables: list[_BaseExamplesIterable],
         stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
+        sample_with_replacement: bool = True,
     ):
         super().__init__()
         self.ex_iterables = ex_iterables
         self.stopping_strategy = stopping_strategy
+        self.sample_with_replacement = sample_with_replacement
 
         # if undersampling ("first_exhausted"), we stop as soon as one dataset is exhausted
         # if oversampling ("all_exhausted"), we stop as soons as every dataset is exhausted, i.e as soon as every samples of every dataset has been visited at least once
@@ -735,6 +737,9 @@ def _iter_arrow(self):
                 # if the stopping criteria is met, break the main for loop
                 if self.bool_strategy_func(is_exhausted):
                     break
+                # Skip exhausted iterators
+                if is_exhausted[i] and not self.sample_with_replacement:
+                    continue
                 # let's pick one example from the iterator at index i
                 if nexts[i] is None:
                     nexts[i] = next(iterators[i], False)
@@ -749,11 +754,12 @@ def _iter_arrow(self):
                         if self._state_dict:
                             self._state_dict["is_exhausted"][i] = True
                         # we reset it in case the stopping crtieria isn't met yet
-                        nexts[i] = None
-                        if self._state_dict:
-                            self._state_dict["ex_iterables"][i] = self.ex_iterables[i]._init_state_dict()
-                            self._state_dict["previous_states"][i] = None
-                        iterators[i] = self.ex_iterables[i].iter_arrow()
+                        if self.sample_with_replacement:
+                            nexts[i] = None
+                            if self._state_dict:
+                                self._state_dict["ex_iterables"][i] = self.ex_iterables[i]._init_state_dict()
+                                self._state_dict["previous_states"][i] = None
+                            iterators[i] = self.ex_iterables[i]._iter_arrow()
 
                 if result is not False:
                     yield result
@@ -989,8 +995,9 @@ def __init__(
         generator: np.random.Generator,
         probabilities: Optional[list[float]] = None,
         stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
+        sample_with_replacement: bool = True,
     ):
-        super().__init__(ex_iterables, stopping_strategy)
+        super().__init__(ex_iterables, stopping_strategy, sample_with_replacement)
         self.generator = deepcopy(generator)
         self.probabilities = probabilities
         # TODO(QL): implement iter_arrow
@@ -4491,6 +4498,7 @@ def _interleave_iterable_datasets(
     info: Optional[DatasetInfo] = None,
     split: Optional[NamedSplit] = None,
     stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
+    sample_with_replacement: bool = True,
 ) -> IterableDataset:
     """
     Interleave several iterable datasets (sources) into a single iterable dataset.
@@ -4532,11 +4540,17 @@ def _interleave_iterable_datasets(
         ex_iterables = [RebatchedArrowExamplesIterable(ex_iterable, batch_size=1) for ex_iterable in ex_iterables]
     # Use cycling or random cycling of sources
     if probabilities is None:
-        ex_iterable = CyclingMultiSourcesExamplesIterable(ex_iterables, stopping_strategy=stopping_strategy)
+        ex_iterable = CyclingMultiSourcesExamplesIterable(
+            ex_iterables, stopping_strategy=stopping_strategy, sample_with_replacement=sample_with_replacement
+        )
     else:
         generator = np.random.default_rng(seed)
         ex_iterable = RandomlyCyclingMultiSourcesExamplesIterable(
-            ex_iterables, generator=generator, probabilities=probabilities, stopping_strategy=stopping_strategy
+            ex_iterables,
+            generator=generator,
+            probabilities=probabilities,
+            stopping_strategy=stopping_strategy,
+            sample_with_replacement=sample_with_replacement,
         )
     # Set new info - we update the features
     # setting the features also ensures to fill missing columns with None

From a88541d90df99957812910108029ff2b0000aa59 Mon Sep 17 00:00:00 2001
From: Petru Radulescu
Date: Wed, 24 Sep 2025 12:08:49 +0300
Subject: [PATCH 2/8] Exit early for non arrow iterable.

---
 src/datasets/iterable_dataset.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
index 0575c19d2d7..18f26400939 100644
--- a/src/datasets/iterable_dataset.py
+++ b/src/datasets/iterable_dataset.py
@@ -784,6 +787,8 @@ def __iter__(self):
             if self.bool_strategy_func(is_exhausted):
                 break
             # let's pick one example from the iterator at index i
+            if is_exhausted[i] and not self.sample_with_replacement:
+                continue
             if nexts[i] is None:
                 nexts[i] = next(iterators[i], False)
             result = nexts[i]
@@ -797,12 +799,12 @@ def __iter__(self):
                 if self._state_dict:
                     self._state_dict["is_exhausted"][i] = True
                 # we reset it in case the stopping crtieria isn't met yet
-                nexts[i] = None
-                if self._state_dict:
-                    self._state_dict["ex_iterables"][i] = self.ex_iterables[i]._init_state_dict()
-                    self._state_dict["previous_states"][i] = None
-                iterators[i] = iter(self.ex_iterables[i])
-
+                if self.sample_with_replacement:
+                    nexts[i] = None
+                    if self._state_dict:
+                        self._state_dict["ex_iterables"][i] = self.ex_iterables[i]._init_state_dict()
+                        self._state_dict["previous_states"][i] = None
+                    iterators[i] = iter(self.ex_iterables[i])
             if result is not False:
                 yield result
 

From dd6f9fb83d5b49385ce8bca2ddd52f3f6ad62280 Mon Sep 17 00:00:00 2001
From: Petru Radulescu
Date: Wed, 24 Sep 2025 12:55:35 +0300
Subject: [PATCH 3/8] Add new stopping strategy

---
 src/datasets/iterable_dataset.py | 41 +++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
index 18f26400939..d195b57ff69 100644
--- a/src/datasets/iterable_dataset.py
+++ b/src/datasets/iterable_dataset.py
@@ -674,17 +674,20 @@ class CyclingMultiSourcesExamplesIterable(_BaseExamplesIterable):
     def __init__(
         self,
         ex_iterables: list[_BaseExamplesIterable],
-        stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
-        sample_with_replacement: bool = True,
+        stopping_strategy: Literal[
+            "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
+        ] = "first_exhausted",
     ):
         super().__init__()
         self.ex_iterables = ex_iterables
         self.stopping_strategy = stopping_strategy
-        self.sample_with_replacement = sample_with_replacement
 
         # if undersampling ("first_exhausted"), we stop as soon as one dataset is exhausted
         # if oversampling ("all_exhausted"), we stop as soons as every dataset is exhausted, i.e as soon as every samples of every dataset has been visited at least once
-        self.bool_strategy_func = np.all if (stopping_strategy == "all_exhausted") else np.any
+        # if sampling without replacement ("all_exhausted_without_replacement"), we stop once all samples of every dataset have been visited exactly once.
+        self.bool_strategy_func = (
+            np.all if (stopping_strategy in ("all_exhausted", "all_exhausted_without_replacement")) else np.any
+        )
 
     @property
     def is_typed(self):
@@ -737,8 +740,8 @@ def _iter_arrow(self):
                 # if the stopping criteria is met, break the main for loop
                 if self.bool_strategy_func(is_exhausted):
                     break
-                # Skip exhausted iterators
-                if is_exhausted[i] and not self.sample_with_replacement:
+                # Skip exhausted iterators if we sample without replacement
+                if is_exhausted[i] and self.stopping_strategy in ["all_exhausted_without_replacement"]:
                     continue
                 # let's pick one example from the iterator at index i
                 if nexts[i] is None:
@@ -753,8 +756,8 @@ def _iter_arrow(self):
                         is_exhausted[i] = True
                         if self._state_dict:
                             self._state_dict["is_exhausted"][i] = True
-                        # we reset it in case the stopping crtieria isn't met yet
-                        if self.sample_with_replacement:
+                        # we reset it in case the stopping criteria isn't met yet and we sample with replacement
+                        if self.stopping_strategy not in ["all_exhausted_without_replacement"]:
                             nexts[i] = None
                             if self._state_dict:
                                 self._state_dict["ex_iterables"][i] = self.ex_iterables[i]._init_state_dict()
@@ -788,7 +791,7 @@ def __iter__(self):
             if self.bool_strategy_func(is_exhausted):
                 break
             # let's pick one example from the iterator at index i
-            if is_exhausted[i] and not self.sample_with_replacement:
+            if is_exhausted[i] and self.stopping_strategy in ["all_exhausted_without_replacement"]:
                 continue
             if nexts[i] is None:
                 nexts[i] = next(iterators[i], False)
@@ -802,7 +805,7 @@ def __iter__(self):
                 if self._state_dict:
                     self._state_dict["is_exhausted"][i] = True
                 # we reset it in case the stopping crtieria isn't met yet
-                if self.sample_with_replacement:
+                if self.stopping_strategy not in ["all_exhausted_without_replacement"]:
                     nexts[i] = None
                     if self._state_dict:
                         self._state_dict["ex_iterables"][i] = self.ex_iterables[i]._init_state_dict()
@@ -996,13 +999,13 @@ def __init__(
         ex_iterables: list[_BaseExamplesIterable],
         generator: np.random.Generator,
         probabilities: Optional[list[float]] = None,
-        stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
-        sample_with_replacement: bool = True,
+        stopping_strategy: Literal[
+            "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
+        ] = "first_exhausted",
     ):
-        super().__init__(ex_iterables, stopping_strategy, sample_with_replacement)
+        super().__init__(ex_iterables, stopping_strategy)
         self.generator = deepcopy(generator)
         self.probabilities = probabilities
-        # TODO(QL): implement iter_arrow
 
     @property
     def is_typed(self):
@@ -4500,8 +4503,9 @@ def _interleave_iterable_datasets(
     seed: Optional[int] = None,
     info: Optional[DatasetInfo] = None,
     split: Optional[NamedSplit] = None,
-    stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
-    sample_with_replacement: bool = True,
+    stopping_strategy: Literal[
+        "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
+    ] = "first_exhausted",
 ) -> IterableDataset:
     """
     Interleave several iterable datasets (sources) into a single iterable dataset.
@@ -4542,17 +4546,14 @@ def _interleave_iterable_datasets(
         ex_iterables = [RebatchedArrowExamplesIterable(ex_iterable, batch_size=1) for ex_iterable in ex_iterables]
     # Use cycling or random cycling of sources
    if probabilities is None:
-        ex_iterable = CyclingMultiSourcesExamplesIterable(
-            ex_iterables, stopping_strategy=stopping_strategy, sample_with_replacement=sample_with_replacement
-        )
+        ex_iterable = CyclingMultiSourcesExamplesIterable(ex_iterables, stopping_strategy=stopping_strategy)
     else:
         generator = np.random.default_rng(seed)
         ex_iterable = RandomlyCyclingMultiSourcesExamplesIterable(
             ex_iterables,
             generator=generator,
             probabilities=probabilities,
             stopping_strategy=stopping_strategy,
-            sample_with_replacement=sample_with_replacement,
         )
     # Set new info - we update the features
     # setting the features also ensures to fill missing columns with None

From 29832bb847e2ed4555bf6845ed9f4b10800fa671 Mon Sep 17 00:00:00 2001
From: Petru Radulescu
Date: Wed, 24 Sep 2025 12:56:11 +0300
Subject: [PATCH 4/8] Remove sample_with_replacement argument

---
 src/datasets/combine.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/datasets/combine.py b/src/datasets/combine.py
index 787d36561e8..91a6457c02c 100644
--- a/src/datasets/combine.py
+++ b/src/datasets/combine.py
@@ -21,8 +21,9 @@ def interleave_datasets(
     seed: Optional[int] = None,
     info: Optional[DatasetInfo] = None,
     split: Optional[NamedSplit] = None,
-    stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
-    sample_with_replacement: bool = True,
+    stopping_strategy: Literal[
+        "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
+    ] = "first_exhausted",
 ) -> DatasetType:
     """
     Interleave several datasets (sources) into a single dataset.
@@ -56,9 +57,10 @@ def interleave_datasets(
             Name of the dataset split.
 
         stopping_strategy (`str`, defaults to `first_exhausted`):
-            Two strategies are proposed right now, `first_exhausted` and `all_exhausted`.
+            Three strategies are proposed right now, `first_exhausted`, `all_exhausted` and `all_exhausted_without_replacement`.
             By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
             If the strategy is `all_exhausted`, we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
+            When the strategy is `all_exhausted_without_replacement`, we make sure that each sample in each dataset is sampled only once.
             Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
             - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.
             - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.
@@ -144,7 +146,7 @@ def interleave_datasets(
         raise ValueError(
             f"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
         )
-    if stopping_strategy not in ["first_exhausted", "all_exhausted"]:
+    if stopping_strategy not in ["first_exhausted", "all_exhausted", "all_exhausted_without_replacement"]:
        raise ValueError(f"{stopping_strategy} is not supported. Please enter a valid stopping_strategy.")
     if dataset_type is Dataset:
         return _interleave_map_style_datasets(
@@ -158,7 +160,6 @@ def interleave_datasets(
             info=info,
             split=split,
             stopping_strategy=stopping_strategy,
-            sample_with_replacement=sample_with_replacement,
         )
 
 

From a547d81469128bea4acc3bcc2a4a6a95968936ee Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Wed, 1 Oct 2025 16:34:47 +0200
Subject: [PATCH 5/8] fix CyclingMultiSourcesExamplesIterable.shard_data_sources

---
 src/datasets/iterable_dataset.py | 60 ++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 11 deletions(-)

diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
index d195b57ff69..04016682b61 100644
--- a/src/datasets/iterable_dataset.py
+++ b/src/datasets/iterable_dataset.py
@@ -818,16 +818,33 @@ def shuffle_data_sources(self, generator: np.random.Generator) -> "CyclingMultiS
 
     @property
     def num_shards(self) -> int:
-        return min(ex_iterable.num_shards for ex_iterable in self.ex_iterables)
+        return min(ex_iterable.num_shards for ex_iterable in self.ex_iterables) if self.ex_iterables else 0
 
     def shard_data_sources(
         self, num_shards: int, index: int, contiguous=True
     ) -> "CyclingMultiSourcesExamplesIterable":
         """Either keep only the requested shard, or propagate the request to the underlying iterable."""
-        return CyclingMultiSourcesExamplesIterable(
-            [iterable.shard_data_sources(num_shards, index, contiguous=contiguous) for iterable in self.ex_iterables],
-            stopping_strategy=self.stopping_strategy,
-        )
+        if num_shards < self.num_shards:
+            return CyclingMultiSourcesExamplesIterable(
+                [
+                    iterable.shard_data_sources(num_shards, index, contiguous=contiguous)
+                    for iterable in self.ex_iterables
+                ],
+                stopping_strategy=self.stopping_strategy,
+            )
+        elif index < self.num_shards:
+            return CyclingMultiSourcesExamplesIterable(
+                [
+                    iterable.shard_data_sources(self.num_shards, index, contiguous=contiguous)
+                    for iterable in self.ex_iterables
+                ],
+                stopping_strategy=self.stopping_strategy,
+            )
+        else:
+            return CyclingMultiSourcesExamplesIterable(
+                [],
+                stopping_strategy=self.stopping_strategy,
+            )
 
 
 class VerticallyConcatenatedMultiSourcesExamplesIterable(_BaseExamplesIterable):
@@ -1069,12 +1086,33 @@ def shard_data_sources(
         self, num_shards: int, index: int, contiguous=True
     ) -> "RandomlyCyclingMultiSourcesExamplesIterable":
         """Either keep only the requested shard, or propagate the request to the underlying iterable."""
-        return RandomlyCyclingMultiSourcesExamplesIterable(
-            [iterable.shard_data_sources(num_shards, index, contiguous=contiguous) for iterable in self.ex_iterables],
-            self.generator,
-            self.probabilities,
-            self.stopping_strategy,
-        )
+        if num_shards < self.num_shards:
+            return RandomlyCyclingMultiSourcesExamplesIterable(
+                [
+                    iterable.shard_data_sources(num_shards, index, contiguous=contiguous)
+                    for iterable in self.ex_iterables
+                ],
+                self.generator,
+                self.probabilities,
+                self.stopping_strategy,
+            )
+        elif index < self.num_shards:
+            return RandomlyCyclingMultiSourcesExamplesIterable(
+                [
+                    iterable.shard_data_sources(self.num_shards, index, contiguous=contiguous)
+                    for iterable in self.ex_iterables
+                ],
+                self.generator,
+                self.probabilities,
+                self.stopping_strategy,
+            )
+        else:
+            return RandomlyCyclingMultiSourcesExamplesIterable(
+                [],
+                self.generator,
+                self.probabilities,
+                self.stopping_strategy,
+            )
 
 
 def _table_output_to_arrow(output) -> pa.Table:

From 455bfaaa6d574aa9d9c9592baee390017512cc5f Mon Sep 17 00:00:00 2001
From: Petru Radulescu
Date: Fri, 3 Oct 2025 11:52:41 +0300
Subject: [PATCH 6/8] Add sampling without replacement logic for map style
 datasets.

---
 src/datasets/arrow_dataset.py | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 36740e458b7..680ef60e002 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -6569,7 +6569,9 @@ def _interleave_map_style_datasets(
     seed: Optional[int] = None,
     info: Optional[DatasetInfo] = None,
     split: Optional[NamedSplit] = None,
-    stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
+    stopping_strategy: Literal[
+        "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
+    ] = "first_exhausted",
     **kwargs,
 ) -> "Dataset":
     """
@@ -6589,6 +6591,7 @@ def _interleave_map_style_datasets(
        stopping_strategy (`str`, defaults to `first_exhausted`):
             Two strategies are proposed right now. By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
             If the strategy is `all_exhausted`, we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
+            When the strategy is `all_exhausted_without_replacement`, we make sure that each sample in each dataset is sampled only once.
             Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
             - with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples.
             - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.
@@ -6597,7 +6600,7 @@ def _interleave_map_style_datasets(
     Output:
         :class:`datasets.Dataset`
     """
-    if stopping_strategy not in ["first_exhausted", "all_exhausted"]:
+    if stopping_strategy not in ["first_exhausted", "all_exhausted", "all_exhausted_without_replacement"]:
         raise ValueError(
             f"{stopping_strategy} stopping strategy in `interleave_datasets` is not implemented yet with a list of {type(datasets[0])}"
         )
@@ -6640,7 +6643,9 @@ def _interleave_map_style_datasets(
 
     # if undersampling ("first_exhausted"), we stop as soon as one dataset is exhausted
     # if oversampling ("all_exhausted"), we stop as soons as every dataset is exhausted, i.e as soon as every samples of every dataset has been visited at least once
-    bool_strategy_func = np.all if oversampling else np.any
+    bool_strategy_func = (
+        np.all if (oversampling or stopping_strategy == "all_exhausted_without_replacement") else np.any
+    )
 
     def iter_random_indices():
         """Get an infinite iterator that randomly samples the index of the source to pick examples from."""
@@ -6658,13 +6663,17 @@ def iter_random_indices():
        if bool_strategy_func(is_exhausted):
            break
 
        # let's add the example at the current index of the `source_idx`-th dataset
-        indices.append(current_index[source_idx] + offsets[source_idx])
-        current_index[source_idx] += 1
+        # When sampling without replacement, we additionally need to make sure the current source is not exhausted, so that we don't oversample it.
+        if stopping_strategy != "all_exhausted_without_replacement" or not is_exhausted[source_idx]:
+            indices.append(current_index[source_idx] + offsets[source_idx])
+            current_index[source_idx] += 1
 
        # we've ran out of examples for the current dataset, let's update our boolean array and bring the current_index back to 0
        if current_index[source_idx] >= lengths[source_idx]:
            is_exhausted[source_idx] = True
-            current_index[source_idx] = 0
+            # We don't want to reset the iterator when the stopping strategy is without replacement.
+            if stopping_strategy != "all_exhausted_without_replacement":
+                current_index[source_idx] = 0
 
     return concatenated_datasets.select(indices, **kwargs)

From de59d60faffc8d998e92fbc0274dcb5b5fffe193 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Date: Tue, 7 Oct 2025 16:49:32 +0200
Subject: [PATCH 7/8] Update process.mdx

---
 docs/source/process.mdx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/process.mdx b/docs/source/process.mdx
index 652fc7b4a4d..70ddfb28b5f 100644
--- a/docs/source/process.mdx
+++ b/docs/source/process.mdx
@@ -657,6 +657,7 @@ In this case, the new dataset is constructed by getting examples one by one from
 You can also specify the `stopping_strategy`. The default strategy, `first_exhausted`, is a subsampling strategy, i.e the dataset construction is stopped as soon one of the dataset runs out of samples.
 You can specify `stopping_strategy=all_exhausted` to execute an oversampling strategy. In this case, the dataset construction is stopped as soon as every samples in every dataset has been added at least once. In practice, it means that if a dataset is exhausted, it will return to the beginning of this dataset until the stop criterion has been reached. Note that if no sampling probabilities are specified, the new dataset will have `max_length_datasets*nb_dataset samples`.
+There is also `stopping_strategy=all_exhausted_without_replacement` to ensure that every sample is seen exactly once.
 
 ```py
 >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})

From cd80f18e9f6a96640649ebca6a9ffa5b008e2f94 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Date: Tue, 7 Oct 2025 16:49:53 +0200
Subject: [PATCH 8/8] Update stream.mdx

---
 docs/source/stream.mdx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/stream.mdx b/docs/source/stream.mdx
index 8375e0057a8..67f1ff420cd 100644
--- a/docs/source/stream.mdx
+++ b/docs/source/stream.mdx
@@ -197,6 +197,7 @@ Around 80% of the final dataset is made of the `es_dataset`, and 20% of the `fr_
 You can also specify the `stopping_strategy`. The default strategy, `first_exhausted`, is a subsampling strategy, i.e the dataset construction is stopped as soon one of the dataset runs out of samples.
 You can specify `stopping_strategy=all_exhausted` to execute an oversampling strategy. In this case, the dataset construction is stopped as soon as every samples in every dataset has been added at least once. In practice, it means that if a dataset is exhausted, it will return to the beginning of this dataset until the stop criterion has been reached. Note that if no sampling probabilities are specified, the new dataset will have `max_length_datasets*nb_dataset samples`.
+There is also `stopping_strategy=all_exhausted_without_replacement` to ensure that every sample is seen exactly once.
 
 ## Rename, remove, and cast
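
For reference, a minimal usage sketch of the behavior this series adds (the sketch itself is not part of the patches; the toy `d1`/`d2` datasets mirror the `interleave_datasets` examples in `docs/source/process.mdx`). With `stopping_strategy="all_exhausted_without_replacement"`, an exhausted source is skipped instead of being restarted, so every sample of every dataset ends up in the result exactly once:

```py
>>> from datasets import Dataset, interleave_datasets
>>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
>>> d2 = Dataset.from_dict({"a": [10, 11, 12, 13, 14]})
>>> ds = interleave_datasets(
...     [d1, d2],
...     probabilities=[0.7, 0.3],
...     seed=42,
...     stopping_strategy="all_exhausted_without_replacement",
... )
>>> sorted(ds["a"])  # each sample appears exactly once, whatever order was drawn
[0, 1, 2, 10, 11, 12, 13, 14]
```

The same call should work for `IterableDataset` objects (e.g. after `d1.to_iterable_dataset()`), where it goes through `RandomlyCyclingMultiSourcesExamplesIterable` and skips exhausted sources rather than re-opening them.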