Add fn_kwargs param to IterableDataset.map (#4975)

mariosasko · web-flow · commit 418b3bc8e4e8 · 2022-09-13T18:45:33.000+02:00
diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
@@ -334,6 +334,7 @@ def __init__(
         batch_size: int = 1000,
         drop_last_batch: bool = False,
         remove_columns: Optional[List[str]] = None,
+        fn_kwargs: Optional[dict] = None,
     ):
         self.ex_iterable = ex_iterable
         self.function = function
@@ -343,6 +344,7 @@ def __init__(
         self.remove_columns = remove_columns
         self.with_indices = with_indices
         self.input_columns = input_columns
+        self.fn_kwargs = fn_kwargs or {}
 
     def __iter__(self):
         iterator = iter(self.ex_iterable)
@@ -363,7 +365,7 @@ def __iter__(self):
                 if self.with_indices:
                     function_args.append([current_idx + i for i in range(len(key_examples_list))])
                 transformed_batch = dict(batch)  # this will be updated with the function output
-                transformed_batch.update(self.function(*function_args))
+                transformed_batch.update(self.function(*function_args, **self.fn_kwargs))
                 # then remove the unwanted columns
                 if self.remove_columns:
                     for c in self.remove_columns:
@@ -396,7 +398,7 @@ def __iter__(self):
                 if self.with_indices:
                     function_args.append(current_idx)
                 transformed_example = dict(example)  # this will be updated with the function output
-                transformed_example.update(self.function(*function_args))
+                transformed_example.update(self.function(*function_args, **self.fn_kwargs))
                 # then we remove the unwanted columns
                 if self.remove_columns:
                     for c in self.remove_columns:
@@ -414,6 +416,7 @@ def shuffle_data_sources(self, generator: np.random.Generator) -> "MappedExample
             batched=self.batched,
             batch_size=self.batch_size,
             remove_columns=self.remove_columns,
+            fn_kwargs=self.fn_kwargs,
         )
 
     def shard_data_sources(self, shard_idx: int) -> "MappedExamplesIterable":
@@ -426,6 +429,7 @@ def shard_data_sources(self, shard_idx: int) -> "MappedExamplesIterable":
             batched=self.batched,
             batch_size=self.batch_size,
             remove_columns=self.remove_columns,
+            fn_kwargs=self.fn_kwargs,
         )
 
     @property
@@ -759,6 +763,7 @@ def map(
         batch_size: int = 1000,
         drop_last_batch: bool = False,
         remove_columns: Optional[Union[str, List[str]]] = None,
+        fn_kwargs: Optional[dict] = None,
     ) -> "IterableDataset":
         """
         Apply a function to all the examples in the iterable dataset (individually or in batches) and update them.
@@ -797,6 +802,7 @@ def map(
             remove_columns (`Optional[List[str]]`, defaults to `None`): Remove a selection of columns while doing the mapping.
                 Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
                 columns with names in `remove_columns`, these columns will be kept.
+            fn_kwargs (`Optional[Dict]`, defaults to `None`): Keyword arguments to be passed to `function`.
 
         Example:
 
@@ -821,6 +827,8 @@ def map(
             remove_columns = [remove_columns]
         if function is None:
             function = lambda x: x  # noqa: E731
+        if fn_kwargs is None:
+            fn_kwargs = {}
         info = self._info.copy()
         info.features = None
         ex_iterable = MappedExamplesIterable(
@@ -834,6 +842,7 @@ def map(
             batch_size=batch_size,
             drop_last_batch=drop_last_batch,
             remove_columns=remove_columns,
+            fn_kwargs=fn_kwargs,
         )
         return iterable_dataset(
             ex_iterable=ex_iterable,
diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py
@@ -328,6 +328,39 @@ def test_mapped_examples_iterable_remove_columns(n, func, batch_size, remove_col
     assert list(x for _, x in ex_iterable) == expected
 
 
+@pytest.mark.parametrize(
+    "n, func, batch_size, fn_kwargs",
+    [
+        (3, lambda x, y=0: {"id+y": x["id"] + y}, None, None),
+        (3, lambda x, y=0: {"id+y": x["id"] + y}, None, {"y": 3}),
+        (25, lambda x, y=0: {"id+y": [i + y for i in x["id"]]}, 10, {"y": 3}),
+    ],
+)
+def test_mapped_examples_iterable_fn_kwargs(n, func, batch_size, fn_kwargs):
+    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n})
+    ex_iterable = MappedExamplesIterable(
+        base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size, fn_kwargs=fn_kwargs
+    )
+    all_examples = [x for _, x in generate_examples_fn(n=n)]
+    if fn_kwargs is None:
+        fn_kwargs = {}
+    if batch_size is None:
+        expected = [{**x, **func(x, **fn_kwargs)} for x in all_examples]
+    else:
+        # For batched map, the examples must first be formatted as a batch (i.e. a single dictionary) before being passed to the function
+        all_transformed_examples = []
+        for batch_offset in range(0, len(all_examples), batch_size):
+            examples = all_examples[batch_offset : batch_offset + batch_size]
+            batch = _examples_to_batch(examples)
+            transformed_batch = func(batch, **fn_kwargs)
+            all_transformed_examples.extend(_batch_to_examples(transformed_batch))
+        expected = _examples_to_batch(all_examples)
+        expected.update(_examples_to_batch(all_transformed_examples))
+        expected = list(_batch_to_examples(expected))
+    assert next(iter(ex_iterable))[1] == expected[0]
+    assert list(x for _, x in ex_iterable) == expected
+
+
 @pytest.mark.parametrize(
     "n, func, batch_size, input_columns",
     [