Skip to content

Commit 65fafbe

Browse files
committed
Add explanatory comments and type annotations; fix `token_per_repo_id` not being propagated through `TypedExamplesIterable` / `_apply_feature_types`
1 parent 453089f commit 65fafbe

File tree

4 files changed

+80
-37
lines changed

4 files changed

+80
-37
lines changed

src/datasets/features/audio.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,9 @@ def encode_example(self, value: Union[str, dict]) -> dict:
101101
f"An audio sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
102102
)
103103

104-
def decode_example(self, value: dict, token_per_repo_id=None) -> dict:
104+
def decode_example(
105+
self, value: dict, token_per_repo_id: Optional[Dict[str, Union[str, bool, None]]] = None
106+
) -> dict:
105107
"""Decode example audio file into audio data.
106108
107109
Args:
@@ -211,7 +213,9 @@ def path_to_bytes(path):
211213
storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null())
212214
return array_cast(storage, self.pa_type)
213215

214-
def _decode_non_mp3_path_like(self, path, format=None, token_per_repo_id=None):
216+
def _decode_non_mp3_path_like(
217+
self, path, format=None, token_per_repo_id: Optional[Dict[str, Union[str, bool, None]]] = None
218+
):
215219
try:
216220
import librosa
217221
except ImportError as err:

src/datasets/features/features.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1222,7 +1222,7 @@ def encode_nested_example(schema, obj, level=0):
12221222
return obj
12231223

12241224

1225-
def decode_nested_example(schema, obj, token_per_repo_id=None):
1225+
def decode_nested_example(schema, obj, token_per_repo_id: Optional[Dict[str, Union[str, bool, None]]] = None):
12261226
"""Decode a nested example.
12271227
This is used since some features (in particular Audio and Image) have some logic during decoding.
12281228
@@ -1613,7 +1613,7 @@ def encode_batch(self, batch):
16131613
encoded_batch[key] = [encode_nested_example(self[key], obj) for obj in column]
16141614
return encoded_batch
16151615

1616-
def decode_example(self, example: dict, token_per_repo_id=None):
1616+
def decode_example(self, example: dict, token_per_repo_id: Optional[Dict[str, Union[str, bool, None]]] = None):
16171617
"""Decode example with custom feature decoding.
16181618
16191619
Args:

src/datasets/formatting/dataset_wrappers/torch_iterable_dataset.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import fsspec
22
import torch
33

4-
from ...iterable_dataset import IterableDataset
4+
from ...iterable_dataset import IterableDataset, _apply_feature_types
55
from ...utils.logging import get_logger
66

77

@@ -46,7 +46,12 @@ def __iter__(self):
4646
)
4747
for shard_idx in shards_indices:
4848
for key, example in self._iter_shard(shard_idx):
49-
yield self._apply_feature_types(example)
49+
if self.features:
50+
yield _apply_feature_types(
51+
example, self.features, token_per_repo_id=self._token_per_repo_id
52+
)
53+
else:
54+
yield example
5055
logger.debug(
5156
f"dataloader worker#{worker_info.id}, ': Finished iterating over {len(shards_indices)}/{self.n_shards} shards."
5257
)

src/datasets/iterable_dataset.py

Lines changed: 65 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,17 @@ def shard_data_sources(self, shard_idx: int) -> "CyclingMultiSourcesExamplesIter
173173

174174

175175
class VerticallyConcatenatedMultiSourcesExamplesIterable(_BaseExamplesIterable):
176+
"""
177+
VerticallyConcatenatedMultiSourcesExamplesIterable simply chains the input iterables.
178+
It doesn't require the examples iterables to always yield the same columns.
179+
Instead, this is handled by the `IterableDataset` class or `TypedExamplesIterable`.
180+
181+
For information, `IterableDataset` merges the features of all the datasets to concatenate into one.
182+
We use `IterableDataset._resolve_features` to obtain the features of all the datasets to concatenate.
183+
184+
Then for each example, `IterableDataset` and `TypedExamplesIterable` automatically fill missing columns with None.
185+
This is done with `_apply_feature_types`.
186+
"""
176187
def __init__(self, ex_iterables: List[_BaseExamplesIterable]):
177188
self.ex_iterables = ex_iterables
178189

@@ -210,6 +221,20 @@ def _check_column_names(column_names: List[str]):
210221

211222

212223
class HorizontallyConcatenatedMultiSourcesExamplesIterable(_BaseExamplesIterable):
224+
"""
225+
HorizontallyConcatenatedMultiSourcesExamplesIterable merges examples together for the input list of iterables.
226+
It also checks that there are no duplicate columns (otherwise we don't know which one to keep).
227+
This check is done once when yielding the first example.
228+
229+
However it doesn't fill missing columns with None.
230+
Instead, this is handled by the `IterableDataset` class or `TypedExamplesIterable`.
231+
232+
For information, `IterableDataset` merges the features of all the datasets to concatenate into one.
233+
We use `IterableDataset._resolve_features` to obtain the features of all the datasets to concatenate.
234+
235+
Then for each example, `IterableDataset` and `TypedExamplesIterable` automatically fill missing columns with None.
236+
This is done with `_apply_feature_types`.
237+
"""
213238
def __init__(self, ex_iterables: List[_BaseExamplesIterable]):
214239
self.ex_iterables = ex_iterables
215240

@@ -565,36 +590,52 @@ def n_shards(self) -> int:
565590
return self.ex_iterable.n_shards
566591

567592

593+
def _apply_feature_types(
594+
example: dict, features: Features, token_per_repo_id: Dict[str, Union[str, bool, None]]
595+
) -> dict:
596+
example = dict(example)
597+
# add missing columns
598+
for column_name in features:
599+
if column_name not in example:
600+
example[column_name] = None
601+
# we encode the example for ClassLabel feature types for example
602+
encoded_example = features.encode_example(example)
603+
# Decode example for Audio feature, e.g.
604+
decoded_example = features.decode_example(encoded_example, token_per_repo_id=token_per_repo_id)
605+
return decoded_example
606+
607+
568608
class TypedExamplesIterable(_BaseExamplesIterable):
569-
def __init__(self, ex_iterable: _BaseExamplesIterable, features: Features):
609+
def __init__(
610+
self,
611+
ex_iterable: _BaseExamplesIterable,
612+
features: Features,
613+
token_per_repo_id: Dict[str, Union[str, bool, None]],
614+
):
570615
self.ex_iterable = ex_iterable
571616
self.features = features
617+
self.token_per_repo_id = token_per_repo_id
572618

573619
def __iter__(self):
620+
# Then for each example, `TypedExamplesIterable` automatically fills missing columns with None.
621+
# This is done with `_apply_feature_types`.
574622
for key, example in self.ex_iterable:
575-
example = dict(example)
576-
# add missing columns
577-
for column_name in self.features:
578-
if column_name not in example:
579-
example[column_name] = None
580-
# we encode the example for ClassLabel feature types for example
581-
encoded_example = self.features.encode_example(example)
582-
# Decode example for Audio feature, e.g.
583-
decoded_example = self.features.decode_example(encoded_example)
584-
yield key, decoded_example
623+
yield key, _apply_feature_types(example, self.features, token_per_repo_id=self.token_per_repo_id)
585624

586625
def shuffle_data_sources(self, generator: np.random.Generator) -> "TypedExamplesIterable":
587626
"""Shuffle the wrapped examples iterable."""
588627
return TypedExamplesIterable(
589628
self.ex_iterable.shuffle_data_sources(generator),
590629
features=self.features,
630+
token_per_repo_id=self.token_per_repo_id,
591631
)
592632

593633
def shard_data_sources(self, shard_idx: int) -> "TypedExamplesIterable":
594634
"""Keep only the requested shard."""
595635
return TypedExamplesIterable(
596636
self.ex_iterable.shard_data_sources(shard_idx),
597637
features=self.features,
638+
token_per_repo_id=self.token_per_repo_id,
598639
)
599640

600641
@property
@@ -637,7 +678,7 @@ def __init__(
637678
self._format_type = format_type
638679
self._shuffling = shuffling
639680
self._epoch = 0
640-
self._token_per_repo_id = token_per_repo_id or {}
681+
self._token_per_repo_id: Dict[str, Union[str, bool, None]] = token_per_repo_id or {}
641682

642683
def _head(self, n=5):
643684
return _examples_to_batch([x for key, x in islice(self._iter(), n)])
@@ -671,24 +712,14 @@ def _iter_shard(self, shard_idx: int):
671712
ex_iterable = self._ex_iterable
672713
yield from ex_iterable.shard_data_sources(shard_idx)
673714

674-
def _apply_feature_types(self, example):
675-
if self.features:
676-
example = dict(example)
677-
# add missing columns
678-
for column_name in self.features:
679-
if column_name not in example:
680-
example[column_name] = None
681-
# we encode the example for ClassLabel feature types for example
682-
encoded_example = self.features.encode_example(example)
683-
# Decode example for Audio feature, e.g.
684-
decoded_example = self.features.decode_example(encoded_example, token_per_repo_id=self._token_per_repo_id)
685-
return decoded_example
686-
else:
687-
return example
688-
689715
def __iter__(self):
690716
for key, example in self._iter():
691-
yield self._apply_feature_types(example)
717+
if self.features:
718+
# `IterableDataset` automatically fills missing columns with None.
719+
# This is done with `_apply_feature_types`.
720+
yield _apply_feature_types(example, self.features, token_per_repo_id=self._token_per_repo_id)
721+
else:
722+
yield example
692723

693724
def with_format(
694725
self,
@@ -790,7 +821,7 @@ def map(
790821
info = self._info.copy()
791822
info.features = None
792823
ex_iterable = MappedExamplesIterable(
793-
TypedExamplesIterable(self._ex_iterable, self._info.features)
824+
TypedExamplesIterable(self._ex_iterable, self._info.features, token_per_repo_id=self._token_per_repo_id)
794825
if self._info.features is not None
795826
else self._ex_iterable,
796827
function=function,
@@ -859,7 +890,7 @@ def filter(
859890

860891
# We need the examples to be decoded for certain feature types like Image or Audio, so we use TypedExamplesIterable here
861892
ex_iterable = FilteredExamplesIterable(
862-
TypedExamplesIterable(self._ex_iterable, self._info.features)
893+
TypedExamplesIterable(self._ex_iterable, self._info.features, token_per_repo_id=self._token_per_repo_id)
863894
if self._info.features is not None
864895
else self._ex_iterable,
865896
function=function,
@@ -1325,6 +1356,7 @@ def _concatenate_iterable_datasets(
13251356
else:
13261357
ex_iterable = HorizontallyConcatenatedMultiSourcesExamplesIterable(ex_iterables)
13271358
# Set new info - we update the features
1359+
# setting the features also ensures to fill missing columns with None
13281360
if info is None:
13291361
info = DatasetInfo.from_merge([d.info for d in dsets])
13301362
else:
@@ -1358,8 +1390,9 @@ def _interleave_iterable_datasets(
13581390
Output:
13591391
:class:`datasets.IterableDataset`
13601392
"""
1393+
# TODO(QL): merge the features as in _concatenate_iterable_datasets() and don't use TypedExamplesIterable
13611394
ex_iterables = [
1362-
TypedExamplesIterable(d._ex_iterable, d.features)
1395+
TypedExamplesIterable(d._ex_iterable, d.features, token_per_repo_id=d._token_per_repo_id)
13631396
if not isinstance(d._ex_iterable, TypedExamplesIterable) and d.features is not None
13641397
else d._ex_iterable
13651398
for d in datasets
@@ -1373,6 +1406,7 @@ def _interleave_iterable_datasets(
13731406
ex_iterables, generator=generator, probabilities=probabilities
13741407
)
13751408
# Set new info - we reset the features
1409+
# TODO(QL): merge the features as in _concatenate_iterable_datasets() and use them here
13761410
if info is None:
13771411
info = DatasetInfo.from_merge([d.info for d in datasets])
13781412
info.features = None

0 commit comments

Comments (0)