
Commit ab6946d

mariosasko and lhoestq authored
Fix embed_storage on features inside lists/sequences (#4615)
* Dedicated function for embedding data into table
* Add test
* minor
* minor 2
* add test

Co-authored-by: Quentin Lhoest <[email protected]>
1 parent 77cce3c commit ab6946d
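For context, the user-facing scenario this commit fixes, as a minimal sketch: a dataset whose column type is a list of images can now be pushed with the file content embedded. The repo name and image paths below are placeholders.

from datasets import Dataset, Features, Image, Value

# A column typed as a *list* of images -- embed_storage previously was not
# applied to features nested like this. "img1.jpg"/"img2.jpg" are placeholders.
data = {"x": [["img1.jpg"], ["img1.jpg", "img2.jpg"]], "y": [0, -1]}
features = Features({"x": [Image()], "y": Value("int32")})
ds = Dataset.from_dict(data, features=features)

# With embed_external_files=True (the default), the image bytes are embedded
# into the uploaded Arrow shards instead of being referenced by local path.
ds.push_to_hub("username/my-image-dataset", embed_external_files=True)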

File tree: 5 files changed, +191 -26 lines

src/datasets/arrow_dataset.py
src/datasets/features/features.py
src/datasets/table.py
tests/test_table.py
tests/test_upstream_hub.py


src/datasets/arrow_dataset.py

Lines changed: 16 additions & 23 deletions
@@ -84,8 +84,8 @@
     InMemoryTable,
     MemoryMappedTable,
     Table,
-    cast_table_to_features,
     concat_tables,
+    embed_table_storage,
     list_table_cache_files,
     table_cast,
     table_visitor,
@@ -95,7 +95,7 @@
 from .utils._hf_hub_fixes import create_repo
 from .utils.file_utils import _retry, cached_path, estimate_dataset_size, hf_hub_url
 from .utils.info_utils import is_small_dataset
-from .utils.py_utils import convert_file_size_to_int, temporary_assignment, unique_values
+from .utils.py_utils import convert_file_size_to_int, unique_values
 from .utils.stratify import stratified_shuffle_split_generate_indices
 from .utils.tf_utils import minimal_tf_collate_fn
 from .utils.typing import PathLike
@@ -4150,26 +4150,17 @@ def extra_nbytes_visitor(array, feature):
         if decodable_columns:
 
             def shards_with_embedded_external_files(shards):
-                # Temporarily assign the modified version of `cast_storage` before the cast to the decodable
-                # feature types to delete path information and embed file content in the arrow file.
-                with contextlib.ExitStack() as stack:
-                    for decodable_feature_type in [Audio, Image]:
-                        stack.enter_context(
-                            temporary_assignment(
-                                decodable_feature_type, "cast_storage", decodable_feature_type.embed_storage
-                            )
-                        )
-                    for shard in shards:
-                        format = shard.format
-                        shard = shard.with_format("arrow")
-                        shard = shard.map(
-                            partial(cast_table_to_features, features=shard.features),
-                            batched=True,
-                            batch_size=1000,
-                            keep_in_memory=True,
-                        )
-                        shard = shard.with_format(**format)
-                        yield shard
+                for shard in shards:
+                    format = shard.format
+                    shard = shard.with_format("arrow")
+                    shard = shard.map(
+                        embed_table_storage,
+                        batched=True,
+                        batch_size=1000,
+                        keep_in_memory=True,
+                    )
+                    shard = shard.with_format(**format)
+                    yield shard
 
             shards = shards_with_embedded_external_files(shards)
 
@@ -4224,7 +4215,9 @@ def path_in_repo(_index, shard):
                 for data_file in data_files
                 if data_file.startswith(f"data/{split}-") and data_file not in shards_path_in_repo
             ]
-            deleted_size = sum(xgetsize(hf_hub_url(repo_id, data_file)) for data_file in data_files_to_delete)
+            deleted_size = sum(
+                xgetsize(hf_hub_url(repo_id, data_file), use_auth_token=token) for data_file in data_files_to_delete
+            )
 
             def delete_file(file):
                 api.delete_file(file, repo_id=repo_id, token=token, repo_type="dataset", revision=branch)
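The refactor above replaces the temporary monkey-patching of cast_storage with a single map over Arrow-formatted shards. The same operation can be applied to any dataset directly; a minimal sketch, assuming "img1.jpg" is a real image file (the path is a placeholder):

from datasets import Dataset, Features, Image
from datasets.table import embed_table_storage

ds = Dataset.from_dict({"image": ["img1.jpg"]}, features=Features({"image": Image()}))

# In "arrow" format, map feeds pyarrow.Table batches to the function, and
# embed_table_storage returns tables with file bytes embedded in place of paths.
ds = ds.with_format("arrow")
ds = ds.map(embed_table_storage, batched=True, batch_size=1000, keep_in_memory=True)
ds = ds.with_format(None)  # restore the default python format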

src/datasets/features/features.py

Lines changed: 18 additions & 0 deletions
@@ -1438,6 +1438,24 @@ def require_storage_cast(feature: FeatureType) -> bool:
     return hasattr(feature, "cast_storage")
 
 
+def require_storage_embed(feature: FeatureType) -> bool:
+    """Check if a (possibly nested) feature requires embedding data into storage.
+
+    Args:
+        feature (FeatureType): the feature type to be checked
+    Returns:
+        :obj:`bool`
+    """
+    if isinstance(feature, dict):
+        return any(require_storage_cast(f) for f in feature.values())
+    elif isinstance(feature, (list, tuple)):
+        return require_storage_cast(feature[0])
+    elif isinstance(feature, Sequence):
+        return require_storage_cast(feature.feature)
+    else:
+        return hasattr(feature, "embed_storage")
+
+
 def keep_features_dicts_synced(func):
     """
     Wrapper to keep the secondary dictionary, which tracks whether keys are decodable, of the :class:`datasets.Features` object
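Note that the nested branches delegate to require_storage_cast rather than recursing on require_storage_embed; this is equivalent for features such as Image and Audio, which define both cast_storage and embed_storage. A quick sketch of what the new helper reports:

from datasets import Sequence, Value
from datasets.features.features import Image, require_storage_embed

assert require_storage_embed(Image())             # Image defines embed_storage
assert require_storage_embed([Image()])           # image nested in a plain list
assert require_storage_embed(Sequence(Image()))   # image nested in a Sequence
assert not require_storage_embed(Value("int32"))  # no embed_storage to apply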

src/datasets/table.py

Lines changed: 102 additions & 2 deletions
@@ -1779,7 +1779,7 @@ def cast_array_to_feature(array: pa.Array, feature: "FeatureType", allow_number_
         = if casting from numbers to strings and allow_number_to_str is False
 
     Returns:
-        array (:obj:`pyarrow.Array`): the casted array
+        array (:obj:`pyarrow.Array`): the casted array
     """
     from .features.features import Sequence, get_nested_type
 
@@ -1850,8 +1850,89 @@ def cast_array_to_feature(array: pa.Array, feature: "FeatureType", allow_number_
     raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")
 
 
+@_wrap_for_chunked_arrays
+def embed_array_storage(array: pa.Array, feature: "FeatureType"):
+    """Embed data into an arrays's storage.
+    For custom features like Audio or Image, it takes into account the "embed_storage" methods
+    they defined to enable embedding external data (e.g. an image file) into an other arrow types.
+
+    Args:
+        array (pa.Array): the PyArrow array in which to embed data
+        feature (FeatureType): array features
+
+    Raises:
+        TypeError: if the target type is not supported according, e.g.
+
+            - if a field is missing
+
+    Returns:
+        array (:obj:`pyarrow.Array`): the casted array
+    """
+    from .features import Sequence
+
+    _e = embed_array_storage
+
+    if isinstance(array, pa.ExtensionArray):
+        array = array.storage
+    if hasattr(feature, "embed_storage"):
+        return feature.embed_storage(array)
+    elif pa.types.is_struct(array.type):
+        # feature must be a dict or Sequence(subfeatures_dict)
+        if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
+            feature = {
+                name: Sequence(subfeature, length=feature.length) for name, subfeature in feature.feature.items()
+            }
+        if isinstance(feature, dict):
+            arrays = [_e(array.field(name), subfeature) for name, subfeature in feature.items()]
+            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
+    elif pa.types.is_list(array.type):
+        # feature must be either [subfeature] or Sequence(subfeature)
+        if isinstance(feature, list):
+            if array.null_count > 0:
+                warnings.warn(
+                    f"None values are converted to empty lists when embedding array storage with {feature}. More info: https://github.com/huggingface/datasets/issues/3676. This will raise an error in a future major version of `datasets`"
+                )
+            return pa.ListArray.from_arrays(array.offsets, _e(array.values, feature[0]))
+        elif isinstance(feature, Sequence):
+            if feature.length > -1:
+                if feature.length * len(array) == len(array.values):
+                    return pa.FixedSizeListArray.from_arrays(_e(array.values, feature.feature), feature.length)
+            else:
+                casted_values = _e(array.values, feature.feature)
+                if casted_values.type == array.values.type:
+                    return array
+                else:
+                    if array.null_count > 0:
+                        warnings.warn(
+                            f"None values are converted to empty lists when embedding array storage with {feature}. More info: https://github.com/huggingface/datasets/issues/3676. This will raise an error in a future major version of `datasets`"
+                        )
+                    return pa.ListArray.from_arrays(array.offsets, _e(array.values, feature.feature))
+    elif pa.types.is_fixed_size_list(array.type):
+        # feature must be either [subfeature] or Sequence(subfeature)
+        if isinstance(feature, list):
+            if array.null_count > 0:
+                warnings.warn(
+                    f"None values are converted to empty lists when embedding array storage with {feature}. More info: https://github.com/huggingface/datasets/issues/3676. This will raise an error in a future major version of `datasets`"
+                )
+            return pa.ListArray.from_arrays(array.offsets, _e(array.values, feature[0]))
+        elif isinstance(feature, Sequence):
+            if feature.length > -1:
+                if feature.length * len(array) == len(array.values):
+                    return pa.FixedSizeListArray.from_arrays(_e(array.values, feature.feature), feature.length)
+            else:
+                offsets_arr = pa.array(range(len(array) + 1), pa.int32())
+                if array.null_count > 0:
+                    warnings.warn(
+                        f"None values are converted to empty lists when embedding array storage with {feature}. More info: https://github.com/huggingface/datasets/issues/3676. This will raise an error in a future major version of `datasets`"
+                    )
+                return pa.ListArray.from_arrays(offsets_arr, _e(array.values, feature.feature))
+    if not isinstance(feature, (Sequence, dict, list, tuple)):
+        return array
+    raise TypeError(f"Couldn't embed array of type\n{array.type}\nwith\n{feature}")
+
+
 def cast_table_to_features(table: pa.Table, features: "Features"):
-    """Cast an table to the arrow schema that corresponds to the requested features.
+    """Cast a table to the arrow schema that corresponds to the requested features.
 
     Args:
         table (:obj:`pyarrow.Table`): PyArrow table to cast
@@ -1885,6 +1966,25 @@ def cast_table_to_schema(table: pa.Table, schema: pa.Schema):
     return pa.Table.from_arrays(arrays, schema=schema)
 
 
+def embed_table_storage(table: pa.Table):
+    """Embed external data into a table's storage.
+
+    Args:
+        table (:obj:`pyarrow.Table`): PyArrow table in which to embed data
+
+    Returns:
+        table (:obj:`pyarrow.Table`): the table with embedded data
+    """
+    from .features.features import Features, require_storage_embed
+
+    features = Features.from_arrow_schema(table.schema)
+    arrays = [
+        embed_array_storage(table[name], feature) if require_storage_embed(feature) else table[name]
+        for name, feature in features.items()
+    ]
+    return pa.Table.from_arrays(arrays, schema=features.arrow_schema)
+
+
 def table_cast(table: pa.Table, schema: pa.Schema):
     """Improved version of pa.Table.cast.
 
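Together these helpers make embedding work directly on PyArrow data, including the nested list case this commit fixes. A minimal sketch, mirroring the new tests; "img.jpg" is a placeholder for an image file that exists on disk:

import pyarrow as pa
from datasets.features.features import Image
from datasets.table import embed_array_storage

# A list-of-images array: the nested case that previously had no embedding path.
arr = pa.array([[{"bytes": None, "path": "img.jpg"}]], type=pa.list_(Image.pa_type))
embedded = embed_array_storage(arr, [Image()])

first = embedded.to_pylist()[0][0]
assert first["path"] is None and isinstance(first["bytes"], bytes)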

tests/test_table.py

Lines changed: 30 additions & 1 deletion
@@ -7,7 +7,7 @@
 import pytest
 
 from datasets import Sequence, Value
-from datasets.features.features import ClassLabel, Features
+from datasets.features.features import ClassLabel, Features, Image
 from datasets.table import (
     ConcatenationTable,
     InMemoryTable,
@@ -20,7 +20,10 @@
     _memory_mapped_arrow_table_from_file,
     cast_array_to_feature,
     concat_tables,
+    embed_array_storage,
+    embed_table_storage,
     inject_arrow_table_documentation,
+    table_cast,
 )
 
 from .utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases, slow
@@ -1045,3 +1048,29 @@ def test_cast_array_to_features_to_null_type():
     arr = pa.array([[None, 1]])
     with pytest.raises(TypeError):
         cast_array_to_feature(arr, Sequence(Value("null")))
+
+
+def test_embed_array_storage(image_file):
+    array = pa.array([{"bytes": None, "path": image_file}], type=Image.pa_type)
+    embedded_images_array = embed_array_storage(array, Image())
+    assert embedded_images_array.to_pylist()[0]["path"] is None
+    assert isinstance(embedded_images_array.to_pylist()[0]["bytes"], bytes)
+
+
+def test_embed_array_storage_nested(image_file):
+    array = pa.array([[{"bytes": None, "path": image_file}]], type=pa.list_(Image.pa_type))
+    embedded_images_array = embed_array_storage(array, [Image()])
+    assert embedded_images_array.to_pylist()[0][0]["path"] is None
+    assert isinstance(embedded_images_array.to_pylist()[0][0]["bytes"], bytes)
+    array = pa.array([{"foo": {"bytes": None, "path": image_file}}], type=pa.struct({"foo": Image.pa_type}))
+    embedded_images_array = embed_array_storage(array, {"foo": Image()})
+    assert embedded_images_array.to_pylist()[0]["foo"]["path"] is None
+    assert isinstance(embedded_images_array.to_pylist()[0]["foo"]["bytes"], bytes)
+
+
+def test_embed_table_storage(image_file):
+    features = Features({"image": Image()})
+    table = table_cast(pa.table({"image": [image_file]}), features.arrow_schema)
+    embedded_images_table = embed_table_storage(table)
+    assert embedded_images_table.to_pydict()["image"][0]["path"] is None
+    assert isinstance(embedded_images_table.to_pydict()["image"][0]["bytes"], bytes)
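The image_file argument in these tests is a pytest fixture (presumably defined in the repo's test conftest) that resolves to a real image on disk. The same table-level flow works outside pytest; a sketch with a placeholder path:

import pyarrow as pa
from datasets import Features
from datasets.features.features import Image
from datasets.table import embed_table_storage, table_cast

features = Features({"image": Image()})
# table_cast turns plain path strings into the Image storage type
# ({"bytes": None, "path": ...}); "img.jpg" is a placeholder.
table = table_cast(pa.table({"image": ["img.jpg"]}), features.arrow_schema)
embedded = embed_table_storage(table)
assert isinstance(embedded.to_pydict()["image"][0]["bytes"], bytes)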

tests/test_upstream_hub.py

Lines changed: 25 additions & 0 deletions
@@ -417,6 +417,31 @@ def test_push_dataset_to_hub_custom_features_image(self):
         finally:
             self.cleanup_repo(ds_name)
 
+    @require_pil
+    def test_push_dataset_to_hub_custom_features_image_list(self):
+        image_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_image_rgb.jpg")
+        data = {"x": [[image_path], [image_path, image_path]], "y": [0, -1]}
+        features = Features({"x": [Image()], "y": Value("int32")})
+        ds = Dataset.from_dict(data, features=features)
+
+        for embed_external_files in [True, False]:
+            ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
+            try:
+                ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
+                hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")
+
+                self.assertListEqual(ds.column_names, hub_ds.column_names)
+                self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
+                self.assertDictEqual(ds.features, hub_ds.features)
+                self.assertEqual(ds[:], hub_ds[:])
+                hub_ds = hub_ds.cast_column("x", [Image(decode=False)])
+                elem = hub_ds[0]["x"][0]
+                path, bytes_ = elem["path"], elem["bytes"]
+                self.assertTrue(bool(path) == (not embed_external_files))
+                self.assertTrue(bool(bytes_) == embed_external_files)
+            finally:
+                self.cleanup_repo(ds_name)
+
     def test_push_dataset_dict_to_hub_custom_features(self):
         features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
         ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)
