From d06b3dc7b7a3233b5eb074a6105253a3de973f9a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 15 Sep 2021 18:48:38 +0200 Subject: [PATCH 1/2] fix multidim arrays in list to arrow conversion --- src/datasets/arrow_dataset.py | 6 +--- src/datasets/arrow_writer.py | 12 ++++++-- src/datasets/features.py | 53 ++++++++++++++++++++++++++++------- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 5361c54664c..07915094229 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -44,7 +44,7 @@ from . import config, utils from .arrow_reader import ArrowReader from .arrow_writer import ArrowWriter, OptimizedTypedSequence -from .features import ClassLabel, Features, Value, cast_to_python_objects +from .features import ClassLabel, Features, Value from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -449,8 +449,6 @@ def from_dict( info.features = features if features is not None: mapping = features.encode_batch(mapping) - else: - mapping = cast_to_python_objects(mapping) mapping = { col: OptimizedTypedSequence(data, type=features.type[col].type if features is not None else None, col=col) for col, data in mapping.items() @@ -2037,7 +2035,6 @@ def init_buffer_and_writer(): if isinstance(example, pa.Table): writer.write_row(example) else: - example = cast_to_python_objects(example) writer.write(example) else: for i in pbar: @@ -2065,7 +2062,6 @@ def init_buffer_and_writer(): if isinstance(batch, pa.Table): writer.write_table(batch) else: - batch = cast_to_python_objects(batch) writer.write_batch(batch) if update_data and writer is not None: writer.finalize() # close_stream=bool(buf_writer is None)) # We only close if we are writing in a file diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index a30689fd14a..74b0ad49d3f 100644 --- a/src/datasets/arrow_writer.py +++ 
b/src/datasets/arrow_writer.py @@ -24,7 +24,13 @@ import pyarrow as pa from . import config, utils -from .features import Features, _ArrayXDExtensionType, numpy_to_pyarrow_listarray +from .features import ( + Features, + _ArrayXDExtensionType, + cast_to_python_objects, + list_of_np_array_to_pyarrow_listarray, + numpy_to_pyarrow_listarray, +) from .info import DatasetInfo from .keyhash import DuplicatedKeysError, KeyHasher from .utils import logging @@ -103,8 +109,10 @@ def __arrow_array__(self, type=None): out = pa.ExtensionArray.from_storage(type, storage) elif isinstance(self.data, np.ndarray): out = numpy_to_pyarrow_listarray(self.data) + elif isinstance(self.data, list) and self.data and isinstance(self.data[0], np.ndarray): + out = list_of_np_array_to_pyarrow_listarray(self.data) else: - out = pa.array(self.data, type=type) + out = pa.array(cast_to_python_objects(self.data, only_1d_for_numpy=True), type=type) if trying_type and out[0].as_py() != self.data[0]: raise TypeError( "Specified try_type alters data. Please check that the type/feature that you provided match the type/features of the data." diff --git a/src/datasets/features.py b/src/datasets/features.py index 8adbe579d43..a3ede8d5140 100644 --- a/src/datasets/features.py +++ b/src/datasets/features.py @@ -144,7 +144,7 @@ def string_to_arrow(datasets_dtype: str) -> pa.DataType: return pa.__dict__[arrow_data_factory_function_name]() -def _cast_to_python_objects(obj: Any) -> Tuple[Any, bool]: +def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool) -> Tuple[Any, bool]: """ Cast pytorch/tensorflow/pandas objects to python numpy array/lists. It works recursively. @@ -155,6 +155,9 @@ def _cast_to_python_objects(obj: Any) -> Tuple[Any, bool]: Args: obj: the object (nested struct) to cast + only_1d_for_numpy (bool): whether to keep the full multi-dim tensors as multi-dim numpy arrays, or convert them to + nested lists of 1-dimensional numpy arrays. 
This can be useful to keep only 1-d arrays to instantiate Arrow arrays. + Indeed Arrow only supports converting 1-dimensional array values. Returns: casted_obj: the casted object @@ -171,13 +174,27 @@ def _cast_to_python_objects(obj: Any) -> Tuple[Any, bool]: import jax.numpy as jnp if isinstance(obj, np.ndarray): - return obj.tolist(), False + if only_1d_for_numpy and obj.ndim == 1: + return obj, False + else: + return [_cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in obj], True elif config.TORCH_AVAILABLE and "torch" in sys.modules and isinstance(obj, torch.Tensor): - return obj.detach().cpu().numpy(), True + if only_1d_for_numpy and obj.ndim == 1: + return obj.detach().cpu().numpy(), True + else: + return [ + _cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in obj.detach().cpu().numpy() + ], True elif config.TF_AVAILABLE and "tensorflow" in sys.modules and isinstance(obj, tf.Tensor): - return obj.numpy(), True + if only_1d_for_numpy and obj.ndim == 1: + return obj.numpy(), True + else: + return [_cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in obj.numpy()], True elif config.JAX_AVAILABLE and "jax" in sys.modules and isinstance(obj, jnp.ndarray): - return np.asarray(obj), True + if only_1d_for_numpy and obj.ndim == 1: + return np.asarray(obj), True + else: + return [_cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in np.asarray(obj)], True elif isinstance(obj, pd.Series): return obj.values.tolist(), True elif isinstance(obj, pd.DataFrame): @@ -186,7 +203,7 @@ def _cast_to_python_objects(obj: Any) -> Tuple[Any, bool]: output = {} has_changed = False for k, v in obj.items(): - casted_v, has_changed_v = _cast_to_python_objects(v) + casted_v, has_changed_v = _cast_to_python_objects(v, only_1d_for_numpy=only_1d_for_numpy) has_changed |= has_changed_v output[k] = casted_v return output if has_changed else obj, has_changed @@ -195,9 +212,11 @@ def
_cast_to_python_objects(obj: Any) -> Tuple[Any, bool]: for first_elmt in obj: if first_elmt is not None: break - casted_first_elmt, has_changed_first_elmt = _cast_to_python_objects(first_elmt) + casted_first_elmt, has_changed_first_elmt = _cast_to_python_objects( + first_elmt, only_1d_for_numpy=only_1d_for_numpy + ) if has_changed_first_elmt: - return [_cast_to_python_objects(elmt)[0] for elmt in obj], True + return [_cast_to_python_objects(elmt, only_1d_for_numpy=only_1d_for_numpy)[0] for elmt in obj], True else: if isinstance(obj, list): return obj, False @@ -209,7 +228,7 @@ def _cast_to_python_objects(obj: Any) -> Tuple[Any, bool]: return obj, False -def cast_to_python_objects(obj: Any) -> Any: +def cast_to_python_objects(obj: Any, only_1d_for_numpy=False) -> Any: """ Cast numpy/pytorch/tensorflow/pandas objects to python lists. It works recursively. @@ -224,7 +243,7 @@ def cast_to_python_objects(obj: Any) -> Any: Returns: casted_obj: the casted object """ - return _cast_to_python_objects(obj)[0] + return _cast_to_python_objects(obj, only_1d_for_numpy=only_1d_for_numpy)[0] @dataclass @@ -963,6 +982,20 @@ def numpy_to_pyarrow_listarray(arr: np.ndarray, type: pa.DataType = None) -> pa. 
return values +def list_of_pa_arrays_to_pyarrow_listarray(l_arr: List[pa.Array]) -> pa.ListArray: + offsets = pa.array(np.cumsum([0] + [len(arr) for arr in l_arr]), type=pa.int32()) + values = pa.concat_arrays(l_arr) + return pa.ListArray.from_arrays(offsets, values) + + +def list_of_np_array_to_pyarrow_listarray(l_arr: List[np.ndarray], type: pa.DataType = None) -> pa.ListArray: + """Build a PyArrow ListArray from a possibly nested list of NumPy arrays""" + if len(l_arr) > 0: + return list_of_pa_arrays_to_pyarrow_listarray([numpy_to_pyarrow_listarray(arr, type=type) for arr in l_arr]) + else: + return pa.array([], type=type) + + class Features(dict): @property def type(self): From 9883322552efd9e7ab71b5ef65da69196a3f503f Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 15 Sep 2021 19:07:13 +0200 Subject: [PATCH 2/2] fix test --- src/datasets/features.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/datasets/features.py b/src/datasets/features.py index a3ede8d5140..d1cc7a68b65 100644 --- a/src/datasets/features.py +++ b/src/datasets/features.py @@ -174,24 +174,24 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool) -> Tuple[Any, boo import jax.numpy as jnp if isinstance(obj, np.ndarray): - if only_1d_for_numpy and obj.ndim == 1: + if not only_1d_for_numpy or obj.ndim == 1: return obj, False else: return [_cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in obj], True elif config.TORCH_AVAILABLE and "torch" in sys.modules and isinstance(obj, torch.Tensor): - if only_1d_for_numpy and obj.ndim == 1: + if not only_1d_for_numpy or obj.ndim == 1: return obj.detach().cpu().numpy(), True else: return [ _cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in obj.detach().cpu().numpy() ], True elif config.TF_AVAILABLE and "tensorflow" in sys.modules and isinstance(obj, tf.Tensor): - if only_1d_for_numpy and obj.ndim == 1: + if not only_1d_for_numpy or obj.ndim == 1: return 
obj.numpy(), True else: return [_cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in obj.numpy()], True elif config.JAX_AVAILABLE and "jax" in sys.modules and isinstance(obj, jnp.ndarray): - if only_1d_for_numpy and obj.ndim == 1: + if not only_1d_for_numpy or obj.ndim == 1: return np.asarray(obj), True else: return [_cast_to_python_objects(x, only_1d_for_numpy=only_1d_for_numpy)[0] for x in np.asarray(obj)], True