From 7d785f15fa5906c844c1ae2525c66c24590e6eb6 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:12:57 +0100 Subject: [PATCH 1/9] Replace pa.PyExtensionType with pa.ExtensionType --- src/datasets/features/features.py | 16 +++++++++------- src/datasets/table.py | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 271fc46c335..39e4a0b8269 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -631,7 +631,7 @@ class Array5D(_ArrayXD): _type: str = field(default="Array5D", init=False, repr=False) -class _ArrayXDExtensionType(pa.PyExtensionType): +class _ArrayXDExtensionType(pa.ExtensionType): ndims: Optional[int] = None def __init__(self, shape: tuple, dtype: str): @@ -645,13 +645,15 @@ def __init__(self, shape: tuple, dtype: str): self.shape = tuple(shape) self.value_type = dtype self.storage_dtype = self._generate_dtype(self.value_type) - pa.PyExtensionType.__init__(self, self.storage_dtype) + pa.ExtensionType.__init__(self, self.storage_dtype, f"{self.__class__.__module__}.{self.__class__.__name__}") - def __reduce__(self): - return self.__class__, ( - self.shape, - self.value_type, - ) + def __arrow_ext_serialize__(self): + return json.dumps((self.shape, self.value_type)).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + args = json.loads(serialized) + return cls(*args) def __hash__(self): return hash((self.__class__, self.shape, self.value_type)) diff --git a/src/datasets/table.py b/src/datasets/table.py index ef3dc2b52f7..25cc05fe934 100644 --- a/src/datasets/table.py +++ b/src/datasets/table.py @@ -1891,7 +1891,7 @@ def _offsets_concat(offsets): def _concat_arrays(arrays): array_type = arrays[0].type - if isinstance(array_type, pa.PyExtensionType): + if isinstance(array_type, pa.ExtensionType): return array_type.wrap_array(_concat_arrays([array.storage for array in arrays])) elif pa.types.is_struct(array_type): return pa.StructArray.from_arrays( From d54b6459f4ed0b2519ddec605dd71956d2d1d3e4 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:13:42 +0100 Subject: [PATCH 2/9] Register user-defined extension types --- src/datasets/features/features.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 39e4a0b8269..2359596fc1d 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -689,6 +689,13 @@ class Array5DExtensionType(_ArrayXDExtensionType): ndims = 5 +# Register the extension types for deserialization +pa.register_extension_type(Array2DExtensionType((1, 2), "int64")) +pa.register_extension_type(Array3DExtensionType((1, 2, 3), "int64")) +pa.register_extension_type(Array4DExtensionType((1, 2, 3, 4), "int64")) +pa.register_extension_type(Array5DExtensionType((1, 2, 3, 4, 5), "int64")) + + def _is_zero_copy_only(pa_type: pa.DataType, unnest: bool = False) -> bool: """ When converting a pyarrow array to a numpy array, we must know whether this could be done in zero-copy or not. From 04a3f006a1a88c894ea10610d66dfddd73ad1490 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:00:08 +0100 Subject: [PATCH 3/9] Pin minimum pyarrow version to 14.0.1 --- .github/workflows/ci.yml | 4 ++-- setup.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee5646c1b13..12be14f71ce 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,9 +62,9 @@ jobs: - name: Install dependencies (latest versions) if: ${{ matrix.deps_versions == 'deps-latest' }} run: pip install --upgrade pyarrow huggingface-hub dill - - name: Install depencencies (minimum versions) + - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} - run: pip install pyarrow==8.0.0 huggingface-hub==0.18.0 transformers dill==0.3.1.1 + run: pip install pyarrow==14.0.1 huggingface-hub==0.18.0 transformers dill==0.3.1.1 - name: Test with pytest run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ diff --git a/setup.py b/setup.py index b1c4712ea5e..7071408bbdc 100644 --- a/setup.py +++ b/setup.py @@ -111,8 +111,8 @@ # We use numpy>=1.17 to have np.random.Generator (Dataset shuffling) "numpy>=1.17", # Backend and serialization. - # Minimum 8.0.0 to be able to use .to_reader() - "pyarrow>=8.0.0", + # Minimum 14.0.1 to fix vulnerability CVE-2023-47248 + "pyarrow>=14.0.1", # For smart caching dataset processing "dill>=0.3.0,<0.3.8", # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19 # For performance gains with apache arrow From 98871b9ba46e89e75e9d0dddc49f4241373c575d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:15:29 +0100 Subject: [PATCH 4/9] Temporarily pin minimum pyarrow due to beam constraint --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7071408bbdc..a1c4e73a014 100644 --- a/setup.py +++ b/setup.py @@ -112,7 +112,7 @@ "numpy>=1.17", # Backend and serialization. # Minimum 14.0.1 to fix vulnerability CVE-2023-47248 - "pyarrow>=14.0.1", + "pyarrow>=9.0.0", # TODO: maximum version allowed by Apache Beam # For smart caching dataset processing "dill>=0.3.0,<0.3.8", # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19 # For performance gains with apache arrow From aecdc94580d105d4b70c94e8e238ce097f97af90 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:31:57 +0100 Subject: [PATCH 5/9] Remove constraint on pyarrow by removing unneeded upper beam version --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a1c4e73a014..4f08cfc0251 100644 --- a/setup.py +++ b/setup.py @@ -112,7 +112,7 @@ "numpy>=1.17", # Backend and serialization. # Minimum 14.0.1 to fix vulnerability CVE-2023-47248 - "pyarrow>=9.0.0", # TODO: maximum version allowed by Apache Beam + "pyarrow>=14.0.1", # For smart caching dataset processing "dill>=0.3.0,<0.3.8", # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19 # For performance gains with apache arrow @@ -163,7 +163,7 @@ "pytest-datadir", "pytest-xdist", # optional dependencies - "apache-beam>=2.26.0,<2.44.0;python_version<'3.10'", # doesn't support recent dill versions for recent python versions + "apache-beam>=2.26.0;python_version<'3.10'", # doesn't support recent dill versions for recent python versions "elasticsearch<8.0.0", # 8.0 asks users to provide hosts or cloud_id when instantiating ElasticSearch() "faiss-cpu>=1.6.4", "jax>=0.3.14; sys_platform != 'win32'", From 998623fa51991320740b945d0853ee20807304d7 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:48:32 +0100 Subject: [PATCH 6/9] Reset pyarrow minimum due to apache-beam constraint --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4f08cfc0251..282e978ac0a 100644 --- a/setup.py +++ b/setup.py @@ -112,7 +112,7 @@ "numpy>=1.17", # Backend and serialization. # Minimum 14.0.1 to fix vulnerability CVE-2023-47248 - "pyarrow>=14.0.1", + "pyarrow>=11.0.0", # TODO: maximum version allowed by apache-beam-2.51.0 # For smart caching dataset processing "dill>=0.3.0,<0.3.8", # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19 # For performance gains with apache arrow From 05200c0a4f8f02c3890ab79a10b44ab0bcf11629 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 13 Nov 2023 12:08:10 +0100 Subject: [PATCH 7/9] Revert last 2 commits --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 282e978ac0a..a1c4e73a014 100644 --- a/setup.py +++ b/setup.py @@ -112,7 +112,7 @@ "numpy>=1.17", # Backend and serialization. # Minimum 14.0.1 to fix vulnerability CVE-2023-47248 - "pyarrow>=11.0.0", # TODO: maximum version allowed by apache-beam-2.51.0 + "pyarrow>=9.0.0", # TODO: maximum version allowed by Apache Beam # For smart caching dataset processing "dill>=0.3.0,<0.3.8", # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19 # For performance gains with apache arrow @@ -163,7 +163,7 @@ "pytest-datadir", "pytest-xdist", # optional dependencies - "apache-beam>=2.26.0;python_version<'3.10'", # doesn't support recent dill versions for recent python versions + "apache-beam>=2.26.0,<2.44.0;python_version<'3.10'", # doesn't support recent dill versions for recent python versions "elasticsearch<8.0.0", # 8.0 asks users to provide hosts or cloud_id when instantiating ElasticSearch() "faiss-cpu>=1.6.4", "jax>=0.3.14; sys_platform != 'win32'", From 980ad4c6e6e33f0129db8745e84de8c298741aa2 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 14 Nov 2023 09:54:14 +0100 Subject: [PATCH 8/9] Revert minimum pyarrow version and use pyarrow-hotfix --- .github/workflows/ci.yml | 2 +- setup.py | 6 ++++-- src/datasets/features/features.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 12be14f71ce..6a3de08ea24 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,7 @@ jobs: run: pip install --upgrade pyarrow huggingface-hub dill - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} - run: pip install pyarrow==14.0.1 huggingface-hub==0.18.0 transformers dill==0.3.1.1 + run: pip install pyarrow==8.0.0 huggingface-hub==0.18.0 transformers dill==0.3.1.1 - name: Test with pytest run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ diff --git a/setup.py b/setup.py index a1c4e73a014..359b862d4fb 100644 --- a/setup.py +++ b/setup.py @@ -111,8 +111,10 @@ # We use numpy>=1.17 to have np.random.Generator (Dataset shuffling) "numpy>=1.17", # Backend and serialization. - # Minimum 14.0.1 to fix vulnerability CVE-2023-47248 - "pyarrow>=9.0.0", # TODO: maximum version allowed by Apache Beam + # Minimum 8.0.0 to be able to use .to_reader() + "pyarrow>=8.0.0", + # As long as we allow pyarrow < 14.0.1, to fix vulnerability CVE-2023-47248 + "pyarrow-hotfix", # For smart caching dataset processing "dill>=0.3.0,<0.3.8", # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19 # For performance gains with apache arrow diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 2359596fc1d..16a91c5c8b2 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -31,6 +31,7 @@ import pyarrow as pa import pyarrow.compute as pc import pyarrow.types +import pyarrow_hotfix # noqa: F401 # to fix vulnerability on pyarrow<14.0.1 from pandas.api.extensions import ExtensionArray as PandasExtensionArray from pandas.api.extensions import ExtensionDtype as PandasExtensionDtype From 45abe297c178b829afcee853f9958b0c5a6469aa Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 14 Nov 2023 10:14:50 +0100 Subject: [PATCH 9/9] Add pa.ExtensionType.__reduce__ --- src/datasets/features/features.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 16a91c5c8b2..5773d26e045 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -656,6 +656,10 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): args = json.loads(serialized) return cls(*args) + # This was added to pa.ExtensionType in pyarrow >= 13.0.0 + def __reduce__(self): + return self.__arrow_ext_deserialize__, (self.storage_type, self.__arrow_ext_serialize__()) + def __hash__(self): return hash((self.__class__, self.shape, self.value_type))