From e6bf8e0eeb6d916e397fa8ca76f90a561556e998 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sat, 12 Jun 2021 16:16:34 +0200 Subject: [PATCH 1/3] Allow latest pyarrow version --- .circleci/config.yml | 4 ++-- setup.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5864cdbc511..07807e67e45 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,7 +15,7 @@ jobs: - run: source venv/bin/activate - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - - run: pip install pyarrow==3.0.0 + - run: pip install pyarrow --upgrade - run: HF_SCRIPTS_VERSION=master python -m pytest -sv ./tests/ run_dataset_script_tests_pyarrow_1: @@ -46,7 +46,7 @@ jobs: - run: "& venv/Scripts/activate.ps1" - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - - run: pip install pyarrow==3.0.0 + - run: pip install pyarrow --upgrade - run: $env:HF_SCRIPTS_VERSION="master" - run: python -m pytest -sv ./tests/ diff --git a/setup.py b/setup.py index 381802ab469..e17bab8d6ba 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,8 @@ "numpy>=1.17", # Backend and serialization. # Minimum 1.0.0 to avoid permission errors on windows when using the compute layer on memory mapped data - "pyarrow>=1.0.0,<4.0.0", + # pyarrow 4.0.0 introduced segfault bug, see: https://github.com/huggingface/datasets/pull/2268 + "pyarrow>=1.0.0,!=4.0.0", # For smart caching dataset processing "dill", # For performance gains with apache arrow From 8c628fcce900a8fce9fa6d828ddc41ac75d5d22e Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sat, 12 Jun 2021 16:49:30 +0200 Subject: [PATCH 2/3] Fix test only valid for pyarrow < 4 --- tests/test_table.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_table.py b/tests/test_table.py index 4dd8c9b4425..11dab40671c 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -745,8 +745,9 @@ def test_concatenation_table_cast( for k, v in zip(in_memory_pa_table.schema.names, in_memory_pa_table.schema.types) } ) - with pytest.raises(pa.ArrowNotImplementedError): - ConcatenationTable.from_blocks(blocks).cast(schema) + if pa.__version__ < "4": + with pytest.raises(pa.ArrowNotImplementedError): + ConcatenationTable.from_blocks(blocks).cast(schema) schema = pa.schema( { k: v if v != pa.int64() else pa.int32() From 11eaddac5166dbd66c7b9467b515a7c22a5d19ea Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sat, 12 Jun 2021 17:36:54 +0200 Subject: [PATCH 3/3] Add test for pyarrow >= 4 --- tests/test_table.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_table.py b/tests/test_table.py index 11dab40671c..0ed3564d6b0 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -748,6 +748,10 @@ def test_concatenation_table_cast( if pa.__version__ < "4": with pytest.raises(pa.ArrowNotImplementedError): ConcatenationTable.from_blocks(blocks).cast(schema) + else: + table = ConcatenationTable.from_blocks(blocks).cast(schema) + assert table.table == in_memory_pa_table.cast(schema) + assert isinstance(table, ConcatenationTable) schema = pa.schema( { k: v if v != pa.int64() else pa.int32()