Skip to content

Commit 1853a57

Browse files
Unblock NumPy 2.0 (#6991)
* Unblock NumPy 2.0
* .
* Revert tensorflow min version
* Add CI tests for numpy2
* Implement test require_numpy1_on_windows
* Mark tests with require_numpy1_on_windows
* Fix test skip reason
* Add clarifying comment

---------

Co-authored-by: Albert Villanova del Moral <[email protected]>
1 parent 3eeca2c commit 1853a57

File tree

11 files changed

+69
-7
lines changed

11 files changed

+69
-7
lines changed

.github/workflows/ci.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,30 @@ jobs:
9696
- name: Test with pytest
9797
run: |
9898
python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
99+
100+
test_py310_numpy2:
101+
needs: check_code_quality
102+
strategy:
103+
matrix:
104+
test: ['unit']
105+
os: [ubuntu-latest, windows-latest]
106+
deps_versions: [deps-latest]
107+
continue-on-error: false
108+
runs-on: ${{ matrix.os }}
109+
steps:
110+
- uses: actions/checkout@v4
111+
with:
112+
fetch-depth: 0
113+
- name: Set up Python 3.10
114+
uses: actions/setup-python@v5
115+
with:
116+
python-version: "3.10"
117+
- name: Upgrade pip
118+
run: python -m pip install --upgrade pip
119+
- name: Install uv
120+
run: pip install --upgrade uv
121+
- name: Install dependencies
122+
run: uv pip install --system "datasets[tests_numpy2] @ ."
123+
- name: Test with pytest
124+
run: |
125+
python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/

setup.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@
111111
# For file locking
112112
"filelock",
113113
# We use numpy>=1.17 to have np.random.Generator (Dataset shuffling)
114-
"numpy>=1.17,<2.0.0", # Temporary upper version
114+
"numpy>=1.17",
115115
# Backend and serialization.
116116
# Minimum 15.0.0 to be able to cast dictionary types to their underlying types
117117
"pyarrow>=15.0.0",
@@ -166,7 +166,7 @@
166166
"pytest-xdist",
167167
# optional dependencies
168168
"elasticsearch<8.0.0", # 8.0 asks users to provide hosts or cloud_id when instantiating ElasticSearch()
169-
"faiss-cpu>=1.6.4",
169+
"faiss-cpu>=1.8.0.post1", # Pins numpy < 2
170170
"jax>=0.3.14; sys_platform != 'win32'",
171171
"jaxlib>=0.3.14; sys_platform != 'win32'",
172172
"lz4",
@@ -176,11 +176,11 @@
176176
"sqlalchemy",
177177
"s3fs>=2021.11.1", # aligned with fsspec[http]>=2021.11.1; test only on python 3.7 for now
178178
"protobuf<4.0.0", # 4.0.0 breaks compatibility with tensorflow<2.12
179-
"tensorflow>=2.6.0",
179+
"tensorflow>=2.6.0", # Issue installing 2.16.0 with Python 3.8; we rely on other dependencies pinning numpy < 2
180180
"tiktoken",
181181
"torch>=2.0.0",
182182
"soundfile>=0.12.1",
183-
"transformers",
183+
"transformers>=4.42.0", # Pins numpy < 2
184184
"typing-extensions>=4.6.1", # due to conflict between apache-beam and pydantic
185185
"zstandard",
186186
"polars[timezone]>=0.20.0",
@@ -217,6 +217,16 @@
217217
TESTS_REQUIRE.extend(VISION_REQUIRE)
218218
TESTS_REQUIRE.extend(AUDIO_REQUIRE)
219219

220+
NUMPY2_INCOMPATIBLE_LIBRARIES = [
221+
"faiss-cpu",
222+
"librosa",
223+
"tensorflow",
224+
"transformers",
225+
]
226+
TESTS_NUMPY2_REQUIRE = [
227+
library for library in TESTS_REQUIRE if library.partition(">")[0] not in NUMPY2_INCOMPATIBLE_LIBRARIES
228+
]
229+
220230
QUALITY_REQUIRE = ["ruff>=0.3.0"]
221231

222232
DOCS_REQUIRE = [
@@ -242,6 +252,7 @@
242252
"streaming": [], # for backward compatibility
243253
"dev": TESTS_REQUIRE + QUALITY_REQUIRE + DOCS_REQUIRE,
244254
"tests": TESTS_REQUIRE,
255+
"tests_numpy2": TESTS_NUMPY2_REQUIRE,
245256
"metrics-tests": METRICS_TESTS_REQUIRE,
246257
"quality": QUALITY_REQUIRE,
247258
"benchmarks": BENCHMARKS_REQUIRE,

tests/features/test_features.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from datasets.info import DatasetInfo
2626
from datasets.utils.py_utils import asdict
2727

28-
from ..utils import require_jax, require_tf, require_torch
28+
from ..utils import require_jax, require_numpy1_on_windows, require_tf, require_torch
2929

3030

3131
class FeaturesTest(TestCase):
@@ -543,6 +543,7 @@ def test_cast_to_python_objects_pandas_timedelta(self):
543543
casted_obj = cast_to_python_objects(pd.DataFrame({"a": [obj]}))
544544
self.assertDictEqual(casted_obj, {"a": [expected_obj]})
545545

546+
@require_numpy1_on_windows
546547
@require_torch
547548
def test_cast_to_python_objects_torch(self):
548549
import torch

tests/packaged_modules/test_webdataset.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from datasets import Audio, DownloadManager, Features, Image, Sequence, Value
88
from datasets.packaged_modules.webdataset.webdataset import WebDataset
99

10-
from ..utils import require_librosa, require_pil, require_sndfile, require_torch
10+
from ..utils import require_librosa, require_numpy1_on_windows, require_pil, require_sndfile, require_torch
1111

1212

1313
@pytest.fixture
@@ -226,6 +226,7 @@ def test_webdataset_with_features(image_wds_file):
226226
assert isinstance(decoded["jpg"], PIL.Image.Image)
227227

228228

229+
@require_numpy1_on_windows
229230
@require_torch
230231
def test_tensor_webdataset(tensor_wds_file):
231232
import torch

tests/test_arrow_dataset.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
require_dill_gt_0_3_2,
5858
require_jax,
5959
require_not_windows,
60+
require_numpy1_on_windows,
6061
require_pil,
6162
require_polars,
6263
require_pyspark,
@@ -420,6 +421,7 @@ def test_set_format_numpy_multiple_columns(self, in_memory):
420421
self.assertIsInstance(dset[0]["col_2"], np.str_)
421422
self.assertEqual(dset[0]["col_2"].item(), "a")
422423

424+
@require_numpy1_on_windows
423425
@require_torch
424426
def test_set_format_torch(self, in_memory):
425427
import torch
@@ -1525,6 +1527,7 @@ def func_return_multi_row_pd_dataframe(x):
15251527
with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
15261528
self.assertRaises(ValueError, dset.map, func_return_multi_row_pd_dataframe)
15271529

1530+
@require_numpy1_on_windows
15281531
@require_torch
15291532
def test_map_torch(self, in_memory):
15301533
import torch
@@ -1590,6 +1593,7 @@ def func(example):
15901593
)
15911594
self.assertListEqual(dset_test[0]["tensor"], [1, 2, 3])
15921595

1596+
@require_numpy1_on_windows
15931597
@require_torch
15941598
def test_map_tensor_batched(self, in_memory):
15951599
import torch

tests/test_dataset_dict.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from .utils import (
1717
assert_arrow_memory_doesnt_increase,
1818
assert_arrow_memory_increases,
19+
require_numpy1_on_windows,
1920
require_polars,
2021
require_tf,
2122
require_torch,
@@ -109,6 +110,7 @@ def test_set_format_numpy(self):
109110
self.assertEqual(dset_split[0]["col_2"].item(), "a")
110111
del dset
111112

113+
@require_numpy1_on_windows
112114
@require_torch
113115
def test_set_format_torch(self):
114116
import torch

tests/test_fingerprint.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
from .utils import (
2323
require_not_windows,
24+
require_numpy1_on_windows,
2425
require_regex,
2526
require_spacy,
2627
require_tiktoken,
@@ -303,6 +304,7 @@ def test_hash_tiktoken_encoding(self):
303304
self.assertEqual(hash1, hash3)
304305
self.assertNotEqual(hash1, hash2)
305306

307+
@require_numpy1_on_windows
306308
@require_torch
307309
def test_hash_torch_tensor(self):
308310
import torch
@@ -316,6 +318,7 @@ def test_hash_torch_tensor(self):
316318
self.assertEqual(hash1, hash3)
317319
self.assertNotEqual(hash1, hash2)
318320

321+
@require_numpy1_on_windows
319322
@require_torch
320323
def test_hash_torch_generator(self):
321324
import torch

tests/test_formatting.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from .utils import (
2222
require_jax,
2323
require_librosa,
24+
require_numpy1_on_windows,
2425
require_pil,
2526
require_polars,
2627
require_sndfile,
@@ -353,6 +354,7 @@ def test_polars_formatter(self):
353354
assert pl.Series.eq(batch["a"], pl.Series("a", _COL_A)).all()
354355
assert pl.Series.eq(batch["b"], pl.Series("b", _COL_B)).all()
355356

357+
@require_numpy1_on_windows
356358
@require_torch
357359
def test_torch_formatter(self):
358360
import torch
@@ -373,6 +375,7 @@ def test_torch_formatter(self):
373375
torch.testing.assert_close(batch["c"], torch.tensor(_COL_C, dtype=torch.float32))
374376
assert batch["c"].shape == np.array(_COL_C).shape
375377

378+
@require_numpy1_on_windows
376379
@require_torch
377380
def test_torch_formatter_torch_tensor_kwargs(self):
378381
import torch
@@ -389,6 +392,7 @@ def test_torch_formatter_torch_tensor_kwargs(self):
389392
self.assertEqual(batch["a"].dtype, torch.float16)
390393
self.assertEqual(batch["c"].dtype, torch.float16)
391394

395+
@require_numpy1_on_windows
392396
@require_torch
393397
@require_pil
394398
def test_torch_formatter_image(self):
@@ -975,6 +979,7 @@ def test_tf_formatter_sets_default_dtypes(cast_schema, arrow_table):
975979
tf.debugging.assert_equal(batch["col_float"], tf.ragged.constant(list_float, dtype=tf.float32))
976980

977981

982+
@require_numpy1_on_windows
978983
@require_torch
979984
@pytest.mark.parametrize(
980985
"cast_schema",

tests/test_iterable_dataset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
is_rng_equal,
5252
require_dill_gt_0_3_2,
5353
require_not_windows,
54+
require_numpy1_on_windows,
5455
require_pyspark,
5556
require_tf,
5657
require_torch,
@@ -1279,6 +1280,7 @@ def gen(shard_names):
12791280
assert dataset.n_shards == len(shard_names)
12801281

12811282

1283+
@require_numpy1_on_windows
12821284
def test_iterable_dataset_from_file(dataset: IterableDataset, arrow_file: str):
12831285
with assert_arrow_memory_doesnt_increase():
12841286
dataset_from_file = IterableDataset.from_file(arrow_file)

tests/test_py_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
zip_dict,
1919
)
2020

21-
from .utils import require_tf, require_torch
21+
from .utils import require_numpy1_on_windows, require_tf, require_torch
2222

2323

2424
def np_sum(x): # picklable for multiprocessing
@@ -151,6 +151,7 @@ def gen_random_output():
151151
np.testing.assert_equal(out1, out2)
152152
self.assertGreater(np.abs(out1 - out3).sum(), 0)
153153

154+
@require_numpy1_on_windows
154155
@require_torch
155156
def test_torch(self):
156157
import torch

0 commit comments

Comments
 (0)