From 7c9f6fc26710a09f9622939552adf1a1fe4005e4 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 28 Oct 2025 17:07:14 +0100 Subject: [PATCH 01/11] add 3.14 --- .github/conda/meta.yaml | 4 ++-- setup.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/conda/meta.yaml b/.github/conda/meta.yaml index 59a16cda78b..7263b10035a 100644 --- a/.github/conda/meta.yaml +++ b/.github/conda/meta.yaml @@ -25,7 +25,7 @@ requirements: - dataclasses - multiprocess - fsspec - - huggingface_hub >=0.24.0,<1.0.0 + - huggingface_hub >=0.25.0,<2.0.0 - packaging run: - python @@ -41,7 +41,7 @@ requirements: - dataclasses - multiprocess - fsspec - - huggingface_hub >=0.24.0,<1.0.0 + - huggingface_hub >=0.25.0,<2.0.0 - packaging test: diff --git a/setup.py b/setup.py index 06eee6717c8..95235b972f7 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,7 @@ # for fast hashing "xxhash", # for better multiprocessing - "multiprocess<0.70.17", # to align with dill<0.3.9 (see above) + "multiprocess<0.70.19", # to align with dill<0.3.9 (see above) # to save datasets locally or on any filesystem # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143 "fsspec[http]>=2023.1.0,<=2025.9.0", @@ -262,6 +262,9 @@ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], keywords="datasets machine learning datasets", From 4445cac3a390a66b59e1351ad63173525d92d09a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 28 Oct 2025 17:07:44 +0100 Subject: [PATCH 02/11] update ci --- .github/workflows/ci.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml 
b/.github/workflows/ci.yml index 128266b5e48..e40bc458d6f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -82,7 +82,7 @@ jobs: run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ - test_py312: + test_py314: needs: check_code_quality strategy: matrix: @@ -100,10 +100,10 @@ jobs: run: | sudo apt update sudo apt install -y ffmpeg - - name: Set up Python 3.12 + - name: Set up Python 3.14 uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: "3.14" - name: Setup conda env (windows) if: ${{ matrix.os == 'windows-latest' }} uses: conda-incubator/setup-miniconda@v2 @@ -111,7 +111,7 @@ jobs: auto-update-conda: true miniconda-version: "latest" activate-environment: test - python-version: "3.12" + python-version: "3.14" - name: Setup FFmpeg (windows) if: ${{ matrix.os == 'windows-latest' }} run: conda install "ffmpeg=7.0.1" -c conda-forge @@ -127,7 +127,7 @@ jobs: run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ - test_py312_future: + test_py314_future: needs: check_code_quality strategy: matrix: @@ -145,10 +145,10 @@ jobs: run: | sudo apt update sudo apt install -y ffmpeg - - name: Set up Python 3.12 + - name: Set up Python 3.14 uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: "3.14" - name: Setup conda env (windows) if: ${{ matrix.os == 'windows-latest' }} uses: conda-incubator/setup-miniconda@v2 @@ -156,7 +156,7 @@ jobs: auto-update-conda: true miniconda-version: "latest" activate-environment: test - python-version: "3.12" + python-version: "3.14" - name: Setup FFmpeg (windows) if: ${{ matrix.os == 'windows-latest' }} run: conda install "ffmpeg=7.0.1" -c conda-forge From d4c11b5fe3520aa9f1695c1a3488b6b1b03840d9 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 28 Oct 2025 17:22:51 +0100 Subject: [PATCH 03/11] go home tf --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py 
index 95235b972f7..df653db4e72 100644 --- a/setup.py +++ b/setup.py @@ -177,7 +177,7 @@ "sqlalchemy", "protobuf<4.0.0", # 4.0.0 breaks compatibility with tensorflow<2.12 "tensorflow>=2.6.0; python_version<'3.10' and sys_platform != 'win32'", # numpy-2 is not supported for Python < 3.10 - "tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32'", # Pins numpy < 2 + "tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32' and python_version < '3.14'", # Pins numpy < 2 "tiktoken", "torch>=2.8.0", "torchdata", From 0707839c5ed7d15bca30b65eaf63b6d6fa454345 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 28 Oct 2025 17:28:16 +0100 Subject: [PATCH 04/11] torchcodec --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index df653db4e72..8cb243f5d7b 100644 --- a/setup.py +++ b/setup.py @@ -185,7 +185,7 @@ "zstandard", "polars[timezone]>=0.20.0", "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced - "torchcodec>=0.7.0", # minium version to get windows support + "torchcodec>=0.7.0; python_version < '3.14'", # minimum version to get windows support, torchcodec doesn't have wheels for 3.14 yet "nibabel>=5.3.1", ] From b8c47004ca4b9ce98602aa4a5b382841b47e5a3b Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 28 Oct 2025 18:35:52 +0100 Subject: [PATCH 05/11] numba --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8cb243f5d7b..c74f5b772e2 100644 --- a/setup.py +++ b/setup.py @@ -153,7 +153,7 @@ TESTS_REQUIRE = [ # fix pip install issues for windows - "numba>=0.56.4", # to get recent versions of llvmlite for windows ci + "numba>=0.56.4; python_version < '3.14'", # to get recent versions of llvmlite for windows ci, not available on 3.14 # test dependencies "absl-py", "decorator", From 0983da053bbd8cb12423a793bd06943b6ac31cf5 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 30 Oct 2025 17:28:06 +0100 Subject: [PATCH 06/11] fix 
ci --- setup.py | 2 +- tests/features/test_audio.py | 3 +++ tests/test_extract.py | 3 +-- tests/test_fingerprint.py | 5 +++-- tests/test_iterable_dataset.py | 39 +++++++++++++++++++--------------- tests/test_py_utils.py | 3 ++- tests/utils.py | 14 ++++++++++++ 7 files changed, 46 insertions(+), 23 deletions(-) diff --git a/setup.py b/setup.py index c74f5b772e2..8f9edd8f944 100644 --- a/setup.py +++ b/setup.py @@ -158,7 +158,7 @@ "absl-py", "decorator", "joblib<1.3.0", # joblibspark doesn't support recent joblib versions - "joblibspark", + "joblibspark; python_version < '3.14'", # python 3.14 gives AttributeError: module 'ast' has no attribute 'Num' "pytest", "pytest-datadir", "pytest-xdist", diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index aa5b2fcda94..aae59ea53ee 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -713,6 +713,7 @@ def test_dataset_with_audio_feature_loaded_from_cache(): assert isinstance(ds, Dataset) +@require_torchcodec def test_dataset_with_audio_feature_undecoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} @@ -730,6 +731,7 @@ def test_dataset_with_audio_feature_undecoded(shared_datadir): assert column[0] == {"path": audio_path, "bytes": None} +@require_torchcodec def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} @@ -761,6 +763,7 @@ def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir): assert column[0] == {"path": audio_path, "bytes": None} +@require_torchcodec def test_dataset_with_audio_feature_map_undecoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} diff --git a/tests/test_extract.py b/tests/test_extract.py index 186d65fd0ba..489e5efa586 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,5 +1,4 @@ import os -import 
zipfile import pytest @@ -199,5 +198,5 @@ def test_is_zipfile_false_positive(tmpdir): ) with not_a_zip_file.open("wb") as f: f.write(data) - assert zipfile.is_zipfile(str(not_a_zip_file)) # is a false positive for `zipfile` + # zipfile.is_zipfile(str(not_a_zip_file)) could be a false positive for `zipfile` assert not ZipExtractor.is_extractable(not_a_zip_file) # but we're right diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index 0b7a45458bd..e3ca7464b16 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -26,6 +26,7 @@ require_spacy, require_tiktoken, require_torch, + require_torch_compile, require_transformers, ) @@ -347,7 +348,7 @@ def test_hash_spacy_model(self): self.assertNotEqual(hash1, hash2) @require_not_windows - @require_torch + @require_torch_compile def test_hash_torch_compiled_function(self): import torch @@ -360,7 +361,7 @@ def f(x): self.assertEqual(hash1, hash2) @require_not_windows - @require_torch + @require_torch_compile def test_hash_torch_compiled_module(self): m = TorchModule() next(iter(m.parameters())).data.fill_(1.0) diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py index 583f5dab51a..bdfa60fdc01 100644 --- a/tests/test_iterable_dataset.py +++ b/tests/test_iterable_dataset.py @@ -1553,18 +1553,21 @@ def test_iterable_dataset_from_hub_torch_dataloader_parallel(num_workers, tmp_pa assert len(result) == 10 +def gen_with_worker_info(shard): + from torch.utils.data import get_worker_info + + worker_info = get_worker_info() + for i in range(100): + yield {"value": i, "worker_id": worker_info.id} + + @require_torch def test_iterable_dataset_shuffle_with_multiple_workers_different_rng(): # GH 7567 - from torch.utils.data import DataLoader, get_worker_info - - def gen(shard): - worker_info = get_worker_info() - for i in range(100): - yield {"value": i, "worker_id": worker_info.id} + from torch.utils.data import DataLoader num_workers = 20 - ds = IterableDataset.from_generator(gen, 
gen_kwargs={"shard": list(range(num_workers))}) + ds = IterableDataset.from_generator(gen_with_worker_info, gen_kwargs={"shard": list(range(num_workers))}) ds = ds.shuffle(buffer_size=100, seed=1234) dataloader = DataLoader(ds, batch_size=None, num_workers=num_workers) @@ -1575,18 +1578,19 @@ def gen(shard): assert len(set(values)) != 1, "Make sure not all values are identical" +def gen_with_value(shard, value): + for i in range(100): + yield {"value": value} + + @require_torch def test_iterable_dataset_interleave_dataset_with_multiple_workers(): # GH 7567 from torch.utils.data import DataLoader - def gen(shard, value): - for i in range(100): - yield {"value": value} - num_workers = 20 ds = [ - IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i}) + IterableDataset.from_generator(gen_with_value, gen_kwargs={"shard": list(range(num_workers)), "value": i}) for i in range(10) ] ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234) @@ -1598,18 +1602,19 @@ def gen(shard, value): assert len(set(values)) != 1, "Make sure not all values are identical" +def gen_with_id(shard, value): + for i in range(50): + yield {"value": value, "id": i} + + @require_torch def test_iterable_dataset_interleave_dataset_deterministic_across_iterations(): # GH 7567 from torch.utils.data import DataLoader - def gen(shard, value): - for i in range(50): - yield {"value": value, "id": i} - num_workers = 10 ds = [ - IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i}) + IterableDataset.from_generator(gen_with_id, gen_kwargs={"shard": list(range(num_workers)), "value": i}) for i in range(5) ] ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234) diff --git a/tests/test_py_utils.py b/tests/test_py_utils.py index d3e7795bf9d..aad95f74a59 100644 --- a/tests/test_py_utils.py +++ b/tests/test_py_utils.py @@ -1,4 +1,5 @@ import os +import pickle import time from 
dataclasses import dataclass from multiprocessing import Pool @@ -81,7 +82,7 @@ def test_map_nested(self): {k: v.tolist() for k, v in map_nested(int, sn1, map_numpy=True, num_proc=num_proc).items()}, {k: v.tolist() for k, v in expected_map_nested_sn1_int.items()}, ) - with self.assertRaises(AttributeError): # can't pickle a local lambda + with self.assertRaises((AttributeError, pickle.PicklingError)): # can't pickle a local lambda map_nested(lambda x: x + 1, sn1, num_proc=num_proc) def test_zip_dict(self): diff --git a/tests/utils.py b/tests/utils.py index b796641a290..1980cf3e257 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -125,6 +125,20 @@ def require_torch(test_case): return test_case +def require_torch_compile(test_case): + """ + Decorator marking a test that requires PyTorch with torch compile support. + + These tests are skipped when PyTorch isn't installed or when running on Python >= 3.14. + + """ + if not config.TORCH_AVAILABLE: + test_case = unittest.skip("test requires PyTorch")(test_case) + if config.PY_VERSION >= version.parse("3.14"): + test_case = unittest.skip("test requires torch compile which isn't available in python 3.14")(test_case) + return test_case + + def require_polars(test_case): """ Decorator marking a test that requires Polars. 
From 6f6e0ff25b1b1683343966a2bf33376cacc026d7 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 30 Oct 2025 17:43:43 +0100 Subject: [PATCH 07/11] no lz4 in python 3.14 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8f9edd8f944..497ccdee360 100644 --- a/setup.py +++ b/setup.py @@ -169,7 +169,7 @@ "h5py", "jax>=0.3.14; sys_platform != 'win32'", "jaxlib>=0.3.14; sys_platform != 'win32'", - "lz4", + "lz4; python_version < '3.14'", # python 3.14 gives ImportError: cannot import name '_compression' from partially initialized module 'lz4.frame' "moto[server]", "pyspark>=3.4", # https://issues.apache.org/jira/browse/SPARK-40991 fixed in 3.4.0 "py7zr", From d8e5faf675f98c210ebc195c8d73feff2e82a0e3 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 31 Oct 2025 16:27:33 +0100 Subject: [PATCH 08/11] fix tests --- tests/test_streaming_download_manager.py | 25 ++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py index d569637fdad..aec74c09348 100644 --- a/tests/test_streaming_download_manager.py +++ b/tests/test_streaming_download_manager.py @@ -1,5 +1,6 @@ import json import os +from pathlib import Path import pytest @@ -26,10 +27,16 @@ Bulbasaur, grass""" -@pytest.mark.parametrize("urlpath", [r"C:\\foo\bar.txt", "/foo/bar.txt", "https://f.oo/bar.txt"]) -def test_streaming_dl_manager_download_dummy_path(urlpath): +def test_streaming_dl_manager_download_dummy_path(): + path = str(Path().resolve().parents[-1] / "foo" / "bard.txt") dl_manager = StreamingDownloadManager() - assert dl_manager.download(urlpath) == urlpath + assert dl_manager.download(path) == path + + +def test_streaming_dl_manager_download_dummy_url(): + url = "https://f.oo/bar.txt" + dl_manager = StreamingDownloadManager() + assert dl_manager.download(url) == url @pytest.mark.parametrize( @@ -54,10 +61,16 @@ def 
test_streaming_dl_manager_download(text_path): assert f.read() == expected_file.read() -@pytest.mark.parametrize("urlpath", [r"C:\\foo\bar.txt", "/foo/bar.txt", "https://f.oo/bar.txt"]) -def test_streaming_dl_manager_download_and_extract_no_extraction(urlpath): +def test_streaming_dl_manager_download_and_extract_no_extraction_dummy_path(): + path = str(Path().resolve().parents[-1] / "foo" / "bard.txt") + dl_manager = StreamingDownloadManager() + assert dl_manager.download_and_extract(path) == path + + +def test_streaming_dl_manager_download_and_extract_no_extraction_dummy_url(): + url = "https://f.oo/bar.txt" dl_manager = StreamingDownloadManager() - assert dl_manager.download_and_extract(urlpath) == urlpath + assert dl_manager.download_and_extract(url) == url def test_streaming_dl_manager_extract(text_gz_path, text_path): From 645d86b2ffbbc3a0050b25e5745e4d77487f5a03 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 31 Oct 2025 17:58:46 +0100 Subject: [PATCH 09/11] again --- tests/test_streaming_download_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py index aec74c09348..0453614fb69 100644 --- a/tests/test_streaming_download_manager.py +++ b/tests/test_streaming_download_manager.py @@ -28,7 +28,7 @@ def test_streaming_dl_manager_download_dummy_path(): - path = str(Path().resolve().parents[-1] / "foo" / "bard.txt") + path = str(Path(__file__).resolve().parents[-1] / "foo" / "bar.txt") dl_manager = StreamingDownloadManager() assert dl_manager.download(path) == path From ed585f73bb79bde158b4f39fb170af1474d290ce Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 31 Oct 2025 18:08:15 +0100 Subject: [PATCH 10/11] again --- tests/test_streaming_download_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py index 0453614fb69..05cb66f2cc9 100644 
--- a/tests/test_streaming_download_manager.py +++ b/tests/test_streaming_download_manager.py @@ -28,7 +28,7 @@ def test_streaming_dl_manager_download_dummy_path(): - path = str(Path(__file__).resolve().parents[-1] / "foo" / "bar.txt") + path = str(Path(__file__).resolve()) dl_manager = StreamingDownloadManager() assert dl_manager.download(path) == path From bf6377906aebd10d86ed1be2eceaa9565496e704 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 31 Oct 2025 18:16:44 +0100 Subject: [PATCH 11/11] again --- tests/test_streaming_download_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py index 05cb66f2cc9..1fc53502ba6 100644 --- a/tests/test_streaming_download_manager.py +++ b/tests/test_streaming_download_manager.py @@ -62,7 +62,7 @@ def test_streaming_dl_manager_download(text_path): def test_streaming_dl_manager_download_and_extract_no_extraction_dummy_path(): - path = str(Path().resolve().parents[-1] / "foo" / "bard.txt") + path = str(Path(__file__).resolve()) dl_manager = StreamingDownloadManager() assert dl_manager.download_and_extract(path) == path