huggingface · lhoestq · Oct 31, 2025 · Oct 28, 2025 · Oct 28, 2025 · Oct 28, 2025
diff --git a/.github/conda/meta.yaml b/.github/conda/meta.yaml
@@ -25,7 +25,7 @@ requirements:
     - dataclasses
     - multiprocess
     - fsspec
-    - huggingface_hub >=0.24.0,<1.0.0
+    - huggingface_hub >=0.25.0,<2.0.0
     - packaging
   run:
     - python
@@ -41,7 +41,7 @@ requirements:
     - dataclasses
     - multiprocess
     - fsspec
-    - huggingface_hub >=0.24.0,<1.0.0
+    - huggingface_hub >=0.25.0,<2.0.0
     - packaging
 
 test:

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -82,7 +82,7 @@ jobs:
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
 
-  test_py312:
+  test_py314:
     needs: check_code_quality
     strategy:
       matrix:
@@ -100,18 +100,18 @@ jobs:
         run: |
           sudo apt update
           sudo apt install -y ffmpeg
-      - name: Set up Python 3.12
+      - name: Set up Python 3.14
         uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          python-version: "3.14"
       - name: Setup conda env (windows)
         if: ${{ matrix.os == 'windows-latest' }}
         uses: conda-incubator/setup-miniconda@v2
         with:
           auto-update-conda: true
           miniconda-version: "latest"
           activate-environment: test
-          python-version: "3.12"
+          python-version: "3.14"
       - name: Setup FFmpeg (windows)
         if: ${{ matrix.os == 'windows-latest' }}
         run: conda install "ffmpeg=7.0.1" -c conda-forge
@@ -127,7 +127,7 @@ jobs:
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
 
-  test_py312_future:
+  test_py314_future:
     needs: check_code_quality
     strategy:
       matrix:
@@ -145,18 +145,18 @@ jobs:
         run: |
           sudo apt update
           sudo apt install -y ffmpeg 
-      - name: Set up Python 3.12
+      - name: Set up Python 3.14
         uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          python-version: "3.14"
       - name: Setup conda env (windows)
         if: ${{ matrix.os == 'windows-latest' }}
         uses: conda-incubator/setup-miniconda@v2
         with:
           auto-update-conda: true
           miniconda-version: "latest"
           activate-environment: test
-          python-version: "3.12"
+          python-version: "3.14"
       - name: Setup FFmpeg (windows)
         if: ${{ matrix.os == 'windows-latest' }}
         run: conda install "ffmpeg=7.0.1" -c conda-forge

diff --git a/setup.py b/setup.py
@@ -124,7 +124,7 @@
     # for fast hashing
     "xxhash",
     # for better multiprocessing
-    "multiprocess<0.70.17",  # to align with dill<0.3.9 (see above)
+    "multiprocess<0.70.19",  # to align with dill<0.3.9 (see above)
     # to save datasets locally or on any filesystem
     # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
     "fsspec[http]>=2023.1.0,<=2025.9.0",
@@ -153,12 +153,12 @@
 
 TESTS_REQUIRE = [
     # fix pip install issues for windows
-    "numba>=0.56.4",  # to get recent versions of llvmlite for windows ci
+    "numba>=0.56.4; python_version < '3.14'",  # to get recent versions of llvmlite for windows ci, not available on 3.14
     # test dependencies
     "absl-py",
     "decorator",
     "joblib<1.3.0",  # joblibspark doesn't support recent joblib versions
-    "joblibspark",
+    "joblibspark; python_version < '3.14'",  # python 3.14 gives AttributeError: module 'ast' has no attribute 'Num'
     "pytest",
     "pytest-datadir",
     "pytest-xdist",
@@ -169,23 +169,23 @@
     "h5py",
     "jax>=0.3.14; sys_platform != 'win32'",
     "jaxlib>=0.3.14; sys_platform != 'win32'",
-    "lz4",
+    "lz4; python_version < '3.14'",  # python 3.14 gives ImportError: cannot import name '_compression' from partially initialized module 'lz4.frame'
     "moto[server]",
     "pyspark>=3.4",  # https://issues.apache.org/jira/browse/SPARK-40991 fixed in 3.4.0
     "py7zr",
     "rarfile>=4.0",
     "sqlalchemy",
     "protobuf<4.0.0",  # 4.0.0 breaks compatibility with tensorflow<2.12
     "tensorflow>=2.6.0; python_version<'3.10' and sys_platform != 'win32'",  # numpy-2 is not supported for Python < 3.10
-    "tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32'",  # Pins numpy < 2
+    "tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32' and python_version < '3.14'",  # Pins numpy < 2
     "tiktoken",
     "torch>=2.8.0",
     "torchdata",
     "transformers>=4.42.0",  # Pins numpy < 2
     "zstandard",
     "polars[timezone]>=0.20.0",
     "Pillow>=9.4.0",  # When PIL.Image.ExifTags was introduced
-    "torchcodec>=0.7.0",  # minium version to get windows support
+    "torchcodec>=0.7.0; python_version < '3.14'",  # minimum version to get windows support, torchcodec doesn't have wheels for 3.14 yet
     "nibabel>=5.3.1",
 ]
 
@@ -262,6 +262,9 @@
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
+        "Programming Language :: Python :: 3.14",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
     keywords="datasets machine learning datasets",

diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
@@ -713,6 +713,7 @@ def test_dataset_with_audio_feature_loaded_from_cache():
     assert isinstance(ds, Dataset)
 
 
+@require_torchcodec
 def test_dataset_with_audio_feature_undecoded(shared_datadir):
     audio_path = str(shared_datadir / "test_audio_44100.wav")
     data = {"audio": [audio_path]}
@@ -730,6 +731,7 @@ def test_dataset_with_audio_feature_undecoded(shared_datadir):
     assert column[0] == {"path": audio_path, "bytes": None}
 
 
+@require_torchcodec
 def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir):
     audio_path = str(shared_datadir / "test_audio_44100.wav")
     data = {"audio": [audio_path]}
@@ -761,6 +763,7 @@ def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir):
         assert column[0] == {"path": audio_path, "bytes": None}
 
 
+@require_torchcodec
 def test_dataset_with_audio_feature_map_undecoded(shared_datadir):
     audio_path = str(shared_datadir / "test_audio_44100.wav")
     data = {"audio": [audio_path]}

diff --git a/tests/test_extract.py b/tests/test_extract.py
@@ -1,5 +1,4 @@
 import os
-import zipfile
 
 import pytest
 
@@ -199,5 +198,5 @@ def test_is_zipfile_false_positive(tmpdir):
     )
     with not_a_zip_file.open("wb") as f:
         f.write(data)
-    assert zipfile.is_zipfile(str(not_a_zip_file))  # is a false positive for `zipfile`
+    # zipfile.is_zipfile(str(not_a_zip_file)) could be a false positive for `zipfile`
     assert not ZipExtractor.is_extractable(not_a_zip_file)  # but we're right
diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py
@@ -26,6 +26,7 @@
     require_spacy,
     require_tiktoken,
     require_torch,
+    require_torch_compile,
     require_transformers,
 )
 
@@ -347,7 +348,7 @@ def test_hash_spacy_model(self):
         self.assertNotEqual(hash1, hash2)
 
     @require_not_windows
-    @require_torch
+    @require_torch_compile
     def test_hash_torch_compiled_function(self):
         import torch
 
@@ -360,7 +361,7 @@ def f(x):
         self.assertEqual(hash1, hash2)
 
     @require_not_windows
-    @require_torch
+    @require_torch_compile
     def test_hash_torch_compiled_module(self):
         m = TorchModule()
         next(iter(m.parameters())).data.fill_(1.0)

diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py
@@ -1553,18 +1553,21 @@ def test_iterable_dataset_from_hub_torch_dataloader_parallel(num_workers, tmp_pa
     assert len(result) == 10
 
 
+def gen_with_worker_info(shard):  # yields 100 rows, each tagged with the current worker's id
+    from torch.utils.data import get_worker_info
+
+    worker_info = get_worker_info()  # None outside a DataLoader worker; `.id` below assumes a worker process
+    for i in range(100):
+        yield {"value": i, "worker_id": worker_info.id}
+
+
 @require_torch
 def test_iterable_dataset_shuffle_with_multiple_workers_different_rng():
     # GH 7567
-    from torch.utils.data import DataLoader, get_worker_info
-
-    def gen(shard):
-        worker_info = get_worker_info()
-        for i in range(100):
-            yield {"value": i, "worker_id": worker_info.id}
+    from torch.utils.data import DataLoader
 
     num_workers = 20
-    ds = IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers))})
+    ds = IterableDataset.from_generator(gen_with_worker_info, gen_kwargs={"shard": list(range(num_workers))})
     ds = ds.shuffle(buffer_size=100, seed=1234)
     dataloader = DataLoader(ds, batch_size=None, num_workers=num_workers)
 
@@ -1575,18 +1578,19 @@ def gen(shard):
         assert len(set(values)) != 1, "Make sure not all values are identical"
 
 
+def gen_with_value(shard, value):  # `shard` is accepted but not read; yields the same row 100 times
+    for i in range(100):
+        yield {"value": value}
+
+
 @require_torch
 def test_iterable_dataset_interleave_dataset_with_multiple_workers():
     # GH 7567
     from torch.utils.data import DataLoader
 
-    def gen(shard, value):
-        for i in range(100):
-            yield {"value": value}
-
     num_workers = 20
     ds = [
-        IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i})
+        IterableDataset.from_generator(gen_with_value, gen_kwargs={"shard": list(range(num_workers)), "value": i})
         for i in range(10)
     ]
     ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234)
@@ -1598,18 +1602,19 @@ def gen(shard, value):
         assert len(set(values)) != 1, "Make sure not all values are identical"
 
 
+def gen_with_id(shard, value):  # `shard` is accepted but not read; yields 50 rows with ids 0..49
+    for i in range(50):
+        yield {"value": value, "id": i}
+
+
 @require_torch
 def test_iterable_dataset_interleave_dataset_deterministic_across_iterations():
     # GH 7567
     from torch.utils.data import DataLoader
 
-    def gen(shard, value):
-        for i in range(50):
-            yield {"value": value, "id": i}
-
     num_workers = 10
     ds = [
-        IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i})
+        IterableDataset.from_generator(gen_with_id, gen_kwargs={"shard": list(range(num_workers)), "value": i})
         for i in range(5)
     ]
     ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234)

diff --git a/tests/test_py_utils.py b/tests/test_py_utils.py
@@ -1,4 +1,5 @@
 import os
+import pickle
 import time
 from dataclasses import dataclass
 from multiprocessing import Pool
@@ -81,7 +82,7 @@ def test_map_nested(self):
             {k: v.tolist() for k, v in map_nested(int, sn1, map_numpy=True, num_proc=num_proc).items()},
             {k: v.tolist() for k, v in expected_map_nested_sn1_int.items()},
         )
-        with self.assertRaises(AttributeError):  # can't pickle a local lambda
+        with self.assertRaises((AttributeError, pickle.PicklingError)):  # can't pickle a local lambda
             map_nested(lambda x: x + 1, sn1, num_proc=num_proc)
 
     def test_zip_dict(self):

diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py
@@ -1,5 +1,6 @@
 import json
 import os
+from pathlib import Path
 
 import pytest
 
@@ -26,10 +27,16 @@
 Bulbasaur, grass"""
 
 
-@pytest.mark.parametrize("urlpath", [r"C:\\foo\bar.txt", "/foo/bar.txt", "https://f.oo/bar.txt"])
-def test_streaming_dl_manager_download_dummy_path(urlpath):
+def test_streaming_dl_manager_download_dummy_path():
+    path = str(Path(__file__).resolve())
     dl_manager = StreamingDownloadManager()
-    assert dl_manager.download(urlpath) == urlpath
+    assert dl_manager.download(path) == path
+
+
+def test_streaming_dl_manager_download_dummy_url():
+    url = "https://f.oo/bar.txt"
+    dl_manager = StreamingDownloadManager()
+    assert dl_manager.download(url) == url
 
 
 @pytest.mark.parametrize(
@@ -54,10 +61,16 @@ def test_streaming_dl_manager_download(text_path):
         assert f.read() == expected_file.read()
 
 
-@pytest.mark.parametrize("urlpath", [r"C:\\foo\bar.txt", "/foo/bar.txt", "https://f.oo/bar.txt"])
-def test_streaming_dl_manager_download_and_extract_no_extraction(urlpath):
+def test_streaming_dl_manager_download_and_extract_no_extraction_dummy_path():
+    path = str(Path(__file__).resolve())
+    dl_manager = StreamingDownloadManager()
+    assert dl_manager.download_and_extract(path) == path
+
+
+def test_streaming_dl_manager_download_and_extract_no_extraction_dummy_url():
+    url = "https://f.oo/bar.txt"
     dl_manager = StreamingDownloadManager()
-    assert dl_manager.download_and_extract(urlpath) == urlpath
+    assert dl_manager.download_and_extract(url) == url
 
 
 def test_streaming_dl_manager_extract(text_gz_path, text_path):

diff --git a/tests/utils.py b/tests/utils.py
@@ -125,6 +125,20 @@ def require_torch(test_case):
     return test_case
 
 
+def require_torch_compile(test_case):
+    """
+    Decorator marking a test that requires PyTorch with torch compile support.
+
+    These tests are skipped when PyTorch isn't installed or when running on Python 3.14 or later.
+
+    """
+    if not config.TORCH_AVAILABLE:
+        test_case = unittest.skip("test requires PyTorch")(test_case)
+    if config.PY_VERSION >= version.parse("3.14"):
+        test_case = unittest.skip("test requires torch compile which isn't available in python 3.14")(test_case)
+    return test_case
+
+
 def require_polars(test_case):
     """
     Decorator marking a test that requires Polars.