From 3248518aac97eed4203089373332ff38ce31b0e0 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 8 Jun 2022 13:25:45 +0200 Subject: [PATCH 01/21] Remove python 3.6 code --- src/datasets/__init__.py | 6 ++++ src/datasets/utils/py_utils.py | 14 --------- tests/commands/test_dummy_data.py | 51 +++++++++---------------------- tests/commands/test_test.py | 50 +++++++++--------------------- 4 files changed, 36 insertions(+), 85 deletions(-) diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index e208dba50fc..3155dfdb720 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -19,9 +19,14 @@ __version__ = "2.2.3.dev0" +import platform import pyarrow from packaging import version +if version.parse(platform.python_version()) < version.parse("3.7"): + raise ImportWarning( + "To use `datasets`, `python>=3.7` is required, and the current version of python doesn't match this condition." + ) if version.parse(pyarrow.__version__).major < 6: raise ImportWarning( @@ -31,6 +36,7 @@ SCRIPTS_VERSION = "master" if version.parse(__version__).is_devrelease else __version__ +del platform del pyarrow del version diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py index 42a30351eaf..816ac0c6708 100644 --- a/src/datasets/utils/py_utils.py +++ b/src/datasets/utils/py_utils.py @@ -23,7 +23,6 @@ import os import pickle import re -import sys import types from contextlib import contextmanager from io import BytesIO as StringIO @@ -419,19 +418,6 @@ class Pickler(dill.Pickler): dispatch = dill._dill.MetaCatchingDict(dill.Pickler.dispatch.copy()) - def save_global(self, obj, name=None): - if sys.version_info[:2] < (3, 7) and _CloudPickleTypeHintFix._is_parametrized_type_hint( - obj - ): # noqa # pragma: no branch - # Parametrized typing constructs in Python < 3.7 are not compatible - # with type checks and ``isinstance`` semantics. For this reason, - # it is easier to detect them using a duck-typing-based check - # (``_is_parametrized_type_hint``) than to populate the Pickler's - # dispatch with type-specific savers. - _CloudPickleTypeHintFix._save_parametrized_type_hint(self, obj) - else: - dill.Pickler.save_global(self, obj, name=name) - def memoize(self, obj): # don't memoize strings since two identical strings can have different python ids if type(obj) != str: diff --git a/tests/commands/test_dummy_data.py b/tests/commands/test_dummy_data.py index 81d5ccb4568..7402be4099d 100644 --- a/tests/commands/test_dummy_data.py +++ b/tests/commands/test_dummy_data.py @@ -1,45 +1,24 @@ import os from collections import namedtuple -from dataclasses import dataclass -from packaging import version - -from datasets import config from datasets.commands.dummy_data import DummyDataCommand -if config.PY_VERSION >= version.parse("3.7"): - DummyDataCommandArgs = namedtuple( - "DummyDataCommandArgs", - [ - "path_to_dataset", - "auto_generate", - "n_lines", - "json_field", - "xml_tag", - "match_text_files", - "keep_uncompressed", - "cache_dir", - "encoding", - ], - defaults=[False, 5, None, None, None, False, None, None], - ) -else: - - @dataclass - class DummyDataCommandArgs: - path_to_dataset: str - auto_generate: bool = False - n_lines: int = 5 - json_field: str = None - xml_tag: str = None - match_text_files: str = None - keep_uncompressed: bool = False - cache_dir: str = None - encoding: str = None - - def __iter__(self): - return iter(self.__dict__.values()) +DummyDataCommandArgs = namedtuple( + "DummyDataCommandArgs", + [ + "path_to_dataset", + "auto_generate", + "n_lines", + "json_field", + "xml_tag", + "match_text_files", + "keep_uncompressed", + "cache_dir", + "encoding", + ], + defaults=[False, 5, None, None, None, False, None, None], +) class MockDummyDataCommand(DummyDataCommand): diff --git a/tests/commands/test_test.py b/tests/commands/test_test.py index ccb9ca664ab..26dcef37d24 100644 --- a/tests/commands/test_test.py +++ b/tests/commands/test_test.py @@ -1,46 +1,26 @@ import json import os from collections import namedtuple -from dataclasses import dataclass - -from packaging import version from datasets import config from datasets.commands.test import TestCommand -if config.PY_VERSION >= version.parse("3.7"): - TestCommandArgs = namedtuple( - "TestCommandArgs", - [ - "dataset", - "name", - "cache_dir", - "data_dir", - "all_configs", - "save_infos", - "ignore_verifications", - "force_redownload", - "clear_cache", - ], - defaults=[None, None, None, False, False, False, False, False], - ) -else: - - @dataclass - class TestCommandArgs: - dataset: str - name: str = None - cache_dir: str = None - data_dir: str = None - all_configs: bool = False - save_infos: bool = False - ignore_verifications: bool = False - force_redownload: bool = False - clear_cache: bool = False - - def __iter__(self): - return iter(self.__dict__.values()) +TestCommandArgs = namedtuple( + "TestCommandArgs", + [ + "dataset", + "name", + "cache_dir", + "data_dir", + "all_configs", + "save_infos", + "ignore_verifications", + "force_redownload", + "clear_cache", + ], + defaults=[None, None, None, False, False, False, False, False], +) def test_test_command(dataset_loading_script_dir): From 6ad94e1fe307fe78181b6d1130338f1783759344 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 8 Jun 2022 13:26:06 +0200 Subject: [PATCH 02/21] Update requirements --- .circleci/config.yml | 10 +++++----- .github/hub/update_hub_repositories.py | 1 - Makefile | 4 ++-- additional-tests-requirements.txt | 2 +- docs/source/installation.md | 2 +- setup.py | 7 +------ 6 files changed, 10 insertions(+), 16 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e5c71f88d49..c64cf2d2f0f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,7 +7,7 @@ jobs: run_dataset_script_tests_pyarrow_latest: working_directory: ~/datasets docker: - - image: cimg/python:3.6 + - image: cimg/python:3.7 resource_class: medium steps: - checkout @@ -18,12 +18,12 @@ jobs: - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow --upgrade - - run: HF_SCRIPTS_VERSION=master HF_ALLOW_CODE_EVAL=1 python -m pytest -d --tx 2*popen//python=python3.6 --dist loadfile -sv ./tests/ + - run: HF_SCRIPTS_VERSION=master HF_ALLOW_CODE_EVAL=1 python -m pytest -d --tx 2*popen//python=python3.7 --dist loadfile -sv ./tests/ run_dataset_script_tests_pyarrow_6: working_directory: ~/datasets docker: - - image: cimg/python:3.6 + - image: cimg/python:3.7 resource_class: medium steps: - checkout @@ -34,7 +34,7 @@ jobs: - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow==6.0.0 - - run: HF_SCRIPTS_VERSION=master HF_ALLOW_CODE_EVAL=1 python -m pytest -d --tx 2*popen//python=python3.6 --dist loadfile -sv ./tests/ + - run: HF_SCRIPTS_VERSION=master HF_ALLOW_CODE_EVAL=1 python -m pytest -d --tx 2*popen//python=python3.7 --dist loadfile -sv ./tests/ run_dataset_script_tests_pyarrow_latest_WIN: working_directory: ~/datasets @@ -81,7 +81,7 @@ jobs: check_code_quality: working_directory: ~/datasets docker: - - image: cimg/python:3.6 + - image: cimg/python:3.7 resource_class: medium parallelism: 1 steps: diff --git a/.github/hub/update_hub_repositories.py b/.github/hub/update_hub_repositories.py index 875cbf80bd2..c923583ba7f 100644 --- a/.github/hub/update_hub_repositories.py +++ b/.github/hub/update_hub_repositories.py @@ -1,4 +1,3 @@ -import base64 import distutils.dir_util import logging import os diff --git a/Makefile b/Makefile index e3615d44ed0..b7936753dba 100644 --- a/Makefile +++ b/Makefile @@ -3,14 +3,14 @@ # Check that source code meets quality standards quality: - black --check --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics + black --check --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics isort --check-only tests src benchmarks datasets/**/*.py metrics flake8 tests src benchmarks datasets/**/*.py metrics # Format source code automatically style: - black --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics + black --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics isort tests src benchmarks datasets/**/*.py metrics # Run tests for the library diff --git a/additional-tests-requirements.txt b/additional-tests-requirements.txt index a827c308c9f..00b5b8d62a3 100644 --- a/additional-tests-requirements.txt +++ b/additional-tests-requirements.txt @@ -1,4 +1,4 @@ -unbabel-comet>=1.0.0;python_version>'3.6' +unbabel-comet>=1.0.0 git+https://github.com/google-research/bleurt.git git+https://github.com/ns-moosavi/coval.git git+https://github.com/hendrycks/math.git diff --git a/docs/source/installation.md b/docs/source/installation.md index 4ea7c7ad2a1..033b767c01d 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,6 +1,6 @@ # Installation -Before you start, you will need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.6+**. +Before you start, you will need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.7+**. diff --git a/setup.py b/setup.py index 2ffc3ffd46a..01b567b7f85 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,6 @@ Then push the change with a message 'set dev version' """ -import os from setuptools import find_packages, setup @@ -74,8 +73,6 @@ "requests>=2.19.0", # progress bars in download and scripts "tqdm>=4.62.1", - # dataclasses for Python versions that don't have it - "dataclasses;python_version<'3.7'", # for fast hashing "xxhash", # for better multiprocessing @@ -162,8 +159,6 @@ "texttable>=1.6.3", "Werkzeug>=1.0.1", "six~=1.15.0", - # metadata validation - "importlib_resources;python_version<'3.7'", ] TESTS_REQUIRE.extend(VISION_REQURE) @@ -211,6 +206,7 @@ packages=find_packages("src"), package_data={"datasets": ["py.typed", "scripts/templates/*"], "datasets.utils.resources": ["*.json", "*.yaml"]}, entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]}, + python_requires=">=3.7.0", install_requires=REQUIRED_PKGS, extras_require=EXTRAS_REQUIRE, classifiers=[ @@ -221,7 +217,6 @@ "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", From 3800c495c62fedf8aa0c882754f8504133c8cdcd Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 8 Jun 2022 13:32:48 +0200 Subject: [PATCH 03/21] Style --- src/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index 3155dfdb720..e4806a9f0b2 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -20,9 +20,11 @@ __version__ = "2.2.3.dev0" import platform + import pyarrow from packaging import version + if version.parse(platform.python_version()) < version.parse("3.7"): raise ImportWarning( "To use `datasets`, `python>=3.7` is required, and the current version of python doesn't match this condition." From 32aab3db1948864ef5ce7a7e63b272e0f2e9701c Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 8 Jun 2022 13:49:33 +0200 Subject: [PATCH 04/21] Update audio gh action --- .github/workflows/test-audio.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-audio.yml b/.github/workflows/test-audio.yml index 68e0b8f0b3b..805ca70777d 100644 --- a/.github/workflows/test-audio.yml +++ b/.github/workflows/test-audio.yml @@ -19,7 +19,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: "3.6" + python-version: "3.7" - name: Install dependencies run: | python -m pip install --upgrade pip From 2dee899a6a7ded7c40669d36b450e1603fb2a8e0 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 8 Jun 2022 14:10:07 +0200 Subject: [PATCH 05/21] Benchmarks fix attempt #1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 01b567b7f85..931e5c8ee2f 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,7 @@ BENCHMARKS_REQUIRE = [ "numpy==1.18.5", "tensorflow==2.3.0", - "torch==1.6.0", + "torch==1.7.0", "transformers==3.0.2", ] From 8c6ce1920211b14a34ee1e101ad47ca69006f185 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 8 Jun 2022 15:09:17 +0200 Subject: [PATCH 06/21] Benchmarks fix attempt no.2 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 931e5c8ee2f..b2a5ba43256 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,7 @@ BENCHMARKS_REQUIRE = [ "numpy==1.18.5", "tensorflow==2.3.0", - "torch==1.7.0", + "torch==1.11.0", "transformers==3.0.2", ] From 89b9ea64a4ba2b9bf75c35bc7acac5f9c15f5b12 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 13 Jun 2022 16:37:11 +0200 Subject: [PATCH 07/21] Use newer image --- .github/workflows/benchmarks.yaml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml index 81c524ff8c5..216337e7fa6 100644 --- a/.github/workflows/benchmarks.yaml +++ b/.github/workflows/benchmarks.yaml @@ -3,7 +3,7 @@ on: [push] jobs: run: runs-on: [ubuntu-latest] - container: docker://dvcorg/cml-py3:latest + container: docker://dvcorg/cml:latest steps: - uses: actions/checkout@v2 - name: cml_run diff --git a/setup.py b/setup.py index b2a5ba43256..a960efedc86 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,7 @@ BENCHMARKS_REQUIRE = [ "numpy==1.18.5", "tensorflow==2.3.0", - "torch==1.11.0", + "torch==1.7.1", "transformers==3.0.2", ] From f9b24c4256992487ae9c0336850d8af435b7b499 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 13 Jun 2022 16:43:29 +0200 Subject: [PATCH 08/21] Remove backticks --- src/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index e4806a9f0b2..6648525d214 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -27,7 +27,7 @@ if version.parse(platform.python_version()) < version.parse("3.7"): raise ImportWarning( - "To use `datasets`, `python>=3.7` is required, and the current version of python doesn't match this condition." + "To use `datasets`, Python>=3.7 is required, and the current version of Python doesn't match this condition." ) if version.parse(pyarrow.__version__).major < 6: From 8a82d72d418c2347df088558e312a3c0bf05a2fe Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 13 Jun 2022 17:04:52 +0200 Subject: [PATCH 09/21] Add suggested command to benchmark action --- .github/workflows/benchmarks.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml index 216337e7fa6..871ac192b5a 100644 --- a/.github/workflows/benchmarks.yaml +++ b/.github/workflows/benchmarks.yaml @@ -10,6 +10,9 @@ jobs: env: repo_token: ${{ secrets.GITHUB_TOKEN }} run: | + # See https://github.com/actions/checkout/issues/760 + git config --global --add safe.directory /__w/datasets/datasets + # Your ML workflow goes here pip install --upgrade pip From 6eab0e4ef8c5873ffbbdc956ef4d7ab3ce5388f7 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 14 Jun 2022 03:14:03 +0200 Subject: [PATCH 10/21] Avoid some FutureWarnings and DeprecationWarnings --- src/datasets/arrow_dataset.py | 4 ++-- src/datasets/features/features.py | 2 +- src/datasets/formatting/formatting.py | 4 ++-- src/datasets/formatting/jax_formatter.py | 2 +- src/datasets/formatting/tf_formatter.py | 4 +--- src/datasets/formatting/torch_formatter.py | 2 +- src/datasets/utils/stratify.py | 2 +- tests/features/test_array_xd.py | 2 +- tests/test_builder.py | 4 ++-- 9 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 22150f4ce66..0d48b87cb1f 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -280,7 +280,7 @@ def _get_output_signature( else: np_arrays.append(np.array(array)) - if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.bool: + if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == bool: tf_dtype = tf.int64 np_dtype = np.int64 elif np.issubdtype(np_arrays[0].dtype, np.number): @@ -3663,7 +3663,7 @@ def _feature(values: Union[float, int, str, np.ndarray]) -> "tf.train.Feature": return _float_feature([values.item()]) elif np.issubdtype(values.dtype, np.integer): return _int64_feature([values.item()]) - elif np.issubdtype(values.dtype, np.str): + elif np.issubdtype(values.dtype, str): return _bytes_feature([values.item().encode()]) else: raise ValueError(f"values={values} has dtype {values.dtype}, which cannot be serialized") diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 3e8301df08b..9ddad51e2e9 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -810,7 +810,7 @@ def __getitem__(self, item: Union[int, slice, np.ndarray]) -> Union[np.ndarray, def take( self, indices: Sequence_[int], allow_fill: bool = False, fill_value: bool = None ) -> "PandasArrayExtensionArray": - indices: np.ndarray = np.asarray(indices, dtype=np.int) + indices: np.ndarray = np.asarray(indices, dtype=int) if allow_fill: fill_value = ( self.dtype.na_value if fill_value is None else np.asarray(fill_value, dtype=self.dtype.value_type) diff --git a/src/datasets/formatting/formatting.py b/src/datasets/formatting/formatting.py index f857a53a992..ecc5b5e6606 100644 --- a/src/datasets/formatting/formatting.py +++ b/src/datasets/formatting/formatting.py @@ -194,11 +194,11 @@ def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray: array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist() if len(array) > 0: if any( - (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape)) + (isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape)) or (isinstance(x, float) and np.isnan(x)) for x in array ): - return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object}) + return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object}) return np.array(array, copy=False, **self.np_array_kwargs) diff --git a/src/datasets/formatting/jax_formatter.py b/src/datasets/formatting/jax_formatter.py index dffe37bc5f0..0a554203be5 100644 --- a/src/datasets/formatting/jax_formatter.py +++ b/src/datasets/formatting/jax_formatter.py @@ -54,7 +54,7 @@ def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): data_struct = np.array(data_struct, copy=False) - if data_struct.dtype == np.object: # jax arrays cannot be instantied from an array of objects + if data_struct.dtype == object: # jax arrays cannot be instantied from an array of objects return [self.recursive_tensorize(substruct) for substruct in data_struct] return self._tensorize(data_struct) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 7e835280c54..d07f1f636cc 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -65,9 +65,7 @@ def _tensorize(self, value): def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): - if ( - data_struct.dtype == np.object - ): # tensorflow tensors can sometimes be instantied from an array of objects + if data_struct.dtype == object: # tensorflow tensors can sometimes be instantied from an array of objects try: return self._tensorize(data_struct) except ValueError: diff --git a/src/datasets/formatting/torch_formatter.py b/src/datasets/formatting/torch_formatter.py index 3106a024920..c5a7d3c214f 100644 --- a/src/datasets/formatting/torch_formatter.py +++ b/src/datasets/formatting/torch_formatter.py @@ -46,7 +46,7 @@ def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): data_struct = np.array(data_struct, copy=False) - if data_struct.dtype == np.object: # pytorch tensors cannot be instantied from an array of objects + if data_struct.dtype == object: # pytorch tensors cannot be instantied from an array of objects return [self.recursive_tensorize(substruct) for substruct in data_struct] return self._tensorize(data_struct) diff --git a/src/datasets/utils/stratify.py b/src/datasets/utils/stratify.py index 58ea04f2f85..3a72f6fc3f2 100644 --- a/src/datasets/utils/stratify.py +++ b/src/datasets/utils/stratify.py @@ -48,7 +48,7 @@ def approximate_mode(class_counts, n_draws, rng): need_to_add -= add_now if need_to_add == 0: break - return floored.astype(np.int) + return floored.astype(int) def stratified_shuffle_split_generate_indices(y, n_train, n_test, rng, n_splits=10): diff --git a/tests/features/test_array_xd.py b/tests/features/test_array_xd.py index ae007abbe00..2f6d9d94009 100644 --- a/tests/features/test_array_xd.py +++ b/tests/features/test_array_xd.py @@ -335,7 +335,7 @@ def test_array_xd_with_none(): dummy_array = np.array([[1, 2], [3, 4]], dtype="int32") dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features) arr = NumpyArrowExtractor().extract_column(dataset._data) - assert isinstance(arr, np.ndarray) and arr.dtype == np.object and arr.shape == (3,) + assert isinstance(arr, np.ndarray) and arr.dtype == object and arr.shape == (3,) np.testing.assert_equal(arr[0], dummy_array) np.testing.assert_equal(arr[2], dummy_array) assert np.isnan(arr[1]) # a single np.nan value - np.all not needed diff --git a/tests/test_builder.py b/tests/test_builder.py index 853859d40ca..c7e08e0f01b 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -837,8 +837,8 @@ def _generate_examples(self): "builder_class, kwargs", [ (DummyBuilderWithVersion, {}), - (DummyBuilderWithBuilderConfigs, {"name": "custom"}), - (DummyBuilderWithCustomBuilderConfigs, {"name": "20220501.en"}), + (DummyBuilderWithBuilderConfigs, {"config_name": "custom"}), + (DummyBuilderWithCustomBuilderConfigs, {"config_name": "20220501.en"}), (DummyBuilderWithCustomBuilderConfigs, {"date": "20220501", "language": "ca"}), ], ) From 97ee6987e9bab5149e4477d7ecbc3a16c1e21bfe Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 16 Jun 2022 19:11:04 +0200 Subject: [PATCH 11/21] Disable test --- tests/test_arrow_dataset.py | 6 +++--- tests/test_dataset_dict.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index d43b1509c50..68370bf5b28 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -3056,9 +3056,9 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param @pytest.mark.skipif( - os.name == "nt" and os.getenv("CIRCLECI") == "true", - reason='On Windows CircleCI, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', -) # TODO(QL): find what's wrong with CircleCI + os.name in ["posix", "nt"] and os.getenv("CIRCLECI") == "true", + reason='On Linux/Windows CircleCI, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', +) # TODO(QL, mario): find what's wrong with CircleCI @require_s3 def test_dummy_dataset_serialize_s3(s3, dataset): mock_bucket = s3_test_bucket_name diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py index 814d04ee370..7e78b759030 100644 --- a/tests/test_dataset_dict.py +++ b/tests/test_dataset_dict.py @@ -656,9 +656,9 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path): @pytest.mark.skipif( - os.name == "nt" and os.getenv("CIRCLECI") == "true", - reason='On Windows CircleCI, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', -) # TODO(QL): find what's wrong with CircleCI + os.name in ["posix", "nt"] and os.getenv("CIRCLECI") == "true", + reason='On Linux/Windows CircleCI, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', +) # TODO(QL, mario): find what's wrong with CircleCI @require_s3 def test_dummy_dataset_serialize_s3(s3, dataset): dsets = DatasetDict({"train": dataset, "test": dataset.select(range(2))}) From 30de7fa2d2d3cdcc01b50b0d82d948fbf4aa068e Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 20 Jul 2022 13:48:19 +0200 Subject: [PATCH 12/21] Remove 3.6 pickling test --- tests/test_arrow_dataset.py | 2 +- tests/test_dataset_dict.py | 2 +- tests/test_fingerprint.py | 15 --------------- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 055b3d9f4ff..3bb44ce930e 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -3119,7 +3119,7 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param @pytest.mark.skipif( - os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), + os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', ) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py index 2f612dd4ddc..643edf043ae 100644 --- a/tests/test_dataset_dict.py +++ b/tests/test_dataset_dict.py @@ -665,7 +665,7 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path): @pytest.mark.skipif( - os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), + os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', ) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index 0ffe453fcc1..23ca0c4e0af 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -226,21 +226,6 @@ def globalvars_mock2_side_effect(func, *args, **kwargs): self.assertEqual(hash1, hash2) -class TypeHintDumpTest(TestCase): - def test_dump_type_hint(self): - from typing import Union - - t1 = Union[str, None] # this type is not picklable in python 3.6 - # let's check that we can pickle it anyway using our pickler, even in 3.6 - hash1 = md5(datasets.utils.py_utils.dumps(t1)).hexdigest() - t2 = Union[str] # this type is picklable in python 3.6 - hash2 = md5(datasets.utils.py_utils.dumps(t2)).hexdigest() - t3 = Union[str, None] - hash3 = md5(datasets.utils.py_utils.dumps(t3)).hexdigest() - self.assertEqual(hash1, hash3) - self.assertNotEqual(hash1, hash2) - - class HashingTest(TestCase): def test_hash_simple(self): hash1 = Hasher.hash("hello") From 983b04e4a946afc8ec2cc32e47ef2173d46ab7ee Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 22 Jul 2022 13:00:33 +0200 Subject: [PATCH 13/21] CI test From eac1aaa3ad6feecd387c154538a46c63db5ff010 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 22 Jul 2022 13:30:04 +0200 Subject: [PATCH 14/21] Use python 3.7 in ubuntu-latest --- .github/workflows/ci.yml | 10 ++-------- src/datasets/packaged_modules/text/dataset_infos.json | 1 + 2 files changed, 3 insertions(+), 8 deletions(-) create mode 100644 src/datasets/packaged_modules/text/dataset_infos.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b4f14085ee4..5513c1c6397 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.6" + python-version: "3.7" - name: Install dependencies run: | python -m pip install --upgrade pip @@ -49,13 +49,7 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 - - name: Set up Python 3.6 - if: ${{ matrix.os == 'ubuntu-latest' }} - uses: actions/setup-python@v4 - with: - python-version: 3.6 - name: Set up Python 3.7 - if: ${{ matrix.os == 'windows-latest' }} uses: actions/setup-python@v4 with: python-version: 3.7 @@ -63,7 +57,7 @@ jobs: run: python -m pip install --upgrade pip - name: Pin setuptools-scm if: ${{ matrix.os == 'ubuntu-latest' }} - run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.6" && pip install "setuptools-scm==6.4.2" + run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.7" && pip install "setuptools-scm==6.4.2" - name: Install dependencies run: | pip install .[tests] diff --git a/src/datasets/packaged_modules/text/dataset_infos.json b/src/datasets/packaged_modules/text/dataset_infos.json new file mode 100644 index 00000000000..c647519d3b4 --- /dev/null +++ b/src/datasets/packaged_modules/text/dataset_infos.json @@ -0,0 +1 @@ +{"bigscience": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "text", "config_name": "bigscience", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 938, "num_examples": 22, "dataset_name": "text"}}, "download_checksums": {"C:\\Users\\Mario\\Desktop\\bigscience\\biscience.txt": {"num_bytes": 892, "checksum": "1e1f85c9e2aefb6990dc6ec4a8805af1e5451ebecb7e9f50face10c83eed742e"}}, "download_size": 892, "post_processing_size": null, "dataset_size": 938, "size_in_bytes": 1830}} \ No newline at end of file From 4db3cf9330e098ed0cc80651140c21b11e79fa0f Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 22 Jul 2022 13:58:01 +0200 Subject: [PATCH 15/21] Disable s3 test on Linux --- tests/test_arrow_dataset.py | 2 +- tests/test_dataset_dict.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 3bb44ce930e..055b3d9f4ff 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -3119,7 +3119,7 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param @pytest.mark.skipif( - os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), + os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', ) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py index 643edf043ae..2f612dd4ddc 100644 --- a/tests/test_dataset_dict.py +++ b/tests/test_dataset_dict.py @@ -665,7 +665,7 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path): @pytest.mark.skipif( - os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), + os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', ) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 From 2674f0c7e9e64d1c9694f87ecd1f773cde3aced5 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 26 Jul 2022 12:47:05 +0200 Subject: [PATCH 16/21] Remove weird json file --- src/datasets/packaged_modules/text/dataset_infos.json | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/datasets/packaged_modules/text/dataset_infos.json diff --git a/src/datasets/packaged_modules/text/dataset_infos.json b/src/datasets/packaged_modules/text/dataset_infos.json deleted file mode 100644 index c647519d3b4..00000000000 --- a/src/datasets/packaged_modules/text/dataset_infos.json +++ /dev/null @@ -1 +0,0 @@ -{"bigscience": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "text", "config_name": "bigscience", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 938, "num_examples": 22, "dataset_name": "text"}}, "download_checksums": {"C:\\Users\\Mario\\Desktop\\bigscience\\biscience.txt": {"num_bytes": 892, "checksum": "1e1f85c9e2aefb6990dc6ec4a8805af1e5451ebecb7e9f50face10c83eed742e"}}, "download_size": 892, "post_processing_size": null, "dataset_size": 938, "size_in_bytes": 1830}} \ No newline at end of file From c3bc52d6469bb96cd90548f424b907ec7aa20c84 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 26 Jul 2022 12:50:37 +0200 Subject: [PATCH 17/21] Remove cloudpickle stuff --- src/datasets/utils/py_utils.py | 44 +--------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py index 48ba2293cb0..73a22d8f20e 100644 --- a/src/datasets/utils/py_utils.py +++ b/src/datasets/utils/py_utils.py @@ -21,7 +21,6 @@ import functools import itertools import os -import pickle import re import types from contextlib import contextmanager @@ -29,7 +28,7 @@ from multiprocessing import Pool, RLock from shutil import disk_usage from types import CodeType, FunctionType -from typing import Callable, ClassVar, Dict, Generic, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from urllib.parse import urlparse import dill @@ -559,47 +558,6 @@ def proxy(func): return proxy -class _CloudPickleTypeHintFix: - """ - Type hints can't be properly pickled in python < 3.7 - CloudPickle provided a way to make it work in older versions. - This class provide utilities to fix pickling of type hints in older versions. - from https://github.com/cloudpipe/cloudpickle/pull/318/files - """ - - def _is_parametrized_type_hint(obj): - # This is very cheap but might generate false positives. - origin = getattr(obj, "__origin__", None) # typing Constructs - values = getattr(obj, "__values__", None) # typing_extensions.Literal - type_ = getattr(obj, "__type__", None) # typing_extensions.Final - return origin is not None or values is not None or type_ is not None - - def _create_parametrized_type_hint(origin, args): - return origin[args] - - def _save_parametrized_type_hint(pickler, obj): - # The distorted type check sematic for typing construct becomes: - # ``type(obj) is type(TypeHint)``, which means "obj is a - # parametrized TypeHint" - if type(obj) is type(Literal): # pragma: no branch - initargs = (Literal, obj.__values__) - elif type(obj) is type(Final): # pragma: no branch - initargs = (Final, obj.__type__) - elif type(obj) is type(ClassVar): - initargs = (ClassVar, obj.__type__) - elif type(obj) in [type(Union), type(Tuple), type(Generic)]: - initargs = (obj.__origin__, obj.__args__) - elif type(obj) is type(Callable): - args = obj.__args__ - if args[0] is Ellipsis: - initargs = (obj.__origin__, args) - else: - initargs = (obj.__origin__, (list(args[:-1]), args[-1])) - else: # pragma: no cover - raise pickle.PicklingError(f"Datasets pickle Error: Unknown type {type(obj)}") - pickler.save_reduce(_CloudPickleTypeHintFix._create_parametrized_type_hint, initargs, obj=obj) - - @pklregister(CodeType) def _save_code(pickler, obj): """ From ad949dd02f880e3f3fb525f09c0eaaa90e0f1aa4 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 26 Jul 2022 13:15:18 +0200 Subject: [PATCH 18/21] Use lower torchaudio version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e2ee6cba04f..9085802d5d4 100644 --- a/setup.py +++ b/setup.py @@ -125,7 +125,7 @@ "s3fs>=2021.11.1", # aligned with fsspec[http]>=2021.11.1 "tensorflow>=2.3,!=2.6.0,!=2.6.1", "torch", - "torchaudio", + "torchaudio<0.12.0", "soundfile", "transformers", # datasets dependencies From feebd9048ac42e756155cf221e7364fe2dee6845 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 26 Jul 2022 14:11:49 +0200 Subject: [PATCH 19/21] Try to fix s3 errors --- setup.py | 2 +- tests/test_arrow_dataset.py | 8 ++++---- tests/test_dataset_dict.py | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 9085802d5d4..423787c46d9 100644 --- a/setup.py +++ b/setup.py @@ -120,7 +120,7 @@ "botocore>=1.22.8", # to be compatible with aiobotocore and boto3 "faiss-cpu>=1.6.4", "fsspec[s3]", - "moto[s3,server]==2.0.4", + "moto[s3,server]>=3.0.0", "rarfile>=4.0", "s3fs>=2021.11.1", # aligned with fsspec[http]>=2021.11.1 "tensorflow>=2.3,!=2.6.0,!=2.6.1", diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index afa17dff3c3..0013d13ba55 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -3117,10 +3117,10 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param assert dataset._data.table == reloaded_dataset._data.table -@pytest.mark.skipif( - os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), - reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', -) # TODO: find what's wrong with CircleCI / GitHub Actions +# @pytest.mark.skipif( +# os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), +# reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', +# ) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 def test_dummy_dataset_serialize_s3(s3, dataset, s3_test_bucket_name): mock_bucket = s3_test_bucket_name diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py index 3593030ecbd..743bca6266c 100644 --- a/tests/test_dataset_dict.py +++ b/tests/test_dataset_dict.py @@ -663,10 +663,10 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path): assert all(dataset[split].split == split for split in path.keys()) -@pytest.mark.skipif( - os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), - reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', -) # TODO: find what's wrong with CircleCI / GitHub Actions +# @pytest.mark.skipif( +# os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), +# reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', +# ) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 def test_dummy_dataset_serialize_s3(s3, dataset, s3_test_bucket_name): dsets = DatasetDict({"train": dataset, "test": dataset.select(range(2))}) From 59a1e3d7f40c9f7f6156befc942b7652e3f45be6 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 26 Jul 2022 15:08:38 +0200 Subject: [PATCH 20/21] Another attempt --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 423787c46d9..468bb78ccf2 100644 --- a/setup.py +++ b/setup.py @@ -88,7 +88,7 @@ "huggingface-hub>=0.1.0,<1.0.0", # Utilities from PyPA to e.g., compare versions "packaging", - "responses<0.19", + "responses==0.16", ] AUDIO_REQUIRE = [ @@ -120,7 +120,7 @@ "botocore>=1.22.8", # to be compatible with aiobotocore and boto3 "faiss-cpu>=1.6.4", "fsspec[s3]", - "moto[s3,server]>=3.0.0", + "moto[s3,server]==2.0.4", "rarfile>=4.0", "s3fs>=2021.11.1", # aligned with fsspec[http]>=2021.11.1 "tensorflow>=2.3,!=2.6.0,!=2.6.1", From 9c2b16ce2d4072bd8c686497dd3b7c4d5add4107 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 26 Jul 2022 15:34:14 +0200 Subject: [PATCH 21/21] Disable test --- setup.py | 2 +- tests/test_arrow_dataset.py | 8 ++++---- tests/test_dataset_dict.py | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 468bb78ccf2..9085802d5d4 100644 --- a/setup.py +++ b/setup.py @@ -88,7 +88,7 @@ "huggingface-hub>=0.1.0,<1.0.0", # Utilities from PyPA to e.g., compare versions "packaging", - "responses==0.16", + "responses<0.19", ] AUDIO_REQUIRE = [ diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 0013d13ba55..afa17dff3c3 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -3117,10 +3117,10 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param assert dataset._data.table == reloaded_dataset._data.table -# @pytest.mark.skipif( -# os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), -# reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', -# ) # TODO: find what's wrong with CircleCI / GitHub Actions +@pytest.mark.skipif( + os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), + reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', +) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 def test_dummy_dataset_serialize_s3(s3, dataset, s3_test_bucket_name): mock_bucket = s3_test_bucket_name diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py index 743bca6266c..3593030ecbd 100644 --- a/tests/test_dataset_dict.py +++ b/tests/test_dataset_dict.py @@ -663,10 +663,10 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path): assert all(dataset[split].split == split for split in path.keys()) -# @pytest.mark.skipif( -# os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), -# reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', -# ) # TODO: find what's wrong with CircleCI / GitHub Actions +@pytest.mark.skipif( + os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), + reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', +) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 def test_dummy_dataset_serialize_s3(s3, dataset, s3_test_bucket_name): dsets = DatasetDict({"train": dataset, "test": dataset.select(range(2))})