From 3248518aac97eed4203089373332ff38ce31b0e0 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Wed, 8 Jun 2022 13:25:45 +0200
Subject: [PATCH 01/21] Remove Python 3.6 code

---
 src/datasets/__init__.py          |  6 ++++
 src/datasets/utils/py_utils.py    | 14 ---------
 tests/commands/test_dummy_data.py | 51 +++++++++----------------------
 tests/commands/test_test.py       | 50 +++++++++---------------------
 4 files changed, 36 insertions(+), 85 deletions(-)

diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
index e208dba50fc..3155dfdb720 100644
--- a/src/datasets/__init__.py
+++ b/src/datasets/__init__.py
@@ -19,9 +19,14 @@
 
 __version__ = "2.2.3.dev0"
 
+import platform
 import pyarrow
 from packaging import version
 
+if version.parse(platform.python_version()) < version.parse("3.7"):
+    raise ImportWarning(
+        "To use `datasets`, `python>=3.7` is required, and the current version of python doesn't match this condition."
+    )
 
 if version.parse(pyarrow.__version__).major < 6:
     raise ImportWarning(
@@ -31,6 +36,7 @@
 
 SCRIPTS_VERSION = "master" if version.parse(__version__).is_devrelease else __version__
 
+del platform
 del pyarrow
 del version
 
diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py
index 42a30351eaf..816ac0c6708 100644
--- a/src/datasets/utils/py_utils.py
+++ b/src/datasets/utils/py_utils.py
@@ -23,7 +23,6 @@
 import os
 import pickle
 import re
-import sys
 import types
 from contextlib import contextmanager
 from io import BytesIO as StringIO
@@ -419,19 +418,6 @@ class Pickler(dill.Pickler):
 
     dispatch = dill._dill.MetaCatchingDict(dill.Pickler.dispatch.copy())
 
-    def save_global(self, obj, name=None):
-        if sys.version_info[:2] < (3, 7) and _CloudPickleTypeHintFix._is_parametrized_type_hint(
-            obj
-        ):  # noqa  # pragma: no branch
-            # Parametrized typing constructs in Python < 3.7 are not compatible
-            # with type checks and ``isinstance`` semantics. For this reason,
-            # it is easier to detect them using a duck-typing-based check
-            # (``_is_parametrized_type_hint``) than to populate the Pickler's
-            # dispatch with type-specific savers.
-            _CloudPickleTypeHintFix._save_parametrized_type_hint(self, obj)
-        else:
-            dill.Pickler.save_global(self, obj, name=name)
-
     def memoize(self, obj):
         # don't memoize strings since two identical strings can have different python ids
         if type(obj) != str:
diff --git a/tests/commands/test_dummy_data.py b/tests/commands/test_dummy_data.py
index 81d5ccb4568..7402be4099d 100644
--- a/tests/commands/test_dummy_data.py
+++ b/tests/commands/test_dummy_data.py
@@ -1,45 +1,24 @@
 import os
 from collections import namedtuple
-from dataclasses import dataclass
 
-from packaging import version
-
-from datasets import config
 from datasets.commands.dummy_data import DummyDataCommand
 
 
-if config.PY_VERSION >= version.parse("3.7"):
-    DummyDataCommandArgs = namedtuple(
-        "DummyDataCommandArgs",
-        [
-            "path_to_dataset",
-            "auto_generate",
-            "n_lines",
-            "json_field",
-            "xml_tag",
-            "match_text_files",
-            "keep_uncompressed",
-            "cache_dir",
-            "encoding",
-        ],
-        defaults=[False, 5, None, None, None, False, None, None],
-    )
-else:
-
-    @dataclass
-    class DummyDataCommandArgs:
-        path_to_dataset: str
-        auto_generate: bool = False
-        n_lines: int = 5
-        json_field: str = None
-        xml_tag: str = None
-        match_text_files: str = None
-        keep_uncompressed: bool = False
-        cache_dir: str = None
-        encoding: str = None
-
-        def __iter__(self):
-            return iter(self.__dict__.values())
+DummyDataCommandArgs = namedtuple(
+    "DummyDataCommandArgs",
+    [
+        "path_to_dataset",
+        "auto_generate",
+        "n_lines",
+        "json_field",
+        "xml_tag",
+        "match_text_files",
+        "keep_uncompressed",
+        "cache_dir",
+        "encoding",
+    ],
+    defaults=[False, 5, None, None, None, False, None, None],
+)
 
 
 class MockDummyDataCommand(DummyDataCommand):
diff --git a/tests/commands/test_test.py b/tests/commands/test_test.py
index ccb9ca664ab..26dcef37d24 100644
--- a/tests/commands/test_test.py
+++ b/tests/commands/test_test.py
@@ -1,46 +1,26 @@
 import json
 import os
 from collections import namedtuple
-from dataclasses import dataclass
-
-from packaging import version
 
 from datasets import config
 from datasets.commands.test import TestCommand
 
 
-if config.PY_VERSION >= version.parse("3.7"):
-    TestCommandArgs = namedtuple(
-        "TestCommandArgs",
-        [
-            "dataset",
-            "name",
-            "cache_dir",
-            "data_dir",
-            "all_configs",
-            "save_infos",
-            "ignore_verifications",
-            "force_redownload",
-            "clear_cache",
-        ],
-        defaults=[None, None, None, False, False, False, False, False],
-    )
-else:
-
-    @dataclass
-    class TestCommandArgs:
-        dataset: str
-        name: str = None
-        cache_dir: str = None
-        data_dir: str = None
-        all_configs: bool = False
-        save_infos: bool = False
-        ignore_verifications: bool = False
-        force_redownload: bool = False
-        clear_cache: bool = False
-
-        def __iter__(self):
-            return iter(self.__dict__.values())
+TestCommandArgs = namedtuple(
+    "TestCommandArgs",
+    [
+        "dataset",
+        "name",
+        "cache_dir",
+        "data_dir",
+        "all_configs",
+        "save_infos",
+        "ignore_verifications",
+        "force_redownload",
+        "clear_cache",
+    ],
+    defaults=[None, None, None, False, False, False, False, False],
+)
 
 
 def test_test_command(dataset_loading_script_dir):

From 6ad94e1fe307fe78181b6d1130338f1783759344 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Wed, 8 Jun 2022 13:26:06 +0200
Subject: [PATCH 02/21] Update requirements

---
 .circleci/config.yml                   | 10 +++++-----
 .github/hub/update_hub_repositories.py |  1 -
 Makefile                               |  4 ++--
 additional-tests-requirements.txt      |  2 +-
 docs/source/installation.md            |  2 +-
 setup.py                               |  7 +------
 6 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index e5c71f88d49..c64cf2d2f0f 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7,7 +7,7 @@ jobs:
     run_dataset_script_tests_pyarrow_latest:
         working_directory: ~/datasets
         docker:
-            - image: cimg/python:3.6
+            - image: cimg/python:3.7
         resource_class: medium
         steps:
             - checkout
@@ -18,12 +18,12 @@ jobs:
             - run: pip install .[tests]
             - run: pip install -r additional-tests-requirements.txt --no-deps
             - run: pip install pyarrow --upgrade
-            - run: HF_SCRIPTS_VERSION=master HF_ALLOW_CODE_EVAL=1 python -m pytest -d --tx 2*popen//python=python3.6 --dist loadfile -sv ./tests/
+            - run: HF_SCRIPTS_VERSION=master HF_ALLOW_CODE_EVAL=1 python -m pytest -d --tx 2*popen//python=python3.7 --dist loadfile -sv ./tests/
 
     run_dataset_script_tests_pyarrow_6:
         working_directory: ~/datasets
         docker:
-            - image: cimg/python:3.6
+            - image: cimg/python:3.7
         resource_class: medium
         steps:
             - checkout
@@ -34,7 +34,7 @@ jobs:
             - run: pip install .[tests]
             - run: pip install -r additional-tests-requirements.txt --no-deps
             - run: pip install pyarrow==6.0.0
-            - run: HF_SCRIPTS_VERSION=master HF_ALLOW_CODE_EVAL=1 python -m pytest -d --tx 2*popen//python=python3.6 --dist loadfile -sv ./tests/
+            - run: HF_SCRIPTS_VERSION=master HF_ALLOW_CODE_EVAL=1 python -m pytest -d --tx 2*popen//python=python3.7 --dist loadfile -sv ./tests/
 
     run_dataset_script_tests_pyarrow_latest_WIN:
         working_directory: ~/datasets
@@ -81,7 +81,7 @@ jobs:
     check_code_quality:
         working_directory: ~/datasets
         docker:
-            - image: cimg/python:3.6
+            - image: cimg/python:3.7
         resource_class: medium
         parallelism: 1
         steps:
diff --git a/.github/hub/update_hub_repositories.py b/.github/hub/update_hub_repositories.py
index 875cbf80bd2..c923583ba7f 100644
--- a/.github/hub/update_hub_repositories.py
+++ b/.github/hub/update_hub_repositories.py
@@ -1,4 +1,3 @@
-import base64
 import distutils.dir_util
 import logging
 import os
diff --git a/Makefile b/Makefile
index e3615d44ed0..b7936753dba 100644
--- a/Makefile
+++ b/Makefile
@@ -3,14 +3,14 @@
 # Check that source code meets quality standards
 
 quality:
-	black --check --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics
+	black --check --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics
 	isort --check-only tests src benchmarks datasets/**/*.py metrics
 	flake8 tests src benchmarks datasets/**/*.py metrics
 
 # Format source code automatically
 
 style:
-	black --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics
+	black --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics
 	isort tests src benchmarks datasets/**/*.py metrics
 
 # Run tests for the library
diff --git a/additional-tests-requirements.txt b/additional-tests-requirements.txt
index a827c308c9f..00b5b8d62a3 100644
--- a/additional-tests-requirements.txt
+++ b/additional-tests-requirements.txt
@@ -1,4 +1,4 @@
-unbabel-comet>=1.0.0;python_version>'3.6'
+unbabel-comet>=1.0.0
 git+https://github.com/google-research/bleurt.git
 git+https://github.com/ns-moosavi/coval.git
 git+https://github.com/hendrycks/math.git
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 4ea7c7ad2a1..033b767c01d 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,6 +1,6 @@
 # Installation
 
-Before you start, you will need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.6+**.
+Before you start, you will need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.7+**.
 
 <Tip>
 
diff --git a/setup.py b/setup.py
index 2ffc3ffd46a..01b567b7f85 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,6 @@
    Then push the change with a message 'set dev version'
 """
 
-import os
 
 from setuptools import find_packages, setup
 
@@ -74,8 +73,6 @@
     "requests>=2.19.0",
     # progress bars in download and scripts
     "tqdm>=4.62.1",
-    # dataclasses for Python versions that don't have it
-    "dataclasses;python_version<'3.7'",
     # for fast hashing
     "xxhash",
     # for better multiprocessing
@@ -162,8 +159,6 @@
     "texttable>=1.6.3",
     "Werkzeug>=1.0.1",
     "six~=1.15.0",
-    # metadata validation
-    "importlib_resources;python_version<'3.7'",
 ]
 
 TESTS_REQUIRE.extend(VISION_REQURE)
@@ -211,6 +206,7 @@
     packages=find_packages("src"),
     package_data={"datasets": ["py.typed", "scripts/templates/*"], "datasets.utils.resources": ["*.json", "*.yaml"]},
     entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
+    python_requires=">=3.7.0",
     install_requires=REQUIRED_PKGS,
     extras_require=EXTRAS_REQUIRE,
     classifiers=[
@@ -221,7 +217,6 @@
         "License :: OSI Approved :: Apache Software License",
         "Operating System :: OS Independent",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",

From 3800c495c62fedf8aa0c882754f8504133c8cdcd Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Wed, 8 Jun 2022 13:32:48 +0200
Subject: [PATCH 03/21] Style

---
 src/datasets/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
index 3155dfdb720..e4806a9f0b2 100644
--- a/src/datasets/__init__.py
+++ b/src/datasets/__init__.py
@@ -20,9 +20,11 @@
 __version__ = "2.2.3.dev0"
 
 import platform
+
 import pyarrow
 from packaging import version
 
+
 if version.parse(platform.python_version()) < version.parse("3.7"):
     raise ImportWarning(
         "To use `datasets`, `python>=3.7` is required, and the current version of python doesn't match this condition."

From 32aab3db1948864ef5ce7a7e63b272e0f2e9701c Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Wed, 8 Jun 2022 13:49:33 +0200
Subject: [PATCH 04/21] Update audio gh action

---
 .github/workflows/test-audio.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-audio.yml b/.github/workflows/test-audio.yml
index 68e0b8f0b3b..805ca70777d 100644
--- a/.github/workflows/test-audio.yml
+++ b/.github/workflows/test-audio.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: "3.6"
+          python-version: "3.7"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip

From 2dee899a6a7ded7c40669d36b450e1603fb2a8e0 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Wed, 8 Jun 2022 14:10:07 +0200
Subject: [PATCH 05/21] Benchmarks fix attempt #1

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 01b567b7f85..931e5c8ee2f 100644
--- a/setup.py
+++ b/setup.py
@@ -102,7 +102,7 @@
 BENCHMARKS_REQUIRE = [
     "numpy==1.18.5",
     "tensorflow==2.3.0",
-    "torch==1.6.0",
+    "torch==1.7.0",
     "transformers==3.0.2",
 ]
 

From 8c6ce1920211b14a34ee1e101ad47ca69006f185 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Wed, 8 Jun 2022 15:09:17 +0200
Subject: [PATCH 06/21] Benchmarks fix attempt #2

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 931e5c8ee2f..b2a5ba43256 100644
--- a/setup.py
+++ b/setup.py
@@ -102,7 +102,7 @@
 BENCHMARKS_REQUIRE = [
     "numpy==1.18.5",
     "tensorflow==2.3.0",
-    "torch==1.7.0",
+    "torch==1.11.0",
     "transformers==3.0.2",
 ]
 

From 89b9ea64a4ba2b9bf75c35bc7acac5f9c15f5b12 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Mon, 13 Jun 2022 16:37:11 +0200
Subject: [PATCH 07/21] Use newer image

---
 .github/workflows/benchmarks.yaml | 2 +-
 setup.py                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml
index 81c524ff8c5..216337e7fa6 100644
--- a/.github/workflows/benchmarks.yaml
+++ b/.github/workflows/benchmarks.yaml
@@ -3,7 +3,7 @@ on: [push]
 jobs:
   run:
     runs-on: [ubuntu-latest]
-    container: docker://dvcorg/cml-py3:latest
+    container: docker://dvcorg/cml:latest
     steps:
       - uses: actions/checkout@v2
       - name: cml_run
diff --git a/setup.py b/setup.py
index b2a5ba43256..a960efedc86 100644
--- a/setup.py
+++ b/setup.py
@@ -102,7 +102,7 @@
 BENCHMARKS_REQUIRE = [
     "numpy==1.18.5",
     "tensorflow==2.3.0",
-    "torch==1.11.0",
+    "torch==1.7.1",
     "transformers==3.0.2",
 ]
 

From f9b24c4256992487ae9c0336850d8af435b7b499 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Mon, 13 Jun 2022 16:43:29 +0200
Subject: [PATCH 08/21] Remove backticks

---
 src/datasets/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
index e4806a9f0b2..6648525d214 100644
--- a/src/datasets/__init__.py
+++ b/src/datasets/__init__.py
@@ -27,7 +27,7 @@
 
 if version.parse(platform.python_version()) < version.parse("3.7"):
     raise ImportWarning(
-        "To use `datasets`, `python>=3.7` is required, and the current version of python doesn't match this condition."
+        "To use `datasets`, Python>=3.7 is required, and the current version of Python doesn't match this condition."
     )
 
 if version.parse(pyarrow.__version__).major < 6:

From 8a82d72d418c2347df088558e312a3c0bf05a2fe Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Mon, 13 Jun 2022 17:04:52 +0200
Subject: [PATCH 09/21] Add suggested command to benchmark action

---
 .github/workflows/benchmarks.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml
index 216337e7fa6..871ac192b5a 100644
--- a/.github/workflows/benchmarks.yaml
+++ b/.github/workflows/benchmarks.yaml
@@ -10,6 +10,9 @@ jobs:
         env:
           repo_token: ${{ secrets.GITHUB_TOKEN }}
         run: |
+          # See https://github.com/actions/checkout/issues/760
+          git config --global --add safe.directory /__w/datasets/datasets
+
           # Your ML workflow goes here
 
           pip install --upgrade pip

From 6eab0e4ef8c5873ffbbdc956ef4d7ab3ce5388f7 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Tue, 14 Jun 2022 03:14:03 +0200
Subject: [PATCH 10/21] Avoid some FutureWarnings and DeprecationWarnings

---
 src/datasets/arrow_dataset.py              | 4 ++--
 src/datasets/features/features.py          | 2 +-
 src/datasets/formatting/formatting.py      | 4 ++--
 src/datasets/formatting/jax_formatter.py   | 2 +-
 src/datasets/formatting/tf_formatter.py    | 4 +---
 src/datasets/formatting/torch_formatter.py | 2 +-
 src/datasets/utils/stratify.py             | 2 +-
 tests/features/test_array_xd.py            | 2 +-
 tests/test_builder.py                      | 4 ++--
 9 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 22150f4ce66..0d48b87cb1f 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -280,7 +280,7 @@ def _get_output_signature(
                 else:
                     np_arrays.append(np.array(array))
 
-            if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.bool:
+            if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == bool:
                 tf_dtype = tf.int64
                 np_dtype = np.int64
             elif np.issubdtype(np_arrays[0].dtype, np.number):
@@ -3663,7 +3663,7 @@ def _feature(values: Union[float, int, str, np.ndarray]) -> "tf.train.Feature":
                     return _float_feature([values.item()])
                 elif np.issubdtype(values.dtype, np.integer):
                     return _int64_feature([values.item()])
-                elif np.issubdtype(values.dtype, np.str):
+                elif np.issubdtype(values.dtype, str):
                     return _bytes_feature([values.item().encode()])
                 else:
                     raise ValueError(f"values={values} has dtype {values.dtype}, which cannot be serialized")
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index 3e8301df08b..9ddad51e2e9 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -810,7 +810,7 @@ def __getitem__(self, item: Union[int, slice, np.ndarray]) -> Union[np.ndarray,
     def take(
         self, indices: Sequence_[int], allow_fill: bool = False, fill_value: bool = None
     ) -> "PandasArrayExtensionArray":
-        indices: np.ndarray = np.asarray(indices, dtype=np.int)
+        indices: np.ndarray = np.asarray(indices, dtype=int)
         if allow_fill:
             fill_value = (
                 self.dtype.na_value if fill_value is None else np.asarray(fill_value, dtype=self.dtype.value_type)
diff --git a/src/datasets/formatting/formatting.py b/src/datasets/formatting/formatting.py
index f857a53a992..ecc5b5e6606 100644
--- a/src/datasets/formatting/formatting.py
+++ b/src/datasets/formatting/formatting.py
@@ -194,11 +194,11 @@ def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
                 array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
         if len(array) > 0:
             if any(
-                (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
+                (isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape))
                 or (isinstance(x, float) and np.isnan(x))
                 for x in array
             ):
-                return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
+                return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object})
         return np.array(array, copy=False, **self.np_array_kwargs)
 
 
diff --git a/src/datasets/formatting/jax_formatter.py b/src/datasets/formatting/jax_formatter.py
index dffe37bc5f0..0a554203be5 100644
--- a/src/datasets/formatting/jax_formatter.py
+++ b/src/datasets/formatting/jax_formatter.py
@@ -54,7 +54,7 @@ def _recursive_tensorize(self, data_struct: dict):
         # support for nested types like struct of list of struct
         if isinstance(data_struct, (list, np.ndarray)):
             data_struct = np.array(data_struct, copy=False)
-            if data_struct.dtype == np.object:  # jax arrays cannot be instantied from an array of objects
+            if data_struct.dtype == object:  # jax arrays cannot be instantied from an array of objects
                 return [self.recursive_tensorize(substruct) for substruct in data_struct]
         return self._tensorize(data_struct)
 
diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py
index 7e835280c54..d07f1f636cc 100644
--- a/src/datasets/formatting/tf_formatter.py
+++ b/src/datasets/formatting/tf_formatter.py
@@ -65,9 +65,7 @@ def _tensorize(self, value):
     def _recursive_tensorize(self, data_struct: dict):
         # support for nested types like struct of list of struct
         if isinstance(data_struct, (list, np.ndarray)):
-            if (
-                data_struct.dtype == np.object
-            ):  # tensorflow tensors can sometimes be instantied from an array of objects
+            if data_struct.dtype == object:  # tensorflow tensors can sometimes be instantied from an array of objects
                 try:
                     return self._tensorize(data_struct)
                 except ValueError:
diff --git a/src/datasets/formatting/torch_formatter.py b/src/datasets/formatting/torch_formatter.py
index 3106a024920..c5a7d3c214f 100644
--- a/src/datasets/formatting/torch_formatter.py
+++ b/src/datasets/formatting/torch_formatter.py
@@ -46,7 +46,7 @@ def _recursive_tensorize(self, data_struct: dict):
         # support for nested types like struct of list of struct
         if isinstance(data_struct, (list, np.ndarray)):
             data_struct = np.array(data_struct, copy=False)
-            if data_struct.dtype == np.object:  # pytorch tensors cannot be instantied from an array of objects
+            if data_struct.dtype == object:  # pytorch tensors cannot be instantied from an array of objects
                 return [self.recursive_tensorize(substruct) for substruct in data_struct]
         return self._tensorize(data_struct)
 
diff --git a/src/datasets/utils/stratify.py b/src/datasets/utils/stratify.py
index 58ea04f2f85..3a72f6fc3f2 100644
--- a/src/datasets/utils/stratify.py
+++ b/src/datasets/utils/stratify.py
@@ -48,7 +48,7 @@ def approximate_mode(class_counts, n_draws, rng):
             need_to_add -= add_now
             if need_to_add == 0:
                 break
-    return floored.astype(np.int)
+    return floored.astype(int)
 
 
 def stratified_shuffle_split_generate_indices(y, n_train, n_test, rng, n_splits=10):
diff --git a/tests/features/test_array_xd.py b/tests/features/test_array_xd.py
index ae007abbe00..2f6d9d94009 100644
--- a/tests/features/test_array_xd.py
+++ b/tests/features/test_array_xd.py
@@ -335,7 +335,7 @@ def test_array_xd_with_none():
     dummy_array = np.array([[1, 2], [3, 4]], dtype="int32")
     dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features)
     arr = NumpyArrowExtractor().extract_column(dataset._data)
-    assert isinstance(arr, np.ndarray) and arr.dtype == np.object and arr.shape == (3,)
+    assert isinstance(arr, np.ndarray) and arr.dtype == object and arr.shape == (3,)
     np.testing.assert_equal(arr[0], dummy_array)
     np.testing.assert_equal(arr[2], dummy_array)
     assert np.isnan(arr[1])  # a single np.nan value - np.all not needed
diff --git a/tests/test_builder.py b/tests/test_builder.py
index 853859d40ca..c7e08e0f01b 100644
--- a/tests/test_builder.py
+++ b/tests/test_builder.py
@@ -837,8 +837,8 @@ def _generate_examples(self):
     "builder_class, kwargs",
     [
         (DummyBuilderWithVersion, {}),
-        (DummyBuilderWithBuilderConfigs, {"name": "custom"}),
-        (DummyBuilderWithCustomBuilderConfigs, {"name": "20220501.en"}),
+        (DummyBuilderWithBuilderConfigs, {"config_name": "custom"}),
+        (DummyBuilderWithCustomBuilderConfigs, {"config_name": "20220501.en"}),
         (DummyBuilderWithCustomBuilderConfigs, {"date": "20220501", "language": "ca"}),
     ],
 )

From 97ee6987e9bab5149e4477d7ecbc3a16c1e21bfe Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Thu, 16 Jun 2022 19:11:04 +0200
Subject: [PATCH 11/21] Disable S3 serialization tests on Linux CircleCI

---
 tests/test_arrow_dataset.py | 6 +++---
 tests/test_dataset_dict.py  | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index d43b1509c50..68370bf5b28 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -3056,9 +3056,9 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param
 
 
 @pytest.mark.skipif(
-    os.name == "nt" and os.getenv("CIRCLECI") == "true",
-    reason='On Windows CircleCI, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
-)  # TODO(QL): find what's wrong with CircleCI
+    os.name in ["posix", "nt"] and os.getenv("CIRCLECI") == "true",
+    reason='On Linux/Windows CircleCI, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
+)  # TODO(QL, mario): find what's wrong with CircleCI
 @require_s3
 def test_dummy_dataset_serialize_s3(s3, dataset):
     mock_bucket = s3_test_bucket_name
diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py
index 814d04ee370..7e78b759030 100644
--- a/tests/test_dataset_dict.py
+++ b/tests/test_dataset_dict.py
@@ -656,9 +656,9 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path):
 
 
 @pytest.mark.skipif(
-    os.name == "nt" and os.getenv("CIRCLECI") == "true",
-    reason='On Windows CircleCI, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
-)  # TODO(QL): find what's wrong with CircleCI
+    os.name in ["posix", "nt"] and os.getenv("CIRCLECI") == "true",
+    reason='On Linux/Windows CircleCI, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
+)  # TODO(QL, mario): find what's wrong with CircleCI
 @require_s3
 def test_dummy_dataset_serialize_s3(s3, dataset):
     dsets = DatasetDict({"train": dataset, "test": dataset.select(range(2))})

From 30de7fa2d2d3cdcc01b50b0d82d948fbf4aa068e Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Wed, 20 Jul 2022 13:48:19 +0200
Subject: [PATCH 12/21] Remove 3.6 pickling test

---
 tests/test_arrow_dataset.py |  2 +-
 tests/test_dataset_dict.py  |  2 +-
 tests/test_fingerprint.py   | 15 ---------------
 3 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 055b3d9f4ff..3bb44ce930e 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -3119,7 +3119,7 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param
 
 
 @pytest.mark.skipif(
-    os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
+    os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
     reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
 )  # TODO: find what's wrong with CircleCI / GitHub Actions
 @require_s3
diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py
index 2f612dd4ddc..643edf043ae 100644
--- a/tests/test_dataset_dict.py
+++ b/tests/test_dataset_dict.py
@@ -665,7 +665,7 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path):
 
 
 @pytest.mark.skipif(
-    os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
+    os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
     reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
 )  # TODO: find what's wrong with CircleCI / GitHub Actions
 @require_s3
diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py
index 0ffe453fcc1..23ca0c4e0af 100644
--- a/tests/test_fingerprint.py
+++ b/tests/test_fingerprint.py
@@ -226,21 +226,6 @@ def globalvars_mock2_side_effect(func, *args, **kwargs):
         self.assertEqual(hash1, hash2)
 
 
-class TypeHintDumpTest(TestCase):
-    def test_dump_type_hint(self):
-        from typing import Union
-
-        t1 = Union[str, None]  # this type is not picklable in python 3.6
-        # let's check that we can pickle it anyway using our pickler, even in 3.6
-        hash1 = md5(datasets.utils.py_utils.dumps(t1)).hexdigest()
-        t2 = Union[str]  # this type is picklable in python 3.6
-        hash2 = md5(datasets.utils.py_utils.dumps(t2)).hexdigest()
-        t3 = Union[str, None]
-        hash3 = md5(datasets.utils.py_utils.dumps(t3)).hexdigest()
-        self.assertEqual(hash1, hash3)
-        self.assertNotEqual(hash1, hash2)
-
-
 class HashingTest(TestCase):
     def test_hash_simple(self):
         hash1 = Hasher.hash("hello")

From 983b04e4a946afc8ec2cc32e47ef2173d46ab7ee Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Fri, 22 Jul 2022 13:00:33 +0200
Subject: [PATCH 13/21] CI test (empty commit)


From eac1aaa3ad6feecd387c154538a46c63db5ff010 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Fri, 22 Jul 2022 13:30:04 +0200
Subject: [PATCH 14/21] Use Python 3.7 on ubuntu-latest

---
 .github/workflows/ci.yml                              | 10 ++--------
 src/datasets/packaged_modules/text/dataset_infos.json |  1 +
 2 files changed, 3 insertions(+), 8 deletions(-)
 create mode 100644 src/datasets/packaged_modules/text/dataset_infos.json

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b4f14085ee4..5513c1c6397 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.6"
+          python-version: "3.7"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -49,13 +49,7 @@ jobs:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
-      - name: Set up Python 3.6
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.6
       - name: Set up Python 3.7
-        if: ${{ matrix.os == 'windows-latest' }}
         uses: actions/setup-python@v4
         with:
           python-version: 3.7
@@ -63,7 +57,7 @@ jobs:
         run: python -m pip install --upgrade pip
       - name: Pin setuptools-scm
         if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.6" && pip install "setuptools-scm==6.4.2"
+        run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.7" && pip install "setuptools-scm==6.4.2"
       - name: Install dependencies
         run: |
           pip install .[tests]
diff --git a/src/datasets/packaged_modules/text/dataset_infos.json b/src/datasets/packaged_modules/text/dataset_infos.json
new file mode 100644
index 00000000000..c647519d3b4
--- /dev/null
+++ b/src/datasets/packaged_modules/text/dataset_infos.json
@@ -0,0 +1 @@
+{"bigscience": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "text", "config_name": "bigscience", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 938, "num_examples": 22, "dataset_name": "text"}}, "download_checksums": {"C:\\Users\\Mario\\Desktop\\bigscience\\biscience.txt": {"num_bytes": 892, "checksum": "1e1f85c9e2aefb6990dc6ec4a8805af1e5451ebecb7e9f50face10c83eed742e"}}, "download_size": 892, "post_processing_size": null, "dataset_size": 938, "size_in_bytes": 1830}}
\ No newline at end of file

From 4db3cf9330e098ed0cc80651140c21b11e79fa0f Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Fri, 22 Jul 2022 13:58:01 +0200
Subject: [PATCH 15/21] Disable s3 test on Linux

---
 tests/test_arrow_dataset.py | 2 +-
 tests/test_dataset_dict.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 3bb44ce930e..055b3d9f4ff 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -3119,7 +3119,7 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param
 
 
 @pytest.mark.skipif(
-    os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
+    os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
     reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
 )  # TODO: find what's wrong with CircleCI / GitHub Actions
 @require_s3
diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py
index 643edf043ae..2f612dd4ddc 100644
--- a/tests/test_dataset_dict.py
+++ b/tests/test_dataset_dict.py
@@ -665,7 +665,7 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path):
 
 
 @pytest.mark.skipif(
-    os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
+    os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
     reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
 )  # TODO: find what's wrong with CircleCI / GitHub Actions
 @require_s3

From 2674f0c7e9e64d1c9694f87ecd1f773cde3aced5 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Tue, 26 Jul 2022 12:47:05 +0200
Subject: [PATCH 16/21] Remove weird json file

---
 src/datasets/packaged_modules/text/dataset_infos.json | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 src/datasets/packaged_modules/text/dataset_infos.json

diff --git a/src/datasets/packaged_modules/text/dataset_infos.json b/src/datasets/packaged_modules/text/dataset_infos.json
deleted file mode 100644
index c647519d3b4..00000000000
--- a/src/datasets/packaged_modules/text/dataset_infos.json
+++ /dev/null
@@ -1 +0,0 @@
-{"bigscience": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "text", "config_name": "bigscience", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 938, "num_examples": 22, "dataset_name": "text"}}, "download_checksums": {"C:\\Users\\Mario\\Desktop\\bigscience\\biscience.txt": {"num_bytes": 892, "checksum": "1e1f85c9e2aefb6990dc6ec4a8805af1e5451ebecb7e9f50face10c83eed742e"}}, "download_size": 892, "post_processing_size": null, "dataset_size": 938, "size_in_bytes": 1830}}
\ No newline at end of file

From c3bc52d6469bb96cd90548f424b907ec7aa20c84 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Tue, 26 Jul 2022 12:50:37 +0200
Subject: [PATCH 17/21] Remove cloudpickle stuff

---
 src/datasets/utils/py_utils.py | 44 +---------------------------------
 1 file changed, 1 insertion(+), 43 deletions(-)

diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py
index 48ba2293cb0..73a22d8f20e 100644
--- a/src/datasets/utils/py_utils.py
+++ b/src/datasets/utils/py_utils.py
@@ -21,7 +21,6 @@
 import functools
 import itertools
 import os
-import pickle
 import re
 import types
 from contextlib import contextmanager
@@ -29,7 +28,7 @@
 from multiprocessing import Pool, RLock
 from shutil import disk_usage
 from types import CodeType, FunctionType
-from typing import Callable, ClassVar, Dict, Generic, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
 import dill
@@ -559,47 +558,6 @@ def proxy(func):
     return proxy
 
 
-class _CloudPickleTypeHintFix:
-    """
-    Type hints can't be properly pickled in python < 3.7
-    CloudPickle provided a way to make it work in older versions.
-    This class provide utilities to fix pickling of type hints in older versions.
-    from https://github.com/cloudpipe/cloudpickle/pull/318/files
-    """
-
-    def _is_parametrized_type_hint(obj):
-        # This is very cheap but might generate false positives.
-        origin = getattr(obj, "__origin__", None)  # typing Constructs
-        values = getattr(obj, "__values__", None)  # typing_extensions.Literal
-        type_ = getattr(obj, "__type__", None)  # typing_extensions.Final
-        return origin is not None or values is not None or type_ is not None
-
-    def _create_parametrized_type_hint(origin, args):
-        return origin[args]
-
-    def _save_parametrized_type_hint(pickler, obj):
-        # The distorted type check sematic for typing construct becomes:
-        # ``type(obj) is type(TypeHint)``, which means "obj is a
-        # parametrized TypeHint"
-        if type(obj) is type(Literal):  # pragma: no branch
-            initargs = (Literal, obj.__values__)
-        elif type(obj) is type(Final):  # pragma: no branch
-            initargs = (Final, obj.__type__)
-        elif type(obj) is type(ClassVar):
-            initargs = (ClassVar, obj.__type__)
-        elif type(obj) in [type(Union), type(Tuple), type(Generic)]:
-            initargs = (obj.__origin__, obj.__args__)
-        elif type(obj) is type(Callable):
-            args = obj.__args__
-            if args[0] is Ellipsis:
-                initargs = (obj.__origin__, args)
-            else:
-                initargs = (obj.__origin__, (list(args[:-1]), args[-1]))
-        else:  # pragma: no cover
-            raise pickle.PicklingError(f"Datasets pickle Error: Unknown type {type(obj)}")
-        pickler.save_reduce(_CloudPickleTypeHintFix._create_parametrized_type_hint, initargs, obj=obj)
-
-
 @pklregister(CodeType)
 def _save_code(pickler, obj):
     """

From ad949dd02f880e3f3fb525f09c0eaaa90e0f1aa4 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Tue, 26 Jul 2022 13:15:18 +0200
Subject: [PATCH 18/21] Use lower torchaudio version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index e2ee6cba04f..9085802d5d4 100644
--- a/setup.py
+++ b/setup.py
@@ -125,7 +125,7 @@
     "s3fs>=2021.11.1",  # aligned with fsspec[http]>=2021.11.1
     "tensorflow>=2.3,!=2.6.0,!=2.6.1",
     "torch",
-    "torchaudio",
+    "torchaudio<0.12.0",
     "soundfile",
     "transformers",
     # datasets dependencies

From feebd9048ac42e756155cf221e7364fe2dee6845 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Tue, 26 Jul 2022 14:11:49 +0200
Subject: [PATCH 19/21] Try to fix s3 errors

---
 setup.py                    | 2 +-
 tests/test_arrow_dataset.py | 8 ++++----
 tests/test_dataset_dict.py  | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/setup.py b/setup.py
index 9085802d5d4..423787c46d9 100644
--- a/setup.py
+++ b/setup.py
@@ -120,7 +120,7 @@
     "botocore>=1.22.8",  # to be compatible with aiobotocore and boto3
     "faiss-cpu>=1.6.4",
     "fsspec[s3]",
-    "moto[s3,server]==2.0.4",
+    "moto[s3,server]>=3.0.0",
     "rarfile>=4.0",
     "s3fs>=2021.11.1",  # aligned with fsspec[http]>=2021.11.1
     "tensorflow>=2.3,!=2.6.0,!=2.6.1",
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index afa17dff3c3..0013d13ba55 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -3117,10 +3117,10 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param
         assert dataset._data.table == reloaded_dataset._data.table
 
 
-@pytest.mark.skipif(
-    os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
-    reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
-)  # TODO: find what's wrong with CircleCI / GitHub Actions
+# @pytest.mark.skipif(
+#     os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
+#     reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
+# )  # TODO: find what's wrong with CircleCI / GitHub Actions
 @require_s3
 def test_dummy_dataset_serialize_s3(s3, dataset, s3_test_bucket_name):
     mock_bucket = s3_test_bucket_name
diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py
index 3593030ecbd..743bca6266c 100644
--- a/tests/test_dataset_dict.py
+++ b/tests/test_dataset_dict.py
@@ -663,10 +663,10 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path):
     assert all(dataset[split].split == split for split in path.keys())
 
 
-@pytest.mark.skipif(
-    os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
-    reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
-)  # TODO: find what's wrong with CircleCI / GitHub Actions
+# @pytest.mark.skipif(
+#     os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
+#     reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
+# )  # TODO: find what's wrong with CircleCI / GitHub Actions
 @require_s3
 def test_dummy_dataset_serialize_s3(s3, dataset, s3_test_bucket_name):
     dsets = DatasetDict({"train": dataset, "test": dataset.select(range(2))})

From 59a1e3d7f40c9f7f6156befc942b7652e3f45be6 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Tue, 26 Jul 2022 15:08:38 +0200
Subject: [PATCH 20/21] Another attempt to fix s3 errors: pin responses==0.16 and moto==2.0.4

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 423787c46d9..468bb78ccf2 100644
--- a/setup.py
+++ b/setup.py
@@ -88,7 +88,7 @@
     "huggingface-hub>=0.1.0,<1.0.0",
     # Utilities from PyPA to e.g., compare versions
     "packaging",
-    "responses<0.19",
+    "responses==0.16",
 ]
 
 AUDIO_REQUIRE = [
@@ -120,7 +120,7 @@
     "botocore>=1.22.8",  # to be compatible with aiobotocore and boto3
     "faiss-cpu>=1.6.4",
     "fsspec[s3]",
-    "moto[s3,server]>=3.0.0",
+    "moto[s3,server]==2.0.4",
     "rarfile>=4.0",
     "s3fs>=2021.11.1",  # aligned with fsspec[http]>=2021.11.1
     "tensorflow>=2.3,!=2.6.0,!=2.6.1",

From 9c2b16ce2d4072bd8c686497dd3b7c4d5add4107 Mon Sep 17 00:00:00 2001
From: mariosasko <mariosasko777@gmail.com>
Date: Tue, 26 Jul 2022 15:34:14 +0200
Subject: [PATCH 21/21] Disable s3 serialization tests on CI again and restore responses<0.19

---
 setup.py                    | 2 +-
 tests/test_arrow_dataset.py | 8 ++++----
 tests/test_dataset_dict.py  | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/setup.py b/setup.py
index 468bb78ccf2..9085802d5d4 100644
--- a/setup.py
+++ b/setup.py
@@ -88,7 +88,7 @@
     "huggingface-hub>=0.1.0,<1.0.0",
     # Utilities from PyPA to e.g., compare versions
     "packaging",
-    "responses==0.16",
+    "responses<0.19",
 ]
 
 AUDIO_REQUIRE = [
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 0013d13ba55..afa17dff3c3 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -3117,10 +3117,10 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param
         assert dataset._data.table == reloaded_dataset._data.table
 
 
-# @pytest.mark.skipif(
-#     os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
-#     reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
-# )  # TODO: find what's wrong with CircleCI / GitHub Actions
+@pytest.mark.skipif(
+    os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
+    reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
+)  # TODO: find what's wrong with CircleCI / GitHub Actions
 @require_s3
 def test_dummy_dataset_serialize_s3(s3, dataset, s3_test_bucket_name):
     mock_bucket = s3_test_bucket_name
diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py
index 743bca6266c..3593030ecbd 100644
--- a/tests/test_dataset_dict.py
+++ b/tests/test_dataset_dict.py
@@ -663,10 +663,10 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path):
     assert all(dataset[split].split == split for split in path.keys())
 
 
-# @pytest.mark.skipif(
-#     os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
-#     reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
-# )  # TODO: find what's wrong with CircleCI / GitHub Actions
+@pytest.mark.skipif(
+    os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
+    reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
+)  # TODO: find what's wrong with CircleCI / GitHub Actions
 @require_s3
 def test_dummy_dataset_serialize_s3(s3, dataset, s3_test_bucket_name):
     dsets = DatasetDict({"train": dataset, "test": dataset.select(range(2))})