From 9edc2e7429970259377f3a771d4aa99e4b65b6bf Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 30 Apr 2021 19:03:09 +0200
Subject: [PATCH 01/23] Reorder execution flow in cached_path

---
 src/datasets/utils/file_utils.py | 117 +++++++++++++++----------------
 1 file changed, 58 insertions(+), 59 deletions(-)

diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index 1c37a469659..c9e9d316cf1 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -300,67 +300,66 @@ def cached_path(
         # Something unknown
         raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
 
-    if download_config.extract_compressed_file and output_path is not None:
-
-        if (
-            not is_zipfile(output_path)
-            and not tarfile.is_tarfile(output_path)
-            and not is_gzip(output_path)
-            and not is_xz(output_path)
-            and not is_rarfile(output_path)
-        ):
-            return output_path
-
-        # Path where we extract compressed archives
-        # We extract in the cache dir, and get the extracted path name by hashing the original path"
-        abs_output_path = os.path.abspath(output_path)
-        output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path))
-
-        if (
-            os.path.isdir(output_path_extracted)
-            and os.listdir(output_path_extracted)
-            and not download_config.force_extract
-        ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract):
-            return output_path_extracted
-
-        # Prevent parallel extractions
-        lock_path = output_path + ".lock"
-        with FileLock(lock_path):
-            shutil.rmtree(output_path_extracted, ignore_errors=True)
-            os.makedirs(output_path_extracted, exist_ok=True)
-            if tarfile.is_tarfile(output_path):
-                tar_file = tarfile.open(output_path)
-                tar_file.extractall(output_path_extracted)
-                tar_file.close()
-            elif is_gzip(output_path):
-                os.rmdir(output_path_extracted)
-                with gzip.open(output_path, "rb") as gzip_file:
-                    with open(output_path_extracted, "wb") as extracted_file:
-                        shutil.copyfileobj(gzip_file, extracted_file)
-            elif is_zipfile(output_path):  # put zip file to the last, b/c it is possible wrongly detected as zip
-                with ZipFile(output_path, "r") as zip_file:
-                    zip_file.extractall(output_path_extracted)
-                    zip_file.close()
-            elif is_xz(output_path):
-                os.rmdir(output_path_extracted)
-                with lzma.open(output_path) as compressed_file:
-                    with open(output_path_extracted, "wb") as extracted_file:
-                        shutil.copyfileobj(compressed_file, extracted_file)
-            elif is_rarfile(output_path):
-                if config.RARFILE_AVAILABLE:
-                    import rarfile
-
-                    rf = rarfile.RarFile(output_path)
-                    rf.extractall(output_path_extracted)
-                    rf.close()
-                else:
-                    raise EnvironmentError("Please pip install rarfile")
-            else:
-                raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
-
+    if not download_config.extract_compressed_file or output_path is None:
+        return output_path
+
+    if (
+        not is_zipfile(output_path)
+        and not tarfile.is_tarfile(output_path)
+        and not is_gzip(output_path)
+        and not is_xz(output_path)
+        and not is_rarfile(output_path)
+    ):
+        return output_path
+
+    # Path where we extract compressed archives
+    # We extract in the cache dir, and get the extracted path name by hashing the original path"
+    abs_output_path = os.path.abspath(output_path)
+    output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path))
+
+    if (
+        os.path.isdir(output_path_extracted)
+        and os.listdir(output_path_extracted)
+        and not download_config.force_extract
+    ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract):
         return output_path_extracted
 
-    return output_path
+    # Prevent parallel extractions
+    lock_path = output_path + ".lock"
+    with FileLock(lock_path):
+        shutil.rmtree(output_path_extracted, ignore_errors=True)
+        os.makedirs(output_path_extracted, exist_ok=True)
+        if tarfile.is_tarfile(output_path):
+            tar_file = tarfile.open(output_path)
+            tar_file.extractall(output_path_extracted)
+            tar_file.close()
+        elif is_gzip(output_path):
+            os.rmdir(output_path_extracted)
+            with gzip.open(output_path, "rb") as gzip_file:
+                with open(output_path_extracted, "wb") as extracted_file:
+                    shutil.copyfileobj(gzip_file, extracted_file)
+        elif is_zipfile(output_path):  # put zip file to the last, b/c it is possible wrongly detected as zip
+            with ZipFile(output_path, "r") as zip_file:
+                zip_file.extractall(output_path_extracted)
+                zip_file.close()
+        elif is_xz(output_path):
+            os.rmdir(output_path_extracted)
+            with lzma.open(output_path) as compressed_file:
+                with open(output_path_extracted, "wb") as extracted_file:
+                    shutil.copyfileobj(compressed_file, extracted_file)
+        elif is_rarfile(output_path):
+            if config.RARFILE_AVAILABLE:
+                import rarfile
+
+                rf = rarfile.RarFile(output_path)
+                rf.extractall(output_path_extracted)
+                rf.close()
+            else:
+                raise EnvironmentError("Please pip install rarfile")
+        else:
+            raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
+
+    return output_path_extracted
 
 
 def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:

From 77b0252989f78363d13719f2662ab14b982cf42b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 30 Apr 2021 19:11:00 +0200
Subject: [PATCH 02/23] Extract function _extract from cached_path

---
 src/datasets/utils/file_utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index c9e9d316cf1..6d2d661ddd1 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -324,6 +324,12 @@ def cached_path(
     ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract):
         return output_path_extracted
 
+    _extract(output_path, output_path_extracted)
+
+    return output_path_extracted
+
+
+def _extract(output_path, output_path_extracted):
     # Prevent parallel extractions
     lock_path = output_path + ".lock"
     with FileLock(lock_path):
@@ -359,8 +365,6 @@ def cached_path(
         else:
             raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
 
-    return output_path_extracted
-
 
 def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:
     ua = "datasets/{}; python/{}".format(__version__, config.PY_VERSION)

From 66f69758212450e15c819dd4a384404bc4b20ec4 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 30 Apr 2021 19:33:49 +0200
Subject: [PATCH 03/23] Extract method for each extract type

---
 src/datasets/utils/file_utils.py | 70 +++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 23 deletions(-)

diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index 6d2d661ddd1..b043380ad89 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -335,37 +335,61 @@ def _extract(output_path, output_path_extracted):
     with FileLock(lock_path):
         shutil.rmtree(output_path_extracted, ignore_errors=True)
         os.makedirs(output_path_extracted, exist_ok=True)
-        if tarfile.is_tarfile(output_path):
-            tar_file = tarfile.open(output_path)
-            tar_file.extractall(output_path_extracted)
-            tar_file.close()
+        if is_tarfile(output_path):
+            extract_tarfile(output_path, output_path_extracted)
         elif is_gzip(output_path):
-            os.rmdir(output_path_extracted)
-            with gzip.open(output_path, "rb") as gzip_file:
-                with open(output_path_extracted, "wb") as extracted_file:
-                    shutil.copyfileobj(gzip_file, extracted_file)
+            extract_gzip(output_path, output_path_extracted)
         elif is_zipfile(output_path):  # put zip file to the last, b/c it is possible wrongly detected as zip
-            with ZipFile(output_path, "r") as zip_file:
-                zip_file.extractall(output_path_extracted)
-                zip_file.close()
+            extract_zipfile(output_path, output_path_extracted)
         elif is_xz(output_path):
-            os.rmdir(output_path_extracted)
-            with lzma.open(output_path) as compressed_file:
-                with open(output_path_extracted, "wb") as extracted_file:
-                    shutil.copyfileobj(compressed_file, extracted_file)
+            extract_xz(output_path, output_path_extracted)
         elif is_rarfile(output_path):
-            if config.RARFILE_AVAILABLE:
-                import rarfile
-
-                rf = rarfile.RarFile(output_path)
-                rf.extractall(output_path_extracted)
-                rf.close()
-            else:
-                raise EnvironmentError("Please pip install rarfile")
+            extract_rarfile(output_path, output_path_extracted)
         else:
             raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
 
 
+def extract_rarfile(output_path, output_path_extracted):
+    if config.RARFILE_AVAILABLE:
+        import rarfile
+
+        rf = rarfile.RarFile(output_path)
+        rf.extractall(output_path_extracted)
+        rf.close()
+    else:
+        raise EnvironmentError("Please pip install rarfile")
+
+
+def extract_xz(output_path, output_path_extracted):
+    os.rmdir(output_path_extracted)
+    with lzma.open(output_path) as compressed_file:
+        with open(output_path_extracted, "wb") as extracted_file:
+            shutil.copyfileobj(compressed_file, extracted_file)
+
+
+def extract_zipfile(output_path, output_path_extracted):
+    with ZipFile(output_path, "r") as zip_file:
+        zip_file.extractall(output_path_extracted)
+        zip_file.close()
+
+
+def extract_gzip(output_path, output_path_extracted):
+    os.rmdir(output_path_extracted)
+    with gzip.open(output_path, "rb") as gzip_file:
+        with open(output_path_extracted, "wb") as extracted_file:
+            shutil.copyfileobj(gzip_file, extracted_file)
+
+
+def extract_tarfile(output_path, output_path_extracted):
+    tar_file = tarfile.open(output_path)
+    tar_file.extractall(output_path_extracted)
+    tar_file.close()
+
+
+def is_tarfile(output_path):
+    return tarfile.is_tarfile(output_path)
+
+
 def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:
     ua = "datasets/{}; python/{}".format(__version__, config.PY_VERSION)
     ua += "; pyarrow/{}".format(pa.__version__)

From c1efe8846214291d1e7ff24594c5d23257bd61f0 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 30 Apr 2021 19:51:32 +0200
Subject: [PATCH 04/23] Create an Extractor class for each file type

---
 src/datasets/utils/file_utils.py | 212 ++++++++++++++++---------------
 1 file changed, 113 insertions(+), 99 deletions(-)

diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index b043380ad89..d9a1bbb8a60 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -23,7 +23,7 @@
 from pathlib import Path
 from typing import Dict, Optional, Union
 from urllib.parse import urlparse
-from zipfile import ZipFile, is_zipfile
+from zipfile import ZipFile, is_zipfile as _is_zipfile
 
 import numpy as np
 import posixpath
@@ -304,11 +304,11 @@ def cached_path(
         return output_path
 
     if (
-        not is_zipfile(output_path)
-        and not tarfile.is_tarfile(output_path)
-        and not is_gzip(output_path)
-        and not is_xz(output_path)
-        and not is_rarfile(output_path)
+        not ZipExtractor.is_zipfile(output_path)
+        and not TarExtractor.is_tarfile(output_path)
+        and not GzipExtractor.is_gzip(output_path)
+        and not XzExtractor.is_xz(output_path)
+        and not RarExtractor.is_rarfile(output_path)
     ):
         return output_path
 
@@ -324,70 +324,120 @@ def cached_path(
     ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract):
         return output_path_extracted
 
-    _extract(output_path, output_path_extracted)
+    Extractor.extract(output_path, output_path_extracted)
 
     return output_path_extracted
 
 
-def _extract(output_path, output_path_extracted):
-    # Prevent parallel extractions
-    lock_path = output_path + ".lock"
-    with FileLock(lock_path):
-        shutil.rmtree(output_path_extracted, ignore_errors=True)
-        os.makedirs(output_path_extracted, exist_ok=True)
-        if is_tarfile(output_path):
-            extract_tarfile(output_path, output_path_extracted)
-        elif is_gzip(output_path):
-            extract_gzip(output_path, output_path_extracted)
-        elif is_zipfile(output_path):  # put zip file to the last, b/c it is possible wrongly detected as zip
-            extract_zipfile(output_path, output_path_extracted)
-        elif is_xz(output_path):
-            extract_xz(output_path, output_path_extracted)
-        elif is_rarfile(output_path):
-            extract_rarfile(output_path, output_path_extracted)
+class Extractor:
+    @staticmethod
+    def extract(output_path, output_path_extracted):
+        # Prevent parallel extractions
+        lock_path = output_path + ".lock"
+        with FileLock(lock_path):
+            shutil.rmtree(output_path_extracted, ignore_errors=True)
+            os.makedirs(output_path_extracted, exist_ok=True)
+            if TarExtractor.is_tarfile(output_path):
+                TarExtractor.extract_tarfile(output_path, output_path_extracted)
+            elif GzipExtractor.is_gzip(output_path):
+                GzipExtractor.extract_gzip(output_path, output_path_extracted)
+            elif ZipExtractor.is_zipfile(output_path):  # put zip file to the last, b/c it is possible wrongly detected as zip
+                ZipExtractor.extract_zipfile(output_path, output_path_extracted)
+            elif XzExtractor.is_xz(output_path):
+                XzExtractor.extract_xz(output_path, output_path_extracted)
+            elif RarExtractor.is_rarfile(output_path):
+                RarExtractor.extract_rarfile(output_path, output_path_extracted)
+            else:
+                raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
+
+
+class TarExtractor:
+    @staticmethod
+    def is_tarfile(path):
+        return tarfile.is_tarfile(path)
+    @staticmethod
+    def extract_tarfile(output_path, output_path_extracted):
+        tar_file = tarfile.open(output_path)
+        tar_file.extractall(output_path_extracted)
+        tar_file.close()
+
+
+class GzipExtractor:
+    @staticmethod
+    def is_gzip(path: str) -> bool:
+        """from https://stackoverflow.com/a/60634210"""
+        with gzip.open(path, "r") as fh:
+            try:
+                fh.read(1)
+                return True
+            except OSError:
+                return False
+    @staticmethod
+    def extract_gzip(output_path, output_path_extracted):
+        os.rmdir(output_path_extracted)
+        with gzip.open(output_path, "rb") as gzip_file:
+            with open(output_path_extracted, "wb") as extracted_file:
+                shutil.copyfileobj(gzip_file, extracted_file)
+
+
+class ZipExtractor:
+    @staticmethod
+    def is_zipfile(path):
+        return _is_zipfile(path)
+
+    @staticmethod
+    def extract_zipfile(output_path, output_path_extracted):
+        with ZipFile(output_path, "r") as zip_file:
+            zip_file.extractall(output_path_extracted)
+            zip_file.close()
+
+
+class XzExtractor:
+    @staticmethod
+    def is_xz(path: str) -> bool:
+        """https://tukaani.org/xz/xz-file-format-1.0.4.txt"""
+        with open(path, "rb") as f:
+            try:
+                header_magic_bytes = f.read(6)
+            except OSError:
+                return False
+            if header_magic_bytes == b"\xfd7zXZ\x00":
+                return True
+            else:
+                return False
+
+    @staticmethod
+    def extract_xz(output_path, output_path_extracted):
+        os.rmdir(output_path_extracted)
+        with lzma.open(output_path) as compressed_file:
+            with open(output_path_extracted, "wb") as extracted_file:
+                shutil.copyfileobj(compressed_file, extracted_file)
+
+
+class RarExtractor:
+    @staticmethod
+    def is_rarfile(path: str) -> bool:
+        """https://github.com/markokr/rarfile/blob/master/rarfile.py"""
+        RAR_ID = b"Rar!\x1a\x07\x00"
+        RAR5_ID = b"Rar!\x1a\x07\x01\x00"
+
+        with open(path, "rb", 1024) as fd:
+            buf = fd.read(len(RAR5_ID))
+        if buf.startswith(RAR_ID) or buf.startswith(RAR5_ID):
+            return True
         else:
-            raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
-
-
-def extract_rarfile(output_path, output_path_extracted):
-    if config.RARFILE_AVAILABLE:
-        import rarfile
-
-        rf = rarfile.RarFile(output_path)
-        rf.extractall(output_path_extracted)
-        rf.close()
-    else:
-        raise EnvironmentError("Please pip install rarfile")
-
-
-def extract_xz(output_path, output_path_extracted):
-    os.rmdir(output_path_extracted)
-    with lzma.open(output_path) as compressed_file:
-        with open(output_path_extracted, "wb") as extracted_file:
-            shutil.copyfileobj(compressed_file, extracted_file)
-
-
-def extract_zipfile(output_path, output_path_extracted):
-    with ZipFile(output_path, "r") as zip_file:
-        zip_file.extractall(output_path_extracted)
-        zip_file.close()
-
-
-def extract_gzip(output_path, output_path_extracted):
-    os.rmdir(output_path_extracted)
-    with gzip.open(output_path, "rb") as gzip_file:
-        with open(output_path_extracted, "wb") as extracted_file:
-            shutil.copyfileobj(gzip_file, extracted_file)
-
-
-def extract_tarfile(output_path, output_path_extracted):
-    tar_file = tarfile.open(output_path)
-    tar_file.extractall(output_path_extracted)
-    tar_file.close()
+            return False
 
+    @staticmethod
+    def extract_rarfile(output_path, output_path_extracted):
+        if config.RARFILE_AVAILABLE:
+            import rarfile
 
-def is_tarfile(output_path):
-    return tarfile.is_tarfile(output_path)
+            rf = rarfile.RarFile(output_path)
+            rf.extractall(output_path_extracted)
+            rf.close()
+        else:
+            raise EnvironmentError("Please pip install rarfile")
 
 
 def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:
@@ -709,42 +759,6 @@ def _resumable_file_manager():
     return cache_path
 
 
-def is_gzip(path: str) -> bool:
-    """from https://stackoverflow.com/a/60634210"""
-    with gzip.open(path, "r") as fh:
-        try:
-            fh.read(1)
-            return True
-        except OSError:
-            return False
-
-
-def is_xz(path: str) -> bool:
-    """https://tukaani.org/xz/xz-file-format-1.0.4.txt"""
-    with open(path, "rb") as f:
-        try:
-            header_magic_bytes = f.read(6)
-        except OSError:
-            return False
-        if header_magic_bytes == b"\xfd7zXZ\x00":
-            return True
-        else:
-            return False
-
-
-def is_rarfile(path: str) -> bool:
-    """https://github.com/markokr/rarfile/blob/master/rarfile.py"""
-    RAR_ID = b"Rar!\x1a\x07\x00"
-    RAR5_ID = b"Rar!\x1a\x07\x01\x00"
-
-    with open(path, "rb", 1024) as fd:
-        buf = fd.read(len(RAR5_ID))
-    if buf.startswith(RAR_ID) or buf.startswith(RAR5_ID):
-        return True
-    else:
-        return False
-
-
 def add_start_docstrings(*docstr):
     def docstring_decorator(fn):
         fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")

From 652f47bfaff580d2b3efd9531ef25309e97ef750 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 30 Apr 2021 20:01:30 +0200
Subject: [PATCH 05/23] Rename extract method and input/output path params

---
 src/datasets/utils/file_utils.py | 53 +++++++++++++++++---------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index d9a1bbb8a60..0252f16f9de 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -23,7 +23,8 @@
 from pathlib import Path
 from typing import Dict, Optional, Union
 from urllib.parse import urlparse
-from zipfile import ZipFile, is_zipfile as _is_zipfile
+from zipfile import ZipFile
+from zipfile import is_zipfile as _is_zipfile
 
 import numpy as np
 import posixpath
@@ -338,15 +339,17 @@ def extract(output_path, output_path_extracted):
             shutil.rmtree(output_path_extracted, ignore_errors=True)
             os.makedirs(output_path_extracted, exist_ok=True)
             if TarExtractor.is_tarfile(output_path):
-                TarExtractor.extract_tarfile(output_path, output_path_extracted)
+                TarExtractor.extract(output_path, output_path_extracted)
             elif GzipExtractor.is_gzip(output_path):
-                GzipExtractor.extract_gzip(output_path, output_path_extracted)
-            elif ZipExtractor.is_zipfile(output_path):  # put zip file to the last, b/c it is possible wrongly detected as zip
-                ZipExtractor.extract_zipfile(output_path, output_path_extracted)
+                GzipExtractor.extract(output_path, output_path_extracted)
+            elif ZipExtractor.is_zipfile(
+                output_path
+            ):  # put zip file to the last, b/c it is possible wrongly detected as zip
+                ZipExtractor.extract(output_path, output_path_extracted)
             elif XzExtractor.is_xz(output_path):
-                XzExtractor.extract_xz(output_path, output_path_extracted)
+                XzExtractor.extract(output_path, output_path_extracted)
             elif RarExtractor.is_rarfile(output_path):
-                RarExtractor.extract_rarfile(output_path, output_path_extracted)
+                RarExtractor.extract(output_path, output_path_extracted)
             else:
                 raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
 
@@ -355,10 +358,11 @@ class TarExtractor:
     @staticmethod
     def is_tarfile(path):
         return tarfile.is_tarfile(path)
+
     @staticmethod
-    def extract_tarfile(output_path, output_path_extracted):
-        tar_file = tarfile.open(output_path)
-        tar_file.extractall(output_path_extracted)
+    def extract(input_path, output_path):
+        tar_file = tarfile.open(input_path)
+        tar_file.extractall(output_path)
         tar_file.close()
 
 
@@ -372,11 +376,12 @@ def is_gzip(path: str) -> bool:
                 return True
             except OSError:
                 return False
+
     @staticmethod
-    def extract_gzip(output_path, output_path_extracted):
-        os.rmdir(output_path_extracted)
-        with gzip.open(output_path, "rb") as gzip_file:
-            with open(output_path_extracted, "wb") as extracted_file:
+    def extract(input_path, output_path):
+        os.rmdir(output_path)
+        with gzip.open(input_path, "rb") as gzip_file:
+            with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(gzip_file, extracted_file)
 
 
@@ -386,9 +391,9 @@ def is_zipfile(path):
         return _is_zipfile(path)
 
     @staticmethod
-    def extract_zipfile(output_path, output_path_extracted):
-        with ZipFile(output_path, "r") as zip_file:
-            zip_file.extractall(output_path_extracted)
+    def extract(input_path, output_path):
+        with ZipFile(input_path, "r") as zip_file:
+            zip_file.extractall(output_path)
             zip_file.close()
 
 
@@ -407,10 +412,10 @@ def is_xz(path: str) -> bool:
                 return False
 
     @staticmethod
-    def extract_xz(output_path, output_path_extracted):
-        os.rmdir(output_path_extracted)
-        with lzma.open(output_path) as compressed_file:
-            with open(output_path_extracted, "wb") as extracted_file:
+    def extract(input_path, output_path):
+        os.rmdir(output_path)
+        with lzma.open(input_path) as compressed_file:
+            with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(compressed_file, extracted_file)
 
 
@@ -429,12 +434,12 @@ def is_rarfile(path: str) -> bool:
             return False
 
     @staticmethod
-    def extract_rarfile(output_path, output_path_extracted):
+    def extract(input_path, output_path):
         if config.RARFILE_AVAILABLE:
             import rarfile
 
-            rf = rarfile.RarFile(output_path)
-            rf.extractall(output_path_extracted)
+            rf = rarfile.RarFile(input_path)
+            rf.extractall(output_path)
             rf.close()
         else:
             raise EnvironmentError("Please pip install rarfile")

From 1e3e6e2b029bd6f7fff5c3288ff47a7d068813bf Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 30 Apr 2021 20:09:00 +0200
Subject: [PATCH 06/23] Rename is_extractable method

---
 src/datasets/utils/file_utils.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index 0252f16f9de..765bab2935a 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -305,11 +305,11 @@ def cached_path(
         return output_path
 
     if (
-        not ZipExtractor.is_zipfile(output_path)
-        and not TarExtractor.is_tarfile(output_path)
-        and not GzipExtractor.is_gzip(output_path)
-        and not XzExtractor.is_xz(output_path)
-        and not RarExtractor.is_rarfile(output_path)
+        not ZipExtractor.is_extractable(output_path)
+        and not TarExtractor.is_extractable(output_path)
+        and not GzipExtractor.is_extractable(output_path)
+        and not XzExtractor.is_extractable(output_path)
+        and not RarExtractor.is_extractable(output_path)
     ):
         return output_path
 
@@ -338,17 +338,17 @@ def extract(output_path, output_path_extracted):
         with FileLock(lock_path):
             shutil.rmtree(output_path_extracted, ignore_errors=True)
             os.makedirs(output_path_extracted, exist_ok=True)
-            if TarExtractor.is_tarfile(output_path):
+            if TarExtractor.is_extractable(output_path):
                 TarExtractor.extract(output_path, output_path_extracted)
-            elif GzipExtractor.is_gzip(output_path):
+            elif GzipExtractor.is_extractable(output_path):
                 GzipExtractor.extract(output_path, output_path_extracted)
-            elif ZipExtractor.is_zipfile(
+            elif ZipExtractor.is_extractable(
                 output_path
             ):  # put zip file to the last, b/c it is possible wrongly detected as zip
                 ZipExtractor.extract(output_path, output_path_extracted)
-            elif XzExtractor.is_xz(output_path):
+            elif XzExtractor.is_extractable(output_path):
                 XzExtractor.extract(output_path, output_path_extracted)
-            elif RarExtractor.is_rarfile(output_path):
+            elif RarExtractor.is_extractable(output_path):
                 RarExtractor.extract(output_path, output_path_extracted)
             else:
                 raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
@@ -356,7 +356,7 @@ def extract(output_path, output_path_extracted):
 
 class TarExtractor:
     @staticmethod
-    def is_tarfile(path):
+    def is_extractable(path):
         return tarfile.is_tarfile(path)
 
     @staticmethod
@@ -368,7 +368,7 @@ def extract(input_path, output_path):
 
 class GzipExtractor:
     @staticmethod
-    def is_gzip(path: str) -> bool:
+    def is_extractable(path: str) -> bool:
         """from https://stackoverflow.com/a/60634210"""
         with gzip.open(path, "r") as fh:
             try:
@@ -387,7 +387,7 @@ def extract(input_path, output_path):
 
 class ZipExtractor:
     @staticmethod
-    def is_zipfile(path):
+    def is_extractable(path):
         return _is_zipfile(path)
 
     @staticmethod
@@ -399,7 +399,7 @@ def extract(input_path, output_path):
 
 class XzExtractor:
     @staticmethod
-    def is_xz(path: str) -> bool:
+    def is_extractable(path: str) -> bool:
         """https://tukaani.org/xz/xz-file-format-1.0.4.txt"""
         with open(path, "rb") as f:
             try:
@@ -421,7 +421,7 @@ def extract(input_path, output_path):
 
 class RarExtractor:
     @staticmethod
-    def is_rarfile(path: str) -> bool:
+    def is_extractable(path: str) -> bool:
         """https://github.com/markokr/rarfile/blob/master/rarfile.py"""
         RAR_ID = b"Rar!\x1a\x07\x00"
         RAR5_ID = b"Rar!\x1a\x07\x01\x00"

From 56dcea7391d5d8e9224e08003e9c0dd6c3c657b2 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 30 Apr 2021 20:41:35 +0200
Subject: [PATCH 07/23] Create generic Extractor.is_extractable

---
 src/datasets/utils/file_utils.py | 48 +++++++++++++++++---------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index 765bab2935a..7c554813159 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -304,13 +304,7 @@ def cached_path(
     if not download_config.extract_compressed_file or output_path is None:
         return output_path
 
-    if (
-        not ZipExtractor.is_extractable(output_path)
-        and not TarExtractor.is_extractable(output_path)
-        and not GzipExtractor.is_extractable(output_path)
-        and not XzExtractor.is_extractable(output_path)
-        and not RarExtractor.is_extractable(output_path)
-    ):
+    if not Extractor.is_extractable(output_path):
         return output_path
 
     # Path where we extract compressed archives
@@ -332,26 +326,36 @@ def cached_path(
 
 class Extractor:
     @staticmethod
-    def extract(output_path, output_path_extracted):
+    def is_extractable(path):
+        return (
+            ZipExtractor.is_extractable(path)
+            or TarExtractor.is_extractable(path)
+            or GzipExtractor.is_extractable(path)
+            or XzExtractor.is_extractable(path)
+            or RarExtractor.is_extractable(path)
+        )
+
+    @staticmethod
+    def extract(input_path, output_path):
         # Prevent parallel extractions
-        lock_path = output_path + ".lock"
+        lock_path = input_path + ".lock"
         with FileLock(lock_path):
-            shutil.rmtree(output_path_extracted, ignore_errors=True)
-            os.makedirs(output_path_extracted, exist_ok=True)
-            if TarExtractor.is_extractable(output_path):
-                TarExtractor.extract(output_path, output_path_extracted)
-            elif GzipExtractor.is_extractable(output_path):
-                GzipExtractor.extract(output_path, output_path_extracted)
+            shutil.rmtree(output_path, ignore_errors=True)
+            os.makedirs(output_path, exist_ok=True)
+            if TarExtractor.is_extractable(input_path):
+                TarExtractor.extract(input_path, output_path)
+            elif GzipExtractor.is_extractable(input_path):
+                GzipExtractor.extract(input_path, output_path)
             elif ZipExtractor.is_extractable(
-                output_path
+                input_path
             ):  # put zip file to the last, b/c it is possible wrongly detected as zip
-                ZipExtractor.extract(output_path, output_path_extracted)
-            elif XzExtractor.is_extractable(output_path):
-                XzExtractor.extract(output_path, output_path_extracted)
-            elif RarExtractor.is_extractable(output_path):
-                RarExtractor.extract(output_path, output_path_extracted)
+                ZipExtractor.extract(input_path, output_path)
+            elif XzExtractor.is_extractable(input_path):
+                XzExtractor.extract(input_path, output_path)
+            elif RarExtractor.is_extractable(input_path):
+                RarExtractor.extract(input_path, output_path)
             else:
-                raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
+                raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
 
 
 class TarExtractor:

From 75e27ee919d6124cdfd6437ac7fc355bc5bb91b2 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 3 May 2021 10:59:35 +0200
Subject: [PATCH 08/23] Create class attribute extractors

---
 src/datasets/utils/file_utils.py | 57 +++++++++++++-------------------
 1 file changed, 23 insertions(+), 34 deletions(-)

diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index 7c554813159..b8a9005e88b 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -324,40 +324,6 @@ def cached_path(
     return output_path_extracted
 
 
-class Extractor:
-    @staticmethod
-    def is_extractable(path):
-        return (
-            ZipExtractor.is_extractable(path)
-            or TarExtractor.is_extractable(path)
-            or GzipExtractor.is_extractable(path)
-            or XzExtractor.is_extractable(path)
-            or RarExtractor.is_extractable(path)
-        )
-
-    @staticmethod
-    def extract(input_path, output_path):
-        # Prevent parallel extractions
-        lock_path = input_path + ".lock"
-        with FileLock(lock_path):
-            shutil.rmtree(output_path, ignore_errors=True)
-            os.makedirs(output_path, exist_ok=True)
-            if TarExtractor.is_extractable(input_path):
-                TarExtractor.extract(input_path, output_path)
-            elif GzipExtractor.is_extractable(input_path):
-                GzipExtractor.extract(input_path, output_path)
-            elif ZipExtractor.is_extractable(
-                input_path
-            ):  # put zip file to the last, b/c it is possible wrongly detected as zip
-                ZipExtractor.extract(input_path, output_path)
-            elif XzExtractor.is_extractable(input_path):
-                XzExtractor.extract(input_path, output_path)
-            elif RarExtractor.is_extractable(input_path):
-                RarExtractor.extract(input_path, output_path)
-            else:
-                raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
-
-
 class TarExtractor:
     @staticmethod
     def is_extractable(path):
@@ -449,6 +415,29 @@ def extract(input_path, output_path):
             raise EnvironmentError("Please pip install rarfile")
 
 
+class Extractor:
+    #  Put zip file to the last, b/c it is possible wrongly detected as zip (I guess it means: as tar or gzip)
+    extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor]
+
+    @classmethod
+    def is_extractable(cls, path):
+        return any(extractor.is_extractable() for extractor in cls.extractors)
+
+    @classmethod
+    def extract(cls, input_path, output_path):
+        if not cls.is_extractable(input_path):
+            raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
+        # Prevent parallel extractions
+        lock_path = input_path + ".lock"
+        with FileLock(lock_path):
+            shutil.rmtree(output_path, ignore_errors=True)
+            os.makedirs(output_path, exist_ok=True)
+            for extractor in cls.extractors:
+                if extractor.is_extractable(input_path):
+                    extractor.extract(input_path, output_path)
+                    break
+
+
 def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:
     ua = "datasets/{}; python/{}".format(__version__, config.PY_VERSION)
     ua += "; pyarrow/{}".format(pa.__version__)

From 7b9359f3eb9ee4304e4a1ada36b4ff8f8ce72628 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 3 May 2021 11:07:57 +0200
Subject: [PATCH 09/23] Move extract functionality to extract module

---
 src/datasets/utils/extract.py    | 124 +++++++++++++++++++++++++++++++
 src/datasets/utils/file_utils.py | 120 +-----------------------------
 2 files changed, 125 insertions(+), 119 deletions(-)
 create mode 100644 src/datasets/utils/extract.py

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
new file mode 100644
index 00000000000..0e03caf9912
--- /dev/null
+++ b/src/datasets/utils/extract.py
@@ -0,0 +1,124 @@
+import gzip
+import lzma
+import os
+import shutil
+import tarfile
+from zipfile import ZipFile
+from zipfile import is_zipfile as _is_zipfile
+
+from datasets import config
+from datasets.utils.filelock import FileLock
+
+
+class TarExtractor:
+    @staticmethod
+    def is_extractable(path):
+        return tarfile.is_tarfile(path)
+
+    @staticmethod
+    def extract(input_path, output_path):
+        tar_file = tarfile.open(input_path)
+        tar_file.extractall(output_path)
+        tar_file.close()
+
+
+class GzipExtractor:
+    @staticmethod
+    def is_extractable(path: str) -> bool:
+        """from https://stackoverflow.com/a/60634210"""
+        with gzip.open(path, "r") as fh:
+            try:
+                fh.read(1)
+                return True
+            except OSError:
+                return False
+
+    @staticmethod
+    def extract(input_path, output_path):
+        os.rmdir(output_path)
+        with gzip.open(input_path, "rb") as gzip_file:
+            with open(output_path, "wb") as extracted_file:
+                shutil.copyfileobj(gzip_file, extracted_file)
+
+
+class ZipExtractor:
+    @staticmethod
+    def is_extractable(path):
+        return _is_zipfile(path)
+
+    @staticmethod
+    def extract(input_path, output_path):
+        with ZipFile(input_path, "r") as zip_file:
+            zip_file.extractall(output_path)
+            zip_file.close()
+
+
+class XzExtractor:
+    @staticmethod
+    def is_extractable(path: str) -> bool:
+        """https://tukaani.org/xz/xz-file-format-1.0.4.txt"""
+        with open(path, "rb") as f:
+            try:
+                header_magic_bytes = f.read(6)
+            except OSError:
+                return False
+            if header_magic_bytes == b"\xfd7zXZ\x00":
+                return True
+            else:
+                return False
+
+    @staticmethod
+    def extract(input_path, output_path):
+        os.rmdir(output_path)
+        with lzma.open(input_path) as compressed_file:
+            with open(output_path, "wb") as extracted_file:
+                shutil.copyfileobj(compressed_file, extracted_file)
+
+
+class RarExtractor:
+    @staticmethod
+    def is_extractable(path: str) -> bool:
+        """https://github.com/markokr/rarfile/blob/master/rarfile.py"""
+        RAR_ID = b"Rar!\x1a\x07\x00"
+        RAR5_ID = b"Rar!\x1a\x07\x01\x00"
+
+        with open(path, "rb", 1024) as fd:
+            buf = fd.read(len(RAR5_ID))
+        if buf.startswith(RAR_ID) or buf.startswith(RAR5_ID):
+            return True
+        else:
+            return False
+
+    @staticmethod
+    def extract(input_path, output_path):
+        if config.RARFILE_AVAILABLE:
+            import rarfile
+
+            rf = rarfile.RarFile(input_path)
+            rf.extractall(output_path)
+            rf.close()
+        else:
+            raise EnvironmentError("Please pip install rarfile")
+
+
+class Extractor:
+    #  Put zip file to the last, b/c it is possible wrongly detected as zip (I guess it means: as tar or gzip)
+    extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor]
+
+    @classmethod
+    def is_extractable(cls, path):
+        return any(extractor.is_extractable() for extractor in cls.extractors)
+
+    @classmethod
+    def extract(cls, input_path, output_path):
+        if not cls.is_extractable(input_path):
+            raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
+        # Prevent parallel extractions
+        lock_path = input_path + ".lock"
+        with FileLock(lock_path):
+            shutil.rmtree(output_path, ignore_errors=True)
+            os.makedirs(output_path, exist_ok=True)
+            for extractor in cls.extractors:
+                if extractor.is_extractable(input_path):
+                    extractor.extract(input_path, output_path)
+                    break
diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index b8a9005e88b..83399c10cec 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -5,14 +5,11 @@
 """
 
 import copy
-import gzip
 import json
-import lzma
 import os
 import re
 import shutil
 import sys
-import tarfile
 import tempfile
 import time
 import urllib
@@ -23,8 +20,6 @@
 from pathlib import Path
 from typing import Dict, Optional, Union
 from urllib.parse import urlparse
-from zipfile import ZipFile
-from zipfile import is_zipfile as _is_zipfile
 
 import numpy as np
 import posixpath
@@ -33,6 +28,7 @@
 from tqdm.auto import tqdm
 
 from .. import __version__, config
+from .extract import Extractor
 from .filelock import FileLock
 from .logging import WARNING, get_logger
 
@@ -324,120 +320,6 @@ def cached_path(
     return output_path_extracted
 
 
-class TarExtractor:
-    @staticmethod
-    def is_extractable(path):
-        return tarfile.is_tarfile(path)
-
-    @staticmethod
-    def extract(input_path, output_path):
-        tar_file = tarfile.open(input_path)
-        tar_file.extractall(output_path)
-        tar_file.close()
-
-
-class GzipExtractor:
-    @staticmethod
-    def is_extractable(path: str) -> bool:
-        """from https://stackoverflow.com/a/60634210"""
-        with gzip.open(path, "r") as fh:
-            try:
-                fh.read(1)
-                return True
-            except OSError:
-                return False
-
-    @staticmethod
-    def extract(input_path, output_path):
-        os.rmdir(output_path)
-        with gzip.open(input_path, "rb") as gzip_file:
-            with open(output_path, "wb") as extracted_file:
-                shutil.copyfileobj(gzip_file, extracted_file)
-
-
-class ZipExtractor:
-    @staticmethod
-    def is_extractable(path):
-        return _is_zipfile(path)
-
-    @staticmethod
-    def extract(input_path, output_path):
-        with ZipFile(input_path, "r") as zip_file:
-            zip_file.extractall(output_path)
-            zip_file.close()
-
-
-class XzExtractor:
-    @staticmethod
-    def is_extractable(path: str) -> bool:
-        """https://tukaani.org/xz/xz-file-format-1.0.4.txt"""
-        with open(path, "rb") as f:
-            try:
-                header_magic_bytes = f.read(6)
-            except OSError:
-                return False
-            if header_magic_bytes == b"\xfd7zXZ\x00":
-                return True
-            else:
-                return False
-
-    @staticmethod
-    def extract(input_path, output_path):
-        os.rmdir(output_path)
-        with lzma.open(input_path) as compressed_file:
-            with open(output_path, "wb") as extracted_file:
-                shutil.copyfileobj(compressed_file, extracted_file)
-
-
-class RarExtractor:
-    @staticmethod
-    def is_extractable(path: str) -> bool:
-        """https://github.com/markokr/rarfile/blob/master/rarfile.py"""
-        RAR_ID = b"Rar!\x1a\x07\x00"
-        RAR5_ID = b"Rar!\x1a\x07\x01\x00"
-
-        with open(path, "rb", 1024) as fd:
-            buf = fd.read(len(RAR5_ID))
-        if buf.startswith(RAR_ID) or buf.startswith(RAR5_ID):
-            return True
-        else:
-            return False
-
-    @staticmethod
-    def extract(input_path, output_path):
-        if config.RARFILE_AVAILABLE:
-            import rarfile
-
-            rf = rarfile.RarFile(input_path)
-            rf.extractall(output_path)
-            rf.close()
-        else:
-            raise EnvironmentError("Please pip install rarfile")
-
-
-class Extractor:
-    #  Put zip file to the last, b/c it is possible wrongly detected as zip (I guess it means: as tar or gzip)
-    extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor]
-
-    @classmethod
-    def is_extractable(cls, path):
-        return any(extractor.is_extractable() for extractor in cls.extractors)
-
-    @classmethod
-    def extract(cls, input_path, output_path):
-        if not cls.is_extractable(input_path):
-            raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
-        # Prevent parallel extractions
-        lock_path = input_path + ".lock"
-        with FileLock(lock_path):
-            shutil.rmtree(output_path, ignore_errors=True)
-            os.makedirs(output_path, exist_ok=True)
-            for extractor in cls.extractors:
-                if extractor.is_extractable(input_path):
-                    extractor.extract(input_path, output_path)
-                    break
-
-
 def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:
     ua = "datasets/{}; python/{}".format(__version__, config.PY_VERSION)
     ua += "; pyarrow/{}".format(pa.__version__)

From 4f1bf02042f803e76d7513c4b19e0388d970e198 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 3 May 2021 11:25:45 +0200
Subject: [PATCH 10/23] Fix Extractor.is_extractable

---
 src/datasets/utils/extract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index 0e03caf9912..6f41791681f 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -107,7 +107,7 @@ class Extractor:
 
     @classmethod
     def is_extractable(cls, path):
-        return any(extractor.is_extractable() for extractor in cls.extractors)
+        return any(extractor.is_extractable(path) for extractor in cls.extractors)
 
     @classmethod
     def extract(cls, input_path, output_path):

From a36663b7f1e683e2045d03f25c9cbfd3186fa0ad Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 3 May 2021 12:54:48 +0200
Subject: [PATCH 11/23] Create ExtractManager with all extract logic

---
 src/datasets/utils/extract.py    | 34 ++++++++++++++++++++++++++++++++
 src/datasets/utils/file_utils.py | 26 +++++++-----------------
 2 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index 6f41791681f..d64edbb2429 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -3,13 +3,47 @@
 import os
 import shutil
 import tarfile
+from dataclasses import dataclass
 from zipfile import ZipFile
 from zipfile import is_zipfile as _is_zipfile
 
 from datasets import config
+from datasets.utils.file_utils import hash_url_to_filename
 from datasets.utils.filelock import FileLock
 
 
+@dataclass
+class ExtractConfig:
+    extract_compressed_file: bool = False
+    force_extract: bool = False
+
+
+class ExtractManager:
+    def __init__(self, cache_dir=None):
+        self.cache_dir = cache_dir
+        self.extractor = Extractor
+
+    def _get_outout_path(self, path):
+        # Path where we extract compressed archives
+        # We extract in the cache dir, and get the extracted path name by hashing the original path"
+        abs_path = os.path.abspath(path)
+        return os.path.join(self.cache_dir, "extracted", hash_url_to_filename(abs_path))
+
+    def _do_extract(self, output_path, force_extract):
+        return force_extract or (
+            not os.path.isfile(output_path)
+            and not (os.path.isdir(output_path) and os.listdir(output_path))
+        )
+
+    def extract(self, input_path, force_extract=False):
+        output_path = input_path
+        if self.extractor.is_extractable(input_path):
+            output_path = self._get_outout_path(input_path)
+            if self._do_extract(output_path, force_extract):
+                self.extractor.extract(input_path, output_path)
+        return output_path
+
+
 class TarExtractor:
     @staticmethod
     def is_extractable(path):
diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index 83399c10cec..78fc00203f5 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -28,7 +28,7 @@
 from tqdm.auto import tqdm
 
 from .. import __version__, config
-from .extract import Extractor
+from .extract import ExtractManager
 from .filelock import FileLock
 from .logging import WARNING, get_logger
 
@@ -297,27 +297,15 @@ def cached_path(
         # Something unknown
         raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
 
-    if not download_config.extract_compressed_file or output_path is None:
+    if output_path is None:
         return output_path
 
-    if not Extractor.is_extractable(output_path):
-        return output_path
-
-    # Path where we extract compressed archives
-    # We extract in the cache dir, and get the extracted path name by hashing the original path"
-    abs_output_path = os.path.abspath(output_path)
-    output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path))
-
-    if (
-        os.path.isdir(output_path_extracted)
-        and os.listdir(output_path_extracted)
-        and not download_config.force_extract
-    ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract):
-        return output_path_extracted
-
-    Extractor.extract(output_path, output_path_extracted)
+    if download_config.extract_compressed_file:
+        output_path = ExtractManager(cache_dir=cache_dir).extract(
+            output_path, force_extract=download_config.force_extract
+        )
 
-    return output_path_extracted
+    return output_path
 
 
 def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:

From 104f0f75e20c0c99bacf88b6b1b5dfa25848d6f6 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 3 May 2021 13:02:10 +0200
Subject: [PATCH 12/23] Fix circular import

---
 src/datasets/utils/extract.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index d64edbb2429..252243ad333 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -8,7 +8,6 @@
 from zipfile import is_zipfile as _is_zipfile
 
 from datasets import config
-from datasets.utils.file_utils import hash_url_to_filename
 from datasets.utils.filelock import FileLock
 
 
@@ -24,6 +23,8 @@ def __init__(self, cache_dir=None):
         self.extractor = Extractor
 
     def _get_outout_path(self, path):
+        from datasets.utils.file_utils import hash_url_to_filename
+
         # Path where we extract compressed archives
         # We extract in the cache dir, and get the extracted path name by hashing the original path"
         abs_path = os.path.abspath(path)
@@ -31,8 +32,7 @@ def _get_outout_path(self, path):
 
     def _do_extract(self, output_path, force_extract):
         return force_extract or (
-            not os.path.isfile(output_path)
-            and not (os.path.isdir(output_path) and os.listdir(output_path))
+            not os.path.isfile(output_path) and not (os.path.isdir(output_path) and os.listdir(output_path))
         )
 
     def extract(self, input_path, force_extract=False):

From 9f57f5256641d73c9cbb25800be62c8e287d510e Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 3 May 2021 13:22:00 +0200
Subject: [PATCH 13/23] Fix typo

---
 src/datasets/utils/extract.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index 252243ad333..77b47c655c0 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -22,7 +22,7 @@ def __init__(self, cache_dir=None):
         self.cache_dir = cache_dir
         self.extractor = Extractor
 
-    def _get_outout_path(self, path):
+    def _get_output_path(self, path):
         from datasets.utils.file_utils import hash_url_to_filename
 
         # Path where we extract compressed archives
@@ -38,7 +38,7 @@ def _do_extract(self, output_path, force_extract):
     def extract(self, input_path, force_extract=False):
         output_path = input_path
         if self.extractor.is_extractable(input_path):
-            output_path = self._get_outout_path(input_path)
+            output_path = self._get_output_path(input_path)
             if self._do_extract(output_path, force_extract):
                 self.extractor.extract(input_path, output_path)
         return output_path

From ad59f5d512745d14535d2847d5c844f71c1cc256 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 3 May 2021 13:25:51 +0200
Subject: [PATCH 14/23] Remove unused class

---
 src/datasets/utils/extract.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index 77b47c655c0..77af7f4f221 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -11,12 +11,6 @@
 from datasets.utils.filelock import FileLock
 
 
-@dataclass
-class ExtractConfig:
-    extract_compressed_file: bool = False
-    force_extract: bool = False
-
-
 class ExtractManager:
     def __init__(self, cache_dir=None):
         self.cache_dir = cache_dir

From 7ef66c98529eb7960d01aa82c89554b7a4981579 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 3 May 2021 13:29:43 +0200
Subject: [PATCH 15/23] Fix style

---
 src/datasets/utils/extract.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index 77af7f4f221..6551a30e9cc 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -3,7 +3,6 @@
 import os
 import shutil
 import tarfile
-from dataclasses import dataclass
 from zipfile import ZipFile
 from zipfile import is_zipfile as _is_zipfile
 

From e62ac10c81302d483d05024d4f90d3c2d428f63f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 5 Jul 2021 14:35:19 +0200
Subject: [PATCH 16/23] Fix issues after merging upstream master

---
 src/datasets/utils/extract.py    | 7 +++++--
 src/datasets/utils/file_utils.py | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index f7d78bfce44..c2fd86fd3e0 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -13,7 +13,9 @@
 
 class ExtractManager:
     def __init__(self, cache_dir=None):
-        self.cache_dir = cache_dir
+        self.extract_dir = (
+            os.path.join(cache_dir, config.EXTRACTED_DATASETS_DIR) if cache_dir else config.EXTRACTED_DATASETS_PATH
+        )
         self.extractor = Extractor
 
     def _get_output_path(self, path):
@@ -22,7 +24,7 @@ def _get_output_path(self, path):
         # Path where we extract compressed archives
         # We extract in the cache dir, and get the extracted path name by hashing the original path"
         abs_path = os.path.abspath(path)
-        return os.path.join(self.cache_dir, "extracted", hash_url_to_filename(abs_path))
+        return os.path.join(self.extract_dir, hash_url_to_filename(abs_path))
 
     def _do_extract(self, output_path, force_extract):
         return force_extract or (
@@ -145,6 +147,7 @@ def is_extractable(path: str) -> bool:
 
     @staticmethod
     def extract(input_path: str, output_path: str):
+        os.rmdir(output_path)
         if not config.ZSTANDARD_AVAILABLE:
             raise EnvironmentError("Please pip install zstandard")
         import zstandard as zstd
diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index fdbd5fca4dd..8b2dbdf5528 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -300,7 +300,7 @@ def cached_path(
         return output_path
 
     if download_config.extract_compressed_file:
-        output_path = ExtractManager(cache_dir=cache_dir).extract(
+        output_path = ExtractManager(cache_dir=download_config.cache_dir).extract(
             output_path, force_extract=download_config.force_extract
         )
 

From 8bd53c43674f0b04602b013933f60a7c91df48ec Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 5 Jul 2021 15:13:47 +0200
Subject: [PATCH 17/23] Remove default os.makedirs and os.rmdir when not
 applicable

---
 src/datasets/utils/extract.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index c2fd86fd3e0..43aaebc56f5 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -47,6 +47,7 @@ def is_extractable(path):
 
     @staticmethod
     def extract(input_path, output_path):
+        os.makedirs(output_path, exist_ok=True)
         tar_file = tarfile.open(input_path)
         tar_file.extractall(output_path)
         tar_file.close()
@@ -65,7 +66,6 @@ def is_extractable(path: str) -> bool:
 
     @staticmethod
     def extract(input_path, output_path):
-        os.rmdir(output_path)
         with gzip.open(input_path, "rb") as gzip_file:
             with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(gzip_file, extracted_file)
@@ -78,6 +78,7 @@ def is_extractable(path):
 
     @staticmethod
     def extract(input_path, output_path):
+        os.makedirs(output_path, exist_ok=True)
         with ZipFile(input_path, "r") as zip_file:
             zip_file.extractall(output_path)
             zip_file.close()
@@ -99,7 +100,6 @@ def is_extractable(path: str) -> bool:
 
     @staticmethod
     def extract(input_path, output_path):
-        os.rmdir(output_path)
         with lzma.open(input_path) as compressed_file:
             with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(compressed_file, extracted_file)
@@ -121,14 +121,14 @@ def is_extractable(path: str) -> bool:
 
     @staticmethod
     def extract(input_path, output_path):
-        if config.RARFILE_AVAILABLE:
-            import rarfile
-
-            rf = rarfile.RarFile(input_path)
-            rf.extractall(output_path)
-            rf.close()
-        else:
+        if not config.RARFILE_AVAILABLE:
             raise EnvironmentError("Please pip install rarfile")
+        import rarfile
+
+        os.makedirs(output_path, exist_ok=True)
+        rf = rarfile.RarFile(input_path)
+        rf.extractall(output_path)
+        rf.close()
 
 
 class ZstdExtractor:
@@ -147,7 +147,6 @@ def is_extractable(path: str) -> bool:
 
     @staticmethod
     def extract(input_path: str, output_path: str):
-        os.rmdir(output_path)
         if not config.ZSTANDARD_AVAILABLE:
             raise EnvironmentError("Please pip install zstandard")
         import zstandard as zstd
@@ -173,7 +172,6 @@ def extract(cls, input_path, output_path):
         lock_path = input_path + ".lock"
         with FileLock(lock_path):
             shutil.rmtree(output_path, ignore_errors=True)
-            os.makedirs(output_path, exist_ok=True)
             for extractor in cls.extractors:
                 if extractor.is_extractable(input_path):
                     extractor.extract(input_path, output_path)

From 70c33448dc2b86d0ab2d8d8de96c3b972b409ce2 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 5 Jul 2021 16:14:56 +0200
Subject: [PATCH 18/23] Create parent dirs of output_path

---
 src/datasets/utils/extract.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index 43aaebc56f5..838dbd3de3b 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -172,6 +172,7 @@ def extract(cls, input_path, output_path):
         lock_path = input_path + ".lock"
         with FileLock(lock_path):
             shutil.rmtree(output_path, ignore_errors=True)
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)
             for extractor in cls.extractors:
                 if extractor.is_extractable(input_path):
                     extractor.extract(input_path, output_path)

From f6739700c0c2903d97460d96e4f6bb240585a60f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 5 Jul 2021 16:45:26 +0200
Subject: [PATCH 19/23] Minor refactoring of ExtractManager

---
 src/datasets/utils/extract.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index 838dbd3de3b..e6f589c5861 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -32,11 +32,14 @@ def _do_extract(self, output_path, force_extract):
         )
 
     def extract(self, input_path, force_extract=False):
-        output_path = input_path
-        if self.extractor.is_extractable(input_path):
-            output_path = self._get_output_path(input_path)
-            if self._do_extract(output_path, force_extract):
+        if not self.extractor.is_extractable(input_path):
+            return input_path
+        output_path = self._get_output_path(input_path)
+        if self._do_extract(output_path, force_extract):
+            try:
                 self.extractor.extract(input_path, output_path)
+            except Exception:
+                raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
         return output_path
 
 
@@ -166,8 +169,6 @@ def is_extractable(cls, path):
 
     @classmethod
     def extract(cls, input_path, output_path):
-        if not cls.is_extractable(input_path):
-            raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
         # Prevent parallel extractions
         lock_path = input_path + ".lock"
         with FileLock(lock_path):

From 57921306aa6eef7f083108115d65de41be686474 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 5 Jul 2021 16:59:41 +0200
Subject: [PATCH 20/23] Optimize Extractor.extract by returning specific
 extractor

---
 src/datasets/utils/extract.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index e6f589c5861..432aa8f3a26 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -32,12 +32,13 @@ def _do_extract(self, output_path, force_extract):
         )
 
     def extract(self, input_path, force_extract=False):
-        if not self.extractor.is_extractable(input_path):
+        is_extractable, extractor = self.extractor.is_extractable(input_path, return_extractor=True)
+        if not is_extractable:
             return input_path
         output_path = self._get_output_path(input_path)
         if self._do_extract(output_path, force_extract):
             try:
-                self.extractor.extract(input_path, output_path)
+                extractor.extract(input_path, output_path)
             except Exception:
                 raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
         return output_path
@@ -164,16 +165,21 @@ class Extractor:
     extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor, ZstdExtractor]
 
     @classmethod
-    def is_extractable(cls, path):
-        return any(extractor.is_extractable(path) for extractor in cls.extractors)
+    def is_extractable(cls, path, return_extractor=False):
+        for extractor in cls.extractors:
+            if extractor.is_extractable(path):
+                return True if not return_extractor else (True, extractor)
+        return False if not return_extractor else (False, None)
 
     @classmethod
-    def extract(cls, input_path, output_path):
+    def extract(cls, input_path, output_path, extractor=None):
         # Prevent parallel extractions
         lock_path = input_path + ".lock"
         with FileLock(lock_path):
             shutil.rmtree(output_path, ignore_errors=True)
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
+            if extractor:
+                return extractor.extract(input_path, output_path)
             for extractor in cls.extractors:
                 if extractor.is_extractable(input_path):
                     extractor.extract(input_path, output_path)

From 7126a1d9f5fce77e6716b44482b3cdb66e94f35b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 5 Jul 2021 17:13:32 +0200
Subject: [PATCH 21/23] Fix extract: pass detected extractor to Extractor.extract

---
 src/datasets/utils/extract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index 432aa8f3a26..d03b11783de 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -38,7 +38,7 @@ def extract(self, input_path, force_extract=False):
         output_path = self._get_output_path(input_path)
         if self._do_extract(output_path, force_extract):
             try:
-                extractor.extract(input_path, output_path)
+                self.extractor.extract(input_path, output_path, extractor=extractor)
             except Exception:
                 raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
         return output_path

From 834ecc35db89029bbae2f38a1a03d7dd08495be1 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 5 Jul 2021 18:46:57 +0200
Subject: [PATCH 22/23] Test extract and add gzip to test_cached_path_extract

---
 tests/test_extract.py    | 50 ++++++++++++++++++++++++++++++++++++++++
 tests/test_file_utils.py | 29 +++++++++--------------
 2 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/tests/test_extract.py b/tests/test_extract.py
index e69de29bb2d..909ddf83895 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -0,0 +1,50 @@
+import pytest
+import zstandard as zstd
+
+from datasets.utils.extract import Extractor, ZstdExtractor
+
+
+FILE_CONTENT = """\
+    Text data.
+    Second line of data."""
+
+
+@pytest.fixture(scope="session")
+def zstd_path(tmp_path_factory):
+    path = tmp_path_factory.mktemp("data") / "file.zstd"
+    data = bytes(FILE_CONTENT, "utf-8")
+    with zstd.open(path, "wb") as f:
+        f.write(data)
+    return path
+
+
+def test_zstd_extractor(zstd_path, tmp_path, text_file):
+    input_path = zstd_path
+    assert ZstdExtractor.is_extractable(input_path)
+    output_path = str(tmp_path / "extracted.txt")
+    ZstdExtractor.extract(input_path, output_path)
+    with open(output_path) as f:
+        extracted_file_content = f.read()
+    with open(text_file) as f:
+        expected_file_content = f.read()
+    assert extracted_file_content == expected_file_content
+
+
+@pytest.mark.parametrize(
+    "compression_format, expected_text_path_name", [("gzip", "text_path"), ("xz", "text_file"), ("zstd", "text_file")]
+)
+def test_extractor(
+    compression_format, expected_text_path_name, text_gz_path, xz_file, zstd_path, tmp_path, text_file, text_path
+):
+    input_paths = {"gzip": text_gz_path, "xz": xz_file, "zstd": zstd_path}
+    input_path = str(input_paths[compression_format])
+    output_path = str(tmp_path / "extracted.txt")
+    assert Extractor.is_extractable(input_path)
+    Extractor.extract(input_path, output_path)
+    with open(output_path) as f:
+        extracted_file_content = f.read()
+    expected_text_paths = {"text_file": text_file, "text_path": text_path}
+    expected_text_path = str(expected_text_paths[expected_text_path_name])
+    with open(expected_text_path) as f:
+        expected_file_content = f.read()
+    assert extracted_file_content == expected_file_content
diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py
index edfd776599f..05b7baad00c 100644
--- a/tests/test_file_utils.py
+++ b/tests/test_file_utils.py
@@ -7,7 +7,6 @@
 import pytest
 import zstandard as zstd
 
-from datasets.utils.extract import ZstdExtractor
 from datasets.utils.file_utils import (
     DownloadConfig,
     OfflineModeIsEnabled,
@@ -88,28 +87,22 @@ def gen_random_output():
         self.assertGreater(np.abs(out1 - out3).sum(), 0)
 
 
-def test_zstd_extractor(zstd_path, tmp_path, text_file):
-    input_path = zstd_path
-    assert ZstdExtractor.is_extractable(input_path)
-    output_path = str(tmp_path / "extracted.txt")
-    ZstdExtractor.extract(input_path, output_path)
-    with open(output_path) as f:
-        extracted_file_content = f.read()
-    with open(text_file) as f:
-        expected_file_content = f.read()
-    assert extracted_file_content == expected_file_content
-
-
-@pytest.mark.parametrize("compression_format", ["xz", "zstd"])
-def test_cached_path_extract(compression_format, xz_file, zstd_path, tmp_path, text_file):
-    path = {"xz": xz_file, "zstd": zstd_path}
-    input_path = path[compression_format]
+@pytest.mark.parametrize(
+    "compression_format, expected_text_path_name", [("gzip", "text_path"), ("xz", "text_file"), ("zstd", "text_file")]
+)
+def test_cached_path_extract(
+    compression_format, expected_text_path_name, text_gz_path, xz_file, zstd_path, tmp_path, text_file, text_path
+):
+    input_paths = {"gzip": text_gz_path, "xz": xz_file, "zstd": zstd_path}
+    input_path = str(input_paths[compression_format])
     cache_dir = tmp_path / "cache"
     download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True)
     extracted_path = cached_path(input_path, download_config=download_config)
     with open(extracted_path) as f:
         extracted_file_content = f.read()
-    with open(text_file) as f:
+    expected_text_paths = {"text_file": text_file, "text_path": text_path}
+    expected_text_path = str(expected_text_paths[expected_text_path_name])
+    with open(expected_text_path) as f:
         expected_file_content = f.read()
     assert extracted_file_content == expected_file_content
 

From 006dfe20ab9021b1c9dff91f6a753363ae89391e Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 7 Jul 2021 17:55:16 +0200
Subject: [PATCH 23/23] Address review: drop catch-all except, add gz_path fixture

---
 src/datasets/utils/extract.py |  8 ++------
 tests/conftest.py             | 11 +++++++++++
 tests/test_file_utils.py      | 14 ++++----------
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
index d03b11783de..c4dc0d4ddd5 100644
--- a/src/datasets/utils/extract.py
+++ b/src/datasets/utils/extract.py
@@ -37,10 +37,7 @@ def extract(self, input_path, force_extract=False):
             return input_path
         output_path = self._get_output_path(input_path)
         if self._do_extract(output_path, force_extract):
-            try:
-                self.extractor.extract(input_path, output_path, extractor=extractor)
-            except Exception:
-                raise EnvironmentError("Archive format of {} could not be identified".format(input_path))
+            self.extractor.extract(input_path, output_path, extractor=extractor)
         return output_path
 
 
@@ -182,5 +179,4 @@ def extract(cls, input_path, output_path, extractor=None):
                 return extractor.extract(input_path, output_path)
             for extractor in cls.extractors:
                 if extractor.is_extractable(input_path):
-                    extractor.extract(input_path, output_path)
-                    break
+                    return extractor.extract(input_path, output_path)
diff --git a/tests/conftest.py b/tests/conftest.py
index 3e416a35d98..cd42ad1d712 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -87,6 +87,17 @@ def xz_file(tmp_path_factory):
     return filename
 
 
+@pytest.fixture(scope="session")
+def gz_path(tmp_path_factory, text_path):
+    import gzip
+
+    path = str(tmp_path_factory.mktemp("data") / "file.gz")
+    data = bytes(FILE_CONTENT, "utf-8")
+    with gzip.open(path, "wb") as f:
+        f.write(data)
+    return path
+
+
 @pytest.fixture(scope="session")
 def xml_file(tmp_path_factory):
     filename = tmp_path_factory.mktemp("data") / "file.xml"
diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py
index 05b7baad00c..283b8b5a3ed 100644
--- a/tests/test_file_utils.py
+++ b/tests/test_file_utils.py
@@ -87,22 +87,16 @@ def gen_random_output():
         self.assertGreater(np.abs(out1 - out3).sum(), 0)
 
 
-@pytest.mark.parametrize(
-    "compression_format, expected_text_path_name", [("gzip", "text_path"), ("xz", "text_file"), ("zstd", "text_file")]
-)
-def test_cached_path_extract(
-    compression_format, expected_text_path_name, text_gz_path, xz_file, zstd_path, tmp_path, text_file, text_path
-):
-    input_paths = {"gzip": text_gz_path, "xz": xz_file, "zstd": zstd_path}
+@pytest.mark.parametrize("compression_format", ["gzip", "xz", "zstd"])
+def test_cached_path_extract(compression_format, gz_path, xz_file, zstd_path, tmp_path, text_file):
+    input_paths = {"gzip": gz_path, "xz": xz_file, "zstd": zstd_path}
     input_path = str(input_paths[compression_format])
     cache_dir = tmp_path / "cache"
     download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True)
     extracted_path = cached_path(input_path, download_config=download_config)
     with open(extracted_path) as f:
         extracted_file_content = f.read()
-    expected_text_paths = {"text_file": text_file, "text_path": text_path}
-    expected_text_path = str(expected_text_paths[expected_text_path_name])
-    with open(expected_text_path) as f:
+    with open(text_file) as f:
         expected_file_content = f.read()
     assert extracted_file_content == expected_file_content