Make Extractor accept Path as input (#4718)

albertvillanova · web-flow · commit 6a1c6b1faa73 · 2022-07-22T15:29:43.000+02:00
* Make Extractor accept Path as input

* Remove unnecessary casting of Path to str

* Remove other unnecessary casting of Path to str

* Add type hints

* Fix type hints with TYPE_CHECKING
diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
@@ -7,32 +7,37 @@
 import warnings
 import zipfile
 from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Optional, Union
 
 from .. import config
 from .filelock import FileLock
 
 
+if TYPE_CHECKING:
+    import pathlib
+
+
 class ExtractManager:
-    def __init__(self, cache_dir=None):
+    def __init__(self, cache_dir: Optional[str] = None):
         self.extract_dir = (
             os.path.join(cache_dir, config.EXTRACTED_DATASETS_DIR) if cache_dir else config.EXTRACTED_DATASETS_PATH
         )
         self.extractor = Extractor
 
-    def _get_output_path(self, path):
+    def _get_output_path(self, path: str) -> str:
         from .file_utils import hash_url_to_filename
 
         # Path where we extract compressed archives
         # We extract in the cache dir, and get the extracted path name by hashing the original path"
         abs_path = os.path.abspath(path)
         return os.path.join(self.extract_dir, hash_url_to_filename(abs_path))
 
-    def _do_extract(self, output_path, force_extract):
+    def _do_extract(self, output_path: str, force_extract: bool) -> bool:
         return force_extract or (
             not os.path.isfile(output_path) and not (os.path.isdir(output_path) and os.listdir(output_path))
         )
 
-    def extract(self, input_path, force_extract=False):
+    def extract(self, input_path: str, force_extract: bool = False) -> str:
         extractor_format = self.extractor.infer_extractor_format(input_path)
         if not extractor_format:
             return input_path
@@ -45,25 +50,25 @@ def extract(self, input_path, force_extract=False):
 class BaseExtractor(ABC):
     @classmethod
     @abstractmethod
-    def is_extractable(cls, path: str, **kwargs) -> bool:
+    def is_extractable(cls, path: Union["pathlib.Path", str], **kwargs) -> bool:
         ...
 
     @staticmethod
     @abstractmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         ...
 
 
 class MagicNumberBaseExtractor(BaseExtractor, ABC):
     magic_number = b""
 
     @staticmethod
-    def read_magic_number(path: str, magic_number_length: int):
+    def read_magic_number(path: Union["pathlib.Path", str], magic_number_length: int):
         with open(path, "rb") as f:
             return f.read(magic_number_length)
 
     @classmethod
-    def is_extractable(cls, path: str, magic_number: bytes = b"") -> bool:
+    def is_extractable(cls, path: Union["pathlib.Path", str], magic_number: bytes = b"") -> bool:
         if not magic_number:
             try:
                 magic_number = cls.read_magic_number(path, len(cls.magic_number))
@@ -74,11 +79,11 @@ def is_extractable(cls, path: str, magic_number: bytes = b"") -> bool:
 
 class TarExtractor(BaseExtractor):
     @classmethod
-    def is_extractable(cls, path: str, **kwargs) -> bool:
+    def is_extractable(cls, path: Union["pathlib.Path", str], **kwargs) -> bool:
         return tarfile.is_tarfile(path)
 
     @staticmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         os.makedirs(output_path, exist_ok=True)
         tar_file = tarfile.open(input_path)
         tar_file.extractall(output_path)
@@ -89,19 +94,19 @@ class GzipExtractor(MagicNumberBaseExtractor):
     magic_number = b"\x1F\x8B"
 
     @staticmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         with gzip.open(input_path, "rb") as gzip_file:
             with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(gzip_file, extracted_file)
 
 
 class ZipExtractor(BaseExtractor):
     @classmethod
-    def is_extractable(cls, path: str, **kwargs) -> bool:
+    def is_extractable(cls, path: Union["pathlib.Path", str], **kwargs) -> bool:
         return zipfile.is_zipfile(path)
 
     @staticmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         os.makedirs(output_path, exist_ok=True)
         with zipfile.ZipFile(input_path, "r") as zip_file:
             zip_file.extractall(output_path)
@@ -112,7 +117,7 @@ class XzExtractor(MagicNumberBaseExtractor):
     magic_number = b"\xFD\x37\x7A\x58\x5A\x00"
 
     @staticmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         with lzma.open(input_path) as compressed_file:
             with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(compressed_file, extracted_file)
@@ -123,14 +128,14 @@ class RarExtractor(BaseExtractor):
     RAR5_ID = b"Rar!\x1a\x07\x01\x00"
 
     @classmethod
-    def is_extractable(cls, path: str, **kwargs) -> bool:
+    def is_extractable(cls, path: Union["pathlib.Path", str], **kwargs) -> bool:
         """https://github.com/markokr/rarfile/blob/master/rarfile.py"""
         with open(path, "rb") as f:
             magic_number = f.read(len(cls.RAR5_ID))
         return magic_number == cls.RAR5_ID or magic_number.startswith(cls.RAR_ID)
 
     @staticmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         if not config.RARFILE_AVAILABLE:
             raise OSError("Please pip install rarfile")
         import rarfile
@@ -145,7 +150,7 @@ class ZstdExtractor(MagicNumberBaseExtractor):
     magic_number = b"\x28\xb5\x2F\xFD"
 
     @staticmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         if not config.ZSTANDARD_AVAILABLE:
             raise OSError("Please pip install zstandard")
         import zstandard as zstd
@@ -159,7 +164,7 @@ class Bzip2Extractor(MagicNumberBaseExtractor):
     magic_number = b"\x42\x5A\x68"
 
     @staticmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         with bz2.open(input_path, "rb") as compressed_file:
             with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(compressed_file, extracted_file)
@@ -169,7 +174,7 @@ class SevenZipExtractor(MagicNumberBaseExtractor):
     magic_number = b"\x37\x7A\xBC\xAF\x27\x1C"
 
     @staticmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         if not config.PY7ZR_AVAILABLE:
             raise OSError("Please pip install py7zr")
         import py7zr
@@ -183,7 +188,7 @@ class Lz4Extractor(MagicNumberBaseExtractor):
     magic_number = b"\x04\x22\x4D\x18"
 
     @staticmethod
-    def extract(input_path: str, output_path: str) -> None:
+    def extract(input_path: Union["pathlib.Path", str], output_path: Union["pathlib.Path", str]) -> None:
         if not config.LZ4_AVAILABLE:
             raise OSError("Please pip install lz4")
         import lz4.frame
@@ -219,14 +224,14 @@ def _get_magic_number_max_length(cls):
         return magic_number_max_length
 
     @staticmethod
-    def _read_magic_number(path: str, magic_number_length: int):
+    def _read_magic_number(path: Union["pathlib.Path", str], magic_number_length: int):
         try:
             return MagicNumberBaseExtractor.read_magic_number(path, magic_number_length=magic_number_length)
         except OSError:
             return b""
 
     @classmethod
-    def is_extractable(cls, path, return_extractor=False):
+    def is_extractable(cls, path: Union["pathlib.Path", str], return_extractor: bool = False) -> bool:
         warnings.warn(
             "Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. "
             "Use 'infer_extractor_format' instead.",
@@ -238,17 +243,23 @@ def is_extractable(cls, path, return_extractor=False):
         return False if not return_extractor else (False, None)
 
     @classmethod
-    def infer_extractor_format(cls, path):
+    def infer_extractor_format(cls, path: Union["pathlib.Path", str]) -> str:
         magic_number_max_length = cls._get_magic_number_max_length()
         magic_number = cls._read_magic_number(path, magic_number_max_length)
         for extractor_format, extractor in cls.extractors.items():
             if extractor.is_extractable(path, magic_number=magic_number):
                 return extractor_format
 
     @classmethod
-    def extract(cls, input_path, output_path, extractor_format=None, extractor="deprecated"):
+    def extract(
+        cls,
+        input_path: Union["pathlib.Path", str],
+        output_path: Union["pathlib.Path", str],
+        extractor_format: Optional[str] = None,
+        extractor: Optional[BaseExtractor] = "deprecated",
+    ) -> None:
         # Prevent parallel extractions
-        lock_path = input_path + ".lock"
+        lock_path = str(input_path) + ".lock"
         with FileLock(lock_path):
             shutil.rmtree(output_path, ignore_errors=True)
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
diff --git a/tests/test_download_manager.py b/tests/test_download_manager.py
@@ -42,7 +42,7 @@ def test_download_manager_download(urls_type, tmp_path, monkeypatch):
         urls = {"train": url}
     dataset_name = "dummy"
     cache_subdir = "downloads"
-    cache_dir_root = str(tmp_path)
+    cache_dir_root = tmp_path
     download_config = DownloadConfig(
         cache_dir=os.path.join(cache_dir_root, cache_subdir),
         use_etag=False,
diff --git a/tests/test_extract.py b/tests/test_extract.py
@@ -123,7 +123,6 @@ def test_extractor(
         elif compression_format == "zstd":
             reason += require_zstandard.kwargs["reason"]
         pytest.skip(reason)
-    input_path = str(input_path)
     extractor_format = Extractor.infer_extractor_format(input_path)
     assert extractor_format is not None
     output_path = tmp_path / ("extracted" if is_archive else "extracted.txt")
diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py
@@ -26,7 +26,7 @@ def zstd_path(tmp_path_factory):
 @pytest.mark.parametrize("compression_format", ["gzip", "xz", "zstd"])
 def test_cached_path_extract(compression_format, gz_file, xz_file, zstd_path, tmp_path, text_file):
     input_paths = {"gzip": gz_file, "xz": xz_file, "zstd": zstd_path}
-    input_path = str(input_paths[compression_format])
+    input_path = input_paths[compression_format]
     cache_dir = tmp_path / "cache"
     download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True)
     extracted_path = cached_path(input_path, download_config=download_config)
diff --git a/tests/test_filesystem.py b/tests/test_filesystem.py
@@ -71,7 +71,6 @@ def test_compression_filesystems(compression_fs_class, gz_file, bz2_file, lz4_fi
         elif compression_fs_class.protocol == "zstd":
             reason += require_zstandard.kwargs["reason"]
         pytest.skip(reason)
-    input_path = str(input_path)
     fs = fsspec.filesystem(compression_fs_class.protocol, fo=input_path)
     assert isinstance(fs, compression_fs_class)
     expected_filename = os.path.basename(input_path)
diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py
@@ -699,7 +699,6 @@ def test_streaming_dl_manager_extract_all_supported_single_file_compression_type
         elif compression_fs_class.protocol == "zstd":
             reason += require_zstandard.kwargs["reason"]
         pytest.skip(reason)
-    input_path = str(input_path)
     dl_manager = StreamingDownloadManager()
     output_path = dl_manager.extract(input_path)
     path = os.path.basename(input_path)
@@ -791,7 +790,7 @@ def _test_jsonl(path, file):
 
 def test_iter_archive_path(tar_jsonl_path):
     dl_manager = StreamingDownloadManager()
-    archive_iterable = dl_manager.iter_archive(str(tar_jsonl_path))
+    archive_iterable = dl_manager.iter_archive(tar_jsonl_path)
     num_jsonl = 0
     for num_jsonl, (path, file) in enumerate(archive_iterable, start=1):
         _test_jsonl(path, file)
@@ -805,7 +804,7 @@ def test_iter_archive_path(tar_jsonl_path):
 
 def test_iter_archive_file(tar_nested_jsonl_path):
     dl_manager = StreamingDownloadManager()
-    files_iterable = dl_manager.iter_archive(str(tar_nested_jsonl_path))
+    files_iterable = dl_manager.iter_archive(tar_nested_jsonl_path)
     num_tar, num_jsonl = 0, 0
     for num_tar, (path, file) in enumerate(files_iterable, start=1):
         for num_jsonl, (subpath, subfile) in enumerate(dl_manager.iter_archive(file), start=1):