Refactor base extractors (#4690)

albertvillanova · web-flow · commit 4fb3ed080bd4 · 2022-07-18T10:34:48.000+02:00
* Implement BaseExtractor

* Refactor base extractors

* Refactor zipfile import

* Improve performance of test_extractor

* Allow passing magic number to is_extractable

* Read magic number only once

* Refactor Extractor to use extractor_format

* Update test_extractor

* Make ExtractManager use extractor_format

* Refactor class hierarchy
diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py
@@ -3,10 +3,10 @@
 import lzma
 import os
 import shutil
-import struct
 import tarfile
-from zipfile import ZipFile
-from zipfile import is_zipfile as _is_zipfile
+import warnings
+import zipfile
+from abc import ABC, abstractmethod
 
 from .. import config
 from .filelock import FileLock
@@ -33,96 +33,104 @@ def _do_extract(self, output_path, force_extract):
         )
 
     def extract(self, input_path, force_extract=False):
-        is_extractable, extractor = self.extractor.is_extractable(input_path, return_extractor=True)
-        if not is_extractable:
+        extractor_format = self.extractor.infer_extractor_format(input_path)
+        if not extractor_format:
             return input_path
         output_path = self._get_output_path(input_path)
         if self._do_extract(output_path, force_extract):
-            self.extractor.extract(input_path, output_path, extractor=extractor)
+            self.extractor.extract(input_path, output_path, extractor_format)
         return output_path
 
 
-class TarExtractor:
+class BaseExtractor(ABC):
+    @classmethod
+    @abstractmethod
+    def is_extractable(cls, path: str, **kwargs) -> bool:
+        ...
+
+    @staticmethod
+    @abstractmethod
+    def extract(input_path: str, output_path: str) -> None:
+        ...
+
+
+class MagicNumberBaseExtractor(BaseExtractor, ABC):
+    magic_number = b""
+
     @staticmethod
-    def is_extractable(path):
+    def read_magic_number(path: str, magic_number_length: int):
+        with open(path, "rb") as f:
+            return f.read(magic_number_length)
+
+    @classmethod
+    def is_extractable(cls, path: str, magic_number: bytes = b"") -> bool:
+        if not magic_number:
+            try:
+                magic_number = cls.read_magic_number(path, len(cls.magic_number))
+            except OSError:
+                return False
+        return magic_number.startswith(cls.magic_number)
+
+
+class TarExtractor(BaseExtractor):
+    @classmethod
+    def is_extractable(cls, path: str, **kwargs) -> bool:
         return tarfile.is_tarfile(path)
 
     @staticmethod
-    def extract(input_path, output_path):
+    def extract(input_path: str, output_path: str) -> None:
         os.makedirs(output_path, exist_ok=True)
         tar_file = tarfile.open(input_path)
         tar_file.extractall(output_path)
         tar_file.close()
 
 
-class GzipExtractor:
-    @staticmethod
-    def is_extractable(path: str) -> bool:
-        """from https://stackoverflow.com/a/60634210"""
-        with gzip.open(path, "r") as fh:
-            try:
-                fh.read(1)
-                return True
-            except OSError:
-                return False
+class GzipExtractor(MagicNumberBaseExtractor):
+    magic_number = b"\x1F\x8B"
 
     @staticmethod
-    def extract(input_path, output_path):
+    def extract(input_path: str, output_path: str) -> None:
         with gzip.open(input_path, "rb") as gzip_file:
             with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(gzip_file, extracted_file)
 
 
-class ZipExtractor:
-    @staticmethod
-    def is_extractable(path):
-        return _is_zipfile(path)
+class ZipExtractor(BaseExtractor):
+    @classmethod
+    def is_extractable(cls, path: str, **kwargs) -> bool:
+        return zipfile.is_zipfile(path)
 
     @staticmethod
-    def extract(input_path, output_path):
+    def extract(input_path: str, output_path: str) -> None:
         os.makedirs(output_path, exist_ok=True)
-        with ZipFile(input_path, "r") as zip_file:
+        with zipfile.ZipFile(input_path, "r") as zip_file:
             zip_file.extractall(output_path)
             zip_file.close()
 
 
-class XzExtractor:
-    @staticmethod
-    def is_extractable(path: str) -> bool:
-        """https://tukaani.org/xz/xz-file-format-1.0.4.txt"""
-        with open(path, "rb") as f:
-            try:
-                header_magic_bytes = f.read(6)
-            except OSError:
-                return False
-            if header_magic_bytes == b"\xfd7zXZ\x00":
-                return True
-            else:
-                return False
+class XzExtractor(MagicNumberBaseExtractor):
+    magic_number = b"\xFD\x37\x7A\x58\x5A\x00"
 
     @staticmethod
-    def extract(input_path, output_path):
+    def extract(input_path: str, output_path: str) -> None:
         with lzma.open(input_path) as compressed_file:
             with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(compressed_file, extracted_file)
 
 
-class RarExtractor:
-    @staticmethod
-    def is_extractable(path: str) -> bool:
-        """https://github.com/markokr/rarfile/blob/master/rarfile.py"""
-        RAR_ID = b"Rar!\x1a\x07\x00"
-        RAR5_ID = b"Rar!\x1a\x07\x01\x00"
+class RarExtractor(BaseExtractor):
+    RAR_ID = b"Rar!\x1a\x07\x00"
+    RAR5_ID = b"Rar!\x1a\x07\x01\x00"
 
-        with open(path, "rb", 1024) as fd:
-            buf = fd.read(len(RAR5_ID))
-        if buf.startswith(RAR_ID) or buf.startswith(RAR5_ID):
-            return True
-        else:
-            return False
+    @classmethod
+    def is_extractable(cls, path: str, **kwargs) -> bool:
+        """https://github.com/markokr/rarfile/blob/master/rarfile.py"""
+        with open(path, "rb") as f:
+            magic_number = f.read(len(cls.RAR5_ID))
+        return magic_number == cls.RAR5_ID or magic_number.startswith(cls.RAR_ID)
 
     @staticmethod
-    def extract(input_path, output_path):
+    def extract(input_path: str, output_path: str) -> None:
         if not config.RARFILE_AVAILABLE:
             raise OSError("Please pip install rarfile")
         import rarfile
@@ -133,22 +141,11 @@ def extract(input_path, output_path):
         rf.close()
 
 
-class ZstdExtractor:
-    @staticmethod
-    def is_extractable(path: str) -> bool:
-        """https://datatracker.ietf.org/doc/html/rfc8878
-
-        Magic_Number:  4 bytes, little-endian format.  Value: 0xFD2FB528.
-        """
-        with open(path, "rb") as f:
-            try:
-                magic_number = f.read(4)
-            except OSError:
-                return False
-        return True if magic_number == struct.pack("<I", 0xFD2FB528) else False
+class ZstdExtractor(MagicNumberBaseExtractor):
+    magic_number = b"\x28\xb5\x2F\xFD"
 
     @staticmethod
-    def extract(input_path: str, output_path: str):
+    def extract(input_path: str, output_path: str) -> None:
         if not config.ZSTANDARD_AVAILABLE:
             raise OSError("Please pip install zstandard")
         import zstandard as zstd
@@ -158,40 +155,21 @@ def extract(input_path: str, output_path: str):
             dctx.copy_stream(ifh, ofh)
 
 
-class Bzip2Extractor:
-    @staticmethod
-    def is_extractable(path: str) -> bool:
-        with open(path, "rb") as f:
-            try:
-                header_magic_bytes = f.read(3)
-            except OSError:
-                return False
-            if header_magic_bytes == b"BZh":
-                return True
-            else:
-                return False
+class Bzip2Extractor(MagicNumberBaseExtractor):
+    magic_number = b"\x42\x5A\x68"
 
     @staticmethod
-    def extract(input_path, output_path):
+    def extract(input_path: str, output_path: str) -> None:
         with bz2.open(input_path, "rb") as compressed_file:
             with open(output_path, "wb") as extracted_file:
                 shutil.copyfileobj(compressed_file, extracted_file)
 
 
-class SevenZipExtractor:
+class SevenZipExtractor(MagicNumberBaseExtractor):
     magic_number = b"\x37\x7A\xBC\xAF\x27\x1C"
 
-    @classmethod
-    def is_extractable(cls, path):
-        with open(path, "rb") as f:
-            try:
-                magic_number = f.read(len(cls.magic_number))
-            except OSError:
-                return False
-        return True if magic_number == cls.magic_number else False
-
     @staticmethod
-    def extract(input_path: str, output_path: str):
+    def extract(input_path: str, output_path: str) -> None:
         if not config.PY7ZR_AVAILABLE:
             raise OSError("Please pip install py7zr")
         import py7zr
@@ -203,33 +181,79 @@ def extract(input_path: str, output_path: str):
 
 class Extractor:
     #  Put zip file to the last, b/c it is possible wrongly detected as zip (I guess it means: as tar or gzip)
-    extractors = [
-        TarExtractor,
-        GzipExtractor,
-        ZipExtractor,
-        XzExtractor,
-        RarExtractor,
-        ZstdExtractor,
-        Bzip2Extractor,
-        SevenZipExtractor,
-    ]
+    extractors = {
+        "tar": TarExtractor,
+        "gzip": GzipExtractor,
+        "zip": ZipExtractor,
+        "xz": XzExtractor,
+        "rar": RarExtractor,
+        "zstd": ZstdExtractor,
+        "bz2": Bzip2Extractor,
+        "7z": SevenZipExtractor,
+    }
+
+    @classmethod
+    def _get_magic_number_max_length(cls):
+        magic_number_max_length = 0
+        for extractor in cls.extractors.values():
+            if hasattr(extractor, "magic_number"):
+                magic_number_length = len(extractor.magic_number)
+                magic_number_max_length = (
+                    magic_number_length if magic_number_length > magic_number_max_length else magic_number_max_length
+                )
+        return magic_number_max_length
+
+    @staticmethod
+    def _read_magic_number(path: str, magic_number_length: int):
+        try:
+            return MagicNumberBaseExtractor.read_magic_number(path, magic_number_length=magic_number_length)
+        except OSError:
+            return b""
 
     @classmethod
     def is_extractable(cls, path, return_extractor=False):
-        for extractor in cls.extractors:
-            if extractor.is_extractable(path):
-                return True if not return_extractor else (True, extractor)
+        warnings.warn(
+            "Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. "
+            "Use 'infer_extractor_format' instead.",
+            category=FutureWarning,
+        )
+        extractor_format = cls.infer_extractor_format(path)
+        if extractor_format:
+            return True if not return_extractor else (True, cls.extractors[extractor_format])
         return False if not return_extractor else (False, None)
 
     @classmethod
-    def extract(cls, input_path, output_path, extractor=None):
+    def infer_extractor_format(cls, path):
+        magic_number_max_length = cls._get_magic_number_max_length()
+        magic_number = cls._read_magic_number(path, magic_number_max_length)
+        for extractor_format, extractor in cls.extractors.items():
+            if extractor.is_extractable(path, magic_number=magic_number):
+                return extractor_format
+
+    @classmethod
+    def extract(cls, input_path, output_path, extractor_format=None, extractor="deprecated"):
         # Prevent parallel extractions
         lock_path = input_path + ".lock"
         with FileLock(lock_path):
             shutil.rmtree(output_path, ignore_errors=True)
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
-            if extractor:
+            if extractor_format or extractor != "deprecated":
+                if extractor != "deprecated" or not isinstance(extractor_format, str):  # passed as positional arg
+                    warnings.warn(
+                        "Parameter 'extractor' was deprecated in version 2.4.0 and will be removed in 3.0.0. "
+                        "Use 'extractor_format' instead.",
+                        category=FutureWarning,
+                    )
+                    extractor = extractor if extractor != "deprecated" else extractor_format
+                else:
+                    extractor = cls.extractors[extractor_format]
                 return extractor.extract(input_path, output_path)
-            for extractor in cls.extractors:
-                if extractor.is_extractable(input_path):
-                    return extractor.extract(input_path, output_path)
+            else:
+                warnings.warn(
+                    "Parameter 'extractor_format' was made required in version 2.4.0 and not passing it will raise an "
+                    "exception in 3.0.0.",
+                    category=FutureWarning,
+                )
+                for extractor in cls.extractors.values():
+                    if extractor.is_extractable(input_path):
+                        return extractor.extract(input_path, output_path)
diff --git a/tests/test_extract.py b/tests/test_extract.py
@@ -97,9 +97,10 @@ def test_extractor(
             reason += require_zstandard.kwargs["reason"]
         pytest.skip(reason)
     input_path = str(input_path)
-    assert Extractor.is_extractable(input_path)
+    extractor_format = Extractor.infer_extractor_format(input_path)
+    assert extractor_format is not None
     output_path = tmp_path / ("extracted" if is_archive else "extracted.txt")
-    Extractor.extract(input_path, output_path)
+    Extractor.extract(input_path, output_path, extractor_format)
     if is_archive:
         assert output_path.is_dir()
         for file_path in output_path.iterdir():