From 9edc2e7429970259377f3a771d4aa99e4b65b6bf Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 30 Apr 2021 19:03:09 +0200 Subject: [PATCH 01/23] Reorder execution flow in cached_path --- src/datasets/utils/file_utils.py | 117 +++++++++++++++---------------- 1 file changed, 58 insertions(+), 59 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 1c37a469659..c9e9d316cf1 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -300,67 +300,66 @@ def cached_path( # Something unknown raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) - if download_config.extract_compressed_file and output_path is not None: - - if ( - not is_zipfile(output_path) - and not tarfile.is_tarfile(output_path) - and not is_gzip(output_path) - and not is_xz(output_path) - and not is_rarfile(output_path) - ): - return output_path - - # Path where we extract compressed archives - # We extract in the cache dir, and get the extracted path name by hashing the original path" - abs_output_path = os.path.abspath(output_path) - output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path)) - - if ( - os.path.isdir(output_path_extracted) - and os.listdir(output_path_extracted) - and not download_config.force_extract - ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract): - return output_path_extracted - - # Prevent parallel extractions - lock_path = output_path + ".lock" - with FileLock(lock_path): - shutil.rmtree(output_path_extracted, ignore_errors=True) - os.makedirs(output_path_extracted, exist_ok=True) - if tarfile.is_tarfile(output_path): - tar_file = tarfile.open(output_path) - tar_file.extractall(output_path_extracted) - tar_file.close() - elif is_gzip(output_path): - os.rmdir(output_path_extracted) - with gzip.open(output_path, "rb") as gzip_file: - with open(output_path_extracted, "wb") as extracted_file: - shutil.copyfileobj(gzip_file, extracted_file) - elif is_zipfile(output_path): # put zip file to the last, b/c it is possible wrongly detected as zip - with ZipFile(output_path, "r") as zip_file: - zip_file.extractall(output_path_extracted) - zip_file.close() - elif is_xz(output_path): - os.rmdir(output_path_extracted) - with lzma.open(output_path) as compressed_file: - with open(output_path_extracted, "wb") as extracted_file: - shutil.copyfileobj(compressed_file, extracted_file) - elif is_rarfile(output_path): - if config.RARFILE_AVAILABLE: - import rarfile - - rf = rarfile.RarFile(output_path) - rf.extractall(output_path_extracted) - rf.close() - else: - raise EnvironmentError("Please pip install rarfile") - else: - raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) - + if not download_config.extract_compressed_file or output_path is None: + return output_path + + if ( + not is_zipfile(output_path) + and not tarfile.is_tarfile(output_path) + and not is_gzip(output_path) + and not is_xz(output_path) + and not is_rarfile(output_path) + ): + return output_path + + # Path where we extract compressed archives + # We extract in the cache dir, and get the extracted path name by hashing the original path" + abs_output_path = os.path.abspath(output_path) + output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path)) + + if ( + os.path.isdir(output_path_extracted) + and os.listdir(output_path_extracted) + and not download_config.force_extract + ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract): return output_path_extracted - return output_path + # Prevent parallel extractions + lock_path = output_path + ".lock" + with FileLock(lock_path): + shutil.rmtree(output_path_extracted, ignore_errors=True) + os.makedirs(output_path_extracted, exist_ok=True) + if tarfile.is_tarfile(output_path): + tar_file = tarfile.open(output_path) + tar_file.extractall(output_path_extracted) + tar_file.close() + elif is_gzip(output_path): + os.rmdir(output_path_extracted) + with gzip.open(output_path, "rb") as gzip_file: + with open(output_path_extracted, "wb") as extracted_file: + shutil.copyfileobj(gzip_file, extracted_file) + elif is_zipfile(output_path): # put zip file to the last, b/c it is possible wrongly detected as zip + with ZipFile(output_path, "r") as zip_file: + zip_file.extractall(output_path_extracted) + zip_file.close() + elif is_xz(output_path): + os.rmdir(output_path_extracted) + with lzma.open(output_path) as compressed_file: + with open(output_path_extracted, "wb") as extracted_file: + shutil.copyfileobj(compressed_file, extracted_file) + elif is_rarfile(output_path): + if config.RARFILE_AVAILABLE: + import rarfile + + rf = rarfile.RarFile(output_path) + rf.extractall(output_path_extracted) + rf.close() + else: + raise EnvironmentError("Please pip install rarfile") + else: + raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) + + return output_path_extracted def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str: From 77b0252989f78363d13719f2662ab14b982cf42b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 30 Apr 2021 19:11:00 +0200 Subject: [PATCH 02/23] Extract function _extract from cached_path --- src/datasets/utils/file_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index c9e9d316cf1..6d2d661ddd1 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -324,6 +324,12 @@ def cached_path( ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract): return output_path_extracted + _extract(output_path, output_path_extracted) + + return output_path_extracted + + +def _extract(output_path, output_path_extracted): # Prevent parallel extractions lock_path = output_path + ".lock" with FileLock(lock_path): @@ -359,8 +365,6 @@ def cached_path( else: raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) - return output_path_extracted - def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str: ua = "datasets/{}; python/{}".format(__version__, config.PY_VERSION) From 66f69758212450e15c819dd4a384404bc4b20ec4 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 30 Apr 2021 19:33:49 +0200 Subject: [PATCH 03/23] Extract method for each extract type --- src/datasets/utils/file_utils.py | 70 +++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 6d2d661ddd1..b043380ad89 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -335,37 +335,61 @@ def _extract(output_path, output_path_extracted): with FileLock(lock_path): shutil.rmtree(output_path_extracted, ignore_errors=True) os.makedirs(output_path_extracted, exist_ok=True) - if tarfile.is_tarfile(output_path): - tar_file = tarfile.open(output_path) - tar_file.extractall(output_path_extracted) - tar_file.close() + if is_tarfile(output_path): + extract_tarfile(output_path, output_path_extracted) elif is_gzip(output_path): - os.rmdir(output_path_extracted) - with gzip.open(output_path, "rb") as gzip_file: - with open(output_path_extracted, "wb") as extracted_file: - shutil.copyfileobj(gzip_file, extracted_file) + extract_gzip(output_path, output_path_extracted) elif is_zipfile(output_path): # put zip file to the last, b/c it is possible wrongly detected as zip - with ZipFile(output_path, "r") as zip_file: - zip_file.extractall(output_path_extracted) - zip_file.close() + extract_zipfile(output_path, output_path_extracted) elif is_xz(output_path): - os.rmdir(output_path_extracted) - with lzma.open(output_path) as compressed_file: - with open(output_path_extracted, "wb") as extracted_file: - shutil.copyfileobj(compressed_file, extracted_file) + extract_xz(output_path, output_path_extracted) elif is_rarfile(output_path): - if config.RARFILE_AVAILABLE: - import rarfile - - rf = rarfile.RarFile(output_path) - rf.extractall(output_path_extracted) - rf.close() - else: - raise EnvironmentError("Please pip install rarfile") + extract_rarfile(output_path, output_path_extracted) else: raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) +def extract_rarfile(output_path, output_path_extracted): + if config.RARFILE_AVAILABLE: + import rarfile + + rf = rarfile.RarFile(output_path) + rf.extractall(output_path_extracted) + rf.close() + else: + raise EnvironmentError("Please pip install rarfile") + + +def extract_xz(output_path, output_path_extracted): + os.rmdir(output_path_extracted) + with lzma.open(output_path) as compressed_file: + with open(output_path_extracted, "wb") as extracted_file: + shutil.copyfileobj(compressed_file, extracted_file) + + +def extract_zipfile(output_path, output_path_extracted): + with ZipFile(output_path, "r") as zip_file: + zip_file.extractall(output_path_extracted) + zip_file.close() + + +def extract_gzip(output_path, output_path_extracted): + os.rmdir(output_path_extracted) + with gzip.open(output_path, "rb") as gzip_file: + with open(output_path_extracted, "wb") as extracted_file: + shutil.copyfileobj(gzip_file, extracted_file) + + +def extract_tarfile(output_path, output_path_extracted): + tar_file = tarfile.open(output_path) + tar_file.extractall(output_path_extracted) + tar_file.close() + + +def is_tarfile(output_path): + return tarfile.is_tarfile(output_path) + + def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str: ua = "datasets/{}; python/{}".format(__version__, config.PY_VERSION) ua += "; pyarrow/{}".format(pa.__version__) From c1efe8846214291d1e7ff24594c5d23257bd61f0 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 30 Apr 2021 19:51:32 +0200 Subject: [PATCH 04/23] Create an Extractor class for each file type --- src/datasets/utils/file_utils.py | 212 ++++++++++++++++--------------- 1 file changed, 113 insertions(+), 99 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index b043380ad89..d9a1bbb8a60 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -23,7 +23,7 @@ from pathlib import Path from typing import Dict, Optional, Union from urllib.parse import urlparse -from zipfile import ZipFile, is_zipfile +from zipfile import ZipFile, is_zipfile as _is_zipfile import numpy as np import posixpath @@ -304,11 +304,11 @@ def cached_path( return output_path if ( - not is_zipfile(output_path) - and not tarfile.is_tarfile(output_path) - and not is_gzip(output_path) - and not is_xz(output_path) - and not is_rarfile(output_path) + not ZipExtractor.is_zipfile(output_path) + and not TarExtractor.is_tarfile(output_path) + and not GzipExtractor.is_gzip(output_path) + and not XzExtractor.is_xz(output_path) + and not RarExtractor.is_rarfile(output_path) ): return output_path @@ -324,70 +324,120 @@ def cached_path( ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract): return output_path_extracted - _extract(output_path, output_path_extracted) + Extractor.extract(output_path, output_path_extracted) return output_path_extracted -def _extract(output_path, output_path_extracted): - # Prevent parallel extractions - lock_path = output_path + ".lock" - with FileLock(lock_path): - shutil.rmtree(output_path_extracted, ignore_errors=True) - os.makedirs(output_path_extracted, exist_ok=True) - if is_tarfile(output_path): - extract_tarfile(output_path, output_path_extracted) - elif is_gzip(output_path): - extract_gzip(output_path, output_path_extracted) - elif is_zipfile(output_path): # put zip file to the last, b/c it is possible wrongly detected as zip - extract_zipfile(output_path, output_path_extracted) - elif is_xz(output_path): - extract_xz(output_path, output_path_extracted) - elif is_rarfile(output_path): - extract_rarfile(output_path, output_path_extracted) +class Extractor: + @staticmethod + def extract(output_path, output_path_extracted): + # Prevent parallel extractions + lock_path = output_path + ".lock" + with FileLock(lock_path): + shutil.rmtree(output_path_extracted, ignore_errors=True) + os.makedirs(output_path_extracted, exist_ok=True) + if TarExtractor.is_tarfile(output_path): + TarExtractor.extract_tarfile(output_path, output_path_extracted) + elif GzipExtractor.is_gzip(output_path): + GzipExtractor.extract_gzip(output_path, output_path_extracted) + elif ZipExtractor.is_zipfile(output_path): # put zip file to the last, b/c it is possible wrongly detected as zip + ZipExtractor.extract_zipfile(output_path, output_path_extracted) + elif XzExtractor.is_xz(output_path): + XzExtractor.extract_xz(output_path, output_path_extracted) + elif RarExtractor.is_rarfile(output_path): + RarExtractor.extract_rarfile(output_path, output_path_extracted) + else: + raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) + + +class TarExtractor: + @staticmethod + def is_tarfile(path): + return tarfile.is_tarfile(path) + @staticmethod + def extract_tarfile(output_path, output_path_extracted): + tar_file = tarfile.open(output_path) + tar_file.extractall(output_path_extracted) + tar_file.close() + + +class GzipExtractor: + @staticmethod + def is_gzip(path: str) -> bool: + """from https://stackoverflow.com/a/60634210""" + with gzip.open(path, "r") as fh: + try: + fh.read(1) + return True + except OSError: + return False + @staticmethod + def extract_gzip(output_path, output_path_extracted): + os.rmdir(output_path_extracted) + with gzip.open(output_path, "rb") as gzip_file: + with open(output_path_extracted, "wb") as extracted_file: + shutil.copyfileobj(gzip_file, extracted_file) + + +class ZipExtractor: + @staticmethod + def is_zipfile(path): + return _is_zipfile(path) + + @staticmethod + def extract_zipfile(output_path, output_path_extracted): + with ZipFile(output_path, "r") as zip_file: + zip_file.extractall(output_path_extracted) + zip_file.close() + + +class XzExtractor: + @staticmethod + def is_xz(path: str) -> bool: + """https://tukaani.org/xz/xz-file-format-1.0.4.txt""" + with open(path, "rb") as f: + try: + header_magic_bytes = f.read(6) + except OSError: + return False + if header_magic_bytes == b"\xfd7zXZ\x00": + return True + else: + return False + + @staticmethod + def extract_xz(output_path, output_path_extracted): + os.rmdir(output_path_extracted) + with lzma.open(output_path) as compressed_file: + with open(output_path_extracted, "wb") as extracted_file: + shutil.copyfileobj(compressed_file, extracted_file) + + +class RarExtractor: + @staticmethod + def is_rarfile(path: str) -> bool: + """https://github.com/markokr/rarfile/blob/master/rarfile.py""" + RAR_ID = b"Rar!\x1a\x07\x00" + RAR5_ID = b"Rar!\x1a\x07\x01\x00" + + with open(path, "rb", 1024) as fd: + buf = fd.read(len(RAR5_ID)) + if buf.startswith(RAR_ID) or buf.startswith(RAR5_ID): + return True else: - raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) - - -def extract_rarfile(output_path, output_path_extracted): - if config.RARFILE_AVAILABLE: - import rarfile - - rf = rarfile.RarFile(output_path) - rf.extractall(output_path_extracted) - rf.close() - else: - raise EnvironmentError("Please pip install rarfile") - - -def extract_xz(output_path, output_path_extracted): - os.rmdir(output_path_extracted) - with lzma.open(output_path) as compressed_file: - with open(output_path_extracted, "wb") as extracted_file: - shutil.copyfileobj(compressed_file, extracted_file) - - -def extract_zipfile(output_path, output_path_extracted): - with ZipFile(output_path, "r") as zip_file: - zip_file.extractall(output_path_extracted) - zip_file.close() - - -def extract_gzip(output_path, output_path_extracted): - os.rmdir(output_path_extracted) - with gzip.open(output_path, "rb") as gzip_file: - with open(output_path_extracted, "wb") as extracted_file: - shutil.copyfileobj(gzip_file, extracted_file) - - -def extract_tarfile(output_path, output_path_extracted): - tar_file = tarfile.open(output_path) - tar_file.extractall(output_path_extracted) - tar_file.close() + return False + @staticmethod + def extract_rarfile(output_path, output_path_extracted): + if config.RARFILE_AVAILABLE: + import rarfile -def is_tarfile(output_path): - return tarfile.is_tarfile(output_path) + rf = rarfile.RarFile(output_path) + rf.extractall(output_path_extracted) + rf.close() + else: + raise EnvironmentError("Please pip install rarfile") def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str: @@ -709,42 +759,6 @@ def _resumable_file_manager(): return cache_path -def is_gzip(path: str) -> bool: - """from https://stackoverflow.com/a/60634210""" - with gzip.open(path, "r") as fh: - try: - fh.read(1) - return True - except OSError: - return False - - -def is_xz(path: str) -> bool: - """https://tukaani.org/xz/xz-file-format-1.0.4.txt""" - with open(path, "rb") as f: - try: - header_magic_bytes = f.read(6) - except OSError: - return False - if header_magic_bytes == b"\xfd7zXZ\x00": - return True - else: - return False - - -def is_rarfile(path: str) -> bool: - """https://github.com/markokr/rarfile/blob/master/rarfile.py""" - RAR_ID = b"Rar!\x1a\x07\x00" - RAR5_ID = b"Rar!\x1a\x07\x01\x00" - - with open(path, "rb", 1024) as fd: - buf = fd.read(len(RAR5_ID)) - if buf.startswith(RAR_ID) or buf.startswith(RAR5_ID): - return True - else: - return False - - def add_start_docstrings(*docstr): def docstring_decorator(fn): fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") From 652f47bfaff580d2b3efd9531ef25309e97ef750 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 30 Apr 2021 20:01:30 +0200 Subject: [PATCH 05/23] Rename extract method and input/output path params --- src/datasets/utils/file_utils.py | 53 +++++++++++++++++--------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index d9a1bbb8a60..0252f16f9de 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -23,7 +23,8 @@ from pathlib import Path from typing import Dict, Optional, Union from urllib.parse import urlparse -from zipfile import ZipFile, is_zipfile as _is_zipfile +from zipfile import ZipFile +from zipfile import is_zipfile as _is_zipfile import numpy as np import posixpath @@ -338,15 +339,17 @@ def extract(output_path, output_path_extracted): shutil.rmtree(output_path_extracted, ignore_errors=True) os.makedirs(output_path_extracted, exist_ok=True) if TarExtractor.is_tarfile(output_path): - TarExtractor.extract_tarfile(output_path, output_path_extracted) + TarExtractor.extract(output_path, output_path_extracted) elif GzipExtractor.is_gzip(output_path): - GzipExtractor.extract_gzip(output_path, output_path_extracted) - elif ZipExtractor.is_zipfile(output_path): # put zip file to the last, b/c it is possible wrongly detected as zip - ZipExtractor.extract_zipfile(output_path, output_path_extracted) + GzipExtractor.extract(output_path, output_path_extracted) + elif ZipExtractor.is_zipfile( + output_path + ): # put zip file to the last, b/c it is possible wrongly detected as zip + ZipExtractor.extract(output_path, output_path_extracted) elif XzExtractor.is_xz(output_path): - XzExtractor.extract_xz(output_path, output_path_extracted) + XzExtractor.extract(output_path, output_path_extracted) elif RarExtractor.is_rarfile(output_path): - RarExtractor.extract_rarfile(output_path, output_path_extracted) + RarExtractor.extract(output_path, output_path_extracted) else: raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) @@ -355,10 +358,11 @@ class TarExtractor: @staticmethod def is_tarfile(path): return tarfile.is_tarfile(path) + @staticmethod - def extract_tarfile(output_path, output_path_extracted): - tar_file = tarfile.open(output_path) - tar_file.extractall(output_path_extracted) + def extract(input_path, output_path): + tar_file = tarfile.open(input_path) + tar_file.extractall(output_path) tar_file.close() @@ -372,11 +376,12 @@ def is_gzip(path: str) -> bool: return True except OSError: return False + @staticmethod - def extract_gzip(output_path, output_path_extracted): - os.rmdir(output_path_extracted) - with gzip.open(output_path, "rb") as gzip_file: - with open(output_path_extracted, "wb") as extracted_file: + def extract(input_path, output_path): + os.rmdir(output_path) + with gzip.open(input_path, "rb") as gzip_file: + with open(output_path, "wb") as extracted_file: shutil.copyfileobj(gzip_file, extracted_file) @@ -386,9 +391,9 @@ def is_zipfile(path): return _is_zipfile(path) @staticmethod - def extract_zipfile(output_path, output_path_extracted): - with ZipFile(output_path, "r") as zip_file: - zip_file.extractall(output_path_extracted) + def extract(input_path, output_path): + with ZipFile(input_path, "r") as zip_file: + zip_file.extractall(output_path) zip_file.close() @@ -407,10 +412,10 @@ def is_xz(path: str) -> bool: return False @staticmethod - def extract_xz(output_path, output_path_extracted): - os.rmdir(output_path_extracted) - with lzma.open(output_path) as compressed_file: - with open(output_path_extracted, "wb") as extracted_file: + def extract(input_path, output_path): + os.rmdir(output_path) + with lzma.open(input_path) as compressed_file: + with open(output_path, "wb") as extracted_file: shutil.copyfileobj(compressed_file, extracted_file) @@ -429,12 +434,12 @@ def is_rarfile(path: str) -> bool: return False @staticmethod - def extract_rarfile(output_path, output_path_extracted): + def extract(input_path, output_path): if config.RARFILE_AVAILABLE: import rarfile - rf = rarfile.RarFile(output_path) - rf.extractall(output_path_extracted) + rf = rarfile.RarFile(input_path) + rf.extractall(output_path) rf.close() else: raise EnvironmentError("Please pip install rarfile") From 1e3e6e2b029bd6f7fff5c3288ff47a7d068813bf Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 30 Apr 2021 20:09:00 +0200 Subject: [PATCH 06/23] Rename is_extractable method --- src/datasets/utils/file_utils.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 0252f16f9de..765bab2935a 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -305,11 +305,11 @@ def cached_path( return output_path if ( - not ZipExtractor.is_zipfile(output_path) - and not TarExtractor.is_tarfile(output_path) - and not GzipExtractor.is_gzip(output_path) - and not XzExtractor.is_xz(output_path) - and not RarExtractor.is_rarfile(output_path) + not ZipExtractor.is_extractable(output_path) + and not TarExtractor.is_extractable(output_path) + and not GzipExtractor.is_extractable(output_path) + and not XzExtractor.is_extractable(output_path) + and not RarExtractor.is_extractable(output_path) ): return output_path @@ -338,17 +338,17 @@ def extract(output_path, output_path_extracted): with FileLock(lock_path): shutil.rmtree(output_path_extracted, ignore_errors=True) os.makedirs(output_path_extracted, exist_ok=True) - if TarExtractor.is_tarfile(output_path): + if TarExtractor.is_extractable(output_path): TarExtractor.extract(output_path, output_path_extracted) - elif GzipExtractor.is_gzip(output_path): + elif GzipExtractor.is_extractable(output_path): GzipExtractor.extract(output_path, output_path_extracted) - elif ZipExtractor.is_zipfile( + elif ZipExtractor.is_extractable( output_path ): # put zip file to the last, b/c it is possible wrongly detected as zip ZipExtractor.extract(output_path, output_path_extracted) - elif XzExtractor.is_xz(output_path): + elif XzExtractor.is_extractable(output_path): XzExtractor.extract(output_path, output_path_extracted) - elif RarExtractor.is_rarfile(output_path): + elif RarExtractor.is_extractable(output_path): RarExtractor.extract(output_path, output_path_extracted) else: raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) @@ -356,7 +356,7 @@ def extract(output_path, output_path_extracted): class TarExtractor: @staticmethod - def is_tarfile(path): + def is_extractable(path): return tarfile.is_tarfile(path) @staticmethod @@ -368,7 +368,7 @@ def extract(input_path, output_path): class GzipExtractor: @staticmethod - def is_gzip(path: str) -> bool: + def is_extractable(path: str) -> bool: """from https://stackoverflow.com/a/60634210""" with gzip.open(path, "r") as fh: try: @@ -387,7 +387,7 @@ def extract(input_path, output_path): class ZipExtractor: @staticmethod - def is_zipfile(path): + def is_extractable(path): return _is_zipfile(path) @staticmethod @@ -399,7 +399,7 @@ def extract(input_path, output_path): class XzExtractor: @staticmethod - def is_xz(path: str) -> bool: + def is_extractable(path: str) -> bool: """https://tukaani.org/xz/xz-file-format-1.0.4.txt""" with open(path, "rb") as f: try: @@ -421,7 +421,7 @@ def extract(input_path, output_path): class RarExtractor: @staticmethod - def is_rarfile(path: str) -> bool: + def is_extractable(path: str) -> bool: """https://github.com/markokr/rarfile/blob/master/rarfile.py""" RAR_ID = b"Rar!\x1a\x07\x00" RAR5_ID = b"Rar!\x1a\x07\x01\x00" From 56dcea7391d5d8e9224e08003e9c0dd6c3c657b2 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 30 Apr 2021 20:41:35 +0200 Subject: [PATCH 07/23] Create generic Extractor.is_extractable --- src/datasets/utils/file_utils.py | 48 +++++++++++++++++--------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 765bab2935a..7c554813159 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -304,13 +304,7 @@ def cached_path( if not download_config.extract_compressed_file or output_path is None: return output_path - if ( - not ZipExtractor.is_extractable(output_path) - and not TarExtractor.is_extractable(output_path) - and not GzipExtractor.is_extractable(output_path) - and not XzExtractor.is_extractable(output_path) - and not RarExtractor.is_extractable(output_path) - ): + if not Extractor.is_extractable(output_path): return output_path # Path where we extract compressed archives @@ -332,26 +326,36 @@ def cached_path( class Extractor: @staticmethod - def extract(output_path, output_path_extracted): + def is_extractable(path): + return ( + ZipExtractor.is_extractable(path) + or TarExtractor.is_extractable(path) + or GzipExtractor.is_extractable(path) + or XzExtractor.is_extractable(path) + or RarExtractor.is_extractable(path) + ) + + @staticmethod + def extract(input_path, output_path): # Prevent parallel extractions - lock_path = output_path + ".lock" + lock_path = input_path + ".lock" with FileLock(lock_path): - shutil.rmtree(output_path_extracted, ignore_errors=True) - os.makedirs(output_path_extracted, exist_ok=True) - if TarExtractor.is_extractable(output_path): - TarExtractor.extract(output_path, output_path_extracted) - elif GzipExtractor.is_extractable(output_path): - GzipExtractor.extract(output_path, output_path_extracted) + shutil.rmtree(output_path, ignore_errors=True) + os.makedirs(output_path, exist_ok=True) + if TarExtractor.is_extractable(input_path): + TarExtractor.extract(input_path, output_path) + elif GzipExtractor.is_extractable(input_path): + GzipExtractor.extract(input_path, output_path) elif ZipExtractor.is_extractable( - output_path + input_path ): # put zip file to the last, b/c it is possible wrongly detected as zip - ZipExtractor.extract(output_path, output_path_extracted) - elif XzExtractor.is_extractable(output_path): - XzExtractor.extract(output_path, output_path_extracted) - elif RarExtractor.is_extractable(output_path): - RarExtractor.extract(output_path, output_path_extracted) + ZipExtractor.extract(input_path, output_path) + elif XzExtractor.is_extractable(input_path): + XzExtractor.extract(input_path, output_path) + elif RarExtractor.is_extractable(input_path): + RarExtractor.extract(input_path, output_path) else: - raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) + raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) class TarExtractor: From 75e27ee919d6124cdfd6437ac7fc355bc5bb91b2 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 3 May 2021 10:59:35 +0200 Subject: [PATCH 08/23] Create class attribute extractors --- src/datasets/utils/file_utils.py | 57 +++++++++++++------------------- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 7c554813159..b8a9005e88b 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -324,40 +324,6 @@ def cached_path( return output_path_extracted -class Extractor: - @staticmethod - def is_extractable(path): - return ( - ZipExtractor.is_extractable(path) - or TarExtractor.is_extractable(path) - or GzipExtractor.is_extractable(path) - or XzExtractor.is_extractable(path) - or RarExtractor.is_extractable(path) - ) - - @staticmethod - def extract(input_path, output_path): - # Prevent parallel extractions - lock_path = input_path + ".lock" - with FileLock(lock_path): - shutil.rmtree(output_path, ignore_errors=True) - os.makedirs(output_path, exist_ok=True) - if TarExtractor.is_extractable(input_path): - TarExtractor.extract(input_path, output_path) - elif GzipExtractor.is_extractable(input_path): - GzipExtractor.extract(input_path, output_path) - elif ZipExtractor.is_extractable( - input_path - ): # put zip file to the last, b/c it is possible wrongly detected as zip - ZipExtractor.extract(input_path, output_path) - elif XzExtractor.is_extractable(input_path): - XzExtractor.extract(input_path, output_path) - elif RarExtractor.is_extractable(input_path): - RarExtractor.extract(input_path, output_path) - else: - raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) - - class TarExtractor: @staticmethod def is_extractable(path): @@ -449,6 +415,29 @@ def extract(input_path, output_path): raise EnvironmentError("Please pip install rarfile") +class Extractor: + # Put zip file to the last, b/c it is possible wrongly detected as zip (I guess it means: as tar or gzip) + extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor] + + @classmethod + def is_extractable(cls, path): + return any(extractor.is_extractable() for extractor in cls.extractors) + + @classmethod + def extract(cls, input_path, output_path): + if not cls.is_extractable(input_path): + raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) + # Prevent parallel extractions + lock_path = input_path + ".lock" + with FileLock(lock_path): + shutil.rmtree(output_path, ignore_errors=True) + os.makedirs(output_path, exist_ok=True) + for extractor in cls.extractors: + if extractor.is_extractable(input_path): + extractor.extract(input_path, output_path) + break + + def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str: ua = "datasets/{}; python/{}".format(__version__, config.PY_VERSION) ua += "; pyarrow/{}".format(pa.__version__) From 7b9359f3eb9ee4304e4a1ada36b4ff8f8ce72628 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 3 May 2021 11:07:57 +0200 Subject: [PATCH 09/23] Move extract functionality to extract module --- src/datasets/utils/extract.py | 124 +++++++++++++++++++++++++++++++ src/datasets/utils/file_utils.py | 120 +----------------------------- 2 files changed, 125 insertions(+), 119 deletions(-) create mode 100644 src/datasets/utils/extract.py diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py new file mode 100644 index 00000000000..0e03caf9912 --- /dev/null +++ b/src/datasets/utils/extract.py @@ -0,0 +1,124 @@ +import gzip +import lzma +import os +import shutil +import tarfile +from zipfile import ZipFile +from zipfile import is_zipfile as _is_zipfile + +from datasets import config +from datasets.utils.filelock import FileLock + + +class TarExtractor: + @staticmethod + def is_extractable(path): + return tarfile.is_tarfile(path) + + @staticmethod + def extract(input_path, output_path): + tar_file = tarfile.open(input_path) + tar_file.extractall(output_path) + tar_file.close() + + +class GzipExtractor: + @staticmethod + def is_extractable(path: str) -> bool: + """from https://stackoverflow.com/a/60634210""" + with gzip.open(path, "r") as fh: + try: + fh.read(1) + return True + except OSError: + return False + + @staticmethod + def extract(input_path, output_path): + os.rmdir(output_path) + with gzip.open(input_path, "rb") as gzip_file: + with open(output_path, "wb") as extracted_file: + shutil.copyfileobj(gzip_file, extracted_file) + + +class ZipExtractor: + @staticmethod + def is_extractable(path): + return _is_zipfile(path) + + @staticmethod + def extract(input_path, output_path): + with ZipFile(input_path, "r") as zip_file: + zip_file.extractall(output_path) + zip_file.close() + + +class XzExtractor: + @staticmethod + def is_extractable(path: str) -> bool: + """https://tukaani.org/xz/xz-file-format-1.0.4.txt""" + with open(path, "rb") as f: + try: + header_magic_bytes = f.read(6) + except OSError: + return False + if header_magic_bytes == b"\xfd7zXZ\x00": + return True + else: + return False + + @staticmethod + def extract(input_path, output_path): + os.rmdir(output_path) + with lzma.open(input_path) as compressed_file: + with open(output_path, "wb") as extracted_file: + shutil.copyfileobj(compressed_file, extracted_file) + + +class RarExtractor: + @staticmethod + def is_extractable(path: str) -> bool: + """https://github.com/markokr/rarfile/blob/master/rarfile.py""" + RAR_ID = b"Rar!\x1a\x07\x00" + RAR5_ID = b"Rar!\x1a\x07\x01\x00" + + with open(path, "rb", 1024) as fd: + buf = fd.read(len(RAR5_ID)) + if buf.startswith(RAR_ID) or buf.startswith(RAR5_ID): + return True + else: + return False + + @staticmethod + def extract(input_path, output_path): + if config.RARFILE_AVAILABLE: + import rarfile + + rf = rarfile.RarFile(input_path) + rf.extractall(output_path) + rf.close() + else: + raise EnvironmentError("Please pip install rarfile") + + +class Extractor: + # Put zip file to the last, b/c it is possible wrongly detected as zip (I guess it means: as tar or gzip) + extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor] + + @classmethod + def is_extractable(cls, path): + return any(extractor.is_extractable() for extractor in cls.extractors) + + @classmethod + def extract(cls, input_path, output_path): + if not cls.is_extractable(input_path): + raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) + # Prevent parallel extractions + lock_path = input_path + ".lock" + with FileLock(lock_path): + shutil.rmtree(output_path, ignore_errors=True) + os.makedirs(output_path, exist_ok=True) + for extractor in cls.extractors: + if extractor.is_extractable(input_path): + extractor.extract(input_path, output_path) + break diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index b8a9005e88b..83399c10cec 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -5,14 +5,11 @@ """ import copy -import gzip import json -import lzma import os import re import shutil import sys -import tarfile import tempfile import time import urllib @@ -23,8 +20,6 @@ from pathlib import Path from typing import Dict, Optional, Union from urllib.parse import urlparse -from zipfile import ZipFile -from zipfile import is_zipfile as _is_zipfile import numpy as np import posixpath @@ -33,6 +28,7 @@ from tqdm.auto import tqdm from .. import __version__, config +from .extract import Extractor from .filelock import FileLock from .logging import WARNING, get_logger @@ -324,120 +320,6 @@ def cached_path( return output_path_extracted -class TarExtractor: - @staticmethod - def is_extractable(path): - return tarfile.is_tarfile(path) - - @staticmethod - def extract(input_path, output_path): - tar_file = tarfile.open(input_path) - tar_file.extractall(output_path) - tar_file.close() - - -class GzipExtractor: - @staticmethod - def is_extractable(path: str) -> bool: - """from https://stackoverflow.com/a/60634210""" - with gzip.open(path, "r") as fh: - try: - fh.read(1) - return True - except OSError: - return False - - @staticmethod - def extract(input_path, output_path): - os.rmdir(output_path) - with gzip.open(input_path, "rb") as gzip_file: - with open(output_path, "wb") as extracted_file: - shutil.copyfileobj(gzip_file, extracted_file) - - -class ZipExtractor: - @staticmethod - def is_extractable(path): - return _is_zipfile(path) - - @staticmethod - def extract(input_path, output_path): - with ZipFile(input_path, "r") as zip_file: - zip_file.extractall(output_path) - zip_file.close() - - -class XzExtractor: - @staticmethod - def is_extractable(path: str) -> bool: - """https://tukaani.org/xz/xz-file-format-1.0.4.txt""" - with open(path, "rb") as f: - try: - header_magic_bytes = f.read(6) - except OSError: - return False - if header_magic_bytes == b"\xfd7zXZ\x00": - return True - else: - return False - - @staticmethod - def extract(input_path, output_path): - os.rmdir(output_path) - with lzma.open(input_path) as compressed_file: - with open(output_path, "wb") as extracted_file: - shutil.copyfileobj(compressed_file, extracted_file) - - -class RarExtractor: - @staticmethod - def is_extractable(path: str) -> bool: - """https://github.com/markokr/rarfile/blob/master/rarfile.py""" - RAR_ID = b"Rar!\x1a\x07\x00" - RAR5_ID = b"Rar!\x1a\x07\x01\x00" - - with open(path, "rb", 1024) as fd: - buf = fd.read(len(RAR5_ID)) - if buf.startswith(RAR_ID) or buf.startswith(RAR5_ID): - return True - else: - return False - - @staticmethod - def extract(input_path, output_path): - if config.RARFILE_AVAILABLE: - import rarfile - - rf = rarfile.RarFile(input_path) - rf.extractall(output_path) - rf.close() - else: - raise EnvironmentError("Please pip install rarfile") - - -class Extractor: - # Put zip file to the last, b/c it is possible wrongly detected as zip (I guess it means: as tar or gzip) - extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor] - - @classmethod - def is_extractable(cls, path): - return any(extractor.is_extractable() for extractor in cls.extractors) - - @classmethod - def extract(cls, input_path, output_path): - if not cls.is_extractable(input_path): - raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) - # Prevent parallel extractions - lock_path = input_path + ".lock" - with FileLock(lock_path): - shutil.rmtree(output_path, ignore_errors=True) - os.makedirs(output_path, exist_ok=True) - for extractor in cls.extractors: - if extractor.is_extractable(input_path): - extractor.extract(input_path, output_path) - break - - def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str: ua = "datasets/{}; python/{}".format(__version__, config.PY_VERSION) ua += "; pyarrow/{}".format(pa.__version__) From 4f1bf02042f803e76d7513c4b19e0388d970e198 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 3 May 2021 11:25:45 +0200 Subject: [PATCH 10/23] Fix Extractor.is_extractable --- src/datasets/utils/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 0e03caf9912..6f41791681f 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -107,7 +107,7 @@ class Extractor: @classmethod def is_extractable(cls, path): - return any(extractor.is_extractable() for extractor in cls.extractors) + return any(extractor.is_extractable(path) for extractor in cls.extractors) @classmethod def extract(cls, input_path, output_path): From a36663b7f1e683e2045d03f25c9cbfd3186fa0ad Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 3 May 2021 12:54:48 +0200 Subject: [PATCH 11/23] Create ExtractManager with all extract logic --- src/datasets/utils/extract.py | 34 ++++++++++++++++++++++++++++++++ src/datasets/utils/file_utils.py | 26 +++++++----------------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 6f41791681f..d64edbb2429 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -3,13 +3,47 @@ import os import shutil import tarfile +from dataclasses import dataclass from zipfile import ZipFile from zipfile import is_zipfile as _is_zipfile from datasets import config +from datasets.utils.file_utils import hash_url_to_filename from datasets.utils.filelock import FileLock +@dataclass +class ExtractConfig: + extract_compressed_file: bool = False + force_extract: bool = False + + +class ExtractManager: + def __init__(self, cache_dir=None): + self.cache_dir = cache_dir + self.extractor = Extractor + + def _get_outout_path(self, path): + # Path where we extract compressed archives + # We extract in the cache dir, and get the extracted path name by hashing the original path" + abs_path = os.path.abspath(path) + return os.path.join(self.cache_dir, "extracted", hash_url_to_filename(abs_path)) + + def _do_extract(self, output_path, force_extract): + return force_extract or ( + not os.path.isfile(output_path) + and not (os.path.isdir(output_path) and os.listdir(output_path)) + ) + + def extract(self, input_path, force_extract=False): + output_path = input_path + if self.extractor.is_extractable(input_path): + output_path = self._get_outout_path(input_path) + if self._do_extract(output_path, force_extract): + self.extractor.extract(input_path, output_path) + return output_path + + class TarExtractor: @staticmethod def is_extractable(path): diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 83399c10cec..78fc00203f5 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -28,7 +28,7 @@ from tqdm.auto import tqdm from .. import __version__, config -from .extract import Extractor +from .extract import ExtractManager from .filelock import FileLock from .logging import WARNING, get_logger @@ -297,27 +297,15 @@ def cached_path( # Something unknown raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) - if not download_config.extract_compressed_file or output_path is None: + if output_path is None: return output_path - if not Extractor.is_extractable(output_path): - return output_path - - # Path where we extract compressed archives - # We extract in the cache dir, and get the extracted path name by hashing the original path" - abs_output_path = os.path.abspath(output_path) - output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path)) - - if ( - os.path.isdir(output_path_extracted) - and os.listdir(output_path_extracted) - and not download_config.force_extract - ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract): - return output_path_extracted - - Extractor.extract(output_path, output_path_extracted) + if download_config.extract_compressed_file: + output_path = ExtractManager(cache_dir=cache_dir).extract( + output_path, force_extract=download_config.force_extract + ) - return output_path_extracted + return output_path def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str: From 104f0f75e20c0c99bacf88b6b1b5dfa25848d6f6 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 3 May 2021 13:02:10 +0200 Subject: [PATCH 12/23] Fix circular import --- src/datasets/utils/extract.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index d64edbb2429..252243ad333 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -8,7 +8,6 @@ from zipfile import is_zipfile as _is_zipfile from datasets import config -from datasets.utils.file_utils import hash_url_to_filename from datasets.utils.filelock import FileLock @@ -24,6 +23,8 @@ def __init__(self, cache_dir=None): self.extractor = Extractor def _get_outout_path(self, path): + from datasets.utils.file_utils import hash_url_to_filename + # Path where we extract compressed archives # We extract in the cache dir, and get the extracted path name by hashing the original path" abs_path = os.path.abspath(path) @@ -31,8 +32,7 @@ def _get_outout_path(self, path): def _do_extract(self, output_path, force_extract): return force_extract or ( - not os.path.isfile(output_path) - and not (os.path.isdir(output_path) and os.listdir(output_path)) + not os.path.isfile(output_path) and not (os.path.isdir(output_path) and os.listdir(output_path)) ) def extract(self, input_path, force_extract=False): From 9f57f5256641d73c9cbb25800be62c8e287d510e Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 3 May 2021 13:22:00 +0200 Subject: [PATCH 13/23] Fix typo --- src/datasets/utils/extract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 252243ad333..77b47c655c0 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -22,7 +22,7 @@ def __init__(self, cache_dir=None): self.cache_dir = cache_dir self.extractor = Extractor - def _get_outout_path(self, path): + def _get_output_path(self, path): from datasets.utils.file_utils import hash_url_to_filename # Path where we extract compressed archives @@ -38,7 +38,7 @@ def _do_extract(self, output_path, force_extract): def extract(self, input_path, force_extract=False): output_path = input_path if self.extractor.is_extractable(input_path): - output_path = self._get_outout_path(input_path) + output_path = self._get_output_path(input_path) if self._do_extract(output_path, force_extract): self.extractor.extract(input_path, output_path) return output_path From ad59f5d512745d14535d2847d5c844f71c1cc256 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 3 May 2021 13:25:51 +0200 Subject: [PATCH 14/23] Remove unused class --- src/datasets/utils/extract.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 77b47c655c0..77af7f4f221 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -11,12 +11,6 @@ from datasets.utils.filelock import FileLock -@dataclass -class ExtractConfig: - extract_compressed_file: bool = False - force_extract: bool = False - - class ExtractManager: def __init__(self, cache_dir=None): self.cache_dir = cache_dir From 7ef66c98529eb7960d01aa82c89554b7a4981579 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 3 May 2021 13:29:43 +0200 Subject: [PATCH 15/23] Fix style --- src/datasets/utils/extract.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 77af7f4f221..6551a30e9cc 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -3,7 +3,6 @@ import os import shutil import tarfile -from dataclasses import dataclass from zipfile import ZipFile from zipfile import is_zipfile as _is_zipfile From e62ac10c81302d483d05024d4f90d3c2d428f63f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 5 Jul 2021 14:35:19 +0200 Subject: [PATCH 16/23] Fix issues after merge upstream master --- src/datasets/utils/extract.py | 7 +++++-- src/datasets/utils/file_utils.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index f7d78bfce44..c2fd86fd3e0 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -13,7 +13,9 @@ class ExtractManager: def __init__(self, cache_dir=None): - self.cache_dir = cache_dir + self.extract_dir = ( + os.path.join(cache_dir, config.EXTRACTED_DATASETS_DIR) if cache_dir else config.EXTRACTED_DATASETS_PATH + ) self.extractor = Extractor def _get_output_path(self, path): @@ -22,7 +24,7 @@ def _get_output_path(self, path): # Path where we extract compressed archives # We extract in the cache dir, and get the extracted path name by hashing the original path" abs_path = os.path.abspath(path) - return os.path.join(self.cache_dir, "extracted", hash_url_to_filename(abs_path)) + return os.path.join(self.extract_dir, hash_url_to_filename(abs_path)) def _do_extract(self, output_path, force_extract): return force_extract or ( @@ -145,6 +147,7 @@ def is_extractable(path: str) -> bool: @staticmethod def extract(input_path: str, output_path: str): + os.rmdir(output_path) if not config.ZSTANDARD_AVAILABLE: raise EnvironmentError("Please pip install zstandard") import zstandard as zstd diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index fdbd5fca4dd..8b2dbdf5528 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -300,7 +300,7 @@ def cached_path( return output_path if download_config.extract_compressed_file: - output_path = ExtractManager(cache_dir=cache_dir).extract( + output_path = ExtractManager(cache_dir=download_config.cache_dir).extract( output_path, force_extract=download_config.force_extract ) From 8bd53c43674f0b04602b013933f60a7c91df48ec Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 5 Jul 2021 15:13:47 +0200 Subject: [PATCH 17/23] Remove default os.makedirs and os.rmdir when not applicable --- src/datasets/utils/extract.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index c2fd86fd3e0..43aaebc56f5 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -47,6 +47,7 @@ def is_extractable(path): @staticmethod def extract(input_path, output_path): + os.makedirs(output_path, exist_ok=True) tar_file = tarfile.open(input_path) tar_file.extractall(output_path) tar_file.close() @@ -65,7 +66,6 @@ def is_extractable(path: str) -> bool: @staticmethod def extract(input_path, output_path): - os.rmdir(output_path) with gzip.open(input_path, "rb") as gzip_file: with open(output_path, "wb") as extracted_file: shutil.copyfileobj(gzip_file, extracted_file) @@ -78,6 +78,7 @@ def is_extractable(path): @staticmethod def extract(input_path, output_path): + os.makedirs(output_path, exist_ok=True) with ZipFile(input_path, "r") as zip_file: zip_file.extractall(output_path) zip_file.close() @@ -99,7 +100,6 @@ def is_extractable(path: str) -> bool: @staticmethod def extract(input_path, output_path): - os.rmdir(output_path) with lzma.open(input_path) as compressed_file: with open(output_path, "wb") as extracted_file: shutil.copyfileobj(compressed_file, extracted_file) @@ -121,14 +121,14 @@ def is_extractable(path: str) -> bool: @staticmethod def extract(input_path, output_path): - if config.RARFILE_AVAILABLE: - import rarfile - - rf = rarfile.RarFile(input_path) - rf.extractall(output_path) - rf.close() - else: + if not config.RARFILE_AVAILABLE: raise EnvironmentError("Please pip install rarfile") + import rarfile + + os.makedirs(output_path, exist_ok=True) + rf = rarfile.RarFile(input_path) + rf.extractall(output_path) + rf.close() class ZstdExtractor: @@ -147,7 +147,6 @@ def is_extractable(path: str) -> bool: @staticmethod def extract(input_path: str, output_path: str): - os.rmdir(output_path) if not config.ZSTANDARD_AVAILABLE: raise EnvironmentError("Please pip install zstandard") import zstandard as zstd @@ -173,7 +172,6 @@ def extract(cls, input_path, output_path): lock_path = input_path + ".lock" with FileLock(lock_path): shutil.rmtree(output_path, ignore_errors=True) - os.makedirs(output_path, exist_ok=True) for extractor in cls.extractors: if extractor.is_extractable(input_path): extractor.extract(input_path, output_path) From 70c33448dc2b86d0ab2d8d8de96c3b972b409ce2 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 5 Jul 2021 16:14:56 +0200 Subject: [PATCH 18/23] Create parent dirs of output_path --- src/datasets/utils/extract.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 43aaebc56f5..838dbd3de3b 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -172,6 +172,7 @@ def extract(cls, input_path, output_path): lock_path = input_path + ".lock" with FileLock(lock_path): shutil.rmtree(output_path, ignore_errors=True) + os.makedirs(os.path.dirname(output_path), exist_ok=True) for extractor in cls.extractors: if extractor.is_extractable(input_path): extractor.extract(input_path, output_path) From f6739700c0c2903d97460d96e4f6bb240585a60f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 5 Jul 2021 16:45:26 +0200 Subject: [PATCH 19/23] Minor refactoring of ExtractManager --- src/datasets/utils/extract.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 838dbd3de3b..e6f589c5861 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -32,11 +32,14 @@ def _do_extract(self, output_path, force_extract): ) def extract(self, input_path, force_extract=False): - output_path = input_path - if self.extractor.is_extractable(input_path): - output_path = self._get_output_path(input_path) - if self._do_extract(output_path, force_extract): + if not self.extractor.is_extractable(input_path): + return input_path + output_path = self._get_output_path(input_path) + if self._do_extract(output_path, force_extract): + try: self.extractor.extract(input_path, output_path) + except Exception: + raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) return output_path @@ -166,8 +169,6 @@ def is_extractable(cls, path): @classmethod def extract(cls, input_path, output_path): - if not cls.is_extractable(input_path): - raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) # Prevent parallel extractions lock_path = input_path + ".lock" with FileLock(lock_path): From 57921306aa6eef7f083108115d65de41be686474 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 5 Jul 2021 16:59:41 +0200 Subject: [PATCH 20/23] Optimize Extractor.extract by returning specific extractor --- src/datasets/utils/extract.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index e6f589c5861..432aa8f3a26 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -32,12 +32,13 @@ def _do_extract(self, output_path, force_extract): ) def extract(self, input_path, force_extract=False): - if not self.extractor.is_extractable(input_path): + is_extractable, extractor = self.extractor.is_extractable(input_path, return_extractor=True) + if not is_extractable: return input_path output_path = self._get_output_path(input_path) if self._do_extract(output_path, force_extract): try: - self.extractor.extract(input_path, output_path) + extractor.extract(input_path, output_path) except Exception: raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) return output_path @@ -164,16 +165,21 @@ class Extractor: extractors = [TarExtractor, GzipExtractor, ZipExtractor, XzExtractor, RarExtractor, ZstdExtractor] @classmethod - def is_extractable(cls, path): - return any(extractor.is_extractable(path) for extractor in cls.extractors) + def is_extractable(cls, path, return_extractor=False): + for extractor in cls.extractors: + if extractor.is_extractable(path): + return True if not return_extractor else (True, extractor) + return False if not return_extractor else (False, None) @classmethod - def extract(cls, input_path, output_path): + def extract(cls, input_path, output_path, extractor=None): # Prevent parallel extractions lock_path = input_path + ".lock" with FileLock(lock_path): shutil.rmtree(output_path, ignore_errors=True) os.makedirs(os.path.dirname(output_path), exist_ok=True) + if extractor: + return extractor.extract(input_path, output_path) for extractor in cls.extractors: if extractor.is_extractable(input_path): extractor.extract(input_path, output_path) From 7126a1d9f5fce77e6716b44482b3cdb66e94f35b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 5 Jul 2021 17:13:32 +0200 Subject: [PATCH 21/23] Fix extract --- src/datasets/utils/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index 432aa8f3a26..d03b11783de 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -38,7 +38,7 @@ def extract(self, input_path, force_extract=False): output_path = self._get_output_path(input_path) if self._do_extract(output_path, force_extract): try: - extractor.extract(input_path, output_path) + self.extractor.extract(input_path, output_path, extractor=extractor) except Exception: raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) return output_path From 834ecc35db89029bbae2f38a1a03d7dd08495be1 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 5 Jul 2021 18:46:57 +0200 Subject: [PATCH 22/23] Test extract and add gzip to test_cached_path_extract --- tests/test_extract.py | 50 ++++++++++++++++++++++++++++++++++++++++ tests/test_file_utils.py | 29 +++++++++-------------- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/tests/test_extract.py b/tests/test_extract.py index e69de29bb2d..909ddf83895 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -0,0 +1,50 @@ +import pytest +import zstandard as zstd + +from datasets.utils.extract import Extractor, ZstdExtractor + + +FILE_CONTENT = """\ + Text data. + Second line of data.""" + + +@pytest.fixture(scope="session") +def zstd_path(tmp_path_factory): + path = tmp_path_factory.mktemp("data") / "file.zstd" + data = bytes(FILE_CONTENT, "utf-8") + with zstd.open(path, "wb") as f: + f.write(data) + return path + + +def test_zstd_extractor(zstd_path, tmp_path, text_file): + input_path = zstd_path + assert ZstdExtractor.is_extractable(input_path) + output_path = str(tmp_path / "extracted.txt") + ZstdExtractor.extract(input_path, output_path) + with open(output_path) as f: + extracted_file_content = f.read() + with open(text_file) as f: + expected_file_content = f.read() + assert extracted_file_content == expected_file_content + + +@pytest.mark.parametrize( + "compression_format, expected_text_path_name", [("gzip", "text_path"), ("xz", "text_file"), ("zstd", "text_file")] +) +def test_extractor( + compression_format, expected_text_path_name, text_gz_path, xz_file, zstd_path, tmp_path, text_file, text_path +): + input_paths = {"gzip": text_gz_path, "xz": xz_file, "zstd": zstd_path} + input_path = str(input_paths[compression_format]) + output_path = str(tmp_path / "extracted.txt") + assert Extractor.is_extractable(input_path) + Extractor.extract(input_path, output_path) + with open(output_path) as f: + extracted_file_content = f.read() + expected_text_paths = {"text_file": text_file, "text_path": text_path} + expected_text_path = str(expected_text_paths[expected_text_path_name]) + with open(expected_text_path) as f: + expected_file_content = f.read() + assert extracted_file_content == expected_file_content diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py index edfd776599f..05b7baad00c 100644 --- a/tests/test_file_utils.py +++ b/tests/test_file_utils.py @@ -7,7 +7,6 @@ import pytest import zstandard as zstd -from datasets.utils.extract import ZstdExtractor from datasets.utils.file_utils import ( DownloadConfig, OfflineModeIsEnabled, @@ -88,28 +87,22 @@ def gen_random_output(): self.assertGreater(np.abs(out1 - out3).sum(), 0) -def test_zstd_extractor(zstd_path, tmp_path, text_file): - input_path = zstd_path - assert ZstdExtractor.is_extractable(input_path) - output_path = str(tmp_path / "extracted.txt") - ZstdExtractor.extract(input_path, output_path) - with open(output_path) as f: - extracted_file_content = f.read() - with open(text_file) as f: - expected_file_content = f.read() - assert extracted_file_content == expected_file_content - - -@pytest.mark.parametrize("compression_format", ["xz", "zstd"]) -def test_cached_path_extract(compression_format, xz_file, zstd_path, tmp_path, text_file): - path = {"xz": xz_file, "zstd": zstd_path} - input_path = path[compression_format] +@pytest.mark.parametrize( + "compression_format, expected_text_path_name", [("gzip", "text_path"), ("xz", "text_file"), ("zstd", "text_file")] +) +def test_cached_path_extract( + compression_format, expected_text_path_name, text_gz_path, xz_file, zstd_path, tmp_path, text_file, text_path +): + input_paths = {"gzip": text_gz_path, "xz": xz_file, "zstd": zstd_path} + input_path = str(input_paths[compression_format]) cache_dir = tmp_path / "cache" download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True) extracted_path = cached_path(input_path, download_config=download_config) with open(extracted_path) as f: extracted_file_content = f.read() - with open(text_file) as f: + expected_text_paths = {"text_file": text_file, "text_path": text_path} + expected_text_path = str(expected_text_paths[expected_text_path_name]) + with open(expected_text_path) as f: expected_file_content = f.read() assert extracted_file_content == expected_file_content From 006dfe20ab9021b1c9dff91f6a753363ae89391e Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 7 Jul 2021 17:55:16 +0200 Subject: [PATCH 23/23] Address requested changes --- src/datasets/utils/extract.py | 8 ++------ tests/conftest.py | 11 +++++++++++ tests/test_file_utils.py | 14 ++++---------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index d03b11783de..c4dc0d4ddd5 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -37,10 +37,7 @@ def extract(self, input_path, force_extract=False): return input_path output_path = self._get_output_path(input_path) if self._do_extract(output_path, force_extract): - try: - self.extractor.extract(input_path, output_path, extractor=extractor) - except Exception: - raise EnvironmentError("Archive format of {} could not be identified".format(input_path)) + self.extractor.extract(input_path, output_path, extractor=extractor) return output_path @@ -182,5 +179,4 @@ def extract(cls, input_path, output_path, extractor=None): return extractor.extract(input_path, output_path) for extractor in cls.extractors: if extractor.is_extractable(input_path): - extractor.extract(input_path, output_path) - break + return extractor.extract(input_path, output_path) diff --git a/tests/conftest.py b/tests/conftest.py index 3e416a35d98..cd42ad1d712 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -87,6 +87,17 @@ def xz_file(tmp_path_factory): return filename +@pytest.fixture(scope="session") +def gz_path(tmp_path_factory, text_path): + import gzip + + path = str(tmp_path_factory.mktemp("data") / "file.gz") + data = bytes(FILE_CONTENT, "utf-8") + with gzip.open(path, "wb") as f: + f.write(data) + return path + + @pytest.fixture(scope="session") def xml_file(tmp_path_factory): filename = tmp_path_factory.mktemp("data") / "file.xml" diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py index 05b7baad00c..283b8b5a3ed 100644 --- a/tests/test_file_utils.py +++ b/tests/test_file_utils.py @@ -87,22 +87,16 @@ def gen_random_output(): self.assertGreater(np.abs(out1 - out3).sum(), 0) -@pytest.mark.parametrize( - "compression_format, expected_text_path_name", [("gzip", "text_path"), ("xz", "text_file"), ("zstd", "text_file")] -) -def test_cached_path_extract( - compression_format, expected_text_path_name, text_gz_path, xz_file, zstd_path, tmp_path, text_file, text_path -): - input_paths = {"gzip": text_gz_path, "xz": xz_file, "zstd": zstd_path} +@pytest.mark.parametrize("compression_format", ["gzip", "xz", "zstd"]) +def test_cached_path_extract(compression_format, gz_path, xz_file, zstd_path, tmp_path, text_file): + input_paths = {"gzip": gz_path, "xz": xz_file, "zstd": zstd_path} input_path = str(input_paths[compression_format]) cache_dir = tmp_path / "cache" download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True) extracted_path = cached_path(input_path, download_config=download_config) with open(extracted_path) as f: extracted_file_content = f.read() - expected_text_paths = {"text_file": text_file, "text_path": text_path} - expected_text_path = str(expected_text_paths[expected_text_path_name]) - with open(expected_text_path) as f: + with open(text_file) as f: expected_file_content = f.read() assert extracted_file_content == expected_file_content