From e0218d7567b2b19a5dbdd4edb4911d992383f10d Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 16 Jul 2021 15:08:28 +0200 Subject: [PATCH 1/7] Improve prepare_module messages --- src/datasets/load.py | 36 ++++++++++++++++++++-------------- src/datasets/utils/py_utils.py | 11 ++++++++++- tests/test_py_utils.py | 2 ++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index f9bbad0f04c..3ab6794fead 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -29,6 +29,8 @@ import fsspec +from datasets.utils.py_utils import rel_to_abs_path + from . import config from .arrow_dataset import Dataset from .builder import DatasetBuilder @@ -246,7 +248,7 @@ def prepare_module( script_version (Optional ``Union[str, datasets.Version]``): If specified, the module will be loaded from the datasets repository at this version. By default: - - it is set to the local version fo the lib. + - it is set to the local version of the lib. - it will also try to load it from the master branch if it's not available at the local version fo the lib. Specifying a version that is different from your local version of the lib might cause compatibility issues. download_config (Optional ``datasets.DownloadConfig``: specific download configuration parameters. @@ -308,9 +310,13 @@ def prepare_module( # - if path is a file or a remote url # - otherwise we assume path/name is a path to our S3 bucket combined_path = os.path.join(path, name) + + combined_path_abs = rel_to_abs_path(combined_path) + path_abs = rel_to_abs_path(path) + if os.path.isfile(combined_path): file_path = combined_path - local_path = file_path + local_path = combined_path elif os.path.isfile(path): file_path = path local_path = path @@ -336,18 +342,18 @@ def prepare_module( try: local_path = cached_path(file_path, download_config=download_config) logger.warning( - "Couldn't find file locally at {}, or remotely at {}.\n" + "Couldn't find file locally at {} or at {}, or remotely at {}.\n" "The file was picked from the master branch on github instead at {}.".format( - combined_path, github_file_path, file_path + combined_path_abs, path_abs, github_file_path, file_path ) ) except FileNotFoundError: raise FileNotFoundError( - "Couldn't find file locally at {}, or remotely at {}.\n" + "Couldn't find file locally at {} or at {}, or remotely at {}.\n" "The file is also not present on the master branch on github.".format( - combined_path, github_file_path + combined_path_abs, path_abs, github_file_path ) - ) + ) from None elif path.count("/") == 1: # users datasets/metrics: s3 path (hub for datasets and s3 for metrics) if dataset: file_path = hf_hub_url(path=path, name=name, version=script_version) @@ -357,17 +363,17 @@ def prepare_module( local_path = cached_path(file_path, download_config=download_config) except FileNotFoundError: raise FileNotFoundError( - "Couldn't find file locally at {}, or remotely at {}. Please provide a valid {} name".format( - combined_path, file_path, "dataset" if dataset else "metric" + "Couldn't find file locally at {} or at {}, or remotely at {}. Please provide a valid {} name".format( + combined_path_abs, path_abs, file_path, "dataset" if dataset else "metric" ) - ) + ) from None else: raise FileNotFoundError( - "Couldn't find file locally at {}. Please provide a valid {} name".format( - combined_path, "dataset" if dataset else "metric" + "Couldn't find file locally at {} or at {}. Please provide a valid {} name".format( + combined_path_abs, path_abs, "dataset" if dataset else "metric" ) ) - except Exception as e: # noqa: all the attempts failed, before raising the error we should check if the module already exists. + except: # noqa: all the attempts failed, before raising the error we should check if the module already exists. if os.path.isdir(main_folder_path): hashes = [h for h in os.listdir(main_folder_path) if len(h) == 64] if hashes: @@ -382,7 +388,7 @@ def _get_modification_time(module_hash): logger.warning( f"Using the latest cached version of the module from {os.path.join(main_folder_path, hash)} " f"(last modified on {time.ctime(_get_modification_time(hash))}) since it " - f"couldn't be found locally at {combined_path} or remotely ({type(e).__name__})." + f"couldn't be found locally at {combined_path_abs} or at {path_abs}, or remotely." ) if return_resolved_file_path: with open(os.path.join(main_folder_path, hash, short_name + ".json")) as cache_metadata: @@ -421,7 +427,7 @@ def _get_modification_time(module_hash): f"Error in {module_type} script at {file_path}, importing relative {import_name} module " f"but {import_name} is the name of the {module_type} script. " f"Please change relative import {import_name} to another name and add a '# From: URL_OR_PATH' " - f"comment pointing to the original realtive import file path." + f"comment pointing to the original relative import file path." ) if import_type == "internal": url_or_filename = url_or_path_join(base_path, import_path + ".py") diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py index f4cf169a4ab..c1df67a7216 100644 --- a/src/datasets/utils/py_utils.py +++ b/src/datasets/utils/py_utils.py @@ -27,14 +27,17 @@ import types from io import BytesIO as StringIO from multiprocessing import Pool, RLock +from pathlib import Path from shutil import disk_usage from types import CodeType, FunctionType -from typing import Callable, ClassVar, Generic, Optional, Tuple, Union +from typing import Callable, ClassVar, Generic, Optional, Tuple, TypeVar, Union import dill import numpy as np from tqdm import tqdm +from datasets.utils.typing import PathLike + from .logging import INFO, WARNING, get_logger, get_verbosity, set_verbosity_warning @@ -80,6 +83,12 @@ def size_str(size_in_bytes): return "{} {}".format(int(size_in_bytes), "bytes") +def rel_to_abs_path(path: PathLike) -> PathLike: + """Convert relative path to absolute path.""" + abs_path_str = os.path.abspath(os.path.expanduser(os.path.expandvars(str(path)))) + return type(path)(abs_path_str) + + @contextlib.contextmanager def temporary_assignment(obj, attr, value): """Temporarily assign obj.attr to value.""" diff --git a/tests/test_py_utils.py b/tests/test_py_utils.py index f01188552f5..a93f88fdc4a 100644 --- a/tests/test_py_utils.py +++ b/tests/test_py_utils.py @@ -1,3 +1,4 @@ +import os from unittest import TestCase import numpy as np @@ -7,6 +8,7 @@ NestedDataStructure, flatten_nest_dict, map_nested, + rel_to_abs_path, temporary_assignment, zip_dict, zip_nested, From e8d9bd4887ed6c939f01139beb56caff912efc2e Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 20 Jul 2021 00:45:05 +0200 Subject: [PATCH 2/7] Move rel_to_abs_path from py_utils to file_utils --- src/datasets/load.py | 17 ++++++++--------- src/datasets/utils/__init__.py | 2 +- src/datasets/utils/file_utils.py | 10 +++++++++- src/datasets/utils/filelock.py | 4 +++- src/datasets/utils/py_utils.py | 11 +---------- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 3ab6794fead..1bbdd320669 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -29,8 +29,6 @@ import fsspec -from datasets.utils.py_utils import rel_to_abs_path - from . import config from .arrow_dataset import Dataset from .builder import DatasetBuilder @@ -51,6 +49,7 @@ hf_github_url, hf_hub_url, init_hf_modules, + relative_to_absolute_path, url_or_path_join, url_or_path_parent, ) @@ -311,9 +310,6 @@ def prepare_module( # - otherwise we assume path/name is a path to our S3 bucket combined_path = os.path.join(path, name) - combined_path_abs = rel_to_abs_path(combined_path) - path_abs = rel_to_abs_path(path) - if os.path.isfile(combined_path): file_path = combined_path local_path = combined_path @@ -322,6 +318,9 @@ def prepare_module( local_path = path else: # Try github (canonical datasets/metrics) and then S3 (users datasets/metrics) + + combined_path_abs = relative_to_absolute_path(combined_path) + path_abs = relative_to_absolute_path(path) try: head_hf_s3(path, filename=name, dataset=dataset, max_retries=download_config.max_retries) script_version = str(script_version) if script_version is not None else None @@ -332,10 +331,10 @@ def prepare_module( except FileNotFoundError: if script_version is not None: raise FileNotFoundError( - "Couldn't find remote file with version {} at {}. Please provide a valid version and a valid {} name".format( + "Couldn't find remote file with version {} at {}. Please provide a valid version and a valid {} name.".format( script_version, file_path, "dataset" if dataset else "metric" ) - ) + ) from None else: github_file_path = file_path file_path = hf_github_url(path=path, name=name, dataset=dataset, version="master") @@ -363,13 +362,13 @@ def prepare_module( local_path = cached_path(file_path, download_config=download_config) except FileNotFoundError: raise FileNotFoundError( - "Couldn't find file locally at {} or at {}, or remotely at {}. Please provide a valid {} name".format( + "Couldn't find file locally at {} or at {}, or remotely at {}. Please provide a valid {} name.".format( combined_path_abs, path_abs, file_path, "dataset" if dataset else "metric" ) ) from None else: raise FileNotFoundError( - "Couldn't find file locally at {} or at {}. Please provide a valid {} name".format( + "Couldn't find file locally at {} or at {}. Please provide a valid {} name.".format( combined_path_abs, path_abs, "dataset" if dataset else "metric" ) ) diff --git a/src/datasets/utils/__init__.py b/src/datasets/utils/__init__.py index 623f4c955b6..f2d85472d01 100644 --- a/src/datasets/utils/__init__.py +++ b/src/datasets/utils/__init__.py @@ -19,7 +19,7 @@ from . import logging from .download_manager import DownloadManager, GenerateMode -from .file_utils import DownloadConfig, cached_path, hf_bucket_url, is_remote_url, temp_seed +from .file_utils import DownloadConfig, cached_path, hf_bucket_url, is_remote_url, relative_to_absolute_path, temp_seed from .mock_download_manager import MockDownloadManager from .py_utils import ( NonMutableDict, diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 47a1f02686d..7cbf4b2a869 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -18,7 +18,7 @@ from functools import partial from hashlib import sha256 from pathlib import Path -from typing import Dict, Optional, Union +from typing import Dict, Optional, TypeVar, Union from urllib.parse import urlparse import numpy as np @@ -36,6 +36,8 @@ INCOMPLETE_SUFFIX = ".incomplete" +T = TypeVar("T", str, Path) + def init_hf_modules(hf_modules_cache: Optional[Union[Path, str]] = None) -> str: """ @@ -126,6 +128,12 @@ def is_relative_path(url_or_filename: str) -> bool: return urlparse(url_or_filename).scheme == "" and not os.path.isabs(url_or_filename) +def relative_to_absolute_path(path: T) -> T: + """Convert relative path to absolute path.""" + abs_path_str = os.path.abspath(os.path.expanduser(os.path.expandvars(str(path)))) + return Path(abs_path_str) if isinstance(path, Path) else abs_path_str + + def hf_bucket_url(identifier: str, filename: str, use_cdn=False, dataset=True) -> str: if dataset: endpoint = config.CLOUDFRONT_DATASETS_DISTRIB_PREFIX if use_cdn else config.S3_DATASETS_BUCKET_PREFIX diff --git a/src/datasets/utils/filelock.py b/src/datasets/utils/filelock.py index fe4452dcb78..66e64a529ea 100644 --- a/src/datasets/utils/filelock.py +++ b/src/datasets/utils/filelock.py @@ -354,7 +354,9 @@ class WindowsFileLock(BaseFileLock): def __init__(self, lock_file, timeout=-1, max_filename_length=255): super().__init__(lock_file, timeout=timeout, max_filename_length=max_filename_length) - self._lock_file = "\\\\?\\" + os.path.abspath(os.path.expanduser(os.path.expandvars(self._lock_file))) + from .file_utils import relative_to_absolute_path + + self._lock_file = "\\\\?\\" + relative_to_absolute_path(self.lock_file) def _acquire(self): open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py index c1df67a7216..f4cf169a4ab 100644 --- a/src/datasets/utils/py_utils.py +++ b/src/datasets/utils/py_utils.py @@ -27,17 +27,14 @@ import types from io import BytesIO as StringIO from multiprocessing import Pool, RLock -from pathlib import Path from shutil import disk_usage from types import CodeType, FunctionType -from typing import Callable, ClassVar, Generic, Optional, Tuple, TypeVar, Union +from typing import Callable, ClassVar, Generic, Optional, Tuple, Union import dill import numpy as np from tqdm import tqdm -from datasets.utils.typing import PathLike - from .logging import INFO, WARNING, get_logger, get_verbosity, set_verbosity_warning @@ -83,12 +80,6 @@ def size_str(size_in_bytes): return "{} {}".format(int(size_in_bytes), "bytes") -def rel_to_abs_path(path: PathLike) -> PathLike: - """Convert relative path to absolute path.""" - abs_path_str = os.path.abspath(os.path.expanduser(os.path.expandvars(str(path)))) - return type(path)(abs_path_str) - - @contextlib.contextmanager def temporary_assignment(obj, attr, value): """Temporarily assign obj.attr to value.""" From 28875e2c3dd3d943feb296ee0aa548989998f236 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 20 Jul 2021 00:58:58 +0200 Subject: [PATCH 3/7] Reorder lines in filelock --- src/datasets/utils/filelock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/filelock.py b/src/datasets/utils/filelock.py index 66e64a529ea..d440348e7c2 100644 --- a/src/datasets/utils/filelock.py +++ b/src/datasets/utils/filelock.py @@ -353,9 +353,9 @@ class WindowsFileLock(BaseFileLock): """ def __init__(self, lock_file, timeout=-1, max_filename_length=255): - super().__init__(lock_file, timeout=timeout, max_filename_length=max_filename_length) from .file_utils import relative_to_absolute_path + super().__init__(lock_file, timeout=timeout, max_filename_length=max_filename_length) self._lock_file = "\\\\?\\" + relative_to_absolute_path(self.lock_file) def _acquire(self): From 917bdc7ae925d971398a65d7ec1f9b6ba3239cfa Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 20 Jul 2021 17:07:30 +0200 Subject: [PATCH 4/7] Remove 'from None' in prepare_module --- src/datasets/load.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 1bbdd320669..4d69a0a3f04 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -334,7 +334,7 @@ def prepare_module( "Couldn't find remote file with version {} at {}. Please provide a valid version and a valid {} name.".format( script_version, file_path, "dataset" if dataset else "metric" ) - ) from None + ) else: github_file_path = file_path file_path = hf_github_url(path=path, name=name, dataset=dataset, version="master") @@ -352,7 +352,7 @@ def prepare_module( "The file is also not present on the master branch on github.".format( combined_path_abs, path_abs, github_file_path ) - ) from None + ) elif path.count("/") == 1: # users datasets/metrics: s3 path (hub for datasets and s3 for metrics) if dataset: file_path = hf_hub_url(path=path, name=name, version=script_version) @@ -365,7 +365,7 @@ def prepare_module( "Couldn't find file locally at {} or at {}, or remotely at {}. Please provide a valid {} name.".format( combined_path_abs, path_abs, file_path, "dataset" if dataset else "metric" ) - ) from None + ) else: raise FileNotFoundError( "Couldn't find file locally at {} or at {}. Please provide a valid {} name.".format( From 0505a2c0273391c72da1be0c8086ee9636f37947 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 20 Jul 2021 17:10:39 +0200 Subject: [PATCH 5/7] Return FileNotFoundError notice --- src/datasets/load.py | 4 ++-- tests/test_py_utils.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 4d69a0a3f04..77a5c6434e8 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -372,7 +372,7 @@ def prepare_module( combined_path_abs, path_abs, "dataset" if dataset else "metric" ) ) - except: # noqa: all the attempts failed, before raising the error we should check if the module already exists. + except Exception as e: # noqa: all the attempts failed, before raising the error we should check if the module already exists. if os.path.isdir(main_folder_path): hashes = [h for h in os.listdir(main_folder_path) if len(h) == 64] if hashes: @@ -387,7 +387,7 @@ def _get_modification_time(module_hash): logger.warning( f"Using the latest cached version of the module from {os.path.join(main_folder_path, hash)} " f"(last modified on {time.ctime(_get_modification_time(hash))}) since it " - f"couldn't be found locally at {combined_path_abs} or at {path_abs}, or remotely." + f"couldn't be found locally at {combined_path_abs} or at {path_abs}, or remotely ({type(e).__name__})." ) if return_resolved_file_path: with open(os.path.join(main_folder_path, hash, short_name + ".json")) as cache_metadata: diff --git a/tests/test_py_utils.py b/tests/test_py_utils.py index a93f88fdc4a..f01188552f5 100644 --- a/tests/test_py_utils.py +++ b/tests/test_py_utils.py @@ -1,4 +1,3 @@ -import os from unittest import TestCase import numpy as np @@ -8,7 +7,6 @@ NestedDataStructure, flatten_nest_dict, map_nested, - rel_to_abs_path, temporary_assignment, zip_dict, zip_nested, From 47b05aff93d992c44ef77304807fb1a78f4c9670 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 20 Jul 2021 20:39:42 +0200 Subject: [PATCH 6/7] Update tests --- tests/test_load.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_load.py b/tests/test_load.py index 68c6c87f27c..cc690dcf800 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1,5 +1,6 @@ import importlib import os +import re import shutil import tempfile import time @@ -224,7 +225,10 @@ def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory assert "Using the latest cached version of the module" in caplog.text with pytest.raises(FileNotFoundError) as exc_info: datasets.load_dataset("_dummy") - assert "at " + os.path.join("_dummy", "_dummy.py") in str(exc_info.value) + m_combined_path = re.search(fr"\S*{re.escape(os.path.join('_dummy', '_dummy.py'))}\b", str(exc_info.value)) + assert m_combined_path is not None and os.path.isabs(m_combined_path.group()) + m_path = re.search(r"\S*_dummy\b", str(exc_info.value)) + assert m_path is not None and os.path.isabs(m_path.group()) @require_streaming From 815bfe8bd6962c4e31a803f5c19006d5db30f1d6 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 22 Jul 2021 15:28:03 +0200 Subject: [PATCH 7/7] show combined path only once --- src/datasets/load.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 77a5c6434e8..c28335ecb7c 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -308,7 +308,7 @@ def prepare_module( # - if os.path.join(path, name) is a file or a remote url # - if path is a file or a remote url # - otherwise we assume path/name is a path to our S3 bucket - combined_path = os.path.join(path, name) + combined_path = path if path.endswith(name) else os.path.join(path, name) if os.path.isfile(combined_path): file_path = combined_path @@ -320,7 +320,6 @@ def prepare_module( # Try github (canonical datasets/metrics) and then S3 (users datasets/metrics) combined_path_abs = relative_to_absolute_path(combined_path) - path_abs = relative_to_absolute_path(path) try: head_hf_s3(path, filename=name, dataset=dataset, max_retries=download_config.max_retries) script_version = str(script_version) if script_version is not None else None @@ -341,16 +340,16 @@ def prepare_module( try: local_path = cached_path(file_path, download_config=download_config) logger.warning( - "Couldn't find file locally at {} or at {}, or remotely at {}.\n" + "Couldn't find file locally at {}, or remotely at {}.\n" "The file was picked from the master branch on github instead at {}.".format( - combined_path_abs, path_abs, github_file_path, file_path + combined_path_abs, github_file_path, file_path ) ) except FileNotFoundError: raise FileNotFoundError( - "Couldn't find file locally at {} or at {}, or remotely at {}.\n" + "Couldn't find file locally at {}, or remotely at {}.\n" "The file is also not present on the master branch on github.".format( - combined_path_abs, path_abs, github_file_path + combined_path_abs, github_file_path ) ) elif path.count("/") == 1: # users datasets/metrics: s3 path (hub for datasets and s3 for metrics) @@ -362,14 +361,14 @@ def prepare_module( local_path = cached_path(file_path, download_config=download_config) except FileNotFoundError: raise FileNotFoundError( - "Couldn't find file locally at {} or at {}, or remotely at {}. Please provide a valid {} name.".format( - combined_path_abs, path_abs, file_path, "dataset" if dataset else "metric" + "Couldn't find file locally at {}, or remotely at {}. Please provide a valid {} name.".format( + combined_path_abs, file_path, "dataset" if dataset else "metric" ) ) else: raise FileNotFoundError( - "Couldn't find file locally at {} or at {}. Please provide a valid {} name.".format( - combined_path_abs, path_abs, "dataset" if dataset else "metric" + "Couldn't find file locally at {}. Please provide a valid {} name.".format( + combined_path_abs, "dataset" if dataset else "metric" ) ) except Exception as e: # noqa: all the attempts failed, before raising the error we should check if the module already exists. @@ -387,7 +386,7 @@ def _get_modification_time(module_hash): logger.warning( f"Using the latest cached version of the module from {os.path.join(main_folder_path, hash)} " f"(last modified on {time.ctime(_get_modification_time(hash))}) since it " - f"couldn't be found locally at {combined_path_abs} or at {path_abs}, or remotely ({type(e).__name__})." + f"couldn't be found locally at {combined_path_abs}, or remotely ({type(e).__name__})." ) if return_resolved_file_path: with open(os.path.join(main_folder_path, hash, short_name + ".json")) as cache_metadata: