From dfb8d41c7fc7ffade08cac5a0b820f2de4e55064 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sat, 12 Jun 2021 07:29:12 +0200 Subject: [PATCH 1/4] Test extracted datasets path --- tests/conftest.py | 14 ++++++++------ tests/test_file_utils.py | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4f110012a69..69d7e613c75 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,12 +15,14 @@ def set_test_cache_config(tmp_path_factory, monkeypatch): # test_hf_cache_home = tmp_path_factory.mktemp("cache") # TODO: why a cache dir per test function does not work? test_hf_cache_home = tmp_path_factory.getbasetemp() / "cache" - test_hf_datasets_cache = str(test_hf_cache_home / "datasets") - test_hf_metrics_cache = str(test_hf_cache_home / "metrics") - test_hf_modules_cache = str(test_hf_cache_home / "modules") - monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", test_hf_datasets_cache) - monkeypatch.setattr("datasets.config.HF_METRICS_CACHE", test_hf_metrics_cache) - monkeypatch.setattr("datasets.config.HF_MODULES_CACHE", test_hf_modules_cache) + test_hf_datasets_cache = test_hf_cache_home / "datasets" + test_hf_metrics_cache = test_hf_cache_home / "metrics" + test_hf_modules_cache = test_hf_cache_home / "modules" + monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", str(test_hf_datasets_cache)) + monkeypatch.setattr("datasets.config.HF_METRICS_CACHE", str(test_hf_metrics_cache)) + monkeypatch.setattr("datasets.config.HF_MODULES_CACHE", str(test_hf_modules_cache)) + test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted" + monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path)) FILE_CONTENT = """\ diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py index 978d80c7742..aedc5985099 100644 --- a/tests/test_file_utils.py +++ b/tests/test_file_utils.py @@ -84,6 +84,31 @@ def test_cached_path_extract(xz_file, tmp_path, text_file): assert extracted_file_content == expected_file_content +@pytest.mark.parametrize("default_extracted", [True, False]) +@pytest.mark.parametrize("default_cache_dir", [True, False]) +def test_extracted_datasets_path( + default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch +): + custom_cache_dir = "custom_cache" + custom_extracted_dir = "custom_extracted_dir" + custom_extracted_path = tmp_path / "custom_extracted_path" + if default_extracted: + expected = ("downloads" if default_cache_dir else custom_cache_dir, "extracted") + else: + monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_DIR", custom_extracted_dir) + monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(custom_extracted_path)) + expected = custom_extracted_path.parts[-2:] if default_cache_dir else (custom_cache_dir, custom_extracted_dir) + + filename = xz_file + download_config = ( + DownloadConfig(extract_compressed_file=True) + if default_cache_dir + else DownloadConfig(cache_dir=tmp_path / custom_cache_dir, extract_compressed_file=True) + ) + extracted_file_path = cached_path(filename, download_config=download_config) + assert Path(extracted_file_path).parent.parts[-2:] == expected + + def test_cached_path_local(text_file): # absolute path text_file = str(Path(text_file).resolve()) From 2a961c2a6d59c107865f7fe946c78be879e25688 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sat, 12 Jun 2021 07:35:18 +0200 Subject: [PATCH 2/4] Set configurable extracted datasets path --- src/datasets/config.py | 4 ++++ src/datasets/utils/file_utils.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/datasets/config.py b/src/datasets/config.py index 582481efd31..2845c62d18a 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -128,6 +128,10 @@ DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules") HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE)) +EXTRACTED_DATASETS_DIR = "extracted" +DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, "downloads", EXTRACTED_DATASETS_DIR) +EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH)) + # Batch size constants. For more info, see: # https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations) DEFAULT_MAX_BATCH_SIZE = 10_000 diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 1c37a469659..e6fcd0d8347 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -312,9 +312,15 @@ def cached_path( return output_path # Path where we extract compressed archives - # We extract in the cache dir, and get the extracted path name by hashing the original path" + # We extract in the cache dir, and get the extracted path name by hashing the original path abs_output_path = os.path.abspath(output_path) - output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path)) + output_path_extracted = ( + os.path.join( + download_config.cache_dir, config.EXTRACTED_DATASETS_DIR, hash_url_to_filename(abs_output_path) + ) + if download_config.cache_dir + else os.path.join(config.EXTRACTED_DATASETS_PATH, hash_url_to_filename(abs_output_path)) + ) if ( os.path.isdir(output_path_extracted) From 48d38bd03c8e6db411f496144516d4438537eaf9 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sat, 12 Jun 2021 07:52:34 +0200 Subject: [PATCH 3/4] Fix style --- tests/test_file_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py index aedc5985099..38052eea26b 100644 --- a/tests/test_file_utils.py +++ b/tests/test_file_utils.py @@ -86,9 +86,7 @@ def test_cached_path_extract(xz_file, tmp_path, text_file): @pytest.mark.parametrize("default_extracted", [True, False]) @pytest.mark.parametrize("default_cache_dir", [True, False]) -def test_extracted_datasets_path( - default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch -): +def test_extracted_datasets_path(default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch): custom_cache_dir = "custom_cache" custom_extracted_dir = "custom_extracted_dir" custom_extracted_path = tmp_path / "custom_extracted_path" From d7eb2510bcf7e9a6e722316b88316cdf61019564 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 14 Jun 2021 10:45:47 +0200 Subject: [PATCH 4/4] Use DEFAULT_DOWNLOADED_DATASETS_PATH to define DEFAULT_EXTRACTED_DATASETS_PATH --- src/datasets/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/config.py b/src/datasets/config.py index 938d36af67e..9ed22ea3760 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -133,7 +133,7 @@ DOWNLOADED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_DATASETS_PATH", DEFAULT_DOWNLOADED_DATASETS_PATH)) EXTRACTED_DATASETS_DIR = "extracted" -DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, "downloads", EXTRACTED_DATASETS_DIR) +DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR) EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH)) # Batch size constants. For more info, see: