From dfb8d41c7fc7ffade08cac5a0b820f2de4e55064 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Sat, 12 Jun 2021 07:29:12 +0200
Subject: [PATCH 1/4] Test extracted datasets path

---
 tests/conftest.py        | 14 ++++++++------
 tests/test_file_utils.py | 25 +++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 4f110012a69..69d7e613c75 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -15,12 +15,14 @@
 def set_test_cache_config(tmp_path_factory, monkeypatch):
     # test_hf_cache_home = tmp_path_factory.mktemp("cache")  # TODO: why a cache dir per test function does not work?
     test_hf_cache_home = tmp_path_factory.getbasetemp() / "cache"
-    test_hf_datasets_cache = str(test_hf_cache_home / "datasets")
-    test_hf_metrics_cache = str(test_hf_cache_home / "metrics")
-    test_hf_modules_cache = str(test_hf_cache_home / "modules")
-    monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", test_hf_datasets_cache)
-    monkeypatch.setattr("datasets.config.HF_METRICS_CACHE", test_hf_metrics_cache)
-    monkeypatch.setattr("datasets.config.HF_MODULES_CACHE", test_hf_modules_cache)
+    test_hf_datasets_cache = test_hf_cache_home / "datasets"
+    test_hf_metrics_cache = test_hf_cache_home / "metrics"
+    test_hf_modules_cache = test_hf_cache_home / "modules"
+    monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", str(test_hf_datasets_cache))
+    monkeypatch.setattr("datasets.config.HF_METRICS_CACHE", str(test_hf_metrics_cache))
+    monkeypatch.setattr("datasets.config.HF_MODULES_CACHE", str(test_hf_modules_cache))
+    test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted"
+    monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path))
 
 
 FILE_CONTENT = """\
diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py
index 978d80c7742..aedc5985099 100644
--- a/tests/test_file_utils.py
+++ b/tests/test_file_utils.py
@@ -84,6 +84,31 @@ def test_cached_path_extract(xz_file, tmp_path, text_file):
     assert extracted_file_content == expected_file_content
 
 
+@pytest.mark.parametrize("default_extracted", [True, False])
+@pytest.mark.parametrize("default_cache_dir", [True, False])
+def test_extracted_datasets_path(
+    default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch
+):
+    custom_cache_dir = "custom_cache"
+    custom_extracted_dir = "custom_extracted_dir"
+    custom_extracted_path = tmp_path / "custom_extracted_path"
+    if default_extracted:
+        expected = ("downloads" if default_cache_dir else custom_cache_dir, "extracted")
+    else:
+        monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_DIR", custom_extracted_dir)
+        monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(custom_extracted_path))
+        expected = custom_extracted_path.parts[-2:] if default_cache_dir else (custom_cache_dir, custom_extracted_dir)
+
+    filename = xz_file
+    download_config = (
+        DownloadConfig(extract_compressed_file=True)
+        if default_cache_dir
+        else DownloadConfig(cache_dir=tmp_path / custom_cache_dir, extract_compressed_file=True)
+    )
+    extracted_file_path = cached_path(filename, download_config=download_config)
+    assert Path(extracted_file_path).parent.parts[-2:] == expected
+
+
 def test_cached_path_local(text_file):
     # absolute path
     text_file = str(Path(text_file).resolve())

From 2a961c2a6d59c107865f7fe946c78be879e25688 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Sat, 12 Jun 2021 07:35:18 +0200
Subject: [PATCH 2/4] Set configurable extracted datasets path

---
 src/datasets/config.py           |  4 ++++
 src/datasets/utils/file_utils.py | 10 ++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/datasets/config.py b/src/datasets/config.py
index 582481efd31..2845c62d18a 100644
--- a/src/datasets/config.py
+++ b/src/datasets/config.py
@@ -128,6 +128,10 @@
 DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules")
 HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE))
 
+EXTRACTED_DATASETS_DIR = "extracted"
+DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, "downloads", EXTRACTED_DATASETS_DIR)
+EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))
+
 # Batch size constants. For more info, see:
 # https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations)
 DEFAULT_MAX_BATCH_SIZE = 10_000
diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index 1c37a469659..e6fcd0d8347 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -312,9 +312,15 @@ def cached_path(
             return output_path
 
         # Path where we extract compressed archives
-        # We extract in the cache dir, and get the extracted path name by hashing the original path"
+        # Extract in the cache dir (or config.EXTRACTED_DATASETS_PATH) and name it by hashing the original path
         abs_output_path = os.path.abspath(output_path)
-        output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path))
+        output_path_extracted = (
+            os.path.join(
+                download_config.cache_dir, config.EXTRACTED_DATASETS_DIR, hash_url_to_filename(abs_output_path)
+            )
+            if download_config.cache_dir
+            else os.path.join(config.EXTRACTED_DATASETS_PATH, hash_url_to_filename(abs_output_path))
+        )
 
         if (
             os.path.isdir(output_path_extracted)

From 48d38bd03c8e6db411f496144516d4438537eaf9 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Sat, 12 Jun 2021 07:52:34 +0200
Subject: [PATCH 3/4] Fix style

---
 tests/test_file_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py
index aedc5985099..38052eea26b 100644
--- a/tests/test_file_utils.py
+++ b/tests/test_file_utils.py
@@ -86,9 +86,7 @@ def test_cached_path_extract(xz_file, tmp_path, text_file):
 
 @pytest.mark.parametrize("default_extracted", [True, False])
 @pytest.mark.parametrize("default_cache_dir", [True, False])
-def test_extracted_datasets_path(
-    default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch
-):
+def test_extracted_datasets_path(default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch):
     custom_cache_dir = "custom_cache"
     custom_extracted_dir = "custom_extracted_dir"
     custom_extracted_path = tmp_path / "custom_extracted_path"

From d7eb2510bcf7e9a6e722316b88316cdf61019564 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 14 Jun 2021 10:45:47 +0200
Subject: [PATCH 4/4] Use DEFAULT_DOWNLOADED_DATASETS_PATH to define
 DEFAULT_EXTRACTED_DATASETS_PATH

---
 src/datasets/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/config.py b/src/datasets/config.py
index 938d36af67e..9ed22ea3760 100644
--- a/src/datasets/config.py
+++ b/src/datasets/config.py
@@ -133,7 +133,7 @@
 DOWNLOADED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_DATASETS_PATH", DEFAULT_DOWNLOADED_DATASETS_PATH))
 
 EXTRACTED_DATASETS_DIR = "extracted"
-DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, "downloads", EXTRACTED_DATASETS_DIR)
+DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
 EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))
 
 # Batch size constants. For more info, see: