
Commit 7c8106d

Skip identical files in push_to_hub instead of overwriting (#4402)
* Resume upload instead of pushing identical files
* Update tests
* Update glob
* Add test
* Use fnmatch in tests
* Add warning when resuming upload
1 parent 7103969 commit 7c8106d
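
For context, a minimal usage sketch of the behaviour this commit targets (the repository id and shard size below are made up, and an authenticated Hub login is assumed): pushing the same dataset twice should re-upload only the shards that are missing from the repository instead of overwriting identical files.

from datasets import Dataset

ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})

# First push uploads every Parquet shard under data/, with the shard fingerprint in the file name.
ds.push_to_hub("my-username/my-dataset", max_shard_size="1KB")

# A second push of identical data finds those fingerprinted shard files already on the Hub,
# logs a warning about resuming the upload, and skips re-uploading them.
ds.push_to_hub("my-username/my-dataset", max_shard_size="1KB")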

File tree

3 files changed: +158 −74 lines changed


src/datasets/arrow_dataset.py

Lines changed: 52 additions & 42 deletions
@@ -17,6 +17,7 @@

 import contextlib
 import copy
+import itertools
 import json
 import os
 import shutil
@@ -3949,60 +3950,69 @@ def shards_with_embedded_external_files(shards):
         shards = shards_with_embedded_external_files(shards)

         files = api.list_repo_files(repo_id, repo_type="dataset", revision=branch, token=token)
-        files = [file for file in files if file.startswith("data/")]
+        data_files = [file for file in files if file.startswith("data/")]

-        def path_in_repo(_index):
-            return f"data/{split}-{_index:05d}-of-{num_shards:05d}.parquet"
+        def path_in_repo(_index, shard):
+            return f"data/{split}-{_index:05d}-of-{num_shards:05d}-{shard._fingerprint}.parquet"

-        # Only delete file shards that don't currently exist. Others will be overwritten if the content is different
-        # or will be left intact is the content is identical.
-        def should_delete_file(file_name):
-            file_to_overwrite = file_name in [path_in_repo(i) for i in range(num_shards)]
-            file_from_same_split = file_name.startswith(f"data/{split}-")
+        shards_iter = iter(shards)
+        first_shard = next(shards_iter)
+        first_shard_path_in_repo = path_in_repo(0, first_shard)
+        if first_shard_path_in_repo in data_files:
+            logger.warning("Resuming upload of dataset shards")

-            return file_from_same_split and not file_to_overwrite
+        uploaded_size = 0
+        shards_path_in_repo = []
+        for index, shard in logging.tqdm(
+            enumerate(itertools.chain([first_shard], shards_iter)),
+            desc="Pushing dataset shards to the dataset hub",
+            total=num_shards,
+            disable=not logging.is_progress_bar_enabled(),
+        ):
+            shard_path_in_repo = path_in_repo(index, shard)
+            # Upload a shard only if it doesn't already exist in the repository
+            if shard_path_in_repo not in data_files:
+                buffer = BytesIO()
+                shard.to_parquet(buffer)
+                uploaded_size += buffer.tell()
+                _retry(
+                    api.upload_file,
+                    func_kwargs=dict(
+                        path_or_fileobj=buffer.getvalue(),
+                        path_in_repo=shard_path_in_repo,
+                        repo_id=repo_id,
+                        token=token,
+                        repo_type="dataset",
+                        revision=branch,
+                        identical_ok=False,
+                    ),
+                    exceptions=HTTPError,
+                    status_codes=[504],
+                    base_wait_time=2.0,
+                    max_retries=5,
+                    max_wait_time=20.0,
+                )
+            shards_path_in_repo.append(shard_path_in_repo)

-        file_shards_to_delete = [file for file in files if should_delete_file(file)]
+        # Cleanup to remove unused files
+        data_files_to_delete = [
+            data_file
+            for data_file in data_files
+            if data_file.startswith(f"data/{split}-") and data_file not in shards_path_in_repo
+        ]

         def delete_file(file):
             api.delete_file(file, repo_id=repo_id, token=token, repo_type="dataset", revision=branch)

-        if len(file_shards_to_delete):
-            for file in logging.tqdm(
-                file_shards_to_delete,
+        if len(data_files_to_delete):
+            for data_file in logging.tqdm(
+                data_files_to_delete,
                 desc="Deleting unused files from dataset repository",
-                total=len(file_shards_to_delete),
+                total=len(data_files_to_delete),
                 disable=not logging.is_progress_bar_enabled(),
             ):
-                delete_file(file)
+                delete_file(data_file)

-        uploaded_size = 0
-        for index, shard in logging.tqdm(
-            enumerate(shards),
-            desc="Pushing dataset shards to the dataset hub",
-            total=num_shards,
-            disable=not logging.is_progress_bar_enabled(),
-        ):
-            buffer = BytesIO()
-            shard.to_parquet(buffer)
-            uploaded_size += buffer.tell()
-            _retry(
-                api.upload_file,
-                func_kwargs=dict(
-                    path_or_fileobj=buffer.getvalue(),
-                    path_in_repo=path_in_repo(index),
-                    repo_id=repo_id,
-                    token=token,
-                    repo_type="dataset",
-                    revision=branch,
-                    identical_ok=True,
-                ),
-                exceptions=HTTPError,
-                status_codes=[504],
-                base_wait_time=2.0,
-                max_retries=5,
-                max_wait_time=20.0,
-            )
         return repo_id, split, uploaded_size, dataset_nbytes

     def push_to_hub(
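
As a reader's aid, the skip-and-clean-up logic in the diff above can be summarised outside the Hub context. This is a simplified sketch, not part of the datasets codebase: shard_path mirrors the fingerprint-suffixed naming scheme from the diff, while plan_push, the sample fingerprints, and the file names are invented for illustration.

from typing import List, Tuple


def shard_path(split: str, index: int, num_shards: int, fingerprint: str) -> str:
    # Mirrors the fingerprint-suffixed naming scheme introduced by the commit.
    return f"data/{split}-{index:05d}-of-{num_shards:05d}-{fingerprint}.parquet"


def plan_push(split: str, fingerprints: List[str], existing_files: List[str]) -> Tuple[List[str], List[str]]:
    """Return (shard paths to upload, stale paths to delete) for one split."""
    num_shards = len(fingerprints)
    wanted = [shard_path(split, i, fp and num_shards or num_shards, fp) for i, fp in enumerate(fingerprints)]
    wanted = [shard_path(split, i, num_shards, fp) for i, fp in enumerate(fingerprints)]
    # Upload only the shards that are not already in the repository.
    to_upload = [path for path in wanted if path not in existing_files]
    # Delete files of the same split that no longer correspond to any current shard.
    to_delete = [f for f in existing_files if f.startswith(f"data/{split}-") and f not in wanted]
    return to_upload, to_delete


if __name__ == "__main__":
    existing = ["data/train-00000-of-00002-abc123.parquet", "data/train-00001-of-00002-stale.parquet"]
    upload, delete = plan_push("train", ["abc123", "def456"], existing)
    print(upload)  # ['data/train-00001-of-00002-def456.parquet']
    print(delete)  # ['data/train-00001-of-00002-stale.parquet']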

src/datasets/data_files.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ class Url(str):
     pass


-SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"
+SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"

DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
    str(Split.TRAIN): ["**train*"],
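
The trailing * added to the pattern lets the data-files resolver accept the new fingerprint suffix while still matching the old file names. A quick, self-contained check with fnmatch (the file names are invented, and the library's actual pattern resolution is more involved than a bare fnmatch call):

import fnmatch

SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"

pattern = SPLIT_PATTERN_SHARDED.format(split="train")
print(fnmatch.fnmatch("data/train-00000-of-00002-abc123.parquet", pattern))  # True: fingerprinted shard name
print(fnmatch.fnmatch("data/train-00000-of-00002.parquet", pattern))         # True: legacy shard name still matches
print(fnmatch.fnmatch("data/test-00000-of-00002-abc123.parquet", pattern))   # False: different split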

tests/test_upstream_hub.py

Lines changed: 105 additions & 31 deletions
@@ -1,3 +1,4 @@
+import fnmatch
 import os
 import tempfile
 import time
@@ -77,7 +78,14 @@ def test_push_dataset_dict_to_hub_no_token(self):

             # Ensure that there is a single file on the repository that has the correct name
             files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
-            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
+            self.assertTrue(
+                all(
+                    fnmatch.fnmatch(file, expected_file)
+                    for file, expected_file in zip(
+                        files, [".gitattributes", "data/train-00000-of-00001-*.parquet", "dataset_infos.json"]
+                    )
+                )
+            )
         finally:
             self.cleanup_repo(ds_name)
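
Because shard file names now embed a fingerprint that varies between datasets, the tests compare against wildcard patterns rather than exact names. A stand-alone version of that check (the file list here is invented); the extra length assertion is added because zip would otherwise silently truncate when the lists differ in size:

import fnmatch

files = sorted([".gitattributes", "data/train-00000-of-00001-abc123.parquet", "dataset_infos.json"])
expected = [".gitattributes", "data/train-00000-of-00001-*.parquet", "dataset_infos.json"]

assert len(files) == len(expected)  # guard against zip() dropping unmatched entries
assert all(fnmatch.fnmatch(file, pattern) for file, pattern in zip(files, expected))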

@@ -97,7 +105,14 @@ def test_push_dataset_dict_to_hub_name_without_namespace(self):

             # Ensure that there is a single file on the repository that has the correct name
             files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
-            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
+            self.assertTrue(
+                all(
+                    fnmatch.fnmatch(file, expected_file)
+                    for file, expected_file in zip(
+                        files, [".gitattributes", "data/train-00000-of-00001-*.parquet", "dataset_infos.json"]
+                    )
+                )
+            )
         finally:
             self.cleanup_repo(ds_name)

@@ -131,7 +146,14 @@ def test_push_dataset_dict_to_hub_private(self):

             # Ensure that there is a single file on the repository that has the correct name
             files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
-            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
+            self.assertTrue(
+                all(
+                    fnmatch.fnmatch(file, expected_file)
+                    for file, expected_file in zip(
+                        files, [".gitattributes", "data/train-00000-of-00001-*.parquet", "dataset_infos.json"]
+                    )
+                )
+            )
         finally:
             self.cleanup_repo(ds_name)

@@ -151,7 +173,14 @@ def test_push_dataset_dict_to_hub(self):

             # Ensure that there is a single file on the repository that has the correct name
             files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
-            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
+            self.assertTrue(
+                all(
+                    fnmatch.fnmatch(file, expected_file)
+                    for file, expected_file in zip(
+                        files, [".gitattributes", "data/train-00000-of-00001-*.parquet", "dataset_infos.json"]
+                    )
+                )
+            )
         finally:
             self.cleanup_repo(ds_name)

@@ -171,14 +200,19 @@ def test_push_dataset_dict_to_hub_multiple_files(self):

             # Ensure that there are two files on the repository that have the correct name
             files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
-            self.assertListEqual(
-                files,
-                [
-                    ".gitattributes",
-                    "data/train-00000-of-00002.parquet",
-                    "data/train-00001-of-00002.parquet",
-                    "dataset_infos.json",
-                ],
+            self.assertTrue(
+                all(
+                    fnmatch.fnmatch(file, expected_file)
+                    for file, expected_file in zip(
+                        files,
+                        [
+                            ".gitattributes",
+                            "data/train-00000-of-00002-*.parquet",
+                            "data/train-00001-of-00002-*.parquet",
+                            "dataset_infos.json",
+                        ],
+                    )
+                )
             )
         finally:
             self.cleanup_repo(ds_name)
@@ -214,16 +248,22 @@ def test_push_dataset_dict_to_hub_overwrite_files(self):

             # Ensure that there are two files on the repository that have the correct name
             files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
-            self.assertListEqual(
-                files,
-                [
-                    ".gitattributes",
-                    "data/random-00000-of-00001.parquet",
-                    "data/train-00000-of-00002.parquet",
-                    "data/train-00001-of-00002.parquet",
-                    "datafile.txt",
-                    "dataset_infos.json",
-                ],
+
+            self.assertTrue(
+                all(
+                    fnmatch.fnmatch(file, expected_file)
+                    for file, expected_file in zip(
+                        files,
+                        [
+                            ".gitattributes",
+                            "data/random-00000-of-00001-*.parquet",
+                            "data/train-00000-of-00002-*.parquet",
+                            "data/train-00001-of-00002-*.parquet",
+                            "datafile.txt",
+                            "dataset_infos.json",
+                        ],
+                    )
+                )
             )

             self._api.delete_file("datafile.txt", repo_id=ds_name, repo_type="dataset", token=self._token)
@@ -260,15 +300,21 @@ def test_push_dataset_dict_to_hub_overwrite_files(self):

             # Ensure that there are two files on the repository that have the correct name
             files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
-            self.assertListEqual(
-                files,
-                [
-                    ".gitattributes",
-                    "data/random-00000-of-00001.parquet",
-                    "data/train-00000-of-00001.parquet",
-                    "datafile.txt",
-                    "dataset_infos.json",
-                ],
+
+            self.assertTrue(
+                all(
+                    fnmatch.fnmatch(file, expected_file)
+                    for file, expected_file in zip(
+                        files,
+                        [
+                            ".gitattributes",
+                            "data/random-00000-of-00001-*.parquet",
+                            "data/train-00000-of-00001-*.parquet",
+                            "datafile.txt",
+                            "dataset_infos.json",
+                        ],
+                    )
+                )
             )

             # Keeping the "datafile.txt" breaks the load_dataset to think it's a text-based dataset
@@ -403,6 +449,34 @@ def test_push_dataset_to_hub_custom_splits(self):
         finally:
             self.cleanup_repo(ds_name)

+    def test_push_to_dataset_skip_identical_files(self):
+        ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})
+        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
+        try:
+            with patch("datasets.arrow_dataset.HfApi.upload_file", side_effect=self._api.upload_file) as mock_hf_api:
+                # Initial push
+                ds.push_to_hub(ds_name, token=self._token, max_shard_size="1KB")
+                call_count_old = mock_hf_api.call_count
+                mock_hf_api.reset_mock()
+
+                # Remove a data file
+                files = self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token)
+                data_files = [f for f in files if f.startswith("data/")]
+                self.assertGreater(len(data_files), 1)
+                self._api.delete_file(data_files[0], repo_id=ds_name, repo_type="dataset", token=self._token)
+
+                # "Resume" push - push missing files
+                ds.push_to_hub(ds_name, token=self._token, max_shard_size="1KB")
+                call_count_new = mock_hf_api.call_count
+                self.assertGreater(call_count_old, call_count_new)
+
+            hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")
+            self.assertListEqual(ds.column_names, hub_ds.column_names)
+            self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
+            self.assertDictEqual(ds.features, hub_ds.features)
+        finally:
+            self.cleanup_repo(ds_name)
+
     def test_push_dataset_dict_to_hub_custom_splits(self):
         ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
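
The new test patches HfApi.upload_file and compares call_count across the two pushes. Below is a minimal sketch of that call-counting pattern in isolation, with an invented push helper and a MagicMock standing in for the real Hub client; it is an illustration, not the library's test code.

from unittest.mock import MagicMock

upload_file = MagicMock(name="upload_file")  # stand-in for HfApi.upload_file


def push(shard_paths, existing_files):
    # Upload only the shards that are not already in the repository, as in the patched push_to_hub.
    for path in shard_paths:
        if path not in existing_files:
            upload_file(path_in_repo=path)


push(["a.parquet", "b.parquet"], existing_files=[])
first_push_calls = upload_file.call_count  # 2: nothing exists yet, both shards are uploaded
upload_file.reset_mock()

push(["a.parquet", "b.parquet"], existing_files=["a.parquet"])
assert upload_file.call_count < first_push_calls  # only the missing shard triggered an upload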
