Skip to content

Commit e98efc8

Browse files
fix: use unique repo name for second scenario to eliminate race
The previous fix tried to wait for repo readiness, but had a logic flaw: waiting could succeed by seeing the OLD repo's ghost state before full deletion, then the push would fail against that dying repo. The correct fix: use a unique repo name (ds_name_2) for the second scenario instead of reusing ds_name. This completely eliminates the race condition between repo deletion and creation with the same name. Two independent test scenarios = two independent repos = no race.
1 parent 3c23371 commit e98efc8

File tree

1 file changed

+9
-12
lines changed

1 file changed

+9
-12
lines changed

tests/test_upstream_hub.py

Lines changed: 9 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -354,14 +354,11 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
 
         # Push to hub two times, but the second time with fewer files.
         # Verify that the new files contain the correct dataset and that non-necessary files have been deleted.
-        with temporary_repo(ds_name):
-            # Wait for repo to be ready after creation (avoid race with previous repo deletion)
-            self._wait_for_repo_ready(ds_name)
-
-            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)
+        with temporary_repo() as ds_name_2:
+            local_ds.push_to_hub(ds_name_2, token=self._token, max_shard_size=500 << 5)
 
             # Wait for Hub to fully process the first push
-            self._wait_for_repo_ready(ds_name)
+            self._wait_for_repo_ready(ds_name_2)
 
             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
@@ -372,18 +369,18 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                 self._api.upload_file(
                     path_or_fileobj=str(path),
                     path_in_repo="datafile.txt",
-                    repo_id=ds_name,
+                    repo_id=ds_name_2,
                     repo_type="dataset",
                     token=self._token,
                 )
 
                 # Wait again before second push
-                self._wait_for_repo_ready(ds_name)
+                self._wait_for_repo_ready(ds_name_2)
 
-            local_ds.push_to_hub(ds_name, token=self._token)
+            local_ds.push_to_hub(ds_name_2, token=self._token)
 
             # Ensure that there are two files on the repository that have the correct name
-            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
+            files = sorted(self._api.list_repo_files(ds_name_2, repo_type="dataset", token=self._token))
             assert files == [
                 ".gitattributes",
                 "README.md",
@@ -393,9 +390,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
             ]
 
             # Keeping the "datafile.txt" breaks the load_dataset to think it's a text-based dataset
-            self._api.delete_file("datafile.txt", repo_id=ds_name, repo_type="dataset", token=self._token)
+            self._api.delete_file("datafile.txt", repo_id=ds_name_2, repo_type="dataset", token=self._token)
 
-            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+            hub_ds = load_dataset(ds_name_2, download_mode="force_redownload")
 
             assert local_ds.column_names == hub_ds.column_names
             assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())

0 commit comments

Comments (0)