Skip to content

Commit e98efc8

Browse files
fix: use unique repo name for second scenario to eliminate race
The previous fix tried to wait for repo readiness, but had a logic flaw: waiting could succeed by seeing the OLD repo's ghost state before full deletion, then the push would fail against that dying repo. The correct fix: use a unique repo name (ds_name_2) for the second scenario instead of reusing ds_name. This completely eliminates the race condition between repo deletion and creation with the same name. Two independent test scenarios = two independent repos = no race.
1 parent 3c23371 commit e98efc8

File tree

1 file changed

+9
-12
lines changed

1 file changed

+9
-12
lines changed

tests/test_upstream_hub.py

Lines changed: 9 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -354,14 +354,11 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
 
         # Push to hub two times, but the second time with fewer files.
         # Verify that the new files contain the correct dataset and that non-necessary files have been deleted.
-        with temporary_repo(ds_name):
-            # Wait for repo to be ready after creation (avoid race with previous repo deletion)
-            self._wait_for_repo_ready(ds_name)
-
-            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)
+        with temporary_repo() as ds_name_2:
+            local_ds.push_to_hub(ds_name_2, token=self._token, max_shard_size=500 << 5)
 
             # Wait for Hub to fully process the first push
-            self._wait_for_repo_ready(ds_name)
+            self._wait_for_repo_ready(ds_name_2)
 
             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
@@ -372,18 +369,18 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                 self._api.upload_file(
                     path_or_fileobj=str(path),
                     path_in_repo="datafile.txt",
-                    repo_id=ds_name,
+                    repo_id=ds_name_2,
                     repo_type="dataset",
                     token=self._token,
                 )
 
                 # Wait again before second push
-                self._wait_for_repo_ready(ds_name)
+                self._wait_for_repo_ready(ds_name_2)
 
-            local_ds.push_to_hub(ds_name, token=self._token)
+            local_ds.push_to_hub(ds_name_2, token=self._token)
 
             # Ensure that there are two files on the repository that have the correct name
-            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
+            files = sorted(self._api.list_repo_files(ds_name_2, repo_type="dataset", token=self._token))
             assert files == [
                 ".gitattributes",
                 "README.md",
@@ -393,9 +390,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
             ]
 
             # Keeping the "datafile.txt" breaks the load_dataset to think it's a text-based dataset
-            self._api.delete_file("datafile.txt", repo_id=ds_name, repo_type="dataset", token=self._token)
+            self._api.delete_file("datafile.txt", repo_id=ds_name_2, repo_type="dataset", token=self._token)
 
-            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+            hub_ds = load_dataset(ds_name_2, download_mode="force_redownload")
 
             assert local_ds.column_names == hub_ds.column_names
             assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())

0 commit comments

Comments (0)