Skip to content

Commit 74d6021

Browse files
authored
Use hf-internal-testing repos for hosting test dataset repos (#6180)
* Use `hf-internal-testing` repos for testing
* Fix
1 parent 392d8a4 commit 74d6021

File tree

6 files changed

+29
-30
lines changed

6 files changed

+29
-30
lines changed

tests/commands/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
import datasets
1111
1212
13-
REPO_URL = "https://huggingface.co/datasets/albertvillanova/tests-raw-jsonl/resolve/main/"
13+
REPO_URL = "https://huggingface.co/datasets/hf-internal-testing/raw_jsonl/resolve/main/"
1414
URLS = {"train": REPO_URL + "wikiann-bn-train.jsonl", "validation": REPO_URL + "wikiann-bn-validation.jsonl"}
1515
1616

tests/packaged_modules/test_folder_based_builder.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,10 +15,10 @@
1515

1616

1717
remote_files = [
18-
"https://huggingface.co/datasets/polinaeterna/texts/resolve/main/hallo.txt",
19-
"https://huggingface.co/datasets/polinaeterna/texts/resolve/main/hello.txt",
20-
"https://huggingface.co/datasets/polinaeterna/texts/resolve/main/class1/bonjour.txt",
21-
"https://huggingface.co/datasets/polinaeterna/texts/resolve/main/class1/bonjour2.txt",
18+
"https://huggingface.co/datasets/hf-internal-testing/textfolder/resolve/main/hallo.txt",
19+
"https://huggingface.co/datasets/hf-internal-testing/textfolder/resolve/main/hello.txt",
20+
"https://huggingface.co/datasets/hf-internal-testing/textfolder/resolve/main/class1/bonjour.txt",
21+
"https://huggingface.co/datasets/hf-internal-testing/textfolder/resolve/main/class1/bonjour2.txt",
2222
]
2323

2424

tests/test_inspect.py

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -66,15 +66,14 @@ def test_get_dataset_config_info_error(path, config_name, expected_exception):
6666
@pytest.mark.parametrize(
6767
"path, expected",
6868
[
69-
("squad", ["plain_text"]),
7069
("acronym_identification", ["default"]),
71-
("lhoestq/squad", ["plain_text"]),
72-
("lhoestq/test", ["default"]),
73-
("lhoestq/demo1", ["default"]),
70+
("squad", ["plain_text"]),
71+
("hf-internal-testing/dataset_with_script", ["default"]),
7472
("dalle-mini/wit", ["default"]),
75-
("datasets-maintainers/audiofolder_no_configs_in_metadata", ["default"]),
76-
("datasets-maintainers/audiofolder_single_config_in_metadata", ["custom"]),
77-
("datasets-maintainers/audiofolder_two_configs_in_metadata", ["v1", "v2"]),
73+
("hf-internal-testing/librispeech_asr_dummy", ["clean", "other"]),
74+
("hf-internal-testing/audiofolder_no_configs_in_metadata", ["default"]),
75+
("hf-internal-testing/audiofolder_single_config_in_metadata", ["custom"]),
76+
("hf-internal-testing/audiofolder_two_configs_in_metadata", ["v1", "v2"]),
7877
],
7978
)
8079
def test_get_dataset_config_names(path, expected):

tests/test_iterable_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@
5959
DEFAULT_BATCH_SIZE = 4
6060
DEFAULT_FILEPATH = "file.txt"
6161

62-
SAMPLE_DATASET_IDENTIFIER = "lhoestq/test" # has dataset script
62+
SAMPLE_DATASET_IDENTIFIER = "hf-internal-testing/dataset_with_script" # has dataset script
6363

6464

6565
def generate_examples_fn(**kwargs):

tests/test_load.py

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -81,17 +81,17 @@ def _generate_examples(self, filepath, **kwargs):
8181
yield i, {"text": line.strip()}
8282
"""
8383

84-
SAMPLE_DATASET_IDENTIFIER = "lhoestq/test" # has dataset script
85-
SAMPLE_DATASET_IDENTIFIER2 = "lhoestq/test2" # only has data files
86-
SAMPLE_DATASET_IDENTIFIER3 = "mariosasko/test_multi_dir_dataset" # has multiple data directories
87-
SAMPLE_DATASET_IDENTIFIER4 = "mariosasko/test_imagefolder_with_metadata" # imagefolder with a metadata file outside of the train/test directories
88-
SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER = "lhoestq/_dummy"
84+
SAMPLE_DATASET_IDENTIFIER = "hf-internal-testing/dataset_with_script" # has dataset script
85+
SAMPLE_DATASET_IDENTIFIER2 = "hf-internal-testing/dataset_with_data_files" # only has data files
86+
SAMPLE_DATASET_IDENTIFIER3 = "hf-internal-testing/multi_dir_dataset" # has multiple data directories
87+
SAMPLE_DATASET_IDENTIFIER4 = "hf-internal-testing/imagefolder_with_metadata" # imagefolder with a metadata file outside of the train/test directories
88+
SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER = "hf-internal-testing/_dummy"
8989
SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST = "_dummy"
90-
SAMPLE_DATASET_NO_CONFIGS_IN_METADATA = "datasets-maintainers/audiofolder_no_configs_in_metadata"
91-
SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA = "datasets-maintainers/audiofolder_single_config_in_metadata"
92-
SAMPLE_DATASET_TWO_CONFIG_IN_METADATA = "datasets-maintainers/audiofolder_two_configs_in_metadata"
90+
SAMPLE_DATASET_NO_CONFIGS_IN_METADATA = "hf-internal-testing/audiofolder_no_configs_in_metadata"
91+
SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_single_config_in_metadata"
92+
SAMPLE_DATASET_TWO_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_two_configs_in_metadata"
9393
SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT = (
94-
"datasets-maintainers/audiofolder_two_configs_in_metadata_with_default"
94+
"hf-internal-testing/audiofolder_two_configs_in_metadata_with_default"
9595
)
9696

9797

@@ -876,18 +876,18 @@ def test_load_dataset_from_hub(self):
876876
str(context.exception),
877877
)
878878

879-
def test_load_dataset_users(self):
879+
def test_load_dataset_namespace(self):
880880
with self.assertRaises(FileNotFoundError) as context:
881-
datasets.load_dataset("lhoestq/_dummy")
881+
datasets.load_dataset("hf-internal-testing/_dummy")
882882
self.assertIn(
883-
"lhoestq/_dummy",
883+
"hf-internal-testing/_dummy",
884884
str(context.exception),
885885
)
886886
for offline_simulation_mode in list(OfflineSimulationMode):
887887
with offline(offline_simulation_mode):
888888
with self.assertRaises(ConnectionError) as context:
889-
datasets.load_dataset("lhoestq/_dummy")
890-
self.assertIn("lhoestq/_dummy", str(context.exception), msg=offline_simulation_mode)
889+
datasets.load_dataset("hf-internal-testing/_dummy")
890+
self.assertIn("hf-internal-testing/_dummy", str(context.exception), msg=offline_simulation_mode)
891891

892892

893893
@pytest.mark.integration
@@ -1064,7 +1064,7 @@ def test_load_dataset_streaming_gz_json(jsonl_gz_path):
10641064
"path", ["sample.jsonl", "sample.jsonl.gz", "sample.tar", "sample.jsonl.xz", "sample.zip", "sample.jsonl.zst"]
10651065
)
10661066
def test_load_dataset_streaming_compressed_files(path):
1067-
repo_id = "albertvillanova/datasets-tests-compression"
1067+
repo_id = "hf-internal-testing/compressed_files"
10681068
data_files = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{path}"
10691069
if data_files[-3:] in ("zip", "tar"): # we need to glob "*" inside archives
10701070
data_files = data_files[-3:] + "://*::" + data_files
@@ -1394,7 +1394,7 @@ def test_load_from_disk_with_default_in_memory(
13941394

13951395
@pytest.mark.integration
13961396
def test_remote_data_files():
1397-
repo_id = "albertvillanova/tests-raw-jsonl"
1397+
repo_id = "hf-internal-testing/raw_jsonl"
13981398
filename = "wikiann-bn-validation.jsonl"
13991399
data_files = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{filename}"
14001400
ds = load_dataset("json", split="train", data_files=data_files, streaming=True)

tests/test_streaming_download_manager.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@
3333
from .utils import require_lz4, require_zstandard, slow
3434

3535

36-
TEST_URL = "https://huggingface.co/datasets/lhoestq/test/raw/main/some_text.txt"
36+
TEST_URL = "https://huggingface.co/datasets/hf-internal-testing/dataset_with_script/raw/main/some_text.txt"
3737
TEST_URL_CONTENT = "foo\nbar\nfoobar"
3838

3939
TEST_GG_DRIVE_FILENAME = "train.tsv"

0 commit comments

Comments
 (0)