Skip to content

Commit c900ef8

Browse files
committed
use the Hub instead of GitHub
1 parent e195bc1 commit c900ef8

File tree

5 files changed

+61 additions, -172 deletions (lines changed)

.github/workflows/ci.yml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@ on:
99
- main
1010

1111
env:
12-
HF_SCRIPTS_VERSION: main
1312
HF_ALLOW_CODE_EVAL: 1
1413

1514
jobs:

src/datasets/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,6 @@
3636
"If you are running this in a Google Colab, you should probably just restart the runtime to use the right version of `pyarrow`."
3737
)
3838

39-
SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__
40-
4139
del platform
4240
del pyarrow
4341
del version

src/datasets/load.py

Lines changed: 48 additions & 146 deletions
Original file line number | Diff line number | Diff line change
@@ -424,93 +424,6 @@ def get_module(self) -> MetricModule:
424424
raise NotImplementedError
425425

426426

427-
class GithubDatasetModuleFactory(_DatasetModuleFactory):
428-
"""
429-
Get the module of a dataset from GitHub (legacy).
430-
The dataset script is downloaded from GitHub.
431-
This class will eventually be removed and a HubDatasetModuleFactory will be used instead.
432-
"""
433-
434-
def __init__(
435-
self,
436-
name: str,
437-
revision: Optional[Union[str, Version]] = None,
438-
download_config: Optional[DownloadConfig] = None,
439-
download_mode: Optional[DownloadMode] = None,
440-
dynamic_modules_path: Optional[str] = None,
441-
):
442-
self.name = name
443-
self.revision = revision
444-
self.download_config = download_config.copy() if download_config else DownloadConfig()
445-
if self.download_config.max_retries < 3:
446-
self.download_config.max_retries = 3
447-
self.download_mode = download_mode
448-
self.dynamic_modules_path = dynamic_modules_path
449-
assert self.name.count("/") == 0
450-
increase_load_count(name, resource_type="dataset")
451-
452-
def download_loading_script(self, revision: Optional[str]) -> str:
453-
file_path = hf_github_url(path=self.name, name=self.name + ".py", revision=revision)
454-
download_config = self.download_config.copy()
455-
if download_config.download_desc is None:
456-
download_config.download_desc = "Downloading builder script"
457-
return cached_path(file_path, download_config=download_config)
458-
459-
def download_dataset_infos_file(self, revision: Optional[str]) -> str:
460-
dataset_infos = hf_github_url(path=self.name, name=config.DATASETDICT_INFOS_FILENAME, revision=revision)
461-
# Download the dataset infos file if available
462-
download_config = self.download_config.copy()
463-
if download_config.download_desc is None:
464-
download_config.download_desc = "Downloading metadata"
465-
try:
466-
return cached_path(
467-
dataset_infos,
468-
download_config=download_config,
469-
)
470-
except (FileNotFoundError, ConnectionError):
471-
return None
472-
473-
def get_module(self) -> DatasetModule:
474-
# get script and other files
475-
revision = self.revision
476-
try:
477-
local_path = self.download_loading_script(revision)
478-
except FileNotFoundError:
479-
if revision is not None or os.getenv("HF_SCRIPTS_VERSION", None) is not None:
480-
raise
481-
else:
482-
revision = "main"
483-
local_path = self.download_loading_script(revision)
484-
logger.warning(
485-
f"Couldn't find a directory or a dataset named '{self.name}' in this version. "
486-
f"It was picked from the main branch on github instead."
487-
)
488-
dataset_infos_path = self.download_dataset_infos_file(revision)
489-
imports = get_imports(local_path)
490-
local_imports = _download_additional_modules(
491-
name=self.name,
492-
base_path=hf_github_url(path=self.name, name="", revision=revision),
493-
imports=imports,
494-
download_config=self.download_config,
495-
)
496-
additional_files = [(config.DATASETDICT_INFOS_FILENAME, dataset_infos_path)] if dataset_infos_path else []
497-
# copy the script and the files in an importable directory
498-
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
499-
module_path, hash = _create_importable_file(
500-
local_path=local_path,
501-
local_imports=local_imports,
502-
additional_files=additional_files,
503-
dynamic_modules_path=dynamic_modules_path,
504-
module_namespace="datasets",
505-
name=self.name,
506-
download_mode=self.download_mode,
507-
)
508-
# make the new module to be noticed by the import system
509-
importlib.invalidate_caches()
510-
builder_kwargs = {"hash": hash, "base_path": hf_hub_url(self.name, "", revision=self.revision)}
511-
return DatasetModule(module_path, hash, builder_kwargs)
512-
513-
514427
class GithubMetricModuleFactory(_MetricModuleFactory):
515428
"""Get the module of a metric. The metric script is downloaded from GitHub.
516429
@@ -554,7 +467,7 @@ def get_module(self) -> MetricModule:
554467
local_path = self.download_loading_script(revision)
555468
revision = self.revision
556469
except FileNotFoundError:
557-
if revision is not None or os.getenv("HF_SCRIPTS_VERSION", None) is not None:
470+
if revision is not None:
558471
raise
559472
else:
560473
revision = "main"
@@ -917,11 +830,11 @@ def __init__(
917830
self.download_config = download_config or DownloadConfig()
918831
self.download_mode = download_mode
919832
self.dynamic_modules_path = dynamic_modules_path
920-
assert self.name.count("/") == 1
833+
assert self.name.count("/") <= 1
921834
increase_load_count(name, resource_type="dataset")
922835

923836
def download_loading_script(self) -> str:
924-
file_path = hf_hub_url(repo_id=self.name, path=self.name.split("/")[1] + ".py", revision=self.revision)
837+
file_path = hf_hub_url(repo_id=self.name, path=self.name.split("/")[-1] + ".py", revision=self.revision)
925838
download_config = self.download_config.copy()
926839
if download_config.download_desc is None:
927840
download_config.download_desc = "Downloading builder script"
@@ -1197,67 +1110,57 @@ def dataset_module_factory(
11971110
elif is_relative_path(path) and path.count("/") <= 1:
11981111
try:
11991112
_raise_if_offline_mode_is_enabled()
1200-
if path.count("/") == 0: # even though the dataset is on the Hub, we get it from GitHub for now
1201-
# TODO(QL): use a Hub dataset module factory instead of GitHub
1202-
return GithubDatasetModuleFactory(
1113+
hf_api = HfApi(config.HF_ENDPOINT)
1114+
try:
1115+
if isinstance(download_config.use_auth_token, bool):
1116+
token = HfFolder.get_token() if download_config.use_auth_token else None
1117+
else:
1118+
token = download_config.use_auth_token
1119+
dataset_info = hf_api.dataset_info(
1120+
repo_id=path,
1121+
revision=revision,
1122+
token=token if token else "no-token",
1123+
timeout=100.0,
1124+
)
1125+
except Exception as e: # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
1126+
if isinstance(
1127+
e,
1128+
(
1129+
OfflineModeIsEnabled,
1130+
requests.exceptions.ConnectTimeout,
1131+
requests.exceptions.ConnectionError,
1132+
),
1133+
):
1134+
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
1135+
elif "404" in str(e):
1136+
msg = f"Dataset '{path}' doesn't exist on the Hub"
1137+
raise FileNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
1138+
elif "401" in str(e):
1139+
msg = f"Dataset '{path}' doesn't exist on the Hub"
1140+
msg = msg + f" at revision '{revision}'" if revision else msg
1141+
raise FileNotFoundError(
1142+
msg
1143+
+ ". If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`."
1144+
)
1145+
else:
1146+
raise e
1147+
if filename in [sibling.rfilename for sibling in dataset_info.siblings]:
1148+
return HubDatasetModuleFactoryWithScript(
12031149
path,
12041150
revision=revision,
12051151
download_config=download_config,
12061152
download_mode=download_mode,
12071153
dynamic_modules_path=dynamic_modules_path,
12081154
).get_module()
1209-
elif path.count("/") == 1: # community dataset on the Hub
1210-
hf_api = HfApi(config.HF_ENDPOINT)
1211-
try:
1212-
if isinstance(download_config.use_auth_token, bool):
1213-
token = HfFolder.get_token() if download_config.use_auth_token else None
1214-
else:
1215-
token = download_config.use_auth_token
1216-
dataset_info = hf_api.dataset_info(
1217-
repo_id=path,
1218-
revision=revision,
1219-
token=token if token else "no-token",
1220-
timeout=100.0,
1221-
)
1222-
except Exception as e: # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
1223-
if isinstance(
1224-
e,
1225-
(
1226-
OfflineModeIsEnabled,
1227-
requests.exceptions.ConnectTimeout,
1228-
requests.exceptions.ConnectionError,
1229-
),
1230-
):
1231-
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
1232-
elif "404" in str(e):
1233-
msg = f"Dataset '{path}' doesn't exist on the Hub"
1234-
raise FileNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
1235-
elif "401" in str(e):
1236-
msg = f"Dataset '{path}' doesn't exist on the Hub"
1237-
msg = msg + f" at revision '{revision}'" if revision else msg
1238-
raise FileNotFoundError(
1239-
msg
1240-
+ ". If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`."
1241-
)
1242-
else:
1243-
raise e
1244-
if filename in [sibling.rfilename for sibling in dataset_info.siblings]:
1245-
return HubDatasetModuleFactoryWithScript(
1246-
path,
1247-
revision=revision,
1248-
download_config=download_config,
1249-
download_mode=download_mode,
1250-
dynamic_modules_path=dynamic_modules_path,
1251-
).get_module()
1252-
else:
1253-
return HubDatasetModuleFactoryWithoutScript(
1254-
path,
1255-
revision=revision,
1256-
data_dir=data_dir,
1257-
data_files=data_files,
1258-
download_config=download_config,
1259-
download_mode=download_mode,
1260-
).get_module()
1155+
else:
1156+
return HubDatasetModuleFactoryWithoutScript(
1157+
path,
1158+
revision=revision,
1159+
data_dir=data_dir,
1160+
data_files=data_files,
1161+
download_config=download_config,
1162+
download_mode=download_mode,
1163+
).get_module()
12611164
except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
12621165
try:
12631166
return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
@@ -1624,7 +1527,6 @@ def load_dataset(
16241527
Dataset scripts are small python scripts that define dataset builders. They define the citation, info and format of the dataset,
16251528
contain the path or URL to the original data files and the code to load examples from the original data files.
16261529
1627-
You can find some of the scripts here: https://github.com/huggingface/datasets/tree/main/datasets
16281530
You can find the complete list of datasets in the Datasets Hub at https://huggingface.co/datasets
16291531
16301532
2. Run the dataset script which will:

src/datasets/utils/file_utils.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
from urllib.parse import urljoin, urlparse
2424

2525
import requests
26+
from packaging import version
2627

2728
from .. import __version__, config
2829
from ..download.download_config import DownloadConfig
@@ -97,9 +98,9 @@ def head_hf_s3(
9798

9899

99100
def hf_github_url(path: str, name: str, dataset=True, revision: Optional[str] = None) -> str:
100-
from .. import SCRIPTS_VERSION
101101

102-
revision = revision or os.getenv("HF_SCRIPTS_VERSION", SCRIPTS_VERSION)
102+
default_revision = "main" if version.parse(__version__).is_devrelease else __version__
103+
revision = revision or default_revision
103104
if dataset:
104105
return config.REPO_DATASETS_URL.format(revision=revision, path=path, name=name)
105106
else:

tests/test_load.py

Lines changed: 10 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
import requests
1414

1515
import datasets
16-
from datasets import SCRIPTS_VERSION, config, load_dataset, load_from_disk
16+
from datasets import config, load_dataset, load_from_disk
1717
from datasets.arrow_dataset import Dataset
1818
from datasets.builder import DatasetBuilder
1919
from datasets.data_files import DataFilesDict
@@ -24,7 +24,6 @@
2424
from datasets.load import (
2525
CachedDatasetModuleFactory,
2626
CachedMetricModuleFactory,
27-
GithubDatasetModuleFactory,
2827
GithubMetricModuleFactory,
2928
HubDatasetModuleFactoryWithoutScript,
3029
HubDatasetModuleFactoryWithScript,
@@ -255,15 +254,6 @@ def setUp(self):
255254
hf_modules_cache=self.hf_modules_cache,
256255
)
257256

258-
def test_GithubDatasetModuleFactory(self):
259-
# "wmt_t2t" has additional imports (internal)
260-
factory = GithubDatasetModuleFactory(
261-
"wmt_t2t", download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path
262-
)
263-
module_factory_result = factory.get_module()
264-
assert importlib.import_module(module_factory_result.module_path) is not None
265-
assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT)
266-
267257
def test_GithubMetricModuleFactory_with_internal_import(self):
268258
# "squad_v2" requires additional imports (internal)
269259
factory = GithubMetricModuleFactory(
@@ -479,7 +469,6 @@ def test_CachedMetricModuleFactory(self):
479469
[
480470
CachedDatasetModuleFactory,
481471
CachedMetricModuleFactory,
482-
GithubDatasetModuleFactory,
483472
GithubMetricModuleFactory,
484473
HubDatasetModuleFactoryWithoutScript,
485474
HubDatasetModuleFactoryWithScript,
@@ -490,10 +479,7 @@ def test_CachedMetricModuleFactory(self):
490479
],
491480
)
492481
def test_module_factories(factory_class):
493-
if issubclass(factory_class, (HubDatasetModuleFactoryWithoutScript, HubDatasetModuleFactoryWithScript)):
494-
name = "dummy_org/dummy_name"
495-
else:
496-
name = "dummy_name"
482+
name = "dummy_name"
497483
factory = factory_class(name)
498484
assert factory.name == name
499485

@@ -576,18 +562,21 @@ def test_offline_dataset_module_factory(self):
576562
self.assertNotEqual(dataset_module_1.module_path, dataset_module_3.module_path)
577563
self.assertIn("Using the latest cached version of the module", self._caplog.text)
578564

579-
def test_load_dataset_from_github(self):
580-
scripts_version = os.getenv("HF_SCRIPTS_VERSION", SCRIPTS_VERSION)
565+
def test_load_dataset_from_hub(self):
581566
with self.assertRaises(FileNotFoundError) as context:
582567
datasets.load_dataset("_dummy")
583568
self.assertIn(
584-
"https://raw.githubusercontent.com/huggingface/datasets/main/datasets/_dummy/_dummy.py",
569+
"Dataset '_dummy' doesn't exist on the Hub",
585570
str(context.exception),
586571
)
587572
with self.assertRaises(FileNotFoundError) as context:
588573
datasets.load_dataset("_dummy", revision="0.0.0")
589574
self.assertIn(
590-
"https://raw.githubusercontent.com/huggingface/datasets/0.0.0/datasets/_dummy/_dummy.py",
575+
"Dataset '_dummy' doesn't exist on the Hub",
576+
str(context.exception),
577+
)
578+
self.assertIn(
579+
"at revision '0.0.0'",
591580
str(context.exception),
592581
)
593582
for offline_simulation_mode in list(OfflineSimulationMode):
@@ -596,7 +585,7 @@ def test_load_dataset_from_github(self):
596585
datasets.load_dataset("_dummy")
597586
if offline_simulation_mode != OfflineSimulationMode.HF_DATASETS_OFFLINE_SET_TO_1:
598587
self.assertIn(
599-
f"https://raw.githubusercontent.com/huggingface/datasets/{scripts_version}/datasets/_dummy/_dummy.py",
588+
"Couldn't reach '_dummy' on the Hub",
600589
str(context.exception),
601590
)
602591

0 commit comments

Comments
 (0)