@@ -424,93 +424,6 @@ def get_module(self) -> MetricModule:
424424 raise NotImplementedError
425425
426426
427- class GithubDatasetModuleFactory (_DatasetModuleFactory ):
428- """
429- Get the module of a dataset from GitHub (legacy).
430- The dataset script is downloaded from GitHub.
431- This class will eventually be removed and a HubDatasetModuleFactory will be used instead.
432- """
433-
434- def __init__ (
435- self ,
436- name : str ,
437- revision : Optional [Union [str , Version ]] = None ,
438- download_config : Optional [DownloadConfig ] = None ,
439- download_mode : Optional [DownloadMode ] = None ,
440- dynamic_modules_path : Optional [str ] = None ,
441- ):
442- self .name = name
443- self .revision = revision
444- self .download_config = download_config .copy () if download_config else DownloadConfig ()
445- if self .download_config .max_retries < 3 :
446- self .download_config .max_retries = 3
447- self .download_mode = download_mode
448- self .dynamic_modules_path = dynamic_modules_path
449- assert self .name .count ("/" ) == 0
450- increase_load_count (name , resource_type = "dataset" )
451-
452- def download_loading_script (self , revision : Optional [str ]) -> str :
453- file_path = hf_github_url (path = self .name , name = self .name + ".py" , revision = revision )
454- download_config = self .download_config .copy ()
455- if download_config .download_desc is None :
456- download_config .download_desc = "Downloading builder script"
457- return cached_path (file_path , download_config = download_config )
458-
459- def download_dataset_infos_file (self , revision : Optional [str ]) -> str :
460- dataset_infos = hf_github_url (path = self .name , name = config .DATASETDICT_INFOS_FILENAME , revision = revision )
461- # Download the dataset infos file if available
462- download_config = self .download_config .copy ()
463- if download_config .download_desc is None :
464- download_config .download_desc = "Downloading metadata"
465- try :
466- return cached_path (
467- dataset_infos ,
468- download_config = download_config ,
469- )
470- except (FileNotFoundError , ConnectionError ):
471- return None
472-
473- def get_module (self ) -> DatasetModule :
474- # get script and other files
475- revision = self .revision
476- try :
477- local_path = self .download_loading_script (revision )
478- except FileNotFoundError :
479- if revision is not None or os .getenv ("HF_SCRIPTS_VERSION" , None ) is not None :
480- raise
481- else :
482- revision = "main"
483- local_path = self .download_loading_script (revision )
484- logger .warning (
485- f"Couldn't find a directory or a dataset named '{ self .name } ' in this version. "
486- f"It was picked from the main branch on github instead."
487- )
488- dataset_infos_path = self .download_dataset_infos_file (revision )
489- imports = get_imports (local_path )
490- local_imports = _download_additional_modules (
491- name = self .name ,
492- base_path = hf_github_url (path = self .name , name = "" , revision = revision ),
493- imports = imports ,
494- download_config = self .download_config ,
495- )
496- additional_files = [(config .DATASETDICT_INFOS_FILENAME , dataset_infos_path )] if dataset_infos_path else []
497- # copy the script and the files in an importable directory
498- dynamic_modules_path = self .dynamic_modules_path if self .dynamic_modules_path else init_dynamic_modules ()
499- module_path , hash = _create_importable_file (
500- local_path = local_path ,
501- local_imports = local_imports ,
502- additional_files = additional_files ,
503- dynamic_modules_path = dynamic_modules_path ,
504- module_namespace = "datasets" ,
505- name = self .name ,
506- download_mode = self .download_mode ,
507- )
508- # make the new module to be noticed by the import system
509- importlib .invalidate_caches ()
510- builder_kwargs = {"hash" : hash , "base_path" : hf_hub_url (self .name , "" , revision = self .revision )}
511- return DatasetModule (module_path , hash , builder_kwargs )
512-
513-
514427class GithubMetricModuleFactory (_MetricModuleFactory ):
515428 """Get the module of a metric. The metric script is downloaded from GitHub.
516429
@@ -554,7 +467,7 @@ def get_module(self) -> MetricModule:
554467 local_path = self .download_loading_script (revision )
555468 revision = self .revision
556469 except FileNotFoundError :
557- if revision is not None or os .getenv ("HF_SCRIPTS_VERSION" , None ) is not None :
470+ if revision is not None :
558471 raise
559472 else :
560473 revision = "main"
@@ -917,11 +830,11 @@ def __init__(
917830 self .download_config = download_config or DownloadConfig ()
918831 self .download_mode = download_mode
919832 self .dynamic_modules_path = dynamic_modules_path
920- assert self .name .count ("/" ) == 1
833+ assert self .name .count ("/" ) <= 1
921834 increase_load_count (name , resource_type = "dataset" )
922835
923836 def download_loading_script (self ) -> str :
924- file_path = hf_hub_url (repo_id = self .name , path = self .name .split ("/" )[1 ] + ".py" , revision = self .revision )
837+ file_path = hf_hub_url (repo_id = self .name , path = self .name .split ("/" )[-1 ] + ".py" , revision = self .revision )
925838 download_config = self .download_config .copy ()
926839 if download_config .download_desc is None :
927840 download_config .download_desc = "Downloading builder script"
@@ -1197,67 +1110,57 @@ def dataset_module_factory(
11971110 elif is_relative_path (path ) and path .count ("/" ) <= 1 :
11981111 try :
11991112 _raise_if_offline_mode_is_enabled ()
1200- if path .count ("/" ) == 0 : # even though the dataset is on the Hub, we get it from GitHub for now
1201- # TODO(QL): use a Hub dataset module factory instead of GitHub
1202- return GithubDatasetModuleFactory (
1113+ hf_api = HfApi (config .HF_ENDPOINT )
1114+ try :
1115+ if isinstance (download_config .use_auth_token , bool ):
1116+ token = HfFolder .get_token () if download_config .use_auth_token else None
1117+ else :
1118+ token = download_config .use_auth_token
1119+ dataset_info = hf_api .dataset_info (
1120+ repo_id = path ,
1121+ revision = revision ,
1122+ token = token if token else "no-token" ,
1123+ timeout = 100.0 ,
1124+ )
1125+ except Exception as e : # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
1126+ if isinstance (
1127+ e ,
1128+ (
1129+ OfflineModeIsEnabled ,
1130+ requests .exceptions .ConnectTimeout ,
1131+ requests .exceptions .ConnectionError ,
1132+ ),
1133+ ):
1134+ raise ConnectionError (f"Couldn't reach '{ path } ' on the Hub ({ type (e ).__name__ } )" )
1135+ elif "404" in str (e ):
1136+ msg = f"Dataset '{ path } ' doesn't exist on the Hub"
1137+ raise FileNotFoundError (msg + f" at revision '{ revision } '" if revision else msg )
1138+ elif "401" in str (e ):
1139+ msg = f"Dataset '{ path } ' doesn't exist on the Hub"
1140+ msg = msg + f" at revision '{ revision } '" if revision else msg
1141+ raise FileNotFoundError (
1142+ msg
1143+ + ". If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`."
1144+ )
1145+ else :
1146+ raise e
1147+ if filename in [sibling .rfilename for sibling in dataset_info .siblings ]:
1148+ return HubDatasetModuleFactoryWithScript (
12031149 path ,
12041150 revision = revision ,
12051151 download_config = download_config ,
12061152 download_mode = download_mode ,
12071153 dynamic_modules_path = dynamic_modules_path ,
12081154 ).get_module ()
1209- elif path .count ("/" ) == 1 : # community dataset on the Hub
1210- hf_api = HfApi (config .HF_ENDPOINT )
1211- try :
1212- if isinstance (download_config .use_auth_token , bool ):
1213- token = HfFolder .get_token () if download_config .use_auth_token else None
1214- else :
1215- token = download_config .use_auth_token
1216- dataset_info = hf_api .dataset_info (
1217- repo_id = path ,
1218- revision = revision ,
1219- token = token if token else "no-token" ,
1220- timeout = 100.0 ,
1221- )
1222- except Exception as e : # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
1223- if isinstance (
1224- e ,
1225- (
1226- OfflineModeIsEnabled ,
1227- requests .exceptions .ConnectTimeout ,
1228- requests .exceptions .ConnectionError ,
1229- ),
1230- ):
1231- raise ConnectionError (f"Couldn't reach '{ path } ' on the Hub ({ type (e ).__name__ } )" )
1232- elif "404" in str (e ):
1233- msg = f"Dataset '{ path } ' doesn't exist on the Hub"
1234- raise FileNotFoundError (msg + f" at revision '{ revision } '" if revision else msg )
1235- elif "401" in str (e ):
1236- msg = f"Dataset '{ path } ' doesn't exist on the Hub"
1237- msg = msg + f" at revision '{ revision } '" if revision else msg
1238- raise FileNotFoundError (
1239- msg
1240- + ". If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`."
1241- )
1242- else :
1243- raise e
1244- if filename in [sibling .rfilename for sibling in dataset_info .siblings ]:
1245- return HubDatasetModuleFactoryWithScript (
1246- path ,
1247- revision = revision ,
1248- download_config = download_config ,
1249- download_mode = download_mode ,
1250- dynamic_modules_path = dynamic_modules_path ,
1251- ).get_module ()
1252- else :
1253- return HubDatasetModuleFactoryWithoutScript (
1254- path ,
1255- revision = revision ,
1256- data_dir = data_dir ,
1257- data_files = data_files ,
1258- download_config = download_config ,
1259- download_mode = download_mode ,
1260- ).get_module ()
1155+ else :
1156+ return HubDatasetModuleFactoryWithoutScript (
1157+ path ,
1158+ revision = revision ,
1159+ data_dir = data_dir ,
1160+ data_files = data_files ,
1161+ download_config = download_config ,
1162+ download_mode = download_mode ,
1163+ ).get_module ()
12611164 except Exception as e1 : # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
12621165 try :
12631166 return CachedDatasetModuleFactory (path , dynamic_modules_path = dynamic_modules_path ).get_module ()
@@ -1624,7 +1527,6 @@ def load_dataset(
16241527 Dataset scripts are small python scripts that define dataset builders. They define the citation, info and format of the dataset,
16251528 contain the path or URL to the original data files and the code to load examples from the original data files.
16261529
1627- You can find some of the scripts here: https://github.com/huggingface/datasets/tree/main/datasets
16281530 You can find the complete list of datasets in the Datasets Hub at https://huggingface.co/datasets
16291531
16301532 2. Run the dataset script which will:
0 commit comments