 import time
 import warnings
 from collections import Counter
+from contextlib import nullcontext
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
 )
 from .splits import Split
 from .utils import _datasets_server
-from .utils._filelock import FileLock
 from .utils.deprecation_utils import deprecated
 from .utils.file_utils import (
     OfflineModeIsEnabled,
 from .utils.info_utils import VerificationMode, is_small_dataset
 from .utils.logging import get_logger
 from .utils.metadata import MetadataConfigs
-from .utils.py_utils import get_imports
+from .utils.py_utils import get_imports, lock_importable_file
 from .utils.version import Version
 
 
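The direct `FileLock` import is dropped in favor of a dedicated helper. A minimal sketch of what `lock_importable_file` plausibly does, assuming it keeps the same lock primitive and derives the lock path from the importable file's grandparent directory, mirroring the old `importable_directory_path + ".lock"` scheme (not the verbatim implementation in `.utils.py_utils`):

```python
from pathlib import Path

from filelock import FileLock  # assumption: the same lock primitive as before


def lock_importable_file(importable_file_path: str) -> FileLock:
    # .../datasets/<name>/<hash>/<name>.py -> lock on .../datasets/<name>.lock
    importable_directory_path = str(Path(importable_file_path).resolve().parent.parent)
    return FileLock(importable_directory_path + ".lock")
```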
@@ -244,7 +244,10 @@ def __reduce__(self):  # to make dynamically created class pickable, see _Initia
 def get_dataset_builder_class(
     dataset_module: "DatasetModule", dataset_name: Optional[str] = None
 ) -> Type[DatasetBuilder]:
-    builder_cls = import_main_class(dataset_module.module_path)
+    with lock_importable_file(
+        dataset_module.importable_file_path
+    ) if dataset_module.importable_file_path else nullcontext():
+        builder_cls = import_main_class(dataset_module.module_path)
     if dataset_module.builder_configs_parameters.builder_configs:
         dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
         if dataset_name is None:
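Because `importable_file_path` is `None` for module factories that never materialize a script on disk, the lock is taken conditionally; `contextlib.nullcontext()` keeps the `with` statement valid in the unlocked case. A standalone illustration of the pattern (names here are illustrative, not from the library):

```python
from contextlib import nullcontext

from filelock import FileLock


def critical_section(lock_path=None):
    # Lock only when a path is given; nullcontext() is a no-op otherwise.
    with FileLock(lock_path) if lock_path else nullcontext():
        print("runs while no other process can touch the file")


critical_section()             # no script on disk: runs unlocked
critical_section("demo.lock")  # script-backed module: runs under the file lock
```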
@@ -375,17 +378,15 @@ def _copy_script_and_other_resources_in_importable_dir(
         download_mode (Optional[Union[DownloadMode, str]]): download mode
 
     Return:
-        importable_local_file: path to an importable module with importlib.import_module
+        importable_file: path to an importable module with importlib.import_module
     """
-
     # Define a directory with a unique name in our dataset or metric folder
     # path is: ./datasets|metrics/dataset|metric_name/hash_from_code/script.py
     # we use a hash as subdirectory_name to be able to have multiple versions of a dataset/metric processing file together
     importable_subdirectory = os.path.join(importable_directory_path, subdirectory_name)
-    importable_local_file = os.path.join(importable_subdirectory, name + ".py")
+    importable_file = os.path.join(importable_subdirectory, name + ".py")
     # Prevent parallel disk operations
-    lock_path = importable_directory_path + ".lock"
-    with FileLock(lock_path):
+    with lock_importable_file(importable_file):
         # Create main dataset/metrics folder if needed
         if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(importable_directory_path):
             shutil.rmtree(importable_directory_path)
@@ -406,13 +407,13 @@ def _copy_script_and_other_resources_in_importable_dir(
                 pass
 
         # Copy dataset.py file in hash folder if needed
-        if not os.path.exists(importable_local_file):
-            shutil.copyfile(original_local_path, importable_local_file)
+        if not os.path.exists(importable_file):
+            shutil.copyfile(original_local_path, importable_file)
         # Record metadata associating original dataset path with local unique folder
         # Use os.path.splitext to split extension from importable_local_file
-        meta_path = os.path.splitext(importable_local_file)[0] + ".json"
+        meta_path = os.path.splitext(importable_file)[0] + ".json"
         if not os.path.exists(meta_path):
-            meta = {"original file path": original_local_path, "local file path": importable_local_file}
+            meta = {"original file path": original_local_path, "local file path": importable_file}
             # the filename is *.py in our case, so better rename to filename.json instead of filename.py.json
             with open(meta_path, "w", encoding="utf-8") as meta_file:
                 json.dump(meta, meta_file)
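The comment above is worth making concrete: `os.path.splitext` strips only the final extension, so the sidecar metadata lands next to the script as `<name>.json` rather than `<name>.py.json` (path below is illustrative):

```python
import os.path

importable_file = "datasets/squad/0123abcd/squad.py"  # illustrative path
meta_path = os.path.splitext(importable_file)[0] + ".json"
assert meta_path == "datasets/squad/0123abcd/squad.json"
```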
@@ -437,7 +438,7 @@ def _copy_script_and_other_resources_in_importable_dir(
                     original_path, destination_additional_path
                 ):
                     shutil.copyfile(original_path, destination_additional_path)
-    return importable_local_file
+    return importable_file
 
 
 def _get_importable_file_path(
@@ -447,7 +448,7 @@ def _get_importable_file_path(
     name: str,
 ) -> str:
     importable_directory_path = os.path.join(dynamic_modules_path, module_namespace, name.replace("/", "--"))
-    return os.path.join(importable_directory_path, subdirectory_name, name + ".py")
+    return os.path.join(importable_directory_path, subdirectory_name, name.split("/")[-1] + ".py")
 
 
 def _create_importable_file(
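This small change fixes path construction for namespaced names: the directory component already encodes the namespace via `name.replace("/", "--")`, so keeping the raw `name` in the file component smuggled a path separator into the result. With a hypothetical `user/my_dataset`:

```python
import os

name = "user/my_dataset"  # illustrative namespaced dataset name
directory = os.path.join("datasets", name.replace("/", "--"), "0123abcd")

print(os.path.join(directory, name + ".py"))
# datasets/user--my_dataset/0123abcd/user/my_dataset.py  <- spurious subdirectory
print(os.path.join(directory, name.split("/")[-1] + ".py"))
# datasets/user--my_dataset/0123abcd/my_dataset.py       <- where the script actually lives
```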
@@ -692,6 +693,7 @@ class DatasetModule:
     builder_kwargs: dict
     builder_configs_parameters: BuilderConfigsParameters = field(default_factory=BuilderConfigsParameters)
     dataset_infos: Optional[DatasetInfosDict] = None
+    importable_file_path: Optional[str] = None
 
 
 @dataclass
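The new field defaults to `None`, so existing construction sites keep working; only script-backed factories pass it. A sketch with hypothetical values (the real ones come from the factories below):

```python
# Illustrative values only.
module = DatasetModule(
    module_path="datasets_modules.datasets.squad.0123abcd.squad",
    hash="0123abcd",
    builder_kwargs={"dataset_name": "squad"},
    importable_file_path="~/.cache/huggingface/modules/datasets_modules/datasets/squad/0123abcd/squad.py",
)
# Factories without an on-disk script simply omit importable_file_path,
# and get_dataset_builder_class() falls back to nullcontext().
```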
@@ -983,7 +985,7 @@ def get_module(self) -> DatasetModule:
         # make the new module to be noticed by the import system
         importlib.invalidate_caches()
         builder_kwargs = {"base_path": str(Path(self.path).parent)}
-        return DatasetModule(module_path, hash, builder_kwargs)
+        return DatasetModule(module_path, hash, builder_kwargs, importable_file_path=importable_file_path)
 
 
 class LocalDatasetModuleFactoryWithoutScript(_DatasetModuleFactory):
@@ -1536,7 +1538,7 @@ def get_module(self) -> DatasetModule:
             "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
             "repo_id": self.name,
         }
-        return DatasetModule(module_path, hash, builder_kwargs)
+        return DatasetModule(module_path, hash, builder_kwargs, importable_file_path=importable_file_path)
 
 
 class CachedDatasetModuleFactory(_DatasetModuleFactory):
@@ -1582,21 +1584,24 @@ def _get_modification_time(module_hash):
             if not config.HF_DATASETS_OFFLINE:
                 warning_msg += ", or remotely on the Hugging Face Hub."
             logger.warning(warning_msg)
-            # make the new module to be noticed by the import system
-            module_path = ".".join(
-                [
-                    os.path.basename(dynamic_modules_path),
-                    "datasets",
-                    self.name.replace("/", "--"),
-                    hash,
-                    self.name.split("/")[-1],
-                ]
+            importable_file_path = _get_importable_file_path(
+                dynamic_modules_path=dynamic_modules_path,
+                module_namespace="datasets",
+                subdirectory_name=hash,
+                name=self.name,
+            )
+            module_path, hash = _load_importable_file(
+                dynamic_modules_path=dynamic_modules_path,
+                module_namespace="datasets",
+                subdirectory_name=hash,
+                name=self.name,
             )
+            # make the new module to be noticed by the import system
             importlib.invalidate_caches()
             builder_kwargs = {
                 "repo_id": self.name,
             }
-            return DatasetModule(module_path, hash, builder_kwargs)
+            return DatasetModule(module_path, hash, builder_kwargs, importable_file_path=importable_file_path)
         cache_dir = os.path.expanduser(str(self.cache_dir or config.HF_DATASETS_CACHE))
         cached_datasets_directory_path_root = os.path.join(cache_dir, self.name.replace("/", "___"))
         cached_directory_paths = [
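The removed inline `".".join(...)` built the dotted module path by hand; `_load_importable_file` now centralizes that logic, with `_get_importable_file_path` as its filesystem twin. What the old code computed, for a hypothetical namespaced name:

```python
import os

dynamic_modules_path = "/home/user/.cache/huggingface/modules/datasets_modules"  # illustrative
name, hash = "user/my_dataset", "0123abcd"

module_path = ".".join(
    [os.path.basename(dynamic_modules_path), "datasets", name.replace("/", "--"), hash, name.split("/")[-1]]
)
assert module_path == "datasets_modules.datasets.user--my_dataset.0123abcd.my_dataset"
```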