2727from functools import partial
2828from typing import Dict , Mapping , Optional , Tuple , Union
2929
30- from datasets .features import Features
31- from datasets .utils .mock_download_manager import MockDownloadManager
32-
3330from . import config , utils
3431from .arrow_dataset import Dataset
3532from .arrow_reader import (
4239from .arrow_writer import ArrowWriter , BeamWriter
4340from .data_files import DataFilesDict , sanitize_patterns
4441from .dataset_dict import DatasetDict , IterableDatasetDict
42+ from .features import Features
4543from .fingerprint import Hasher
4644from .info import DatasetInfo , DatasetInfosDict , PostProcessedInfo
4745from .iterable_dataset import ExamplesIterable , IterableDataset , _generate_examples_from_tables_wrapper
4846from .naming import camelcase_to_snakecase , filename_prefix_for_split
4947from .splits import Split , SplitDict , SplitGenerator
5048from .utils import logging
5149from .utils .download_manager import DownloadManager , DownloadMode
52- from .utils .file_utils import DownloadConfig , is_remote_url
50+ from .utils .file_utils import DownloadConfig , cached_path , is_remote_url
5351from .utils .filelock import FileLock
5452from .utils .info_utils import get_size_checksum_dict , verify_checksums , verify_splits
53+ from .utils .mock_download_manager import MockDownloadManager
54+ from .utils .py_utils import (
55+ classproperty ,
56+ has_sufficient_disk_space ,
57+ map_nested ,
58+ memoize ,
59+ size_str ,
60+ temporary_assignment ,
61+ )
5562from .utils .streaming_download_manager import StreamingDownloadManager
5663
5764
@@ -389,9 +396,9 @@ def _create_builder_config(self, name=None, custom_features=None, **config_kwarg
389396
390397 return builder_config , config_id
391398
392- @utils . classproperty
399+ @classproperty
393400 @classmethod
394- @utils . memoize ()
401+ @memoize ()
395402 def builder_configs (cls ):
396403 """Pre-defined list of configurations for this builder class."""
397404 configs = {config .name : config for config in cls .BUILDER_CONFIGS }
@@ -537,9 +544,9 @@ def download_and_prepare(
537544 return
538545 logger .info (f"Generating dataset { self .name } ({ self ._cache_dir } )" )
539546 if not is_remote_url (self ._cache_dir_root ): # if cache dir is local, check for available space
540- if not utils . has_sufficient_disk_space (self .info .size_in_bytes or 0 , directory = self ._cache_dir_root ):
547+ if not has_sufficient_disk_space (self .info .size_in_bytes or 0 , directory = self ._cache_dir_root ):
541548 raise OSError (
542- f"Not enough disk space. Needed: { utils . size_str (self .info .size_in_bytes or 0 )} (download: { utils . size_str (self .info .download_size or 0 )} , generated: { utils . size_str (self .info .dataset_size or 0 )} , post-processed: { utils . size_str (self .info .post_processing_size or 0 )} )"
549+ f"Not enough disk space. Needed: { size_str (self .info .size_in_bytes or 0 )} (download: { size_str (self .info .download_size or 0 )} , generated: { size_str (self .info .dataset_size or 0 )} , post-processed: { size_str (self .info .post_processing_size or 0 )} )"
543550 )
544551
545552 @contextlib .contextmanager
@@ -565,9 +572,9 @@ def incomplete_dir(dirname):
565572 if self .info .size_in_bytes :
566573 print (
567574 f"Downloading and preparing dataset { self .info .builder_name } /{ self .info .config_name } "
568- f"(download: { utils . size_str (self .info .download_size )} , generated: { utils . size_str (self .info .dataset_size )} , "
569- f"post-processed: { utils . size_str (self .info .post_processing_size )} , "
570- f"total: { utils . size_str (self .info .size_in_bytes )} ) to { self ._cache_dir } ..."
575+ f"(download: { size_str (self .info .download_size )} , generated: { size_str (self .info .dataset_size )} , "
576+ f"post-processed: { size_str (self .info .post_processing_size )} , "
577+ f"total: { size_str (self .info .size_in_bytes )} ) to { self ._cache_dir } ..."
571578 )
572579 else :
573580 print (
@@ -580,7 +587,7 @@ def incomplete_dir(dirname):
580587 with incomplete_dir (self ._cache_dir ) as tmp_data_dir :
581588 # Temporarily assign _cache_dir to tmp_data_dir to avoid having to forward
582589 # it to every sub function.
583- with utils . temporary_assignment (self , "_cache_dir" , tmp_data_dir ):
590+ with temporary_assignment (self , "_cache_dir" , tmp_data_dir ):
584591 # Try to download the already prepared dataset files
585592 downloaded_from_gcs = False
586593 if try_from_hf_gcs :
@@ -637,7 +644,7 @@ def _download_prepared_from_hf_gcs(self, download_config: DownloadConfig):
637644 if os .sep in resource_file_name :
638645 raise ValueError (f"Resources shouldn't be in a sub-directory: { resource_file_name } " )
639646 try :
640- resource_path = utils . cached_path (remote_cache_dir + "/" + resource_file_name )
647+ resource_path = cached_path (remote_cache_dir + "/" + resource_file_name )
641648 shutil .move (resource_path , os .path .join (self ._cache_dir , resource_file_name ))
642649 except ConnectionError :
643650 logger .info (f"Couldn't download resourse file { resource_file_name } from Hf google storage." )
@@ -761,7 +768,7 @@ def as_dataset(
761768 split = {s : s for s in self .info .splits }
762769
763770 # Create a dataset for each of the given splits
764- datasets = utils . map_nested (
771+ datasets = map_nested (
765772 partial (
766773 self ._build_single_dataset ,
767774 run_post_process = run_post_process ,
@@ -903,7 +910,7 @@ def as_streaming_dataset(
903910 raise ValueError (f"Bad split: { split } . Available splits: { list (splits_generators )} " )
904911
905912 # Create a dataset for each of the given splits
906- datasets = utils . map_nested (
913+ datasets = map_nested (
907914 self ._as_streaming_dataset_single ,
908915 splits_generator ,
909916 map_tuple = True ,
@@ -1074,7 +1081,7 @@ def _prepare_split(self, split_generator, check_duplicate_keys):
10741081 check_duplicates = check_duplicate_keys ,
10751082 ) as writer :
10761083 try :
1077- for key , record in utils .tqdm (
1084+ for key , record in utils .tqdm_utils . tqdm (
10781085 generator ,
10791086 unit = " examples" ,
10801087 total = split_info .num_examples ,
@@ -1138,7 +1145,7 @@ def _prepare_split(self, split_generator):
11381145
11391146 generator = self ._generate_tables (** split_generator .gen_kwargs )
11401147 with ArrowWriter (features = self .info .features , path = fpath ) as writer :
1141- for key , table in utils .tqdm (
1148+ for key , table in utils .tqdm_utils . tqdm (
11421149 generator , unit = " tables" , leave = False , disable = True # not utils.is_progress_bar_enabled()
11431150 ):
11441151 writer .write_table (table )
0 commit comments