huggingface · mariosasko · Jul 12, 2023 · Jul 11, 2023 · Jul 11, 2023 · Jul 11, 2023
diff --git a/metrics/perplexity/perplexity.py b/metrics/perplexity/perplexity.py
@@ -71,8 +71,7 @@
         >>> perplexity = datasets.load_metric("perplexity")
         >>> input_texts = datasets.load_dataset("wikitext",
         ...                                     "wikitext-2-raw-v1",
-        ...                                     split="test")["text"][:50] # doctest:+ELLIPSIS
-        [...]
+        ...                                     split="test")["text"][:50]
         >>> input_texts = [s for s in input_texts if s!='']
         >>> results = perplexity.compute(model_id='gpt2',
         ...                              input_texts=input_texts) # doctest:+ELLIPSIS

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -1486,7 +1486,6 @@ def save_to_disk(
             disable=not logging.is_progress_bar_enabled(),
             unit=" examples",
             total=len(self),
-            leave=False,
             desc=f"Saving the dataset ({shards_done}/{num_shards} shards)",
         )
         kwargs_per_job = (
@@ -3070,15 +3069,14 @@ def load_processed_shard_from_cache(shard_kwargs):
             transformed_dataset = None
             try:
                 transformed_dataset = load_processed_shard_from_cache(dataset_kwargs)
-                logger.warning(f"Loading cached processed dataset at {dataset_kwargs['cache_file_name']}")
+                logger.info(f"Loading cached processed dataset at {dataset_kwargs['cache_file_name']}")
             except NonExistentDatasetError:
                 pass
             if transformed_dataset is None:
                 with logging.tqdm(
                     disable=not logging.is_progress_bar_enabled(),
                     unit=" examples",
                     total=pbar_total,
-                    leave=False,
                     desc=desc or "Map",
                 ) as pbar:
                     for rank, done, content in Dataset._map_single(**dataset_kwargs):
@@ -3171,7 +3169,6 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
                         disable=not logging.is_progress_bar_enabled(),
                         unit=" examples",
                         total=pbar_total,
-                        leave=False,
                         desc=(desc or "Map") + f" (num_proc={num_proc})",
                     ) as pbar:
                         for rank, done, content in iflatmap_unordered(
@@ -3187,7 +3184,7 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
                 for kwargs in kwargs_per_job:
                     del kwargs["shard"]
             else:
-                logger.warning(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}")
+                logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}")
             assert (
                 None not in transformed_shards
             ), f"Failed to retrieve results from map: result list {transformed_shards} still contains None - at least one worker failed to return its results"
@@ -4086,7 +4083,7 @@ def sort(
                 # we create a unique hash from the function, current dataset file and the mapping args
                 indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
             if os.path.exists(indices_cache_file_name) and load_from_cache_file:
-                logger.warning(f"Loading cached sorted indices for dataset at {indices_cache_file_name}")
+                logger.info(f"Loading cached sorted indices for dataset at {indices_cache_file_name}")
                 return self._new_dataset_with_indices(
                     fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
                 )
@@ -4228,7 +4225,7 @@ def shuffle(
                 # we create a unique hash from the function, current dataset file and the mapping args
                 indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
             if os.path.exists(indices_cache_file_name) and load_from_cache_file:
-                logger.warning(f"Loading cached shuffled indices for dataset at {indices_cache_file_name}")
+                logger.info(f"Loading cached shuffled indices for dataset at {indices_cache_file_name}")
                 return self._new_dataset_with_indices(
                     fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
                 )
@@ -4459,7 +4456,7 @@ def train_test_split(
                 and os.path.exists(test_indices_cache_file_name)
                 and load_from_cache_file
             ):
-                logger.warning(
+                logger.info(
                     f"Loading cached split indices for dataset at {train_indices_cache_file_name} and {test_indices_cache_file_name}"
                 )
                 return DatasetDict(
@@ -5273,7 +5270,7 @@ def path_in_repo(_index, shard):
         first_shard = next(shards_iter)
         first_shard_path_in_repo = path_in_repo(0, first_shard)
         if first_shard_path_in_repo in data_files and num_shards < len(data_files):
-            logger.warning("Resuming upload of the dataset shards.")
+            logger.info("Resuming upload of the dataset shards.")
 
         uploaded_size = 0
         shards_path_in_repo = []
@@ -5450,7 +5447,7 @@ def push_to_hub(
             repo_info = None
         # update the total info to dump from existing info
         if repo_info is not None:
-            logger.warning("Updating downloaded metadata with the new split.")
+            logger.info("Updating downloaded metadata with the new split.")
             if repo_info.splits and list(repo_info.splits) != [split]:
                 if self._info.features != repo_info.features:
                     raise ValueError(

diff --git a/src/datasets/builder.py b/src/datasets/builder.py
@@ -410,7 +410,7 @@ def __init__(
                             self.info = DatasetInfo.from_directory(self._cache_dir)
                     else:  # dir exists but no data, remove the empty dir as data aren't available anymore
                         logger.warning(
-                            f"Old caching folder {self._cache_dir} for dataset {self.name} exists but no data were found. Removing it. "
+                            f"Old caching folder {self._cache_dir} for dataset {self.name} exists but no data were found. Removing it."
                         )
                         os.rmdir(self._cache_dir)
 
@@ -490,7 +490,7 @@ def _create_builder_config(
         if config_name is None and self.BUILDER_CONFIGS and not config_kwargs:
             if self.DEFAULT_CONFIG_NAME is not None:
                 builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME)
-                logger.warning(f"No config specified, defaulting to: {self.name}/{builder_config.name}")
+                logger.info(f"No config specified, defaulting to: {self.name}/{builder_config.name}")
             else:
                 if len(self.BUILDER_CONFIGS) > 1:
                     example_of_usage = f"load_dataset('{self.name}', '{self.BUILDER_CONFIGS[0].name}')"
@@ -843,7 +843,7 @@ def download_and_prepare(
             path_join = os.path.join if is_local else posixpath.join
             data_exists = self._fs.exists(path_join(self._output_dir, config.DATASET_INFO_FILENAME))
             if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
-                logger.warning(f"Found cached dataset {self.name} ({self._output_dir})")
+                logger.info(f"Found cached dataset {self.name} ({self._output_dir})")
                 # We need to update the info in case some splits were added in the meantime
                 # for example when calling load_dataset from multiple workers.
                 self.info = self._load_info()
@@ -882,15 +882,15 @@ def incomplete_dir(dirname):
             # information needed to cancel download/preparation if needed.
             # This comes right before the progress bar.
             if self.info.size_in_bytes:
-                print(
+                logger.info(
                     f"Downloading and preparing dataset {self.info.builder_name}/{self.info.config_name} "
                     f"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, "
                     f"post-processed: {size_str(self.info.post_processing_size)}, "
                     f"total: {size_str(self.info.size_in_bytes)}) to {self._output_dir}..."
                 )
             else:
                 _dest = self._fs._strip_protocol(self._output_dir) if is_local else self._output_dir
-                print(
+                logger.info(
                     f"Downloading and preparing dataset {self.info.builder_name}/{self.info.config_name} to {_dest}..."
                 )
 
@@ -933,7 +933,7 @@ def incomplete_dir(dirname):
             # Download post processing resources
             self.download_post_processing_resources(dl_manager)
 
-            print(
+            logger.info(
                 f"Dataset {self.name} downloaded and prepared to {self._output_dir}. "
                 f"Subsequent calls will reuse this data."
             )
@@ -1162,7 +1162,7 @@ def as_dataset(
             ),
             split,
             map_tuple=True,
-            disable_tqdm=not logging.is_progress_bar_enabled(),
+            disable_tqdm=True,
         )
         if isinstance(datasets, dict):
             datasets = DatasetDict(datasets)
@@ -1490,7 +1490,7 @@ def _prepare_split(
                 )
                 num_proc = 1
             elif num_proc is not None and num_input_shards < num_proc:
-                logger.info(
+                logger.warning(
                     f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
                 )
                 num_proc = num_input_shards
@@ -1499,7 +1499,6 @@ def _prepare_split(
             disable=not logging.is_progress_bar_enabled(),
             unit=" examples",
             total=split_info.num_examples,
-            leave=False,
             desc=f"Generating {split_info.name} split",
         )
 
@@ -1751,7 +1750,7 @@ def _prepare_split(
                 )
                 num_proc = 1
             elif num_proc is not None and num_input_shards < num_proc:
-                logger.info(
+                logger.warning(
                     f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
                 )
                 num_proc = num_input_shards
@@ -1760,7 +1759,6 @@ def _prepare_split(
             disable=not logging.is_progress_bar_enabled(),
             unit=" examples",
             total=split_info.num_examples,
-            leave=False,
             desc=f"Generating {split_info.name} split",
         )
 

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
@@ -1628,7 +1628,7 @@ def push_to_hub(
                 raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.")
 
         for split in self.keys():
-            logger.warning(f"Pushing split {split} to the Hub.")
+            logger.info(f"Pushing split {split} to the Hub.")
             # The split=key needs to be removed before merging
             repo_id, split, uploaded_size, dataset_nbytes, _, _ = self[split]._push_parquet_shards_to_hub(
                 repo_id,

diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
@@ -1233,7 +1233,7 @@ def _iter_pytorch(self):
                 f"Too many dataloader workers: {worker_info.num_workers} (max is dataset.n_shards={ex_iterable.n_shards}). "
                 f"Stopping {worker_info.num_workers - ex_iterable.n_shards} dataloader workers."
             )
-            logger.warning(
+            logger.info(
                 f"To parallelize data loading, we give each process some shards (or data sources) to process. "
                 f"Therefore it's unnecessary to have a number of workers greater than dataset.n_shards={ex_iterable.n_shards}. "
                 f"To enable more parallelism, please split the dataset in more files than {ex_iterable.n_shards}."
@@ -1304,13 +1304,13 @@ def _prepare_ex_iterable_for_iteration(self) -> _BaseExamplesIterable:
                 if self._is_main_process():
                     n_shards_per_node = ex_iterable.n_shards // world_size
                     plural = "s" if n_shards_per_node > 1 else ""
-                    logger.warning(
+                    logger.info(
                         f"Assigning {n_shards_per_node} shard{plural} (or data source{plural}) of the dataset to each node."
                     )
                 ex_iterable = ex_iterable.shard_data_sources(rank, world_size)
             else:
                 if self._is_main_process():
-                    logger.warning(
+                    logger.info(
                         f"Assigning 1 out of {world_size} examples of the dataset to each node. The others are skipped during the iteration."
                     )
                     logger.info(

diff --git a/src/datasets/utils/logging.py b/src/datasets/utils/logging.py
@@ -69,6 +69,7 @@ def _get_library_root_logger() -> logging.Logger:
 def _configure_library_root_logger() -> None:
     # Apply our default configuration to the library root logger.
     library_root_logger = _get_library_root_logger()
+    library_root_logger.addHandler(logging.StreamHandler())
     library_root_logger.setLevel(_get_default_logging_level())
 
 

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
@@ -48,7 +48,7 @@
     Summarization,
     TextClassification,
 )
-from datasets.utils.logging import WARNING
+from datasets.utils.logging import INFO, get_logger
 from datasets.utils.py_utils import temp_seed
 
 from .utils import (
@@ -1320,7 +1320,7 @@ def test_map_fn_kwargs(self, in_memory):
     def test_map_caching(self, in_memory):
         with tempfile.TemporaryDirectory() as tmp_dir:
             self._caplog.clear()
-            with self._caplog.at_level(WARNING):
+            with self._caplog.at_level(INFO, logger=get_logger().name):
                 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
                     with patch(
                         "datasets.arrow_dataset.Dataset._map_single",
@@ -1338,7 +1338,7 @@ def test_map_caching(self, in_memory):
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             self._caplog.clear()
-            with self._caplog.at_level(WARNING):
+            with self._caplog.at_level(INFO, logger=get_logger().name):
                 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
                     with dset.map(lambda x: {"foo": "bar"}) as dset_test1:
                         dset_test1_data_files = list(dset_test1.cache_files)
@@ -1349,7 +1349,7 @@ def test_map_caching(self, in_memory):
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             self._caplog.clear()
-            with self._caplog.at_level(WARNING):
+            with self._caplog.at_level(INFO, logger=get_logger().name):
                 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
                     with patch(
                         "datasets.arrow_dataset.Pool",
@@ -1369,7 +1369,7 @@ def test_map_caching(self, in_memory):
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             self._caplog.clear()
-            with self._caplog.at_level(WARNING):
+            with self._caplog.at_level(INFO, logger=get_logger().name):
                 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
                     with dset.map(lambda x: {"foo": "bar"}, num_proc=2) as dset_test1:
                         dset_test1_data_files = list(dset_test1.cache_files)
@@ -1382,7 +1382,7 @@ def test_map_caching(self, in_memory):
             try:
                 self._caplog.clear()
                 with tempfile.TemporaryDirectory() as tmp_dir:
-                    with self._caplog.at_level(WARNING):
+                    with self._caplog.at_level(INFO, logger=get_logger().name):
                         with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
                             datasets.disable_caching()
                             with dset.map(lambda x: {"foo": "bar"}) as dset_test1:
@@ -1733,7 +1733,7 @@ def test_filter_multiprocessing(self, in_memory):
     def test_filter_caching(self, in_memory):
         with tempfile.TemporaryDirectory() as tmp_dir:
             self._caplog.clear()
-            with self._caplog.at_level(WARNING):
+            with self._caplog.at_level(INFO, logger=get_logger().name):
                 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
                     with dset.filter(lambda x, i: i < 5, with_indices=True) as dset_filter_first_five1:
                         dset_test1_data_files = list(dset_filter_first_five1.cache_files)

diff --git a/tests/test_builder.py b/tests/test_builder.py
@@ -27,6 +27,7 @@
 from datasets.streaming import xjoin
 from datasets.utils.file_utils import is_local_path
 from datasets.utils.info_utils import VerificationMode
+from datasets.utils.logging import INFO, get_logger
 
 from .utils import (
     assert_arrow_memory_doesnt_increase,
@@ -1060,7 +1061,8 @@ def test_builder_with_filesystem_download_and_prepare_reload(tmp_path, mockfs, c
     DatasetInfo().write_to_directory("mock://my_dataset", storage_options=mockfs.storage_options)
     mockfs.touch(f"my_dataset/{builder.name}-train.arrow")
     caplog.clear()
-    builder.download_and_prepare("mock://my_dataset", storage_options=mockfs.storage_options)
+    with caplog.at_level(INFO, logger=get_logger().name):
+        builder.download_and_prepare("mock://my_dataset", storage_options=mockfs.storage_options)
     assert "Found cached dataset" in caplog.text
 
 

diff --git a/tests/test_load.py b/tests/test_load.py
@@ -36,6 +36,7 @@
     infer_module_for_data_files,
     infer_module_for_data_files_in_archives,
 )
+from datasets.utils.logging import INFO, get_logger
 
 from .utils import (
     OfflineSimulationMode,
@@ -965,7 +966,8 @@ def test_load_dataset_then_move_then_reload(dataset_loading_script_dir, data_dir
     del dataset
     os.rename(cache_dir1, cache_dir2)
     caplog.clear()
-    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=cache_dir2)
+    with caplog.at_level(INFO, logger=get_logger().name):
+        dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=cache_dir2)
     assert "Found cached dataset" in caplog.text
     assert dataset._fingerprint == fingerprint1, "for the caching mechanism to work, fingerprint should stay the same"
     dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="test", cache_dir=cache_dir2)