Merged
19 changes: 11 additions & 8 deletions src/datasets/arrow_dataset.py
@@ -69,18 +69,17 @@
list_table_cache_files,
)
from .tasks import TaskTemplate
- from .utils import map_nested
+ from .utils import logging, map_nested
from .utils.deprecation_utils import deprecated
from .utils.file_utils import estimate_dataset_size
from .utils.info_utils import is_small_dataset
- from .utils.logging import WARNING, get_logger, get_verbosity, set_verbosity_warning
from .utils.typing import PathLike


if TYPE_CHECKING:
from .dataset_dict import DatasetDict

- logger = get_logger(__name__)
+ logger = logging.get_logger(__name__)

if int(config.PYARROW_VERSION.split(".")[0]) == 0:
PYARROW_V0 = True
@@ -1763,11 +1762,9 @@ def _map_single(
not keep_in_memory or cache_file_name is None
), "Please use either `keep_in_memory` or `cache_file_name` but not both."

- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
-
# Reduce logging to keep things readable in multiprocessing with tqdm
- if rank is not None and get_verbosity() < WARNING:
- set_verbosity_warning()
+ if rank is not None and logging.get_verbosity() < logging.WARNING:
+ logging.set_verbosity_warning()
# Print at least one thing to fix tqdm in notebooks in multiprocessing
# see https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
if rank is not None and "notebook" in tqdm.__name__:
@@ -1934,7 +1931,13 @@ def init_buffer_and_writer():
pbar_iterable = input_dataset if not batched else range(0, len(input_dataset), batch_size)
pbar_unit = "ex" if not batched else "ba"
pbar_desc = (desc or "") + " #" + str(rank) if rank is not None else desc
- pbar = tqdm(pbar_iterable, disable=not_verbose, position=rank, unit=pbar_unit, desc=pbar_desc)
+ pbar = tqdm(
+ pbar_iterable,
+ disable=bool(logging.get_verbosity() == logging.NOTSET),
+ position=rank,
+ unit=pbar_unit,
+ desc=pbar_desc,
+ )
if not batched:
for i, example in enumerate(pbar):
example = apply_function_on_filtered_inputs(example, i, offset=offset)
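The recurring change in this PR replaces the old "logger.getEffectiveLevel() > WARNING" check with "logging.get_verbosity() == logging.NOTSET" when deciding whether to hide a tqdm bar. A self-contained sketch of the behavioural difference (not part of the diff; old_disable and new_disable are illustrative names, and the level constants are the stdlib ones that datasets.utils.logging also exposes):

    from logging import ERROR, NOTSET, WARNING

    def old_disable(effective_level: int) -> bool:
        # Pre-PR behaviour: hide progress bars whenever the logger sits above WARNING.
        return bool(effective_level > WARNING)

    def new_disable(verbosity: int) -> bool:
        # Post-PR behaviour: hide progress bars only when the verbosity is NOTSET.
        return bool(verbosity == NOTSET)

    assert old_disable(ERROR) is True   # bars used to vanish at ERROR verbosity
    assert new_disable(ERROR) is False  # they now stay visible at ERROR
    assert new_disable(NOTSET) is True  # and disappear only at NOTSET
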
10 changes: 5 additions & 5 deletions src/datasets/arrow_writer.py
@@ -27,11 +27,11 @@
from .features import Features, _ArrayXDExtensionType
from .info import DatasetInfo
from .keyhash import DuplicatedKeysError, KeyHasher
+ from .utils import logging
from .utils.file_utils import hash_url_to_filename
- from .utils.logging import WARNING, get_logger


- logger = get_logger(__name__)
+ logger = logging.get_logger(__name__)

type_ = type # keep python's type function

@@ -536,11 +536,11 @@ def finalize(self, metrics_query_result: dict):
def parquet_to_arrow(sources, destination):
"""Convert parquet files to arrow file. Inputs can be str paths or file-like objects"""
stream = None if isinstance(destination, str) else destination
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
+ disable = bool(logging.get_verbosity() == logging.NOTSET)
with ArrowWriter(path=destination, stream=stream) as writer:
- for source in tqdm(sources, unit="sources", disable=not_verbose):
+ for source in tqdm(sources, unit="sources", disable=disable):
pf = pa.parquet.ParquetFile(source)
- for i in tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=not_verbose):
+ for i in tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=disable):
df = pf.read_row_group(i).to_pandas()
for col in df.columns:
df[col] = df[col].apply(json.loads)
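For context, a hedged usage sketch of parquet_to_arrow as defined in this file (the shard paths are hypothetical; per the body shown above, every column is run through json.loads, so the parquet sources are expected to hold JSON-encoded values):

    from datasets.arrow_writer import parquet_to_arrow

    # Hypothetical shard names; sources may also be file-like objects per the docstring.
    sources = ["shard-00000.parquet", "shard-00001.parquet"]
    parquet_to_arrow(sources, "dataset.arrow")
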
16 changes: 10 additions & 6 deletions src/datasets/builder.py
@@ -39,14 +39,14 @@
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
from .naming import camelcase_to_snakecase, filename_prefix_for_split
from .splits import Split, SplitDict, SplitGenerator
+ from .utils import logging
from .utils.download_manager import DownloadManager, GenerateMode
from .utils.file_utils import DownloadConfig, is_remote_url
from .utils.filelock import FileLock
from .utils.info_utils import get_size_checksum_dict, verify_checksums, verify_splits
- from .utils.logging import WARNING, get_logger


- logger = get_logger(__name__)
+ logger = logging.get_logger(__name__)


class InvalidConfigName(ValueError):
@@ -990,7 +990,6 @@ def _prepare_split(self, split_generator):
fpath = os.path.join(self._cache_dir, fname)

generator = self._generate_examples(**split_generator.gen_kwargs)
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)

with ArrowWriter(
features=self.info.features,
@@ -1001,7 +1000,11 @@
) as writer:
try:
for key, record in utils.tqdm(
generator, unit=" examples", total=split_info.num_examples, leave=False, disable=not_verbose
generator,
unit=" examples",
total=split_info.num_examples,
leave=False,
disable=bool(logging.get_verbosity() == logging.NOTSET),
):
example = self.info.features.encode_example(record)
writer.write(example, key)
@@ -1053,9 +1056,10 @@ def _prepare_split(self, split_generator):
fpath = os.path.join(self._cache_dir, fname)

generator = self._generate_tables(**split_generator.gen_kwargs)
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
with ArrowWriter(features=self.info.features, path=fpath) as writer:
- for key, table in utils.tqdm(generator, unit=" tables", leave=False, disable=not_verbose):
+ for key, table in utils.tqdm(
+ generator, unit=" tables", leave=False, disable=bool(logging.get_verbosity() == logging.NOTSET)
+ ):
writer.write_table(table)
num_examples, num_bytes = writer.finalize()

10 changes: 4 additions & 6 deletions src/datasets/search.py
@@ -7,7 +7,7 @@
import numpy as np
from tqdm.auto import tqdm

- from .utils.logging import WARNING, get_logger
+ from .utils import logging


if TYPE_CHECKING:
@@ -28,7 +28,7 @@
_has_faiss = importlib.util.find_spec("faiss") is not None


- logger = get_logger(__name__)
+ logger = logging.get_logger(__name__)


class MissingIndex(Exception):
@@ -141,8 +141,7 @@ def add_documents(self, documents: Union[List[str], "Dataset"], column: Optional
index_config = self.es_index_config
self.es_client.indices.create(index=index_name, body=index_config)
number_of_docs = len(documents)
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
- progress = tqdm(unit="docs", total=number_of_docs, disable=not_verbose)
+ progress = tqdm(unit="docs", total=number_of_docs, disable=bool(logging.get_verbosity() == logging.NOTSET))
successes = 0

def passage_generator():
@@ -275,8 +274,7 @@ def add_vectors(

# Add vectors
logger.info("Adding {} vectors to the faiss index".format(len(vectors)))
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
- for i in tqdm(range(0, len(vectors), batch_size), disable=not_verbose):
+ for i in tqdm(range(0, len(vectors), batch_size), disable=bool(logging.get_verbosity() == logging.NOTSET)):
vecs = vectors[i : i + batch_size] if column is None else vectors[i : i + batch_size][column]
self.faiss_index.add(vecs)

7 changes: 3 additions & 4 deletions src/datasets/utils/file_utils.py
@@ -31,11 +31,11 @@
from tqdm.auto import tqdm

from .. import __version__, config
+ from . import logging
from .filelock import FileLock
- from .logging import WARNING, get_logger


- logger = get_logger(__name__) # pylint: disable=invalid-name
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name

INCOMPLETE_SUFFIX = ".incomplete"

@@ -492,14 +492,13 @@ def http_get(url, temp_file, proxies=None, resume_size=0, headers=None, cookies=
return
content_length = response.headers.get("Content-Length")
total = resume_size + int(content_length) if content_length is not None else None
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
progress = tqdm(
unit="B",
unit_scale=True,
total=total,
initial=resume_size,
desc="Downloading",
- disable=not_verbose,
+ disable=bool(logging.get_verbosity() == logging.NOTSET),
)
for chunk in response.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
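For context, the same gating expression recurs in builder.py, search.py and file_utils.py above. A minimal sketch of how new code in the library could follow the pattern (copy_rows and its arguments are hypothetical):

    from tqdm.auto import tqdm

    from datasets.utils import logging

    def copy_rows(rows):
        # Hypothetical helper following the pattern from this PR: the bar is
        # shown unless the datasets verbosity is NOTSET.
        disable = bool(logging.get_verbosity() == logging.NOTSET)
        return [row for row in tqdm(rows, unit="rows", disable=disable)]

    copied = copy_rows(range(5))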