From 443651b302ffcc18e0013edd46be5b2202f3e9ab Mon Sep 17 00:00:00 2001
From: mariosasko
Date: Sat, 17 Jul 2021 17:02:53 +0200
Subject: [PATCH 1/4] Use tqdm from tqdm_utils

---
 datasets/ncslgr/ncslgr.py        |  4 +---
 src/datasets/__init__.py         |  1 -
 src/datasets/arrow_dataset.py    | 11 +++++++----
 src/datasets/arrow_writer.py     |  7 +++----
 src/datasets/io/json.py          |  9 ++++++---
 src/datasets/search.py           | 10 +++++++---
 src/datasets/utils/__init__.py   |  2 +-
 src/datasets/utils/file_utils.py |  5 ++---
 src/datasets/utils/py_utils.py   | 25 +++++++++++++++----------
 src/datasets/utils/tqdm_utils.py | 11 ++++++++---
 10 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/datasets/ncslgr/ncslgr.py b/datasets/ncslgr/ncslgr.py
index b1c583ae3a0..8fed49f87ac 100644
--- a/datasets/ncslgr/ncslgr.py
+++ b/datasets/ncslgr/ncslgr.py
@@ -19,8 +19,6 @@
 import re
 from dataclasses import dataclass
 
-from tqdm import tqdm
-
 import datasets
 
 
@@ -128,7 +126,7 @@ def get_tier_values(name: str):
 
     def _generate_examples(self, eaf_path: str, videos_path: str):
         """Yields examples."""
-        for i, eaf_file in enumerate(tqdm(os.listdir(eaf_path))):
+        for i, eaf_file in enumerate(os.listdir(eaf_path)):
             eaf_file_path = os.path.join(eaf_path, eaf_file)
             videos = []
             with open(eaf_file_path, "r", encoding="utf-8") as f:
diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
index ef9256a824e..84253d0d6ab 100644
--- a/src/datasets/__init__.py
+++ b/src/datasets/__init__.py
@@ -74,7 +74,6 @@
     percent,
 )
 from .utils import *
-from .utils.tqdm_utils import disable_progress_bar
 
 SCRIPTS_VERSION = "master" if __version__.split(".")[-1].startswith("dev") else __version__
 
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 8c2041c1c5c..20967463945 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -41,7 +41,7 @@
 
 from datasets.tasks.text_classification import TextClassification
 
-from . import config
+from . import config, utils
 from .arrow_reader import ArrowReader
 from .arrow_writer import ArrowWriter, OptimizedTypedSequence
 from .features import ClassLabel, Features, Value, cast_to_python_objects
@@ -1701,7 +1701,10 @@ def format_cache_file_name(cache_file_name, rank):
             ):
                 logger.warning("Setting TOKENIZERS_PARALLELISM=false for forked processes.")
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
-            with Pool(num_proc, initargs=(RLock(),), initializer=tqdm.set_lock) as pool:
+            initargs, initializer = None, None
+            if utils.is_progress_bar_enabled():
+                initargs, initializer = (RLock(),), tqdm.set_lock
+            with Pool(num_proc, initargs=initargs, initializer=initializer) as pool:
                 os.environ = prev_env
                 shards = [
                     self.shard(num_shards=num_proc, index=rank, contiguous=True, keep_in_memory=keep_in_memory)
@@ -1812,7 +1815,7 @@ def _map_single(
             logging.set_verbosity_warning()
         # Print at least one thing to fix tqdm in notebooks in multiprocessing
         # see https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
-        if rank is not None and "notebook" in tqdm.__name__:
+        if rank is not None and utils.is_progress_bar_enabled() and "notebook" in tqdm.__name__:
             print(" ", end="", flush=True)
 
         # Select the columns (arrow columns) to process
@@ -1976,7 +1979,7 @@ def init_buffer_and_writer():
             pbar_iterable = input_dataset if not batched else range(0, len(input_dataset), batch_size)
             pbar_unit = "ex" if not batched else "ba"
             pbar_desc = (desc or "") + " #" + str(rank) if rank is not None else desc
-            pbar = tqdm(
+            pbar = utils.tqdm(
                 pbar_iterable,
                 disable=bool(logging.get_verbosity() == logging.NOTSET),
                 position=rank,
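Note: the `initargs`/`initializer` hunk above builds on tqdm's documented multiprocessing pattern, in which the parent process creates a lock and hands it to every worker so concurrent bars do not interleave mid-line. A minimal standalone sketch of that pattern (illustrative names, not part of this patch):

    from multiprocessing import Pool, RLock

    from tqdm.auto import tqdm


    def work(rank):
        # Each worker draws its bar on its own row via `position`.
        for _ in tqdm(range(10_000), position=rank, desc="#" + str(rank)):
            pass


    if __name__ == "__main__":
        # `initializer` runs once in each worker at startup; tqdm.set_lock
        # stores the parent's RLock so all bar updates are serialized.
        # The patch skips this setup entirely when progress bars are disabled.
        with Pool(4, initargs=(RLock(),), initializer=tqdm.set_lock) as pool:
            pool.map(work, range(4))
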
diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 327d4a24743..d33fea4042d 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -21,9 +21,8 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import pyarrow as pa
-from tqdm.auto import tqdm
 
-from . import config
+from . import config, utils
 from .features import Features, _ArrayXDExtensionType
 from .info import DatasetInfo
 from .keyhash import DuplicatedKeysError, KeyHasher
@@ -538,9 +537,9 @@ def parquet_to_arrow(sources, destination):
     stream = None if isinstance(destination, str) else destination
     disable = bool(logging.get_verbosity() == logging.NOTSET)
     with ArrowWriter(path=destination, stream=stream) as writer:
-        for source in tqdm(sources, unit="sources", disable=disable):
+        for source in utils.tqdm(sources, unit="sources", disable=disable):
             pf = pa.parquet.ParquetFile(source)
-            for i in tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=disable):
+            for i in utils.tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=disable):
                 df = pf.read_row_group(i).to_pandas()
                 for col in df.columns:
                     df[col] = df[col].apply(json.loads)
diff --git a/src/datasets/io/json.py b/src/datasets/io/json.py
index fc6703b3297..ece231b5a2a 100644
--- a/src/datasets/io/json.py
+++ b/src/datasets/io/json.py
@@ -1,10 +1,11 @@
 import os
+from logging import disable
 from typing import BinaryIO, Optional, Union
 
-from .. import Dataset, Features, NamedSplit, config
+from .. import Dataset, Features, NamedSplit, config, utils
 from ..formatting import query_table
 from ..packaged_modules.json.json import Json
-from ..utils.tqdm_utils import tqdm
+from ..utils import logging
 from ..utils.typing import NestedDataStructureLike, PathLike
 from .abc import AbstractDatasetReader
@@ -96,7 +97,9 @@ def _write(
         written = 0
         _ = to_json_kwargs.pop("path_or_buf", None)
 
-        for offset in tqdm(range(0, len(self.dataset), batch_size)):
+        for offset in utils.tqdm(
+            range(0, len(self.dataset), batch_size), unit="ba", disable=bool(logging.get_verbosity() == logging.NOTSET)
+        ):
             batch = query_table(
                 table=self.dataset.data,
                 key=slice(offset, offset + batch_size),
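Note: the `_write` hunk above serializes the dataset in fixed-size slices so memory stays bounded, ticking the bar once per batch (`unit="ba"`). The same loop shape in isolation, with a hypothetical `write_batch` callback standing in for the JSON serialization (a sketch, not the library's API):

    from tqdm.auto import tqdm


    def export_in_batches(records, write_batch, batch_size=10_000, show_progress=True):
        # One bar tick per batch; disable=True turns tqdm into a
        # transparent wrapper around the underlying range.
        for offset in tqdm(range(0, len(records), batch_size), unit="ba", disable=not show_progress):
            write_batch(records[offset : offset + batch_size])
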
diff --git a/src/datasets/search.py b/src/datasets/search.py
index d4cd28a9bfc..08f6949f6f0 100644
--- a/src/datasets/search.py
+++ b/src/datasets/search.py
@@ -5,8 +5,8 @@
 from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Union
 
 import numpy as np
-from tqdm.auto import tqdm
 
+from . import utils
 from .utils import logging
 
 
@@ -141,7 +141,9 @@ def add_documents(self, documents: Union[List[str], "Dataset"], column: Optional
             index_config = self.es_index_config
         self.es_client.indices.create(index=index_name, body=index_config)
         number_of_docs = len(documents)
-        progress = tqdm(unit="docs", total=number_of_docs, disable=bool(logging.get_verbosity() == logging.NOTSET))
+        progress = utils.tqdm(
+            unit="docs", total=number_of_docs, disable=bool(logging.get_verbosity() == logging.NOTSET)
+        )
         successes = 0
 
         def passage_generator():
@@ -287,7 +289,9 @@ def add_vectors(
 
         # Add vectors
         logger.info("Adding {} vectors to the faiss index".format(len(vectors)))
-        for i in tqdm(range(0, len(vectors), batch_size), disable=bool(logging.get_verbosity() == logging.NOTSET)):
+        for i in utils.tqdm(
+            range(0, len(vectors), batch_size), disable=bool(logging.get_verbosity() == logging.NOTSET)
+        ):
             vecs = vectors[i : i + batch_size] if column is None else vectors[i : i + batch_size][column]
             self.faiss_index.add(vecs)
 
diff --git a/src/datasets/utils/__init__.py b/src/datasets/utils/__init__.py
index 623f4c955b6..5f3165cb489 100644
--- a/src/datasets/utils/__init__.py
+++ b/src/datasets/utils/__init__.py
@@ -35,5 +35,5 @@
     zip_dict,
     zip_nested,
 )
-from .tqdm_utils import async_tqdm, tqdm
+from .tqdm_utils import async_tqdm, disable_progress_bar, is_progress_bar_enabled, tqdm
 from .version import Version
diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index ee6cf8fa2bd..c40bb07dd7d 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -24,9 +24,8 @@
 import numpy as np
 import posixpath
 import requests
-from tqdm.auto import tqdm
 
-from .. import __version__, config
+from .. import __version__, config, utils
 from . import logging
 from .extract import ExtractManager
 from .filelock import FileLock
@@ -431,7 +430,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, headers=None, cookies=
         return
     content_length = response.headers.get("Content-Length")
     total = resume_size + int(content_length) if content_length is not None else None
-    progress = tqdm(
+    progress = utils.tqdm(
         unit="B",
         unit_scale=True,
         total=total,
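Note: the `http_get` hunk above keeps the usual streaming-download shape: a byte-scaled bar (`unit="B"`, `unit_scale=True`) updated once per received chunk. A generic sketch of that shape (not the function's actual body):

    import requests
    from tqdm.auto import tqdm


    def download(url, path, chunk_size=1024 * 1024):
        response = requests.get(url, stream=True)
        content_length = response.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None
        # unit_scale renders human-readable sizes (kB, MB, ...) as bytes accumulate.
        with tqdm(unit="B", unit_scale=True, total=total) as progress, open(path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                progress.update(len(chunk))
                f.write(chunk)
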
diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py
index f4cf169a4ab..6041fe1c13a 100644
--- a/src/datasets/utils/py_utils.py
+++ b/src/datasets/utils/py_utils.py
@@ -33,9 +33,10 @@
 
 import dill
 import numpy as np
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
-from .logging import INFO, WARNING, get_logger, get_verbosity, set_verbosity_warning
+from .. import utils
+from . import logging
 
 try:  # pragma: no branch
@@ -45,7 +46,7 @@
     _typing_extensions = Literal = Final = None
 
 
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
 
 
 # NOTE: When used on an instance method, the cache is shared across all
@@ -142,17 +143,17 @@ def _single_map_nested(args):
         return function(data_struct)
 
     # Reduce logging to keep things readable in multiprocessing with tqdm
-    if rank is not None and get_verbosity() < WARNING:
-        set_verbosity_warning()
+    if rank is not None and logging.get_verbosity() < logging.WARNING:
+        logging.set_verbosity_warning()
     # Print at least one thing to fix tqdm in notebooks in multiprocessing
     # see https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
-    if rank is not None and "notebook" in tqdm.__name__:
+    if rank is not None and utils.is_progress_bar_enabled() and "notebook" in tqdm.__name__:
         print(" ", end="", flush=True)
 
     # Loop over single examples or batches and write to buffer/file if examples are to be updated
     pbar_iterable = data_struct.items() if isinstance(data_struct, dict) else data_struct
     pbar_desc = "#" + str(rank) if rank is not None else None
-    pbar = tqdm(pbar_iterable, disable=disable_tqdm, position=rank, unit="obj", desc=pbar_desc)
+    pbar = utils.tqdm(pbar_iterable, disable=disable_tqdm, position=rank, unit="obj", desc=pbar_desc)
 
     if isinstance(data_struct, dict):
         return {k: _single_map_nested((function, v, types, None, True)) for k, v in pbar}
@@ -194,14 +195,15 @@ def map_nested(
     if not isinstance(data_struct, dict) and not isinstance(data_struct, types):
         return function(data_struct)
 
-    disable_tqdm = bool(logger.getEffectiveLevel() > INFO)
+    disable_tqdm = bool(logging.get_verbosity() == logging.NOTSET)
     iterable = list(data_struct.values()) if isinstance(data_struct, dict) else data_struct
 
     if num_proc is None:
         num_proc = 1
     if num_proc <= 1 or len(iterable) <= num_proc:
         mapped = [
-            _single_map_nested((function, obj, types, None, True)) for obj in tqdm(iterable, disable=disable_tqdm)
+            _single_map_nested((function, obj, types, None, True))
+            for obj in utils.tqdm(iterable, disable=disable_tqdm)
         ]
     else:
         split_kwds = []  # We organize the splits ourselve (contiguous splits)
@@ -221,7 +223,10 @@
                 num_proc, len(iterable), [len(i[1]) for i in split_kwds]
             )
         )
-        with Pool(num_proc, initargs=(RLock(),), initializer=tqdm.set_lock) as pool:
+        initargs, initializer = None, None
+        if utils.is_progress_bar_enabled():
+            initargs, initializer = (RLock(),), tqdm.set_lock
+        with Pool(num_proc, initargs=initargs, initializer=initializer) as pool:
             mapped = pool.map(_single_map_nested, split_kwds)
         logger.info("Finished {} processes".format(num_proc))
         mapped = [obj for proc_res in mapped for obj in proc_res]
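Note: `map_nested` applies a function to the leaves of a nested structure and, for large inputs, fans the top level out across processes using the same conditional lock setup as `Dataset.map` above. A simplified single-process model of the leaf-mapping behavior (not the real implementation, which also handles tuples, NumPy arrays, and progress reporting):

    def map_nested_simplified(function, data_struct, types=(list,)):
        # Recurse through dicts and the configured container types;
        # apply `function` only at the leaves.
        if isinstance(data_struct, dict):
            return {k: map_nested_simplified(function, v, types) for k, v in data_struct.items()}
        if isinstance(data_struct, types):
            return [map_nested_simplified(function, v, types) for v in data_struct]
        return function(data_struct)


    # map_nested_simplified(str.upper, {"a": ["x", "y"], "b": "z"})
    # -> {"a": ["X", "Y"], "b": "Z"}
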
diff --git a/src/datasets/utils/tqdm_utils.py b/src/datasets/utils/tqdm_utils.py
index 3231c4cfcc8..1c9b3c5c3a4 100644
--- a/src/datasets/utils/tqdm_utils.py
+++ b/src/datasets/utils/tqdm_utils.py
@@ -19,7 +19,7 @@
 
 import contextlib
 
-from tqdm import auto as tqdm_lib
+from tqdm import autonotebook as tqdm_lib
 
 
 class EmptyTqdm:
@@ -63,8 +63,13 @@ def async_tqdm(*args, **kwargs):
         return EmptyTqdm(*args, **kwargs)
 
 
+def is_progress_bar_enabled():
+    global _active
+    return bool(_active)
+
+
 def disable_progress_bar():
-    """Disabled Tqdm progress bar.
+    """Disable tqdm progress bar.
 
     Usage:
 
@@ -103,7 +108,7 @@ def _async_tqdm(*args, **kwargs):
 
 
 class _TqdmPbarAsync:
-    """Wrapper around Tqdm pbar which be shared between thread."""
+    """Wrapper around Tqdm pbar which can be shared between thread."""
 
     _tqdm_bars = []
 

From c0b2465e3739589103a5077633f923da8f69802f Mon Sep 17 00:00:00 2001
From: mariosasko
Date: Sat, 17 Jul 2021 19:11:01 +0200
Subject: [PATCH 2/4] Style

---
 src/datasets/io/json.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/datasets/io/json.py b/src/datasets/io/json.py
index ece231b5a2a..377ff1d9c44 100644
--- a/src/datasets/io/json.py
+++ b/src/datasets/io/json.py
@@ -1,5 +1,4 @@
 import os
-from logging import disable
 from typing import BinaryIO, Optional, Union
 
 from .. import Dataset, Features, NamedSplit, config, utils

From cf25ad0a81cffbd61a99a4b3d19b148c4c22327e Mon Sep 17 00:00:00 2001
From: mariosasko
Date: Sun, 18 Jul 2021 13:10:25 +0200
Subject: [PATCH 3/4] Fix windows test failures

---
 src/datasets/utils/py_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py
index 6041fe1c13a..1a7b8d3f889 100644
--- a/src/datasets/utils/py_utils.py
+++ b/src/datasets/utils/py_utils.py
@@ -195,7 +195,7 @@ def map_nested(
     if not isinstance(data_struct, dict) and not isinstance(data_struct, types):
         return function(data_struct)
 
-    disable_tqdm = bool(logging.get_verbosity() == logging.NOTSET)
+    disable_tqdm = bool(logger.getEffectiveLevel() > logging.INFO)
     iterable = list(data_struct.values()) if isinstance(data_struct, dict) else data_struct
 
     if num_proc is None:

From df35e94c11a10ff7ac86bff3e8273fd6665a4c49 Mon Sep 17 00:00:00 2001
From: mariosasko
Date: Sun, 18 Jul 2021 13:10:53 +0200
Subject: [PATCH 4/4] Replace tqdm autonotebook with auto in tqdm_utils

---
 src/datasets/utils/tqdm_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/utils/tqdm_utils.py b/src/datasets/utils/tqdm_utils.py
index 1c9b3c5c3a4..ff782515efb 100644
--- a/src/datasets/utils/tqdm_utils.py
+++ b/src/datasets/utils/tqdm_utils.py
@@ -19,7 +19,7 @@
 
 import contextlib
 
-from tqdm import autonotebook as tqdm_lib
+from tqdm import auto as tqdm_lib
 
 
 class EmptyTqdm:
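Taken together, the series leaves `datasets` with a single global switch for progress bars. Assuming the final state of `tqdm_utils` shown above, usage would look roughly like:

    import datasets

    datasets.utils.is_progress_bar_enabled()  # True by default

    # Every call site routed through utils.tqdm in this series now gets a
    # no-op EmptyTqdm instead of a real bar.
    datasets.utils.disable_progress_bar()
    datasets.utils.is_progress_bar_enabled()  # False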