datasets/ncslgr/ncslgr.py (4 changes: 1 addition & 3 deletions)

```diff
@@ -19,8 +19,6 @@
 import re
 from dataclasses import dataclass
 
-from tqdm import tqdm
-
 import datasets
@@ -128,7 +126,7 @@ def get_tier_values(name: str):
     def _generate_examples(self, eaf_path: str, videos_path: str):
         """Yields examples."""
 
-        for i, eaf_file in enumerate(tqdm(os.listdir(eaf_path))):
+        for i, eaf_file in enumerate(os.listdir(eaf_path)):
             eaf_file_path = os.path.join(eaf_path, eaf_file)
             videos = []
             with open(eaf_file_path, "r", encoding="utf-8") as f:
```
src/datasets/__init__.py (1 change: 0 additions & 1 deletion)

```diff
@@ -74,7 +74,6 @@
     percent,
 )
 from .utils import *
-from .utils.tqdm_utils import disable_progress_bar
 
 
 SCRIPTS_VERSION = "master" if __version__.split(".")[-1].startswith("dev") else __version__
```
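The explicit `tqdm_utils` import becomes redundant because the toggle now travels through the `from .utils import *` re-export. A minimal usage sketch, assuming the wildcard import keeps both new helpers visible under `datasets.utils`:

```python
# Minimal sketch, assuming datasets.utils re-exports the new helpers.
import datasets

print(datasets.utils.is_progress_bar_enabled())  # True by default
datasets.utils.disable_progress_bar()            # silence bars library-wide
print(datasets.utils.is_progress_bar_enabled())  # False
```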
src/datasets/arrow_dataset.py (11 changes: 7 additions & 4 deletions)

```diff
@@ -41,7 +41,7 @@
 
 from datasets.tasks.text_classification import TextClassification
 
-from . import config
+from . import config, utils
 from .arrow_reader import ArrowReader
 from .arrow_writer import ArrowWriter, OptimizedTypedSequence
 from .features import ClassLabel, Features, Value, cast_to_python_objects
@@ -1701,7 +1701,10 @@ def format_cache_file_name(cache_file_name, rank):
            ):
                logger.warning("Setting TOKENIZERS_PARALLELISM=false for forked processes.")
                os.environ["TOKENIZERS_PARALLELISM"] = "false"
-            with Pool(num_proc, initargs=(RLock(),), initializer=tqdm.set_lock) as pool:
+            initargs, initializer = None, None
+            if utils.is_progress_bar_enabled():
+                initargs, initializer = (RLock(),), tqdm.set_lock
+            with Pool(num_proc, initargs=initargs, initializer=initializer) as pool:
                os.environ = prev_env
                shards = [
                    self.shard(num_shards=num_proc, index=rank, contiguous=True, keep_in_memory=keep_in_memory)
@@ -1812,7 +1815,7 @@ def _map_single(
            logging.set_verbosity_warning()
        # Print at least one thing to fix tqdm in notebooks in multiprocessing
        # see https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
-        if rank is not None and "notebook" in tqdm.__name__:
+        if rank is not None and utils.is_progress_bar_enabled() and "notebook" in tqdm.__name__:
            print(" ", end="", flush=True)
 
        # Select the columns (arrow columns) to process
@@ -1976,7 +1979,7 @@ def init_buffer_and_writer():
        pbar_iterable = input_dataset if not batched else range(0, len(input_dataset), batch_size)
        pbar_unit = "ex" if not batched else "ba"
        pbar_desc = (desc or "") + " #" + str(rank) if rank is not None else desc
-        pbar = tqdm(
+        pbar = utils.tqdm(
            pbar_iterable,
            disable=bool(logging.get_verbosity() == logging.NOTSET),
            position=rank,
```
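The conditional initializer is the interesting part: tqdm coordinates multi-process terminal output through a shared lock, and when progress bars are disabled there is nothing to coordinate, so the workers skip touching tqdm state entirely. A standalone sketch of the pattern, with `progress_enabled` standing in for `utils.is_progress_bar_enabled()`:

```python
# Standalone sketch of the conditional Pool initializer used above.
from multiprocessing import Pool, RLock

from tqdm.auto import tqdm

def square(x):
    return x * x

if __name__ == "__main__":
    progress_enabled = True  # stand-in for utils.is_progress_bar_enabled()
    initargs, initializer = None, None
    if progress_enabled:
        # tqdm.set_lock installs a shared lock in each worker so that
        # concurrent bars do not interleave their terminal writes.
        initargs, initializer = (RLock(),), tqdm.set_lock
    with Pool(2, initargs=initargs, initializer=initializer) as pool:
        print(pool.map(square, range(8)))
```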
src/datasets/arrow_writer.py (7 changes: 3 additions & 4 deletions)

```diff
@@ -21,9 +21,8 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import pyarrow as pa
-from tqdm.auto import tqdm
 
-from . import config
+from . import config, utils
 from .features import Features, _ArrayXDExtensionType
 from .info import DatasetInfo
 from .keyhash import DuplicatedKeysError, KeyHasher
@@ -538,9 +537,9 @@ def parquet_to_arrow(sources, destination):
     stream = None if isinstance(destination, str) else destination
     disable = bool(logging.get_verbosity() == logging.NOTSET)
     with ArrowWriter(path=destination, stream=stream) as writer:
-        for source in tqdm(sources, unit="sources", disable=disable):
+        for source in utils.tqdm(sources, unit="sources", disable=disable):
             pf = pa.parquet.ParquetFile(source)
-            for i in tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=disable):
+            for i in utils.tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=disable):
                 df = pf.read_row_group(i).to_pandas()
                 for col in df.columns:
                     df[col] = df[col].apply(json.loads)
```
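`parquet_to_arrow` nests one bar over the source files and an inner bar over each file's row groups; `leave=False` clears each finished inner bar rather than stacking them. A rough sketch of that nesting, assuming `pyarrow` is installed and `sources` holds Parquet file paths:

```python
# Rough sketch of the nested-bar iteration in parquet_to_arrow.
import pyarrow.parquet as pq
from tqdm.auto import tqdm

def iter_row_groups(sources):
    for source in tqdm(sources, unit="sources"):
        pf = pq.ParquetFile(source)
        # leave=False removes each finished inner bar from the display.
        for i in tqdm(range(pf.num_row_groups), unit="row_groups", leave=False):
            yield pf.read_row_group(i)  # one pyarrow.Table per row group
```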
src/datasets/io/json.py (8 changes: 5 additions & 3 deletions)

```diff
@@ -1,10 +1,10 @@
 import os
 from typing import BinaryIO, Optional, Union
 
-from .. import Dataset, Features, NamedSplit, config
+from .. import Dataset, Features, NamedSplit, config, utils
 from ..formatting import query_table
 from ..packaged_modules.json.json import Json
-from ..utils.tqdm_utils import tqdm
+from ..utils import logging
 from ..utils.typing import NestedDataStructureLike, PathLike
 from .abc import AbstractDatasetReader
 
@@ -96,7 +96,9 @@ def _write(
         written = 0
         _ = to_json_kwargs.pop("path_or_buf", None)
 
-        for offset in tqdm(range(0, len(self.dataset), batch_size)):
+        for offset in utils.tqdm(
+            range(0, len(self.dataset), batch_size), unit="ba", disable=bool(logging.get_verbosity() == logging.NOTSET)
+        ):
             batch = query_table(
                 table=self.dataset.data,
                 key=slice(offset, offset + batch_size),
```
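The writer walks the dataset in fixed-size offset slices so only one batch is materialized at a time, and the bar (now gated on verbosity and labeled with a `ba` unit) ticks once per slice. A simplified sketch of the offset-slicing idea:

```python
# Simplified sketch of batched offset iteration, as in _write above.
from tqdm.auto import tqdm

def iter_batches(rows, batch_size):
    for offset in tqdm(range(0, len(rows), batch_size), unit="ba"):
        yield rows[offset : offset + batch_size]

for batch in iter_batches(list(range(10)), batch_size=4):
    print(batch)  # [0, 1, 2, 3] then [4, 5, 6, 7] then [8, 9]
```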
src/datasets/search.py (10 changes: 7 additions & 3 deletions)

```diff
@@ -5,8 +5,8 @@
 from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Union
 
 import numpy as np
-from tqdm.auto import tqdm
 
+from . import utils
 from .utils import logging
 
 
@@ -141,7 +141,9 @@ def add_documents(self, documents: Union[List[str], "Dataset"], column: Optional
         index_config = self.es_index_config
         self.es_client.indices.create(index=index_name, body=index_config)
         number_of_docs = len(documents)
-        progress = tqdm(unit="docs", total=number_of_docs, disable=bool(logging.get_verbosity() == logging.NOTSET))
+        progress = utils.tqdm(
+            unit="docs", total=number_of_docs, disable=bool(logging.get_verbosity() == logging.NOTSET)
+        )
         successes = 0
 
         def passage_generator():
@@ -287,7 +289,9 @@ def add_vectors(
 
         # Add vectors
         logger.info("Adding {} vectors to the faiss index".format(len(vectors)))
-        for i in tqdm(range(0, len(vectors), batch_size), disable=bool(logging.get_verbosity() == logging.NOTSET)):
+        for i in utils.tqdm(
+            range(0, len(vectors), batch_size), disable=bool(logging.get_verbosity() == logging.NOTSET)
+        ):
             vecs = vectors[i : i + batch_size] if column is None else vectors[i : i + batch_size][column]
             self.faiss_index.add(vecs)
 
```
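`add_vectors` feeds FAISS in chunks, which bounds peak memory and gives the bar one tick per batch. A self-contained sketch of that pattern, assuming `faiss-cpu` is installed and using a flat L2 index (the library supports other index types):

```python
# Self-contained sketch of batched FAISS indexing; assumes faiss-cpu.
import faiss
import numpy as np

vectors = np.random.rand(10_000, 128).astype("float32")
index = faiss.IndexFlatL2(128)  # exact L2 search over 128-dim vectors
batch_size = 1_000
for i in range(0, len(vectors), batch_size):
    index.add(vectors[i : i + batch_size])  # one progress tick per chunk
print(index.ntotal)  # 10000
```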
src/datasets/utils/__init__.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -35,5 +35,5 @@
     zip_dict,
     zip_nested,
 )
-from .tqdm_utils import async_tqdm, tqdm
+from .tqdm_utils import async_tqdm, disable_progress_bar, is_progress_bar_enabled, tqdm
 from .version import Version
```
src/datasets/utils/file_utils.py (5 changes: 2 additions & 3 deletions)

```diff
@@ -24,9 +24,8 @@
 import numpy as np
 import posixpath
 import requests
-from tqdm.auto import tqdm
 
-from .. import __version__, config
+from .. import __version__, config, utils
 from . import logging
 from .extract import ExtractManager
 from .filelock import FileLock
@@ -431,7 +430,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, headers=None, cookies=
        return
    content_length = response.headers.get("Content-Length")
    total = resume_size + int(content_length) if content_length is not None else None
-    progress = tqdm(
+    progress = utils.tqdm(
        unit="B",
        unit_scale=True,
        total=total,
```
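`http_get` derives the bar's total from the `Content-Length` header plus any already-downloaded bytes, then advances it by each chunk's size. A self-contained sketch of that download-progress pattern (not the library's exact helper):

```python
# Sketch of the download-progress pattern wrapped by http_get.
import requests
from tqdm.auto import tqdm

def download(url, path, resume_size=0, chunk_size=1024):
    response = requests.get(url, stream=True)
    content_length = response.headers.get("Content-Length")
    total = resume_size + int(content_length) if content_length is not None else None
    with tqdm(unit="B", unit_scale=True, total=total, initial=resume_size) as progress:
        with open(path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip keep-alive chunks
                    progress.update(len(chunk))
                    f.write(chunk)
```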
src/datasets/utils/py_utils.py (25 changes: 15 additions & 10 deletions)

```diff
@@ -33,9 +33,10 @@
 
 import dill
 import numpy as np
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
-from .logging import INFO, WARNING, get_logger, get_verbosity, set_verbosity_warning
+from .. import utils
+from . import logging
 
 
 try: # pragma: no branch
@@ -45,7 +46,7 @@
     _typing_extensions = Literal = Final = None
 
 
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
 
 
 # NOTE: When used on an instance method, the cache is shared across all
@@ -142,17 +143,17 @@ def _single_map_nested(args):
         return function(data_struct)
 
     # Reduce logging to keep things readable in multiprocessing with tqdm
-    if rank is not None and get_verbosity() < WARNING:
-        set_verbosity_warning()
+    if rank is not None and logging.get_verbosity() < logging.WARNING:
+        logging.set_verbosity_warning()
     # Print at least one thing to fix tqdm in notebooks in multiprocessing
     # see https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
-    if rank is not None and "notebook" in tqdm.__name__:
+    if rank is not None and utils.is_progress_bar_enabled() and "notebook" in tqdm.__name__:
         print(" ", end="", flush=True)
 
     # Loop over single examples or batches and write to buffer/file if examples are to be updated
     pbar_iterable = data_struct.items() if isinstance(data_struct, dict) else data_struct
     pbar_desc = "#" + str(rank) if rank is not None else None
-    pbar = tqdm(pbar_iterable, disable=disable_tqdm, position=rank, unit="obj", desc=pbar_desc)
+    pbar = utils.tqdm(pbar_iterable, disable=disable_tqdm, position=rank, unit="obj", desc=pbar_desc)
 
     if isinstance(data_struct, dict):
         return {k: _single_map_nested((function, v, types, None, True)) for k, v in pbar}
@@ -194,14 +195,15 @@ def map_nested(
     if not isinstance(data_struct, dict) and not isinstance(data_struct, types):
         return function(data_struct)
 
-    disable_tqdm = bool(logger.getEffectiveLevel() > INFO)
+    disable_tqdm = bool(logger.getEffectiveLevel() > logging.INFO)
     iterable = list(data_struct.values()) if isinstance(data_struct, dict) else data_struct
 
     if num_proc is None:
         num_proc = 1
     if num_proc <= 1 or len(iterable) <= num_proc:
         mapped = [
-            _single_map_nested((function, obj, types, None, True)) for obj in tqdm(iterable, disable=disable_tqdm)
+            _single_map_nested((function, obj, types, None, True))
+            for obj in utils.tqdm(iterable, disable=disable_tqdm)
         ]
     else:
         split_kwds = []  # We organize the splits ourselve (contiguous splits)
@@ -221,7 +223,10 @@ def map_nested(
                    num_proc, len(iterable), [len(i[1]) for i in split_kwds]
                )
            )
-        with Pool(num_proc, initargs=(RLock(),), initializer=tqdm.set_lock) as pool:
+        initargs, initializer = None, None
+        if utils.is_progress_bar_enabled():
+            initargs, initializer = (RLock(),), tqdm.set_lock
+        with Pool(num_proc, initargs=initargs, initializer=initializer) as pool:
            mapped = pool.map(_single_map_nested, split_kwds)
        logger.info("Finished {} processes".format(num_proc))
        mapped = [obj for proc_res in mapped for obj in proc_res]
```

Review comment from mariosasko (Collaborator, Author) on the `disable_tqdm = bool(logger.getEffectiveLevel() > logging.INFO)` line, Jul 18, 2021:

> Replacing this line with `disable_tqdm = bool(logging.get_verbosity() == logging.NOTSET)` (this check was introduced in #2534) causes 2 tests to fail on Windows, which is very strange.
>
> UPDATE: It took me some time to find the bug. This is the PR with the fix I've opened in the tqdm repo. I'll update the line in a separate PR if it gets merged.

Reply from a member:

> Good job discovering what caused this issue!
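For orientation, `map_nested` applies a function to every leaf of a nested structure and optionally fans the top level out across processes. A stripped-down, single-process sketch of the recursion (not the library's full implementation, which also handles tuples, numpy arrays, and progress bars):

```python
# Stripped-down sketch of the map_nested recursion over nested data.
def map_nested_simple(function, data_struct, types=(list, tuple)):
    if isinstance(data_struct, dict):
        return {k: map_nested_simple(function, v, types) for k, v in data_struct.items()}
    if isinstance(data_struct, types):
        return [map_nested_simple(function, v, types) for v in data_struct]
    return function(data_struct)  # leaf value: apply the function

print(map_nested_simple(lambda x: x + 1, {"a": [1, 2], "b": 3}))
# {'a': [2, 3], 'b': 4}
```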
src/datasets/utils/tqdm_utils.py (9 changes: 7 additions & 2 deletions)

```diff
@@ -63,8 +63,13 @@ def async_tqdm(*args, **kwargs):
     return EmptyTqdm(*args, **kwargs)
 
 
+def is_progress_bar_enabled():
+    global _active
+    return bool(_active)
+
+
 def disable_progress_bar():
-    """Disabled Tqdm progress bar.
+    """Disable tqdm progress bar.
 
     Usage:
 
@@ -103,7 +108,7 @@
 
 
 class _TqdmPbarAsync:
-    """Wrapper around Tqdm pbar which be shared between thread."""
+    """Wrapper around Tqdm pbar which can be shared between thread."""
 
     _tqdm_bars = []
 
```
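The new `is_progress_bar_enabled` just reads the module's `_active` flag; the broader pattern is a flag that decides whether callers get real tqdm or a no-op stand-in. A simplified sketch of that design (the real module also manages async bars and thread-shared state):

```python
# Simplified sketch of the flag-gated tqdm wrapper in tqdm_utils.
from tqdm import auto as tqdm_lib

_active = True

class EmptyTqdm:
    """No-op stand-in that accepts the tqdm API but displays nothing."""

    def __init__(self, *args, **kwargs):
        self._iterator = args[0] if args else None

    def __iter__(self):
        return iter(self._iterator)

    def __getattr__(self, name):
        # Swallow update(), set_description(), close(), etc.
        return lambda *args, **kwargs: None

def tqdm(*args, **kwargs):
    return tqdm_lib.tqdm(*args, **kwargs) if _active else EmptyTqdm(*args, **kwargs)

def is_progress_bar_enabled():
    return bool(_active)

def disable_progress_bar():
    global _active
    _active = False
```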