Merged
19 changes: 11 additions & 8 deletions src/datasets/arrow_dataset.py
@@ -69,18 +69,17 @@
list_table_cache_files,
)
from .tasks import TaskTemplate
- from .utils import map_nested
+ from .utils import logging, map_nested
from .utils.deprecation_utils import deprecated
from .utils.file_utils import estimate_dataset_size
from .utils.info_utils import is_small_dataset
- from .utils.logging import WARNING, get_logger, get_verbosity, set_verbosity_warning
from .utils.typing import PathLike


if TYPE_CHECKING:
from .dataset_dict import DatasetDict

- logger = get_logger(__name__)
+ logger = logging.get_logger(__name__)

if int(config.PYARROW_VERSION.split(".")[0]) == 0:
PYARROW_V0 = True
@@ -1763,11 +1762,9 @@ def _map_single(
not keep_in_memory or cache_file_name is None
), "Please use either `keep_in_memory` or `cache_file_name` but not both."

- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
-
# Reduce logging to keep things readable in multiprocessing with tqdm
- if rank is not None and get_verbosity() < WARNING:
- set_verbosity_warning()
+ if rank is not None and logging.get_verbosity() < logging.WARNING:
+ logging.set_verbosity_warning()
# Print at least one thing to fix tqdm in notebooks in multiprocessing
# see https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
if rank is not None and "notebook" in tqdm.__name__:
@@ -1934,7 +1931,13 @@ def init_buffer_and_writer():
pbar_iterable = input_dataset if not batched else range(0, len(input_dataset), batch_size)
pbar_unit = "ex" if not batched else "ba"
pbar_desc = (desc or "") + " #" + str(rank) if rank is not None else desc
- pbar = tqdm(pbar_iterable, disable=not_verbose, position=rank, unit=pbar_unit, desc=pbar_desc)
+ pbar = tqdm(
+ pbar_iterable,
+ disable=bool(logging.get_verbosity() == logging.NOTSET),
+ position=rank,
+ unit=pbar_unit,
+ desc=pbar_desc,
+ )
if not batched:
for i, example in enumerate(pbar):
example = apply_function_on_filtered_inputs(example, i, offset=offset)
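The recurring change in this PR replaces the old "logger.getEffectiveLevel() > WARNING" check with "logging.get_verbosity() == logging.NOTSET" when deciding whether to hide a tqdm bar. A self-contained sketch of the behavioural difference (not part of the diff; old_disable and new_disable are illustrative names, and the level constants are the stdlib ones that datasets.utils.logging also exposes):

    from logging import ERROR, NOTSET, WARNING

    def old_disable(effective_level: int) -> bool:
        # Pre-PR behaviour: hide progress bars whenever the logger sits above WARNING.
        return bool(effective_level > WARNING)

    def new_disable(verbosity: int) -> bool:
        # Post-PR behaviour: hide progress bars only when the verbosity is NOTSET.
        return bool(verbosity == NOTSET)

    assert old_disable(ERROR) is True   # bars used to vanish at ERROR verbosity
    assert new_disable(ERROR) is False  # they now stay visible at ERROR
    assert new_disable(NOTSET) is True  # and disappear only at NOTSET
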
10 changes: 5 additions & 5 deletions src/datasets/arrow_writer.py
@@ -27,11 +27,11 @@
from .features import Features, _ArrayXDExtensionType
from .info import DatasetInfo
from .keyhash import DuplicatedKeysError, KeyHasher
+ from .utils import logging
from .utils.file_utils import hash_url_to_filename
- from .utils.logging import WARNING, get_logger


- logger = get_logger(__name__)
+ logger = logging.get_logger(__name__)

type_ = type # keep python's type function

@@ -536,11 +536,11 @@ def finalize(self, metrics_query_result: dict):
def parquet_to_arrow(sources, destination):
"""Convert parquet files to arrow file. Inputs can be str paths or file-like objects"""
stream = None if isinstance(destination, str) else destination
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
+ disable = bool(logging.get_verbosity() == logging.NOTSET)
with ArrowWriter(path=destination, stream=stream) as writer:
- for source in tqdm(sources, unit="sources", disable=not_verbose):
+ for source in tqdm(sources, unit="sources", disable=disable):
pf = pa.parquet.ParquetFile(source)
- for i in tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=not_verbose):
+ for i in tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=disable):
df = pf.read_row_group(i).to_pandas()
for col in df.columns:
df[col] = df[col].apply(json.loads)
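For context, a hedged usage sketch of parquet_to_arrow as defined in this file (the shard paths are hypothetical; per the body shown above, every column is run through json.loads, so the parquet sources are expected to hold JSON-encoded values):

    from datasets.arrow_writer import parquet_to_arrow

    # Hypothetical shard names; sources may also be file-like objects per the docstring.
    sources = ["shard-00000.parquet", "shard-00001.parquet"]
    parquet_to_arrow(sources, "dataset.arrow")
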
16 changes: 10 additions & 6 deletions src/datasets/builder.py
@@ -39,14 +39,14 @@
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
from .naming import camelcase_to_snakecase, filename_prefix_for_split
from .splits import Split, SplitDict, SplitGenerator
+ from .utils import logging
from .utils.download_manager import DownloadManager, GenerateMode
from .utils.file_utils import DownloadConfig, is_remote_url
from .utils.filelock import FileLock
from .utils.info_utils import get_size_checksum_dict, verify_checksums, verify_splits
- from .utils.logging import WARNING, get_logger


- logger = get_logger(__name__)
+ logger = logging.get_logger(__name__)


class InvalidConfigName(ValueError):
@@ -990,7 +990,6 @@ def _prepare_split(self, split_generator):
fpath = os.path.join(self._cache_dir, fname)

generator = self._generate_examples(**split_generator.gen_kwargs)
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)

with ArrowWriter(
features=self.info.features,
@@ -1001,7 +1000,11 @@
) as writer:
try:
for key, record in utils.tqdm(
generator, unit=" examples", total=split_info.num_examples, leave=False, disable=not_verbose
generator,
unit=" examples",
total=split_info.num_examples,
leave=False,
disable=bool(logging.get_verbosity() == logging.NOTSET),
):
example = self.info.features.encode_example(record)
writer.write(example, key)
@@ -1053,9 +1056,10 @@ def _prepare_split(self, split_generator):
fpath = os.path.join(self._cache_dir, fname)

generator = self._generate_tables(**split_generator.gen_kwargs)
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
with ArrowWriter(features=self.info.features, path=fpath) as writer:
- for key, table in utils.tqdm(generator, unit=" tables", leave=False, disable=not_verbose):
+ for key, table in utils.tqdm(
+ generator, unit=" tables", leave=False, disable=bool(logging.get_verbosity() == logging.NOTSET)
+ ):
writer.write_table(table)
num_examples, num_bytes = writer.finalize()

10 changes: 4 additions & 6 deletions src/datasets/search.py
@@ -7,7 +7,7 @@
import numpy as np
from tqdm.auto import tqdm

- from .utils.logging import WARNING, get_logger
+ from .utils import logging


if TYPE_CHECKING:
@@ -28,7 +28,7 @@
_has_faiss = importlib.util.find_spec("faiss") is not None


- logger = get_logger(__name__)
+ logger = logging.get_logger(__name__)


class MissingIndex(Exception):
@@ -141,8 +141,7 @@ def add_documents(self, documents: Union[List[str], "Dataset"], column: Optional
index_config = self.es_index_config
self.es_client.indices.create(index=index_name, body=index_config)
number_of_docs = len(documents)
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
- progress = tqdm(unit="docs", total=number_of_docs, disable=not_verbose)
+ progress = tqdm(unit="docs", total=number_of_docs, disable=bool(logging.get_verbosity() == logging.NOTSET))
successes = 0

def passage_generator():
@@ -275,8 +274,7 @@ def add_vectors(

# Add vectors
logger.info("Adding {} vectors to the faiss index".format(len(vectors)))
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
- for i in tqdm(range(0, len(vectors), batch_size), disable=not_verbose):
+ for i in tqdm(range(0, len(vectors), batch_size), disable=bool(logging.get_verbosity() == logging.NOTSET)):
vecs = vectors[i : i + batch_size] if column is None else vectors[i : i + batch_size][column]
self.faiss_index.add(vecs)

7 changes: 3 additions & 4 deletions src/datasets/utils/file_utils.py
@@ -31,11 +31,11 @@
from tqdm.auto import tqdm

from .. import __version__, config
+ from . import logging
from .filelock import FileLock
- from .logging import WARNING, get_logger


- logger = get_logger(__name__) # pylint: disable=invalid-name
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name

INCOMPLETE_SUFFIX = ".incomplete"

@@ -492,14 +492,13 @@ def http_get(url, temp_file, proxies=None, resume_size=0, headers=None, cookies=
return
content_length = response.headers.get("Content-Length")
total = resume_size + int(content_length) if content_length is not None else None
- not_verbose = bool(logger.getEffectiveLevel() > WARNING)
progress = tqdm(
unit="B",
unit_scale=True,
total=total,
initial=resume_size,
desc="Downloading",
- disable=not_verbose,
+ disable=bool(logging.get_verbosity() == logging.NOTSET),
)
for chunk in response.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
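For context, the same gating expression recurs in builder.py, search.py and file_utils.py above. A minimal sketch of how new code in the library could follow the pattern (copy_rows and its arguments are hypothetical):

    from tqdm.auto import tqdm

    from datasets.utils import logging

    def copy_rows(rows):
        # Hypothetical helper following the pattern from this PR: the bar is
        # shown unless the datasets verbosity is NOTSET.
        disable = bool(logging.get_verbosity() == logging.NOTSET)
        return [row for row in tqdm(rows, unit="rows", disable=disable)]

    copied = copy_rows(range(5))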