From 246a6be71eb5c6d8048e3dea00fa342261aa300b Mon Sep 17 00:00:00 2001
From: Santiago Castro
Date: Fri, 24 Dec 2021 18:40:42 +0200
Subject: [PATCH] Use tqdm.auto in Pipeline docs

It's better for e.g. notebooks.
---
 docs/source/main_classes/pipelines.mdx | 53 ++++++++++++++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/docs/source/main_classes/pipelines.mdx b/docs/source/main_classes/pipelines.mdx
index a955cdef7054..52c2b15ab87a 100644
--- a/docs/source/main_classes/pipelines.mdx
+++ b/docs/source/main_classes/pipelines.mdx
@@ -79,14 +79,14 @@ GPU. If it doesn't don't hesitate to create an issue.
 import datasets
 from transformers import pipeline
 from transformers.pipelines.base import KeyDataset
-import tqdm
+from tqdm.auto import tqdm
 
 pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
 dataset = datasets.load_dataset("superb", name="asr", split="test")
 
 # KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
 # as we're not interested in the *target* part of the dataset.
-for out in tqdm.tqdm(pipe(KeyDataset(dataset, "file"))):
+for out in tqdm(pipe(KeyDataset(dataset, "file"))):
     print(out)
     # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
     # {"text": ....}
@@ -101,10 +101,9 @@ All pipelines (except *zero-shot-classification* and *question-answering* curren
 whenever the pipeline uses its streaming ability (so when passing lists or `Dataset`).
 
 ```python
-from transformers import pipeline 
+from transformers import pipeline
 from transformers.pipelines.base import KeyDataset
 import datasets
-import tqdm
 
 dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
 pipe = pipeline("text-classification", device=0)
@@ -125,28 +124,28 @@ Example where it's most a speedup:
 
 ```python
-from transformers import pipeline 
-from torch.utils.data import Dataset 
-import tqdm
+from transformers import pipeline
+from torch.utils.data import Dataset
+from tqdm.auto import tqdm
 
-pipe = pipeline("text-classification", device=0) 
+pipe = pipeline("text-classification", device=0)
 
 
-class MyDataset(Dataset): 
-    def __len__(self): 
-        return 5000 
+class MyDataset(Dataset):
+    def __len__(self):
+        return 5000
 
-    def __getitem__(self, i): 
-        return "This is a test" 
+    def __getitem__(self, i):
+        return "This is a test"
 
 
-dataset = MyDataset() 
+dataset = MyDataset()
 
 for batch_size in [1, 8, 64, 256]:
-    print("-" * 30) 
-    print(f"Streaming batch_size={batch_size}") 
-    for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
+    print("-" * 30)
+    print(f"Streaming batch_size={batch_size}")
+    for out in tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
         pass
 ```
@@ -170,15 +169,15 @@ Streaming batch_size=256
 Example where it's most a slowdown:
 
 ```python
-class MyDataset(Dataset): 
-    def __len__(self): 
-        return 5000 
-
-    def __getitem__(self, i): 
-        if i % 64 == 0: 
-            n = 100 
-        else: 
-            n = 1 
+class MyDataset(Dataset):
+    def __len__(self):
+        return 5000
+
+    def __getitem__(self, i):
+        if i % 64 == 0:
+            n = 100
+        else:
+            n = 1
         return "This is a test" * n
 ```
@@ -202,7 +201,7 @@ Streaming batch_size=256
  0%|          | 0/1000 [00:00<?, ?it/s]
 Traceback (most recent call last):
-    for out in tqdm.tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
+    for out in tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
 ....
     q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
 RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)
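
For reference, a minimal runnable sketch of the pattern the docs adopt after this patch (assumes `transformers`, `torch`, and `tqdm` are installed; `device=0` presumes a GPU is available, and the default text-classification model is downloaded on first use):

```python
# Sketch of the post-patch pattern, not part of the patch itself.
# tqdm.auto renders a widget-based bar under Jupyter/Colab and falls
# back to the plain-text bar in a terminal.
from torch.utils.data import Dataset
from tqdm.auto import tqdm

from transformers import pipeline


class MyDataset(Dataset):
    """5000 identical short strings, as in the doc's speedup example."""

    def __len__(self):
        return 5000

    def __getitem__(self, i):
        return "This is a test"


pipe = pipeline("text-classification", device=0)  # device=-1 to stay on CPU
dataset = MyDataset()

# The pipeline streams results lazily, so pass total= for tqdm to size the bar.
for out in tqdm(pipe(dataset, batch_size=8), total=len(dataset)):
    pass  # each `out` is a dict like {"label": ..., "score": ...}
```

The only change of substance in the patch is the import: `tqdm.auto` picks the appropriate frontend automatically, whereas `import tqdm` plus `tqdm.tqdm(...)` always emits the plain-text bar, which renders poorly in notebooks.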