Skip to content

Commit a765b68

Browse files
authored
Update no_trainer.py scripts to include accelerate gradient accumulation wrapper (#18473)
* Added accelerate gradient accumulation wrapper to run_image_classification_no_trainer.py example script * make fixup changes * PR comments * changed input to Accelerator based on PR comment, ran make fixup * Added comment explaining the sync_gradients statement * Fixed lr scheduler max steps * Changed run_clm_no_trainer.py script to use accelerate gradient accum wrapper * Fixed all scripts except wav2vec2 pretraining to use accelerate gradient accum wrapper * Added accelerate gradient accum wrapper for wav2vec2_pretraining_no_trainer.py script * make fixup and lr_scheduler step inserted back into run_qa_beam_search_no_trainer.py * removed changes to run_wav2vec2_pretraining_no_trainer.py script and fixed using wrong constant in qa_beam_search_no_trainer.py script
1 parent f1f5de3 commit a765b68

8 files changed

Lines changed: 173 additions & 104 deletions

File tree

examples/pytorch/image-classification/run_image_classification_no_trainer.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,14 @@ def main():
212212
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
213213
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
214214
# in the environment
215-
accelerator = (
216-
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
217-
)
215+
accelerator_log_kwargs = {}
216+
217+
if args.with_tracking:
218+
accelerator_log_kwargs["log_with"] = args.report_to
219+
accelerator_log_kwargs["logging_dir"] = args.output_dir
220+
221+
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
222+
218223
logger.info(accelerator.state)
219224
# Make one log on every process with the configuration for debugging.
220225
logging.basicConfig(
@@ -384,8 +389,8 @@ def collate_fn(examples):
384389
lr_scheduler = get_scheduler(
385390
name=args.lr_scheduler_type,
386391
optimizer=optimizer,
387-
num_warmup_steps=args.num_warmup_steps,
388-
num_training_steps=args.max_train_steps,
392+
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
393+
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
389394
)
390395

391396
# Prepare everything with our `accelerator`.
@@ -467,17 +472,20 @@ def collate_fn(examples):
467472
if resume_step is not None and step < resume_step:
468473
completed_steps += 1
469474
continue
470-
outputs = model(**batch)
471-
loss = outputs.loss
472-
# We keep track of the loss at each epoch
473-
if args.with_tracking:
474-
total_loss += loss.detach().float()
475-
loss = loss / args.gradient_accumulation_steps
476-
accelerator.backward(loss)
477-
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
475+
476+
with accelerator.accumulate(model):
477+
outputs = model(**batch)
478+
loss = outputs.loss
479+
# We keep track of the loss at each epoch
480+
if args.with_tracking:
481+
total_loss += loss.detach().float()
482+
accelerator.backward(loss)
478483
optimizer.step()
479484
lr_scheduler.step()
480485
optimizer.zero_grad()
486+
487+
# Checks if the accelerator has performed an optimization step behind the scenes
488+
if accelerator.sync_gradients:
481489
progress_bar.update(1)
482490
completed_steps += 1
483491

examples/pytorch/language-modeling/run_clm_no_trainer.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -249,9 +249,14 @@ def main():
249249
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
250250
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
251251
# in the environment
252-
accelerator = (
253-
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
254-
)
252+
accelerator_log_kwargs = {}
253+
254+
if args.with_tracking:
255+
accelerator_log_kwargs["log_with"] = args.report_to
256+
accelerator_log_kwargs["logging_dir"] = args.output_dir
257+
258+
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
259+
255260
# Make one log on every process with the configuration for debugging.
256261
logging.basicConfig(
257262
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -486,8 +491,8 @@ def group_texts(examples):
486491
lr_scheduler = get_scheduler(
487492
name=args.lr_scheduler_type,
488493
optimizer=optimizer,
489-
num_warmup_steps=args.num_warmup_steps,
490-
num_training_steps=args.max_train_steps,
494+
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
495+
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
491496
)
492497

493498
# Prepare everything with our `accelerator`.
@@ -567,17 +572,20 @@ def group_texts(examples):
567572
if resume_step is not None and step < resume_step:
568573
completed_steps += 1
569574
continue
570-
outputs = model(**batch)
571-
loss = outputs.loss
572-
# We keep track of the loss at each epoch
573-
if args.with_tracking:
574-
total_loss += loss.detach().float()
575-
loss = loss / args.gradient_accumulation_steps
576-
accelerator.backward(loss)
577-
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
575+
576+
with accelerator.accumulate(model):
577+
outputs = model(**batch)
578+
loss = outputs.loss
579+
# We keep track of the loss at each epoch
580+
if args.with_tracking:
581+
total_loss += loss.detach().float()
582+
accelerator.backward(loss)
578583
optimizer.step()
579584
lr_scheduler.step()
580585
optimizer.zero_grad()
586+
587+
# Checks if the accelerator has performed an optimization step behind the scenes
588+
if accelerator.sync_gradients:
581589
progress_bar.update(1)
582590
completed_steps += 1
583591

examples/pytorch/language-modeling/run_mlm_no_trainer.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -258,9 +258,14 @@ def main():
258258
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
259259
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
260260
# in the environment
261-
accelerator = (
262-
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
263-
)
261+
accelerator_log_kwargs = {}
262+
263+
if args.with_tracking:
264+
accelerator_log_kwargs["log_with"] = args.report_to
265+
accelerator_log_kwargs["logging_dir"] = args.output_dir
266+
267+
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
268+
264269
# Make one log on every process with the configuration for debugging.
265270
logging.basicConfig(
266271
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -530,8 +535,8 @@ def group_texts(examples):
530535
lr_scheduler = get_scheduler(
531536
name=args.lr_scheduler_type,
532537
optimizer=optimizer,
533-
num_warmup_steps=args.num_warmup_steps,
534-
num_training_steps=args.max_train_steps,
538+
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
539+
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
535540
)
536541

537542
# Prepare everything with our `accelerator`.
@@ -611,17 +616,20 @@ def group_texts(examples):
611616
if resume_step is not None and step < resume_step:
612617
completed_steps += 1
613618
continue
614-
outputs = model(**batch)
615-
loss = outputs.loss
616-
# We keep track of the loss at each epoch
617-
if args.with_tracking:
618-
total_loss += loss.detach().float()
619-
loss = loss / args.gradient_accumulation_steps
620-
accelerator.backward(loss)
621-
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
619+
620+
with accelerator.accumulate(model):
621+
outputs = model(**batch)
622+
loss = outputs.loss
623+
# We keep track of the loss at each epoch
624+
if args.with_tracking:
625+
total_loss += loss.detach().float()
626+
accelerator.backward(loss)
622627
optimizer.step()
623628
lr_scheduler.step()
624629
optimizer.zero_grad()
630+
631+
# Checks if the accelerator has performed an optimization step behind the scenes
632+
if accelerator.sync_gradients:
625633
progress_bar.update(1)
626634
completed_steps += 1
627635

examples/pytorch/multiple-choice/run_swag_no_trainer.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565

6666

6767
def parse_args():
68-
parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
68+
parser = argparse.ArgumentParser(description="Finetune a transformers model on a multiple choice task")
6969
parser.add_argument(
7070
"--dataset_name",
7171
type=str,
@@ -284,9 +284,14 @@ def main():
284284
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
285285
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
286286
# in the environment
287-
accelerator = (
288-
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
289-
)
287+
accelerator_log_kwargs = {}
288+
289+
if args.with_tracking:
290+
accelerator_log_kwargs["log_with"] = args.report_to
291+
accelerator_log_kwargs["logging_dir"] = args.output_dir
292+
293+
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
294+
290295
# Make one log on every process with the configuration for debugging.
291296
logging.basicConfig(
292297
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -483,8 +488,8 @@ def preprocess_function(examples):
483488
lr_scheduler = get_scheduler(
484489
name=args.lr_scheduler_type,
485490
optimizer=optimizer,
486-
num_warmup_steps=args.num_warmup_steps,
487-
num_training_steps=args.max_train_steps,
491+
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
492+
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
488493
)
489494

490495
# Prepare everything with our `accelerator`.
@@ -567,17 +572,20 @@ def preprocess_function(examples):
567572
if resume_step is not None and step < resume_step:
568573
completed_steps += 1
569574
continue
570-
outputs = model(**batch)
571-
loss = outputs.loss
572-
# We keep track of the loss at each epoch
573-
if args.with_tracking:
574-
total_loss += loss.detach().float()
575-
loss = loss / args.gradient_accumulation_steps
576-
accelerator.backward(loss)
577-
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
575+
576+
with accelerator.accumulate(model):
577+
outputs = model(**batch)
578+
loss = outputs.loss
579+
# We keep track of the loss at each epoch
580+
if args.with_tracking:
581+
total_loss += loss.detach().float()
582+
accelerator.backward(loss)
578583
optimizer.step()
579584
lr_scheduler.step()
580585
optimizer.zero_grad()
586+
587+
# Checks if the accelerator has performed an optimization step behind the scenes
588+
if accelerator.sync_gradients:
581589
progress_bar.update(1)
582590
completed_steps += 1
583591

examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,16 @@ def main():
297297
send_example_telemetry("run_qa_beam_search_no_trainer", args)
298298

299299
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
300-
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
301-
accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
300+
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers
301+
# in the environment
302+
accelerator_log_kwargs = {}
303+
304+
if args.with_tracking:
305+
accelerator_log_kwargs["log_with"] = args.report_to
306+
accelerator_log_kwargs["logging_dir"] = args.output_dir
307+
308+
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
309+
302310
# Make one log on every process with the configuration for debugging.
303311
logging.basicConfig(
304312
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -739,8 +747,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
739747
lr_scheduler = get_scheduler(
740748
name=args.lr_scheduler_type,
741749
optimizer=optimizer,
742-
num_warmup_steps=args.num_warmup_steps,
743-
num_training_steps=args.max_train_steps,
750+
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
751+
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
744752
)
745753

746754
# Prepare everything with our `accelerator`.
@@ -818,17 +826,22 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
818826
if resume_step is not None and step < resume_step:
819827
completed_steps += 1
820828
continue
821-
outputs = model(**batch)
822-
loss = outputs.loss
823-
# We keep track of the loss at each epoch
824-
if args.with_tracking:
825-
total_loss += loss.detach().float()
826-
loss = loss / args.gradient_accumulation_steps
827-
accelerator.backward(loss)
828-
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
829+
830+
with accelerator.accumulate(model):
831+
outputs = model(**batch)
832+
loss = outputs.loss
833+
# We keep track of the loss at each epoch
834+
if args.with_tracking:
835+
total_loss += loss.detach().float()
836+
837+
accelerator.backward(loss)
838+
829839
optimizer.step()
830840
lr_scheduler.step()
831841
optimizer.zero_grad()
842+
843+
# Checks if the accelerator has performed an optimization step behind the scenes
844+
if accelerator.sync_gradients:
832845
progress_bar.update(1)
833846
completed_steps += 1
834847

examples/pytorch/question-answering/run_qa_no_trainer.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -337,9 +337,14 @@ def main():
337337
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
338338
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
339339
# in the environment
340-
accelerator = (
341-
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
342-
)
340+
accelerator_log_kwargs = {}
341+
342+
if args.with_tracking:
343+
accelerator_log_kwargs["log_with"] = args.report_to
344+
accelerator_log_kwargs["logging_dir"] = args.output_dir
345+
346+
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
347+
343348
# Make one log on every process with the configuration for debugging.
344349
logging.basicConfig(
345350
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -757,8 +762,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
757762
lr_scheduler = get_scheduler(
758763
name=args.lr_scheduler_type,
759764
optimizer=optimizer,
760-
num_warmup_steps=args.num_warmup_steps,
761-
num_training_steps=args.max_train_steps,
765+
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
766+
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
762767
)
763768

764769
# Prepare everything with our `accelerator`.
@@ -839,17 +844,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
839844
if resume_step is not None and step < resume_step:
840845
completed_steps += 1
841846
continue
842-
outputs = model(**batch)
843-
loss = outputs.loss
844-
# We keep track of the loss at each epoch
845-
if args.with_tracking:
846-
total_loss += loss.detach().float()
847-
loss = loss / args.gradient_accumulation_steps
848-
accelerator.backward(loss)
849-
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
847+
848+
with accelerator.accumulate(model):
849+
outputs = model(**batch)
850+
loss = outputs.loss
851+
# We keep track of the loss at each epoch
852+
if args.with_tracking:
853+
total_loss += loss.detach().float()
854+
855+
accelerator.backward(loss)
850856
optimizer.step()
851857
lr_scheduler.step()
852858
optimizer.zero_grad()
859+
860+
# Checks if the accelerator has performed an optimization step behind the scenes
861+
if accelerator.sync_gradients:
853862
progress_bar.update(1)
854863
completed_steps += 1
855864

0 commit comments

Comments
 (0)