@@ -267,6 +267,7 @@ def test_loss_single_rank(
267267 "config, model_type, ckpt_type, micro_batch_size, gradient_accumulation_steps, optim_in_bwd" ,
268268 [
269269 ("llama3/8B_full" , "llama3" , "tune" , 1 , 4 , False ),
270+ ("llama3/8B_full" , "llama3" , "tune" , 4 , 1 , True ),
270271 ],
271272 )
272273 @gpu_test (gpu_count = 2 )
@@ -306,9 +307,17 @@ def test_training_state_on_resume(
             checkpointer.model_type={model_type.upper()} \
             tokenizer.path='{tokenizer_path}' \
             tokenizer.prompt_template=null \
-            clip_grad_norm=100 \
         """.split()
 
+        # "optimizer_in_bwd=True" would free gradient info before clip_grad,
+        # causing a wrong grad_norm, so we only test one of the two at a time.
+        # Loss values should be the same either way.
+        if not optim_in_bwd:
+            cmd_1.append("clip_grad_norm=100")
+            cmd_1.append("optimizer_in_bwd=False")
+        else:
+            cmd_1.append("optimizer_in_bwd=True")
+
         model_config = MODEL_TEST_CONFIGS[model_type]
         cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config
@@ -337,12 +346,17 @@ def test_training_state_on_resume(
             tokenizer.path='{tokenizer_path}' \
             tokenizer.prompt_template=null \
             resume_from_checkpoint=True \
-            metric_logger.filename={log_file} \
-            clip_grad_norm=100 \
+            metric_logger.filename={log_file}
         """.split()
 
         cmd_2 = cmd_2 + self._get_test_config_overrides() + model_config
 
+        if not optim_in_bwd:
+            cmd_2.append("clip_grad_norm=100")
+            cmd_2.append("optimizer_in_bwd=False")
+        else:
+            cmd_2.append("optimizer_in_bwd=True")
+
         monkeypatch.setattr(sys, "argv", cmd_2)
         runpy.run_path(TUNE_PATH, run_name="__main__")