@@ -267,6 +267,7 @@ def test_loss_single_rank(
267267 "config, model_type, ckpt_type, micro_batch_size, gradient_accumulation_steps, optim_in_bwd" ,
268268 [
269269 ("llama3/8B_full" , "llama3" , "tune" , 1 , 4 , False ),
270+ ("llama3/8B_full" , "llama3" , "tune" , 4 , 1 , True ),
270271 ],
271272 )
272273 @gpu_test (gpu_count = 2 )
@@ -306,9 +307,17 @@ def test_training_state_on_resume(
             checkpointer.model_type={model_type.upper()} \
             tokenizer.path='{tokenizer_path}' \
             tokenizer.prompt_template=null \
-            clip_grad_norm=100 \
         """.split()
 
+        # "optimizer_in_bwd=True" would free gradient info before clip_grad,
+        # causing a wrong grad_norm, so we only test one of the two at a time.
+        # Loss values should be the same either way.
+        if not optim_in_bwd:
+            cmd_1.append("clip_grad_norm=100")
+            cmd_1.append("optimizer_in_bwd=False")
+        else:
+            cmd_1.append("optimizer_in_bwd=True")
+
         model_config = MODEL_TEST_CONFIGS[model_type]
         cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config
@@ -337,12 +346,17 @@ def test_training_state_on_resume(
             tokenizer.path='{tokenizer_path}' \
             tokenizer.prompt_template=null \
             resume_from_checkpoint=True \
-            metric_logger.filename={log_file} \
-            clip_grad_norm=100 \
+            metric_logger.filename={log_file}
         """.split()
 
         cmd_2 = cmd_2 + self._get_test_config_overrides() + model_config
 
+        if not optim_in_bwd:
+            cmd_2.append("clip_grad_norm=100")
+            cmd_2.append("optimizer_in_bwd=False")
+        else:
+            cmd_2.append("optimizer_in_bwd=True")
+
         monkeypatch.setattr(sys, "argv", cmd_2)
         runpy.run_path(TUNE_PATH, run_name="__main__")