24 | 24 | from nemo.collections.avlm import AVLMMockDataModule |
25 | 25 | from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer |
26 | 26 | from nemo.collections.llm.peft import LoRA |
27 | | -from nemo.collections.llm.recipes.log.default import tensorboard_logger |
| 27 | +from nemo.collections.llm.recipes.finetune_default import nemo_resume |
| 28 | +from nemo.collections.llm.recipes.log.default import default_resume, tensorboard_logger |
28 | 29 | from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing |
29 | 30 | from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed |
30 | 31 | from nemo.utils.exp_manager import TimingCallback |
@@ -69,7 +70,13 @@ def finetune_recipe( |
69 | 70 | num_nodes: int = 1, |
70 | 71 | num_gpus_per_node: int = 8, |
71 | 72 | peft_scheme: Optional[str] = 'none', |
72 | | - freeze_modules: Optional[dict] = None, |
| 73 | + freeze_modules: Optional[dict] = { |
| 74 | + "freeze_language_model": False, |
| 75 | + "freeze_vision_model": True, |
| 76 | + "freeze_audio_model": True, |
| 77 | + "freeze_vision_projection": False, |
| 78 | + "freeze_audio_projection": False, |
| 79 | + }, |
73 | 80 | ) -> run.Partial: |
74 | 81 | """ |
75 | 82 | Create a fine-tuning recipe for AVLM 8B model. |
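For illustration, a minimal sketch of how a caller might use the updated signature, overriding the new `freeze_modules` default; the import path, recipe name, and argument values are assumptions, not part of this change:

```python
import nemo_run as run

# Assumed location of this recipe module; adjust to the actual path in the repo.
from nemo.collections.avlm.recipes.avlm_8b import finetune_recipe

# Build the fine-tuning recipe, additionally unfreezing the vision encoder
# (the default introduced above keeps it frozen).
recipe: run.Partial = finetune_recipe(
    name="avlm_8b_finetune",
    num_nodes=1,
    num_gpus_per_node=8,
    peft_scheme="none",
    freeze_modules={
        "freeze_language_model": False,
        "freeze_vision_model": False,
        "freeze_audio_model": True,
        "freeze_vision_projection": False,
        "freeze_audio_projection": False,
    },
)
```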
@@ -102,6 +109,7 @@ def finetune_recipe( |
102 | 109 | pipeline_model_parallel_size=1, |
103 | 110 | encoder_pipeline_model_parallel_size=0, |
104 | 111 | pipeline_dtype=torch.bfloat16, |
| 112 | + ckpt_async_save=False, |
105 | 113 | ) |
106 | 114 |
107 | 115 | trainer = run.Config( |
@@ -143,10 +151,7 @@ def finetune_recipe( |
143 | 151 | ), |
144 | 152 | log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), |
145 | 153 | optim=distributed_fused_adam_with_cosine_annealing(max_lr=2.0e-05, min_lr=2.0e-07, warmup_steps=150), |
146 | | - resume=run.Config( |
147 | | - nl.AutoResume, |
148 | | - restore_config=run.Config(nl.RestoreConfig, path=checkpoint_path), |
149 | | - ), |
| 154 | + resume=nemo_resume(checkpoint_path), |
150 | 155 | ) |
151 | 156 |
152 | 157 | if peft_scheme is None or peft_scheme.lower() == 'none': |
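For context, `nemo_resume` stands in for the inline `AutoResume` config removed above; a rough equivalent sketch based on those removed lines (the helper's exact internals are an assumption):

```python
import nemo_run as run
from nemo import lightning as nl

def nemo_resume_sketch(checkpoint_path: str) -> run.Config:
    # Hypothetical stand-in mirroring the config that resume=nemo_resume(checkpoint_path)
    # replaces in this change: restore weights from an existing checkpoint path.
    return run.Config(
        nl.AutoResume,
        restore_config=run.Config(nl.RestoreConfig, path=checkpoint_path),
    )
```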
@@ -178,7 +183,13 @@ def pretrain_recipe( |
178 | 183 | num_nodes: int = 1, |
179 | 184 | num_gpus_per_node: int = 8, |
180 | 185 | language_model_from_pretrained: Optional[str] = None, |
181 | | - freeze_modules: Optional[dict] = None, |
| 186 | + freeze_modules: Optional[dict] = { |
| 187 | + "freeze_language_model": True, |
| 188 | + "freeze_vision_model": True, |
| 189 | + "freeze_audio_model": True, |
| 190 | + "freeze_vision_projection": False, |
| 191 | + "freeze_audio_projection": False, |
| 192 | + }, |
182 | 193 | ) -> run.Partial: |
183 | 194 | """ |
184 | 195 | Create a Pre-training recipe for AVLM 8B model. |
@@ -210,6 +221,7 @@ def pretrain_recipe( |
210 | 221 | pipeline_model_parallel_size=1, |
211 | 222 | encoder_pipeline_model_parallel_size=0, |
212 | 223 | pipeline_dtype=torch.bfloat16, |
| 224 | + ckpt_async_save=False, |
213 | 225 | ) |
214 | 226 |
215 | 227 | trainer = run.Config( |
@@ -252,6 +264,7 @@ def pretrain_recipe( |
252 | 264 | ), |
253 | 265 | log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), |
254 | 266 | optim=distributed_fused_adam_with_cosine_annealing(max_lr=0.001, min_lr=2.0e-05, warmup_steps=150), |
| 267 | + resume=default_resume(), |
255 | 268 | ) |
256 | 269 |
257 | 270 | return recipe |
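Finally, a minimal usage sketch for the pre-training recipe with its new defaults; the import path is assumed, and `default_resume()` is taken to provide the recipes' standard auto-resume behaviour so interrupted runs pick up from the experiment directory:

```python
import nemo_run as run

# Assumed location of this recipe module; adjust to the actual path in the repo.
from nemo.collections.avlm.recipes.avlm_8b import pretrain_recipe

# Stage-1 style pre-training: with the defaults above, only the vision and audio
# projections are trainable while the language, vision, and audio backbones stay frozen.
recipe: run.Partial = pretrain_recipe(
    name="avlm_8b_pretrain",
    num_nodes=1,
    num_gpus_per_node=8,
)

# The result is a nemo_run Partial, so individual fields can still be overridden
# before launch, e.g. re-enabling asynchronous checkpoint saving:
recipe.trainer.strategy.ckpt_async_save = True
```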