
Commit f35eda3

huvunvidia authored and guyueh1 committed
Update AVLM recipes for NeMo-CI runs (NVIDIA-NeMo#14397)
* Update avlm_8b.py (Signed-off-by: Huy Vu <[email protected]>)
* Update avlm_8b.py (Signed-off-by: Huy Vu <[email protected]>)
* Update avlm_8b.py (Signed-off-by: Huy Vu <[email protected]>)

---------

Signed-off-by: Huy Vu <[email protected]>
Signed-off-by: Guyue Huang <[email protected]>
1 parent aa17b7e commit f35eda3

File tree: 1 file changed (+20 −7 lines)


nemo/collections/avlm/recipes/avlm_8b.py

Lines changed: 20 additions & 7 deletions
@@ -24,7 +24,8 @@
 from nemo.collections.avlm import AVLMMockDataModule
 from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
 from nemo.collections.llm.peft import LoRA
-from nemo.collections.llm.recipes.log.default import tensorboard_logger
+from nemo.collections.llm.recipes.finetune_default import nemo_resume
+from nemo.collections.llm.recipes.log.default import default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
 from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
 from nemo.utils.exp_manager import TimingCallback
@@ -69,7 +70,13 @@ def finetune_recipe(
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
     peft_scheme: Optional[str] = 'none',
-    freeze_modules: Optional[dict] = None,
+    freeze_modules: Optional[dict] = {
+        "freeze_language_model": False,
+        "freeze_vision_model": True,
+        "freeze_audio_model": True,
+        "freeze_vision_projection": False,
+        "freeze_audio_projection": False,
+    },
 ) -> run.Partial:
     """
     Create a fine-tuning recipe for AVLM 8B model.
@@ -102,6 +109,7 @@ def finetune_recipe(
         pipeline_model_parallel_size=1,
         encoder_pipeline_model_parallel_size=0,
         pipeline_dtype=torch.bfloat16,
+        ckpt_async_save=False,
     )
 
     trainer = run.Config(
@@ -143,10 +151,7 @@
         ),
         log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
         optim=distributed_fused_adam_with_cosine_annealing(max_lr=2.0e-05, min_lr=2.0e-07, warmup_steps=150),
-        resume=run.Config(
-            nl.AutoResume,
-            restore_config=run.Config(nl.RestoreConfig, path=checkpoint_path),
-        ),
+        resume=nemo_resume(checkpoint_path),
     )
 
     if peft_scheme is None or peft_scheme.lower() == 'none':
@@ -178,7 +183,13 @@ def pretrain_recipe(
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
     language_model_from_pretrained: Optional[str] = None,
-    freeze_modules: Optional[dict] = None,
+    freeze_modules: Optional[dict] = {
+        "freeze_language_model": True,
+        "freeze_vision_model": True,
+        "freeze_audio_model": True,
+        "freeze_vision_projection": False,
+        "freeze_audio_projection": False,
+    },
 ) -> run.Partial:
     """
     Create a Pre-training recipe for AVLM 8B model.
@@ -210,6 +221,7 @@ def pretrain_recipe(
         pipeline_model_parallel_size=1,
         encoder_pipeline_model_parallel_size=0,
         pipeline_dtype=torch.bfloat16,
+        ckpt_async_save=False,
     )
 
     trainer = run.Config(
@@ -252,6 +264,7 @@ def pretrain_recipe(
         ),
         log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
         optim=distributed_fused_adam_with_cosine_annealing(max_lr=0.001, min_lr=2.0e-05, warmup_steps=150),
+        resume=default_resume(),
     )
 
     return recipe
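
For context, a minimal sketch of how the updated finetune_recipe could be invoked after this change. The freeze_modules defaults, peft_scheme, nemo_resume, and default_resume behavior come from the diff above; the executor choice, run name, and checkpoint path are illustrative assumptions, not part of this commit.

# Minimal sketch, assuming nemo_run is installed and the recipe module is
# importable from the file path shown above; names marked hypothetical are
# illustrative and not taken from this commit.
import nemo_run as run

from nemo.collections.avlm.recipes import avlm_8b

if __name__ == "__main__":
    # A plain call now picks up the new freeze_modules default:
    # vision/audio encoders frozen, projections and language model trainable.
    recipe = avlm_8b.finetune_recipe(
        checkpoint_path="/path/to/pretrained.nemo",  # hypothetical checkpoint path
        name="avlm_8b_finetune",                     # hypothetical run name
        num_nodes=1,
        num_gpus_per_node=8,
        peft_scheme="none",
    )

    # With this commit the recipe resumes via nemo_resume(checkpoint_path) and
    # saves checkpoints synchronously (ckpt_async_save=False).
    run.run(recipe, executor=run.LocalExecutor())

pretrain_recipe can be launched the same way; with this commit it defaults to default_resume() instead of restoring from a checkpoint, and it freezes the language model by default while leaving the projections trainable.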
