From 85d61a682ab8b7d3124b6177bb9041879f10f71e Mon Sep 17 00:00:00 2001
From: Huy Vu <86480512+huvunvidia@users.noreply.github.com>
Date: Sun, 3 Aug 2025 17:25:09 -0400
Subject: [PATCH 1/3] Update avlm_8b.py

Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com>
---
 nemo/collections/avlm/recipes/avlm_8b.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/avlm/recipes/avlm_8b.py b/nemo/collections/avlm/recipes/avlm_8b.py
index 737ebf3715e5..ac77aa426c6a 100644
--- a/nemo/collections/avlm/recipes/avlm_8b.py
+++ b/nemo/collections/avlm/recipes/avlm_8b.py
@@ -69,7 +69,13 @@ def finetune_recipe(
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
     peft_scheme: Optional[str] = 'none',
-    freeze_modules: Optional[dict] = None,
+    freeze_modules: Optional[dict] = {
+        "freeze_language_model": False,
+        "freeze_vision_model": True,
+        "freeze_audio_model": True,
+        "freeze_vision_projection": False,
+        "freeze_audio_projection": False,
+    },
 ) -> run.Partial:
     """
     Create a fine-tuning recipe for AVLM 8B model.
@@ -178,7 +184,13 @@ def pretrain_recipe(
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
     language_model_from_pretrained: Optional[str] = None,
-    freeze_modules: Optional[dict] = None,
+    freeze_modules: Optional[dict] = {
+        "freeze_language_model": True,
+        "freeze_vision_model": True,
+        "freeze_audio_model": True,
+        "freeze_vision_projection": False,
+        "freeze_audio_projection": False,
+    },
 ) -> run.Partial:
     """
     Create a Pre-training recipe for AVLM 8B model.

From 4dec107014716fcaafffb066f4b26f3a2e2129b2 Mon Sep 17 00:00:00 2001
From: Huy Vu <86480512+huvunvidia@users.noreply.github.com>
Date: Mon, 4 Aug 2025 09:40:25 -0400
Subject: [PATCH 2/3] Update avlm_8b.py

Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com>
---
 nemo/collections/avlm/recipes/avlm_8b.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/nemo/collections/avlm/recipes/avlm_8b.py b/nemo/collections/avlm/recipes/avlm_8b.py
index ac77aa426c6a..5b850bc16608 100644
--- a/nemo/collections/avlm/recipes/avlm_8b.py
+++ b/nemo/collections/avlm/recipes/avlm_8b.py
@@ -24,7 +24,8 @@
 from nemo.collections.avlm import AVLMMockDataModule
 from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
 from nemo.collections.llm.peft import LoRA
-from nemo.collections.llm.recipes.log.default import tensorboard_logger
+from nemo.collections.llm.recipes.finetune_default import nemo_resume
+from nemo.collections.llm.recipes.log.default import default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
 from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
 from nemo.utils.exp_manager import TimingCallback
@@ -149,10 +150,7 @@ def finetune_recipe(
         ),
         log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
         optim=distributed_fused_adam_with_cosine_annealing(max_lr=2.0e-05, min_lr=2.0e-07, warmup_steps=150),
-        resume=run.Config(
-            nl.AutoResume,
-            restore_config=run.Config(nl.RestoreConfig, path=checkpoint_path),
-        ),
+        resume=nemo_resume(checkpoint_path),
     )

     if peft_scheme is None or peft_scheme.lower() == 'none':
@@ -264,6 +262,7 @@ def pretrain_recipe(
         ),
         log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
         optim=distributed_fused_adam_with_cosine_annealing(max_lr=0.001, min_lr=2.0e-05, warmup_steps=150),
+        resume=default_resume(),
     )

     return recipe

From 7d016c9e23e745e63899ee257122d5a06d905cd6 Mon Sep 17 00:00:00 2001
From: Huy Vu <86480512+huvunvidia@users.noreply.github.com>
Date: Mon, 4 Aug 2025 11:33:41 -0400
Subject: [PATCH 3/3] Update avlm_8b.py

Signed-off-by: Huy Vu <86480512+huvunvidia@users.noreply.github.com>
---
 nemo/collections/avlm/recipes/avlm_8b.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nemo/collections/avlm/recipes/avlm_8b.py b/nemo/collections/avlm/recipes/avlm_8b.py
index 5b850bc16608..015ef02604ad 100644
--- a/nemo/collections/avlm/recipes/avlm_8b.py
+++ b/nemo/collections/avlm/recipes/avlm_8b.py
@@ -109,6 +109,7 @@ def finetune_recipe(
         pipeline_model_parallel_size=1,
         encoder_pipeline_model_parallel_size=0,
         pipeline_dtype=torch.bfloat16,
+        ckpt_async_save=False,
     )

     trainer = run.Config(
@@ -220,6 +221,7 @@ def pretrain_recipe(
         pipeline_model_parallel_size=1,
         encoder_pipeline_model_parallel_size=0,
         pipeline_dtype=torch.bfloat16,
+        ckpt_async_save=False,
     )

     trainer = run.Config(
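
Usage sketch (illustrative only, not part of the patch series): a minimal example of how the updated recipes might be invoked once the series is applied. It assumes nemo_run is importable as `run` and that the recipe module path matches the file modified above; parameter names are taken from the hunks, and anything not shown keeps its in-repo default.

    # Minimal sketch; assumes nemo_run is installed and the avlm_8b recipe module
    # path matches nemo/collections/avlm/recipes/avlm_8b.py as patched above.
    import nemo_run as run

    from nemo.collections.avlm.recipes import avlm_8b

    # Pretraining: the new defaults freeze the language, vision, and audio models
    # and train only the projections; checkpoints are saved synchronously
    # (ckpt_async_save=False).
    pretrain = avlm_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8)

    # Fine-tuning: the language model is unfrozen by default; freeze_modules can
    # still be overridden explicitly, as shown here.
    finetune = avlm_8b.finetune_recipe(
        peft_scheme='none',
        freeze_modules={
            "freeze_language_model": False,
            "freeze_vision_model": True,
            "freeze_audio_model": True,
            "freeze_vision_projection": False,
            "freeze_audio_projection": False,
        },
    )

    # Launch one of the configured recipes locally with nemo_run.
    run.run(pretrain, executor=run.LocalExecutor())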