Re-revert the cp_size argument for masked_token_loss.

cspades · cspades · commit 2db0330835c4 · 2025-05-07T15:49:19.000-07:00
Signed-off-by: Cory Ye <cye@nvidia.com>
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/predict.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/predict.py
@@ -27,7 +27,8 @@
 from lightning.pytorch import LightningDataModule
 from megatron.core import parallel_state
 from megatron.core.tensor_parallel.mappings import _gather_along_last_dim
-from nemo.collections.llm.gpt.model.base import get_batch_on_this_context_parallel_rank, get_packed_seq_params
+from megatron.core.utils import get_batch_on_this_cp_rank
+from nemo.collections.llm.gpt.model.base import get_packed_seq_params
 from nemo.collections.llm.gpt.model.hyena import HYENA_MODEL_OPTIONS, HyenaModel
 from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
 from nemo.lightning import NeMoLogger
@@ -254,7 +255,7 @@ def hyena_predict_data_step(dataloader_iter) -> dict[str, torch.Tensor]:
             _batch_required_keys[key] = None
 
     # slice batch along sequence dimension for context parallelism
-    output = get_batch_on_this_context_parallel_rank(_batch_required_keys)
+    output = get_batch_on_this_cp_rank(_batch_required_keys)
 
     return output
 
diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/model/finetune_token_regressor.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/model/finetune_token_regressor.py
@@ -23,7 +23,6 @@
 from nemo.collections.llm.fn.mixin import FNMixin
 from nemo.collections.llm.peft.lora import LoRA, LoRALinear
 from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ParallelLinearAdapter
-from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
 from nemo.lightning.megatron_parallel import masked_token_loss
 from torch import Tensor, nn
 
@@ -36,6 +35,7 @@
     unreduced_token_loss_fn,
 )
 from bionemo.llm.utils import iomixin_utils as iom
+from bionemo.llm.utils.megatron_utils import average_losses_across_data_parallel_group
 
 
 # This package demonstrates how you can take a pretrained geneformer module and fine-tune the classifier
@@ -98,13 +98,19 @@ def forward(
 
         # TODO(@jstjohn) also handle different output keys, like the sequence loss.
 
+        # Compute loss over "valid" tokens in the microbatch, i.e. the non-masked tokens.
+        # The loss is not normalized, so you need to divide by the number of non-masked
+        # tokens (loss_mask.sum()) to compute the mean loss per token.
+        loss_for_microbatch, num_valid_tokens_in_microbatch = masked_token_loss(
+            unreduced_token_loss, batch["loss_mask"]
+        )
+
+        # Get the context parallel size for some normalizations and reductions.
         cp_size = parallel_state.get_context_parallel_world_size()
-        loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"], cp_size)
 
         # If we do not drop the last partial batch of validation, we need to do fancy reduction handling to support
         #  reducing the loss across the data parallel group.
         if self.validation_step and not self.val_drop_last:
-            num_valid_tokens_in_microbatch = batch["loss_mask"].sum()
             if loss_for_microbatch.isnan():
                 # TODO(@jomitchell): Add a unit test for this. This is the case where there are no valid tokens in the microbatch for the loss
                 #  to be computed over, so we expect a NaN loss (divide by zero for a mean) but we make this an expected and non-breaking case,
@@ -113,7 +119,7 @@ def forward(
                     raise ValueError("Got NaN loss with non-empty input")
                 loss_sum_for_microbatch = torch.zeros_like(num_valid_tokens_in_microbatch)
             else:
-                loss_sum_for_microbatch = num_valid_tokens_in_microbatch * loss_for_microbatch
+                loss_sum_for_microbatch = loss_for_microbatch
 
             # In this case we need to store the loss sum as well as the number of valid tokens in the microbatch.
             loss_sum_and_microbatch_size_all_gpu = torch.cat(
@@ -123,14 +129,20 @@ def forward(
                 ]
             )
             torch.distributed.all_reduce(
-                loss_sum_and_microbatch_size_all_gpu, group=parallel_state.get_data_parallel_group()
+                loss_sum_and_microbatch_size_all_gpu,
+                group=parallel_state.get_data_parallel_group(with_context_parallel=True),
             )
             return loss_for_microbatch * cp_size, {
                 "loss_sum_and_microbatch_size": loss_sum_and_microbatch_size_all_gpu
             }
+
         loss_for_microbatch = loss_for_microbatch + rmse_loss  # add in the RMSE loss after reducing the logit loss
+
         # average the losses across the data parallel group, but also return the unreduced loss
-        reduced_loss: Tensor = average_losses_across_data_parallel_group([loss_for_microbatch])
+        reduced_loss: Tensor = (
+            average_losses_across_data_parallel_group([loss_for_microbatch], with_context_parallel=True)
+            / num_valid_tokens_in_microbatch
+        )
         return loss_for_microbatch * cp_size, {"avg": reduced_loss}
 
 
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/model/loss.py b/sub-packages/bionemo-llm/src/bionemo/llm/model/loss.py
@@ -18,13 +18,14 @@
 import torch
 from megatron.core import parallel_state, tensor_parallel
 from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
-from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
 from nemo.lightning.megatron_parallel import (
     MegatronLossReduction,
     masked_token_loss,
 )
 from torch import Tensor
 
+from bionemo.llm.utils.megatron_utils import average_losses_across_data_parallel_group
+
 
 __all__: Sequence[str] = (
     "BERTMLMLossWithReduction",
@@ -179,12 +180,14 @@ def forward(
         # TODO(@jstjohn) also handle different output keys, like the sequence loss.
 
         # Compute loss over "valid" tokens in the microbatch, i.e. the non-masked tokens.
-        # The loss is not normalized, only potentially reduced via torch.distributed.ReduceOp.SUM
-        # across the context parallel process group, so you need to divide by the number
-        # of non-masked tokens (loss_mask.sum()) to compute the mean reduced loss per token.
+        # The loss is not normalized, so you need to divide by the number of non-masked
+        # tokens (loss_mask.sum()) to compute the mean loss per token.
+        loss_for_microbatch, num_valid_tokens_in_microbatch = masked_token_loss(
+            unreduced_token_loss, batch["loss_mask"]
+        )
+
+        # Get the context parallel size for some normalizations and reductions.
         cp_size = parallel_state.get_context_parallel_world_size()
-        loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"], cp_size=cp_size)
-        num_valid_tokens_in_microbatch = batch["loss_mask"].sum()
 
         # If we do not drop the last partial batch of validation, we need to do fancy reduction handling to support
         #  reducing the loss across the data parallel group.
@@ -197,7 +200,7 @@ def forward(
                     raise ValueError("Got NaN loss with non-empty input")
                 loss_sum_for_microbatch = torch.zeros_like(num_valid_tokens_in_microbatch)
             else:
-                # The reduced loss is already the sum of all losses from masked_token_loss().
+                # The loss is already the sum of all losses from masked_token_loss().
                 loss_sum_for_microbatch = loss_for_microbatch
 
             # In this case we need to store the loss sum as well as the number of valid tokens in the microbatch.
@@ -212,7 +215,7 @@ def forward(
             # for all data parallel / distributed microbatches.
             torch.distributed.all_reduce(
                 loss_sum_and_microbatch_size_all_gpu,
-                group=parallel_state.get_data_parallel_group(),
+                group=parallel_state.get_data_parallel_group(with_context_parallel=True),
                 op=torch.distributed.ReduceOp.SUM,
             )
 
@@ -227,7 +230,8 @@ def forward(
         # Normalize the loss by the number of "valid" tokens, because masked_token_loss
         # no longer does this normalization, and BioNeMo losses expect this normalization.
         reduced_loss = (
-            average_losses_across_data_parallel_group([loss_for_microbatch]) / num_valid_tokens_in_microbatch
+            average_losses_across_data_parallel_group([loss_for_microbatch], with_context_parallel=True)
+            / num_valid_tokens_in_microbatch
         )
         return loss_for_microbatch * cp_size, {"avg": reduced_loss}
 
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/utils/megatron_utils.py b/sub-packages/bionemo-llm/src/bionemo/llm/utils/megatron_utils.py
@@ -34,3 +34,18 @@ def is_only_data_parallel() -> bool:
     world_size: int = torch.distributed.get_world_size()
     dp_world_size: int = parallel_state.get_data_parallel_world_size()
     return world_size == dp_world_size
+
+
+def average_losses_across_data_parallel_group(losses, with_context_parallel: bool = False):
+    """Sum each loss over the DP (or DP + CP) group, then divide by the DP group size."""
+    averaged_losses = torch.cat([loss.clone().detach().view(1) for loss in losses])
+    # Reduce across the DP (or optionally, the flattened DP + CP) group.
+    # Refer to the ring attention algorithm for why we must always reduce across the CP group.
+    torch.distributed.all_reduce(
+        averaged_losses, group=parallel_state.get_data_parallel_group(with_context_parallel=with_context_parallel)
+    )
+    averaged_losses = averaged_losses / torch.distributed.get_world_size(
+        # Only average losses across the data parallel group, not the context parallel group!
+        group=parallel_state.get_data_parallel_group()
+    )
+    return averaged_losses