Skip to content

Commit 863fefe

Browse files
hoangtch-namitech and Amir Hussein
authored and committed
Fix decoding with ngpu-lm when training (NVIDIA-NeMo#13994) (NVIDIA-NeMo#13995)
* Fix decoding with ngpu-lm when training (NVIDIA-NeMo#13994) Signed-off-by: Hoang Tran <hoang.tch@namitech.io> * code_format Signed-off-by: Hoang Tran <hoang.tch@namitech.io> --------- Signed-off-by: Hoang Tran <hoang.tch@namitech.io> Signed-off-by: Amir Hussein <amhussein@nvidia.com>
1 parent 2bd5a4b commit 863fefe

File tree

4 files changed

+40
-4
lines changed

4 files changed

+40
-4
lines changed

nemo/collections/asr/parts/submodules/ctc_beam_decoding.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from nemo.collections.asr.parts.submodules.ngram_lm import DEFAULT_TOKEN_OFFSET
2727
from nemo.collections.asr.parts.submodules.wfst_decoder import RivaDecoderConfig, WfstNbestHypothesis
2828
from nemo.collections.asr.parts.utils import rnnt_utils
29+
from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs
2930
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
3031
from nemo.core.classes import Typing, typecheck
3132
from nemo.core.neural_types import HypothesisType, LengthsType, LogprobsType, NeuralType
@@ -878,7 +879,7 @@ def _k2_decoding(self, x: torch.Tensor, out_len: torch.Tensor) -> List['WfstNbes
878879
return self.k2_decoder.decode(x.to(device=self.device), out_len.to(device=self.device))
879880

880881

881-
class BeamBatchedCTCInfer(AbstractBeamCTCInfer):
882+
class BeamBatchedCTCInfer(AbstractBeamCTCInfer, WithOptionalCudaGraphs):
882883
"""
883884
A batched beam CTC decoder.
884885
@@ -945,6 +946,16 @@ def __init__(
945946
allow_cuda_graphs=allow_cuda_graphs,
946947
)
947948

949+
def disable_cuda_graphs(self):
950+
"""Disable CUDA graphs (e.g., for decoding in training)"""
951+
if isinstance(self.search_algorithm, WithOptionalCudaGraphs):
952+
self.search_algorithm.disable_cuda_graphs()
953+
954+
def maybe_enable_cuda_graphs(self):
955+
"""Enable CUDA graphs (if allowed)"""
956+
if isinstance(self.search_algorithm, WithOptionalCudaGraphs):
957+
self.search_algorithm.maybe_enable_cuda_graphs()
958+
948959
@typecheck()
949960
def forward(
950961
self,

nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from nemo.collections.asr.parts.submodules.ngram_lm import NGramGPULanguageModel
2424
from nemo.collections.asr.parts.utils import rnnt_utils
2525
from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodConfig, ConfidenceMethodMixin
26+
from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs
2627
from nemo.core.classes import Typing, typecheck
2728
from nemo.core.neural_types import HypothesisType, LengthsType, LogprobsType, NeuralType
2829
from nemo.core.utils.cuda_python_utils import (
@@ -389,7 +390,7 @@ def __call__(self, *args, **kwargs):
389390
return self.forward(*args, **kwargs)
390391

391392

392-
class GreedyBatchedCTCInfer(Typing, ConfidenceMethodMixin):
393+
class GreedyBatchedCTCInfer(Typing, ConfidenceMethodMixin, WithOptionalCudaGraphs):
393394
"""A vectorized greedy CTC decoder.
394395
395396
This is basically always faster than GreedyCTCInfer, and supports
@@ -500,6 +501,8 @@ def __init__(
500501
self.ngram_lm_alpha = ngram_lm_alpha
501502
self.state: CTCDecoderCudaGraphsState | None = None
502503
else:
504+
self.allow_cuda_graphs = False
505+
self.cuda_graphs_mode = None
503506
self.ngram_lm_batch = None
504507

505508
@typecheck()

nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
is_prefix,
4949
select_k_expansions,
5050
)
51+
from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs
5152
from nemo.core.classes import Typing, typecheck
5253
from nemo.core.neural_types import AcousticEncodedRepresentation, HypothesisType, LengthsType, NeuralType
5354
from nemo.utils import logging
@@ -1526,7 +1527,7 @@ def set_decoding_type(self, decoding_type: str):
15261527
self.token_offset = DEFAULT_TOKEN_OFFSET
15271528

15281529

1529-
class BeamBatchedRNNTInfer(Typing, ConfidenceMethodMixin):
1530+
class BeamBatchedRNNTInfer(Typing, ConfidenceMethodMixin, WithOptionalCudaGraphs):
15301531
@property
15311532
def input_types(self):
15321533
"""Returns definitions of module input ports."""
@@ -1636,6 +1637,16 @@ def __init__(
16361637
allow_cuda_graphs=allow_cuda_graphs,
16371638
)
16381639

1640+
def disable_cuda_graphs(self):
1641+
"""Disable CUDA graphs (e.g., for decoding in training)"""
1642+
if isinstance(self._decoding_computer, WithOptionalCudaGraphs):
1643+
self._decoding_computer.disable_cuda_graphs()
1644+
1645+
def maybe_enable_cuda_graphs(self):
1646+
"""Enable CUDA graphs (if allowed)"""
1647+
if isinstance(self._decoding_computer, WithOptionalCudaGraphs):
1648+
self._decoding_computer.maybe_enable_cuda_graphs()
1649+
16391650
@property
16401651
def output_types(self):
16411652
"""Returns definitions of module output ports."""

nemo/collections/asr/parts/submodules/tdt_beam_decoding.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodMixin
4040
from nemo.collections.asr.parts.utils.batched_beam_decoding_utils import BlankLMScoreMode, PruningMode
4141
from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis, NBestHypotheses, is_prefix
42+
from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs
4243
from nemo.core.classes import Typing, typecheck
4344
from nemo.core.neural_types import AcousticEncodedRepresentation, HypothesisType, LengthsType, NeuralType
4445
from nemo.utils import logging
@@ -829,7 +830,7 @@ def sort_nbest(self, hyps: List[Hypothesis]) -> List[Hypothesis]:
829830
return sorted(hyps, key=lambda x: x.score, reverse=True)
830831

831832

832-
class BeamBatchedTDTInfer(Typing, ConfidenceMethodMixin):
833+
class BeamBatchedTDTInfer(Typing, ConfidenceMethodMixin, WithOptionalCudaGraphs):
833834
@property
834835
def input_types(self):
835836
"""Returns definitions of module input ports."""
@@ -910,6 +911,16 @@ def __init__(
910911
else:
911912
raise Exception(f"Decoding strategy {search_type} nor implemented.")
912913

914+
def disable_cuda_graphs(self):
915+
"""Disable CUDA graphs (e.g., for decoding in training)"""
916+
if isinstance(self._decoding_computer, WithOptionalCudaGraphs):
917+
self._decoding_computer.disable_cuda_graphs()
918+
919+
def maybe_enable_cuda_graphs(self):
920+
"""Enable CUDA graphs (if allowed)"""
921+
if isinstance(self._decoding_computer, WithOptionalCudaGraphs):
922+
self._decoding_computer.maybe_enable_cuda_graphs()
923+
913924
@property
914925
def output_types(self):
915926
"""Returns definitions of module output ports."""

0 commit comments

Comments (0)