Commit 70d0d10

cast SE weights and activations to fp32
1 parent 1b43a27

File tree

1 file changed: +7 -3 lines

  • nemo/collections/asr/parts/submodules

nemo/collections/asr/parts/submodules/jasper.py

Lines changed: 7 additions & 3 deletions
@@ -478,9 +478,11 @@ def forward_for_export(self, x, lengths):
         mask = self.make_pad_mask(lengths, max_audio_length=max_len, device=x.device)
         mask = ~mask  # 0 represents value, 1 represents pad

-        # Commented out the below cast in v2.5.0 to fix dtype errors when running examples/asr/transcribe_speech.py on ASR models that use jasper.py encoder.
-        # Observed minimal changes in model outputs from this change.
-        # x = x.float()
+        # Ensure SE runs in FP32: cast fc weights and activations to float32
+        if self.fc[0].weight.dtype != torch.float32:
+            self.fc.float()
+        if x.dtype != torch.float32:
+            x = x.float()

         x = x.masked_fill(mask, 0.0)  # mask padded values explicitly to 0
         y = self._se_pool_step(x, mask)  # [B, C, 1]
@@ -494,6 +496,8 @@ def forward_for_export(self, x, lengths):

         y = torch.sigmoid(y)
         y = x * y
+        # Cast back to original dtype for downstream consistency
+        y = y.to(dtype)
         return y, lengths

     def _se_pool_step(self, x, mask):
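For readers skimming the change, here is a minimal, self-contained sketch of the pattern this commit applies: run the SE (squeeze-and-excitation) gating in float32 even when the surrounding model runs in half precision, then cast the result back to the caller's dtype. The SEBlock below is a hypothetical stand-in written for illustration, not NeMo's SqueezeExcite; only the dtype handling mirrors the diff (the real method restores dtype via a variable captured earlier in forward_for_export).

import torch
import torch.nn as nn

class SEBlock(nn.Module):
    """Hypothetical SE block; only the fp32 casting pattern mirrors the commit."""

    def __init__(self, channels: int, reduction: int = 8):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(),
            nn.Linear(channels // reduction, channels),
        )

    def forward(self, x):  # x: [B, C, T]
        dtype = x.dtype  # remember the caller's dtype (e.g. torch.float16)
        # Cast weights and activations to fp32, as in the diff above.
        if self.fc[0].weight.dtype != torch.float32:
            self.fc.float()
        if x.dtype != torch.float32:
            x = x.float()
        y = x.mean(dim=-1, keepdim=True)                  # squeeze: [B, C, 1]
        y = self.fc(y.transpose(1, -1)).transpose(1, -1)  # excite: [B, C, 1]
        y = x * torch.sigmoid(y)                          # gate the activations
        # Cast back to the original dtype for downstream consistency.
        return y.to(dtype)

se = SEBlock(channels=16).half()           # fp16 parameters...
out = se(torch.randn(2, 16, 50).half())    # ...and fp16 input
print(out.dtype)                           # torch.float16: fp32 math inside, fp16 at the interface

Note the asymmetry: self.fc.float() converts the submodule's parameters in place, so later calls skip that cast, while the activations are cast to fp32 per call and restored on return.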
