nemo/collections/asr/parts/preprocessing/features.py (4 additions, 1 deletion)

@@ -389,6 +389,7 @@ def stft(self, x):
            center=False if self.exact_pad else True,
            window=self.window.to(dtype=torch.float, device=x.device),
            return_complex=True,
+           pad_mode="constant",
        )

    def log_zero_guard_value_fn(self, x):

@@ -417,11 +418,12 @@ def filter_banks(self):
        return self.fb

    def forward(self, x, seq_len, linear_spec=False):
+       timemask = torch.arange(x.shape[1], device=x.device).unsqueeze(0) < seq_len.unsqueeze(1)
        seq_len = self.get_seq_len(seq_len)

        if self.stft_pad_amount is not None:
            x = torch.nn.functional.pad(
-               x.unsqueeze(1), (self.stft_pad_amount, self.stft_pad_amount), "reflect"
+               x.unsqueeze(1), (self.stft_pad_amount, self.stft_pad_amount), "constant"
            ).squeeze(1)

        # dither (only in training mode for eval determinism)

@@ -431,6 +433,7 @@ def forward(self, x, seq_len, linear_spec=False):
        # do preemphasis
        if self.preemph is not None:
            x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1)
+       x = x.masked_fill(~timemask, 0.0)

        # disable autocast to get full range of stft values
        with torch.amp.autocast(x.device.type, enabled=False):
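A minimal sketch of what these two changes target (not part of the PR; the n_fft/hop values and the stft_mag helper below are arbitrary choices for illustration): with pad_mode="reflect", the STFT frames at the end of a signal are computed against a reflected copy of the signal's own tail, so appending zero padding changes those frames; with pad_mode="constant", the edge frames see zeros either way. The new timemask / masked_fill keeps preemphasis (and dither, in training) from writing non-zero values into the region past seq_len.

# Illustrative sketch only; parameters are arbitrary, not the PR's settings.
import torch

torch.manual_seed(0)
n_fft, hop = 512, 160
x = torch.randn(1, 16000)                                # 1 s of "audio"
x_padded = torch.cat([x, torch.zeros(1, 16000)], dim=1)  # same audio + 1 s of zero padding


def stft_mag(sig, pad_mode):
    # center=True pads n_fft // 2 samples on each side using pad_mode before framing.
    return torch.stft(
        sig,
        n_fft=n_fft,
        hop_length=hop,
        window=torch.hann_window(n_fft),
        center=True,
        pad_mode=pad_mode,
        return_complex=True,
    ).abs()


n_valid = 16000 // hop + 1  # frames covering the un-padded signal

for mode in ("reflect", "constant"):
    s1 = stft_mag(x, mode)
    s2 = stft_mag(x_padded, mode)[..., :n_valid]
    print(mode, torch.allclose(s1, s2))
# Expected: "reflect" -> False (edge frames reflect the signal's tail, which the
# padded copy replaces with real zeros), "constant" -> True (edge frames see zeros
# in both cases). The masked_fill(~timemask, 0.0) line in the diff serves the same
# goal for the samples past seq_len: preemphasis alone would otherwise make the
# first padded sample equal to -preemph * x[:, seq_len - 1] instead of zero.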
tests/collections/asr/test_padding_and_batch_size_invariance.py (new file, 106 additions)

@@ -0,0 +1,106 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch.testing
from lhotse.testing.random import deterministic_rng

Check notice (Code scanning / CodeQL): Unused import. Import of 'deterministic_rng' is not used.

Copilot Autofix (AI, 5 months ago): To fix the issue, remove the unused import of deterministic_rng from the file, and drop the deterministic_rng parameter from the signatures of the test cases that include it without using it. This cleans up the code and eliminates an unnecessary dependency.

Suggested changeset 1: tests/collections/asr/test_padding_and_batch_size_invariance.py. Run the following command in your local git repository to apply this patch:
cat << 'EOF' | git apply
diff --git a/tests/collections/asr/test_padding_and_batch_size_invariance.py b/tests/collections/asr/test_padding_and_batch_size_invariance.py
--- a/tests/collections/asr/test_padding_and_batch_size_invariance.py
+++ b/tests/collections/asr/test_padding_and_batch_size_invariance.py
@@ -15,3 +15,3 @@
 import torch.testing
-from lhotse.testing.random import deterministic_rng
+
 
@@ -22,3 +22,3 @@
 @pytest.mark.parametrize("length", list(range(15950, 16050, 3)))
-def test_preprocessor_invariant_to_padding(deterministic_rng, length):
+def test_preprocessor_invariant_to_padding(length):
     # Settings corresponding to Canary-1B features
@@ -48,3 +48,3 @@
 @pytest.mark.parametrize("length", [16000])
-def test_canary_encoder_invariant_to_padding(deterministic_rng, length):
+def test_canary_encoder_invariant_to_padding(length):
     preprocessor = AudioToMelSpectrogramPreprocessor(
@@ -132,3 +132,3 @@
 
-def test_conformer_inference_invariant_to_batch_size(deterministic_rng):
+def test_conformer_inference_invariant_to_batch_size():
     model = ConformerEncoder(feat_in=128, n_layers=2, d_model=128, feat_out=128)
EOF

from nemo.collections.asr.models import ASRModel
from nemo.collections.asr.modules import ConformerEncoder
from nemo.collections.asr.parts.preprocessing import FilterbankFeatures


@pytest.mark.parametrize("length", list(range(15950, 16050)))
def test_preprocessor_invariant_to_padding(deterministic_rng, length):
# Settings corresponding to Canary-1B features
f = FilterbankFeatures(n_window_size=400, nfilt=128, pad_to=0).eval()

# Test data:
# * a1: 1s "audio"
# * a2: 1s "audio" + 1s padding, keep length tensor unchanged
a1 = torch.arange(0, length).unsqueeze(0) / 16000
a1l = torch.tensor([length])

a2 = torch.cat([a1, torch.zeros(1, 16000)], dim=1)
a2l = a1l.clone()

mels1, mels1l = f(a1, a1l)
mels2, mels2l = f(a2, a2l)

torch.testing.assert_close(mels1, mels2[..., :mels1l])


@pytest.mark.skip(reason="Used only for debugging.")
@pytest.mark.parametrize("length", [16000])
def test_canary_invariant_to_padding(deterministic_rng, length):
model = ASRModel.from_pretrained("nvidia/canary-180m-flash").eval()
Collaborator comment: no pretrained :)


# Test data:
# * a1: 1s "audio"
# * a2: 1s "audio" + 1s padding, keep length tensor unchanged
a1 = torch.arange(0, length).unsqueeze(0) / 16000
a1l = torch.tensor([length])

a2 = torch.cat([a1, torch.zeros(1, 16000)], dim=1)
a2l = a1l.clone()

mels1, mels1l = model.preprocessor(input_signal=a1, length=a1l)
mels2, mels2l = model.preprocessor(input_signal=a2, length=a2l)

torch.testing.assert_close(mels1, mels2[..., :mels1l])

h1, h1l = model.encoder(audio_signal=mels1, length=mels1l)
h2, h2l = model.encoder(audio_signal=mels2, length=mels2l)

torch.testing.assert_close(h1, h2[..., :h1l])


@pytest.mark.xfail(reason="Fixme")
@pytest.mark.parametrize("length", [16000])
def test_conformer_inference_invariant_to_padding(deterministic_rng, length):
f = FilterbankFeatures(n_window_size=400, nfilt=128, pad_to=0).eval()
model = ConformerEncoder(feat_in=128, n_layers=2, d_model=128, feat_out=128, causal_downsampling=True)

# Test data:
# * a1: 1s "audio"
# * a2: 1s "audio" + 1s padding, keep length tensor unchanged
a1 = torch.arange(0, length).unsqueeze(0) / 16000
a1l = torch.tensor([length])

a2 = torch.cat([a1, torch.zeros(1, 16000)], dim=1)
a2l = a1l.clone()

mels1, mels1l = f(a1, a1l)
mels2, mels2l = f(a2, a2l)

torch.testing.assert_close(mels1, mels2[..., :mels1l])

h1, h1l = model(audio_signal=mels1, length=mels1l)
h2, h2l = model(audio_signal=mels2, length=mels2l)

torch.testing.assert_close(h1, h2[..., :h1l])


def test_conformer_inference_invariant_to_batch_size(deterministic_rng):
model = ConformerEncoder(feat_in=128, n_layers=2, d_model=128, feat_out=128)
model = model.eval()

audio_signal_bs1, length_bs1 = model.input_example()
h_bs1, h_length_bs1 = model(audio_signal=audio_signal_bs1, length=length_bs1)

audio_signal_bs2 = audio_signal_bs1.repeat(2, 1, 1)
length_bs2 = length_bs1.repeat(2)
h_bs2, h_length_bs2 = model(audio_signal=audio_signal_bs2, length=length_bs2)

torch.testing.assert_close(h_bs1, h_bs2[:1])
torch.testing.assert_close(h_bs1, h_bs2[1:])