diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml index 8416847c9d12..2213a6e100f8 100644 --- a/.github/actions/test-template/action.yml +++ b/.github/actions/test-template/action.yml @@ -49,6 +49,10 @@ inputs: description: "Run tests on CPU only" required: false default: "false" + test_dir: + description: "Directory under tests/ containing the test scripts" + required: false + default: "functional_tests" runs: using: "composite" steps: @@ -147,7 +151,7 @@ runs: docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c '\ cp -r /opt/Megatron-LM/ /workspace/ && \ - bash tests/functional_tests/${{ inputs.script }}.sh && \ + bash tests/${{ inputs.test_dir }}/${{ inputs.script }}.sh && \ echo "Finished successfully." || echo "Did not finish."' ) 2>&1 | tee $DIR/err.log diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml index 948924a0c8ce..b23fcb72b020 100644 --- a/.github/workflows/cicd-main-speech.yml +++ b/.github/workflows/cicd-main-speech.yml @@ -218,3 +218,267 @@ jobs: image: ${{ inputs.image-name }} timeout: ${{ matrix.timeout || 10 }} is_optional: ${{ matrix.is-optional || false }} + + e2e-nightly: + if: ${{ github.event_name == 'schedule' }} + strategy: + fail-fast: false + matrix: + include: + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_de_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_es_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_it_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_ua_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: 
L2_Model_Support_nvidia__stt_pl_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_hr_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_be_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_fr_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_ru_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_nl_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_fa_fastconformer_hybrid_large + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_ka_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_kk_ru_fastconformer_hybrid_large + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_uz_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_ar_fastconformer_hybrid_large_pc_v1_0 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_hy_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_pt_fastconformer_hybrid_large_pc + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_es_fastconformer_hybrid_large_pc_nc + timeout: 15 + - 
runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_ar_fastconformer_hybrid_large_pcd_v1_0 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_large_streaming_multi + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_ctc_large + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_transducer_large + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_ctc_xlarge + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_transducer_xlarge + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_transducer_xxlarge + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_ctc_xxlarge + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__stt_en_fastconformer_tdt_large + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_stt_en_fastconformer_hybrid_large_streaming_1040ms + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_stt_multilingual_fastconformer_hybrid_large_pc_blend_eu + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_rnnt_1_1b + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_ctc_1_1b + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_rnnt_0_6b + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_ctc_0_6b + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_tdt_1_1b + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_tdt_ctc_1_1b + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_tdt_ctc_0_6b_ja + timeout: 15 + - runner: 
self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_tdt_ctc_110m + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_tdt_0_6b_v2 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_rnnt_110m_da_dk + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_tdt_0_6b_v3 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_ctc_0_6b_Vietnamese + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__canary_1b + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__canary_1b_flash + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__canary_180m_flash + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__canary_1b_v2 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__parakeet_realtime_eou_120m_v1 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__multitalker_parakeet_streaming_0_6b_v1 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__nemotron_speech_streaming_en_0_6b + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__canary_qwen_2_5b + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__diar_sortformer_4spk_v1 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__diar_streaming_sortformer_4spk_v2 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__diar_streaming_sortformer_4spk_v2_1 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_titanet_large + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__speakerverification_en_titanet_large + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__ssl_en_nest_large_v1_0 + timeout: 15 + - runner: self-hosted-azure + script: 
L2_Model_Support_nvidia__ssl_en_nest_xlarge_v1_0 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_vad_multilingual_marblenet + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_vad_multilingual_frame_marblenet + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__Frame_VAD_Multilingual_MarbleNet_v2_0 + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__se_den_sb_16k_small + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__se_der_sb_16k_small + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__sr_ssl_flowmatching_16k_430m + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_mel_codec_44khz_medium + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_mel_codec_22khz_fullband_medium + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__low_frame_rate_speech_codec_22khz + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__audio_codec_22khz + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__audio_codec_44khz + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__mel_codec_22khz + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__mel_codec_44khz + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__nemo_nano_codec_22khz_1_78kbps_12_5fps + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__nemo_nano_codec_22khz_1_89kbps_21_5fps + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__nemo_nano_codec_22khz_0_6kbps_12_5fps + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__tts_en_fastpitch + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_nvidia__tts_hifigan + timeout: 15 + - runner: self-hosted-azure + script: 
L2_Model_Support_nvidia__magpie_tts_multilingual_357m + timeout: 15 + - runner: self-hosted-azure + script: L2_Model_Support_tts_en_e2e_fastspeech2hifigan + timeout: 15 + needs: [unit-tests] + runs-on: ${{ matrix.runner }} + name: ${{ matrix.script }} + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + path: ${{ github.run_id }} + - name: main + uses: NVIDIA/NeMo/.github/actions/test-template@main + with: + runner: ${{ runner.name }} + script: ${{ matrix.script }} + tests_to_run: ${{ inputs.test_to_run }} + image: ${{ inputs.image-name }} + timeout: ${{ matrix.timeout || 10 }} + test_dir: e2e_nightly diff --git a/nemo/core/connectors/save_restore_connector.py b/nemo/core/connectors/save_restore_connector.py index 06666cc18ee3..2deaf0664d4c 100644 --- a/nemo/core/connectors/save_restore_connector.py +++ b/nemo/core/connectors/save_restore_connector.py @@ -433,7 +433,15 @@ def register_artifact(self, model, config_path: str, src: str, verify_src_exists ) return None - assert os.path.exists(return_path) + if not os.path.exists(return_path): + nemo_folder = app_state.nemo_file_folder + existing_files = os.listdir(nemo_folder) if nemo_folder and os.path.isdir(nemo_folder) else [] + raise FileNotFoundError( + f"Artifact not found at expected path: {return_path}\n" + f" src: {src}\n" + f" nemo_file_folder: {nemo_folder}\n" + f" Files in nemo_file_folder: {existing_files}" + ) artifact_item.path = os.path.abspath(src) model.artifacts[config_path] = artifact_item diff --git a/nemo/core/utils/cuda_python_utils.py b/nemo/core/utils/cuda_python_utils.py index 927d2d06eaa8..a35b2056bd09 100644 --- a/nemo/core/utils/cuda_python_utils.py +++ b/nemo/core/utils/cuda_python_utils.py @@ -13,7 +13,6 @@ # limitations under the License. 
import contextlib -import inspect import numpy as np import torch @@ -171,21 +170,16 @@ def with_conditional_node(while_loop_kernel, while_loop_args, while_loop_conditi # Use driver API here because of bug in cuda-python runtime API: https://github.com/NVIDIA/cuda-python/issues/55 # TODO: Change call to this after fix goes in (and we bump minimum cuda-python version to 12.4.0): # node, = cu_call(cudart.cudaGraphAddNode(graph, dependencies, len(dependencies), driver_params)) - # depending on cuda-python version, number of parameters vary - num_cuda_graph_add_node_params = len(inspect.signature(cuda.cuGraphAddNode).parameters) - if num_cuda_graph_add_node_params == 5: + # CUDA 13 (cuda-python >= 13.0.0) adds an edgeData parameter to cuGraphAddNode and + # cudaStreamUpdateCaptureDependencies; CUDA 12 does not accept it. + _cuda13 = Version(cuda_python_version) >= Version("13.0.0") + if _cuda13: (node,) = cu_call(cuda.cuGraphAddNode(graph, dependencies, None, len(dependencies), driver_params)) - elif num_cuda_graph_add_node_params == 4: - (node,) = cu_call(cuda.cuGraphAddNode(graph, dependencies, len(dependencies), driver_params)) else: - raise NeMoCUDAPythonException("Unexpected number of parameters for `cuGraphAddNode`") + (node,) = cu_call(cuda.cuGraphAddNode(graph, dependencies, len(dependencies), driver_params)) body_graph = driver_params.conditional.phGraph_out[0] - # depending on cuda-python version, number of parameters vary - num_cuda_stream_update_capture_dependencies_params = len( - inspect.signature(cudart.cudaStreamUpdateCaptureDependencies).parameters - ) - if num_cuda_stream_update_capture_dependencies_params == 5: + if _cuda13: cu_call( cudart.cudaStreamUpdateCaptureDependencies( torch.cuda.current_stream(device=device).cuda_stream, @@ -195,7 +189,7 @@ def with_conditional_node(while_loop_kernel, while_loop_args, while_loop_conditi cudart.cudaStreamUpdateCaptureDependenciesFlags.cudaStreamSetCaptureDependencies, ) ) - elif 
num_cuda_stream_update_capture_dependencies_params == 4: + else: cu_call( cudart.cudaStreamUpdateCaptureDependencies( torch.cuda.current_stream(device=device).cuda_stream, @@ -204,8 +198,6 @@ def with_conditional_node(while_loop_kernel, while_loop_args, while_loop_conditi cudart.cudaStreamUpdateCaptureDependenciesFlags.cudaStreamSetCaptureDependencies, ) ) - else: - raise NeMoCUDAPythonException("Unexpected number of parameters for `cudaStreamUpdateCaptureDependencies`") body_stream = torch.cuda.Stream(device) previous_stream = torch.cuda.current_stream(device=device) cu_call( diff --git a/pyproject.toml b/pyproject.toml index 32f4639c8cb5..4763f096e0c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,7 +137,8 @@ norecursedirs = [ "CVS", "dist", "venv", - "{arch}" + "{arch}", + "e2e_nightly" ] # markers to select tests, use `pytest --markers` to see all available markers, `pytest -m ""` to select tests markers = [ diff --git a/tests/e2e_nightly/L2_Model_Support_mel_codec_22khz_fullband_medium.sh b/tests/e2e_nightly/L2_Model_Support_mel_codec_22khz_fullband_medium.sh new file mode 100644 index 000000000000..38ad97107388 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_mel_codec_22khz_fullband_medium.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_mel_codec_22khz_fullband_medium.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_mel_codec_44khz_medium.sh b/tests/e2e_nightly/L2_Model_Support_mel_codec_44khz_medium.sh new file mode 100644 index 000000000000..a4d6ed9ecf82 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_mel_codec_44khz_medium.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_mel_codec_44khz_medium.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__Frame_VAD_Multilingual_MarbleNet_v2_0.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__Frame_VAD_Multilingual_MarbleNet_v2_0.sh new file mode 100644 index 000000000000..079bdd9aff1d --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__Frame_VAD_Multilingual_MarbleNet_v2_0.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__Frame_VAD_Multilingual_MarbleNet_v2_0.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__audio_codec_22khz.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__audio_codec_22khz.sh new file mode 100644 index 000000000000..6b6e16036369 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__audio_codec_22khz.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__audio_codec_22khz.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__audio_codec_44khz.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__audio_codec_44khz.sh new file mode 100644 index 000000000000..889ed73c8313 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__audio_codec_44khz.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__audio_codec_44khz.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__canary_180m_flash.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_180m_flash.sh new file mode 100644 index 000000000000..e7b00bb5cd78 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_180m_flash.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__canary_180m_flash.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__canary_1b.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_1b.sh new file mode 100644 index 000000000000..d1d0092b624a --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_1b.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__canary_1b.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__canary_1b_flash.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_1b_flash.sh new file mode 100644 index 000000000000..ad556bf2999b --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_1b_flash.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__canary_1b_flash.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__canary_1b_v2.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_1b_v2.sh new file mode 100644 index 000000000000..8f1ffc2b1ae6 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_1b_v2.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__canary_1b_v2.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__canary_qwen_2_5b.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_qwen_2_5b.sh new file mode 100644 index 000000000000..ddc3e376f825 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__canary_qwen_2_5b.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__canary_qwen_2_5b.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__diar_sortformer_4spk_v1.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__diar_sortformer_4spk_v1.sh new file mode 100644 index 000000000000..1d070ed8735c --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__diar_sortformer_4spk_v1.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__diar_sortformer_4spk_v1.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__diar_streaming_sortformer_4spk_v2.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__diar_streaming_sortformer_4spk_v2.sh new file mode 100644 index 000000000000..283b043d95fe --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__diar_streaming_sortformer_4spk_v2.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__diar_streaming_sortformer_4spk_v2.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__diar_streaming_sortformer_4spk_v2_1.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__diar_streaming_sortformer_4spk_v2_1.sh new file mode 100644 index 000000000000..3db733c176fc --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__diar_streaming_sortformer_4spk_v2_1.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__diar_streaming_sortformer_4spk_v2_1.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__low_frame_rate_speech_codec_22khz.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__low_frame_rate_speech_codec_22khz.sh new file mode 100644 index 000000000000..bde4f74b7592 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__low_frame_rate_speech_codec_22khz.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__low_frame_rate_speech_codec_22khz.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__magpie_tts_multilingual_357m.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__magpie_tts_multilingual_357m.sh new file mode 100644 index 000000000000..da7fbaff1597 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__magpie_tts_multilingual_357m.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__magpie_tts_multilingual_357m.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__mel_codec_22khz.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__mel_codec_22khz.sh new file mode 100644 index 000000000000..a08280883ca5 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__mel_codec_22khz.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__mel_codec_22khz.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__mel_codec_44khz.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__mel_codec_44khz.sh new file mode 100644 index 000000000000..7ab345c0273c --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__mel_codec_44khz.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__mel_codec_44khz.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__multitalker_parakeet_streaming_0_6b_v1.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__multitalker_parakeet_streaming_0_6b_v1.sh new file mode 100644 index 000000000000..c744c0730fd3 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__multitalker_parakeet_streaming_0_6b_v1.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__multitalker_parakeet_streaming_0_6b_v1.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__nemo_nano_codec_22khz_0_6kbps_12_5fps.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__nemo_nano_codec_22khz_0_6kbps_12_5fps.sh new file mode 100644 index 000000000000..f39e6e13c2a2 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__nemo_nano_codec_22khz_0_6kbps_12_5fps.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__nemo_nano_codec_22khz_0_6kbps_12_5fps.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__nemo_nano_codec_22khz_1_78kbps_12_5fps.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__nemo_nano_codec_22khz_1_78kbps_12_5fps.sh new file mode 100644 index 000000000000..4110bd95aa81 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__nemo_nano_codec_22khz_1_78kbps_12_5fps.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__nemo_nano_codec_22khz_1_78kbps_12_5fps.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__nemo_nano_codec_22khz_1_89kbps_21_5fps.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__nemo_nano_codec_22khz_1_89kbps_21_5fps.sh new file mode 100644 index 000000000000..2b73b3ad7145 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__nemo_nano_codec_22khz_1_89kbps_21_5fps.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__nemo_nano_codec_22khz_1_89kbps_21_5fps.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__nemotron_speech_streaming_en_0_6b.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__nemotron_speech_streaming_en_0_6b.sh new file mode 100644 index 000000000000..13a1b730f426 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__nemotron_speech_streaming_en_0_6b.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__nemotron_speech_streaming_en_0_6b.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_ctc_0_6b.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_ctc_0_6b.sh new file mode 100644 index 000000000000..841e6ebaf1b0 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_ctc_0_6b.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_ctc_0_6b.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_ctc_0_6b_Vietnamese.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_ctc_0_6b_Vietnamese.sh new file mode 100644 index 000000000000..f712d89d4436 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_ctc_0_6b_Vietnamese.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_ctc_0_6b_Vietnamese.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_ctc_1_1b.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_ctc_1_1b.sh new file mode 100644 index 000000000000..8e6ec9e9f7cd --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_ctc_1_1b.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_ctc_1_1b.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_realtime_eou_120m_v1.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_realtime_eou_120m_v1.sh new file mode 100644 index 000000000000..c63577e95c68 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_realtime_eou_120m_v1.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_realtime_eou_120m_v1.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_rnnt_0_6b.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_rnnt_0_6b.sh new file mode 100644 index 000000000000..e9c65f7e7589 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_rnnt_0_6b.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_0_6b.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_rnnt_110m_da_dk.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_rnnt_110m_da_dk.sh new file mode 100644 index 000000000000..9ddfa3acc3e6 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_rnnt_110m_da_dk.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_110m_da_dk.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_rnnt_1_1b.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_rnnt_1_1b.sh new file mode 100644 index 000000000000..55baffc3b4fe --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_rnnt_1_1b.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_1_1b.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_0_6b_v2.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_0_6b_v2.sh new file mode 100644 index 000000000000..94c8d68af40d --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_0_6b_v2.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_0_6b_v2.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_0_6b_v3.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_0_6b_v3.sh new file mode 100644 index 000000000000..e9347ca8fe1c --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_0_6b_v3.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_0_6b_v3.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_1_1b.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_1_1b.sh new file mode 100644 index 000000000000..de262a8f4b99 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_1_1b.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_1_1b.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_ctc_0_6b_ja.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_ctc_0_6b_ja.sh new file mode 100644 index 000000000000..935a11309fe0 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_ctc_0_6b_ja.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_0_6b_ja.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_ctc_110m.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_ctc_110m.sh new file mode 100644 index 000000000000..414ae3b03195 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_ctc_110m.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_110m.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_ctc_1_1b.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_ctc_1_1b.sh new file mode 100644 index 000000000000..5bb5cbc277d7 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__parakeet_tdt_ctc_1_1b.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_1_1b.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__se_den_sb_16k_small.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__se_den_sb_16k_small.sh new file mode 100644 index 000000000000..a79cbed42a4c --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__se_den_sb_16k_small.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__se_den_sb_16k_small.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__se_der_sb_16k_small.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__se_der_sb_16k_small.sh new file mode 100644 index 000000000000..cae181097b9d --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__se_der_sb_16k_small.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__se_der_sb_16k_small.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__speakerverification_en_titanet_large.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__speakerverification_en_titanet_large.sh new file mode 100644 index 000000000000..05149796eb81 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__speakerverification_en_titanet_large.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__speakerverification_en_titanet_large.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__sr_ssl_flowmatching_16k_430m.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__sr_ssl_flowmatching_16k_430m.sh new file mode 100644 index 000000000000..8c9c8c44b589 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__sr_ssl_flowmatching_16k_430m.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__sr_ssl_flowmatching_16k_430m.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__ssl_en_nest_large_v1_0.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__ssl_en_nest_large_v1_0.sh new file mode 100644 index 000000000000..9617cd04fa5a --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__ssl_en_nest_large_v1_0.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__ssl_en_nest_large_v1_0.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__ssl_en_nest_xlarge_v1_0.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__ssl_en_nest_xlarge_v1_0.sh new file mode 100644 index 000000000000..0b5189033a86 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__ssl_en_nest_xlarge_v1_0.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__ssl_en_nest_xlarge_v1_0.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ar_fastconformer_hybrid_large_pc_v1_0.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ar_fastconformer_hybrid_large_pc_v1_0.sh new file mode 100644 index 000000000000..349cab63836f --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ar_fastconformer_hybrid_large_pc_v1_0.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_ar_fastconformer_hybrid_large_pc_v1_0.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ar_fastconformer_hybrid_large_pcd_v1_0.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ar_fastconformer_hybrid_large_pcd_v1_0.sh new file mode 100644 index 000000000000..edb66819056e --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ar_fastconformer_hybrid_large_pcd_v1_0.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_ar_fastconformer_hybrid_large_pcd_v1_0.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_be_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_be_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..7e6255a23b48 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_be_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_be_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_de_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_de_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..2d5999f97c3d --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_de_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_de_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_ctc_large.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_ctc_large.sh new file mode 100644 index 000000000000..27b7d04793ed --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_ctc_large.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_large.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_ctc_xlarge.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_ctc_xlarge.sh new file mode 100644 index 000000000000..f0deb15d182a --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_ctc_xlarge.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_xlarge.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_ctc_xxlarge.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_ctc_xxlarge.sh new file mode 100644 index 000000000000..9dca362921df --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_ctc_xxlarge.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_xxlarge.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..de713407c5e8 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_large_streaming_multi.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_large_streaming_multi.sh new file mode 100644 index 000000000000..7af8d9fd8705 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_large_streaming_multi.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_large_streaming_multi.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms.sh new file mode 100644 index 000000000000..48b3063eff7a --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms_pc.sh new file mode 100644 index 000000000000..f73960105a2f --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_tdt_large.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_tdt_large.sh new file mode 100644 index 000000000000..cfba99f568b0 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_tdt_large.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_tdt_large.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_transducer_large.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_transducer_large.sh new file mode 100644 index 000000000000..a0a2d3320a43 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_transducer_large.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_large.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_transducer_xlarge.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_transducer_xlarge.sh new file mode 100644 index 000000000000..4a819e505272 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_transducer_xlarge.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_xlarge.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_transducer_xxlarge.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_transducer_xxlarge.sh new file mode 100644 index 000000000000..8566fbf6085a --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_en_fastconformer_transducer_xxlarge.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_xxlarge.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_es_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_es_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..38ca8be74b1e --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_es_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_es_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_es_fastconformer_hybrid_large_pc_nc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_es_fastconformer_hybrid_large_pc_nc.sh new file mode 100644 index 000000000000..f3e0f78236fb --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_es_fastconformer_hybrid_large_pc_nc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_es_fastconformer_hybrid_large_pc_nc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_fa_fastconformer_hybrid_large.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_fa_fastconformer_hybrid_large.sh new file mode 100644 index 000000000000..8f8516b214fe --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_fa_fastconformer_hybrid_large.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_fa_fastconformer_hybrid_large.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_fr_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_fr_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..53e11266e637 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_fr_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_fr_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_hr_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_hr_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..82e4732455de --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_hr_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_hr_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_hy_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_hy_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..221794fd7b97 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_hy_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_hy_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_it_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_it_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..7a3d8fa10b55 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_it_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_it_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ka_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ka_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..5cdd8acc2703 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ka_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_ka_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc.sh new file mode 100644 index 000000000000..ff2551ae9d62 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_kk_ru_fastconformer_hybrid_large.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_kk_ru_fastconformer_hybrid_large.sh new file mode 100644 index 000000000000..657e3a2af53a --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_kk_ru_fastconformer_hybrid_large.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_kk_ru_fastconformer_hybrid_large.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_nl_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_nl_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..a06569298b4f --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_nl_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_nl_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_pl_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_pl_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..a076ac0bc13a --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_pl_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_pl_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_pt_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_pt_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..7dababc2ad31 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_pt_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_pt_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ru_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ru_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..3f7d4eac4a92 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ru_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_ru_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ua_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ua_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..a590834841ca --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_ua_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_ua_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__stt_uz_fastconformer_hybrid_large_pc.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_uz_fastconformer_hybrid_large_pc.sh new file mode 100644 index 000000000000..1ca2437f1c37 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__stt_uz_fastconformer_hybrid_large_pc.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__stt_uz_fastconformer_hybrid_large_pc.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__tts_en_fastpitch.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__tts_en_fastpitch.sh new file mode 100644 index 000000000000..5bd7197215f4 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__tts_en_fastpitch.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__tts_en_fastpitch.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_nvidia__tts_hifigan.sh b/tests/e2e_nightly/L2_Model_Support_nvidia__tts_hifigan.sh new file mode 100644 index 000000000000..14924c317d99 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_nvidia__tts_hifigan.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_nvidia__tts_hifigan.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_stt_en_fastconformer_hybrid_large_streaming_1040ms.sh b/tests/e2e_nightly/L2_Model_Support_stt_en_fastconformer_hybrid_large_streaming_1040ms.sh new file mode 100644 index 000000000000..ce8803787601 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_stt_en_fastconformer_hybrid_large_streaming_1040ms.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_stt_en_fastconformer_hybrid_large_streaming_1040ms.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.sh b/tests/e2e_nightly/L2_Model_Support_stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.sh new file mode 100644 index 000000000000..341d34b48285 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_titanet_large.sh b/tests/e2e_nightly/L2_Model_Support_titanet_large.sh new file mode 100644 index 000000000000..23351a4eaf9e --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_titanet_large.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_titanet_large.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_tts_en_e2e_fastspeech2hifigan.sh b/tests/e2e_nightly/L2_Model_Support_tts_en_e2e_fastspeech2hifigan.sh new file mode 100644 index 000000000000..e662c07047d8 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_tts_en_e2e_fastspeech2hifigan.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_tts_en_e2e_fastspeech2hifigan.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_vad_multilingual_frame_marblenet.sh b/tests/e2e_nightly/L2_Model_Support_vad_multilingual_frame_marblenet.sh new file mode 100644 index 000000000000..2240fe4e12a6 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_vad_multilingual_frame_marblenet.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_vad_multilingual_frame_marblenet.py" \ + -v diff --git a/tests/e2e_nightly/L2_Model_Support_vad_multilingual_marblenet.sh b/tests/e2e_nightly/L2_Model_Support_vad_multilingual_marblenet.sh new file mode 100644 index 000000000000..7a057c294f41 --- /dev/null +++ b/tests/e2e_nightly/L2_Model_Support_vad_multilingual_marblenet.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + -m pytest \ + "tests/e2e_nightly/test_model_support_vad_multilingual_marblenet.py" \ + -v diff --git a/tests/e2e_nightly/__init__.py b/tests/e2e_nightly/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/e2e_nightly/test_model_support_mel_codec_22khz_fullband_medium.py b/tests/e2e_nightly/test_model_support_mel_codec_22khz_fullband_medium.py new file mode 100644 index 000000000000..eddfb6ec04f7 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_mel_codec_22khz_fullband_medium.py @@ -0,0 +1,139 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for mel_codec_22khz_fullband_medium.""" + +import os + +import pytest +import torch + +MODEL_NAME = "mel_codec_22khz_fullband_medium" +NEMO_FILE = "mel_codec_22khz_fullband_medium.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import AudioCodecModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Exercise the generator loss path via _process_batch (no mocking). + + AudioCodecModel uses manual optimization and requires two optimizers, so + calling training_step() directly outside of a Trainer context would fail. + Instead we drive the forward computation via _process_batch, compute the + primary losses the same way the real training_step does, and verify that + loss.backward() produces gradients. 
+ """ + model = _load_model() + model.train() + d = _DEVICE + + sr = model.sample_rate + num_samples = sr # 1 second of audio + batch = { + "audio": torch.randn(1, num_samples, device=d), + "audio_lens": torch.tensor([num_samples], device=d), + } + + audio, audio_len, audio_gen, commit_loss, codes = model._process_batch(batch) + + generator_losses = [] + loss_mel_l1, loss_mel_l2 = model.mel_loss_fn( + audio_real=audio.float(), audio_gen=audio_gen.float(), audio_len=audio_len + ) + if model.mel_loss_l1_scale: + generator_losses.append(model.mel_loss_l1_scale * loss_mel_l1) + if model.mel_loss_l2_scale: + generator_losses.append(model.mel_loss_l2_scale * loss_mel_l2) + if model.time_domain_loss_scale: + loss_td = model.time_domain_loss_fn(audio_real=audio, audio_gen=audio_gen, audio_len=audio_len) + generator_losses.append(model.time_domain_loss_scale * loss_td) + if model.commit_loss_scale and isinstance(commit_loss, torch.Tensor): + generator_losses.append(model.commit_loss_scale * commit_loss) + + assert generator_losses, "No generator losses were computed." + loss = sum(generator_losses) + + assert isinstance(loss, torch.Tensor), "Loss must be a tensor." + assert loss.ndim == 0, "Loss must be a scalar tensor." + assert torch.isfinite(loss), f"Loss is not finite: {loss.item()}" + loss.backward() + + +def test_model_inference(): + """Encode audio to discrete tokens then decode back to waveform. + + Checks the full encode → decode round-trip: + * ``encode`` returns tokens of shape (B, n_codebooks, T_frames) with + integer values in [0, codebook_size). + * ``decode`` produces audio of shape (B, T_audio) with finite values. + """ + model = _load_model() + model.eval() + d = _DEVICE + + # Use 1 second of audio at the model's native sample rate. 
+ sample_rate = model.sample_rate + audio = torch.randn(1, sample_rate, device=d) + audio_len = torch.tensor([sample_rate], device=d) + + with torch.no_grad(): + tokens, tokens_len = model.encode(audio=audio, audio_len=audio_len) + + # Shape checks for tokens. + assert tokens is not None, "encode() returned None tokens" + assert tokens.ndim == 3, f"Expected tokens shape (B, C, T), got {tokens.shape}" + assert tokens.shape[0] == 1, f"Batch dimension mismatch: {tokens.shape}" + assert ( + tokens.shape[1] == model.num_codebooks + ), f"Codebook dimension {tokens.shape[1]} != model.num_codebooks {model.num_codebooks}" + assert tokens_len.shape == (1,), f"Unexpected tokens_len shape: {tokens_len.shape}" + assert tokens_len[0] > 0, "tokens_len must be positive" + + # Token values must lie within the codebook vocabulary. + assert tokens.min() >= 0, f"Negative token index found: {tokens.min()}" + assert tokens.max() < model.codebook_size, f"Token index {tokens.max()} >= codebook_size {model.codebook_size}" + + # Decode back to audio. + audio_out, audio_out_len = model.decode(tokens=tokens, tokens_len=tokens_len) + + assert audio_out is not None, "decode() returned None audio" + assert audio_out.ndim == 2, f"Expected decoded audio shape (B, T), got {audio_out.shape}" + assert audio_out.shape[0] == 1, f"Batch dimension mismatch after decode: {audio_out.shape}" + assert audio_out_len.shape == (1,), f"Unexpected audio_out_len shape: {audio_out_len.shape}" + assert audio_out_len[0] > 0, "Decoded audio length must be positive" + assert torch.isfinite(audio_out).all(), "Decoded audio contains non-finite values" diff --git a/tests/e2e_nightly/test_model_support_mel_codec_44khz_medium.py b/tests/e2e_nightly/test_model_support_mel_codec_44khz_medium.py new file mode 100644 index 000000000000..6752204e19c1 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_mel_codec_44khz_medium.py @@ -0,0 +1,150 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for mel_codec_44khz_medium.""" + +import os + +import pytest +import torch + +MODEL_NAME = "mel_codec_44khz_medium" +NEMO_FILE = "mel_codec_44khz_medium.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import AudioCodecModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Test one generator+discriminator training step without a Lightning trainer. + + AudioCodecModel uses manual optimization (automatic_optimization=False). + training_step() calls self.optimizers(), self.manual_backward(), self.log_dict(), + self.log(), and self.lr_schedulers() – all Lightning-trainer methods. We stub + those out so we can exercise the actual forward/loss computation end-to-end. + + The discriminator is updated when batch_idx % disc_update_period < disc_updates_per_period. 
+ With disc_update_period=2 and disc_updates_per_period=1, batch_idx=0 triggers the + discriminator step; batch_idx=1 skips it and runs the generator step only. + We run batch_idx=1 to keep the test lighter (no discriminator backward). + Both generator losses (mel + feature-matching + GAN generator) are computed and + manual_backward is called with their sum. + """ + model = _load_model() + model.train() + d = _DEVICE + + # Build two Adam optimizers that mirror configure_optimizers() but without + # needing a trainer or a data-loader. + import itertools + + vq_params = list(model.vector_quantizer.parameters()) if model.vector_quantizer else [] + gen_params = itertools.chain( + model.audio_encoder.parameters(), + model.audio_decoder.parameters(), + vq_params, + ) + optim_gen = torch.optim.Adam(gen_params, lr=2e-4, betas=(0.8, 0.99)) + optim_disc = torch.optim.Adam(model.discriminator.parameters(), lr=2e-4, betas=(0.8, 0.99)) + + # Capture losses passed to manual_backward so we can assert on them. + captured_losses = [] + + def _manual_backward(loss): + captured_losses.append(loss) + loss.backward() + + # Stub out all Lightning-specific methods used inside training_step. + model.optimizers = lambda: (optim_gen, optim_disc) + model.manual_backward = _manual_backward + model.log_dict = lambda *args, **kwargs: None + model.log = lambda *args, **kwargs: None + model.lr_schedulers = lambda: None # update_lr becomes a no-op + # current_epoch and global_step are Lightning properties that return 0 when + # no Trainer is attached (self._trainer is None after restore_from), so no + # further patching is required. + + # mel_codec_44khz_medium uses sample_rate=44100 and samples_per_frame=512. + # Use ~0.37 s of audio (16384 samples, an integer multiple of 512). 
+ n_samples = 16384 + batch = { + "audio": torch.randn(1, n_samples, device=d), + "audio_lens": torch.tensor([n_samples], device=d), + } + + # batch_idx=1 skips the discriminator update (1 % 2 >= 1) so only the + # generator losses are computed and backward is called exactly once. + model.training_step(batch, 1) + + assert len(captured_losses) >= 1, "manual_backward was never called – no generator loss was computed" + gen_loss = captured_losses[-1] + assert gen_loss.dim() == 0, f"Expected scalar generator loss, got shape {gen_loss.shape}" + assert torch.isfinite(gen_loss), f"Generator loss is not finite: {gen_loss.item()}" + + +def test_model_inference(): + """Test encode -> decode round-trip at 44.1 kHz. + + model.encode(audio, audio_len) returns (tokens, tokens_len) where + tokens has shape (batch, num_codebooks, num_frames). + model.decode(tokens, tokens_len) returns (audio, audio_len) where + audio has shape (batch, num_output_samples). + """ + model = _load_model() + model.eval() + d = _DEVICE + + # Use 16384 samples at 44100 Hz (~0.37 s), a multiple of samples_per_frame=512. 
+ n_samples = 16384 + audio_in = torch.randn(1, n_samples, device=d) + audio_len_in = torch.tensor([n_samples], device=d) + + with torch.no_grad(): + tokens, tokens_len = model.encode(audio=audio_in, audio_len=audio_len_in) + + assert tokens is not None, "encode() returned None tokens" + assert tokens.dim() == 3, f"Expected tokens shape (B, C, T), got {tokens.shape}" + assert tokens_len is not None and tokens_len.dim() == 1 + assert tokens_len[0].item() > 0, "tokens_len must be positive" + + audio_out, audio_out_len = model.decode(tokens=tokens, tokens_len=tokens_len) + + assert audio_out is not None, "decode() returned None audio" + assert audio_out.dim() == 2, f"Expected audio shape (B, T), got {audio_out.shape}" + assert audio_out.shape[0] == 1, "Batch size mismatch after decode" + assert audio_out.shape[1] > 0, "Decoded audio has zero length" + assert audio_out_len is not None and audio_out_len[0].item() > 0 diff --git a/tests/e2e_nightly/test_model_support_nvidia__Frame_VAD_Multilingual_MarbleNet_v2_0.py b/tests/e2e_nightly/test_model_support_nvidia__Frame_VAD_Multilingual_MarbleNet_v2_0.py new file mode 100644 index 000000000000..b7a6340f78a8 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__Frame_VAD_Multilingual_MarbleNet_v2_0.py @@ -0,0 +1,91 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/Frame_VAD_Multilingual_MarbleNet_v2.0.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/Frame_VAD_Multilingual_MarbleNet_v2.0" +NEMO_FILE = "nvidia__Frame_VAD_Multilingual_MarbleNet_v2.0.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import EncDecClassificationModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = EncDecClassificationModel.restore_from(filepath, map_location="cpu", strict=False).to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + d = next(model.parameters()).device + # Discover output frame count to build matching labels. 
+ model.eval() + with torch.no_grad(): + probe = model.forward( + input_signal=torch.randn(1, 16000, device=d), + input_signal_length=torch.tensor([16000], device=d), + ) + n_frames = probe.shape[1] + + prepare_for_training_step(model) + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.zeros(2, n_frames, dtype=torch.long, device=d), + torch.tensor([n_frames, n_frames], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + with torch.no_grad(): + logits = model.forward( + input_signal=torch.randn(1, 16000, device=d), + input_signal_length=torch.tensor([16000], device=d), + ) + assert logits is not None + assert logits.dim() >= 2, f"Expected at least 2-D logits, got shape {logits.shape}" + assert torch.isfinite(logits).all() diff --git a/tests/e2e_nightly/test_model_support_nvidia__audio_codec_22khz.py b/tests/e2e_nightly/test_model_support_nvidia__audio_codec_22khz.py new file mode 100644 index 000000000000..7bcb6533583a --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__audio_codec_22khz.py @@ -0,0 +1,122 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/audio-codec-22khz.""" + +import os + +import torch + +MODEL_NAME = "nvidia/audio-codec-22khz" +NEMO_FILE = "nvidia__audio-codec-22khz.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import AudioCodecModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Exercise the generator loss path via _process_batch (no mocking). + + AudioCodecModel uses manual optimization and requires two optimizers, so + calling training_step() directly outside of a Trainer context would fail. + Instead we drive the forward computation via _process_batch, compute the + primary losses the same way the real training_step does, and verify that + loss.backward() produces gradients. 
+ """ + model = _load_model() + model.train() + d = _DEVICE + + sr = model.sample_rate + num_samples = sr # 1 second of audio + batch = { + "audio": torch.randn(1, num_samples, device=d), + "audio_lens": torch.tensor([num_samples], device=d), + } + + audio, audio_len, audio_gen, commit_loss, codes = model._process_batch(batch) + + generator_losses = [] + loss_mel_l1, loss_mel_l2 = model.mel_loss_fn( + audio_real=audio.float(), audio_gen=audio_gen.float(), audio_len=audio_len + ) + if model.mel_loss_l1_scale: + generator_losses.append(model.mel_loss_l1_scale * loss_mel_l1) + if model.mel_loss_l2_scale: + generator_losses.append(model.mel_loss_l2_scale * loss_mel_l2) + if model.time_domain_loss_scale: + loss_td = model.time_domain_loss_fn(audio_real=audio, audio_gen=audio_gen, audio_len=audio_len) + generator_losses.append(model.time_domain_loss_scale * loss_td) + if model.commit_loss_scale and isinstance(commit_loss, torch.Tensor): + generator_losses.append(model.commit_loss_scale * commit_loss) + + assert generator_losses, "No generator losses were computed." + loss = sum(generator_losses) + + assert isinstance(loss, torch.Tensor), "Loss must be a tensor." + assert loss.ndim == 0, "Loss must be a scalar tensor." 
+ assert torch.isfinite(loss), f"Loss is not finite: {loss.item()}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + + sr = model.sample_rate + num_samples = sr # one second of audio at the model's native sample rate + audio = torch.randn(1, num_samples, device=d) + audio_len = torch.tensor([num_samples], device=d) + + with torch.no_grad(): + # Encode: waveform -> discrete tokens + tokens, tokens_len = model.encode(audio=audio, audio_len=audio_len) + assert tokens is not None, "encode() returned None tokens" + assert tokens.ndim == 3, f"Expected tokens shape (B, C, T), got {tokens.shape}" + assert tokens.shape[0] == 1, "Batch dimension mismatch" + assert ( + tokens.shape[1] == model.num_codebooks + ), f"Expected {model.num_codebooks} codebooks, got {tokens.shape[1]}" + assert tokens_len.shape == (1,), f"Unexpected tokens_len shape: {tokens_len.shape}" + + # Decode: discrete tokens -> reconstructed waveform + audio_rec, audio_rec_len = model.decode(tokens=tokens, tokens_len=tokens_len) + assert audio_rec is not None, "decode() returned None audio" + assert audio_rec.ndim == 2, f"Expected reconstructed audio shape (B, T), got {audio_rec.shape}" + assert audio_rec.shape[0] == 1, "Batch dimension mismatch in decoded audio" + assert audio_rec_len.shape == (1,), f"Unexpected audio_rec_len shape: {audio_rec_len.shape}" diff --git a/tests/e2e_nightly/test_model_support_nvidia__audio_codec_44khz.py b/tests/e2e_nightly/test_model_support_nvidia__audio_codec_44khz.py new file mode 100644 index 000000000000..2d4f239b3381 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__audio_codec_44khz.py @@ -0,0 +1,172 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/audio-codec-44khz.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/audio-codec-44khz" +NEMO_FILE = "nvidia__audio-codec-44khz.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import AudioCodecModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Exercise the generator loss path from training_step without a Lightning trainer. + + AudioCodecModel.training_step() relies heavily on Lightning internals + (manual_backward, optimizers(), log_dict). We instead replicate the + generator-loss portion directly: build a synthetic batch, call + _process_batch(), compute every active loss term exactly as training_step + does, sum them, verify the result is a finite scalar, and confirm that + .backward() produces gradients on the encoder/decoder parameters. + """ + model = _load_model() + model.train() + d = _DEVICE + + # Build a synthetic batch. 
_process_batch() expects keys "audio" [B, T] + # and "audio_lens" [B]. Use 1 second of audio at the model sample rate. + sr = model.sample_rate # 44100 for this model + n_samples = sr + batch = { + "audio": torch.randn(1, n_samples, device=d), + "audio_lens": torch.tensor([n_samples], device=d), + } + + # Forward pass through encoder -> VQ -> decoder (same as training_step). + audio, audio_len, audio_gen, commit_loss, codes = model._process_batch(batch) + + generator_losses = [] + + # Mel losses (stft does not support bf16 — cast to float32 as training_step does). + loss_mel_l1, loss_mel_l2 = model.mel_loss_fn( + audio_real=audio.float(), audio_gen=audio_gen.float(), audio_len=audio_len + ) + if model.mel_loss_l1_scale: + generator_losses.append(model.mel_loss_l1_scale * loss_mel_l1) + if model.mel_loss_l2_scale: + generator_losses.append(model.mel_loss_l2_scale * loss_mel_l2) + + if model.stft_loss_scale: + loss_stft = model.stft_loss_fn(audio_real=audio.float(), audio_gen=audio_gen.float(), audio_len=audio_len) + generator_losses.append(model.stft_loss_scale * loss_stft) + + if model.time_domain_loss_scale: + loss_td = model.time_domain_loss_fn(audio_real=audio, audio_gen=audio_gen, audio_len=audio_len) + generator_losses.append(model.time_domain_loss_scale * loss_td) + + if model.si_sdr_loss_scale: + loss_si_sdr = model.si_sdr_loss_fn(audio_real=audio, audio_gen=audio_gen, audio_len=audio_len) + generator_losses.append(model.si_sdr_loss_scale * loss_si_sdr) + + # Discriminator scores for generator and feature-matching losses. 
+ _, disc_scores_gen, fmaps_real, fmaps_gen = model.discriminator(audio_real=audio, audio_gen=audio_gen) + if model.gen_loss_scale: + loss_gen = model.gen_loss_fn(disc_scores_gen=disc_scores_gen) + generator_losses.append(model.gen_loss_scale * loss_gen) + + if model.feature_loss_scale: + loss_feature = model.feature_loss_fn(fmaps_real=fmaps_real, fmaps_gen=fmaps_gen) + generator_losses.append(model.feature_loss_scale * loss_feature) + + if model.commit_loss_scale: + generator_losses.append(model.commit_loss_scale * commit_loss) + + assert generator_losses, "No active loss terms were collected" + loss = sum(generator_losses) + + assert loss.ndim == 0, "Expected a scalar loss tensor" + assert torch.isfinite(loss), f"Loss is not finite: {loss.item()}" + assert loss.item() > 0, f"Expected positive loss, got {loss.item()}" + + loss.backward() + + # Verify that at least some encoder parameters received gradients. + enc_params_with_grad = [p for p in model.audio_encoder.parameters() if p.grad is not None] + assert enc_params_with_grad, "No gradients flowed back to audio_encoder parameters" + + +def test_model_inference(): + """Encode audio to discrete tokens and decode back to audio. + + Verifies the full encode -> decode roundtrip: + - encode() returns tokens of shape [B, num_codebooks, T_frames] and + valid frame lengths. + - decode() converts those tokens back to a time-domain waveform of + shape [B, T_samples]. + - The reconstructed audio length is a multiple of samples_per_frame. 
+ """ + model = _load_model() + model.eval() + d = _DEVICE + + sr = model.sample_rate # 44100 + n_samples = sr # 1 second of audio + audio = torch.randn(1, n_samples, device=d) + audio_len = torch.tensor([n_samples], device=d) + + with torch.no_grad(): + # Encode: audio -> discrete token indices + tokens, tokens_len = model.encode(audio=audio, audio_len=audio_len) + + assert tokens is not None + assert tokens.ndim == 3, f"Expected [B, C, T] tokens, got shape {tokens.shape}" + assert tokens.shape[0] == 1, "Batch size should be 1" + assert ( + tokens.shape[1] == model.num_codebooks + ), f"Expected {model.num_codebooks} codebooks, got {tokens.shape[1]}" + assert tokens_len.shape == (1,) + assert tokens_len[0] == tokens.shape[2], "tokens_len must match the time dimension" + + # Decode: discrete tokens -> reconstructed waveform + audio_recon, audio_recon_len = model.decode(tokens=tokens, tokens_len=tokens_len) + + assert audio_recon is not None + assert audio_recon.ndim == 2, f"Expected [B, T] audio, got shape {audio_recon.shape}" + assert audio_recon.shape[0] == 1 + assert audio_recon_len.shape == (1,) + # Reconstructed length must be a multiple of samples_per_frame + assert audio_recon_len[0] % model.samples_per_frame == 0, ( + f"Reconstructed audio length {audio_recon_len[0]} is not a multiple " + f"of samples_per_frame {model.samples_per_frame}" + ) + assert audio_recon.shape[1] == audio_recon_len[0], "Audio tensor width must match reported audio_recon_len" diff --git a/tests/e2e_nightly/test_model_support_nvidia__canary_180m_flash.py b/tests/e2e_nightly/test_model_support_nvidia__canary_180m_flash.py new file mode 100644 index 000000000000..a07b7a28f29d --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__canary_180m_flash.py @@ -0,0 +1,138 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/canary-180m-flash.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/canary-180m-flash" +NEMO_FILE = "nvidia__canary-180m-flash.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run forward + loss using the model's actual code (no mocking).""" + from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextMiniBatch + + model = _load_model() + model.train() + d = _DEVICE + + # Build a minimal prompted sequence via the model's prompt formatter. 
+ # canary2 format requires additional slots: diarize, timestamp, itn, emotion, decodercontext + turns = [ + { + "role": "user", + "slots": { + "source_lang": "en", + "target_lang": "en", + "pnc": "yes", + "itn": "no", + "timestamp": "no", + "diarize": "no", + "emotion": "<|emo:undefined|>", + "decodercontext": "", + }, + }, + {"role": "assistant", "slots": {"text": "hello world", model.prompt.PROMPT_LANGUAGE_SLOT: "en"}}, + ] + encoded = model.prompt.encode_dialog(turns) + prompt_ids = encoded["context_ids"] + answer_ids = encoded["answer_ids"] + full_ids = encoded["input_ids"] + + audio_len = 16000 + batch = PromptedAudioToTextMiniBatch( + audio=torch.randn(1, audio_len, device=d), + audio_lens=torch.tensor([audio_len], dtype=torch.long, device=d), + transcript=answer_ids.unsqueeze(0).to(d), + transcript_lens=torch.tensor([answer_ids.shape[0]], dtype=torch.long, device=d), + prompt=prompt_ids.unsqueeze(0).to(d), + prompt_lens=torch.tensor([prompt_ids.shape[0]], dtype=torch.long, device=d), + prompted_transcript=full_ids.unsqueeze(0).to(d), + prompted_transcript_lens=torch.tensor([full_ids.shape[0]], dtype=torch.long, device=d), + cuts=None, + ) + + # Replicate the core of training_step: forward + loss. 
+ input_ids, labels = batch.get_decoder_inputs_outputs() + input_ids_lens = batch.prompted_transcript_lens - 1 + + transf_log_probs, encoded_len, enc_states, enc_mask = model.forward( + input_signal=batch.audio, + input_signal_length=batch.audio_lens, + transcript=input_ids, + transcript_length=input_ids_lens, + ) + + loss = model.loss(log_probs=transf_log_probs, labels=labels, output_mask=None) + + assert loss.ndim == 0, f"Expected scalar loss, got shape {loss.shape}" + assert torch.isfinite(loss), f"Loss is not finite: {loss.item()}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe( + audio=[audio], + batch_size=1, + source_lang="en", + target_lang="en", + task="asr", + ) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) diff --git a/tests/e2e_nightly/test_model_support_nvidia__canary_1b.py b/tests/e2e_nightly/test_model_support_nvidia__canary_1b.py new file mode 100644 index 000000000000..4f50a7a0b369 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__canary_1b.py @@ -0,0 +1,149 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/canary-1b.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/canary-1b" +NEMO_FILE = "nvidia__canary-1b.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """ + EncDecMultiTaskModel.training_step() expects a PromptedAudioToTextMiniBatch. + We build one using the model's own prompt formatter so the token IDs are valid, + then exercise the forward + loss path directly (training_step itself also + accesses self._optimizer which is only wired up by Lightning, so we replicate + its core logic here without that dependency). + """ + from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextMiniBatch + + model = _load_model() + model.train() + d = _DEVICE + + # Build a minimal prompted sequence via the model's own prompt formatter. + # Canary prompt format: + # The assistant turn needs 'prompt_language' to select the correct sub-tokenizer + # in the AggregateTokenizer (CanaryTokenizer). 
+ turns = [ + {"role": "user", "slots": {"source_lang": "en", "task": "asr", "target_lang": "en", "pnc": "yes"}}, + {"role": "assistant", "slots": {"text": "hello world", model.prompt.PROMPT_LANGUAGE_SLOT: "en"}}, + ] + encoded = model.prompt.encode_dialog(turns) + # context_ids = prompt portion (user turn only) + # answer_ids = transcript portion (assistant turn, without EOS per canary.py) + # input_ids = full prompted sequence (context + answer) + prompt_ids = encoded["context_ids"] # 1-D tensor + answer_ids = encoded["answer_ids"] # 1-D tensor + full_ids = encoded["input_ids"] # prompt + answer concatenated + + # Build batch tensors (batch size = 1). + audio_len = 16000 # 1 second of audio at 16 kHz + audio = torch.randn(1, audio_len, device=d) + audio_lens = torch.tensor([audio_len], device=d, dtype=torch.long) + + prompted_transcript = full_ids.unsqueeze(0).to(d) # (1, T_full) + prompted_transcript_lens = torch.tensor([full_ids.shape[0]], device=d, dtype=torch.long) + transcript = answer_ids.unsqueeze(0).to(d) # (1, T_ans) + transcript_lens = torch.tensor([answer_ids.shape[0]], device=d, dtype=torch.long) + prompt = prompt_ids.unsqueeze(0).to(d) # (1, T_prompt) + prompt_lens = torch.tensor([prompt_ids.shape[0]], device=d, dtype=torch.long) + + batch = PromptedAudioToTextMiniBatch( + audio=audio, + audio_lens=audio_lens, + transcript=transcript, + transcript_lens=transcript_lens, + prompt=prompt, + prompt_lens=prompt_lens, + prompted_transcript=prompted_transcript, + prompted_transcript_lens=prompted_transcript_lens, + cuts=None, + ) + + # Replicate the core of training_step without the Lightning optimizer access. 
+ input_ids, labels = batch.get_decoder_inputs_outputs() + input_ids_lens = batch.prompted_transcript_lens - 1 + + transf_log_probs, encoded_len, enc_states, enc_mask = model.forward( + input_signal=batch.audio, + input_signal_length=batch.audio_lens, + transcript=input_ids, + transcript_length=input_ids_lens, + ) + + loss = model.loss(log_probs=transf_log_probs, labels=labels, output_mask=None) + + assert loss.ndim == 0, f"Expected scalar loss, got shape {loss.shape}" + assert loss.item() > 0.0, "Expected positive loss" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe( + audio=[audio], + batch_size=1, + source_lang="en", + target_lang="en", + task="asr", + pnc="yes", + ) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) diff --git a/tests/e2e_nightly/test_model_support_nvidia__canary_1b_flash.py b/tests/e2e_nightly/test_model_support_nvidia__canary_1b_flash.py new file mode 100644 index 000000000000..08356b8e7e74 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__canary_1b_flash.py @@ -0,0 +1,138 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/canary-1b-flash.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/canary-1b-flash" +NEMO_FILE = "nvidia__canary-1b-flash.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run forward + loss using the model's actual code (no mocking).""" + from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextMiniBatch + + model = _load_model() + model.train() + d = _DEVICE + + # Build a minimal prompted sequence via the model's prompt formatter. 
+ # canary2 format requires additional slots: diarize, timestamp, itn, emotion, decodercontext + turns = [ + { + "role": "user", + "slots": { + "source_lang": "en", + "target_lang": "en", + "pnc": "yes", + "itn": "no", + "timestamp": "no", + "diarize": "no", + "emotion": "<|emo:undefined|>", + "decodercontext": "", + }, + }, + {"role": "assistant", "slots": {"text": "hello world", model.prompt.PROMPT_LANGUAGE_SLOT: "en"}}, + ] + encoded = model.prompt.encode_dialog(turns) + prompt_ids = encoded["context_ids"] + answer_ids = encoded["answer_ids"] + full_ids = encoded["input_ids"] + + audio_len = 16000 + batch = PromptedAudioToTextMiniBatch( + audio=torch.randn(1, audio_len, device=d), + audio_lens=torch.tensor([audio_len], dtype=torch.long, device=d), + transcript=answer_ids.unsqueeze(0).to(d), + transcript_lens=torch.tensor([answer_ids.shape[0]], dtype=torch.long, device=d), + prompt=prompt_ids.unsqueeze(0).to(d), + prompt_lens=torch.tensor([prompt_ids.shape[0]], dtype=torch.long, device=d), + prompted_transcript=full_ids.unsqueeze(0).to(d), + prompted_transcript_lens=torch.tensor([full_ids.shape[0]], dtype=torch.long, device=d), + cuts=None, + ) + + # Replicate the core of training_step: forward + loss. 
+ input_ids, labels = batch.get_decoder_inputs_outputs() + input_ids_lens = batch.prompted_transcript_lens - 1 + + transf_log_probs, encoded_len, enc_states, enc_mask = model.forward( + input_signal=batch.audio, + input_signal_length=batch.audio_lens, + transcript=input_ids, + transcript_length=input_ids_lens, + ) + + loss = model.loss(log_probs=transf_log_probs, labels=labels, output_mask=None) + + assert loss.ndim == 0, f"Expected scalar loss, got shape {loss.shape}" + assert torch.isfinite(loss), f"Loss is not finite: {loss.item()}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe( + audio=[audio], + batch_size=1, + source_lang="en", + target_lang="en", + task="asr", + ) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) diff --git a/tests/e2e_nightly/test_model_support_nvidia__canary_1b_v2.py b/tests/e2e_nightly/test_model_support_nvidia__canary_1b_v2.py new file mode 100644 index 000000000000..c6e5ca7cfd22 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__canary_1b_v2.py @@ -0,0 +1,166 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/canary-1b-v2.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/canary-1b-v2" +NEMO_FILE = "nvidia__canary-1b-v2.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + from lhotse import CutSet, MonoCut + + from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextMiniBatch + + model = _load_model() + model.train() + d = _DEVICE + + # Build a prompted transcript using the model's prompt formatter. + # encode_dialog with a training turn (user + assistant) returns context_ids, answer_ids, input_ids. + # The assistant turn needs 'prompt_language' to select the correct sub-tokenizer in the + # AggregateTokenizer (CanaryTokenizer). For the user turn, map_manifest_values_to_special_tokens + # auto-injects CANARY_SPECIAL_TOKENIZER whenever language/boolean special-token slots are present. 
+ turns = [ + { + "role": "user", + "slots": { + "source_lang": "en", + "target_lang": "en", + "pnc": "yes", + "itn": "yes", + "timestamp": "notimestamp", + "diarize": "nodiarize", + "decodercontext": "", + "emotion": "<|emo:undefined|>", + }, + }, + { + "role": "assistant", + "slots": { + "text": "hello world", + # Required so the AggregateTokenizer knows which sub-tokenizer to use. + model.prompt.PROMPT_LANGUAGE_SLOT: "en", + }, + }, + ] + encoded = model.prompt.encode_dialog(turns) + # input_ids is the full prompted_transcript [prompt + answer tokens] + input_ids_1d = encoded["input_ids"] # 1-D tensor of token IDs + context_ids_1d = encoded["context_ids"] # prompt portion only + answer_ids_1d = encoded["answer_ids"] # answer portion only + + # Batch dimension = 1 + prompted_transcript = input_ids_1d.unsqueeze(0).to(d) # [1, T_dec] + prompted_transcript_lens = torch.tensor([input_ids_1d.shape[0]], dtype=torch.long, device=d) + prompt_tok = context_ids_1d.unsqueeze(0).to(d) # [1, T_prompt] + prompt_lens = torch.tensor([context_ids_1d.shape[0]], dtype=torch.long, device=d) + transcript = answer_ids_1d.unsqueeze(0).to(d) # [1, T_ans] + transcript_lens = torch.tensor([answer_ids_1d.shape[0]], dtype=torch.long, device=d) + + # 1-second of silence as audio + audio = torch.zeros(1, 16000, device=d) + audio_lens = torch.tensor([16000], dtype=torch.long, device=d) + + # MultiTaskMetric.update() iterates over batch.cuts to filter per-metric constraints, + # so we provide a minimal MonoCut with the required 'custom' attributes. 
+ dummy_cut = MonoCut( + id="test_cut", + start=0.0, + duration=1.0, + channel=0, + custom={"source_lang": "en", "target_lang": "en", "taskname": "asr"}, + ) + cuts = CutSet([dummy_cut]) + + batch = PromptedAudioToTextMiniBatch( + audio=audio, + audio_lens=audio_lens, + transcript=transcript, + transcript_lens=transcript_lens, + prompt=prompt_tok, + prompt_lens=prompt_lens, + prompted_transcript=prompted_transcript, + prompted_transcript_lens=prompted_transcript_lens, + cuts=cuts, + ) + + # training_step reads self._optimizer.param_groups[0]['lr'], so attach a minimal optimizer + optimizer = torch.optim.SGD(model.parameters(), lr=1e-4) + model._optimizer = optimizer + + output = model.training_step(batch, 0) + loss = output["loss"] + assert loss.ndim == 0, f"Expected scalar loss, got shape {loss.shape}" + assert torch.isfinite(loss), f"Expected finite loss, got {loss.item()}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe( + audio=[audio], + batch_size=1, + source_lang="en", + target_lang="en", + task="asr", + pnc="yes", + ) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) diff --git a/tests/e2e_nightly/test_model_support_nvidia__canary_qwen_2_5b.py b/tests/e2e_nightly/test_model_support_nvidia__canary_qwen_2_5b.py new file mode 100644 index 000000000000..774765aaa8e3 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__canary_qwen_2_5b.py @@ -0,0 +1,140 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/canary-qwen-2.5b.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/canary-qwen-2.5b" +NEMO_FILE = "nvidia__canary-qwen-2.5b" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.speechlm2.models import SALM + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = SALM.from_pretrained(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """ + Build a minimal batch matching SALMDataset output and run training_step. 
+
+    The batch must contain:
+      - audios: (B, T_samples) float32 raw waveform
+      - audio_lens: (B,) int64 sample counts
+      - input_ids: (B, T_tokens) int64, contains one audio_locator_tag_id per audio
+      - loss_mask: (B, T_tokens) bool, True where loss should be computed
+
+    prepare_inputs() calls perception(audios, audio_lens) to get audio embeddings, then
+    replaces each audio_locator_tag_id placeholder with the corresponding embedding before
+    passing everything to the LLM. training_step() then computes cross-entropy loss and
+    returns a dict with key "loss".
+    """
+    model = _load_model()
+    model.train()
+    d = _DEVICE
+
+    B = 1
+    sr = model.sampling_rate  # typically 16000
+    T_audio = sr * 2  # 2 seconds of audio
+    audio_locator_id = model.audio_locator_tag_id
+
+    # Build text token sequence: [<audio_locator_tag>, tok, tok, ..., tok]
+    # The audio_locator_tag must appear exactly once to match the single audio sample.
+    T_text = 16
+    # Use token id 1 as a generic text token (safe for most LLM tokenizers).
+    text_tok = 1
+    input_ids = torch.full((B, T_text), text_tok, dtype=torch.long, device=d)
+    input_ids[0, 0] = audio_locator_id  # one placeholder at position 0
+
+    # loss_mask=True on the final few tokens (the "response" portion).
+    loss_mask = torch.zeros(B, T_text, dtype=torch.bool, device=d)
+    loss_mask[0, T_text // 2 :] = True
+
+    audios = torch.randn(B, T_audio, device=d)
+    audio_lens = torch.tensor([T_audio], dtype=torch.long, device=d)
+
+    batch = {
+        "audios": audios,
+        "audio_lens": audio_lens,
+        "input_ids": input_ids,
+        "loss_mask": loss_mask,
+    }
+
+    # training_step calls self.log_dict which emits a warning without a trainer but does
+    # not raise. We suppress that warning so the test output stays clean.
+ import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + result = model.training_step(batch, 0) + + assert "loss" in result, f"training_step must return a dict with 'loss', got keys: {list(result.keys())}" + loss = result["loss"] + assert loss.ndim == 0, f"loss must be a scalar tensor, got shape {loss.shape}" + assert torch.isfinite(loss), f"loss must be finite, got {loss.item()}" + loss.backward() + + +def test_model_inference(): + """ + Run a text-only forward pass through the LLM backbone. + + SALM.forward() takes pre-built input embeddings of shape (B, T, H) — the same + representation that prepare_inputs() produces after splicing in audio embeddings — + and returns {"logits": Tensor[B, T, vocab_size]}. + """ + model = _load_model() + model.eval() + d = _DEVICE + hidden_size = model.llm.config.hidden_size + + B, T = 1, 10 + input_embeds = torch.randn(B, T, hidden_size, device=d) + attention_mask = torch.ones(B, T, dtype=torch.bool, device=d) + + with torch.no_grad(): + result = model.forward(input_embeds=input_embeds, attention_mask=attention_mask) + + assert isinstance(result, dict), f"forward() must return a dict, got {type(result)}" + assert "logits" in result, f"forward() output must contain 'logits', got keys: {list(result.keys())}" + logits = result["logits"] + assert logits.shape[0] == B, f"logits batch dim mismatch: expected {B}, got {logits.shape[0]}" + assert logits.shape[1] == T, f"logits time dim mismatch: expected {T}, got {logits.shape[1]}" + assert ( + logits.shape[2] == model.text_vocab_size + ), f"logits vocab dim mismatch: expected {model.text_vocab_size}, got {logits.shape[2]}" diff --git a/tests/e2e_nightly/test_model_support_nvidia__diar_sortformer_4spk_v1.py b/tests/e2e_nightly/test_model_support_nvidia__diar_sortformer_4spk_v1.py new file mode 100644 index 000000000000..9e592e3f1c96 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__diar_sortformer_4spk_v1.py @@ -0,0 +1,98 @@ +# Copyright (c) 
2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/diar_sortformer_4spk-v1.""" + +import os + +import torch + +MODEL_NAME = "nvidia/diar_sortformer_4spk-v1" +NEMO_FILE = "nvidia__diar_sortformer_4spk-v1.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import SortformerEncLabelModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = SortformerEncLabelModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + d = next(model.parameters()).device + + # Discover output frame count and speaker count by running a forward pass. 
+ num_samples = 64000 # 4 seconds at 16 kHz + model.eval() + with torch.no_grad(): + preds_ref = model.forward( + audio_signal=torch.randn(1, num_samples, device=d), + audio_signal_length=torch.tensor([num_samples], device=d), + ) + num_frames = preds_ref.shape[1] + num_spks = preds_ref.shape[2] + + prepare_for_training_step(model) + # Build batch as a list: [audio_signal, audio_signal_length, targets, target_lens] + batch = [ + torch.randn(1, num_samples, device=d), + torch.tensor([num_samples], dtype=torch.long, device=d), + torch.zeros(1, num_frames, num_spks, device=d), + torch.tensor([num_frames], dtype=torch.long, device=d), + ] + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + with torch.no_grad(): + preds = model.forward( + audio_signal=torch.randn(1, 16000, device=d), + audio_signal_length=torch.tensor([16000], device=d), + ) + # preds: (batch_size, diar_frame_count, num_speakers) with sigmoid probabilities in [0, 1] + assert preds.ndim == 3, f"Expected 3-D preds tensor, got shape {preds.shape}" + assert preds.shape[0] == 1, f"Unexpected batch size: {preds.shape[0]}" + assert preds.shape[2] > 0, "num_speakers dimension is empty" + assert ( + preds.min() >= 0.0 and preds.max() <= 1.0 + ), f"Sigmoid output out of [0,1] range: min={preds.min().item():.4f}, max={preds.max().item():.4f}" diff --git a/tests/e2e_nightly/test_model_support_nvidia__diar_streaming_sortformer_4spk_v2.py b/tests/e2e_nightly/test_model_support_nvidia__diar_streaming_sortformer_4spk_v2.py new file mode 100644 index 000000000000..fe265c47010b --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__diar_streaming_sortformer_4spk_v2.py @@ -0,0 +1,108 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/diar_streaming_sortformer_4spk-v2.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/diar_streaming_sortformer_4spk-v2" +NEMO_FILE = "nvidia__diar_streaming_sortformer_4spk-v2.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import SortformerEncLabelModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = SortformerEncLabelModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + d = next(model.parameters()).device + + # Discover output frame count and speaker count by running a forward pass. 
+ num_samples = 64000 # 4 seconds at 16 kHz + model.eval() + with torch.no_grad(): + preds_ref = model.forward( + audio_signal=torch.randn(1, num_samples, device=d), + audio_signal_length=torch.tensor([num_samples], device=d), + ) + num_frames = preds_ref.shape[1] + num_spks = preds_ref.shape[2] + + prepare_for_training_step(model) + # Build batch as a list: [audio_signal, audio_signal_length, targets, target_lens] + batch = [ + torch.randn(1, num_samples, device=d), + torch.tensor([num_samples], dtype=torch.long, device=d), + torch.zeros(1, num_frames, num_spks, device=d), + torch.tensor([num_frames], dtype=torch.long, device=d), + ] + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + + sr = 16000 + num_samples = 4 * sr # 4 seconds + n_spk = model._cfg.max_num_of_spks # 4 + + audio_signal = torch.randn(1, num_samples, device=d) + audio_signal_length = torch.tensor([num_samples], device=d) + + with torch.no_grad(): + preds = model.forward( + audio_signal=audio_signal, + audio_signal_length=audio_signal_length, + ) + + # preds: (batch, diar_frame_count, num_speakers) of sigmoid probabilities + assert preds.ndim == 3, f"Expected 3-D output, got shape {preds.shape}" + assert preds.shape[0] == 1, f"Expected batch size 1, got {preds.shape[0]}" + assert preds.shape[2] == n_spk, f"Expected {n_spk} speakers, got {preds.shape[2]}" + assert ( + preds.min() >= 0.0 and preds.max() <= 1.0 + ), f"Sigmoid outputs must be in [0, 1]; got [{preds.min():.4f}, {preds.max():.4f}]" diff --git a/tests/e2e_nightly/test_model_support_nvidia__diar_streaming_sortformer_4spk_v2_1.py b/tests/e2e_nightly/test_model_support_nvidia__diar_streaming_sortformer_4spk_v2_1.py new file mode 100644 index 000000000000..2dc8ea38c37c --- /dev/null +++ 
b/tests/e2e_nightly/test_model_support_nvidia__diar_streaming_sortformer_4spk_v2_1.py @@ -0,0 +1,96 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/diar_streaming_sortformer_4spk-v2.1.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/diar_streaming_sortformer_4spk-v2.1" +NEMO_FILE = "nvidia__diar_streaming_sortformer_4spk-v2.1.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import SortformerEncLabelModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = SortformerEncLabelModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + d = next(model.parameters()).device + + # Discover output frame count and speaker count by running a forward pass. 
+ num_samples = 64000 # 4 seconds at 16 kHz + model.eval() + with torch.no_grad(): + preds_ref = model.forward( + audio_signal=torch.randn(1, num_samples, device=d), + audio_signal_length=torch.tensor([num_samples], device=d), + ) + num_frames = preds_ref.shape[1] + num_spks = preds_ref.shape[2] + + prepare_for_training_step(model) + # Build batch as a list: [audio_signal, audio_signal_length, targets, target_lens] + batch = [ + torch.randn(1, num_samples, device=d), + torch.tensor([num_samples], dtype=torch.long, device=d), + torch.zeros(1, num_frames, num_spks, device=d), + torch.tensor([num_frames], dtype=torch.long, device=d), + ] + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + with torch.no_grad(): + preds = model.forward( + audio_signal=torch.randn(1, 16000, device=d), + audio_signal_length=torch.tensor([16000], device=d), + ) + # preds shape: (batch_size, T_frames, num_speakers) + assert preds.ndim == 3, f"Expected 3-D output (B, T, C), got shape {preds.shape}" + assert preds.shape[0] == 1 + assert preds.shape[2] > 0 # at least one speaker channel diff --git a/tests/e2e_nightly/test_model_support_nvidia__low_frame_rate_speech_codec_22khz.py b/tests/e2e_nightly/test_model_support_nvidia__low_frame_rate_speech_codec_22khz.py new file mode 100644 index 000000000000..b4efdf6dd803 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__low_frame_rate_speech_codec_22khz.py @@ -0,0 +1,135 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/low-frame-rate-speech-codec-22khz.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/low-frame-rate-speech-codec-22khz" +NEMO_FILE = "nvidia__low-frame-rate-speech-codec-22khz.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import AudioCodecModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Test training forward pass and backward through reconstruction losses. + + AudioCodecModel uses manual optimization (automatic_optimization=False) and + requires a Lightning trainer for training_step() (optimizers, manual_backward, + log_dict). Instead we exercise the same computation path directly: call + _process_batch() to obtain real/generated audio, then compute the mel + reconstruction loss (l1 + l2) — the primary per-step generator loss — and + call .backward() on it to verify that gradients flow back through the entire + encoder-quantizer-decoder chain. 
+ """ + model = _load_model() + model.train() + d = _DEVICE + + # Build a synthetic batch that matches the format expected by _process_batch: + # "audio" -> [B, T] float waveform at model.sample_rate + # "audio_lens" -> [B] valid sample counts + # Use 1 second of audio at the model's native sample rate. + sample_rate = model.sample_rate # 22050 for this checkpoint + num_samples = sample_rate # 1 s + batch = { + "audio": torch.randn(1, num_samples, device=d, requires_grad=False), + "audio_lens": torch.tensor([num_samples], device=d), + } + + # Run the shared forward pass used by training_step. + audio, audio_len, audio_gen, commit_loss, codes = model._process_batch(batch) + + # Compute the mel reconstruction loss (l1 + l2) — same calls as training_step. + loss_mel_l1, loss_mel_l2 = model.mel_loss_fn( + audio_real=audio.float(), + audio_gen=audio_gen.float(), + audio_len=audio_len, + ) + loss = model.mel_loss_l1_scale * loss_mel_l1 + model.mel_loss_l2_scale * loss_mel_l2 + + assert loss is not None, "mel loss must not be None" + assert loss.ndim == 0, "mel loss must be a scalar" + assert torch.isfinite(loss), f"mel loss must be finite, got {loss.item()}" + + loss.backward() + + # Verify that at least some encoder and decoder parameters received gradients. 
+ enc_params_with_grad = [p for p in model.audio_encoder.parameters() if p.grad is not None] + dec_params_with_grad = [p for p in model.audio_decoder.parameters() if p.grad is not None] + assert len(enc_params_with_grad) > 0, "encoder parameters must have gradients after backward" + assert len(dec_params_with_grad) > 0, "decoder parameters must have gradients after backward" + + +def test_model_inference(): + """Test encode/decode round-trip: audio -> tokens -> reconstructed audio.""" + model = _load_model() + model.eval() + d = _DEVICE + + sample_rate = model.sample_rate # 22050 for this checkpoint + num_samples = sample_rate # 1 s of audio + audio = torch.randn(1, num_samples, device=d) + audio_len = torch.tensor([num_samples], device=d) + + with torch.no_grad(): + # Encode: audio -> discrete tokens + tokens, tokens_len = model.encode(audio=audio, audio_len=audio_len) + + assert tokens is not None, "tokens must not be None" + assert tokens.ndim == 3, f"tokens must be [B, C, T], got shape {tuple(tokens.shape)}" + assert tokens.shape[0] == 1, "batch size must be 1" + assert ( + tokens.shape[1] == model.num_codebooks + ), f"expected {model.num_codebooks} codebooks, got {tokens.shape[1]}" + assert tokens_len.shape == (1,), f"tokens_len must have shape (1,), got {tuple(tokens_len.shape)}" + + # Decode: discrete tokens -> reconstructed audio + audio_out, audio_out_len = model.decode(tokens=tokens, tokens_len=tokens_len) + + assert audio_out is not None, "decoded audio must not be None" + assert audio_out.ndim == 2, f"decoded audio must be [B, T], got shape {tuple(audio_out.shape)}" + assert audio_out.shape[0] == 1, "decoded audio batch size must be 1" + assert audio_out_len.shape == (1,), f"audio_out_len must have shape (1,), got {tuple(audio_out_len.shape)}" + # The decoded length must cover the original number of input samples. 
+ assert ( + audio_out_len[0] >= num_samples + ), f"decoded length {audio_out_len[0].item()} must be >= input length {num_samples}" diff --git a/tests/e2e_nightly/test_model_support_nvidia__magpie_tts_multilingual_357m.py b/tests/e2e_nightly/test_model_support_nvidia__magpie_tts_multilingual_357m.py new file mode 100644 index 000000000000..2ad062a5f102 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__magpie_tts_multilingual_357m.py @@ -0,0 +1,131 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/magpie_tts_multilingual_357m.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/magpie_tts_multilingual_357m" +NEMO_FILE = "nvidia__magpie_tts_multilingual_357m.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import MagpieTTSModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = MagpieTTSModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Exercise MagpieTTSModel's core training computation via process_batch(). + + MagpieTTSModel.training_step() calls self.log() / self.log_dict() which + require an attached Lightning Trainer. We therefore call process_batch() + directly — exactly the computation training_step() delegates to — and verify + that a scalar, finite loss flows correctly through backward(). + + The model is of type 'decoder_ce' with a baked context embedding, so no + context audio is needed in the batch. The batch only requires: + text – phoneme/character token IDs (B, T_text) + text_lens – actual text lengths (B,) + audio_codes – codec token indices (B, num_codebooks, T_audio) + audio_codes_lens – actual audio code lengths (B,) + """ + model = _load_model() + model.train() + d = _DEVICE + + # Derive safe upper bounds for token IDs from the model's embedding tables. + # text_embedding vocab = num_tokens_tokenizer + 2 (BOS/EOS). 
+ text_vocab_size = model.text_embedding.num_embeddings + # audio_codes must be strictly less than codebook_size (special tokens are + # appended *after* codebook_size inside process_batch / add_special_tokens). + audio_token_max = model.codebook_size - 1 + + B = 1 + T_text = 8 # short phoneme sequence + T_audio = 20 # short audio sequence (frames) + + # Text tokens: use a safe non-special ID (1) to avoid BOS/EOS collisions. + text = torch.ones(B, T_text, dtype=torch.long, device=d) + text_lens = torch.tensor([T_text], dtype=torch.long, device=d) + + # Audio codec tokens: shape (B, num_codebooks, T_audio). + audio_codes = torch.randint( + low=0, + high=audio_token_max, + size=(B, model.num_audio_codebooks, T_audio), + dtype=torch.long, + device=d, + ) + audio_codes_lens = torch.tensor([T_audio], dtype=torch.long, device=d) + + batch = { + "text": text, + "text_lens": text_lens, + "audio_codes": audio_codes, + "audio_codes_lens": audio_codes_lens, + } + + batch_output = model.process_batch(batch) + loss = batch_output["loss"] + + assert isinstance(loss, torch.Tensor), "process_batch() must return a tensor loss." + assert loss.ndim == 0, f"Expected scalar loss, got shape {loss.shape}." + assert torch.isfinite(loss), f"Loss is not finite: {loss.item()}" + + loss.backward() + + +def test_model_inference(): + """Test MagpieTTSModel speech synthesis via do_tts(). + + do_tts() returns a tuple of (audio, audio_len) where audio has shape + (1, T_audio_samples) and audio_len has shape (1,). + """ + model = _load_model() + model.eval() + with torch.no_grad(): + audio, audio_len = model.do_tts(transcript="hello world", language="en") + + assert audio is not None, "do_tts() returned None audio." + assert isinstance(audio, torch.Tensor), f"Expected Tensor, got {type(audio)}." + assert audio.ndim == 2, f"Expected 2-D audio tensor (1, T), got shape {audio.shape}." + assert audio.shape[0] == 1, f"Batch dimension mismatch: {audio.shape}." 
+ assert audio_len is not None, "do_tts() returned None audio_len." + assert isinstance(audio_len, torch.Tensor), f"Expected Tensor for audio_len, got {type(audio_len)}." + assert audio_len.shape == (1,), f"Unexpected audio_len shape: {audio_len.shape}." + assert audio_len[0] > 0, "Generated audio length must be positive." diff --git a/tests/e2e_nightly/test_model_support_nvidia__mel_codec_22khz.py b/tests/e2e_nightly/test_model_support_nvidia__mel_codec_22khz.py new file mode 100644 index 000000000000..da3f43832370 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__mel_codec_22khz.py @@ -0,0 +1,122 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/mel-codec-22khz.""" + +import os + +import torch + +MODEL_NAME = "nvidia/mel-codec-22khz" +NEMO_FILE = "nvidia__mel-codec-22khz.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import AudioCodecModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Exercise the generator reconstruction loss path without a Lightning Trainer. + + AudioCodecModel uses manual optimization and requires two optimizers, so + calling training_step() directly outside of a Trainer context would fail at + self.optimizers(). Instead we drive the equivalent forward computation via + _process_batch, compute the primary mel reconstruction loss the same way the + real training_step does, and verify that loss.backward() produces gradients. + """ + model = _load_model() + model.train() + d = _DEVICE + + # Use one second of audio at the model's native sample rate so that at + # least one encoded frame is produced regardless of samples_per_frame. + sr = model.sample_rate + num_samples = sr # 1 second + batch = { + "audio": torch.randn(1, num_samples, device=d), + "audio_lens": torch.tensor([num_samples], device=d), + } + + # _process_batch runs encoder -> (optional) vector quantizer -> decoder. 
+ audio, audio_len, audio_gen, commit_loss, codes = model._process_batch(batch) + + # Compute the L1 mel loss, which is always enabled (mel_loss_l1_scale > 0 + # by default) and is the primary reconstruction objective. + loss_mel_l1, loss_mel_l2 = model.mel_loss_fn( + audio_real=audio.float(), + audio_gen=audio_gen.float(), + audio_len=audio_len, + ) + loss = model.mel_loss_l1_scale * loss_mel_l1 + if model.mel_loss_l2_scale: + loss = loss + model.mel_loss_l2_scale * loss_mel_l2 + + assert loss is not None + assert torch.isfinite(loss), f"Expected finite loss, got {loss.item()}" + + loss.backward() + + # Verify at least one encoder parameter received a gradient. + grads = [p.grad for p in model.audio_encoder.parameters() if p.grad is not None] + assert len(grads) > 0, "No gradients found in audio_encoder after backward()" + + +def test_model_inference(): + """Verify encode -> decode round-trip produces valid audio output.""" + model = _load_model() + model.eval() + d = _DEVICE + + # Use one second at the model's native sample rate. + sr = model.sample_rate + num_samples = sr + audio = torch.randn(1, num_samples, device=d) + audio_len = torch.tensor([num_samples], device=d) + + with torch.no_grad(): + tokens, tokens_len = model.encode(audio=audio, audio_len=audio_len) + + assert tokens is not None + assert tokens.ndim == 3, f"Expected tokens of shape (B, C, T), got {tokens.shape}" + assert tokens_len is not None + + # Decode the tokens back to audio. 
+ audio_out, audio_out_len = model.decode(tokens=tokens, tokens_len=tokens_len) + + assert audio_out is not None + assert audio_out.ndim == 2, f"Expected decoded audio of shape (B, T), got {audio_out.shape}" + assert audio_out_len is not None + assert torch.isfinite(audio_out).all(), "Decoded audio contains non-finite values" diff --git a/tests/e2e_nightly/test_model_support_nvidia__mel_codec_44khz.py b/tests/e2e_nightly/test_model_support_nvidia__mel_codec_44khz.py new file mode 100644 index 000000000000..aed169a6c7a6 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__mel_codec_44khz.py @@ -0,0 +1,123 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/mel-codec-44khz.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/mel-codec-44khz" +NEMO_FILE = "nvidia__mel-codec-44khz.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import AudioCodecModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Exercise the generator reconstruction loss path without a Lightning Trainer. + + AudioCodecModel uses manual optimization and requires two optimizers, so + calling training_step() directly outside of a Trainer context would fail at + self.optimizers(). Instead we drive the equivalent forward computation via + _process_batch, compute the primary mel reconstruction loss the same way the + real training_step does, and verify that loss.backward() produces gradients. + """ + model = _load_model() + model.train() + d = _DEVICE + + # Use one second of audio at the model's native sample rate so that at + # least one encoded frame is produced regardless of samples_per_frame. + sr = model.sample_rate + num_samples = sr # 1 second + batch = { + "audio": torch.randn(1, num_samples, device=d), + "audio_lens": torch.tensor([num_samples], device=d), + } + + # _process_batch runs encoder -> (optional) vector quantizer -> decoder. 
+ audio, audio_len, audio_gen, commit_loss, codes = model._process_batch(batch) + + # Compute the L1 mel loss, which is always enabled (mel_loss_l1_scale > 0 + # by default) and is the primary reconstruction objective. + loss_mel_l1, loss_mel_l2 = model.mel_loss_fn( + audio_real=audio.float(), + audio_gen=audio_gen.float(), + audio_len=audio_len, + ) + loss = model.mel_loss_l1_scale * loss_mel_l1 + if model.mel_loss_l2_scale: + loss = loss + model.mel_loss_l2_scale * loss_mel_l2 + + assert loss is not None + assert torch.isfinite(loss), f"Expected finite loss, got {loss.item()}" + + loss.backward() + + # Verify at least one encoder parameter received a gradient. + grads = [p.grad for p in model.audio_encoder.parameters() if p.grad is not None] + assert len(grads) > 0, "No gradients found in audio_encoder after backward()" + + +def test_model_inference(): + """Verify encode -> decode round-trip produces valid audio output.""" + model = _load_model() + model.eval() + d = _DEVICE + + # Use one second at the model's native sample rate. + sr = model.sample_rate + num_samples = sr + audio = torch.randn(1, num_samples, device=d) + audio_len = torch.tensor([num_samples], device=d) + + with torch.no_grad(): + tokens, tokens_len = model.encode(audio=audio, audio_len=audio_len) + + assert tokens is not None + assert tokens.ndim == 3, f"Expected tokens of shape (B, C, T), got {tokens.shape}" + assert tokens_len is not None + + # Decode the tokens back to audio. 
+ audio_out, audio_out_len = model.decode(tokens=tokens, tokens_len=tokens_len) + + assert audio_out is not None + assert audio_out.ndim == 2, f"Expected decoded audio of shape (B, T), got {audio_out.shape}" + assert audio_out_len is not None + assert torch.isfinite(audio_out).all(), "Decoded audio contains non-finite values" diff --git a/tests/e2e_nightly/test_model_support_nvidia__multitalker_parakeet_streaming_0_6b_v1.py b/tests/e2e_nightly/test_model_support_nvidia__multitalker_parakeet_streaming_0_6b_v1.py new file mode 100644 index 000000000000..0370d9ea8758 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__multitalker_parakeet_streaming_0_6b_v1.py @@ -0,0 +1,103 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/multitalker-parakeet-streaming-0.6b-v1.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/multitalker-parakeet-streaming-0.6b-v1" +NEMO_FILE = "nvidia__multitalker-parakeet-streaming-0.6b-v1.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + import math + + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + + # Multitalker training_step expects a 6-element batch: + # (signal, signal_len, transcript, transcript_len, spk_targets, bg_spk_targets) + T_audio = 16000 + T_enc = math.ceil(math.ceil(T_audio / 160) / 8) + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, T_audio, device=d), + torch.tensor([T_audio, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + torch.ones(2, T_enc, device=d), + torch.zeros(2, T_enc, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Run 
encoder-only forward pass in eval mode to verify inference shapes. + + Speaker targets are set to None to trigger single-speaker / all-ones-mask mode. + """ + model = _load_model() + model.eval() + d = _DEVICE + + # Use None speaker targets to trigger single-speaker / all-ones-mask mode. + model.set_speaker_targets(None, None) + + with torch.no_grad(): + encoded, encoded_len = model.forward( + input_signal=torch.randn(1, 16000, device=d), + input_signal_length=torch.tensor([16000], device=d), + ) + + assert encoded is not None + assert encoded.ndim == 3 + assert encoded_len.shape == (1,) + + model.clear_speaker_targets() diff --git a/tests/e2e_nightly/test_model_support_nvidia__nemo_nano_codec_22khz_0_6kbps_12_5fps.py b/tests/e2e_nightly/test_model_support_nvidia__nemo_nano_codec_22khz_0_6kbps_12_5fps.py new file mode 100644 index 000000000000..dce4c94155fe --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__nemo_nano_codec_22khz_0_6kbps_12_5fps.py @@ -0,0 +1,129 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps.""" + +import os + +import torch + +MODEL_NAME = "nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps" +NEMO_FILE = "nvidia__nemo-nano-codec-22khz-0.6kbps-12.5fps.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import AudioCodecModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Exercise the generator forward pass without the discriminator. + + The nano-codec 0.6kbps model uses a WavLM-based discriminator that + raises 'requires_grad on non-leaf' errors. We bypass training_step + and exercise the generator path directly via _process_batch() + mel/ + time-domain losses, matching the approach used for 1.78kbps. 
+ """ + model = _load_model() + model.train() + d = _DEVICE + + sample_rate = model.sample_rate + num_samples = sample_rate # 1 second of audio + audio = torch.randn(1, num_samples, device=d, requires_grad=False) + audio_len = torch.tensor([num_samples], dtype=torch.long, device=d) + batch = {"audio": audio, "audio_lens": audio_len} + + audio_ref, audio_ref_len, audio_gen, commit_loss, _ = model._process_batch(batch) + + loss_mel_l1, loss_mel_l2 = model.mel_loss_fn( + audio_real=audio_ref.float(), + audio_gen=audio_gen.float(), + audio_len=audio_ref_len, + ) + loss_time = model.time_domain_loss_fn( + audio_real=audio_ref, + audio_gen=audio_gen, + audio_len=audio_ref_len, + ) + + generator_losses = [] + if model.mel_loss_l1_scale: + generator_losses.append(model.mel_loss_l1_scale * loss_mel_l1) + if model.mel_loss_l2_scale: + generator_losses.append(model.mel_loss_l2_scale * loss_mel_l2) + if model.time_domain_loss_scale: + generator_losses.append(model.time_domain_loss_scale * loss_time) + if model.commit_loss_scale and isinstance(commit_loss, torch.Tensor): + generator_losses.append(model.commit_loss_scale * commit_loss) + + assert generator_losses, "No generator losses were computed." + loss = sum(generator_losses) + + assert isinstance(loss, torch.Tensor), "Loss must be a tensor." + assert loss.ndim == 0, "Loss must be a scalar tensor." 
+ assert torch.isfinite(loss), f"Loss is not finite: {loss.item()}" + loss.backward() + + +def test_model_inference(): + """Verify encode→decode round-trip shapes and value sanity.""" + model = _load_model() + model.eval() + d = _DEVICE + + sample_rate = model.sample_rate # typically 22050 for this model + num_samples = sample_rate # 1 second of audio + audio = torch.randn(1, num_samples, device=d) + audio_len = torch.tensor([num_samples], dtype=torch.long, device=d) + + with torch.no_grad(): + # Encode: audio waveform → discrete tokens [B, num_codebooks, T_frames] + tokens, tokens_len = model.encode(audio=audio, audio_len=audio_len) + + assert tokens is not None, "encode() returned None tokens." + assert tokens.ndim == 3, f"Expected 3-D token tensor [B, C, T], got shape {tuple(tokens.shape)}." + assert tokens.shape[0] == 1, "Batch dimension mismatch." + assert tokens_len.shape == (1,), f"Unexpected tokens_len shape: {tuple(tokens_len.shape)}." + assert tokens_len[0] > 0, "Encoded length must be positive." + + # Decode: discrete tokens → reconstructed waveform [B, T_audio] + audio_rec, audio_rec_len = model.decode(tokens=tokens, tokens_len=tokens_len) + + assert audio_rec is not None, "decode() returned None audio." + assert audio_rec.ndim == 2, f"Expected 2-D audio tensor [B, T], got shape {tuple(audio_rec.shape)}." + assert audio_rec.shape[0] == 1, "Batch dimension mismatch in decoded audio." + assert audio_rec_len[0] > 0, "Decoded audio length must be positive." + assert torch.isfinite(audio_rec).all(), "Decoded audio contains non-finite values." diff --git a/tests/e2e_nightly/test_model_support_nvidia__nemo_nano_codec_22khz_1_78kbps_12_5fps.py b/tests/e2e_nightly/test_model_support_nvidia__nemo_nano_codec_22khz_1_78kbps_12_5fps.py new file mode 100644 index 000000000000..65ce6f978ea3 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__nemo_nano_codec_22khz_1_78kbps_12_5fps.py @@ -0,0 +1,147 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functional tests for nvidia/nemo-nano-codec-22khz-1.78kbps-12.5fps."""

import os

import torch

MODEL_NAME = "nvidia/nemo-nano-codec-22khz-1.78kbps-12.5fps"
NEMO_FILE = "nvidia__nemo-nano-codec-22khz-1.78kbps-12.5fps.nemo"

# Checkpoint directory; CI can override via either environment variable.
MODEL_DIR = os.environ.get(
    "NEMO_MODEL_SUPPORT_DIR",
    os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"),
)
_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Module-level cache so the checkpoint is restored only once per test session.
_model = None


def _load_model():
    """Restore the AudioCodecModel checkpoint once and cache it at module level."""
    global _model
    if _model is not None:
        return _model
    from nemo.collections.tts.models import AudioCodecModel

    filepath = os.path.join(MODEL_DIR, NEMO_FILE)
    _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE)
    return _model


def test_model_init():
    """Smoke-test checkpoint restore and config serialization."""
    model = _load_model()
    assert model is not None
    if hasattr(model, "to_config_dict"):
        cfg = model.to_config_dict()
        assert cfg is not None


def test_model_training_step():
    """Exercise the forward pass used during training.

    AudioCodecModel.training_step() requires a full Lightning Trainer context
    (two optimizers, manual_backward, log_dict, etc.). Instead we directly
    exercise the core computation: _process_batch() followed by the generator
    reconstruction losses, a summed loss, and loss.backward().
    """
    model = _load_model()
    model.train()
    d = _DEVICE

    # One second of audio at the model's native sample rate.
    sample_rate = model.sample_rate  # 22050 for this codec
    num_samples = sample_rate  # 1 s

    audio = torch.randn(1, num_samples, device=d, requires_grad=False)
    audio_len = torch.tensor([num_samples], dtype=torch.long, device=d)
    batch = {"audio": audio, "audio_lens": audio_len}

    # _process_batch runs encoder -> (optional) quantizer -> decoder and returns
    # the (padded) reference audio, its lengths, the generated audio, the
    # commit loss from the quantizer, and the encoded representation.
    audio_ref, audio_ref_len, audio_gen, commit_loss, _ = model._process_batch(batch)

    # Compute the mel reconstruction losses (the primary generator losses).
    # mel_loss_fn returns (loss_l1, loss_l2); both are scalar tensors.
    loss_mel_l1, loss_mel_l2 = model.mel_loss_fn(
        audio_real=audio_ref.float(),
        audio_gen=audio_gen.float(),
        audio_len=audio_ref_len,
    )

    # Time-domain loss.
    loss_time = model.time_domain_loss_fn(
        audio_real=audio_ref,
        audio_gen=audio_gen,
        audio_len=audio_ref_len,
    )

    # Accumulate scaled losses exactly as training_step does.
    generator_losses = []
    if model.mel_loss_l1_scale:
        generator_losses.append(model.mel_loss_l1_scale * loss_mel_l1)
    if model.mel_loss_l2_scale:
        generator_losses.append(model.mel_loss_l2_scale * loss_mel_l2)
    if model.time_domain_loss_scale:
        generator_losses.append(model.time_domain_loss_scale * loss_time)
    if model.commit_loss_scale and isinstance(commit_loss, torch.Tensor):
        generator_losses.append(model.commit_loss_scale * commit_loss)

    assert generator_losses, "No generator losses were computed — check model config."
    loss = sum(generator_losses)

    assert isinstance(loss, torch.Tensor), "Loss must be a tensor."
    assert loss.ndim == 0, "Loss must be a scalar tensor."
    assert torch.isfinite(loss), f"Loss is not finite: {loss.item()}"

    loss.backward()


def test_model_inference():
    """Test encode -> decode round-trip (inference path)."""
    model = _load_model()
    model.eval()
    d = _DEVICE

    sample_rate = model.sample_rate  # 22050 for this codec
    num_samples = sample_rate  # 1 s

    audio = torch.randn(1, num_samples, device=d)
    audio_len = torch.tensor([num_samples], dtype=torch.long, device=d)

    with torch.no_grad():
        # Encode: audio -> discrete tokens.
        tokens, tokens_len = model.encode(audio=audio, audio_len=audio_len)

        assert tokens is not None, "encode() returned None tokens."
        # tokens shape: (batch, num_codebooks, num_frames)
        assert tokens.ndim == 3, f"Expected 3-D tokens tensor, got shape {tokens.shape}."
        assert tokens.shape[0] == 1, "Batch dimension mismatch."
        assert tokens_len.shape == (1,), f"Unexpected tokens_len shape: {tokens_len.shape}."
        assert tokens_len[0] > 0, "tokens_len must be positive."
        # Number of frames must match the last dimension of tokens.
        assert (
            tokens_len[0] == tokens.shape[2]
        ), f"tokens_len {tokens_len[0]} does not match tokens time dimension {tokens.shape[2]}."

        # Decode: discrete tokens -> reconstructed audio.
        audio_out, audio_out_len = model.decode(tokens=tokens, tokens_len=tokens_len)

        assert audio_out is not None, "decode() returned None audio."
        assert audio_out.ndim == 2, f"Expected 2-D audio tensor, got shape {audio_out.shape}."
        assert audio_out.shape[0] == 1, "Batch dimension mismatch in decoded audio."
        assert audio_out_len[0] > 0, "Decoded audio length must be positive."
        assert torch.isfinite(audio_out).all(), "Decoded audio contains non-finite values."
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functional tests for nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps."""

import os

import torch

MODEL_NAME = "nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps"
NEMO_FILE = "nvidia__nemo-nano-codec-22khz-1.89kbps-21.5fps.nemo"

# Checkpoint directory; CI can override via either environment variable.
MODEL_DIR = os.environ.get(
    "NEMO_MODEL_SUPPORT_DIR",
    os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"),
)
_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Module-level cache so the checkpoint is restored only once per test session.
_model = None


def _load_model():
    """Restore the AudioCodecModel checkpoint once and cache it at module level."""
    global _model
    if _model is not None:
        return _model
    from nemo.collections.tts.models import AudioCodecModel

    filepath = os.path.join(MODEL_DIR, NEMO_FILE)
    _model = AudioCodecModel.restore_from(filepath, map_location="cpu").to(_DEVICE)
    return _model


def test_model_init():
    """Smoke-test checkpoint restore and config serialization."""
    model = _load_model()
    assert model is not None
    if hasattr(model, "to_config_dict"):
        cfg = model.to_config_dict()
        assert cfg is not None


def test_model_training_step():
    """Exercise generator forward+loss without discriminator.

    AudioCodecModel uses manual optimisation (automatic_optimization=False)
    and its discriminator architecture may produce non-leaf tensors that
    prevent changing requires_grad flags. We bypass the discriminator and
    compute only the generator reconstruction losses (mel + commit), which
    is sufficient to verify the encoder/decoder/quantizer forward+backward.
    """
    model = _load_model()
    model.train()
    d = _DEVICE

    sample_rate = model.sample_rate
    num_samples = sample_rate  # 1 second of audio
    batch = {
        "audio": torch.randn(1, num_samples, device=d),
        "audio_lens": torch.tensor([num_samples], dtype=torch.long, device=d),
    }

    # Encoded representation ("codes") is unused here, hence the underscore.
    audio, audio_len, audio_gen, commit_loss, _ = model._process_batch(batch)

    # Only compute mel reconstruction losses (skip discriminator).
    loss_mel_l1, loss_mel_l2 = model.mel_loss_fn(
        audio_real=audio.float(), audio_gen=audio_gen.float(), audio_len=audio_len
    )
    loss = loss_mel_l1 + loss_mel_l2
    if model.commit_loss_scale and commit_loss is not None:
        loss = loss + model.commit_loss_scale * commit_loss

    assert loss.ndim == 0
    assert torch.isfinite(loss)
    loss.backward()


def test_model_inference():
    """Encode audio to discrete tokens then decode back to waveform.

    Checks the full encode -> decode round-trip:
      * ``encode`` returns tokens of shape (B, n_codebooks, T_frames) with
        integer values in [0, codebook_size).
      * ``decode`` produces audio of shape (B, T_audio) with finite values.
    """
    model = _load_model()
    model.eval()
    d = _DEVICE

    # Use 1 second of audio at the model's native sample rate.
    sample_rate = model.sample_rate
    audio = torch.randn(1, sample_rate, device=d)
    # dtype made explicit for consistency with the sibling codec tests.
    audio_len = torch.tensor([sample_rate], dtype=torch.long, device=d)

    with torch.no_grad():
        tokens, tokens_len = model.encode(audio=audio, audio_len=audio_len)

        # Shape checks for tokens.
        assert tokens is not None, "encode() returned None tokens"
        assert tokens.ndim == 3, f"Expected tokens shape (B, C, T), got {tokens.shape}"
        assert tokens.shape[0] == 1, f"Batch dimension mismatch: {tokens.shape}"
        assert (
            tokens.shape[1] == model.num_codebooks
        ), f"Codebook dimension {tokens.shape[1]} != model.num_codebooks {model.num_codebooks}"
        assert tokens_len.shape == (1,), f"Unexpected tokens_len shape: {tokens_len.shape}"
        assert tokens_len[0] > 0, "tokens_len must be positive"

        # Token values must lie within the codebook vocabulary.
        assert tokens.min() >= 0, f"Negative token index found: {tokens.min()}"
        assert tokens.max() < model.codebook_size, f"Token index {tokens.max()} >= codebook_size {model.codebook_size}"

        # Decode back to audio.
        audio_out, audio_out_len = model.decode(tokens=tokens, tokens_len=tokens_len)

        assert audio_out is not None, "decode() returned None audio"
        assert audio_out.ndim == 2, f"Expected decoded audio shape (B, T), got {audio_out.shape}"
        assert audio_out.shape[0] == 1, f"Batch dimension mismatch after decode: {audio_out.shape}"
        assert audio_out_len.shape == (1,), f"Unexpected audio_out_len shape: {audio_out_len.shape}"
        assert audio_out_len[0] > 0, "Decoded audio length must be positive"
        assert torch.isfinite(audio_out).all(), "Decoded audio contains non-finite values"
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functional tests for nvidia/nemotron-speech-streaming-en-0.6b."""

import os

import numpy as np
import torch

MODEL_NAME = "nvidia/nemotron-speech-streaming-en-0.6b"
NEMO_FILE = "nvidia__nemotron-speech-streaming-en-0.6b.nemo"

# Checkpoint directory; CI can override via either environment variable.
MODEL_DIR = os.environ.get(
    "NEMO_MODEL_SUPPORT_DIR",
    os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"),
)
_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Module-level cache so the checkpoint is restored only once per test session.
_model = None


def _load_model():
    """Restore the .nemo checkpoint once and cache it at module level."""
    global _model
    if _model is not None:
        return _model
    from nemo.collections.asr.models import ASRModel

    filepath = os.path.join(MODEL_DIR, NEMO_FILE)
    _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE)
    return _model


def test_model_init():
    """Smoke-test checkpoint restore and config serialization."""
    model = _load_model()
    assert model is not None
    if hasattr(model, "to_config_dict"):
        cfg = model.to_config_dict()
        assert cfg is not None


def test_model_training_step():
    """Run one training step via direct training_step() call."""
    # NOTE(review): conftest helper — assumed to attach the trainer/optimizer
    # scaffolding that training_step() needs; confirm against conftest.py.
    from conftest import prepare_for_training_step

    model = _load_model()
    prepare_for_training_step(model)
    d = next(model.parameters()).device
    # RNNT joint: vocabulary size excludes the blank symbol.
    vocab_size = model.joint.num_classes_with_blank - 1
    # Batch layout: (signal, signal_len, transcript_tokens, transcript_len).
    batch = (
        torch.randn(2, 16000, device=d),
        torch.tensor([16000, 12000], device=d),
        torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d),
        torch.tensor([5, 3], dtype=torch.long, device=d),
    )
    result = model.training_step(batch, 0)
    loss = result if isinstance(result, torch.Tensor) else result['loss']
    assert torch.isfinite(loss), f"Loss is not finite: {loss}"
    loss.backward()


def test_model_inference():
    """Test full inference pipeline via model.transcribe()."""
    model = _load_model()
    model.eval()

    from conftest import prepare_for_transcribe

    prepare_for_transcribe(model)

    audio = np.random.randn(16000).astype(np.float32)

    result = model.transcribe(audio=[audio], batch_size=1)
    assert isinstance(result, list)
    assert len(result) == 1
    # transcribe() may return strings or Hypothesis objects
    text = result[0] if isinstance(result[0], str) else result[0].text
    assert isinstance(text, str)

    hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True)
    assert isinstance(hyps, list)
    assert len(hyps) == 1
    assert hasattr(hyps[0], 'text')
+ +"""Functional tests for nvidia/parakeet-ctc-0.6b.""" + +import os + +import numpy as np +import torch + +MODEL_NAME = "nvidia/parakeet-ctc-0.6b" +NEMO_FILE = "nvidia__parakeet-ctc-0.6b.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.tokenizer.vocab_size + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, vocab_size, (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or 
Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_ctc_0_6b_Vietnamese.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_ctc_0_6b_Vietnamese.py new file mode 100644 index 000000000000..c6efcd840971 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_ctc_0_6b_Vietnamese.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-ctc-0.6b-Vietnamese.""" + +import os + +import numpy as np +import torch + +MODEL_NAME = "nvidia/parakeet-ctc-0.6b-Vietnamese" +NEMO_FILE = "nvidia__parakeet-ctc-0.6b-Vietnamese.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.tokenizer.vocab_size + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, vocab_size, (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # 
transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_ctc_1_1b.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_ctc_1_1b.py new file mode 100644 index 000000000000..6439fc3d1e51 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_ctc_1_1b.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-ctc-1.1b.""" + +import os + +import numpy as np +import torch + +MODEL_NAME = "nvidia/parakeet-ctc-1.1b" +NEMO_FILE = "nvidia__parakeet-ctc-1.1b.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.tokenizer.vocab_size + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, vocab_size, (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or 
Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_realtime_eou_120m_v1.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_realtime_eou_120m_v1.py new file mode 100644 index 000000000000..78a4471ddae7 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_realtime_eou_120m_v1.py @@ -0,0 +1,94 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet_realtime_eou_120m-v1.""" + +import os + +import torch + +MODEL_NAME = "nvidia/parakeet_realtime_eou_120m-v1" +NEMO_FILE = "nvidia__parakeet_realtime_eou_120m-v1.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + 
assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_0_6b.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_0_6b.py new file mode 100644 index 000000000000..5cd0b3762d82 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_0_6b.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-rnnt-0.6b.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/parakeet-rnnt-0.6b" +NEMO_FILE = "nvidia__parakeet-rnnt-0.6b.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 
1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_110m_da_dk.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_110m_da_dk.py new file mode 100644 index 000000000000..aec61b458424 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_110m_da_dk.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-rnnt-110m-da-dk.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/parakeet-rnnt-110m-da-dk" +NEMO_FILE = "nvidia__parakeet-rnnt-110m-da-dk.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + 
assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_1_1b.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_1_1b.py new file mode 100644 index 000000000000..4b4138b899cb --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_rnnt_1_1b.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-rnnt-1.1b.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/parakeet-rnnt-1.1b" +NEMO_FILE = "nvidia__parakeet-rnnt-1.1b.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 
1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_0_6b_v2.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_0_6b_v2.py new file mode 100644 index 000000000000..4d3393c76b44 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_0_6b_v2.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-tdt-0.6b-v2.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2" +NEMO_FILE = "nvidia__parakeet-tdt-0.6b-v2.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert 
len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_0_6b_v3.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_0_6b_v3.py new file mode 100644 index 000000000000..f44b46891098 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_0_6b_v3.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-tdt-0.6b-v3.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v3" +NEMO_FILE = "nvidia__parakeet-tdt-0.6b-v3.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert 
len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_1_1b.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_1_1b.py new file mode 100644 index 000000000000..b31e19082c92 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_1_1b.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-tdt-1.1b.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/parakeet-tdt-1.1b" +NEMO_FILE = "nvidia__parakeet-tdt-1.1b.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + 
# transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_0_6b_ja.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_0_6b_ja.py new file mode 100644 index 000000000000..662fad8e70b1 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_0_6b_ja.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-tdt_ctc-0.6b-ja.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/parakeet-tdt_ctc-0.6b-ja" +NEMO_FILE = "nvidia__parakeet-tdt_ctc-0.6b-ja.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + 
assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_110m.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_110m.py new file mode 100644 index 000000000000..0e5ddd579c2e --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_110m.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-tdt_ctc-110m.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/parakeet-tdt_ctc-110m" +NEMO_FILE = "nvidia__parakeet-tdt_ctc-110m.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert 
len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_1_1b.py b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_1_1b.py new file mode 100644 index 000000000000..d67f8705bfd5 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__parakeet_tdt_ctc_1_1b.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/parakeet-tdt_ctc-1.1b.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/parakeet-tdt_ctc-1.1b" +NEMO_FILE = "nvidia__parakeet-tdt_ctc-1.1b.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert 
len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__se_den_sb_16k_small.py b/tests/e2e_nightly/test_model_support_nvidia__se_den_sb_16k_small.py new file mode 100644 index 000000000000..1eaa00ccdec2 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__se_den_sb_16k_small.py @@ -0,0 +1,88 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/se_den_sb_16k_small.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/se_den_sb_16k_small" +NEMO_FILE = "nvidia__se_den_sb_16k_small.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.audio.models import AudioToAudioModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioToAudioModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + # SchroedingerBridgeAudioToAudioModel uses loss_encoded + loss_time (no single self.loss). + # training_step() calls self.log() which requires a Lightning trainer context, so we call + # _step() directly with the same batch format: (input_signal, input_length, target_signal, _). + # Input shape is (B, C, T) — batch, channel, time. + model = _load_model() + model.train() + d = _DEVICE + B, C, T = 1, 1, 16000 + input_signal = torch.randn(B, C, T, device=d) + input_length = torch.tensor([T], device=d) + target_signal = torch.randn(B, C, T, device=d) + loss, _loss_enc, _loss_time = model._step( + target_signal=target_signal, + input_signal=input_signal, + input_length=input_length, + ) + assert loss.shape == torch.Size([]), f"Expected scalar loss, got shape {loss.shape}" + loss.backward() + + +def test_model_inference(): + # forward() is decorated with @torch.inference_mode(), so no_grad wrapper is not needed. + # It returns (output_signal, output_length); output_signal has shape (B, C, T). 
+ model = _load_model() + model.eval() + d = _DEVICE + B, C, T = 1, 1, 16000 + input_signal = torch.randn(B, C, T, device=d) + input_length = torch.tensor([T], device=d) + output_signal, output_length = model( + input_signal=input_signal, + input_length=input_length, + ) + assert output_signal.ndim == 3, f"Expected 3D output (B, C, T), got shape {output_signal.shape}" + assert output_signal.shape[0] == B + assert output_signal.shape[-1] == T diff --git a/tests/e2e_nightly/test_model_support_nvidia__se_der_sb_16k_small.py b/tests/e2e_nightly/test_model_support_nvidia__se_der_sb_16k_small.py new file mode 100644 index 000000000000..23b793a22c3f --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__se_der_sb_16k_small.py @@ -0,0 +1,101 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/se_der_sb_16k_small.""" + +import os + +import torch + +MODEL_NAME = "nvidia/se_der_sb_16k_small" +NEMO_FILE = "nvidia__se_der_sb_16k_small.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.audio.models import AudioToAudioModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioToAudioModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + # SchroedingerBridgeAudioToAudioModel uses loss_encoded + loss_time (no single self.loss). + # Call _step() directly to avoid self.log() / self._optimizer accesses that require a Trainer. + # Batch format (tuple path): (input_signal, input_length, target_signal, target_length) + # Signals are shape (B, C, T); the model internally rearranges 2-D inputs to (B, 1, T). 
+ model = _load_model() + model.train() + d = _DEVICE + + batch_size = 2 + num_samples = 16000 # 1 second at 16 kHz + # Build synthetic multi-channel (C=1) audio tensors in the correct (B, C, T) layout + input_signal = torch.randn(batch_size, 1, num_samples, device=d) + input_length = torch.tensor([num_samples, num_samples], device=d) + target_signal = torch.randn(batch_size, 1, num_samples, device=d) + + loss, loss_encoded, loss_time = model._step( + target_signal=target_signal, + input_signal=input_signal, + input_length=input_length, + ) + + assert isinstance(loss, torch.Tensor), "loss must be a tensor" + assert loss.ndim == 0, "loss must be a scalar tensor" + assert torch.isfinite(loss), "loss must be finite" + loss.backward() + + +def test_model_inference(): + # forward() is decorated with @torch.inference_mode() and returns (output_signal, output_length). + model = _load_model() + model.eval() + d = _DEVICE + + input_signal = torch.randn(1, 1, 16000, device=d) + input_length = torch.tensor([16000], device=d) + + with torch.no_grad(): + output_signal, output_length = model( + input_signal=input_signal, + input_length=input_length, + ) + + assert isinstance(output_signal, torch.Tensor), "output_signal must be a tensor" + assert ( + output_signal.shape == input_signal.shape + ), f"output shape {output_signal.shape} must match input shape {input_signal.shape}" + assert isinstance(output_length, torch.Tensor), "output_length must be a tensor" + assert ( + output_length.shape == input_length.shape + ), f"output_length shape {output_length.shape} must match input_length shape {input_length.shape}" diff --git a/tests/e2e_nightly/test_model_support_nvidia__speakerverification_en_titanet_large.py b/tests/e2e_nightly/test_model_support_nvidia__speakerverification_en_titanet_large.py new file mode 100644 index 000000000000..b5bb5597b6a5 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__speakerverification_en_titanet_large.py @@ -0,0 +1,85 @@ +# Copyright 
(c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/speakerverification_en_titanet_large.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/speakerverification_en_titanet_large" +NEMO_FILE = "nvidia__speakerverification_en_titanet_large.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import EncDecSpeakerLabelModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = EncDecSpeakerLabelModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + num_classes = model.decoder._num_classes + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, num_classes, (2,), device=d), + torch.tensor([1, 1], 
dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + with torch.no_grad(): + logits, embs = model.forward( + input_signal=torch.randn(1, 16000, device=d), + input_signal_length=torch.tensor([16000], device=d), + ) + assert logits is not None + assert embs is not None + assert embs.ndim == 2 # (B, embedding_dim) + assert embs.shape[0] == 1 + assert torch.isfinite(embs).all() diff --git a/tests/e2e_nightly/test_model_support_nvidia__sr_ssl_flowmatching_16k_430m.py b/tests/e2e_nightly/test_model_support_nvidia__sr_ssl_flowmatching_16k_430m.py new file mode 100644 index 000000000000..d9845a3db872 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__sr_ssl_flowmatching_16k_430m.py @@ -0,0 +1,97 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/sr_ssl_flowmatching_16k_430m.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/sr_ssl_flowmatching_16k_430m" +NEMO_FILE = "nvidia__sr_ssl_flowmatching_16k_430m.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.audio.models import AudioToAudioModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = AudioToAudioModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + model = _load_model() + model.train() + d = _DEVICE + + # FlowMatchingAudioToAudioModel.training_step expects a batch dict with + # input_signal (B, C, T), input_length (B,), and optionally target_signal. + # For the SSL variant the model uses input as its own target when + # target_signal is absent, so we only need to supply input_signal and + # input_length. + T = 16000 + input_signal = torch.randn(1, 1, T, device=d) + input_length = torch.tensor([T], device=d) + + # Call _step directly to avoid the self.log() calls that require a Trainer. 
+ loss = model._step( + target_signal=input_signal.clone(), + input_signal=input_signal, + input_length=input_length, + ) + + assert loss.ndim == 0, f"Expected scalar loss, got shape {loss.shape}" + assert torch.isfinite(loss), f"Expected finite loss, got {loss.item()}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + + T = 16000 + input_signal = torch.randn(1, 1, T, device=d) + input_length = torch.tensor([T], device=d) + + # forward() is decorated with @torch.inference_mode() so no_grad is implicit. + output_signal, output_length = model( + input_signal=input_signal, + input_length=input_length, + ) + + assert output_signal is not None, "Expected non-None output signal" + assert ( + output_signal.shape == input_signal.shape + ), f"Expected output shape {input_signal.shape}, got {output_signal.shape}" + assert output_length is not None, "Expected non-None output length" diff --git a/tests/e2e_nightly/test_model_support_nvidia__ssl_en_nest_large_v1_0.py b/tests/e2e_nightly/test_model_support_nvidia__ssl_en_nest_large_v1_0.py new file mode 100644 index 000000000000..8c4afa85038a --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__ssl_en_nest_large_v1_0.py @@ -0,0 +1,88 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/ssl_en_nest_large_v1.0.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/ssl_en_nest_large_v1.0" +NEMO_FILE = "nvidia__ssl_en_nest_large_v1.0.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import EncDecDenoiseMaskedTokenPredModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = EncDecDenoiseMaskedTokenPredModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + from nemo.collections.asr.data.ssl_dataset import AudioNoiseBatch + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + torch.manual_seed(0) # fixed seed to avoid intermittent NaN with random inputs + batch = AudioNoiseBatch( + audio=torch.randn(2, 16000, device=d), + audio_len=torch.tensor([16000, 12000], device=d), + noise=torch.randn(2, 16000, device=d), + noise_len=torch.tensor([16000, 12000], device=d), + noisy_audio=torch.randn(2, 16000, device=d), + noisy_audio_len=torch.tensor([16000, 12000], device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + with torch.no_grad(): + log_probs, encoded_len, masks, tokens 
= model.forward( + input_signal=torch.randn(1, 16000, device=d), + input_signal_length=torch.tensor([16000], device=d), + noisy_input_signal=torch.randn(1, 16000, device=d), + noisy_input_signal_length=torch.tensor([16000], device=d), + ) + assert log_probs is not None + assert encoded_len is not None diff --git a/tests/e2e_nightly/test_model_support_nvidia__ssl_en_nest_xlarge_v1_0.py b/tests/e2e_nightly/test_model_support_nvidia__ssl_en_nest_xlarge_v1_0.py new file mode 100644 index 000000000000..0709c0d517ae --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__ssl_en_nest_xlarge_v1_0.py @@ -0,0 +1,90 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/ssl_en_nest_xlarge_v1.0.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/ssl_en_nest_xlarge_v1.0" +NEMO_FILE = "nvidia__ssl_en_nest_xlarge_v1.0.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import EncDecDenoiseMaskedTokenPredModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = EncDecDenoiseMaskedTokenPredModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + from nemo.collections.asr.data.ssl_dataset import AudioNoiseBatch + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + torch.manual_seed(0) # fixed seed to avoid intermittent NaN with random inputs + batch = AudioNoiseBatch( + audio=torch.randn(2, 16000, device=d), + audio_len=torch.tensor([16000, 12000], device=d), + noise=torch.randn(2, 16000, device=d), + noise_len=torch.tensor([16000, 12000], device=d), + noisy_audio=torch.randn(2, 16000, device=d), + noisy_audio_len=torch.tensor([16000, 12000], device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + with torch.no_grad(): + log_probs, encoded_len, masks, 
tokens = model.forward( + input_signal=torch.randn(1, 16000, device=d), + input_signal_length=torch.tensor([16000], device=d), + noisy_input_signal=torch.randn(1, 16000, device=d), + noisy_input_signal_length=torch.tensor([16000], device=d), + ) + assert log_probs is not None + assert encoded_len is not None + assert masks is not None + assert tokens is not None diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_ar_fastconformer_hybrid_large_pc_v1_0.py b/tests/e2e_nightly/test_model_support_nvidia__stt_ar_fastconformer_hybrid_large_pc_v1_0.py new file mode 100644 index 000000000000..08f3b5f0c561 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_ar_fastconformer_hybrid_large_pc_v1_0.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_ar_fastconformer_hybrid_large_pc_v1.0.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_ar_fastconformer_hybrid_large_pc_v1.0" +NEMO_FILE = "nvidia__stt_ar_fastconformer_hybrid_large_pc_v1.0.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], 
batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_ar_fastconformer_hybrid_large_pcd_v1_0.py b/tests/e2e_nightly/test_model_support_nvidia__stt_ar_fastconformer_hybrid_large_pcd_v1_0.py new file mode 100644 index 000000000000..2854f5d5b261 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_ar_fastconformer_hybrid_large_pcd_v1_0.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_ar_fastconformer_hybrid_large_pcd_v1.0.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_ar_fastconformer_hybrid_large_pcd_v1.0" +NEMO_FILE = "nvidia__stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], 
batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_be_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_be_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..5c1bf10c3f5e --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_be_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_be_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_be_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_be_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_de_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_de_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..670aa3a53333 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_de_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_de_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_de_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_de_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_large.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_large.py new file mode 100644 index 000000000000..6abffad179cb --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_large.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_ctc_large.""" + +import os + +import numpy as np +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_ctc_large" +NEMO_FILE = "nvidia__stt_en_fastconformer_ctc_large.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.tokenizer.vocab_size + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, vocab_size, (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 
+ # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_xlarge.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_xlarge.py new file mode 100644 index 000000000000..8d479d8ab667 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_xlarge.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_ctc_xlarge.""" + +import os + +import numpy as np +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_ctc_xlarge" +NEMO_FILE = "nvidia__stt_en_fastconformer_ctc_xlarge.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.tokenizer.vocab_size + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, vocab_size, (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 
1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_xxlarge.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_xxlarge.py new file mode 100644 index 000000000000..afbc55c6bc8f --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_ctc_xxlarge.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_ctc_xxlarge.""" + +import os + +import numpy as np +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_ctc_xxlarge" +NEMO_FILE = "nvidia__stt_en_fastconformer_ctc_xxlarge.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.tokenizer.vocab_size + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, vocab_size, (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) 
== 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..c7e90436f654 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_en_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_large_streaming_multi.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_large_streaming_multi.py new file mode 100644 index 000000000000..7510266c53db --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_large_streaming_multi.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_hybrid_large_streaming_multi.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_hybrid_large_streaming_multi" +NEMO_FILE = "nvidia__stt_en_fastconformer_hybrid_large_streaming_multi.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = 
model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms.py new file mode 100644 index 000000000000..7195def6f086 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_hybrid_medium_streaming_80ms.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_hybrid_medium_streaming_80ms" +NEMO_FILE = "nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = 
model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms_pc.py new file mode 100644 index 000000000000..c00d3de77632 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_hybrid_medium_streaming_80ms_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_hybrid_medium_streaming_80ms_pc" +NEMO_FILE = "nvidia__stt_en_fastconformer_hybrid_medium_streaming_80ms_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = 
model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_tdt_large.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_tdt_large.py new file mode 100644 index 000000000000..c541ee532946 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_tdt_large.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_tdt_large.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_tdt_large" +NEMO_FILE = "nvidia__stt_en_fastconformer_tdt_large.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert 
isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_large.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_large.py new file mode 100644 index 000000000000..1efeb911cc71 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_large.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_transducer_large.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_transducer_large" +NEMO_FILE = "nvidia__stt_en_fastconformer_transducer_large.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_xlarge.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_xlarge.py new file mode 100644 index 000000000000..1a8292aea3dd --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_xlarge.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_transducer_xlarge.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_transducer_xlarge" +NEMO_FILE = "nvidia__stt_en_fastconformer_transducer_xlarge.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], 
batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_xxlarge.py b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_xxlarge.py new file mode 100644 index 000000000000..ff94f0cdc21a --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_en_fastconformer_transducer_xxlarge.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_en_fastconformer_transducer_xxlarge.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_en_fastconformer_transducer_xxlarge" +NEMO_FILE = "nvidia__stt_en_fastconformer_transducer_xxlarge.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], 
batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_es_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_es_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..ce3e73ee1b7f --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_es_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_es_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_es_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_es_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_es_fastconformer_hybrid_large_pc_nc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_es_fastconformer_hybrid_large_pc_nc.py new file mode 100644 index 000000000000..dae7dd0b87bb --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_es_fastconformer_hybrid_large_pc_nc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_es_fastconformer_hybrid_large_pc_nc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_es_fastconformer_hybrid_large_pc_nc" +NEMO_FILE = "nvidia__stt_es_fastconformer_hybrid_large_pc_nc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], 
batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_fa_fastconformer_hybrid_large.py b/tests/e2e_nightly/test_model_support_nvidia__stt_fa_fastconformer_hybrid_large.py new file mode 100644 index 000000000000..9b99c1398774 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_fa_fastconformer_hybrid_large.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_fa_fastconformer_hybrid_large.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_fa_fastconformer_hybrid_large" +NEMO_FILE = "nvidia__stt_fa_fastconformer_hybrid_large.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert 
isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_fr_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_fr_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..37d6f80cc200 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_fr_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_fr_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_fr_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_fr_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_hr_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_hr_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..c2b61d81ba4b --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_hr_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_hr_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_hr_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_hr_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_hy_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_hy_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..fbb6e9cdf63a --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_hy_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_hy_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_hy_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_hy_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_it_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_it_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..bb0a0dae59f2 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_it_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_it_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_it_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_it_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_ka_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_ka_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..716a68a91b80 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_ka_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_ka_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_ka_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_ka_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc.py new file mode 100644 index 000000000000..6e4df48cfb03 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc" +NEMO_FILE = "nvidia__stt_ka_fastconformer_hybrid_transducer_ctc_large_streaming_80ms_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = 
np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_kk_ru_fastconformer_hybrid_large.py b/tests/e2e_nightly/test_model_support_nvidia__stt_kk_ru_fastconformer_hybrid_large.py new file mode 100644 index 000000000000..b634b8dde69c --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_kk_ru_fastconformer_hybrid_large.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_kk_ru_fastconformer_hybrid_large.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_kk_ru_fastconformer_hybrid_large" +NEMO_FILE = "nvidia__stt_kk_ru_fastconformer_hybrid_large.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_nl_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_nl_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..86986f957ae9 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_nl_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_nl_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_nl_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_nl_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_pl_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_pl_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..83d319fe3bbf --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_pl_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_pl_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_pl_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_pl_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_pt_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_pt_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..7106cf74bdee --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_pt_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_pt_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_pt_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_pt_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_ru_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_ru_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..e19482890755 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_ru_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_ru_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_ru_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_ru_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_ua_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_ua_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..0ed46403fd56 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_ua_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_ua_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_ua_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_ua_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__stt_uz_fastconformer_hybrid_large_pc.py b/tests/e2e_nightly/test_model_support_nvidia__stt_uz_fastconformer_hybrid_large_pc.py new file mode 100644 index 000000000000..0563a536f9f5 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__stt_uz_fastconformer_hybrid_large_pc.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/stt_uz_fastconformer_hybrid_large_pc.""" + +import os + +import pytest +import torch + +MODEL_NAME = "nvidia/stt_uz_fastconformer_hybrid_large_pc" +NEMO_FILE = "nvidia__stt_uz_fastconformer_hybrid_large_pc.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + 
assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_nvidia__tts_en_fastpitch.py b/tests/e2e_nightly/test_model_support_nvidia__tts_en_fastpitch.py new file mode 100644 index 000000000000..afc1f811304f --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__tts_en_fastpitch.py @@ -0,0 +1,112 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for nvidia/tts_en_fastpitch.""" + +import os + +import torch + +MODEL_NAME = "nvidia/tts_en_fastpitch" +NEMO_FILE = "nvidia__tts_en_fastpitch.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import FastPitchModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = FastPitchModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + import math + + model = _load_model() + model.train() + + # Model parameters from config: sample_rate=22050, hop_length=256, n_mel_channels=80 + sample_rate = 22050 + hop_length = 256 + B = 1 + T_audio = sample_rate # 1 second of audio + + # T_mel: number of mel frames produced by the preprocessor (pad_to=1) + T_mel = math.ceil(T_audio / hop_length) + T_text = 10 # number of text tokens + + audio = torch.randn(B, T_audio).to(_DEVICE) + audio_lens = torch.tensor([T_audio], dtype=torch.int32).to(_DEVICE) + text = torch.randint(1, 80, (B, T_text), dtype=torch.long).to(_DEVICE) + text_lens = torch.tensor([T_text], dtype=torch.int32).to(_DEVICE) + # pitch: frame-level (per mel frame), shape (B, T_mel) + pitch = torch.rand(B, T_mel).to(_DEVICE) + # align_prior_matrix: (B, T_mel, T_text) — attention prior (mel-frames x text-tokens) + # Normalised so each mel frame sums to 1 over text tokens. 
+ align_prior = torch.ones(B, T_mel, T_text).to(_DEVICE) + align_prior = align_prior / align_prior.sum(dim=-1, keepdim=True).clamp(min=1e-8) + + batch = { + "audio": audio, + "audio_lens": audio_lens, + "text": text, + "text_lens": text_lens, + "pitch": pitch, + "align_prior_matrix": align_prior, + } + + # Temporarily switch ds_class so training_step treats batch as a plain dict + # rather than calling process_batch (which requires _train_dl to be set up). + original_ds_class = model.ds_class + model.ds_class = "nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset" + try: + loss = model.training_step(batch, 0) + finally: + model.ds_class = original_ds_class + + assert loss.shape == torch.Size([]), f"Expected scalar loss, got shape {loss.shape}" + assert torch.isfinite(loss), f"Loss is not finite: {loss.item()}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + with torch.no_grad(): + tokens = model.parse("hello world") + if _DEVICE.type == "cuda": + tokens = tokens.to(_DEVICE) + spec = model.generate_spectrogram(tokens=tokens) + assert spec is not None, "generate_spectrogram returned None" + assert spec.ndim == 3, f"Expected 3D spectrogram (B, D, T), got shape {spec.shape}" + assert spec.shape[1] == 80, f"Expected 80 mel channels, got {spec.shape[1]}" + assert spec.shape[2] > 0, "Spectrogram time dimension is empty" diff --git a/tests/e2e_nightly/test_model_support_nvidia__tts_hifigan.py b/tests/e2e_nightly/test_model_support_nvidia__tts_hifigan.py new file mode 100644 index 000000000000..e65c169684e1 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_nvidia__tts_hifigan.py @@ -0,0 +1,104 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for nvidia/tts_hifigan.""" + +import os + +import torch + +MODEL_NAME = "nvidia/tts_hifigan" +NEMO_FILE = "nvidia__tts_hifigan.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.tts.models import HifiGanModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = HifiGanModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Exercise generator forward pass and mel loss without discriminator. + + HifiGanModel uses manual optimization (automatic_optimization=False) + and the discriminator path can produce size-mismatch errors with + synthetic audio. We bypass the full training_step and instead run the + generator directly on a mel spectrogram, then compute a simple L1 loss + in waveform domain to verify the forward+backward path. 
+ """ + model = _load_model() + model.train() + d = _DEVICE + + # Generate mel spec from audio + n_samples = 16000 + audio = torch.randn(1, n_samples, device=d) + audio_len = torch.tensor([n_samples], device=d) + spec, spec_len = model.audio_to_melspec_precessor(audio, audio_len) + + # Generator forward + audio_gen = model.generator(x=spec) + + # Trim to match + min_len = min(audio.shape[-1], audio_gen.shape[-1]) + audio_t = audio[..., :min_len] + audio_gen_t = audio_gen[..., :min_len] + + # Simple L1 loss in waveform domain + loss = torch.nn.functional.l1_loss(audio_gen_t, audio_t) + assert loss.ndim == 0 + assert torch.isfinite(loss) + loss.backward() + + +def test_model_inference(): + """Exercise convert_spectrogram_to_audio with a random mel spectrogram. + + HifiGanModel.convert_spectrogram_to_audio(spec) takes a mel + spectrogram of shape (B, n_mels, T) and returns a waveform of shape + (B, T_audio). The model uses nfilt=80 mel bins. + """ + model = _load_model() + model.eval() + d = _DEVICE + with torch.no_grad(): + # spec shape: (batch=1, n_mels=80, time_frames=100) + spec = torch.randn(1, 80, 100, device=d) + audio = model.convert_spectrogram_to_audio(spec=spec) + + assert audio is not None, "convert_spectrogram_to_audio() returned None" + assert audio.ndim == 2, f"Expected audio shape (B, T), got {audio.shape}" + assert audio.shape[0] == 1, f"Batch dimension mismatch: {audio.shape}" + assert audio.shape[1] > 0, "Output audio has zero length" + assert torch.isfinite(audio).all(), "Output audio contains non-finite values" diff --git a/tests/e2e_nightly/test_model_support_stt_en_fastconformer_hybrid_large_streaming_1040ms.py b/tests/e2e_nightly/test_model_support_stt_en_fastconformer_hybrid_large_streaming_1040ms.py new file mode 100644 index 000000000000..85d52b66a178 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_stt_en_fastconformer_hybrid_large_streaming_1040ms.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for stt_en_fastconformer_hybrid_large_streaming_1040ms.""" + +import os + +import pytest +import torch + +MODEL_NAME = "stt_en_fastconformer_hybrid_large_streaming_1040ms" +NEMO_FILE = "stt_en_fastconformer_hybrid_large_streaming_1040ms.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], 
dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.py b/tests/e2e_nightly/test_model_support_stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.py new file mode 100644 index 000000000000..f54d5d250917 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.""" + +import os + +import pytest +import torch + +MODEL_NAME = "stt_multilingual_fastconformer_hybrid_large_pc_blend_eu" +NEMO_FILE = "stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import ASRModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = ASRModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + vocab_size = model.joint.num_classes_with_blank - 1 + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, max(1, vocab_size), (2, 5), dtype=torch.long, device=d), + torch.tensor([5, 3], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test full inference pipeline via model.transcribe().""" + import numpy as np + + model = _load_model() + model.eval() + + from conftest import prepare_for_transcribe + + prepare_for_transcribe(model) + + audio = np.random.randn(16000).astype(np.float32) + + result = 
model.transcribe(audio=[audio], batch_size=1) + assert isinstance(result, list) + assert len(result) == 1 + # transcribe() may return strings or Hypothesis objects + text = result[0] if isinstance(result[0], str) else result[0].text + assert isinstance(text, str) + + hyps = model.transcribe(audio=[audio], batch_size=1, return_hypotheses=True) + assert isinstance(hyps, list) + assert len(hyps) == 1 + assert hasattr(hyps[0], 'text') diff --git a/tests/e2e_nightly/test_model_support_titanet_large.py b/tests/e2e_nightly/test_model_support_titanet_large.py new file mode 100644 index 000000000000..39ce245d46bf --- /dev/null +++ b/tests/e2e_nightly/test_model_support_titanet_large.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for titanet_large.""" + +import os + +import pytest +import torch + +MODEL_NAME = "titanet_large" +NEMO_FILE = "titanet_large.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import EncDecSpeakerLabelModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = EncDecSpeakerLabelModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + num_classes = model.decoder._num_classes + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.randint(0, num_classes, (2,), device=d), + torch.tensor([1, 1], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + with torch.no_grad(): + logits, embs = model.forward( + input_signal=torch.randn(1, 16000, device=d), + input_signal_length=torch.tensor([16000], device=d), + ) + assert logits is not None + assert embs is not None + assert embs.ndim == 2 # (B, embedding_dim) + assert embs.shape[0] == 1 + assert torch.isfinite(embs).all() diff --git 
a/tests/e2e_nightly/test_model_support_tts_en_e2e_fastspeech2hifigan.py b/tests/e2e_nightly/test_model_support_tts_en_e2e_fastspeech2hifigan.py new file mode 100644 index 000000000000..31e10dc30212 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_tts_en_e2e_fastspeech2hifigan.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for tts_en_e2e_fastspeech2hifigan (model class removed from codebase).""" + +import pytest + + +@pytest.mark.xfail(reason="model class removed from codebase", strict=True) +def test_model_init(): + pytest.fail("model class removed from codebase") + + +@pytest.mark.xfail(reason="model class removed from codebase", strict=True) +def test_model_training_step(): + pytest.fail("model class removed from codebase") + + +@pytest.mark.xfail(reason="model class removed from codebase", strict=True) +def test_model_inference(): + pytest.fail("model class removed from codebase") diff --git a/tests/e2e_nightly/test_model_support_vad_multilingual_frame_marblenet.py b/tests/e2e_nightly/test_model_support_vad_multilingual_frame_marblenet.py new file mode 100644 index 000000000000..2ee9690c4c9c --- /dev/null +++ b/tests/e2e_nightly/test_model_support_vad_multilingual_frame_marblenet.py @@ -0,0 +1,104 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional tests for vad_multilingual_frame_marblenet.""" + +import os + +import pytest +import torch + +MODEL_NAME = "vad_multilingual_frame_marblenet" +NEMO_FILE = "vad_multilingual_frame_marblenet.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models.classification_models import EncDecFrameClassificationModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = EncDecFrameClassificationModel.restore_from(filepath, map_location="cpu", strict=False).to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + d = next(model.parameters()).device + # Discover output frame count to build matching labels. 
+ model.eval() + with torch.no_grad(): + probe = model.forward( + input_signal=torch.randn(1, 16000, device=d), + input_signal_length=torch.tensor([16000], device=d), + ) + n_frames = probe.shape[1] + + prepare_for_training_step(model) + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.zeros(2, n_frames, dtype=torch.long, device=d), + torch.tensor([n_frames, n_frames], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + """Test forward pass in eval mode and assert per-frame logit output.""" + model = _load_model() + model.eval() + d = _DEVICE + batch_size = 1 + n_samples = 16000 + input_signal = torch.randn(batch_size, n_samples, device=d) + input_signal_length = torch.tensor([n_samples], device=d) + + with torch.no_grad(): + logits = model.forward( + input_signal=input_signal, + input_signal_length=input_signal_length, + ) + + # logits must be a 3-D tensor: [B, T, C] (batch, frames, classes). + assert logits is not None + assert logits.ndim == 3, f"Expected 3-D logits [B, T, C], got shape {logits.shape}" + assert logits.shape[0] == batch_size, f"Expected batch size {batch_size}, got {logits.shape[0]}" + # At least one output frame must be produced. + assert logits.shape[1] > 0, "Expected at least one output frame" + # Number of classes must be positive. 
+ assert logits.shape[2] > 0, "Expected at least one output class" + assert torch.isfinite(logits).all(), "Logits contain non-finite values" diff --git a/tests/e2e_nightly/test_model_support_vad_multilingual_marblenet.py b/tests/e2e_nightly/test_model_support_vad_multilingual_marblenet.py new file mode 100644 index 000000000000..aa396aab96b4 --- /dev/null +++ b/tests/e2e_nightly/test_model_support_vad_multilingual_marblenet.py @@ -0,0 +1,82 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functional tests for vad_multilingual_marblenet.""" + +import os + +import pytest +import torch + +MODEL_NAME = "vad_multilingual_marblenet" +NEMO_FILE = "vad_multilingual_marblenet.nemo" + +MODEL_DIR = os.environ.get( + "NEMO_MODEL_SUPPORT_DIR", + os.environ.get("NEMO_MODEL_SUPPORT_DIR_CI", "/home/TestData/nemo-speech-ci-models"), +) +_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_model = None + + +def _load_model(): + global _model + if _model is not None: + return _model + from nemo.collections.asr.models import EncDecClassificationModel + + filepath = os.path.join(MODEL_DIR, NEMO_FILE) + _model = EncDecClassificationModel.restore_from(filepath, map_location="cpu").to(_DEVICE) + return _model + + +def test_model_init(): + model = _load_model() + assert model is not None + if hasattr(model, "to_config_dict"): + cfg = model.to_config_dict() + assert cfg is not None + + +def test_model_training_step(): + """Run one training step via direct training_step() call.""" + from conftest import prepare_for_training_step + + model = _load_model() + prepare_for_training_step(model) + d = next(model.parameters()).device + batch = ( + torch.randn(2, 16000, device=d), + torch.tensor([16000, 12000], device=d), + torch.zeros(2, dtype=torch.long, device=d), + torch.tensor([1, 1], dtype=torch.long, device=d), + ) + result = model.training_step(batch, 0) + loss = result if isinstance(result, torch.Tensor) else result['loss'] + assert torch.isfinite(loss), f"Loss is not finite: {loss}" + loss.backward() + + +def test_model_inference(): + model = _load_model() + model.eval() + d = _DEVICE + with torch.no_grad(): + logits = model.forward( + input_signal=torch.randn(1, 16000, device=d), + input_signal_length=torch.tensor([16000], device=d), + ) + assert logits is not None + assert logits.ndim >= 2 + assert torch.isfinite(logits).all() diff --git a/tests/functional_tests/L0_Unit_Tests_CPU_Others.sh b/tests/functional_tests/L0_Unit_Tests_CPU_Others.sh index 
a3fa4977fa14..b063a3efe292 100644 --- a/tests/functional_tests/L0_Unit_Tests_CPU_Others.sh +++ b/tests/functional_tests/L0_Unit_Tests_CPU_Others.sh @@ -25,4 +25,5 @@ CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 coverage run -a --data-file=/work --ignore=tests/hydra \ --ignore=tests/lightning \ --ignore=tests/export \ - --ignore=tests/deploy + --ignore=tests/deploy \ + --ignore=tests/functional_tests diff --git a/tests/functional_tests/L0_Unit_Tests_GPU_Others.sh b/tests/functional_tests/L0_Unit_Tests_GPU_Others.sh index 88fcc46b743f..3b34b87b5f06 100644 --- a/tests/functional_tests/L0_Unit_Tests_GPU_Others.sh +++ b/tests/functional_tests/L0_Unit_Tests_GPU_Others.sh @@ -25,4 +25,5 @@ NEMO_NUMBA_MINVER=0.53 CUDA_VISIBLE_DEVICES=0 coverage run -a --data-file=/works --ignore=tests/hydra \ --ignore=tests/lightning \ --ignore=tests/export \ - --ignore=tests/deploy + --ignore=tests/deploy \ + --ignore=tests/functional_tests diff --git a/tests/functional_tests/conftest.py b/tests/functional_tests/conftest.py new file mode 100644 index 000000000000..29384a24bc28 --- /dev/null +++ b/tests/functional_tests/conftest.py @@ -0,0 +1,63 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Shared fixtures and utilities for per-model functional tests.""" + +import torch + + +class _MinimalTrainerStub: + """Provides just enough Trainer-like interface for training_step().""" + + global_step = 0 + log_every_n_steps = 1_000_000 # large value to skip WER computation during test + + # Lightning's log() checks these attributes: + training = True + sanity_checking = False + barebones = False + + @property + def callback_metrics(self): + return {} + + +def prepare_for_training_step(model): + """Prepare a model for a direct training_step() call without a full Trainer.""" + model.train() + + # Attach minimal trainer stub for models that access self.trainer + stub = _MinimalTrainerStub() + model._trainer = stub + + # Suppress Lightning logging (requires active Trainer control flow). + # We don't need logging in tests — we only care about loss computation. + model.log = lambda *a, **kw: None + model.log_dict = lambda *a, **kw: None + + # Many NeMo models access self._optimizer.param_groups[0]['lr'] in training_step + # to log the learning rate. Provide a minimal stand-in if no optimizer is set. + if getattr(model, '_optimizer', None) is None: + model._optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1e-4) + + +def prepare_for_transcribe(model): + """Placeholder for any pre-transcribe model preparation. + + Previously disabled CUDA graph decoding, but the root cause + (CUDA 12/13 API mismatch in cudaStreamGetCaptureInfo) is now + fixed in the NeMo source. Kept as a no-op for compatibility + with test files that call it. + """ + pass