diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 52b7545cf3be..eeca509874f3 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -127,6 +127,7 @@ def __iter__(self) -> Generator[Cut, None, None]: recording_id=cut.recording_id, start=0, duration=cut.duration, + channel=cut.channel, text=data.get(self.text_field), language=data.get(self.lang_field), ) diff --git a/tests/collections/common/test_lhotse_nemo_adapters.py b/tests/collections/common/test_lhotse_nemo_adapters.py index b080d26d4be6..9eb5d0294946 100644 --- a/tests/collections/common/test_lhotse_nemo_adapters.py +++ b/tests/collections/common/test_lhotse_nemo_adapters.py @@ -14,9 +14,9 @@ import numpy as np import pytest -from lhotse import AudioSource, CutSet, MonoCut, Recording, SupervisionSegment +from lhotse import AudioSource, CutSet, MonoCut, MultiCut, Recording, SupervisionSegment from lhotse.serialization import save_to_jsonl -from lhotse.testing.dummies import DummyManifest +from lhotse.testing.dummies import DummyManifest, dummy_multi_cut, dummy_supervision from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator @@ -41,6 +41,29 @@ def nemo_manifest_path(tmp_path_factory): return p +@pytest.fixture +def nemo_manifest_path_multichannel(tmp_path_factory): + """2 utterances of length 1s with 3 channels as a NeMo manifest.""" + tmpdir = tmp_path_factory.mktemp("nemo_data") + cuts = CutSet.from_cuts( + dummy_multi_cut(idx, supervisions=[dummy_supervision(idx)], channel=[0, 1, 2], with_data=True) + for idx in range(0, 2) + ).save_audios(tmpdir, progress_bar=False) + nemo = [] + for c in cuts: + nemo.append( + { + "audio_filepath": c.recording.sources[0].source, + "text": "irrelevant", + "duration": c.duration, + "lang": "en", + } + ) + p = tmpdir / "nemo_manifest_multichannel.json" + save_to_jsonl(nemo, p) + return p + + def test_lazy_nemo_iterator(nemo_manifest_path): cuts = CutSet(LazyNeMoIterator(nemo_manifest_path)) @@ -78,6 +101,45 @@ def test_lazy_nemo_iterator(nemo_manifest_path): assert s.language == "en" +def test_lazy_nemo_iterator_multichannel(nemo_manifest_path_multichannel): + cuts = CutSet(LazyNeMoIterator(nemo_manifest_path_multichannel)) + + assert len(cuts) == 2 + + for c in cuts: + assert isinstance(c, MultiCut) + assert c.start == 0.0 + assert c.duration == 1.0 + assert c.num_channels == 3 + assert c.channel == [0, 1, 2] # cuts have three channels + assert c.sampling_rate == 16000 + assert c.num_samples == 16000 + + assert c.has_recording + assert isinstance(c.recording, Recording) + assert c.recording.duration == 1.0 + assert c.recording.num_channels == 3 + assert c.recording.num_samples == 16000 + assert len(c.recording.sources) == 1 + assert isinstance(c.recording.sources[0], AudioSource) + assert c.recording.sources[0].type == "file" + assert c.recording.sources[0].channels == c.channel # recording has same channels as the cut + + audio = c.load_audio() + assert isinstance(audio, np.ndarray) + assert audio.shape == (c.num_channels, 16000) # audio has same num_channels as the cut + assert audio.dtype == np.float32 + + assert len(c.supervisions) == 1 + s = c.supervisions[0] + assert isinstance(s, SupervisionSegment) + assert s.start == 0 + assert s.duration == 1 + assert s.channel == c.channel # supervision has same channels as the cut + assert s.text == "irrelevant" + assert s.language == "en" + + @pytest.fixture def nemo_offset_manifest_path(tmp_path_factory): """