Fix LazyNeMoIterator supervision for multi-channel cuts (#14409)

anteju · web-flow · commit fd3ee7444d99 · 2025-08-06T09:33:25.000-04:00
* Fix LazyNeMoIterator supervision for multi-channel cuts

Signed-off-by: Ante Jukić <ajukic@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: anteju <anteju@users.noreply.github.com>

---------

Signed-off-by: Ante Jukić <ajukic@nvidia.com>
Signed-off-by: anteju <anteju@users.noreply.github.com>
Co-authored-by: anteju <anteju@users.noreply.github.com>
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -127,6 +127,7 @@ def __iter__(self) -> Generator[Cut, None, None]:
                     recording_id=cut.recording_id,
                     start=0,
                     duration=cut.duration,
+                    channel=cut.channel,
                     text=data.get(self.text_field),
                     language=data.get(self.lang_field),
                 )
diff --git a/tests/collections/common/test_lhotse_nemo_adapters.py b/tests/collections/common/test_lhotse_nemo_adapters.py
@@ -14,9 +14,9 @@
 
 import numpy as np
 import pytest
-from lhotse import AudioSource, CutSet, MonoCut, Recording, SupervisionSegment
+from lhotse import AudioSource, CutSet, MonoCut, MultiCut, Recording, SupervisionSegment
 from lhotse.serialization import save_to_jsonl
-from lhotse.testing.dummies import DummyManifest
+from lhotse.testing.dummies import DummyManifest, dummy_multi_cut, dummy_supervision
 
 from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator
 
@@ -41,6 +41,29 @@ def nemo_manifest_path(tmp_path_factory):
     return p
 
 
+@pytest.fixture
+def nemo_manifest_path_multichannel(tmp_path_factory):
+    """2 utterances of length 1s with 3 channels as a NeMo manifest."""
+    tmpdir = tmp_path_factory.mktemp("nemo_data")
+    cuts = CutSet.from_cuts(
+        dummy_multi_cut(idx, supervisions=[dummy_supervision(idx)], channel=[0, 1, 2], with_data=True)
+        for idx in range(0, 2)
+    ).save_audios(tmpdir, progress_bar=False)
+    nemo = []
+    for c in cuts:
+        nemo.append(
+            {
+                "audio_filepath": c.recording.sources[0].source,
+                "text": "irrelevant",
+                "duration": c.duration,
+                "lang": "en",
+            }
+        )
+    p = tmpdir / "nemo_manifest_multichannel.json"
+    save_to_jsonl(nemo, p)
+    return p
+
+
 def test_lazy_nemo_iterator(nemo_manifest_path):
     cuts = CutSet(LazyNeMoIterator(nemo_manifest_path))
 
@@ -78,6 +101,45 @@ def test_lazy_nemo_iterator(nemo_manifest_path):
         assert s.language == "en"
 
 
+def test_lazy_nemo_iterator_multichannel(nemo_manifest_path_multichannel):
+    cuts = CutSet(LazyNeMoIterator(nemo_manifest_path_multichannel))
+
+    assert len(cuts) == 2
+
+    for c in cuts:
+        assert isinstance(c, MultiCut)
+        assert c.start == 0.0
+        assert c.duration == 1.0
+        assert c.num_channels == 3
+        assert c.channel == [0, 1, 2]  # cuts have three channels
+        assert c.sampling_rate == 16000
+        assert c.num_samples == 16000
+
+        assert c.has_recording
+        assert isinstance(c.recording, Recording)
+        assert c.recording.duration == 1.0
+        assert c.recording.num_channels == 3
+        assert c.recording.num_samples == 16000
+        assert len(c.recording.sources) == 1
+        assert isinstance(c.recording.sources[0], AudioSource)
+        assert c.recording.sources[0].type == "file"
+        assert c.recording.sources[0].channels == c.channel  # recording has same channels as the cut
+
+        audio = c.load_audio()
+        assert isinstance(audio, np.ndarray)
+        assert audio.shape == (c.num_channels, 16000)  # audio has same num_channels as the cut
+        assert audio.dtype == np.float32
+
+        assert len(c.supervisions) == 1
+        s = c.supervisions[0]
+        assert isinstance(s, SupervisionSegment)
+        assert s.start == 0
+        assert s.duration == 1
+        assert s.channel == c.channel  # supervision has same channels as the cut
+        assert s.text == "irrelevant"
+        assert s.language == "en"
+
+
 @pytest.fixture
 def nemo_offset_manifest_path(tmp_path_factory):
     """

Original file line number	Diff line number	Diff line change
`@@ -127,6 +127,7 @@ def __iter__(self) -> Generator[Cut, None, None]:`
`127`	`127`	`recording_id=cut.recording_id,`
`128`	`128`	`start=0,`
`129`	`129`	`duration=cut.duration,`
	`130`	`+ channel=cut.channel,`
`130`	`131`	`text=data.get(self.text_field),`
`131`	`132`	`language=data.get(self.lang_field),`
`132`	`133`	`)`