Add Nemotron-H prompt format, fix cut-to-conversation custom attr propagation (NVIDIA-NeMo#13963)

pzelasko · chtruong814 · Amir Hussein · commit b0b5151bea59 · 2025-08-05T19:23:58.000Z
* Add Nemotron-H prompt format

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

* Fix propagation of custom attr in cut_to_conversation

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

* Fix CI

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

* Unit test for the conversion fix

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

---------

Signed-off-by: Piotr Żelasko <petezor@gmail.com>
Co-authored-by: Charlie Truong <chtruong@nvidia.com>
Signed-off-by: Amir Hussein <amhussein@nvidia.com>
diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
@@ -478,6 +478,8 @@ def cut_to_conversation(
     ]
     if hasattr(cut, "context"):
         turns = [TextTurn(value=cut.context, role="user")] + turns
+    if hasattr(cut, "system_prompt"):
+        turns = [TextTurn(value=cut.system_prompt, role="system")] + turns
     return NeMoMultimodalConversation(
         id=cut.id,
         turns=turns,
@@ -489,6 +491,10 @@ def cut_to_conversation(
 @data_type_parser(["lhotse_as_conversation"])
 def read_lhotse_as_conversation(config) -> tuple[CutSet, bool]:
     cuts, is_tarred = read_cutset_from_config(config)
+    # Attach extra tags to every utterance dynamically, if provided.
+    # We need to attach them before cuts are converted to conversations.
+    if (extra_tags := config.get("tags")) is not None:
+        cuts = cuts.map(partial(attach_tags, tags=extra_tags), apply_fn=None)
     cuts = cuts.map(
         partial(
             cut_to_conversation,
diff --git a/nemo/collections/common/prompts/__init__.py b/nemo/collections/common/prompts/__init__.py
@@ -18,6 +18,7 @@
 from nemo.collections.common.prompts.gemma import GemmaPromptFormatter
 from nemo.collections.common.prompts.llama import Llama2PromptFormatter, Llama3PromptFormatter
 from nemo.collections.common.prompts.mistral import MistralPromptFormatter
+from nemo.collections.common.prompts.nemotron_h import NemotronHPromptFormatter
 from nemo.collections.common.prompts.phi2 import (
     Phi2ChatPromptFormatter,
     Phi2CodePromptFormatter,
diff --git a/nemo/collections/common/prompts/nemotron_h.py b/nemo/collections/common/prompts/nemotron_h.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=missing-function-docstring,missing-class-docstring
+from lhotse.cut import Cut, MixedCut
+
+from nemo.collections.common.data.prompt_fn import registered_prompt_format_fn
+from nemo.collections.common.prompts.formatter import Modality, PromptFormatter
+
+SYSTEM_BOS = "<SPECIAL_10>"
+TURN_BOS = "<SPECIAL_11>"
+
+
+class NemotronHPromptFormatter(PromptFormatter):
+    # Prompt layout registered under the name "nemotron-h".
+    # A system turn is opened by SYSTEM_BOS; user and assistant turns are each
+    # opened by a newline followed by TURN_BOS and the capitalized role name.
+    NAME = "nemotron-h"
+    OUTPUT_ROLE = "assistant"
+    # Text that opens an assistant turn; the assistant template below starts with it.
+    # NOTE(review): presumably the base class also appends it to inference prompts - confirm.
+    INFERENCE_PREFIX = f"\n{TURN_BOS}Assistant\n"
+    # Per-role turn templates; every role exposes a single text slot, |message|.
+    TEMPLATE = {
+        "system": {
+            "template": f"{SYSTEM_BOS}System\n|message|",
+            "slots": {"message": Modality.Text},
+        },
+        "user": {
+            "template": f"\n{TURN_BOS}User\n|message|",
+            "slots": {"message": Modality.Text},
+        },
+        OUTPUT_ROLE: {
+            "template": f"{INFERENCE_PREFIX}|message|",
+            "slots": {"message": Modality.Text},
+        },
+    }
+
+
+@registered_prompt_format_fn(Cut, NemotronHPromptFormatter)
+def nemotron_h(cut: Cut, prompt: NemotronHPromptFormatter):
+    """Encode a cut as a Nemotron-H dialog: system turn, user turn, optional assistant turn.
+
+    A ``MixedCut`` is replaced by its first non-padding cut. The system and user texts come
+    from the custom ``system_prompt`` and ``context`` fields (empty string when absent), and
+    the assistant text is the first supervision's text, skipped when it is ``None``.
+    """
+    if isinstance(cut, MixedCut):
+        cut = cut.first_non_padding_cut
+
+    system = cut.system_prompt if cut.has_custom("system_prompt") else ""
+    ctx = cut.context if cut.has_custom("context") else ""
+    # Turn text goes under "slots", keyed by the "message" slot that TEMPLATE declares.
+    turns = [
+        {"role": "system", "slots": {"message": system}},
+        {"role": "user", "slots": {"message": ctx}},
+    ]
+    # A missing transcript leaves the dialog without an assistant turn.
+    if (answer := cut.supervisions[0].text) is not None:
+        turns.append({"role": "assistant", "slots": {"message": answer}})
+    return prompt.encode_dialog(turns)
diff --git a/tests/collections/common/prompt_formatters/conftest.py b/tests/collections/common/prompt_formatters/conftest.py
@@ -41,6 +41,7 @@
 Feel free to add new tokens for your own tests!?
 But know that if you do so, you may need to update the token IDs in the existing tests! 
 So, it might be a good idea to create a new tokenizer instead when adding new prompt formats.
+SYSTEM
 """
 
 
@@ -58,7 +59,7 @@ def bpe_tokenizer(tmp_path_factory):
         remove_extra_whitespaces=True,
         bos=True,
         eos=True,
-        user_defined_symbols=['\n', '<|im_start|>', '<|im_end|>'],
+        user_defined_symbols=['\n', '<|im_start|>', '<|im_end|>', '<SPECIAL_10>', '<SPECIAL_11>'],
     )
     return SentencePieceTokenizer(str(tmpdir / "tokenizer.model"))
 
diff --git a/tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_canary_prompt_formatter.py
@@ -37,7 +37,7 @@ def test_canary_prompt_formatter_training(canary_tokenizer):
     assert canary_tokenizer.ids_to_text(ans["input_ids"].tolist()) == '<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|> TEST<|endoftext|>'
     assert canary_tokenizer.ids_to_text(ans["context_ids"].tolist()) == '<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|>'
     assert canary_tokenizer.ids_to_text(ans["answer_ids"].tolist()) == ' TEST<|endoftext|>'
-    assert ans["mask"].tolist() == [False] * 5 + [True] * 5
+    assert ans["mask"].shape[0] == ans["input_ids"].shape[0]
     # fmt: on
 
 
diff --git a/tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_gemma_prompt_formatter.py
@@ -28,7 +28,7 @@ def test_gemma_prompt_formatter_training(bpe_tokenizer):
     assert bpe_tokenizer.ids_to_text(ans["input_ids"].tolist()) == '<start_of_turn>user\nTEST<end_of_turn>\n<start_of_turn>model\n TEST<end_of_turn>\n'
     assert bpe_tokenizer.ids_to_text(ans["context_ids"].tolist()) == '<start_of_turn>user\nTEST<end_of_turn>\n<start_of_turn>model\n'
     assert bpe_tokenizer.ids_to_text(ans["answer_ids"].tolist()) == 'TEST<end_of_turn>\n'
-    assert ans["mask"].tolist() == [False] * 39 + [True] * 15
+    assert ans["mask"].shape[0] == ans["input_ids"].shape[0]
     # fmt: on
 
 
diff --git a/tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_llama2_prompt_formatter.py
@@ -28,7 +28,7 @@ def test_llama2_prompt_formatter_training(bpe_tokenizer):
     assert bpe_tokenizer.ids_to_text(ans["input_ids"].tolist()[1:-1]) == '[INST] TEST [/INST] TEST'
     assert bpe_tokenizer.ids_to_text(ans["context_ids"].tolist()[1:]) == '[INST] TEST [/INST]'
     assert bpe_tokenizer.ids_to_text(ans["answer_ids"].tolist()[:-1]) == 'TEST'
-    assert ans["mask"].tolist() == [False] * 16 + [True] * 5
+    assert ans["mask"].shape[0] == ans["input_ids"].shape[0]
     # fmt: on
 
 
@@ -59,7 +59,7 @@ def test_llama2_prompt_formatter_training_with_system(bpe_tokenizer):
     assert bpe_tokenizer.ids_to_text(ans["input_ids"].tolist()[1:-1]) == '[INST] <<SYS>>\nTEST\n<</SYS>>\n\nTEST [/INST] TEST'
     assert bpe_tokenizer.ids_to_text(ans["context_ids"].tolist()[1:]) == '[INST] <<SYS>>\nTEST\n<</SYS>>\n\nTEST [/INST]'
     assert bpe_tokenizer.ids_to_text(ans["answer_ids"].tolist()[:-1]) == 'TEST'
-    assert ans["mask"].tolist() == [False] * 36 + [True] * 5
+    assert ans["mask"].shape[0] == ans["input_ids"].shape[0]
     # fmt: on
 
 
diff --git a/tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_mistral_prompt_formatter.py
@@ -28,7 +28,7 @@ def test_mistral_prompt_formatter_training(bpe_tokenizer):
     assert bpe_tokenizer.ids_to_text(ans["input_ids"].tolist()) == '<s> [INST] TEST [/INST] TEST</s>'
     assert bpe_tokenizer.ids_to_text(ans["context_ids"].tolist()) == '<s> [INST] TEST [/INST]'
     assert bpe_tokenizer.ids_to_text(ans["answer_ids"].tolist()) == 'TEST</s>'
-    assert ans["mask"].tolist() == [False] * 18 + [True] * 8
+    assert ans["mask"].shape[0] == ans["input_ids"].shape[0]
     # fmt: on
 
 
diff --git a/tests/collections/common/prompt_formatters/test_nemotronh_prompt_formatter.py b/tests/collections/common/prompt_formatters/test_nemotronh_prompt_formatter.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.collections.common.prompts.nemotron_h import NemotronHPromptFormatter
+
+
+def test_nemotronh_prompt_formatter_training(bpe_tokenizer):
+    """A dialog ending with an assistant turn (empty system message) yields all four outputs."""
+    dialog = [
+        {"role": "system", "slots": {"message": ""}},
+        {"role": "user", "slots": {"message": "TEST"}},
+        {"role": "assistant", "slots": {"message": "TEST"}},
+    ]
+    encoded = NemotronHPromptFormatter(bpe_tokenizer).encode_dialog(dialog)
+    decode = bpe_tokenizer.ids_to_text
+    assert set(encoded) == {"input_ids", "context_ids", "answer_ids", "mask"}
+    # fmt: off
+    assert decode(encoded["input_ids"].tolist()) == '<SPECIAL_10>System\n \n<SPECIAL_11>User\nTEST \n<SPECIAL_11>Assistant\nTEST'
+    assert decode(encoded["context_ids"].tolist()) == '<SPECIAL_10>System\n \n<SPECIAL_11>User\nTEST'
+    assert decode(encoded["answer_ids"].tolist()) == '\n<SPECIAL_11>Assistant\nTEST'
+    assert encoded["mask"].shape[0] == encoded["input_ids"].shape[0]
+    # fmt: on
+
+
+def test_nemotronh_prompt_formatter_inference(bpe_tokenizer):
+    """A dialog without an assistant turn (empty system message) yields only the context."""
+    dialog = [
+        {"role": "system", "slots": {"message": ""}},
+        {"role": "user", "slots": {"message": "TEST"}},
+    ]
+    encoded = NemotronHPromptFormatter(bpe_tokenizer).encode_dialog(dialog)
+    assert set(encoded) == {"input_ids", "context_ids"}
+    input_ids = encoded["input_ids"].tolist()
+    # fmt: off
+    assert input_ids == encoded["context_ids"].tolist()
+    assert bpe_tokenizer.ids_to_text(input_ids[1:]) == '<SPECIAL_10>System\n \n<SPECIAL_11>User\nTEST \n<SPECIAL_11>Assistant\n'
+    # fmt: on
+
+
+def test_nemotronh_prompt_formatter_training_with_system(bpe_tokenizer):
+    """A dialog ending with an assistant turn and a non-empty system message yields all four outputs."""
+    dialog = [
+        {"role": "system", "slots": {"message": "SYSTEM"}},
+        {"role": "user", "slots": {"message": "TEST"}},
+        {"role": "assistant", "slots": {"message": "TEST"}},
+    ]
+    encoded = NemotronHPromptFormatter(bpe_tokenizer).encode_dialog(dialog)
+    decode = bpe_tokenizer.ids_to_text
+    assert set(encoded) == {"input_ids", "context_ids", "answer_ids", "mask"}
+    # fmt: off
+    assert decode(encoded["input_ids"].tolist()) == '<SPECIAL_10>System\nSYSTEM \n<SPECIAL_11>User\nTEST \n<SPECIAL_11>Assistant\nTEST'
+    assert decode(encoded["context_ids"].tolist()) == '<SPECIAL_10>System\nSYSTEM \n<SPECIAL_11>User\nTEST'
+    assert decode(encoded["answer_ids"].tolist()) == '\n<SPECIAL_11>Assistant\nTEST'
+    assert encoded["mask"].shape[0] == encoded["input_ids"].shape[0]
+    # fmt: on
+
+
+def test_nemotronh_prompt_formatter_inference_with_system(bpe_tokenizer):
+    """A dialog without an assistant turn and a non-empty system message yields only the context."""
+    dialog = [
+        {"role": "system", "slots": {"message": "SYSTEM"}},
+        {"role": "user", "slots": {"message": "TEST"}},
+    ]
+    encoded = NemotronHPromptFormatter(bpe_tokenizer).encode_dialog(dialog)
+    assert set(encoded) == {"input_ids", "context_ids"}
+    input_ids = encoded["input_ids"].tolist()
+    # fmt: off
+    assert input_ids == encoded["context_ids"].tolist()
+    assert bpe_tokenizer.ids_to_text(input_ids[1:]) == '<SPECIAL_10>System\nSYSTEM \n<SPECIAL_11>User\nTEST \n<SPECIAL_11>Assistant\n'
+    # fmt: on
diff --git a/tests/collections/common/test_lhotse_multimodal_dataloading.py b/tests/collections/common/test_lhotse_multimodal_dataloading.py
@@ -11,10 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from pathlib import Path
+
 import lhotse
 import numpy as np
 import pytest
 import torch
+from lhotse import CutSet, SupervisionSegment
 from lhotse.testing.dummies import dummy_cut, dummy_recording
 from omegaconf import OmegaConf
 
@@ -485,3 +488,117 @@ def test_multimodal_conversation_duration_filter():
         ],
     )
     assert fltr(conv_s2s_7s) is False
+
+
+@pytest.fixture(scope="session")
+def cutset_path(tmp_path_factory) -> Path:
+    """Session-scoped Lhotse manifest with three dummy cuts of 1s, 2s and 3s.
+
+    Cut 0 has no custom prompt fields, cut 1 has ``context`` only, and cut 2 has both
+    ``context`` and ``system_prompt``. Returns the path of the saved ``cuts.jsonl.gz``.
+    """
+    transcript_only = SupervisionSegment("e1", "e1", 0.0, 1.0, text="transcript")
+    with_context = SupervisionSegment("e2", "e2", 0.0, 2.0, text="context and transcript")
+    with_system = SupervisionSegment("e3", "e3", 0.0, 2.0, text="system context and transcript")
+    cuts = CutSet(
+        [
+            dummy_cut(0, duration=1.0, supervisions=[transcript_only], with_data=True),
+            dummy_cut(1, duration=2.0, recording_duration=2.0, supervisions=[with_context], with_data=True),
+            dummy_cut(2, duration=3.0, recording_duration=3.0, supervisions=[with_system], with_data=True),
+        ]
+    )
+    # Prompts are attached as extra attributes on the individual cuts.
+    cuts[1].context = "some prompt"
+    cuts[2].context = "other prompt"
+    cuts[2].system_prompt = "system prompt"
+
+    data_dir = tmp_path_factory.mktemp("data")
+    manifest_path = data_dir / "cuts.jsonl.gz"
+    audio_dir = data_dir / "audio"
+    cuts.save_audios(audio_dir).drop_in_memory_data().to_file(manifest_path)
+    return manifest_path
+
+
+def test_cut_to_conversation_conversion(cutset_path, tokenizer):
+    """Cuts read as ``lhotse_as_conversation`` become conversations with the expected turns and tags.
+
+    Covers three cuts: one without custom prompts, one with ``context`` only, and one with both
+    ``context`` and ``system_prompt``. Also checks that the ``tags`` from the input config are
+    present on each conversation and on the cut inside its audio turn.
+    """
+    cuts = CutSet.from_file(cutset_path)
+    source_cfg = {
+        "type": "lhotse_as_conversation",
+        "cuts_path": cutset_path,
+        "audio_locator_tag": "[audio]",
+        "tags": {"test_key": "test_value"},
+    }
+    config = OmegaConf.create(
+        {
+            "input_cfg": [source_cfg],
+            "token_equivalent_duration": 0.08,
+            "prompt_format": "llama3",
+            "force_finite": True,
+            "num_workers": 0,
+            "batch_size": 4,
+            "seed": 0,
+            "shard_seed": 0,
+        }
+    )
+    dl = get_lhotse_dataloader_from_config(
+        config=config, global_rank=0, world_size=1, dataset=Identity(), tokenizer=tokenizer
+    )
+    batches = [batch for batch in dl]
+    assert len(batches) == 1
+
+    # One entry per cut, in manifest order: (turn type, role, text) for every expected turn.
+    # Audio turns have no expected text (None); the cut they hold is checked for the tag instead.
+    expected = [
+        [
+            (AudioTurn, "user", None),
+            (TextTurn, "assistant", "transcript"),
+        ],
+        [
+            (TextTurn, "user", "some prompt"),
+            (AudioTurn, "user", None),
+            (TextTurn, "assistant", "context and transcript"),
+        ],
+        [
+            (TextTurn, "system", "system prompt"),
+            (TextTurn, "user", "other prompt"),
+            (AudioTurn, "user", None),
+            (TextTurn, "assistant", "system context and transcript"),
+        ],
+    ]
+    for idx, expected_turns in enumerate(expected):
+        conv = batches[0][idx]
+        assert isinstance(conv, NeMoMultimodalConversation)
+        assert conv.id == cuts[idx].id
+        assert len(conv.turns) == len(expected_turns)
+        for turn, (turn_type, role, text) in zip(conv.turns, expected_turns):
+            assert isinstance(turn, turn_type)
+            assert turn.role == role
+            if text is None:
+                assert turn.cut.custom["test_key"] == "test_value"
+            else:
+                assert turn.value == text
+        assert conv.custom["test_key"] == "test_value"