Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
94 commits
Select commit Hold shift + click to select a range
33917f5
MagpieTTS decoder model working on top of NeMo main branch
paarthneekhara Jan 8, 2026
156f16f
merge with main again
paarthneekhara Jan 8, 2026
6ba3699
Apply isort and black reformatting
paarthneekhara Jan 8, 2026
94fcf03
Apply isort and black reformatting
paarthneekhara Jan 8, 2026
ae8f800
handling changes in dataloader
paarthneekhara Jan 9, 2026
c2ee249
hack to avoid HF error
paarthneekhara Jan 10, 2026
88a7576
Apply isort and black reformatting
paarthneekhara Jan 9, 2026
76ce3d1
remove discriminator temporarily
paarthneekhara Jan 11, 2026
6f3987c
Apply isort and black reformatting
paarthneekhara Jan 10, 2026
aefe97f
fix errors
paarthneekhara Jan 11, 2026
9d52822
bug fix
paarthneekhara Jan 11, 2026
90a6c54
add moe
paarthneekhara Jan 11, 2026
324b803
Apply isort and black reformatting
paarthneekhara Jan 11, 2026
1c4a568
20 layer moe
paarthneekhara Jan 11, 2026
a19012a
some refactoring and clean up
paarthneekhara Jan 22, 2026
d88eda2
Apply isort and black reformatting
paarthneekhara Jan 22, 2026
122af0a
bug fix related to spectral codec
paarthneekhara Jan 28, 2026
59208f1
Apply isort and black reformatting
paarthneekhara Jan 28, 2026
3c8bb40
some clean up
paarthneekhara Jan 28, 2026
2067ae9
add docstrings and data classes
paarthneekhara Jan 28, 2026
ef6a0e0
more doc strings
paarthneekhara Jan 29, 2026
0101a1a
Apply isort and black reformatting
paarthneekhara Jan 29, 2026
ce19ed6
support multiple training modes
paarthneekhara Jan 29, 2026
704a5c8
Apply isort and black reformatting
paarthneekhara Jan 29, 2026
038d224
default mode for backward compatibility
paarthneekhara Jan 29, 2026
3f58202
Apply isort and black reformatting
paarthneekhara Jan 29, 2026
d58a560
default config changes
shehzeen Jan 30, 2026
a7fa478
Magpietts decoderonly 2601 bpe ipa tokenizer (#57)
shehzeen Feb 1, 2026
5e78e46
Apply isort and black reformatting
paarthneekhara Feb 1, 2026
91f71c8
nemotron mamba model (#58)
paarthneekhara Feb 2, 2026
3c05549
Apply isort and black reformatting
paarthneekhara Feb 2, 2026
067a6e8
inference function refactoring
paarthneekhara Feb 2, 2026
79457c6
revert some changes and remove scripts
paarthneekhara Feb 4, 2026
7c4a9d6
Apply isort and black reformatting
paarthneekhara Feb 4, 2026
61b8afd
Magpietts decoderonly 2601 simplify code (#60)
paarthneekhara Feb 5, 2026
cd59639
Apply isort and black reformatting
paarthneekhara Feb 5, 2026
e96f344
include vocab file
paarthneekhara Feb 5, 2026
4e09d1c
Magpietts decoderonly 2601 valinfer (#61)
paarthneekhara Feb 7, 2026
97d98da
Apply isort and black reformatting
paarthneekhara Feb 7, 2026
003c439
bug fixes in inference and logging
shehzeen Feb 7, 2026
af7e76b
more tests
shehzeen Feb 7, 2026
4a0e36b
tested and verified that infer batch works correctly with teacher for…
shehzeen Feb 7, 2026
0518c99
added legacy option to still work with 21fps F2F model
shehzeen Feb 7, 2026
fa0fafb
remove streaming decode because it is not being used
shehzeen Feb 7, 2026
432605b
pass phoneme EOS to next step
shehzeen Feb 8, 2026
b239c2f
exclude codec model from optimizer params
shehzeen Feb 9, 2026
ff68871
reduce dropout prob, change default delays to 0,1
shehzeen Feb 9, 2026
021bd9e
bug fix
shehzeen Feb 9, 2026
cb2cff1
phoneme EOS handling bug fix
shehzeen Feb 9, 2026
386814d
phoneme corruption methodology implemented
shehzeen Feb 10, 2026
2bd08ed
revisit defaults and update
shehzeen Feb 10, 2026
18e39b0
bug fix phoneme loss
shehzeen Feb 10, 2026
e5d141b
another inference bug fix
shehzeen Feb 10, 2026
b1b86f0
phoneme vocab size fix
shehzeen Feb 10, 2026
4a872aa
bug fix
shehzeen Feb 10, 2026
beaee7b
handle legacy model phoneme vocab size
shehzeen Feb 10, 2026
0879a12
context duration handling - stop repeating excessively
shehzeen Feb 11, 2026
ae557ac
clamp cer and wer to 1
shehzeen Feb 11, 2026
3d69a12
Preference Optimization for EasyMagpieTTS (#64)
shehzeen Feb 17, 2026
2ca7181
po stabilize
shehzeen Feb 18, 2026
1af65a9
mamba config update
paarthneekhara Feb 19, 2026
89cee8f
fix weight initialization bugs in mamba
paarthneekhara Feb 19, 2026
fb3343f
Magpietts decoderonly 2601 flash (#65)
paarthneekhara Feb 20, 2026
dab6437
add do tts method
shehzeen Feb 21, 2026
d58581b
bug fix
paarthneekhara Feb 23, 2026
9ec6767
Magpietts decoderonly 2601 utmos po (#67)
shehzeen Feb 24, 2026
acc05a1
full phoneme channel dropout option
paarthneekhara Feb 24, 2026
2c4520b
gt phoneme option in do_tts
shehzeen Feb 27, 2026
6501ada
bug fix
shehzeen Feb 27, 2026
6d635aa
ignore phoneme channel for some languages
paarthneekhara Mar 2, 2026
df40277
PO updates, cross lingual dataset creation
shehzeen Mar 3, 2026
c14083a
add language to text contexts.
shehzeen Mar 5, 2026
19ff0ea
Apply isort and black reformatting
paarthneekhara Mar 9, 2026
1f0f83f
tokenizer import change
paarthneekhara Mar 9, 2026
a61b60a
remove unnecessary imports
paarthneekhara Mar 9, 2026
f090106
cleanup
paarthneekhara Mar 9, 2026
b70c495
remove name from training modes
paarthneekhara Mar 9, 2026
00acdb4
Apply isort and black reformatting
paarthneekhara Mar 9, 2026
49bd6ff
removing some debugging statements
shehzeen Mar 9, 2026
fdbf72d
new base class (#68)
paarthneekhara Mar 10, 2026
f627523
Apply isort and black reformatting
paarthneekhara Mar 10, 2026
c8437ac
cleanup
paarthneekhara Mar 10, 2026
dc52f0a
sanitize logits only for easy magpie to preserve magpietts functionality
paarthneekhara Mar 10, 2026
f680b8f
remove custom phoneme tokenizer instantiation and handle it in the to…
paarthneekhara Mar 10, 2026
40fb7eb
Apply isort and black reformatting
paarthneekhara Mar 10, 2026
c8ad57a
remove streaming inference script
shehzeen Mar 10, 2026
cfa582f
Magpietts decoderonly 2601 inference refactoring (#69)
shehzeen Mar 11, 2026
5eaf1a4
Apply isort and black reformatting
shehzeen Mar 11, 2026
6d96df4
Paarthneekhara/magpietts decoderonly 2601 (#70)
shehzeen Mar 12, 2026
cb69267
Apply isort and black reformatting
shehzeen Mar 12, 2026
db9763d
refactoring to remove magpie base class
paarthneekhara Mar 12, 2026
7883ed9
Apply isort and black reformatting
paarthneekhara Mar 12, 2026
9d95ed4
bug fixed
paarthneekhara Mar 12, 2026
d9500e2
Merge branch 'magpietts_decoderonly_2601' of github.com:paarthneekhar…
paarthneekhara Mar 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 187 additions & 0 deletions examples/tts/conf/magpietts/easy_magpietts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
name: Magpie-TTS-DecoderOnly-EN

max_epochs: ???
# Adjust batch size based on GPU memory
batch_size: 2
# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch.
# If null, then weighted sampling is disabled.
weighted_sampling_steps_per_epoch: null

# Dataset metadata for each manifest
# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41
train_ds_meta: ???
val_ds_meta: ???

model:
  # Decoder backend selection
  # Options: "huggingface" (default), "nemotron_h"
  decoder_type: "huggingface"

  # HuggingFace backend config (used when decoder_type: "huggingface")
  transformer_hf_backend: "Qwen/Qwen2.5-1.5B"

  # NemotronH config (used when decoder_type: "nemotron_h")
  # Hybrid Mamba2/MoE/Attention model (~3B total, ~600-800M active). Layer types via hybrid_override_pattern:
  # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer
  nemotron_h_config:
    hidden_size: 1536  # Should match embedding_dim
    num_hidden_layers: 48
    vocab_size: 131072
    # Attention config
    num_attention_heads: 12
    num_key_value_heads: 4
    attention_dropout: 0.0
    attention_bias: false
    max_position_embeddings: 8192
    # Mamba config
    mamba_num_heads: 64
    mamba_head_dim: 24
    ssm_state_size: 128
    conv_kernel: 4
    n_groups: 8
    chunk_size: 256
    mamba_hidden_act: "silu"
    use_conv_bias: true
    use_bias: false
    # MLP config
    intermediate_size: 4096
    mlp_hidden_act: "silu"
    mlp_bias: false
    # MoE config (scaled from Nemotron-3-Nano-30B-A3B)
    n_routed_experts: 48
    num_experts_per_tok: 6
    moe_intermediate_size: 1024
    moe_shared_expert_intermediate_size: 2048
    n_group: 1
    topk_group: 1
    routed_scaling_factor: 2.5
    norm_topk_prob: true
    # Layer pattern: (M E M E M *) x 8 => 16 Mamba, 16 MoE, 8 Attention
    hybrid_override_pattern: "MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*"
    # Normalization
    # NOTE: written as 1.0e-5 (not 1e-5) so strict YAML 1.1 loaders parse a float, not a string.
    layer_norm_epsilon: 1.0e-5
    residual_in_fp32: true

  use_text_conditioning_encoder: true  # If true, distilbert will be used to encode context_text if provided.
  context_duration_min: 5.0
  context_duration_max: 5.0
  load_cached_codes_if_available: true

  embedding_dim: 1536
  hidden_dim: 1536
  audio_embedding_dim: 1536  # Can set a smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection.
  codecmodel_path: ???
  max_epochs: ${max_epochs}
  steps_per_epoch: ${weighted_sampling_steps_per_epoch}

  # Local transformer parameters for autoregressive codebook prediction within a frame
  local_transformer_type: "autoregressive"  # "none", "autoregressive"
  # Below args are only relevant if local_transformer_type is "autoregressive"
  local_transformer_loss_scale: 1.0
  phoneme_loss_weight: 1.0
  local_transformer_n_layers: 3
  local_transformer_n_heads: 12
  local_transformer_hidden_dim: 1536

  cfg_unconditional_prob: 0.05
  # To get special_tokens of the tokenizer, you can do:
  # model.tokenizer.first_tokenizer.additional_special_tokens

  # Multi-mode training configuration
  # The model will randomly select one of the modes for each batch during training.
  # Each mode has its own task embedding that is prepended to the context.
  # During inference, you can specify which mode to use via the derived
  # 'inference_mode' string: "{text_input_mode}_{streaming_phonemes_delay}_{streaming_speech_delay}".
  training_modes:
    - text_input_mode: "streaming"  # Options: "full", "streaming"
      streaming_phonemes_delay: 0
      streaming_speech_delay: 1

  frame_stacking_factor: 2
  phoneme_stacking_factor: 1
  phoneme_confidence_unk_threshold: 0.0  # If max phoneme probability is below this threshold at inference-time, replace the predicted timestep with UNK to reduce error propagation.
  dropout_text_input_prob: 0.1
  phoneme_corruption_batch_prob: 0.1
  phoneme_corruption_timestep_ratio: 0.15
  phoneme_corruption_unk_mode_prob: 0.5
  phoneme_corruption_type: "repeat_skip_unk"  # "repeat_skip_unk" or "complete_channel"

  phoneme_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer
    tokenizer_path: "scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json"

  text_tokenizers:
    nemotron_nano_30b:
      _target_: AutoTokenizer
      pretrained_model: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"

  train_ds:
    dataset:
      _target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
      dataset_meta: ${train_ds_meta}
      weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch}
      min_duration: 0.2
      max_duration: 20.0

    dataloader_params:
      batch_size: ${batch_size}
      num_workers: 4
      drop_last: true
      pin_memory: true

  validation_ds:
    dataset:
      _target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
      dataset_meta: ${val_ds_meta}
      min_duration: 0.2
      max_duration: 20.0

    dataloader_params:
      batch_size: ${batch_size}
      num_workers: 4
      pin_memory: true

  optim:
    _target_: torch.optim.AdamW
    # NOTE: written as 1.0e-4 (not 1e-4) so strict YAML 1.1 loaders parse a float, not a string.
    lr: 1.0e-4

    sched:
      name: ExponentialLR
      gamma: 0.998

trainer:
  num_nodes: 1
  devices: -1
  accelerator: gpu
  strategy: ddp_find_unused_parameters_true
  precision: bf16-mixed
  max_epochs: ${max_epochs}
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 100
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  benchmark: false
  gradient_clip_val: 2.5

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    entity: null
    name: ${name}
    project: null
    group: null
    resume: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    mode: min
    save_top_k: 5
    save_best_model: true
    always_save_nemo: true
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
Loading
Loading