From 33917f5cefc841248703f1cad00ecdb316bb67e3 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 7 Jan 2026 21:31:21 -0500 Subject: [PATCH 01/94] MagpieTTS decoder model working on top of NeMo main branch Signed-off-by: Paarth Neekhara --- .../magpietts/magpietts_decoder_only.yaml | 144 ++ .../magpietts_decoder_only_lhotse.yaml | 169 ++ examples/tts/evalset_config.json | 5 + examples/tts/magpietts_decoder_only.py | 57 + examples/tts/magpietts_inference.py | 10 +- .../text_to_speech/tts_tokenizers.py | 1 + .../tts/data/text_to_speech_dataset.py | 20 + .../tts/data/text_to_speech_dataset_lhotse.py | 29 + nemo/collections/tts/models/__init__.py | 2 + .../tts/models/magpietts_decoder_only.py | 1729 +++++++++++++++++ .../modules/magpietts_inference/inference.py | 148 +- .../tts/modules/magpietts_inference/utils.py | 18 +- 12 files changed, 2301 insertions(+), 31 deletions(-) create mode 100644 examples/tts/conf/magpietts/magpietts_decoder_only.yaml create mode 100644 examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml create mode 100644 examples/tts/magpietts_decoder_only.py create mode 100644 nemo/collections/tts/models/magpietts_decoder_only.py diff --git a/examples/tts/conf/magpietts/magpietts_decoder_only.yaml b/examples/tts/conf/magpietts/magpietts_decoder_only.yaml new file mode 100644 index 000000000000..8518fa79060b --- /dev/null +++ b/examples/tts/conf/magpietts/magpietts_decoder_only.yaml @@ -0,0 +1,144 @@ +name: Magpie-TTS-DecoderOnly-EN + +max_epochs: ??? +# Adjust batch size based on GPU memory +batch_size: 2 +# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch. +# If null, then weighted sampling is disabled. +weighted_sampling_steps_per_epoch: null + +# Dataset metadata for each manifest +# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41 +train_ds_meta: ??? +val_ds_meta: ??? 
+ +model: + transformer_hf_backend: "Qwen/Qwen2.5-1.5B" + use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided. + context_duration_min: 5.0 + context_duration_max: 5.0 + load_cached_codes_if_available: true + + embedding_dim: 1536 + hidden_dim: 1536 + codecmodel_path: ??? + max_epochs: ${max_epochs} + steps_per_epoch: ${weighted_sampling_steps_per_epoch} + + # Local transformer parameters for autoregressive codebook prediction within a frame + local_transformer_type: "none" # "none", "autoregressive", "maskgit" + # Below args are only relevant if use_local_transformer is autoregressive, maskgit + local_transformer_loss_scale: 1.0 + local_transformer_n_layers: 3 + local_transformer_n_heads: 1 + local_transformer_hidden_dim: 256 + + cfg_unconditional_prob: 0.1 + # To get special_tokens of the tokenzer, you can do: + # model.tokenizer.first_tokenizer.additional_special_tokens + text_input_mode: "streaming" + frame_stacking_factor: 1 + phoneme_stacking_factor: 2 + streaming_phonemes_delay: 4 + streaming_speech_delay: 8 + dropout_text_input_prob: 0.3 + + phoneme_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: false + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 1.0 + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + text_tokenizers: # Add more languages for multi-lingual TTS + english_phoneme: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: false + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: 
"scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + train_ds: + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset + dataset_meta: ${train_ds_meta} + weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} + min_duration: 0.2 + max_duration: 20.0 + + dataloader_params: + batch_size: ${batch_size} + num_workers: 4 + drop_last: true + pin_memory: true + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset + dataset_meta: ${val_ds_meta} + min_duration: 0.2 + max_duration: 20.0 + + dataloader_params: + batch_size: ${batch_size} + num_workers: 4 + pin_memory: true + + optim: + _target_: torch.optim.AdamW + lr: 1e-4 + + sched: + name: ExponentialLR + gamma: 0.998 + +trainer: + num_nodes: 1 + devices: -1 + accelerator: gpu + strategy: ddp_find_unused_parameters_true + precision: bf16-mixed + max_epochs: ${max_epochs} + accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 1 + num_sanity_val_steps: 0 + benchmark: false + gradient_clip_val: 2.5 + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_wandb_logger: false + wandb_logger_kwargs: + entity: null + name: ${name} + project: null + group: null + resume: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + mode: min + save_top_k: 5 + save_best_model: true + always_save_nemo: true + resume_if_exists: true + resume_ignore_no_checkpoint: true \ No newline at end of file diff --git a/examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml b/examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml new file mode 100644 index 000000000000..6ed9b529eac6 --- /dev/null +++ 
b/examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml @@ -0,0 +1,169 @@ +name: Magpie-TTS-DecoderOnly-EN + +quadratic_duration: 20 + +# Adjust batch size based on GPU memory +# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch. +# If null, then weighted sampling is disabled. + +# Dataset metadata for each manifest +# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41 + +model: + use_lhotse: true + transformer_hf_backend: "Qwen/Qwen2.5-1.5B" + use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided. + context_duration_min: 5.0 + context_duration_max: 5.0 + load_cached_codes_if_available: true + + embedding_dim: 1536 + hidden_dim: 1536 + codecmodel_path: ??? + + # Local transformer parameters for autoregressive codebook prediction within a frame + local_transformer_type: "none" # "none", "autoregressive", "maskgit" + # Below args are only relevant if use_local_transformer is autoregressive, maskgit + local_transformer_loss_scale: 1.0 + local_transformer_n_layers: 3 + local_transformer_n_heads: 1 + local_transformer_hidden_dim: 256 + + cfg_unconditional_prob: 0.1 + + text_input_mode: "streaming" + frame_stacking_factor: 1 + phoneme_stacking_factor: 2 + streaming_phonemes_delay: 4 + streaming_speech_delay: 8 + dropout_text_input_prob: 0.3 + + phoneme_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: false + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 1.0 + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + text_tokenizers: # Add more languages for multi-lingual TTS + english_phoneme: + _target_: 
nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: false + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" + heteronyms: "scripts/tts_dataset_files/heteronyms-052722" + phoneme_probability: 0.8 + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + train_ds: + use_lhotse: ${model.use_lhotse} + volume_norm: true + + dataset: + min_duration: 0.2 + min_context_speaker_similarity: 0.6 + max_cer: 0.03 + batch_duration : ??? # in seconds. Adjust based on your GPU memory. + quadratic_duration: ${quadratic_duration} + use_bucketing: true + num_buckets: 20 + bucket_buffer_size: 20_000 + shuffle_buffer_size: 20_000 + num_cuts_for_bins_estimate: 20_000 + shard_seed: "trng" + drop_last: true + shuffle: true + num_workers: 6 + pin_memory: true + + input_cfg: + - type: lhotse_shar + shar_path: ??? + weight: 1.0 + tags: + tokenizer_names: ["english_phoneme"] + + + validation_ds: + use_lhotse: ${model.use_lhotse} + volume_norm: true + + dataset: + min_duration: 0.2 + min_context_speaker_similarity: 0.6 + max_cer: 0.03 + batch_duration: ??? # recommend to use smaller batch_duration for validation dataset than training dataset. + quadratic_duration: ${quadratic_duration} + use_bucketing: false + force_finite: true + drop_last: false + shuffle: false + num_workers: 2 + pin_memory: true + + input_cfg: + - type: lhotse_shar + shar_path: ??? + weight: 1.0 + tags: + tokenizer_names: ["english_phoneme"] + + optim: + _target_: torch.optim.AdamW + lr: 1e-4 + + sched: + name: ExponentialLR + gamma: 0.998 + +trainer: + num_nodes: 1 + devices: -1 + accelerator: gpu + strategy: ddp_find_unused_parameters_true + precision: bf16-mixed + max_steps: ??? 
+ accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + limit_train_batches: 1_000 + val_check_interval: 1_000 + num_sanity_val_steps: 0 + benchmark: false + use_distributed_sampler: false # required because Lhotse has its own handling + gradient_clip_val: 2.5 + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_wandb_logger: false + wandb_logger_kwargs: + entity: null + name: ${name} + project: null + group: null + resume: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + mode: min + save_top_k: 5 + save_best_model: true + always_save_nemo: true + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.4f}-{step}-{epoch}' + resume_if_exists: true + resume_ignore_no_checkpoint: true \ No newline at end of file diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 2d61a601f880..4ff4d12ad9eb 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -13,6 +13,11 @@ "manifest_path": "/home/TestData/an4_dataset/an4_val_context_v1_longform_tiny.json", "audio_dir": "/", "feature_dir": null + }, + "riva_hard_digits": { + "manifest_path": "/Data/evaluation_manifests/hard-digits-path-corrected.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": "/Data/RIVA-TTS" } } diff --git a/examples/tts/magpietts_decoder_only.py b/examples/tts/magpietts_decoder_only.py new file mode 100644 index 000000000000..44859fee8d64 --- /dev/null +++ b/examples/tts/magpietts_decoder_only.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import lightning.pytorch as pl
+import torch.multiprocessing as mp
+from omegaconf import OmegaConf
+
+from nemo.collections.tts.models import MagpieTTSDecoderModel
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.exp_manager import exp_manager
+
+
+@hydra_runner(config_path="conf/magpietts", config_name="magpietts_decoder_only")
+def main(cfg):
+    logging.info('\nConfig Params:\n%s', OmegaConf.to_yaml(cfg, resolve=True))
+
+    # forcing "spawn" method for multiprocessing over "fork" when choosing multiple
+    # worker processes for dataloaders. By default, multiprocessing uses "fork" to create
+    # worker processes, which inherit the memory state of the main process, including its
+    # already initialized CUDA state. When a worker process tries to use
+    # CUDA, it runs into conflicts with the inherited, now potentially invalid,
+    # CUDA context, resulting in the CUDA initialization error. When
+    # num_workers=0, all dataloading happens in the main process, so there is no
+    # process forking and no CUDA context conflict. When num_workers>0, the standard way
+    # to fix this is to use "spawn" to create a completely new and clean python process for
+    # each worker, avoiding the problematic CUDA state inheritance.
+ mp.set_start_method("spawn", force=True) + + trainer = pl.Trainer(**cfg.trainer) + trainer.callbacks.append(pl.callbacks.LearningRateMonitor(logging_interval='step', log_weight_decay=True)) + exp_manager(trainer, cfg.get("exp_manager", None)) + + model = MagpieTTSDecoderModel(cfg=cfg.model, trainer=trainer) + model.maybe_init_from_pretrained_checkpoint(cfg=cfg) + + if cfg.get('mode', 'train') == 'train': + trainer.fit(model) + elif cfg.get('mode', 'train') == 'test': + trainer.test(model) + else: + raise NotImplementedError(f"Only train and test modes are supported. Got {cfg.mode}") + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter \ No newline at end of file diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index 9379b29668aa..d8d9e883fd04 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -190,7 +190,7 @@ def run_inference_and_evaluation( violin_plot_metrics.remove('utmosv2') # Load model - model, checkpoint_name = load_magpie_model(model_config) + model, checkpoint_name = load_magpie_model(model_config, is_decoder_only_model=inference_config.is_decoder_only_model) # Log architecture summary and get MoE info + FLOPs metrics moe_info, flops_per_component = log_model_architecture_summary(model) @@ -502,6 +502,10 @@ def create_argument_parser() -> argparse.ArgumentParser: target_group = parser.add_argument_group('Quality Targets') target_group.add_argument('--cer_target', type=float, default=None) target_group.add_argument('--ssim_target', type=float, default=None) + target_group.add_argument('--is_decoder_only_model', action='store_true') + target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) + target_group.add_argument('--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial']) + target_group.add_argument('--dropout_text_input', action='store_true') return parser @@ 
-553,6 +557,10 @@ def main(argv=None): maskgit_noise_scale=args.maskgit_noise_scale, maskgit_fixed_schedule=args.maskgit_fixed_schedule, maskgit_sampling_type=args.maskgit_sampling_type, + is_decoder_only_model=args.is_decoder_only_model, + phoneme_input_type=args.phoneme_input_type, + phoneme_sampling_method=args.phoneme_sampling_method, + dropout_text_input=args.dropout_text_input, ) eval_config = EvaluationConfig( diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 75a12da269ee..4ecd544df81e 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -1216,6 +1216,7 @@ def __init__(self, tokenizers: List[Union[BaseTokenizer, PreTrainedTokenizerBase self.tokenizer_pad_ids = tokenizer_pad_ids # Define aggregated token's pad value from the first tokenizer's pad value first_tokenizer = self.tokenizers[tokenizer_names[0]] + self.first_tokenizer = first_tokenizer if hasattr(first_tokenizer, "pad_token_id"): # Defined in PreTrainedTokenizerBase subclasses self.pad = first_tokenizer.pad_token_id elif hasattr(first_tokenizer, "pad"): # Defined in BaseTokenizer subclasses diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index 3c158ee4bd8e..789636a569e3 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -403,6 +403,7 @@ def __init__( self.dataset_type = dataset_type self.tokenizer_config = tokenizer_config self.text_tokenizer = None # Assigned in worker_init_fn in model file + self.phoneme_tokenizer = None # Assigned in worker_init_fn in model file (if any) self.load_16khz_audio = load_16khz_audio self.use_text_conditioning_tokenizer = use_text_conditioning_tokenizer self.text_conditioning_tokenizer_name = 
text_conditioning_tokenizer_name @@ -434,6 +435,13 @@ def __getitem__(self, index): "text_len": text_len, } + if self.phoneme_tokenizer is not None: + phoneme_tokens = self.phoneme_tokenizer.encode(data.text) + phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + phoneme_tokens_len = len(phoneme_tokens) + example["phoneme_tokens"] = torch.tensor(phoneme_tokens, dtype=torch.int32) + example["phoneme_tokens_len"] = phoneme_tokens_len + if self.load_cached_codes_if_available and 'target_audio_codes_path' in data.manifest_entry: audio_codes_path = data.manifest_entry['target_audio_codes_path'] audio_codes = torch.load(audio_codes_path) # (C, T) @@ -632,6 +640,8 @@ def collate_fn(self, batch: List[dict]): raw_text_list = [] language_list = [] speaker_indices_list = [] + phoneme_tokens_list = [] + phoneme_tokens_len_list = [] for example in batch: dataset_name_list.append(example["dataset_name"]) raw_text_list.append(example["raw_text"]) @@ -642,6 +652,9 @@ def collate_fn(self, batch: List[dict]): if 'audio_filepath' in example: audio_filepath_list.append(example["audio_filepath"]) + if 'phoneme_tokens' in example: + phoneme_tokens_list.append(example["phoneme_tokens"]) + phoneme_tokens_len_list.append(example["phoneme_tokens_len"]) if 'audio' in example: audio_list.append(example["audio"]) @@ -711,6 +724,13 @@ def collate_fn(self, batch: List[dict]): batch_dict['audio_codes'] = batch_audio_codes batch_dict['audio_codes_lens'] = batch_audio_codes_len + if len(phoneme_tokens_list) > 0: + batch_phoneme_tokens_len = torch.IntTensor(phoneme_tokens_len_list) + phoneme_tokens_max_len = int(batch_phoneme_tokens_len.max().item()) + batch_phoneme_tokens = stack_tensors(phoneme_tokens_list, max_lens=[phoneme_tokens_max_len], pad_value=self.phoneme_tokenizer.pad) + batch_dict['phoneme_tokens'] = batch_phoneme_tokens + batch_dict['phoneme_tokens_lens'] = batch_phoneme_tokens_len + if len(context_audio_list) > 0: 
batch_context_audio_len = torch.IntTensor(context_audio_len_list) context_audio_max_len = int(batch_context_audio_len.max().item()) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 1ee0b05bef62..4bd378151b9a 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -59,6 +59,13 @@ def setup_tokenizers(all_tokenizers_config, mode='train'): return aggregated_tokenizer +def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): + phoneme_tokenizer = instantiate(phoneme_tokenizer_config) + phoneme_vocab_size = len(phoneme_tokenizer.tokens) + phoneme_tokenizer.bos_token_id = phoneme_vocab_size + phoneme_tokenizer.eos_token_id = phoneme_vocab_size + 1 + phoneme_tokenizer.vocab_size = phoneme_vocab_size + 2 + return phoneme_tokenizer def check_speaker_format(item: str): # enforce the format as example like "| Language:en Dataset:HiFiTTS Speaker:9136_other |". 
@@ -140,6 +147,7 @@ def __init__( tokenizer_config: DictConfig = None, text_context_remapping: Dict[str, str] = None, text_context_remapping_prob: float = 0.0, + phoneme_tokenizer_config: DictConfig = None, ): super().__init__() self.sample_rate = sample_rate @@ -160,8 +168,10 @@ def __init__( self.context_duration_max = context_duration_max self.tokenizer_config = tokenizer_config self.text_tokenizer = None + self.phoneme_tokenizer = None self.text_context_remapping = text_context_remapping self.text_context_remapping_prob = text_context_remapping_prob + self.phoneme_tokenizer_config = phoneme_tokenizer_config def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -188,6 +198,12 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: self.eos_id = self.bos_id + 1 self.pad_id = self.text_tokenizer.pad + if self.phoneme_tokenizer is None and self.phoneme_tokenizer_config is not None: + worker_info = torch.utils.data.get_worker_info() + worker_id = worker_info.id if worker_info is not None else 0 + logging.info(f"Worker {worker_id} initializing phoneme tokenizer...") + self.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.phoneme_tokenizer_config) + # define list to store batched information dataset_name_list = [] audio_list = [] @@ -210,6 +226,8 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: raw_text_list = ( [] ) # raw text here is the string of normalized text or text stored in the supervision segment. Used to distinguish from text tokens. 
+ phoneme_token_list = [] + phoneme_token_len_list = [] for cut in cuts: speaker = cut.supervisions[0].speaker if not check_speaker_format(speaker): @@ -390,6 +408,13 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: token_list.append(tokens) token_len_list.append(text_len) + if self.phoneme_tokenizer is not None: + phoneme_tokens = self.phoneme_tokenizer.encode(text_str) + phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + phoneme_tokens_len = len(phoneme_tokens) + phoneme_token_list.append(torch.tensor(phoneme_tokens, dtype=torch.int32)) + phoneme_token_len_list.append(phoneme_tokens_len) + if self.include_align_prior: align_prior = beta_binomial_prior_distribution( phoneme_count=text_len, mel_count=spec_len, scaling_factor=self.prior_scaling_factor @@ -409,6 +434,10 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: "text_lens": torch.IntTensor(token_len_list), } + if self.phoneme_tokenizer is not None: + batch_dict["phoneme_tokens"] = collate_vectors(phoneme_token_list, padding_value=self.phoneme_tokenizer.pad) + batch_dict["phoneme_tokens_lens"] = torch.IntTensor(phoneme_token_len_list) + # audio for SV. 
if len(audio_list_16khz) > 0: batch_dict["audio_16khz"] = collate_vectors(audio_list_16khz, padding_value=0.0) diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 15d592dca2f7..6e781bed19ef 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -18,6 +18,7 @@ from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL from nemo.collections.tts.models.hifigan import HifiGanModel from nemo.collections.tts.models.magpietts import InferBatchOutput, MagpieTTSModel +from nemo.collections.tts.models.magpietts_decoder_only import MagpieTTSDecoderModel from nemo.collections.tts.models.magpietts_preference_optimization import ( MagpieTTSModelOfflinePO, MagpieTTSModelOfflinePODataGen, @@ -34,6 +35,7 @@ "HifiGanModel", "InferBatchOutput", "MagpieTTSModel", + "MagpieTTSDecoderModel", "MagpieTTSModelOfflinePODataGen", "MagpieTTSModelOfflinePO", "MagpieTTSModelOnlinePO", diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py new file mode 100644 index 000000000000..1b60b4b7b6ed --- /dev/null +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -0,0 +1,1729 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import List, Sequence, Tuple
+import torch
+import wandb
+from hydra.utils import instantiate
+from functools import partial
+from lightning.pytorch import Trainer
+from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
+from omegaconf import DictConfig
+from torch import nn
+from torch.utils.data import get_worker_info
+
+from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config
+from nemo.collections.tts.data.text_to_speech_dataset_lhotse import MagpieTTSLhotseDataset, setup_tokenizers, instantiate_phoneme_tokenizer
+
+from nemo.collections.tts.models import AudioCodecModel
+from nemo.collections.tts.modules import transformer_2501
+
+from nemo.collections.tts.modules.magpietts_modules import CharAwareSubwordEncoder, SpecialAudioToken, LocalTransformerType, cosine_schedule
+from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths
+
+from nemo.core.classes import ModelPT
+from nemo.core.classes.common import PretrainedModelInfo
+from nemo.utils import logging
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM
+)
+import time
+from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter
+import random
+
+
+
+def worker_init_fn(worker_id):
+    # For mp.set_start_method("spawn", force=True)
+    # The dataset class should be picklable, so we initialize non-picklable objects here
+    logging.info(f"Worker {worker_id} initializing...")
+    worker_info = get_worker_info()
+    dataset = worker_info.dataset  # Get the dataset instance in this worker
+    tokenizer = setup_tokenizers(
+        dataset.tokenizer_config, mode=dataset.dataset_type
+    )
+    dataset.text_tokenizer = tokenizer
+    # The attribute may exist but be None (e.g. no phoneme tokenizer configured),
+    # so check the value rather than mere attribute presence.
+    if getattr(dataset, 'phoneme_tokenizer_config', None) is not None:
+        dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config)
+
+
+class MagpieTTSDecoderModel(ModelPT):
+    """
+    Decoder-only Magpie-TTS model: a single causal transformer decoder operating
+    over interleaved text and audio-codec token streams.
+    """
+
+    def __init__(self, cfg: DictConfig, 
trainer: 'Trainer' = None): + self.world_size = 1 + if trainer is not None: + self.world_size = trainer.num_nodes * trainer.num_devices + + # load codec + codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) + self.sample_rate = codec_model.sample_rate + + if hasattr(codec_model, "discriminator"): + # del codec discriminator to free memory + del codec_model.discriminator + + # Set up codebook configuration + vector_quantizer = cfg.get('vector_quantizer') + if vector_quantizer is not None: + vector_quantizer = instantiate(vector_quantizer) + num_audio_codebooks = vector_quantizer.num_codebooks + codebook_size = vector_quantizer.codebook_size + codec_converter = VectorQuantizerIndexConverter( + vector_quantizer_original=codec_model.vector_quantizer, + vector_quantizer_new=vector_quantizer, + ) + data_num_audio_codebooks = codec_model.vector_quantizer.num_codebooks + else: + num_audio_codebooks = codec_model.num_codebooks + data_num_audio_codebooks = num_audio_codebooks + codebook_size = codec_model.codebook_size + codec_converter = None + + + # The dataloader needs to know the number of codebooks that the context codes were stored in + # In the case where there are no context codes saved, and there is no context audio (in the text context path), + # We create a dummy context code tensor that is only [context_BOS, context_EOS] that is repeated for + # data_num_audio_codebooks + self.data_num_audio_codebooks = data_num_audio_codebooks + self.num_audio_codebooks = num_audio_codebooks + self.codebook_size = codebook_size + + + self.codec_model_samples_per_frame = codec_model.samples_per_frame + # Our codebooks start with actual audio codec tokens, followed by special tokens. + # The `forced_*` options are for backward compatibility for models trained with older code. + num_audio_tokens = codec_model.codebook_size + # Our codebooks start with actual audio codec tokens, followed by special tokens. 
+        # The `forced_*` options are for backward compatibility for models trained with older code.
+        get_token_index = partial(SpecialAudioToken.get_index, base_codebook_size=self.codebook_size)
+        self.audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_BOS)
+        self.audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_EOS)
+        self.context_audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_BOS)
+        self.context_audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_EOS)
+        self.mask_token_id = get_token_index(SpecialAudioToken.MASK_TOKEN)
+        self.num_all_tokens_per_codebook = self.codebook_size + len(SpecialAudioToken)
+        self.use_bpe_char_tokenizer = cfg.get('use_bpe_char_tokenizer', False)
+
+        # If specified, use this as the text conditioning tokenizer. Otherwise, use the first tokenizer.
+        self.text_conditioning_tokenizer_name = cfg.get('text_conditioning_tokenizer_name', None)
+        if self.text_conditioning_tokenizer_name is None:
+            self.text_conditioning_tokenizer_name = list(cfg.text_tokenizers.keys())[0]
+
+        self.cfg_unconditional_prob = cfg.get('cfg_unconditional_prob', 0.0)
+        self.text_input_mode = cfg.get('text_input_mode', 'full')
+        self.streaming_speech_delay = cfg.get('streaming_speech_delay', 3)
+        self.streaming_phonemes_delay = cfg.get('streaming_phonemes_delay', 2)
+        self.frame_stacking_factor = cfg.get('frame_stacking_factor', 1)
+
+        self.tokenizer = setup_tokenizers(
+            all_tokenizers_config=cfg.text_tokenizers,
+            mode='train',
+        )
+
+        num_tokens_tokenizer = len(self.tokenizer.tokens)
+        num_tokens = num_tokens_tokenizer + 3  # +3 for BOS, EOS, and the CFG unconditional (unk) token
+        self.bos_id = num_tokens - 3
+        self.eos_id = num_tokens - 2
+        self.cfg_unk_token_id = num_tokens - 1
+        self.phoneme_tokenizer = None
+        self.dropout_text_input_prob = cfg.get('dropout_text_input_prob', 0.0)
+        self.dropout_phoneme_input_prob = cfg.get('dropout_phoneme_input_prob', 0.0)
+        if cfg.get('phoneme_tokenizer', None) is not None:
+            self.phoneme_tokenizer = 
instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) + self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) + self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + + + self.pad_context_text_to_max_duration = False + + super().__init__(cfg=cfg, trainer=trainer) + + # This needs to happen after super().__init__() + self._codec_model = codec_model + self._codec_model.freeze() #Lightning does requires_grad = False and self.eval() + self._codec_converter = codec_converter + + audio_embeddings = [] + for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): + audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) + self.audio_embeddings = nn.ModuleList(audio_embeddings) + + if self.phoneme_tokenizer is not None: + phoneme_embeddings = [] + for _ in range(self.phoneme_stacking_factor): + phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) + self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) + self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) + + + if cfg.transformer_hf_backend == "custom_qwen3_moe": + # from transformers.models import qwen3_moe + # config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(intermediate_size=3072, num_hidden_layers=5, num_experts=64) + # self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + from transformers.models import qwen2_moe + config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32) + self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) + else: + self.transformer_backend_config = AutoConfig.from_pretrained( + cfg.transformer_hf_backend, + trust_remote_code=True, + ) + + hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) + self.decoder = hf_transformer.model + self.lm_text_head = hf_transformer.lm_head + + 
self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) + self.decoder.set_input_embeddings(self.text_embedding) + + if self.use_bpe_char_tokenizer: + # BPE char tokenizer + assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" + tokenizer_name = self.tokenizer.tokenizer_names[0] + tokenizer = self.tokenizer.tokenizers[tokenizer_name] + subword_vocab = tokenizer.get_vocab() + # special tokens will be stored as it is in the char_vocab + # Each special token will only be mapped to one char id + special_vocab = { + '': self.bos_id, + '': self.eos_id, + '': self.cfg_unk_token_id, + } + self.cas_encoder = CharAwareSubwordEncoder( + d_embed=cfg.embedding_dim, + llm_tokenizer_vocab=subword_vocab, + subword_padding_idx=self.tokenizer.pad, + special_vocab=special_vocab + ) + + self.final_proj = nn.Linear(cfg.hidden_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor) + self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') + + self.local_transformer_type = LocalTransformerType(cfg.get('local_transformer_type', 'none').lower()) + logging.info(f"Local transformer type: {self.local_transformer_type}") + if self.local_transformer_type != LocalTransformerType.NO_LT: + local_transformer_hidden_dim = cfg.get('local_transformer_hidden_dim', 256) + if local_transformer_hidden_dim != cfg.hidden_dim: + self.local_transformer_in_projection = nn.Linear(cfg.hidden_dim, local_transformer_hidden_dim) + else: + self.local_transformer_in_projection = nn.Identity() + self.local_transformer = transformer_2501.Transformer( + n_layers=self.cfg.get('local_transformer_n_layers', 2), + d_model=local_transformer_hidden_dim, + d_ffn=local_transformer_hidden_dim*4, + sa_n_heads=self.cfg.get('local_transformer_n_heads', 1), + kernel_size=1, + is_causal=self.local_transformer_type == LocalTransformerType.AR, + max_length_causal_mask=self.num_audio_codebooks * self.frame_stacking_factor + 2, + 
use_learnable_pos_emb=True, + ) + local_transformer_out_projections = [] + for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): + # Have a separate projection layer for each codebook, to distinguish between them + local_transformer_out_projections.append(nn.Linear(local_transformer_hidden_dim, self.num_all_tokens_per_codebook)) + self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + + + def state_dict(self, destination=None, prefix='', keep_vars=False): + """ + Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model + from the checkpoint. The codec model is saved in a separate checkpoint. + """ + if hasattr(self, '_no_state_dict') and self._no_state_dict: + return {} + # Don't save the speaker verification and codec model in the state dict + state_dict = super().state_dict(destination, prefix, keep_vars) + keys_substrings_to_exclude = ['_speaker_verification_model', '_codec_model'] + for key in list(state_dict.keys()): + if any([substring in key for substring in keys_substrings_to_exclude]): + del state_dict[key] + return state_dict + + def load_state_dict(self, state_dict, strict=True): + """ + Modify load_state_dict so that we don't restore weights to _speaker_verification_model and _codec_model when + strict is True. + When strict is False, we can call pytorch's load_state_dict. + When strict is True, we loop through all parameters and rename them to enable loading. + """ + if strict == False: + super().load_state_dict(state_dict, strict=False) + for name, child in self.named_children(): + if name in ['_speaker_verification_model', '_codec_model']: + continue + if any(param.numel() > 0 for param in child.parameters()): + # If the module has parameters, we want to change the default mapping so that the state_dict gets + # loaded. 
+ # Ex: state_dict[encoder.position_embeddings.weight] -> new_state_dict[position_embeddings.weight] + new_state_dict = {} + for key in state_dict.keys(): + name_with_dot = f"{name}." + if key.startswith(name_with_dot): + new_state_dict[key[len(name_with_dot):]] = state_dict[key] + child.load_state_dict(new_state_dict) + + def audio_to_codes(self, audio, audio_len, audio_type='target'): + # audio: (B, T) + # audio_len: (B,) + if audio_type == 'target': + audio_eos_id = self.audio_eos_id + audio_bos_id = self.audio_bos_id + elif audio_type == 'context': + audio_eos_id = self.context_audio_eos_id + audio_bos_id = self.context_audio_bos_id + else: + raise ValueError(f"Received audio_type of {audio_type}. Must be `target` or `context`") + + self._codec_model.eval() + with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): + codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len) + if self._codec_converter is not None: + codes = self._codec_converter.convert_original_to_new(audio_tokens=codes, audio_lens=codes_len) + # Add a timestep to begining and end of codes tensor + bos_tensor = torch.full( + (codes.size(0), codes.size(1), 1), audio_bos_id, dtype=codes.dtype, device=codes.device + ) + pad_tensor = torch.full( + (codes.size(0), codes.size(1), 1), 0, dtype=codes.dtype, device=codes.device + ) # 0 is the padding token in the audio codebook + codes = torch.cat([bos_tensor, codes, pad_tensor], dim=-1) + # codes: (B, C, T') + # codes_len: (B,) + for idx in range(codes.size(0)): + codes[idx, :, codes_len[idx] + 1] = audio_eos_id + codes_len = codes_len + 2 + + return codes.long(), codes_len.long() + + def codes_to_audio(self, codes, codes_len): + # codes: (B, C, T') + # codes_len: (B,) + if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: + # Unstack the audio codes if they are stacked + codes, codes_len = self.unstack_codes(codes, codes_len, 
self.frame_stacking_factor)
+
+        if codes.size(2) < 5:
+            # The codec decoder needs a minimum number of frames; pad with zeros (the padding token).
+            # NOTE: the pad amount must be computed BEFORE the cat — afterwards codes.size(2) is 5,
+            # so the previous `codes_len + 5 - codes.size(2)` always added 0 and lengths never grew.
+            pad_amount = 5 - codes.size(2)
+            codes = torch.cat([codes, torch.zeros(codes.size(0), codes.size(1), pad_amount, device=codes.device)], dim=2).long()
+            codes_len = codes_len + pad_amount
+
+        self._codec_model.eval()
+        with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32):
+            # Make a copy to avoid modifying the original tensor if it's used elsewhere
+            codes_copy = codes.clone()
+            # Replace eos and bos tokens with padding in the copied tensor
+            codes_copy[codes == self.audio_bos_id] = 0  # zero is the padding token
+            codes_copy[codes == self.audio_eos_id] = 0
+            # Pass the modified integer token IDs
+            if self._codec_converter is not None:
+                codes_copy = self._codec_converter.convert_new_to_original(
+                    audio_tokens=codes_copy, audio_lens=codes_len
+                )
+            audio, audio_len = self._codec_model.decode(tokens=codes_copy, tokens_len=codes_len)
+        # audio: (B, T)
+        # audio_len: (B,)
+        return audio, audio_len
+
+    def embed_audio_tokens(self, audio_tokens):
+        # audio_tokens: (B, C, T')
+        # Add and average the embeddings of the audio tokens across the codebooks
+        audio_embedding = None
+        for c in range(audio_tokens.size(1)):
+            embedding = self.audio_embeddings[c](audio_tokens[:, c, :])
+            if audio_embedding is None:
+                audio_embedding = embedding
+            else:
+                audio_embedding = audio_embedding + embedding
+        audio_embedding = audio_embedding / audio_tokens.size(1)
+        return audio_embedding
+
+    def embed_phoneme_tokens(self, phoneme_tokens):
+        # phoneme_tokens: (B, S, T')
+        phoneme_embedding = None
+        for c in range(phoneme_tokens.size(1)):
+            embedding = self.phoneme_embeddings[c](phoneme_tokens[:, c, :])
+            if phoneme_embedding is None:
+                phoneme_embedding = embedding
+            else:
+                phoneme_embedding = phoneme_embedding + embedding
+        phoneme_embedding = phoneme_embedding / phoneme_tokens.size(1)
+        return phoneme_embedding
+
+    def compute_local_transformer_logits(self, 
dec_out, audio_codes_target, targets_offset_by_one=False): + """ + Predicts the logits for all codebooks using the local transformer. Used in both autoregressive (AR) and MaskGit (MG) modes. + This function is used in training and validation, not inference/sampling. + The sequence layout is slightly different between AR and MG modes, as shown in the diagram below, + (using an 8-codebook setup as an example): + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | Input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + | | Latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | Seq. Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + + dec_out: (B, T', E) + audio_codes_target: (B, C, T') + targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) + if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. 
(MaskGit) + """ + dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) + local_transformer_input = [dec_out_all] + for codebook_num in range(audio_codes_target.size(1)): + codes = audio_codes_target[:, codebook_num] # (B, T') + codes = codes.reshape(-1) # (B*T',) + codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', E) + local_transformer_input.append(codebook_embedding) + + local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) + local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) + _mask = torch.ones( local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) + if not targets_offset_by_one: + # for autoregressive local transformer the target for index 0 is codebook 0, for index 1 is codebook 1, etc. + local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) + else: + # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. 
+ local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) + all_code_logits = [] + for codebook_num in range(audio_codes_target.size(1)): + # Using a separate projection layer for each codebook (to distinguish between them) + # Checked the time - this loop is not taking much time (compared to the local transformer forward pass) + codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, codebook_num, :]) # (B*T', num_all_tokens_per_codebook) + all_code_logits.append(codebook_logits) + all_code_logits = torch.cat(all_code_logits, dim=1) # (B*T', num_codebooks * num_all_tokens_per_codebook) + + all_code_logits = all_code_logits.view( + audio_codes_target.size(0), audio_codes_target.size(2), -1 + ) # (B, T', C * num_all_tokens_per_codebook) + + return all_code_logits + + def maskgit_create_random_mask(self, codes): + """ + Creates a mask where True indicates the positions that should be replaced with a MASK_TOKEN. + """ + # Codes: (B, C, T) + B,C,T = codes.shape + # get a uniform random vector uniformly sampled from [0,1) ## Todo does it need to be inclusive on the right? 
+ rand_values = torch.rand(B,T, device=codes.device) + # apply the cosine schedule + frac_masked = cosine_schedule(rand_values) + # how many positions to mask + n_masked = torch.ceil(frac_masked * C).long() # B,T + # start from all unmasked + mask = torch.zeros_like(codes, dtype=torch.bool) + # The code further below is the vectorized version of this: + # for b in range(B): + # for t in range(T): + # if n_masked[b,t] > 0: + # # get a random permutation of the codebook indices + # perm = torch.randperm(C) + # # mask the top n_masked positions + # mask[b, perm[:n_masked[b,t]], t] = True + # + # Create random permutations + random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) + # Create a mask tensor where each position indicates if it should be masked + mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) + mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) + # Apply the random permutations to the mask + mask = torch.gather(mask, 1, random_permutations) + + return mask # (B, C, T) + + def maskgit_apply_random_mask(self, codes): + # Randomly replaces some codes with the MASK_TOKEN with a proportion following the cosine schedule. + # Codes: (B, C, T) + mask = self.maskgit_create_random_mask(codes) + ## replace some tokens with MASK_TOKEN + codes_with_mask = torch.where(mask, self.mask_token_id, codes) + return codes_with_mask, mask + + def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=None): + """ + Computes the audio codebook loss. Used by + (1) The main Magpie-TTS transformer + (2) The local transformer, for both autoregressive and MaskGit methods + + logits: (B, T', num_codebooks * num_tokens_per_codebook) + audio_codes: (B, C, T') + audio_codes_lens: (B,) + mask_tokens_mask: (B, C, T') True for tokens that were replaced with the MASK_TOKEN and should + therefore be the only ones included in the loss computation. 
+ """ + loss_mask = get_mask_from_lengths(audio_codes_lens) + if mask_tokens_mask is not None: + # For MaskGit we only compute loss for the masked tokens. + # *Both* conditions must be true: + # 1. the token is masked + # 2. the token is not padding + loss_mask = loss_mask.unsqueeze(1) * mask_tokens_mask + if not loss_mask.any(): + # Without this we were very rarely getting NaNs in the loss + logging.warning("No tokens valid were found in compute_loss()!") + return torch.tensor(0.0, device=loss_mask.device), loss_mask + else: + # repeat loss mask for each codebook to simplify code below + loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) + total_codebook_loss = None + for codebook in range(audio_codes.size(1)): + si = codebook * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = logits[:, :, si:ei] # (B, T', num_tokens_per_codebook) + codebook_targets = audio_codes[:, codebook] # (B, T') + codebook_loss = self.cross_entropy_loss( + codebook_logits.permute(0, 2, 1), codebook_targets # (B, num_tokens_per_codebook, T') + ) # (B, T') + codebook_loss = codebook_loss * loss_mask[:, codebook, :] + codebook_loss = codebook_loss.sum() / loss_mask[:, codebook, :].sum() + if total_codebook_loss is None: + total_codebook_loss = codebook_loss + else: + total_codebook_loss = total_codebook_loss + codebook_loss + + total_codebook_loss = total_codebook_loss / audio_codes.size(1) + return total_codebook_loss, loss_mask + + def compute_phoneme_loss(self, logits, phoneme_tokens, phoneme_tokens_lens): + loss_mask = get_mask_from_lengths(phoneme_tokens_lens) + total_phoneme_loss = None + for codebook in range(self.phoneme_stacking_factor): + si = codebook * self.phoneme_vocab_size + ei = si + self.phoneme_vocab_size + phoneme_logits = logits[:, :, si:ei] + phoneme_targets = phoneme_tokens[:, codebook] + phoneme_loss = self.cross_entropy_loss(phoneme_logits.permute(0, 2, 1), phoneme_targets) + phoneme_loss = phoneme_loss 
* loss_mask + phoneme_loss = phoneme_loss.sum() / loss_mask.sum() + if total_phoneme_loss is None: + total_phoneme_loss = phoneme_loss + else: + total_phoneme_loss = total_phoneme_loss + phoneme_loss + total_phoneme_loss = total_phoneme_loss / self.phoneme_stacking_factor + return total_phoneme_loss, loss_mask + + + def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None): + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + ) + # hidden_states = backend_out.last_hidden_state # (B, T_total, H) + return backend_out + + + def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): + # all_code_logits: (B, T', num_codebooks * num_tokens_per_codebook) + # audio_codes_lens: (B,) + all_preds = [] + for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): + si = idx * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = all_code_logits[:, :, si:ei] + codebook_probs = torch.softmax(codebook_logits, dim=-1) # (B, T', num_tokens_per_codebook) + # argmax to get the tokens + codebook_preds = torch.argmax(codebook_probs, dim=-1) # (B, T') + all_preds.append(codebook_preds) + + all_preds = torch.stack(all_preds, dim=1) # (B, C, T') + audio_mask = get_mask_from_lengths(audio_codes_lens) + all_preds = all_preds * audio_mask.unsqueeze(1) + + return all_preds + + def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, unfinished_items={}, finished_items={}, use_cfg=False, cfg_scale=1.0, n_steps=3): + """ + Sample codes for one timestep from the local transformer using MaskGit. 
+ """ + if self.frame_stacking_factor > 1: + raise NotImplementedError("MaskGit sampling is not implemented for frame stacking factor > 1") + # dec_output: (B, E) + device = dec_output.device + # disable KV cache since our transformer is not causal + self.local_transformer.reset_cache(use_cache=False) + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input_init = self.local_transformer_in_projection(dec_output) # (B, 1, D) where D is the dimension of the local transformer + C = self.num_audio_codebooks + B = dec_output.size(0) + + min_confidence = float("-inf") + max_confidence = 10000 # this needs to be large enough that unmasked items will always remain unmasked. # TODO @rfejgin: use float('inf')? + confidences = min_confidence * torch.ones(B, C, device=device) + # initialize to all masked + codes = self.mask_token_id * torch.ones((B, C), device=device, dtype=torch.long) + sampled_codes = codes.clone() + for step in range(n_steps): + # get mask fraction + frac_masked = cosine_schedule(torch.tensor(step / (n_steps))) + # how many codebooks to mask + n_masked = torch.ceil(C * frac_masked).long() # TODO @rfejgin: should we force this to be initialized to exactly `C` (to avoid numerical issues)? 
+            n_unmasked = C - n_masked
+            # pick top-confidence codebooks up to n_unmasked
+            _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1)
+
+            # replace masks of the top-k confident codebooks with the codes that were sampled for them
+            unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices)
+            codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes)
+
+            # build transformer input
+            local_transformer_input = local_transformer_input_init
+            for codebook_num in range(C):
+                next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1)  # (B, 1, 768)
+                next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input)  # (B, 1, d_local)
+                local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1)  # (B, codebook_num+1, d_local)
+
+            # run transformer
+            _mask = torch.ones(B, C+1, device=device)
+            local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output']  # (B, C+1, d_local)
+
+            # get logits
+            logits = []
+            for codebook_num in range(C):
+                # The `codebook_num+1` is to drop first position which corresponds to the magpie latent
+                codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, codebook_num+1, :])  # (B, num_audio_tokens_per_codebook)
+                logits.append(codebook_logits)
+            logits = torch.stack(logits, dim=1)  # (B, C, num_audio_tokens_per_codebook)
+
+            # apply CFG
+            if use_cfg:
+                actual_batch_size = logits.size(0) // 2
+                conditional_logits = logits[:actual_batch_size]
+                unconditional_logits = logits[actual_batch_size:]
+                cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits
+                logits[:actual_batch_size] = cfg_logits
+
+            # handle unfinished and finished items
+            for item_idx in unfinished_items:
+                # Fix: logits is (B, C, V); suppress the EOS *token* across all codebooks.
+                # The previous `logits[item_idx, self.audio_eos_id]` indexed the codebook axis
+                # with the EOS token id, wiping out an unrelated codebook row instead
+                # (compare the (B, V) case in local_transformer_sample_autoregressive).
+                logits[item_idx, :, self.audio_eos_id] = float('-inf')
+            for item_idx in finished_items:
+                logits[item_idx, :, :] = float('-inf')
+
logits[item_idx, :, self.audio_eos_id] = 0.0 + + # sample with top-k + logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) + indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) + logits_rescored = logits.clone() + logits_rescored[indices_to_remove] = float('-inf') + probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) + sampled_codes = torch.multinomial(probs.view(B*C, -1), 1).view(B, C) + if use_cfg: + # TODO @rfejgin: why do we need to keep second half of the batch? can probably optimize this + sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] + probs[actual_batch_size:] = probs[:actual_batch_size] + confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) + + # set confidence to max for unmasked codebooks so that they will remain unmasked + confidences.scatter_(index=topk_indices, dim=1, src=max_confidence*torch.ones_like(topk_indices, dtype=torch.float)) + + # replace entries in sampled_codes with previously unmasked codebooks + sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) + # optionally: add noise to confidences here (as in token-critic paper) (not implemented) + + codes = sampled_codes + assert not (codes == self.mask_token_id).any(), f"Codes contain mask tokens after completion of MaskGit sampling" + if use_cfg: + codes = codes[:actual_batch_size] + return codes + + def local_transformer_sample_autoregressive(self, dec_output, temperature=0.7, topk=80, unfinished_items={}, finished_items={}, use_cfg=False, cfg_scale=1.0): + # dec_output: (B, E) + self.local_transformer.reset_cache(use_cache=False) + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) + all_preds = [] + for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): + _mask = torch.ones( 
local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) + codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, -1, :]) # (B, num_all_tokens_per_codebook) + if use_cfg: + actual_batch_size = codebook_logits.size(0) // 2 + conditional_logits = codebook_logits[:actual_batch_size] + unconditional_logits = codebook_logits[actual_batch_size:] + cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + codebook_logits[:actual_batch_size] = cfg_logits + + for item_idx in unfinished_items: + codebook_logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + codebook_logits[item_idx, :] = float('-inf') + codebook_logits[item_idx, self.audio_eos_id] = 0.0 + + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1) # (B, num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + if use_cfg: + codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] + all_preds.append(codebook_preds) + next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1) # (B, 1, 128) + next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) # (B, 1, 128) + local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) # (B, T+1, 128) + + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + if use_cfg: + all_preds = 
all_preds[:actual_batch_size] + + return all_preds + + + def sample_codes_from_logits(self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={}): + # all_code_logits_t: (B, num_codebooks * num_tokens_per_codebook), logits at a given timestep + all_preds = [] + for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): + si = idx * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + for item_idx in unfinished_items: + codebook_logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + codebook_logits[item_idx, :] = float('-inf') + codebook_logits[item_idx, self.audio_eos_id] = 0.0 + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + all_preds.append(codebook_preds) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + return all_preds + + def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, topk=80): + # all_code_logits_t: (B, phoneme_stacking_factor * phoneme_vocab_size), logits at a given timestep + all_preds = [] + for idx in range(self.phoneme_stacking_factor): + si = idx * self.phoneme_vocab_size + ei = si + self.phoneme_vocab_size + codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, 
num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + all_preds.append(codebook_preds) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + return all_preds + + def log_val_audio_example( + self, + logits, + target_audio_codes, + audio_codes_lens_target, + context_audio_codes=None, + context_audio_codes_lens=None, + ): + wandb_audio_log = {} + + pred_audio_codes = self.logits_to_audio_codes(logits, audio_codes_lens_target) + pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target) + target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target) + + context_audio, context_audio_lens = None, None + if context_audio_codes is not None and context_audio_codes.shape[2] > 3: + # > 3 ensures, it is a valid context audio tensor (and not dummy tensor used in text context) + context_audio, context_audio_lens = self.codes_to_audio(context_audio_codes, context_audio_codes_lens) + + for logger in self.loggers: + is_wandb = isinstance(logger, WandbLogger) + is_tb = isinstance(logger, TensorBoardLogger) + if not is_wandb and not is_tb: + raise ValueError(f"Invalid logger type for audio logging: {type(logger)}. 
Only `WandbLogger` and `TensorBoardLogger` are supported.") + + for idx in range(min(3, pred_audio.size(0))): + pred_audio_np = pred_audio[idx].float().detach().cpu().numpy() + target_audio_np = target_audio[idx].float().detach().cpu().numpy() + pred_audio_np = pred_audio_np[: pred_audio_lens[idx]] + target_audio_np = target_audio_np[: target_audio_lens[idx]] + context_audio_np = None + if context_audio is not None: + context_audio_np = context_audio[idx].float().detach().cpu().numpy() + context_audio_np = context_audio_np[: context_audio_lens[idx]] + + if is_wandb: + wandb_audio_log[f"Audio/Example_{idx}"] = list() + if context_audio_np is not None: + wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(context_audio_np, sample_rate=self.sample_rate, caption="context")) + wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(pred_audio_np, sample_rate=self.sample_rate, caption="prediction")) + wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(target_audio_np, sample_rate=self.sample_rate, caption="target")) + + if is_tb: + if context_audio_np is not None: + logger.experiment.add_audio( + f'Example_{idx}/context', + context_audio_np, + global_step=self.global_step, + sample_rate=self.sample_rate, + ) + logger.experiment.add_audio( + f'Example_{idx}/prediction', + pred_audio_np, + global_step=self.global_step, + sample_rate=self.sample_rate, + ) + logger.experiment.add_audio( + f'Example_{idx}/target', + target_audio_np, + global_step=self.global_step, + sample_rate=self.sample_rate, + ) + + return wandb_audio_log + + + def join_embeddings_temporally( + self, + embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] + lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` + pad_embed: torch.Tensor | None = None # (E,) defaults to zeros + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Merges Multiple Embedding sequences into a single Embedding Sequence. 
+ + Args: + embeddings : Sequence of tensors, each of shape (B, Ti, E) — batch, time, embedding + lengths : Sequence of tensors, each of shape (B,) + pad_embed : (E,) — embedding to use for padding, defaults to zeros + + Returns: + joined : (B, max_sum_len, E) — merged & padded + out_lengths : (B,) — total lengths of each batch element after merging + """ + if len(embeddings) == 0: + raise ValueError("contexts must be non-empty") + + B, _, E = embeddings[0].shape + device = embeddings[0].device + dtype = embeddings[0].dtype + + # 1. compute output sizes + len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) + out_lengths = len_stack.sum(0) + max_len = int(out_lengths.max()) + + if pad_embed is None: + pad_embed = torch.zeros(E, dtype=dtype, device=device) + + joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) + + # batch row indices + batch_rows = torch.arange(B, device=device).unsqueeze(1) # (B,1) + + # running offset keeps “write cursor” for each row + offset = torch.zeros(B, dtype=torch.long, device=device) # (B,) + + for i, (embedding_i, len_i) in enumerate(zip(embeddings, lengths)): + Ti = embedding_i.shape[1] + t_idx = torch.arange(Ti, device=device) # (Ti,) + mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) + + # destination columns: offset + t + dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) + + # Assign embedding_i to the correct positions in joined + joined[batch_rows.expand_as(mask)[mask], + dest_cols[mask]] = embedding_i[mask] + + # move cursor past this segment + offset += len_i + + return joined, out_lengths + + def prepare_context_tensors(self, batch, dropout_text_input=False): + # Transcript + text = batch['text'] + text_lens = batch['text_lens'] + text_embedded = self.decoder.get_input_embeddings()(text) + if self.use_bpe_char_tokenizer: + text_mask = get_mask_from_lengths(text_lens) + cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) + text_embedded = text_embedded + cas_embedding + + if 
text_embedded.shape[1] < self.streaming_speech_delay + 1: + # If text is too short, pad it with zeros + padding_tensor = torch.zeros(text_embedded.shape[0], self.streaming_speech_delay + 1 - text_embedded.shape[1], text_embedded.shape[2], device=text_embedded.device) + text_embedded = torch.cat([text_embedded, padding_tensor], dim=1) + + if dropout_text_input: + # Make text embedding all zeros + text_embedded = text_embedded * 0.0 + + # Context Audio + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + if self._codec_converter is not None: + context_audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens + ).long() + else: + context_audio_codes, context_audio_codes_lens = self.audio_to_codes( + batch['context_audio'], batch['context_audio_lens'], audio_type='context' + ) + + context_audio_codes, context_audio_codes_lens = self.stack_codes(context_audio_codes, context_audio_codes_lens, self.audio_bos_id, self.audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks) + context_audio_embedded = self.embed_audio_tokens(context_audio_codes) # (B, T', E) + + # Context Text + context_text_tokens = batch['context_text_tokens'] + context_text_lens = batch['context_text_tokens_lens'] + context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) + + remaining_text_embedded = None + remaining_text_lens = None + if self.text_input_mode == 'full': + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded, text_embedded], + lengths=[context_audio_codes_lens, context_text_lens, text_lens], + ) + elif self.text_input_mode == 'streaming': + prompt_text_embedded = text_embedded[:,:self.streaming_speech_delay,:] + prompt_text_lens = torch.ones_like(text_lens) * self.streaming_speech_delay + 
context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], + lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], + ) + remaining_text_embedded = text_embedded[:,self.streaming_speech_delay:,:] + remaining_text_lens = text_lens - self.streaming_speech_delay + remaining_text_lens = remaining_text_lens.clamp(min=0) + remaining_text_mask = get_mask_from_lengths(remaining_text_lens) + remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) + else: + raise ValueError(f"Invalid text input mode: {self.text_input_mode}") + + return { + 'context_embedding': context_embedding, # (B, T_total, E) + 'context_lens': context_lens, # (B,) + 'context_audio_codes': context_audio_codes, # (B, C, T') + 'context_audio_embedded': context_audio_embedded, # (B, T', E) + 'context_audio_codes_lens': context_audio_codes_lens, # (B,) + 'text_embedded': text_embedded, # (B, L, E) + 'text_lens': text_lens, # (B,) + 'context_text_tokens': context_text_tokens, # (B, L) + 'context_text_lens': context_text_lens, # (B,) + 'remaining_text_embedded': remaining_text_embedded, # (B, T, E) + 'remaining_text_lens': remaining_text_lens, # (B,) + } + + def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): + """ + Slices the transformer output to get the predicted embeddings for the target sequence. 
+ Args: + transformer_out: (B, T, E) + context_lens: (B,) - start index of target per batch + target_lens: (B,) - length of target per batch + + Returns: (B, T_max, E) tensor where T_max = max(target_lens) + """ + B, T, E = transformer_out.shape + device = transformer_out.device + + # Compute max target length in batch for padding + max_len = target_lens.max().item() + + # Build index tensor for each batch element + # Shape: (B, max_len) + range_indices = torch.arange(max_len, device=device).unsqueeze(0).expand(B, -1) + gather_indices = context_lens.unsqueeze(1) + range_indices # (B, max_len) + gather_indices = torch.clamp(gather_indices, max=transformer_out.size(1) - 1) + + # Expand to shape (B, max_len, E) for gather + gather_indices_exp = gather_indices.unsqueeze(2).expand(-1, -1, E) + sliced = torch.gather(transformer_out, dim=1, index=gather_indices_exp) + return sliced + + + def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): + if stacking_factor == 1: + return codes, codes_lens + + contains_bos = codes[0,0,0].item() == bos_id + if contains_bos: + bos_tensor_repeated = torch.full((codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device) # (B,stacking_factor*C, 1) + codes = codes[:, :, 1:] # Remove the bos token + codes_lens = codes_lens - 1 # Remove the bos token + B, C, T = codes.shape + s = int(stacking_factor) + + # --- Compute max padding needed --- + pad_t = (-T) % s # pad so that T' is divisible by s + pad_tail = torch.full((B, C, pad_t), eos_id, + dtype=codes.dtype, device=codes.device) + codes = torch.cat([codes, pad_tail], dim=-1) + + # --- Stack time into channel dimension --- + Tp = codes.shape[-1] + T_out = Tp // s + codes = codes.view(B, C, T_out, s) + codes = codes.permute(0, 1, 3, 2).reshape(B, C * s, T_out) + + new_lens = torch.div(codes_lens + s - 1, s, rounding_mode='floor') + if contains_bos: + codes = torch.cat([bos_tensor_repeated, codes], dim=2) + new_lens = new_lens + 1 + + 
return codes, new_lens + + def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): + if stacking_factor == 1: + return stacked_codes, stacked_lens + + B, CxS, T_out = stacked_codes.shape + s = int(stacking_factor) + assert CxS % s == 0, f"Channel dim ({CxS}) must be divisible by stacking_factor ({s})" + + C = CxS // s + # Reshape: split channels back into (C, s) + x = stacked_codes.view(B, C, s, T_out) + # Bring s back into time dimension + x = x.permute(0, 1, 3, 2).reshape(B, C, T_out * s) + + # Recover original lengths (before padding) + orig_lens = stacked_lens * s + + return x, orig_lens + + def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, context_lens): + # import ipdb; ipdb.set_trace() + phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) + phoneme_tokens, phoneme_tokens_lens = self.stack_codes( + phoneme_tokens, + phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1 + ) + # import ipdb; ipdb.set_trace() + phoneme_tokens_embedded = self.embed_phoneme_tokens(phoneme_tokens) # (B, T', E) + + phoneme_mask = get_mask_from_lengths(phoneme_tokens_lens) + phoneme_tokens_embedded = phoneme_tokens_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) + + zero_context_tensor = torch.zeros(context_lens.size(0), context_lens.max().item(), self.cfg.embedding_dim, device=phoneme_tokens.device) + phoneme_channel_input, phoneme_channel_input_lens = self.join_embeddings_temporally( + embeddings=[zero_context_tensor, phoneme_tokens_embedded], + lengths=[context_lens, phoneme_tokens_lens], + ) + return phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens + + + def process_batch(self, batch, mode="train"): + dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False + dropout_phoneme_input = ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) if mode 
== 'train' else False + context_tensors = self.prepare_context_tensors(batch, dropout_text_input) + print("text lens", context_tensors['text_lens']) + remaining_text_embedded = context_tensors['remaining_text_embedded'] + context_embedding = context_tensors['context_embedding'] + context_lens = context_tensors['context_lens'] + + dropout_conditional_input = False + if mode == 'train' and self.cfg_unconditional_prob > 0.0: + if torch.rand(1).item() < self.cfg_unconditional_prob: + dropout_conditional_input = True + # Get embedding of a special UNCONDITIONAL_TOKEN + cfg_token_id = self.cfg_unk_token_id # int + cfg_token_embedding = self.decoder.get_input_embeddings()(torch.full((context_embedding.size(0), 1), cfg_token_id, device=context_embedding.device)) # (B, 1, E) + # Keeping the dummy context same size as the context embedding makes + # inference easier especially with KV caching and using a duplicated batch. + context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) + # Make unconditional remaining text embedding all zeros. Simplifies the inference implementation. 
+ if self.text_input_mode == 'streaming': + remaining_text_embedded = torch.zeros_like(remaining_text_embedded) + + if 'audio_codes' not in batch: + audio_codes, audio_codes_lens = self.audio_to_codes(batch['audio'], batch['audio_lens']) + else: + audio_codes = batch['audio_codes'] + audio_codes_lens = batch['audio_codes_lens'] + if self._codec_converter is not None: + audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=audio_codes, audio_lens=audio_codes_lens + ).long() + + audio_codes, audio_codes_lens = self.stack_codes(audio_codes, audio_codes_lens, self.audio_bos_id, self.audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks) + audio_codes_lens_input = audio_codes_lens_target = audio_codes_lens - 1 + audio_codes_target = audio_codes[:, :, 1:] # (B, C, T') Target for the decoder + audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder + audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) # Computing this to be use in the alignment encoder + if remaining_text_embedded is not None: + # Make remaining text embedded the same size as audio_codes_input_embedded by padding with zeros on the right + padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) + padding_tensor = torch.zeros(remaining_text_embedded.size(0), padding_len, remaining_text_embedded.size(2), device=remaining_text_embedded.device) + remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) + audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded + + + context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( + embeddings=[context_embedding, audio_codes_input_embedded], + lengths=[context_lens, audio_codes_lens_input], + ) + + if self.phoneme_tokenizer is not None: + context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay + phoneme_channel_input, 
phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens = self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], + batch['phoneme_tokens_lens'], + context_lens_for_phonemes + ) + print("phoneme_tokens_lens", phoneme_tokens_lens) + print("audio_codes_lens", audio_codes_lens_input) + if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: + padding_tensor = torch.zeros(phoneme_channel_input.shape[0], context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], phoneme_channel_input.shape[2], device=phoneme_channel_input.device) + phoneme_channel_input = torch.cat([phoneme_channel_input, padding_tensor], dim=1) + else: + phoneme_channel_input = phoneme_channel_input[:, :context_plus_audio_embedded.shape[1], :] + + if (not dropout_conditional_input) and (not dropout_phoneme_input): + context_plus_audio_embedded = context_plus_audio_embedded + phoneme_channel_input + + transformer_out = self.forward( + inputs_embeds=context_plus_audio_embedded, + attention_mask=get_mask_from_lengths(context_plus_audio_lens), + ) + transformer_hidden_states = transformer_out.last_hidden_state # (B, T_total, E) + + pred_embeddings = self.slice_pred_embeddings( + transformer_hidden_states, + context_lens=context_lens, + target_lens=audio_codes_lens_target, + ) + + logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) + # import ipdb; ipdb.set_trace() + codebook_loss, loss_mask = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) + loss = codebook_loss + + local_transformer_loss = None + local_transformer_logits = None + if self.local_transformer_type != LocalTransformerType.NO_LT: + if self.local_transformer_type == LocalTransformerType.MASKGIT: + # randomly replace some positions with MASK_TOKEN + audio_codes_masked, mask_tokens_mask = self.maskgit_apply_random_mask(audio_codes_target) + local_transformer_logits = self.compute_local_transformer_logits(pred_embeddings, audio_codes_masked, 
targets_offset_by_one=True) + #audio_codes_masked = audio_codes_masked[:, 1:, :] + local_transformer_loss, _ = self.compute_loss(local_transformer_logits, audio_codes_target, audio_codes_lens_target, mask_tokens_mask) + else: + # autoregressive + assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" + local_transformer_logits = self.compute_local_transformer_logits(pred_embeddings, audio_codes_target, targets_offset_by_one=False) + local_transformer_loss, _ = self.compute_loss(local_transformer_logits, audio_codes_target, audio_codes_lens_target, None) + local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) + loss = loss + local_transformer_loss_scale * local_transformer_loss + + phoneme_loss = None + if self.phoneme_tokenizer is not None: + pred_embeddings_phoneme = self.slice_pred_embeddings( + transformer_hidden_states, + context_lens=context_lens_for_phonemes, + target_lens=phoneme_tokens_lens-1, + ) + phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) + if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): + # Only compute phoneme loss if not doing unconditional training or text dropout + phoneme_loss, _ = self.compute_phoneme_loss(phoneme_logits, phoneme_tokens[:,:,1:].long(), phoneme_tokens_lens - 1) + print("No Dropout - phoneme loss:", phoneme_loss.item()) + else: + phoneme_loss = torch.tensor(0.0, device=logits.device) + print("Dropout - phoneme loss skipped", phoneme_loss.item()) + + loss = loss + phoneme_loss + + return { + 'loss': loss, + 'codebook_loss': codebook_loss, + 'phoneme_loss': phoneme_loss, + 'local_transformer_loss': local_transformer_loss, + 'local_transformer_logits': local_transformer_logits, # (B, T', num_codebooks * num_tokens_per_codebook) + 'logits': logits, + 'audio_codes_target': audio_codes_target, # (B, C, T') + 'audio_codes_lens_target': 
audio_codes_lens_target, # (B,) + 'context_audio_codes': context_tensors['context_audio_codes'], # (B, C, T') + 'context_audio_codes_lens': context_tensors['context_audio_codes_lens'], # (B,) + } + + + + def training_step(self, batch, batch_idx): + batch_output = self.process_batch(batch) + loss = batch_output['loss'] + codebook_loss = batch_output['codebook_loss'] + self.log('train/codebook_loss', codebook_loss, prog_bar=True, sync_dist=True) + self.log('train/loss', loss, prog_bar=True, sync_dist=True) + + if self.phoneme_tokenizer is not None: + phoneme_loss = batch_output['phoneme_loss'] + self.log('train/phoneme_loss', phoneme_loss, prog_bar=True, sync_dist=True) + + local_transformer_loss = batch_output['local_transformer_loss'] + if local_transformer_loss is not None: + self.log('train/local_transformer_loss', local_transformer_loss, prog_bar=True, sync_dist=True) + + # Log batch info + batch_size, text_token_max_len = batch["text"].shape + text_token_total_num = batch["text_lens"].sum() + batch_info_dict = { + "train/batch_size": batch_size, + "train/text_token_max_len": text_token_max_len, + "train/text_token_total_num_in_batch": text_token_total_num, + "train/text_token_pad_ratio_percent_in_batch": 100 * (1 - text_token_total_num / (batch_size * text_token_max_len)), + } + + if "audio_codes" in batch: + audio_codes_max_len = batch["audio_codes"].shape[-1] + audio_codes_total_num = batch["audio_codes_lens"].sum() + batch_info_dict.update({ + "train/audio_codes_max_len": audio_codes_max_len, + "train/audio_codes_total_num_in_batch": audio_codes_total_num, + "train/audio_codes_pad_ratio_percent_in_batch": 100 * (1 - audio_codes_total_num / (batch_size * audio_codes_max_len)), + }) + else: + audio_samples_max_len = batch["audio"].shape[-1] + audio_samples_total_num = batch["audio_lens"].sum() + batch_info_dict.update({ + "train/audio_samples_max_len": audio_samples_max_len, + "train/audio_samples_total_num_in_batch": audio_samples_total_num, + 
"train/audio_samples_pad_ratio_percent_in_batch": 100 * (1 - audio_samples_total_num / (batch_size * audio_samples_max_len)), + }) + + self.log_dict(batch_info_dict, on_step=True) + + return loss + + def validation_step(self, batch, batch_idx): + batch_output = self.process_batch(batch, mode="val") + # self.process_batch returns a dict. We currently only log "logits" which come from the parallel prediction + # head. If we use local_transformer, then the local_transformer returns "local_transformer_logits" + loss = batch_output['loss'] + codebook_loss = batch_output['codebook_loss'] + logits = batch_output['logits'] + audio_codes_target = batch_output['audio_codes_target'] + audio_codes_lens_target = batch_output['audio_codes_lens_target'] + context_audio_codes = batch_output['context_audio_codes'] + context_audio_codes_lens = batch_output['context_audio_codes_lens'] + + if batch_idx == 0 and self.global_rank == 0: + # Prepare dictionary for aggregated wandb logging + wandb_log_dict = {} + + # Get audio data for logging + wandb_log_dict.update( + self.log_val_audio_example( + logits, audio_codes_target, audio_codes_lens_target, context_audio_codes, context_audio_codes_lens + ) + ) + + # Perform single wandb log call if wandb is active and there is data + for logger in self.loggers: + if isinstance(logger, WandbLogger) and wandb_log_dict: + logger.experiment.log(wandb_log_dict) + + # infer_output_no_cfg_noLT = self.infer_batch( + # batch, + # max_decoder_steps=500, + # temperature=0.7, + # topk=80, + # use_local_transformer_for_inference=False, + # maskgit_n_steps=3, + # use_cfg=False, + # cfg_scale=1.0 + # ) + # infer_output_cfg_withLT = self.infer_batch( + # batch, + # max_decoder_steps=500, + # temperature=0.7, + # topk=80, + # use_local_transformer_for_inference=self.local_transformer_type != LocalTransformerType.NO_LT, + # maskgit_n_steps=3, + # use_cfg=True, + # cfg_scale=2.5 + # ) + # pred_audio_no_cfg_noLT, pred_audio_no_cfg_noLT_lens = 
infer_output_no_cfg_noLT[0], infer_output_no_cfg_noLT[1] + # pred_audio_cfg_withLT, pred_audio_cfg_withLT_lens = infer_output_cfg_withLT[0], infer_output_cfg_withLT[1] + + # for logger in self.loggers: + # is_wandb = isinstance(logger, WandbLogger) + # is_tb = isinstance(logger, TensorBoardLogger) + # if not is_wandb and not is_tb: + # raise ValueError(f"Invalid logger type for audio logging: {type(logger)}. Only `WandbLogger` and `TensorBoardLogger` are supported.") + # for idx in range(pred_audio_no_cfg_noLT.size(0)): + # pred_audio_no_cfg_noLT_idx = pred_audio_no_cfg_noLT[idx][:pred_audio_no_cfg_noLT_lens[idx]].float().cpu().numpy() + # pred_audio_cfg_withLT_idx = pred_audio_cfg_withLT[idx][:pred_audio_cfg_withLT_lens[idx]].float().cpu().numpy() + # if is_wandb: + # logger.experiment.log({ + # "val/pred_audio_no_cfg_noLT": wandb.Audio(pred_audio_no_cfg_noLT_idx, sample_rate=self.sample_rate, caption="Inference No CFG, No LT"), + # "val/pred_audio_cfg_withLT": wandb.Audio(pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, caption="Inference CFG, With LT"), + # }) + # if is_tb: + # logger.experiment.add_audio( + # "val/pred_audio_no_cfg_noLT", pred_audio_no_cfg_noLT_idx, sample_rate=self.sample_rate, global_step=batch_idx + # ) + # logger.experiment.add_audio( + # "val/pred_audio_cfg_withLT", pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, global_step=batch_idx + # ) + + local_transformer_loss = batch_output['local_transformer_loss'] + val_output = { + 'val_loss': loss, + 'val_codebook_loss': codebook_loss, + 'val_local_transformer_loss': local_transformer_loss, + } + + if self.phoneme_tokenizer is not None: + phoneme_loss = batch_output['phoneme_loss'] + val_output['val_phoneme_loss'] = phoneme_loss + + self.validation_step_outputs.append(val_output) + + return val_output + + def on_validation_epoch_end(self): + collect = lambda key: torch.stack([x[key] for x in self.validation_step_outputs]).mean() + val_loss = collect("val_loss") + 
val_codebook_loss = collect("val_codebook_loss") + + self.log("val_loss", val_loss, prog_bar=True, sync_dist=True) + self.log("val/codebook_loss", val_codebook_loss, prog_bar=True, sync_dist=True) + + if self.local_transformer_type != LocalTransformerType.NO_LT: + val_local_transformer_loss = collect("val_local_transformer_loss") + self.log("val/local_transformer_loss", val_local_transformer_loss, prog_bar=True, sync_dist=True) + + if self.phoneme_tokenizer is not None: + val_phoneme_loss = collect("val_phoneme_loss") + self.log("val/phoneme_loss", val_phoneme_loss, prog_bar=True, sync_dist=True) + + self.validation_step_outputs.clear() # free memory + + def get_dataset(self, dataset_cfg, dataset_type): + dataset = instantiate( + dataset_cfg.dataset, + sample_rate=self.sample_rate, + bos_id=None, + eos_id=self.eos_id, + audio_bos_id=self.audio_bos_id, + audio_eos_id=self.audio_eos_id, + context_audio_bos_id=self.context_audio_bos_id, + context_audio_eos_id=self.context_audio_eos_id, + num_audio_codebooks=self.data_num_audio_codebooks, + codec_model_samples_per_frame=self.codec_model_samples_per_frame, + prior_scaling_factor=0.0, + load_cached_codes_if_available=self.cfg.load_cached_codes_if_available, + dataset_type=dataset_type, # train or test used for setting phone prob to 1.0 in test dataset (worker_init_fn) + use_text_conditioning_tokenizer=True, + text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, + pad_context_text_to_max_duration=self.pad_context_text_to_max_duration, + context_duration_min=self.cfg.context_duration_min, + context_duration_max=self.cfg.context_duration_max, + ) + dataset.load_16khz_audio = False + dataset.tokenizer_config = ( + self.cfg.text_tokenizers + ) # This will be used in worker_init_fn for instantiating tokenizer + if self.phoneme_tokenizer is not None: + dataset.phoneme_tokenizer_config = self.cfg.phoneme_tokenizer + + return dataset + + def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> 
torch.utils.data.DataLoader: + # TODO @xueyang: better to distinguish cfg. self.cfg is the model cfg, while cfg here is train_ds cfg. Also + # cfg is a classifier-free guidance. + dataset = MagpieTTSLhotseDataset( + sample_rate=self.sample_rate, + volume_norm=dataset_cfg.volume_norm, + codec_model_samples_per_frame=self.codec_model_samples_per_frame, + audio_bos_id=self.audio_bos_id, + audio_eos_id=self.audio_eos_id, + context_audio_bos_id=self.context_audio_bos_id, + context_audio_eos_id=self.context_audio_eos_id, + num_audio_codebooks=self.data_num_audio_codebooks, + prior_scaling_factor=0.0, + load_cached_codes_if_available=self.cfg.load_cached_codes_if_available, + dataset_type=mode, # train or test used for setting phone prob to 1.0 in test dataset (worker_init_fn) + load_16khz_audio=False, + pad_context_text_to_max_duration=self.pad_context_text_to_max_duration, + context_duration_min=self.cfg.context_duration_min, + context_duration_max=self.cfg.context_duration_max, + use_text_conditioning_tokenizer=True, + text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, + tokenizer_config=self.cfg.text_tokenizers, + phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None) + ) + + data_loader = get_lhotse_dataloader_from_config( + config=dataset_cfg.dataset, + global_rank=self.global_rank, + world_size=self.world_size, + dataset=dataset, + ) + return data_loader + + def setup_training_data(self, dataset_cfg): + if dataset_cfg.get("use_lhotse", False): + # TODO @xueyang: better to distinguish cfg. self.cfg is the model cfg, while cfg here is train_ds cfg. Also + # cfg is a classifier-free guidance. 
+ self._train_dl = self.get_lhotse_dataloader(dataset_cfg, mode='train') + else: + dataset = self.get_dataset(dataset_cfg, dataset_type='train') + sampler = dataset.get_sampler(dataset_cfg.dataloader_params.batch_size, world_size=self.trainer.world_size) + persistent_workers = True + if dataset_cfg.dataloader_params.num_workers == 0: + persistent_workers = False + # For num workers > 0 tokenizer will be assigned in worker_init_fn (since it is not picklable) + dataset.text_tokenizer = setup_tokenizers( + all_tokenizers_config=self.cfg.text_tokenizers, + mode='train', + ) + if self.cfg.get("phoneme_tokenizer", None) is not None: + dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) + + self._train_dl = torch.utils.data.DataLoader( + dataset, + collate_fn=dataset.collate_fn, + sampler=sampler, + **dataset_cfg.dataloader_params, + worker_init_fn=worker_init_fn, + persistent_workers=persistent_workers, + ) + + def _setup_test_dataloader(self, dataset_cfg) -> torch.utils.data.DataLoader: + if dataset_cfg.get("use_lhotse", False): + data_loader = self.get_lhotse_dataloader(dataset_cfg, mode='test') + else: + dataset = self.get_dataset(dataset_cfg, dataset_type='test') + persistent_workers = True + if dataset_cfg.dataloader_params.num_workers == 0: + persistent_workers = False + # For num workers > 0 tokenizer will be assigned in worker_init_fn (since it is not picklable) + dataset.text_tokenizer = setup_tokenizers( + all_tokenizers_config=self.cfg.text_tokenizers, + mode='test' + ) + if self.cfg.get("phoneme_tokenizer", None) is not None: + dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) + + data_loader = torch.utils.data.DataLoader( + dataset, + collate_fn=dataset.collate_fn, + **dataset_cfg.dataloader_params, + worker_init_fn=worker_init_fn, + persistent_workers=persistent_workers, + ) + return data_loader + + def setup_validation_data(self, cfg): + self._validation_dl = 
self._setup_test_dataloader(cfg) + + def setup_test_data(self, cfg): + self._test_dl = self._setup_test_dataloader(cfg) + + def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, use_local_transformer_for_inference=False, maskgit_n_steps=3, use_cfg=False, cfg_scale=1.0, phoneme_input_type='gt', phoneme_sampling_method='argmax', dropout_text_input=False): + with torch.inference_mode(): + start_time = time.time() + context_tensors = self.prepare_context_tensors(batch, dropout_text_input=dropout_text_input) + context_embedding = context_tensors['context_embedding'] # (B, T_total, E) + context_lens = context_tensors['context_lens'] # (B,) + remaining_text_embedded = context_tensors['remaining_text_embedded'] + remaining_text_lens = context_tensors['remaining_text_lens'] + + if self.phoneme_tokenizer is not None: + context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay + phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], + batch['phoneme_tokens_lens'], + context_lens_for_phonemes + ) + phoneme_channel_input_pad_tensor = torch.zeros(phoneme_channel_input.size(0), max_decoder_steps, phoneme_channel_input.size(2), device=phoneme_channel_input.device) + phoneme_channel_input = torch.cat([phoneme_channel_input, phoneme_channel_input_pad_tensor], dim=1) + + audio_codes_bos = torch.full( + (context_embedding.size(0), self.num_audio_codebooks * self.frame_stacking_factor, 1), self.audio_bos_id, device=context_embedding.device + ).long() + audio_codes_lens = torch.full((context_embedding.size(0),), 1, device=context_embedding.device).long() + audio_codes_input = audio_codes_bos + + audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) + if self.text_input_mode == 'streaming': + remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 + 
remaining_text_pad_tensor = torch.zeros(remaining_text_embedded.size(0), remaining_text_pad_length, remaining_text_embedded.size(2), device=remaining_text_embedded.device) + remaining_text_embedded = torch.cat([remaining_text_embedded, remaining_text_pad_tensor], dim=1) + audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded[:, :1, :] # :1 corresponds to audio BOS. + + context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( + embeddings=[context_embedding, audio_codes_input_embedded], + lengths=[context_lens, audio_codes_lens], + ) + min_context_len = context_plus_audio_lens.min().item() + if self.phoneme_tokenizer is not None: + min_context_len = min_context_len - self.streaming_speech_delay + self.streaming_phonemes_delay - 1 # 1 for audio BOS that we had added. + + actual_batch_size = context_embedding.size(0) + if use_cfg: + dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( + torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=context_embedding.device) + ) # (B, 1, E) + dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) + + dummy_context_plus_audio_embedded, _ = self.join_embeddings_temporally( + embeddings=[dummy_context_embedding_unconditional_expanded, audio_codes_input_embedded], + lengths=[context_lens, audio_codes_lens], + ) + first_inference_input = torch.cat( + [context_plus_audio_embedded, dummy_context_plus_audio_embedded], + dim=0 + )[:,:min_context_len, :] # (2B, T_min, E) + else: + first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) + # First forward pass to get the initial hidden state and past key values + transformer_out = self.forward( + inputs_embeds=first_inference_input, + attention_mask=None, + use_cache=True, + past_key_values=None, # No past key values for the first step + ) + + time_to_first_prediction = 
time.time() - start_time + last_hidden = transformer_out.last_hidden_state # (B, T_total, E) + past_kv = transformer_out.past_key_values + + all_predictions = [] + end_indices = {} + + current_text_positions = [] + for item_idx in range(context_embedding.size(0)): + # 0 if we have started reading the remaining text otherwise negative (indicating how far we are before we start reading the remaining text) + current_text_positions.append(min_context_len - context_plus_audio_lens[item_idx]) + current_text_positions = torch.tensor(current_text_positions, device=context_embedding.device).long() + if self.phoneme_tokenizer is not None: + current_phoneme_positions = current_text_positions - current_text_positions.max() - 1 # Make it 0-indexed. + # current_text_positions = current_text_positions - self.streaming_speech_delay + self.streaming_phonemes_delay + pred_phoneme_token_lists = [ + [] for _ in range(actual_batch_size) + ] + gt_phoneme_token_lists = [ + [] for _ in range(actual_batch_size) + ] + phoneme_stream_ended = torch.zeros(actual_batch_size, device=context_embedding.device).bool() # (B,) Whether phoneme stream has ended for this item. 
+ for idx in range(max_decoder_steps): + # import ipdb; ipdb.set_trace() + current_text_positions += 1 + if self.phoneme_tokenizer is not None: + current_phoneme_positions += 1 + print("current_phoneme_positions", current_phoneme_positions) + if idx % 20 == 0: + print(f"Decoding timestep {idx}") + + all_code_logits_t = self.final_proj(last_hidden[:, -1, :]) # (B, num_codebooks * num_tokens_per_codebook) + + if self.phoneme_tokenizer is not None: + all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) # (B, phoneme_stacking_factor * phoneme_vocab_size) + all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + + if use_cfg: + conditional_logits = all_code_logits_t[:actual_batch_size] + unconditional_logits = all_code_logits_t[actual_batch_size:] + all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + + if use_local_transformer_for_inference: + if self.local_transformer_type == LocalTransformerType.AR : + # Autoregressive sampling with local transformer + audio_codes_next = self.local_transformer_sample_autoregressive( + dec_output=last_hidden[:, -1, :], + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + elif self.local_transformer_type == LocalTransformerType.MASKGIT: + audio_codes_next = self.local_transformer_sample_maskgit( + dec_output=last_hidden[:, -1, :], + temperature=temperature, + topk=topk, + n_steps=maskgit_n_steps, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + else: + raise ValueError(f"Local transformer inference requested by but local transformer type is {self.local_transformer_type}") + # TODO @rfejgin: should we add argmax sampling for EOS here too? 
+ all_codes_next_argmax = audio_codes_next + else: + # Parallel sampling from logits + audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) # (B, num_codebooks) + all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) # (B, num_codebooks) + + phoneme_channel_input_t = None + + if self.phoneme_tokenizer is not None: + all_codes_next_phoneme = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=temperature, topk=topk) # (B, phoneme_stacking_factor) + all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.01) # (B, phoneme_stacking_factor) + pred_phoneme_tokens = all_codes_next_phoneme_argmax if phoneme_sampling_method == 'argmax' else all_codes_next_phoneme # B, phoneme_stacking_factor + phoneme_bos_tensor = torch.full( + (actual_batch_size, self.phoneme_stacking_factor), + self.phoneme_tokenizer.bos_token_id, + device=context_embedding.device + ).long() # (B, phoneme_stacking_factor) + use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() + print("use_bos_phoneme", use_bos_phoneme) + pred_phoneme_tokens = (use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens).long() # (B, phoneme_stacking_factor) + + print("pred_phoneme_tokens", pred_phoneme_tokens) + gt_phoneme_idx = min(idx, gt_phoneme_tokens.size(2) - 1) + gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) + print("gt_phoneme_tokens_current", gt_phoneme_tokens_current) + + input_phoneme_tokens_current = gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens + input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) # (B, phoneme_stacking_factor, E) + + use_phoneme_input = (current_phoneme_positions >= 0) * (~phoneme_stream_ended) # (B,) + use_phoneme_input = 
use_phoneme_input.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) + zero_phoneme_embedding = torch.zeros(actual_batch_size, self.cfg.embedding_dim, device=all_codes_next_phoneme.device).unsqueeze(1) # (B, 1, E) + # phoneme_channel_input_t = phoneme_channel_input[torch.arange(actual_batch_size), current_phoneme_positions.clamp(min=0) + min_context_len, :].unsqueeze(1) # (B, 1, E) + phoneme_channel_input_t = use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + print("use_phoneme_input", use_phoneme_input) + for item_idx in range(actual_batch_size): + if use_phoneme_input[item_idx,0,0] > 0: + for phoneme_channel_idx in range(self.phoneme_stacking_factor): + _phoneme_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() + if _phoneme_token not in [self.phoneme_tokenizer.eos_token_id, self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.pad]: + pred_phoneme_token_lists[item_idx].append(_phoneme_token) + + _gt_phoneme_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() + if _gt_phoneme_token not in [self.phoneme_tokenizer.eos_token_id, self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.pad]: + gt_phoneme_token_lists[item_idx].append(_gt_phoneme_token) + + if torch.any(input_phoneme_tokens_current[item_idx] == self.phoneme_tokenizer.eos_token_id): + print("Phoneme end detected for item {} at timestep {}".format(item_idx, idx)) + phoneme_stream_ended[item_idx] = True + all_codes_next_phoneme = all_codes_next_phoneme.unsqueeze(1) + # import ipdb; ipdb.set_trace() + + for item_idx in range(all_codes_next_argmax.size(0)): + if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: + pred_tokens = all_codes_next_argmax[item_idx] + pred_tokens_multinomial = audio_codes_next[item_idx] + if torch.any(pred_tokens == self.audio_eos_id) or torch.any(pred_tokens_multinomial == self.audio_eos_id): + print("End detected for item {} at timestep 
{}".format(item_idx, idx)) + end_indices[item_idx] = idx + + all_predictions.append(audio_codes_next) + + new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) + new_emb_unconditional = new_emb * 1 + + if self.text_input_mode == 'streaming': + _bs = context_embedding.size(0) + remaining_text_embedded_current = remaining_text_embedded[torch.arange(_bs), current_text_positions.clamp(min=0) , :].unsqueeze(1) # (B, 1, E) + new_emb = new_emb + remaining_text_embedded_current + + + context_incomplete_mask = context_plus_audio_lens > idx + min_context_len # (B,) + # import ipdb; ipdb.set_trace() + # True if we have not yet reached the end of the context for this item + # import ipdb; ipdb.set_trace() + if context_incomplete_mask.any(): + # If some contexts are not yet complete. + context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) + context_embedding = context_plus_audio_embedded[:,min_context_len+idx:min_context_len+idx+1,:] # (B, 1, E) + next_input = context_incomplete_mask * context_embedding + (1 - context_incomplete_mask) * new_emb + if phoneme_channel_input_t is not None: + next_input += phoneme_channel_input_t + if use_cfg: + next_input_unconditional = context_incomplete_mask * dummy_context_embedding_unconditional + (1 - context_incomplete_mask) * new_emb_unconditional + next_input = torch.cat([next_input, next_input_unconditional], dim=0) # (2B, 1, E) + else: + next_input = new_emb + if phoneme_channel_input_t is not None: + next_input += phoneme_channel_input_t + if use_cfg: + next_input = torch.cat([next_input, new_emb_unconditional], dim=0) # (2B, 1, E) + + transformer_out = self.forward( + inputs_embeds=next_input, + attention_mask=None, + use_cache=True, + past_key_values=past_kv, + ) + last_hidden = transformer_out.last_hidden_state + past_kv = transformer_out.past_key_values + if len(end_indices) == audio_codes_next.size(0): + print("All items finished at timestep {}".format(idx)) + break + 
+ if self.phoneme_tokenizer is not None: + for item_idx in range(actual_batch_size): + print("Predicted phoneme tokens for item {}: {}".format(item_idx, pred_phoneme_token_lists[item_idx])) + print("GT phoneme tokens for item {}: {}".format(item_idx, gt_phoneme_token_lists[item_idx])) + predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) + gt_phoneme_text = self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) + print("Predicted phoneme text for item {}: {}".format(item_idx, predicted_phoneme_text)) + print("GT phoneme text for item {}: {}".format(item_idx, gt_phoneme_text)) + + tts_generation_time = time.time() - start_time + tts_generation_time_per_frame = tts_generation_time / len(all_predictions) + pred_codes_start_indices = context_plus_audio_lens - min_context_len # (B,) + predicted_lens = [end_indices.get(idx, max_decoder_steps) for idx in range(context_embedding.size(0))] # Ensure that the codec is at least of length 4 + predicted_codes_lens = torch.tensor(predicted_lens, device=context_embedding.device).long() + predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices # (B,) + + predicted_codes = torch.stack(all_predictions, dim=-1) # (B, num_codebooks, T) + predicted_codes = self.slice_pred_embeddings( + predicted_codes.permute(0, 2, 1), + context_lens=pred_codes_start_indices, + target_lens=predicted_codes_lens, + ) + predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) + predicted_audio, predicted_audio_lens = self.codes_to_audio(predicted_codes, predicted_codes_lens) + + end_time = time.time() + total_audio_duration_generated = (predicted_audio_lens.max().item() * predicted_audio_lens.shape[0])/self.sample_rate + rtf = total_audio_duration_generated / (end_time - start_time) + + rtf_metrics = { + 'rtf': rtf, + 'time_to_first_prediction': time_to_first_prediction, + 'tts_generation_time': tts_generation_time, + 'max_frames_generated': len(all_predictions), + 
'tts_generation_time_per_frame': tts_generation_time_per_frame, + 'batch_size': context_embedding.size(0), + } + + return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics + + + + @classmethod + def list_available_models(cls) -> List[PretrainedModelInfo]: + return [] + diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 369380768566..cc3083f30e2c 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -26,15 +26,15 @@ import shutil import time from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import soundfile as sf import torch from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer -from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset -from nemo.collections.tts.models import MagpieTTSModel +from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset +from nemo.collections.tts.models import MagpieTTSModel, MagpieTTSDecoderModel from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging @@ -73,6 +73,12 @@ class InferenceConfig: maskgit_fixed_schedule: Optional[List[int]] = None maskgit_sampling_type: Optional[str] = None + # Decoder-only inference options + phoneme_input_type: str = "gt" # gt or predicted + phoneme_sampling_method: str = "argmax" # argmax or multinomial + dropout_text_input: bool = False + + is_decoder_only_model: bool = False def build_identifier(self) -> str: """Build a unique identifier string 
for this configuration. @@ -127,8 +133,8 @@ class MagpieInferenceRunner: """ def __init__( - self, - model: MagpieTTSModel, + self,# model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel + model: Union[MagpieTTSModel, MagpieTTSDecoderModel], config: InferenceConfig, ): """Initialize the inference runner. @@ -151,7 +157,8 @@ def _configure_tokenizer(self) -> None: """Configure the tokenizer for inference (phoneme prob = 1.0).""" g2p = None if isinstance(self.model.tokenizer, AggregatedTTSTokenizer): - g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p + if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr(self.model.tokenizer.tokenizers["english_phoneme"], "g2p"): + g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p elif isinstance(self.model.tokenizer, IPATokenizer): g2p = self.model.tokenizer.g2p @@ -163,13 +170,12 @@ def create_dataset( dataset_meta: dict, context_duration_min: Optional[float] = None, context_duration_max: Optional[float] = None, - ) -> ChunkedTTSInferenceDataset: - """Create a unified dataset for inference. + ) -> Union[ChunkedTTSInferenceDataset, MagpieTTSDataset]: + """Create an inference dataset. - Always creates ChunkedTTSInferenceDataset which uses language-aware chunking - to automatically handle both short and long texts: - - Short text (below threshold): processed as single chunk - - Long text (above threshold): split into sentence chunks + Standard MagpieTTS uses the chunked inference dataset from `main`. + Decoder-only MagpieTTS uses the regular dataset and its dedicated + `infer_batch()` inference path. Args: dataset_meta: Dataset metadata dictionary with 'manifest_path' and 'audio_dir'. 
@@ -199,11 +205,35 @@ def create_dataset( self._manifest_records = read_manifest(manifest_path) self._audio_base_dir = audio_dir + if self.config.is_decoder_only_model: + logging.info("Creating standard inference dataset for decoder-only model") + dataset = MagpieTTSDataset( + dataset_meta=dataset_meta, + sample_rate=self.model.sample_rate, + min_duration=0.5, + max_duration=20, + codec_model_samples_per_frame=self.model.codec_model_samples_per_frame, + bos_id=getattr(self.model, "bos_id", None), + eos_id=self.model.eos_id, + num_audio_codebooks=self.model.num_audio_codebooks, + prior_scaling_factor=None, + load_cached_codes_if_available=False, + dataset_type='test', + tokenizer_config=None, + load_16khz_audio=False, + use_text_conditioning_tokenizer=True, + text_conditioning_tokenizer_name=self.model.text_conditioning_tokenizer_name, + pad_context_text_to_max_duration=False, + context_duration_min=context_duration_min, + context_duration_max=context_duration_max, + ) + dataset.text_tokenizer = self.model.tokenizer + else: + logging.info("Creating unified inference dataset") + dataset = self._create_chunked_inference_dataset(dataset_meta, context_duration_min, context_duration_max) - # Always use unified dataset (handles both short and long texts automatically) - # Language for chunking thresholds is determined per-sample from manifest - logging.info("Creating unified inference dataset") - dataset = self._create_chunked_inference_dataset(dataset_meta, context_duration_min, context_duration_max) + if hasattr(self.model, 'phoneme_tokenizer'): + dataset.phoneme_tokenizer = self.model.phoneme_tokenizer return dataset @@ -217,10 +247,7 @@ def run_inference_on_dataset( save_context_audio: bool = True, save_predicted_codes: bool = True, ) -> Tuple[List[dict], List[str], List[str]]: - """Run unified inference on a dataset. 
- - Uses the unified inference path that automatically handles both short texts - (single chunk) and long texts (multiple chunks) through the same code path. + """Run inference on a dataset. Args: dataset: The inference dataset (created by create_dataset()). @@ -248,12 +275,91 @@ def run_inference_on_dataset( raise ValueError("audio_base_dir not provided and not cached from create_dataset()") audio_base_dir = self._audio_base_dir - # Always use unified inference path + if self.config.is_decoder_only_model: + logging.info("Using decoder-only inference path") + return self._run_decoder_only_inference( + dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes + ) + logging.info("Using unified inference path") return self._run_unified_inference( dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes ) + def _run_decoder_only_inference( + self, + dataset: MagpieTTSDataset, + output_dir: str, + manifest_records: List[dict], + audio_base_dir: str, + save_context_audio: bool = True, + save_predicted_codes: bool = True, + ) -> Tuple[List[dict], List[str], List[str]]: + """Run inference for decoder-only models via `infer_batch()`.""" + os.makedirs(output_dir, exist_ok=True) + self._delete_old_generated_files(output_dir) + + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=self.config.batch_size, + collate_fn=dataset.collate_fn, + num_workers=0, + shuffle=False, + ) + + all_rtf_metrics = [] + generated_audio_paths = [] + codec_file_paths = [] + item_idx = 0 + phoneme_sampling_method = ( + "argmax" if self.config.phoneme_sampling_method == "greedy" else self.config.phoneme_sampling_method + ) + + for batch_idx, batch in enumerate(dataloader): + logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") + batch_cuda = self._batch_to_cuda(batch) + + predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics = self.model.infer_batch( + 
batch_cuda, + max_decoder_steps=self.config.model_inference_parameters.max_decoder_steps, + temperature=self.config.model_inference_parameters.temperature, + topk=self.config.model_inference_parameters.topk, + use_local_transformer_for_inference=self.config.use_local_transformer, + maskgit_n_steps=self.config.maskgit_n_steps, + use_cfg=self.config.use_cfg, + cfg_scale=self.config.model_inference_parameters.cfg_scale, + phoneme_input_type=self.config.phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + dropout_text_input=self.config.dropout_text_input, + ) + + all_rtf_metrics.append(rtf_metrics) + logging.info(f"Output shape: {predicted_audio.size()}") + + for idx in range(predicted_audio.size(0)): + audio_len = predicted_audio_lens[idx].item() + audio_np = predicted_audio[idx].float().detach().cpu().numpy()[:audio_len] + audio_path = os.path.join(output_dir, f"predicted_audio_{item_idx}.wav") + sf.write(audio_path, audio_np, self.model.sample_rate) + generated_audio_paths.append(audio_path) + + if save_context_audio and item_idx < len(manifest_records): + self._copy_reference_audio( + manifest_records[item_idx], + audio_base_dir, + output_dir, + item_idx, + ) + + if save_predicted_codes: + code_len = predicted_codes_lens[idx].item() + codes_path = os.path.join(output_dir, f"predicted_codes_{item_idx}.pt") + torch.save(predicted_codes[idx, :, :code_len].detach().cpu(), codes_path) + codec_file_paths.append(codes_path) + + item_idx += 1 + + return all_rtf_metrics, generated_audio_paths, codec_file_paths @staticmethod def _batch_to_cuda(batch: dict) -> dict: """Move batch tensors to CUDA device.""" diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index 647a8ea66a06..e0cd4c2714be 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -23,12 +23,12 @@ import os from dataclasses import 
dataclass -from typing import Dict, Optional, Tuple +from typing import Dict, Optional, Tuple, Union import torch from omegaconf import DictConfig, OmegaConf, open_dict -from nemo.collections.tts.models import MagpieTTSModel +from nemo.collections.tts.models import MagpieTTSModel, MagpieTTSDecoderModel from nemo.utils import logging @@ -253,7 +253,7 @@ def update_checkpoint_state_dict(state_dict: dict) -> dict: return new_state_dict -def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[MagpieTTSModel, str]: +def load_magpie_model(config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False) -> Tuple[Union[MagpieTTSModel, MagpieTTSDecoderModel], str]: """Load a MagpieTTS model from checkpoint or NeMo archive. Supports two loading modes: @@ -271,7 +271,7 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma ValueError: If configuration is invalid or sample rates don't match. """ config.validate() - + model_cls = MagpieTTSDecoderModel if is_decoder_only_model else MagpieTTSModel if config.hparams_file is not None and config.checkpoint_file is not None: # Mode 1: Load from hparams + checkpoint model_cfg = OmegaConf.load(config.hparams_file) @@ -290,7 +290,7 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma config.legacy_text_conditioning, ) - model = MagpieTTSModel(cfg=model_cfg) + model = model_cls(cfg=model_cfg) model.use_kv_cache_for_inference = True # Load weights @@ -302,15 +302,15 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma checkpoint_name = os.path.basename(config.checkpoint_file).replace(".ckpt", "") else: - if config.nemo_file.startswith("nvidia/"): # TODO @xueyang: why ignore `update_config_for_inference`? 
- model = MagpieTTSModel.from_pretrained(config.nemo_file) + if config.nemo_file.startswith("nvidia/"): + model = model_cls.from_pretrained(config.nemo_file) model.use_kv_cache_for_inference = True checkpoint_name = config.nemo_file.split("/")[-1] cfg_sample_rate = None else: # Mode 2: Load from .nemo archive logging.info(f"Loading model from NeMo archive: {config.nemo_file}") - model_cfg = MagpieTTSModel.restore_from(config.nemo_file, return_config=True) + model_cfg = model_cls.restore_from(config.nemo_file, return_config=True) with open_dict(model_cfg): model_cfg, cfg_sample_rate = update_config_for_inference( @@ -320,7 +320,7 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma config.legacy_text_conditioning, ) - model = MagpieTTSModel.restore_from(config.nemo_file, override_config_path=model_cfg) + model = model_cls.restore_from(config.nemo_file, override_config_path=model_cfg) model.use_kv_cache_for_inference = True checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") From 156f16fc16f26fc7c0c14c51a06c8c618819182e Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 8 Jan 2026 17:52:45 -0500 Subject: [PATCH 02/94] merge wit main again Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 1b60b4b7b6ed..0a346865dcba 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -73,6 +73,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): # load codec codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) self.sample_rate = codec_model.sample_rate + self.output_sample_rate = codec_model.output_sample_rate if hasattr(codec_model, "discriminator"): # del codec discriminator to 
free memory @@ -1449,6 +1450,7 @@ def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, use_local_transformer_for_inference=False, maskgit_n_steps=3, use_cfg=False, cfg_scale=1.0, phoneme_input_type='gt', phoneme_sampling_method='argmax', dropout_text_input=False): + # TODO: Make this API same as MagpieTTS model. with torch.inference_mode(): start_time = time.time() context_tensors = self.prepare_context_tensors(batch, dropout_text_input=dropout_text_input) @@ -1718,7 +1720,7 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us 'tts_generation_time_per_frame': tts_generation_time_per_frame, 'batch_size': context_embedding.size(0), } - + return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics From 6ba36996062fb56567cfa47c7cf89ce2af22155f Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 8 Jan 2026 22:12:37 +0000 Subject: [PATCH 03/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- examples/tts/magpietts_decoder_only.py | 2 +- examples/tts/magpietts_inference.py | 8 +- .../tts/data/text_to_speech_dataset.py | 8 +- .../tts/data/text_to_speech_dataset_lhotse.py | 12 +- .../tts/models/magpietts_decoder_only.py | 815 +++++++++++------- .../modules/magpietts_inference/inference.py | 8 +- .../tts/modules/magpietts_inference/utils.py | 6 +- 7 files changed, 539 insertions(+), 320 deletions(-) diff --git a/examples/tts/magpietts_decoder_only.py b/examples/tts/magpietts_decoder_only.py index 44859fee8d64..73bb87de7969 100644 --- a/examples/tts/magpietts_decoder_only.py +++ b/examples/tts/magpietts_decoder_only.py @@ -54,4 +54,4 @@ def main(cfg): if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter \ No newline at end of file + main() # noqa pylint: disable=no-value-for-parameter diff --git a/examples/tts/magpietts_inference.py 
b/examples/tts/magpietts_inference.py index d8d9e883fd04..3199f58e9970 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -190,7 +190,9 @@ def run_inference_and_evaluation( violin_plot_metrics.remove('utmosv2') # Load model - model, checkpoint_name = load_magpie_model(model_config, is_decoder_only_model=inference_config.is_decoder_only_model) + model, checkpoint_name = load_magpie_model( + model_config, is_decoder_only_model=inference_config.is_decoder_only_model + ) # Log architecture summary and get MoE info + FLOPs metrics moe_info, flops_per_component = log_model_architecture_summary(model) @@ -504,7 +506,9 @@ def create_argument_parser() -> argparse.ArgumentParser: target_group.add_argument('--ssim_target', type=float, default=None) target_group.add_argument('--is_decoder_only_model', action='store_true') target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) - target_group.add_argument('--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial']) + target_group.add_argument( + '--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial'] + ) target_group.add_argument('--dropout_text_input', action='store_true') return parser diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index 789636a569e3..254169f621c6 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -437,7 +437,9 @@ def __getitem__(self, index): if self.phoneme_tokenizer is not None: phoneme_tokens = self.phoneme_tokenizer.encode(data.text) - phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + phoneme_tokens = ( + [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + ) phoneme_tokens_len = len(phoneme_tokens) 
example["phoneme_tokens"] = torch.tensor(phoneme_tokens, dtype=torch.int32) example["phoneme_tokens_len"] = phoneme_tokens_len @@ -727,7 +729,9 @@ def collate_fn(self, batch: List[dict]): if len(phoneme_tokens_list) > 0: batch_phoneme_tokens_len = torch.IntTensor(phoneme_tokens_len_list) phoneme_tokens_max_len = int(batch_phoneme_tokens_len.max().item()) - batch_phoneme_tokens = stack_tensors(phoneme_tokens_list, max_lens=[phoneme_tokens_max_len], pad_value=self.phoneme_tokenizer.pad) + batch_phoneme_tokens = stack_tensors( + phoneme_tokens_list, max_lens=[phoneme_tokens_max_len], pad_value=self.phoneme_tokenizer.pad + ) batch_dict['phoneme_tokens'] = batch_phoneme_tokens batch_dict['phoneme_tokens_lens'] = batch_phoneme_tokens_len diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 4bd378151b9a..9bad7a36e44a 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -59,6 +59,7 @@ def setup_tokenizers(all_tokenizers_config, mode='train'): return aggregated_tokenizer + def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): phoneme_tokenizer = instantiate(phoneme_tokenizer_config) phoneme_vocab_size = len(phoneme_tokenizer.tokens) @@ -67,6 +68,7 @@ def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): phoneme_tokenizer.vocab_size = phoneme_vocab_size + 2 return phoneme_tokenizer + def check_speaker_format(item: str): # enforce the format as example like "| Language:en Dataset:HiFiTTS Speaker:9136_other |". 
pattern = r"\| Language:\w+ Dataset:[\w\d\W]+ Speaker:[\w\d\W]+ \|" @@ -410,7 +412,9 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: if self.phoneme_tokenizer is not None: phoneme_tokens = self.phoneme_tokenizer.encode(text_str) - phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + phoneme_tokens = ( + [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + ) phoneme_tokens_len = len(phoneme_tokens) phoneme_token_list.append(torch.tensor(phoneme_tokens, dtype=torch.int32)) phoneme_token_len_list.append(phoneme_tokens_len) @@ -435,9 +439,11 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: } if self.phoneme_tokenizer is not None: - batch_dict["phoneme_tokens"] = collate_vectors(phoneme_token_list, padding_value=self.phoneme_tokenizer.pad) + batch_dict["phoneme_tokens"] = collate_vectors( + phoneme_token_list, padding_value=self.phoneme_tokenizer.pad + ) batch_dict["phoneme_tokens_lens"] = torch.IntTensor(phoneme_token_len_list) - + # audio for SV. if len(audio_list_16khz) > 0: batch_dict["audio_16khz"] = collate_vectors(audio_list_16khz, padding_value=0.0) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 0a346865dcba..f5f5be0522a6 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -11,38 +11,40 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import random +import time +from functools import partial from typing import List, Sequence, Tuple + import torch import wandb from hydra.utils import instantiate -from functools import partial from lightning.pytorch import Trainer from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger from omegaconf import DictConfig from torch import nn from torch.utils.data import get_worker_info +from transformers import AutoConfig, AutoModel, AutoModelForCausalLM from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import MagpieTTSLhotseDataset, setup_tokenizers, instantiate_phoneme_tokenizer - +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( + MagpieTTSLhotseDataset, + instantiate_phoneme_tokenizer, + setup_tokenizers, +) from nemo.collections.tts.models import AudioCodecModel from nemo.collections.tts.modules import transformer_2501 - -from nemo.collections.tts.modules.magpietts_modules import CharAwareSubwordEncoder, SpecialAudioToken, LocalTransformerType, cosine_schedule +from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter +from nemo.collections.tts.modules.magpietts_modules import ( + CharAwareSubwordEncoder, + LocalTransformerType, + SpecialAudioToken, + cosine_schedule, +) from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths - from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging -from transformers import ( - AutoConfig, - AutoModel, - AutoModelForCausalLM -) -import time -from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter -import random - def worker_init_fn(worker_id): @@ -51,9 +53,7 @@ def worker_init_fn(worker_id): logging.info(f"Worker {worker_id} initializing...") worker_info = get_worker_info() dataset = worker_info.dataset # Get the dataset instance in this worker - 
tokenizer = setup_tokenizers( - dataset.tokenizer_config, mode=dataset.dataset_type - ) + tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) dataset.text_tokenizer = tokenizer if hasattr(dataset, 'phoneme_tokenizer_config'): dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) @@ -74,7 +74,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) self.sample_rate = codec_model.sample_rate self.output_sample_rate = codec_model.output_sample_rate - if hasattr(codec_model, "discriminator"): # del codec discriminator to free memory del codec_model.discriminator @@ -95,7 +94,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): data_num_audio_codebooks = num_audio_codebooks codebook_size = codec_model.codebook_size codec_converter = None - # The dataloader needs to know the number of codebooks that the context codes were stored in # In the case where there are no context codes saved, and there is no context audio (in the text context path), @@ -105,7 +103,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.num_audio_codebooks = num_audio_codebooks self.codebook_size = codebook_size - self.codec_model_samples_per_frame = codec_model.samples_per_frame # Our codebooks start with actual audio codec tokens, followed by special tokens. # The `forced_*` options are for backward compatibility for models trained with older code. 
@@ -136,7 +133,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): all_tokenizers_config=cfg.text_tokenizers, mode='train', ) - + num_tokens_tokenizer = len(self.tokenizer.tokens) num_tokens = num_tokens_tokenizer + 3 # +2 for BOS and EOS self.bos_id = num_tokens - 3 @@ -150,21 +147,20 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size - self.pad_context_text_to_max_duration = False super().__init__(cfg=cfg, trainer=trainer) # This needs to happen after super().__init__() self._codec_model = codec_model - self._codec_model.freeze() #Lightning does requires_grad = False and self.eval() + self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() self._codec_converter = codec_converter audio_embeddings = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) self.audio_embeddings = nn.ModuleList(audio_embeddings) - + if self.phoneme_tokenizer is not None: phoneme_embeddings = [] for _ in range(self.phoneme_stacking_factor): @@ -172,13 +168,15 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - if cfg.transformer_hf_backend == "custom_qwen3_moe": # from transformers.models import qwen3_moe # config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(intermediate_size=3072, num_hidden_layers=5, num_experts=64) # self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) from transformers.models import qwen2_moe - config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32) + + config_qwen2 = 
qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 + ) self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) else: self.transformer_backend_config = AutoConfig.from_pretrained( @@ -192,7 +190,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) - + if self.use_bpe_char_tokenizer: # BPE char tokenizer assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" @@ -210,10 +208,12 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): d_embed=cfg.embedding_dim, llm_tokenizer_vocab=subword_vocab, subword_padding_idx=self.tokenizer.pad, - special_vocab=special_vocab + special_vocab=special_vocab, ) - self.final_proj = nn.Linear(cfg.hidden_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor) + self.final_proj = nn.Linear( + cfg.hidden_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor + ) self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') self.local_transformer_type = LocalTransformerType(cfg.get('local_transformer_type', 'none').lower()) @@ -227,7 +227,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.local_transformer = transformer_2501.Transformer( n_layers=self.cfg.get('local_transformer_n_layers', 2), d_model=local_transformer_hidden_dim, - d_ffn=local_transformer_hidden_dim*4, + d_ffn=local_transformer_hidden_dim * 4, sa_n_heads=self.cfg.get('local_transformer_n_heads', 1), kernel_size=1, is_causal=self.local_transformer_type == LocalTransformerType.AR, @@ -237,10 +237,11 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): local_transformer_out_projections = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): # Have a 
separate projection layer for each codebook, to distinguish between them - local_transformer_out_projections.append(nn.Linear(local_transformer_hidden_dim, self.num_all_tokens_per_codebook)) + local_transformer_out_projections.append( + nn.Linear(local_transformer_hidden_dim, self.num_all_tokens_per_codebook) + ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) - def state_dict(self, destination=None, prefix='', keep_vars=False): """ Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model @@ -255,7 +256,7 @@ def state_dict(self, destination=None, prefix='', keep_vars=False): if any([substring in key for substring in keys_substrings_to_exclude]): del state_dict[key] return state_dict - + def load_state_dict(self, state_dict, strict=True): """ Modify load_state_dict so that we don't restore weights to _speaker_verification_model and _codec_model when @@ -276,7 +277,7 @@ def load_state_dict(self, state_dict, strict=True): for key in state_dict.keys(): name_with_dot = f"{name}." 
if key.startswith(name_with_dot): - new_state_dict[key[len(name_with_dot):]] = state_dict[key] + new_state_dict[key[len(name_with_dot) :]] = state_dict[key] child.load_state_dict(new_state_dict) def audio_to_codes(self, audio, audio_len, audio_type='target'): @@ -318,12 +319,14 @@ def codes_to_audio(self, codes, codes_len): if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: # Unstack the audio codes if they are stacked codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) - + if codes.size(2) < 5: # If the codes are too short, we need to pad them - codes = torch.cat([codes, torch.zeros(codes.size(0), codes.size(1), 5 - codes.size(2), device=codes.device)], dim=2).long() + codes = torch.cat( + [codes, torch.zeros(codes.size(0), codes.size(1), 5 - codes.size(2), device=codes.device)], dim=2 + ).long() codes_len = codes_len + 5 - codes.size(2) - + self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): # Make a copy to avoid modifying the original tensor if it's used elsewhere @@ -365,7 +368,7 @@ def embed_phoneme_tokens(self, phoneme_tokens): phoneme_embedding = phoneme_embedding + embedding phoneme_embedding = phoneme_embedding / phoneme_tokens.size(1) return phoneme_embedding - + def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): """ Predicts the logits for all codebooks using the local transformer. Used in both autoregressive (AR) and MaskGit (MG) modes. @@ -382,41 +385,45 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | Seq. 
Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - + dec_out: (B, T', E) audio_codes_target: (B, C, T') targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. (MaskGit) """ - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) + dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) local_transformer_input = [dec_out_all] for codebook_num in range(audio_codes_target.size(1)): - codes = audio_codes_target[:, codebook_num] # (B, T') - codes = codes.reshape(-1) # (B*T',) - codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', E) + codes = audio_codes_target[:, codebook_num] # (B, T') + codes = codes.reshape(-1) # (B*T',) + codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', E) local_transformer_input.append(codebook_embedding) - local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) - local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) - _mask = torch.ones( local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) + local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) + local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) + _mask = torch.ones( + local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device + ) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) if not targets_offset_by_one: # for autoregressive local transformer the 
target for index 0 is codebook 0, for index 1 is codebook 1, etc. - local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) + local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) else: # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. - local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) + local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) all_code_logits = [] for codebook_num in range(audio_codes_target.size(1)): # Using a separate projection layer for each codebook (to distinguish between them) # Checked the time - this loop is not taking much time (compared to the local transformer forward pass) - codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, codebook_num, :]) # (B*T', num_all_tokens_per_codebook) + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output[:, codebook_num, :] + ) # (B*T', num_all_tokens_per_codebook) all_code_logits.append(codebook_logits) - all_code_logits = torch.cat(all_code_logits, dim=1) # (B*T', num_codebooks * num_all_tokens_per_codebook) + all_code_logits = torch.cat(all_code_logits, dim=1) # (B*T', num_codebooks * num_all_tokens_per_codebook) all_code_logits = all_code_logits.view( audio_codes_target.size(0), audio_codes_target.size(2), -1 - ) # (B, T', C * num_all_tokens_per_codebook) + ) # (B, T', C * num_all_tokens_per_codebook) return all_code_logits @@ -425,13 +432,13 @@ def maskgit_create_random_mask(self, codes): Creates a mask where True indicates the positions that should be replaced with a MASK_TOKEN. """ # Codes: (B, C, T) - B,C,T = codes.shape + B, C, T = codes.shape # get a uniform random vector uniformly sampled from [0,1) ## Todo does it need to be inclusive on the right? 
- rand_values = torch.rand(B,T, device=codes.device) - # apply the cosine schedule + rand_values = torch.rand(B, T, device=codes.device) + # apply the cosine schedule frac_masked = cosine_schedule(rand_values) # how many positions to mask - n_masked = torch.ceil(frac_masked * C).long() # B,T + n_masked = torch.ceil(frac_masked * C).long() # B,T # start from all unmasked mask = torch.zeros_like(codes, dtype=torch.bool) # The code further below is the vectorized version of this: @@ -443,19 +450,19 @@ def maskgit_create_random_mask(self, codes): # # mask the top n_masked positions # mask[b, perm[:n_masked[b,t]], t] = True # - # Create random permutations - random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) - # Create a mask tensor where each position indicates if it should be masked + # Create random permutations + random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) + # Create a mask tensor where each position indicates if it should be masked mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) - mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) + mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) # Apply the random permutations to the mask mask = torch.gather(mask, 1, random_permutations) - - return mask # (B, C, T) - + + return mask # (B, C, T) + def maskgit_apply_random_mask(self, codes): # Randomly replaces some codes with the MASK_TOKEN with a proportion following the cosine schedule. - # Codes: (B, C, T) + # Codes: (B, C, T) mask = self.maskgit_create_random_mask(codes) ## replace some tokens with MASK_TOKEN codes_with_mask = torch.where(mask, self.mask_token_id, codes) @@ -466,7 +473,7 @@ def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=N Computes the audio codebook loss. 
Used by (1) The main Magpie-TTS transformer (2) The local transformer, for both autoregressive and MaskGit methods - + logits: (B, T', num_codebooks * num_tokens_per_codebook) audio_codes: (B, C, T') audio_codes_lens: (B,) @@ -483,8 +490,8 @@ def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=N if not loss_mask.any(): # Without this we were very rarely getting NaNs in the loss logging.warning("No tokens valid were found in compute_loss()!") - return torch.tensor(0.0, device=loss_mask.device), loss_mask - else: + return torch.tensor(0.0, device=loss_mask.device), loss_mask + else: # repeat loss mask for each codebook to simplify code below loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) total_codebook_loss = None @@ -523,7 +530,6 @@ def compute_phoneme_loss(self, logits, phoneme_tokens, phoneme_tokens_lens): total_phoneme_loss = total_phoneme_loss + phoneme_loss total_phoneme_loss = total_phoneme_loss / self.phoneme_stacking_factor return total_phoneme_loss, loss_mask - def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None): backend_out = self.decoder( @@ -534,7 +540,6 @@ def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_value ) # hidden_states = backend_out.last_hidden_state # (B, T_total, H) return backend_out - def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): # all_code_logits: (B, T', num_codebooks * num_tokens_per_codebook) @@ -555,7 +560,17 @@ def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): return all_preds - def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, unfinished_items={}, finished_items={}, use_cfg=False, cfg_scale=1.0, n_steps=3): + def local_transformer_sample_maskgit( + self, + dec_output, + temperature=0.7, + topk=80, + unfinished_items={}, + finished_items={}, + use_cfg=False, + cfg_scale=1.0, + n_steps=3, + ): """ Sample codes for one timestep from the local transformer 
using MaskGit. """ @@ -565,13 +580,15 @@ def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, device = dec_output.device # disable KV cache since our transformer is not causal self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input_init = self.local_transformer_in_projection(dec_output) # (B, 1, D) where D is the dimension of the local transformer + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input_init = self.local_transformer_in_projection( + dec_output + ) # (B, 1, D) where D is the dimension of the local transformer C = self.num_audio_codebooks B = dec_output.size(0) min_confidence = float("-inf") - max_confidence = 10000 # this needs to be large enough that unmasked items will always remain unmasked. # TODO @rfejgin: use float('inf')? + max_confidence = 10000 # this needs to be large enough that unmasked items will always remain unmasked. # TODO @rfejgin: use float('inf')? confidences = min_confidence * torch.ones(B, C, device=device) # initialize to all masked codes = self.mask_token_id * torch.ones((B, C), device=device, dtype=torch.long) @@ -580,7 +597,9 @@ def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, # get mask fraction frac_masked = cosine_schedule(torch.tensor(step / (n_steps))) # how many codebooks to mask - n_masked = torch.ceil(C * frac_masked).long() # TODO @rfejgin: should we force this to be initialized to exactly `C` (to avoid numerical issues)? + n_masked = torch.ceil( + C * frac_masked + ).long() # TODO @rfejgin: should we force this to be initialized to exactly `C` (to avoid numerical issues)? 
n_unmasked = C - n_masked # pick top-confidence codebooks up to n_unmasked _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) @@ -588,32 +607,42 @@ def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, # replace masks of the top-k confident codebooks with the the codes that were sampled for them unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - - # build transformer input + + # build transformer input local_transformer_input = local_transformer_input_init for codebook_num in range(C): - next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1) # (B, 1, 768) - next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) # (B, 1, d_local) - local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) # (B, codebook_num+1, d_local) + next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze( + 1 + ) # (B, 1, 768) + next_local_transformer_input = self.local_transformer_in_projection( + next_local_transformer_input + ) # (B, 1, d_local) + local_transformer_input = torch.cat( + [local_transformer_input, next_local_transformer_input], dim=1 + ) # (B, codebook_num+1, d_local) # run transformer - _mask = torch.ones(B, C+1, device=device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, C+1, d_local) - + _mask = torch.ones(B, C + 1, device=device) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)[ + 'output' + ] # (B, C+1, d_local) + # get logits logits = [] for codebook_num in range(C): # The `codebook_num+1` is to drop first position which corresponds to the magpie latent - codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, codebook_num+1, :]) # (B, 
num_audio_tokens_per_codebook) + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output[:, codebook_num + 1, :] + ) # (B, num_audio_tokens_per_codebook) logits.append(codebook_logits) - logits = torch.stack(logits, dim=1) # (B, C, num_audio_tokens_per_codebook) + logits = torch.stack(logits, dim=1) # (B, C, num_audio_tokens_per_codebook) # apply CFG if use_cfg: actual_batch_size = logits.size(0) // 2 conditional_logits = logits[:actual_batch_size] unconditional_logits = logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits logits[:actual_batch_size] = cfg_logits # handle unfinished and finished items @@ -622,48 +651,65 @@ def local_transformer_sample_maskgit(self, dec_output, temperature=0.7, topk=80, for item_idx in finished_items: logits[item_idx, :, :] = float('-inf') logits[item_idx, :, self.audio_eos_id] = 0.0 - + # sample with top-k - logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) - indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) + logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) + indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) logits_rescored = logits.clone() logits_rescored[indices_to_remove] = float('-inf') - probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) - sampled_codes = torch.multinomial(probs.view(B*C, -1), 1).view(B, C) + probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) + sampled_codes = torch.multinomial(probs.view(B * C, -1), 1).view(B, C) if use_cfg: # TODO @rfejgin: why do we need to keep second half of the batch? 
can probably optimize this sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] probs[actual_batch_size:] = probs[:actual_batch_size] - confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) + confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) # set confidence to max for unmasked codebooks so that they will remain unmasked - confidences.scatter_(index=topk_indices, dim=1, src=max_confidence*torch.ones_like(topk_indices, dtype=torch.float)) + confidences.scatter_( + index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) + ) # replace entries in sampled_codes with previously unmasked codebooks sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) # optionally: add noise to confidences here (as in token-critic paper) (not implemented) - + codes = sampled_codes - assert not (codes == self.mask_token_id).any(), f"Codes contain mask tokens after completion of MaskGit sampling" + assert not ( + codes == self.mask_token_id + ).any(), f"Codes contain mask tokens after completion of MaskGit sampling" if use_cfg: codes = codes[:actual_batch_size] return codes - def local_transformer_sample_autoregressive(self, dec_output, temperature=0.7, topk=80, unfinished_items={}, finished_items={}, use_cfg=False, cfg_scale=1.0): + def local_transformer_sample_autoregressive( + self, + dec_output, + temperature=0.7, + topk=80, + unfinished_items={}, + finished_items={}, + use_cfg=False, + cfg_scale=1.0, + ): # dec_output: (B, E) self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) all_preds = [] for codebook_num in range(self.num_audio_codebooks * 
self.frame_stacking_factor): - _mask = torch.ones( local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) - codebook_logits = self.local_transformer_out_projections[codebook_num](local_transformer_output[:, -1, :]) # (B, num_all_tokens_per_codebook) + _mask = torch.ones( + local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device + ) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output[:, -1, :] + ) # (B, num_all_tokens_per_codebook) if use_cfg: actual_batch_size = codebook_logits.size(0) // 2 conditional_logits = codebook_logits[:actual_batch_size] unconditional_logits = codebook_logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits codebook_logits[:actual_batch_size] = cfg_logits for item_idx in unfinished_items: @@ -672,27 +718,38 @@ def local_transformer_sample_autoregressive(self, dec_output, temperature=0.7, t codebook_logits[item_idx, :] = float('-inf') codebook_logits[item_idx, self.audio_eos_id] = 0.0 - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1) # (B, num_tokens_per_codebook) + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, num_tokens_per_codebook) codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax(codebook_logits_rescored 
/ temperature, dim=-1) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) if use_cfg: codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] all_preds.append(codebook_preds) - next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1) # (B, 1, 128) - next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) # (B, 1, 128) - local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) # (B, T+1, 128) + next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze( + 1 + ) # (B, 1, 128) + next_local_transformer_input = self.local_transformer_in_projection( + next_local_transformer_input + ) # (B, 1, 128) + local_transformer_input = torch.cat( + [local_transformer_input, next_local_transformer_input], dim=1 + ) # (B, T+1, 128) - all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) if use_cfg: all_preds = all_preds[:actual_batch_size] return all_preds - - def sample_codes_from_logits(self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={}): + def sample_codes_from_logits( + self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={} + ): # all_code_logits_t: (B, num_codebooks * num_tokens_per_codebook), logits at a given timestep all_preds = [] for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): @@ -711,7 +768,9 @@ def sample_codes_from_logits(self, all_code_logits_t, temperature=0.7, topk=80, codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = 
float('-inf') - codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) all_preds.append(codebook_preds) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) @@ -731,7 +790,9 @@ def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, t codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1) # (B, num_tokens_per_codebook) + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) all_preds.append(codebook_preds) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) @@ -760,7 +821,9 @@ def log_val_audio_example( is_wandb = isinstance(logger, WandbLogger) is_tb = isinstance(logger, TensorBoardLogger) if not is_wandb and not is_tb: - raise ValueError(f"Invalid logger type for audio logging: {type(logger)}. Only `WandbLogger` and `TensorBoardLogger` are supported.") + raise ValueError( + f"Invalid logger type for audio logging: {type(logger)}. Only `WandbLogger` and `TensorBoardLogger` are supported." 
+ ) for idx in range(min(3, pred_audio.size(0))): pred_audio_np = pred_audio[idx].float().detach().cpu().numpy() @@ -775,9 +838,15 @@ def log_val_audio_example( if is_wandb: wandb_audio_log[f"Audio/Example_{idx}"] = list() if context_audio_np is not None: - wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(context_audio_np, sample_rate=self.sample_rate, caption="context")) - wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(pred_audio_np, sample_rate=self.sample_rate, caption="prediction")) - wandb_audio_log[f"Audio/Example_{idx}"].append(wandb.Audio(target_audio_np, sample_rate=self.sample_rate, caption="target")) + wandb_audio_log[f"Audio/Example_{idx}"].append( + wandb.Audio(context_audio_np, sample_rate=self.sample_rate, caption="context") + ) + wandb_audio_log[f"Audio/Example_{idx}"].append( + wandb.Audio(pred_audio_np, sample_rate=self.sample_rate, caption="prediction") + ) + wandb_audio_log[f"Audio/Example_{idx}"].append( + wandb.Audio(target_audio_np, sample_rate=self.sample_rate, caption="target") + ) if is_tb: if context_audio_np is not None: @@ -802,12 +871,11 @@ def log_val_audio_example( return wandb_audio_log - def join_embeddings_temporally( self, - embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] - lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` - pad_embed: torch.Tensor | None = None # (E,) defaults to zeros + embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] + lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` + pad_embed: torch.Tensor | None = None, # (E,) defaults to zeros ) -> Tuple[torch.Tensor, torch.Tensor]: """ Merges Multiple Embedding sequences into a single Embedding Sequence. 
@@ -816,7 +884,7 @@ def join_embeddings_temporally( embeddings : Sequence of tensors, each of shape (B, Ti, E) — batch, time, embedding lengths : Sequence of tensors, each of shape (B,) pad_embed : (E,) — embedding to use for padding, defaults to zeros - + Returns: joined : (B, max_sum_len, E) — merged & padded out_lengths : (B,) — total lengths of each batch element after merging @@ -829,14 +897,14 @@ def join_embeddings_temporally( dtype = embeddings[0].dtype # 1. compute output sizes - len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) + len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) out_lengths = len_stack.sum(0) - max_len = int(out_lengths.max()) + max_len = int(out_lengths.max()) if pad_embed is None: pad_embed = torch.zeros(E, dtype=dtype, device=device) - joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) + joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) # batch row indices batch_rows = torch.arange(B, device=device).unsqueeze(1) # (B,1) @@ -846,15 +914,14 @@ def join_embeddings_temporally( for i, (embedding_i, len_i) in enumerate(zip(embeddings, lengths)): Ti = embedding_i.shape[1] - t_idx = torch.arange(Ti, device=device) # (Ti,) - mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) + t_idx = torch.arange(Ti, device=device) # (Ti,) + mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) # destination columns: offset + t - dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) + dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) # Assign embedding_i to the correct positions in joined - joined[batch_rows.expand_as(mask)[mask], - dest_cols[mask]] = embedding_i[mask] + joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask] # move cursor past this segment offset += len_i @@ -870,10 +937,15 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): text_mask = get_mask_from_lengths(text_lens) cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) text_embedded = 
text_embedded + cas_embedding - + if text_embedded.shape[1] < self.streaming_speech_delay + 1: # If text is too short, pad it with zeros - padding_tensor = torch.zeros(text_embedded.shape[0], self.streaming_speech_delay + 1 - text_embedded.shape[1], text_embedded.shape[2], device=text_embedded.device) + padding_tensor = torch.zeros( + text_embedded.shape[0], + self.streaming_speech_delay + 1 - text_embedded.shape[1], + text_embedded.shape[2], + device=text_embedded.device, + ) text_embedded = torch.cat([text_embedded, padding_tensor], dim=1) if dropout_text_input: @@ -892,15 +964,22 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): context_audio_codes, context_audio_codes_lens = self.audio_to_codes( batch['context_audio'], batch['context_audio_lens'], audio_type='context' ) - - context_audio_codes, context_audio_codes_lens = self.stack_codes(context_audio_codes, context_audio_codes_lens, self.audio_bos_id, self.audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks) + + context_audio_codes, context_audio_codes_lens = self.stack_codes( + context_audio_codes, + context_audio_codes_lens, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) context_audio_embedded = self.embed_audio_tokens(context_audio_codes) # (B, T', E) # Context Text context_text_tokens = batch['context_text_tokens'] context_text_lens = batch['context_text_tokens_lens'] context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) - + remaining_text_embedded = None remaining_text_lens = None if self.text_input_mode == 'full': @@ -909,17 +988,17 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): lengths=[context_audio_codes_lens, context_text_lens, text_lens], ) elif self.text_input_mode == 'streaming': - prompt_text_embedded = text_embedded[:,:self.streaming_speech_delay,:] + prompt_text_embedded = text_embedded[:, : self.streaming_speech_delay, :] prompt_text_lens = 
torch.ones_like(text_lens) * self.streaming_speech_delay context_embedding, context_lens = self.join_embeddings_temporally( embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], ) - remaining_text_embedded = text_embedded[:,self.streaming_speech_delay:,:] + remaining_text_embedded = text_embedded[:, self.streaming_speech_delay :, :] remaining_text_lens = text_lens - self.streaming_speech_delay remaining_text_lens = remaining_text_lens.clamp(min=0) remaining_text_mask = get_mask_from_lengths(remaining_text_lens) - remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) + remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) else: raise ValueError(f"Invalid text input mode: {self.text_input_mode}") @@ -944,7 +1023,7 @@ def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): transformer_out: (B, T, E) context_lens: (B,) - start index of target per batch target_lens: (B,) - length of target per batch - + Returns: (B, T_max, E) tensor where T_max = max(target_lens) """ B, T, E = transformer_out.shape @@ -958,29 +1037,29 @@ def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): range_indices = torch.arange(max_len, device=device).unsqueeze(0).expand(B, -1) gather_indices = context_lens.unsqueeze(1) + range_indices # (B, max_len) gather_indices = torch.clamp(gather_indices, max=transformer_out.size(1) - 1) - + # Expand to shape (B, max_len, E) for gather gather_indices_exp = gather_indices.unsqueeze(2).expand(-1, -1, E) sliced = torch.gather(transformer_out, dim=1, index=gather_indices_exp) return sliced - def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): if stacking_factor == 1: return codes, codes_lens - - contains_bos = codes[0,0,0].item() == bos_id + + contains_bos = codes[0, 0, 0].item() == bos_id if 
contains_bos: - bos_tensor_repeated = torch.full((codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device) # (B,stacking_factor*C, 1) - codes = codes[:, :, 1:] # Remove the bos token - codes_lens = codes_lens - 1 # Remove the bos token + bos_tensor_repeated = torch.full( + (codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device + ) # (B,stacking_factor*C, 1) + codes = codes[:, :, 1:] # Remove the bos token + codes_lens = codes_lens - 1 # Remove the bos token B, C, T = codes.shape s = int(stacking_factor) # --- Compute max padding needed --- pad_t = (-T) % s # pad so that T' is divisible by s - pad_tail = torch.full((B, C, pad_t), eos_id, - dtype=codes.dtype, device=codes.device) + pad_tail = torch.full((B, C, pad_t), eos_id, dtype=codes.dtype, device=codes.device) codes = torch.cat([codes, pad_tail], dim=-1) # --- Stack time into channel dimension --- @@ -995,11 +1074,11 @@ def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_co new_lens = new_lens + 1 return codes, new_lens - + def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): if stacking_factor == 1: return stacked_codes, stacked_lens - + B, CxS, T_out = stacked_codes.shape s = int(stacking_factor) assert CxS % s == 0, f"Channel dim ({CxS}) must be divisible by stacking_factor ({s})" @@ -1017,32 +1096,37 @@ def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, context_lens): # import ipdb; ipdb.set_trace() - phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) + phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) phoneme_tokens, phoneme_tokens_lens = self.stack_codes( - phoneme_tokens, - phoneme_tokens_lens, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.eos_token_id, - self.phoneme_stacking_factor, - 1 + phoneme_tokens, + phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + 
self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1, ) # import ipdb; ipdb.set_trace() - phoneme_tokens_embedded = self.embed_phoneme_tokens(phoneme_tokens) # (B, T', E) + phoneme_tokens_embedded = self.embed_phoneme_tokens(phoneme_tokens) # (B, T', E) phoneme_mask = get_mask_from_lengths(phoneme_tokens_lens) - phoneme_tokens_embedded = phoneme_tokens_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) + phoneme_tokens_embedded = phoneme_tokens_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) - zero_context_tensor = torch.zeros(context_lens.size(0), context_lens.max().item(), self.cfg.embedding_dim, device=phoneme_tokens.device) + zero_context_tensor = torch.zeros( + context_lens.size(0), context_lens.max().item(), self.cfg.embedding_dim, device=phoneme_tokens.device + ) phoneme_channel_input, phoneme_channel_input_lens = self.join_embeddings_temporally( embeddings=[zero_context_tensor, phoneme_tokens_embedded], lengths=[context_lens, phoneme_tokens_lens], ) return phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens - def process_batch(self, batch, mode="train"): dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False - dropout_phoneme_input = ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) if mode == 'train' else False + dropout_phoneme_input = ( + ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) + if mode == 'train' + else False + ) context_tensors = self.prepare_context_tensors(batch, dropout_text_input) print("text lens", context_tensors['text_lens']) remaining_text_embedded = context_tensors['remaining_text_embedded'] @@ -1054,9 +1138,11 @@ def process_batch(self, batch, mode="train"): if torch.rand(1).item() < self.cfg_unconditional_prob: dropout_conditional_input = True # Get embedding of a special UNCONDITIONAL_TOKEN - cfg_token_id = self.cfg_unk_token_id # int - cfg_token_embedding = 
self.decoder.get_input_embeddings()(torch.full((context_embedding.size(0), 1), cfg_token_id, device=context_embedding.device)) # (B, 1, E) - # Keeping the dummy context same size as the context embedding makes + cfg_token_id = self.cfg_unk_token_id # int + cfg_token_embedding = self.decoder.get_input_embeddings()( + torch.full((context_embedding.size(0), 1), cfg_token_id, device=context_embedding.device) + ) # (B, 1, E) + # Keeping the dummy context same size as the context embedding makes # inference easier especially with KV caching and using a duplicated batch. context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) # Make unconditional remaining text embedding all zeros. Simplifies the inference implementation. @@ -1072,19 +1158,32 @@ def process_batch(self, batch, mode="train"): audio_codes = self._codec_converter.convert_original_to_new( audio_tokens=audio_codes, audio_lens=audio_codes_lens ).long() - - audio_codes, audio_codes_lens = self.stack_codes(audio_codes, audio_codes_lens, self.audio_bos_id, self.audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks) + + audio_codes, audio_codes_lens = self.stack_codes( + audio_codes, + audio_codes_lens, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) audio_codes_lens_input = audio_codes_lens_target = audio_codes_lens - 1 audio_codes_target = audio_codes[:, :, 1:] # (B, C, T') Target for the decoder audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder - audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) # Computing this to be use in the alignment encoder + audio_codes_input_embedded = self.embed_audio_tokens( + audio_codes_input + ) # (B, T, E) # Computing this to be use in the alignment encoder if remaining_text_embedded is not None: # Make remaining text embedded the same size as audio_codes_input_embedded by padding with zeros on the right 
padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) - padding_tensor = torch.zeros(remaining_text_embedded.size(0), padding_len, remaining_text_embedded.size(2), device=remaining_text_embedded.device) + padding_tensor = torch.zeros( + remaining_text_embedded.size(0), + padding_len, + remaining_text_embedded.size(2), + device=remaining_text_embedded.device, + ) remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded - context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( embeddings=[context_embedding, audio_codes_input_embedded], @@ -1093,18 +1192,23 @@ def process_batch(self, batch, mode="train"): if self.phoneme_tokenizer is not None: context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay - phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens = self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], - batch['phoneme_tokens_lens'], - context_lens_for_phonemes + phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens = ( + self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes + ) ) print("phoneme_tokens_lens", phoneme_tokens_lens) print("audio_codes_lens", audio_codes_lens_input) if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: - padding_tensor = torch.zeros(phoneme_channel_input.shape[0], context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], phoneme_channel_input.shape[2], device=phoneme_channel_input.device) + padding_tensor = torch.zeros( + phoneme_channel_input.shape[0], + context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], + phoneme_channel_input.shape[2], + device=phoneme_channel_input.device, + ) phoneme_channel_input = 
torch.cat([phoneme_channel_input, padding_tensor], dim=1) else: - phoneme_channel_input = phoneme_channel_input[:, :context_plus_audio_embedded.shape[1], :] + phoneme_channel_input = phoneme_channel_input[:, : context_plus_audio_embedded.shape[1], :] if (not dropout_conditional_input) and (not dropout_phoneme_input): context_plus_audio_embedded = context_plus_audio_embedded + phoneme_channel_input @@ -1114,13 +1218,13 @@ def process_batch(self, batch, mode="train"): attention_mask=get_mask_from_lengths(context_plus_audio_lens), ) transformer_hidden_states = transformer_out.last_hidden_state # (B, T_total, E) - + pred_embeddings = self.slice_pred_embeddings( transformer_hidden_states, context_lens=context_lens, target_lens=audio_codes_lens_target, ) - + logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) # import ipdb; ipdb.set_trace() codebook_loss, loss_mask = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) @@ -1132,14 +1236,22 @@ def process_batch(self, batch, mode="train"): if self.local_transformer_type == LocalTransformerType.MASKGIT: # randomly replace some positions with MASK_TOKEN audio_codes_masked, mask_tokens_mask = self.maskgit_apply_random_mask(audio_codes_target) - local_transformer_logits = self.compute_local_transformer_logits(pred_embeddings, audio_codes_masked, targets_offset_by_one=True) - #audio_codes_masked = audio_codes_masked[:, 1:, :] - local_transformer_loss, _ = self.compute_loss(local_transformer_logits, audio_codes_target, audio_codes_lens_target, mask_tokens_mask) + local_transformer_logits = self.compute_local_transformer_logits( + pred_embeddings, audio_codes_masked, targets_offset_by_one=True + ) + # audio_codes_masked = audio_codes_masked[:, 1:, :] + local_transformer_loss, _ = self.compute_loss( + local_transformer_logits, audio_codes_target, audio_codes_lens_target, mask_tokens_mask + ) else: # autoregressive assert self.local_transformer_type == 
LocalTransformerType.AR, "Unexpected local transformer type" - local_transformer_logits = self.compute_local_transformer_logits(pred_embeddings, audio_codes_target, targets_offset_by_one=False) - local_transformer_loss, _ = self.compute_loss(local_transformer_logits, audio_codes_target, audio_codes_lens_target, None) + local_transformer_logits = self.compute_local_transformer_logits( + pred_embeddings, audio_codes_target, targets_offset_by_one=False + ) + local_transformer_loss, _ = self.compute_loss( + local_transformer_logits, audio_codes_target, audio_codes_lens_target, None + ) local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss @@ -1148,12 +1260,16 @@ def process_batch(self, batch, mode="train"): pred_embeddings_phoneme = self.slice_pred_embeddings( transformer_hidden_states, context_lens=context_lens_for_phonemes, - target_lens=phoneme_tokens_lens-1, + target_lens=phoneme_tokens_lens - 1, ) - phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) + phoneme_logits = self.phoneme_final_proj( + pred_embeddings_phoneme + ) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): # Only compute phoneme loss if not doing unconditional training or text dropout - phoneme_loss, _ = self.compute_phoneme_loss(phoneme_logits, phoneme_tokens[:,:,1:].long(), phoneme_tokens_lens - 1) + phoneme_loss, _ = self.compute_phoneme_loss( + phoneme_logits, phoneme_tokens[:, :, 1:].long(), phoneme_tokens_lens - 1 + ) print("No Dropout - phoneme loss:", phoneme_loss.item()) else: phoneme_loss = torch.tensor(0.0, device=logits.device) @@ -1174,8 +1290,6 @@ def process_batch(self, batch, mode="train"): 'context_audio_codes_lens': context_tensors['context_audio_codes_lens'], # (B,) } - - def training_step(self, batch, batch_idx): batch_output = 
self.process_batch(batch) loss = batch_output['loss'] @@ -1186,7 +1300,7 @@ def training_step(self, batch, batch_idx): if self.phoneme_tokenizer is not None: phoneme_loss = batch_output['phoneme_loss'] self.log('train/phoneme_loss', phoneme_loss, prog_bar=True, sync_dist=True) - + local_transformer_loss = batch_output['local_transformer_loss'] if local_transformer_loss is not None: self.log('train/local_transformer_loss', local_transformer_loss, prog_bar=True, sync_dist=True) @@ -1198,25 +1312,32 @@ def training_step(self, batch, batch_idx): "train/batch_size": batch_size, "train/text_token_max_len": text_token_max_len, "train/text_token_total_num_in_batch": text_token_total_num, - "train/text_token_pad_ratio_percent_in_batch": 100 * (1 - text_token_total_num / (batch_size * text_token_max_len)), + "train/text_token_pad_ratio_percent_in_batch": 100 + * (1 - text_token_total_num / (batch_size * text_token_max_len)), } if "audio_codes" in batch: audio_codes_max_len = batch["audio_codes"].shape[-1] audio_codes_total_num = batch["audio_codes_lens"].sum() - batch_info_dict.update({ - "train/audio_codes_max_len": audio_codes_max_len, - "train/audio_codes_total_num_in_batch": audio_codes_total_num, - "train/audio_codes_pad_ratio_percent_in_batch": 100 * (1 - audio_codes_total_num / (batch_size * audio_codes_max_len)), - }) + batch_info_dict.update( + { + "train/audio_codes_max_len": audio_codes_max_len, + "train/audio_codes_total_num_in_batch": audio_codes_total_num, + "train/audio_codes_pad_ratio_percent_in_batch": 100 + * (1 - audio_codes_total_num / (batch_size * audio_codes_max_len)), + } + ) else: audio_samples_max_len = batch["audio"].shape[-1] audio_samples_total_num = batch["audio_lens"].sum() - batch_info_dict.update({ - "train/audio_samples_max_len": audio_samples_max_len, - "train/audio_samples_total_num_in_batch": audio_samples_total_num, - "train/audio_samples_pad_ratio_percent_in_batch": 100 * (1 - audio_samples_total_num / (batch_size * 
audio_samples_max_len)), - }) + batch_info_dict.update( + { + "train/audio_samples_max_len": audio_samples_max_len, + "train/audio_samples_total_num_in_batch": audio_samples_total_num, + "train/audio_samples_pad_ratio_percent_in_batch": 100 + * (1 - audio_samples_total_num / (batch_size * audio_samples_max_len)), + } + ) self.log_dict(batch_info_dict, on_step=True) @@ -1233,7 +1354,7 @@ def validation_step(self, batch, batch_idx): audio_codes_lens_target = batch_output['audio_codes_lens_target'] context_audio_codes = batch_output['context_audio_codes'] context_audio_codes_lens = batch_output['context_audio_codes_lens'] - + if batch_idx == 0 and self.global_rank == 0: # Prepare dictionary for aggregated wandb logging wandb_log_dict = {} @@ -1249,25 +1370,25 @@ def validation_step(self, batch, batch_idx): for logger in self.loggers: if isinstance(logger, WandbLogger) and wandb_log_dict: logger.experiment.log(wandb_log_dict) - + # infer_output_no_cfg_noLT = self.infer_batch( - # batch, - # max_decoder_steps=500, - # temperature=0.7, - # topk=80, - # use_local_transformer_for_inference=False, - # maskgit_n_steps=3, - # use_cfg=False, + # batch, + # max_decoder_steps=500, + # temperature=0.7, + # topk=80, + # use_local_transformer_for_inference=False, + # maskgit_n_steps=3, + # use_cfg=False, # cfg_scale=1.0 # ) # infer_output_cfg_withLT = self.infer_batch( - # batch, - # max_decoder_steps=500, - # temperature=0.7, - # topk=80, + # batch, + # max_decoder_steps=500, + # temperature=0.7, + # topk=80, # use_local_transformer_for_inference=self.local_transformer_type != LocalTransformerType.NO_LT, # maskgit_n_steps=3, - # use_cfg=True, + # use_cfg=True, # cfg_scale=2.5 # ) # pred_audio_no_cfg_noLT, pred_audio_no_cfg_noLT_lens = infer_output_no_cfg_noLT[0], infer_output_no_cfg_noLT[1] @@ -1292,7 +1413,7 @@ def validation_step(self, batch, batch_idx): # ) # logger.experiment.add_audio( # "val/pred_audio_cfg_withLT", pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, 
global_step=batch_idx - # ) + # ) local_transformer_loss = batch_output['local_transformer_loss'] val_output = { @@ -1313,18 +1434,18 @@ def on_validation_epoch_end(self): collect = lambda key: torch.stack([x[key] for x in self.validation_step_outputs]).mean() val_loss = collect("val_loss") val_codebook_loss = collect("val_codebook_loss") - + self.log("val_loss", val_loss, prog_bar=True, sync_dist=True) self.log("val/codebook_loss", val_codebook_loss, prog_bar=True, sync_dist=True) - + if self.local_transformer_type != LocalTransformerType.NO_LT: val_local_transformer_loss = collect("val_local_transformer_loss") self.log("val/local_transformer_loss", val_local_transformer_loss, prog_bar=True, sync_dist=True) - + if self.phoneme_tokenizer is not None: val_phoneme_loss = collect("val_phoneme_loss") self.log("val/phoneme_loss", val_phoneme_loss, prog_bar=True, sync_dist=True) - + self.validation_step_outputs.clear() # free memory def get_dataset(self, dataset_cfg, dataset_type): @@ -1354,7 +1475,7 @@ def get_dataset(self, dataset_cfg, dataset_type): ) # This will be used in worker_init_fn for instantiating tokenizer if self.phoneme_tokenizer is not None: dataset.phoneme_tokenizer_config = self.cfg.phoneme_tokenizer - + return dataset def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.DataLoader: @@ -1379,9 +1500,9 @@ def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.D use_text_conditioning_tokenizer=True, text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, tokenizer_config=self.cfg.text_tokenizers, - phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None) + phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None), ) - + data_loader = get_lhotse_dataloader_from_config( config=dataset_cfg.dataset, global_rank=self.global_rank, @@ -1427,10 +1548,7 @@ def _setup_test_dataloader(self, dataset_cfg) -> torch.utils.data.DataLoader: if dataset_cfg.dataloader_params.num_workers == 0: 
persistent_workers = False # For num workers > 0 tokenizer will be assigned in worker_init_fn (since it is not picklable) - dataset.text_tokenizer = setup_tokenizers( - all_tokenizers_config=self.cfg.text_tokenizers, - mode='test' - ) + dataset.text_tokenizer = setup_tokenizers(all_tokenizers_config=self.cfg.text_tokenizers, mode='test') if self.cfg.get("phoneme_tokenizer", None) is not None: dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) @@ -1449,7 +1567,20 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) - def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, use_local_transformer_for_inference=False, maskgit_n_steps=3, use_cfg=False, cfg_scale=1.0, phoneme_input_type='gt', phoneme_sampling_method='argmax', dropout_text_input=False): + def infer_batch( + self, + batch, + max_decoder_steps=500, + temperature=0.7, + topk=80, + use_local_transformer_for_inference=False, + maskgit_n_steps=3, + use_cfg=False, + cfg_scale=1.0, + phoneme_input_type='gt', + phoneme_sampling_method='argmax', + dropout_text_input=False, + ): # TODO: Make this API same as MagpieTTS model. 
with torch.inference_mode(): start_time = time.time() @@ -1458,29 +1589,43 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us context_lens = context_tensors['context_lens'] # (B,) remaining_text_embedded = context_tensors['remaining_text_embedded'] remaining_text_lens = context_tensors['remaining_text_lens'] - + if self.phoneme_tokenizer is not None: context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay - phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], - batch['phoneme_tokens_lens'], - context_lens_for_phonemes + phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = ( + self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes + ) + ) + phoneme_channel_input_pad_tensor = torch.zeros( + phoneme_channel_input.size(0), + max_decoder_steps, + phoneme_channel_input.size(2), + device=phoneme_channel_input.device, ) - phoneme_channel_input_pad_tensor = torch.zeros(phoneme_channel_input.size(0), max_decoder_steps, phoneme_channel_input.size(2), device=phoneme_channel_input.device) phoneme_channel_input = torch.cat([phoneme_channel_input, phoneme_channel_input_pad_tensor], dim=1) - + audio_codes_bos = torch.full( - (context_embedding.size(0), self.num_audio_codebooks * self.frame_stacking_factor, 1), self.audio_bos_id, device=context_embedding.device - ).long() + (context_embedding.size(0), self.num_audio_codebooks * self.frame_stacking_factor, 1), + self.audio_bos_id, + device=context_embedding.device, + ).long() audio_codes_lens = torch.full((context_embedding.size(0),), 1, device=context_embedding.device).long() audio_codes_input = audio_codes_bos audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) if self.text_input_mode == 'streaming': 
remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 - remaining_text_pad_tensor = torch.zeros(remaining_text_embedded.size(0), remaining_text_pad_length, remaining_text_embedded.size(2), device=remaining_text_embedded.device) + remaining_text_pad_tensor = torch.zeros( + remaining_text_embedded.size(0), + remaining_text_pad_length, + remaining_text_embedded.size(2), + device=remaining_text_embedded.device, + ) remaining_text_embedded = torch.cat([remaining_text_embedded, remaining_text_pad_tensor], dim=1) - audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded[:, :1, :] # :1 corresponds to audio BOS. + audio_codes_input_embedded = ( + audio_codes_input_embedded + remaining_text_embedded[:, :1, :] + ) # :1 corresponds to audio BOS. context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( embeddings=[context_embedding, audio_codes_input_embedded], @@ -1488,23 +1633,28 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us ) min_context_len = context_plus_audio_lens.min().item() if self.phoneme_tokenizer is not None: - min_context_len = min_context_len - self.streaming_speech_delay + self.streaming_phonemes_delay - 1 # 1 for audio BOS that we had added. + min_context_len = ( + min_context_len - self.streaming_speech_delay + self.streaming_phonemes_delay - 1 + ) # 1 for audio BOS that we had added. 
actual_batch_size = context_embedding.size(0) if use_cfg: dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=context_embedding.device) - ) # (B, 1, E) - dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) - + ) # (B, 1, E) + dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand( + -1, context_embedding.size(1), -1 + ) # (B, T_total, E) + dummy_context_plus_audio_embedded, _ = self.join_embeddings_temporally( embeddings=[dummy_context_embedding_unconditional_expanded, audio_codes_input_embedded], lengths=[context_lens, audio_codes_lens], ) first_inference_input = torch.cat( - [context_plus_audio_embedded, dummy_context_plus_audio_embedded], - dim=0 - )[:,:min_context_len, :] # (2B, T_min, E) + [context_plus_audio_embedded, dummy_context_plus_audio_embedded], dim=0 + )[ + :, :min_context_len, : + ] # (2B, T_min, E) else: first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) # First forward pass to get the initial hidden state and past key values @@ -1521,22 +1671,22 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us all_predictions = [] end_indices = {} - + current_text_positions = [] for item_idx in range(context_embedding.size(0)): # 0 if we have started reading the remaining text otherwise negative (indicating how far we are before we start reading the remaining text) current_text_positions.append(min_context_len - context_plus_audio_lens[item_idx]) current_text_positions = torch.tensor(current_text_positions, device=context_embedding.device).long() if self.phoneme_tokenizer is not None: - current_phoneme_positions = current_text_positions - current_text_positions.max() - 1 # Make it 0-indexed. 
- # current_text_positions = current_text_positions - self.streaming_speech_delay + self.streaming_phonemes_delay - pred_phoneme_token_lists = [ - [] for _ in range(actual_batch_size) - ] - gt_phoneme_token_lists = [ - [] for _ in range(actual_batch_size) - ] - phoneme_stream_ended = torch.zeros(actual_batch_size, device=context_embedding.device).bool() # (B,) Whether phoneme stream has ended for this item. + current_phoneme_positions = ( + current_text_positions - current_text_positions.max() - 1 + ) # Make it 0-indexed. + # current_text_positions = current_text_positions - self.streaming_speech_delay + self.streaming_phonemes_delay + pred_phoneme_token_lists = [[] for _ in range(actual_batch_size)] + gt_phoneme_token_lists = [[] for _ in range(actual_batch_size)] + phoneme_stream_ended = torch.zeros( + actual_batch_size, device=context_embedding.device + ).bool() # (B,) Whether phoneme stream has ended for this item. for idx in range(max_decoder_steps): # import ipdb; ipdb.set_trace() current_text_positions += 1 @@ -1546,19 +1696,23 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us if idx % 20 == 0: print(f"Decoding timestep {idx}") - all_code_logits_t = self.final_proj(last_hidden[:, -1, :]) # (B, num_codebooks * num_tokens_per_codebook) - + all_code_logits_t = self.final_proj( + last_hidden[:, -1, :] + ) # (B, num_codebooks * num_tokens_per_codebook) + if self.phoneme_tokenizer is not None: - all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) # (B, phoneme_stacking_factor * phoneme_vocab_size) + all_code_logits_t_phoneme = self.phoneme_final_proj( + last_hidden[:, -1, :] + ) # (B, phoneme_stacking_factor * phoneme_vocab_size) all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] if use_cfg: conditional_logits = all_code_logits_t[:actual_batch_size] unconditional_logits = all_code_logits_t[actual_batch_size:] - all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * 
unconditional_logits + all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits if use_local_transformer_for_inference: - if self.local_transformer_type == LocalTransformerType.AR : + if self.local_transformer_type == LocalTransformerType.AR: # Autoregressive sampling with local transformer audio_codes_next = self.local_transformer_sample_autoregressive( dec_output=last_hidden[:, -1, :], @@ -1577,54 +1731,88 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us cfg_scale=cfg_scale, ) else: - raise ValueError(f"Local transformer inference requested by but local transformer type is {self.local_transformer_type}") + raise ValueError( + f"Local transformer inference requested by but local transformer type is {self.local_transformer_type}" + ) # TODO @rfejgin: should we add argmax sampling for EOS here too? all_codes_next_argmax = audio_codes_next else: # Parallel sampling from logits - audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) # (B, num_codebooks) - all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) # (B, num_codebooks) + audio_codes_next = self.sample_codes_from_logits( + all_code_logits_t, temperature=temperature, topk=topk + ) # (B, num_codebooks) + all_codes_next_argmax = self.sample_codes_from_logits( + all_code_logits_t, temperature=0.01 + ) # (B, num_codebooks) phoneme_channel_input_t = None - + if self.phoneme_tokenizer is not None: - all_codes_next_phoneme = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=temperature, topk=topk) # (B, phoneme_stacking_factor) - all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.01) # (B, phoneme_stacking_factor) - pred_phoneme_tokens = all_codes_next_phoneme_argmax if phoneme_sampling_method == 'argmax' else all_codes_next_phoneme # B, phoneme_stacking_factor + 
all_codes_next_phoneme = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=temperature, topk=topk + ) # (B, phoneme_stacking_factor) + all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=0.01 + ) # (B, phoneme_stacking_factor) + pred_phoneme_tokens = ( + all_codes_next_phoneme_argmax + if phoneme_sampling_method == 'argmax' + else all_codes_next_phoneme + ) # B, phoneme_stacking_factor phoneme_bos_tensor = torch.full( (actual_batch_size, self.phoneme_stacking_factor), - self.phoneme_tokenizer.bos_token_id, - device=context_embedding.device - ).long() # (B, phoneme_stacking_factor) + self.phoneme_tokenizer.bos_token_id, + device=context_embedding.device, + ).long() # (B, phoneme_stacking_factor) use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() print("use_bos_phoneme", use_bos_phoneme) - pred_phoneme_tokens = (use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens).long() # (B, phoneme_stacking_factor) - + pred_phoneme_tokens = ( + use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens + ).long() # (B, phoneme_stacking_factor) + print("pred_phoneme_tokens", pred_phoneme_tokens) gt_phoneme_idx = min(idx, gt_phoneme_tokens.size(2) - 1) - gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) + gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) print("gt_phoneme_tokens_current", gt_phoneme_tokens_current) - - input_phoneme_tokens_current = gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens - input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) # (B, phoneme_stacking_factor, E) - - use_phoneme_input = (current_phoneme_positions >= 0) * (~phoneme_stream_ended) # (B,) + + input_phoneme_tokens_current = ( + gt_phoneme_tokens_current if 
phoneme_input_type == 'gt' else pred_phoneme_tokens + ) + input_phoneme_embedding = self.embed_phoneme_tokens( + input_phoneme_tokens_current.unsqueeze(2) + ) # (B, phoneme_stacking_factor, E) + + use_phoneme_input = (current_phoneme_positions >= 0) * (~phoneme_stream_ended) # (B,) use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) - zero_phoneme_embedding = torch.zeros(actual_batch_size, self.cfg.embedding_dim, device=all_codes_next_phoneme.device).unsqueeze(1) # (B, 1, E) + zero_phoneme_embedding = torch.zeros( + actual_batch_size, self.cfg.embedding_dim, device=all_codes_next_phoneme.device + ).unsqueeze( + 1 + ) # (B, 1, E) # phoneme_channel_input_t = phoneme_channel_input[torch.arange(actual_batch_size), current_phoneme_positions.clamp(min=0) + min_context_len, :].unsqueeze(1) # (B, 1, E) - phoneme_channel_input_t = use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + phoneme_channel_input_t = ( + use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + ) print("use_phoneme_input", use_phoneme_input) for item_idx in range(actual_batch_size): - if use_phoneme_input[item_idx,0,0] > 0: + if use_phoneme_input[item_idx, 0, 0] > 0: for phoneme_channel_idx in range(self.phoneme_stacking_factor): _phoneme_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() - if _phoneme_token not in [self.phoneme_tokenizer.eos_token_id, self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.pad]: + if _phoneme_token not in [ + self.phoneme_tokenizer.eos_token_id, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.pad, + ]: pred_phoneme_token_lists[item_idx].append(_phoneme_token) - + _gt_phoneme_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() - if _gt_phoneme_token not in [self.phoneme_tokenizer.eos_token_id, self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.pad]: + if _gt_phoneme_token not in [ + 
self.phoneme_tokenizer.eos_token_id, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.pad, + ]: gt_phoneme_token_lists[item_idx].append(_gt_phoneme_token) - + if torch.any(input_phoneme_tokens_current[item_idx] == self.phoneme_tokenizer.eos_token_id): print("Phoneme end detected for item {} at timestep {}".format(item_idx, idx)) phoneme_stream_ended[item_idx] = True @@ -1635,34 +1823,44 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: pred_tokens = all_codes_next_argmax[item_idx] pred_tokens_multinomial = audio_codes_next[item_idx] - if torch.any(pred_tokens == self.audio_eos_id) or torch.any(pred_tokens_multinomial == self.audio_eos_id): + if torch.any(pred_tokens == self.audio_eos_id) or torch.any( + pred_tokens_multinomial == self.audio_eos_id + ): print("End detected for item {} at timestep {}".format(item_idx, idx)) end_indices[item_idx] = idx - + all_predictions.append(audio_codes_next) - + new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) new_emb_unconditional = new_emb * 1 - + if self.text_input_mode == 'streaming': _bs = context_embedding.size(0) - remaining_text_embedded_current = remaining_text_embedded[torch.arange(_bs), current_text_positions.clamp(min=0) , :].unsqueeze(1) # (B, 1, E) + remaining_text_embedded_current = remaining_text_embedded[ + torch.arange(_bs), current_text_positions.clamp(min=0), : + ].unsqueeze( + 1 + ) # (B, 1, E) new_emb = new_emb + remaining_text_embedded_current - - - context_incomplete_mask = context_plus_audio_lens > idx + min_context_len # (B,) + + context_incomplete_mask = context_plus_audio_lens > idx + min_context_len # (B,) # import ipdb; ipdb.set_trace() # True if we have not yet reached the end of the context for this item # import ipdb; ipdb.set_trace() if context_incomplete_mask.any(): # If some contexts are not yet complete. 
context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) - context_embedding = context_plus_audio_embedded[:,min_context_len+idx:min_context_len+idx+1,:] # (B, 1, E) + context_embedding = context_plus_audio_embedded[ + :, min_context_len + idx : min_context_len + idx + 1, : + ] # (B, 1, E) next_input = context_incomplete_mask * context_embedding + (1 - context_incomplete_mask) * new_emb if phoneme_channel_input_t is not None: next_input += phoneme_channel_input_t if use_cfg: - next_input_unconditional = context_incomplete_mask * dummy_context_embedding_unconditional + (1 - context_incomplete_mask) * new_emb_unconditional + next_input_unconditional = ( + context_incomplete_mask * dummy_context_embedding_unconditional + + (1 - context_incomplete_mask) * new_emb_unconditional + ) next_input = torch.cat([next_input, next_input_unconditional], dim=0) # (2B, 1, E) else: next_input = new_emb @@ -1670,7 +1868,7 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us next_input += phoneme_channel_input_t if use_cfg: next_input = torch.cat([next_input, new_emb_unconditional], dim=0) # (2B, 1, E) - + transformer_out = self.forward( inputs_embeds=next_input, attention_mask=None, @@ -1682,10 +1880,12 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us if len(end_indices) == audio_codes_next.size(0): print("All items finished at timestep {}".format(idx)) break - + if self.phoneme_tokenizer is not None: for item_idx in range(actual_batch_size): - print("Predicted phoneme tokens for item {}: {}".format(item_idx, pred_phoneme_token_lists[item_idx])) + print( + "Predicted phoneme tokens for item {}: {}".format(item_idx, pred_phoneme_token_lists[item_idx]) + ) print("GT phoneme tokens for item {}: {}".format(item_idx, gt_phoneme_token_lists[item_idx])) predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) gt_phoneme_text = 
self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) @@ -1694,10 +1894,12 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us tts_generation_time = time.time() - start_time tts_generation_time_per_frame = tts_generation_time / len(all_predictions) - pred_codes_start_indices = context_plus_audio_lens - min_context_len # (B,) - predicted_lens = [end_indices.get(idx, max_decoder_steps) for idx in range(context_embedding.size(0))] # Ensure that the codec is atleast of length 4 + pred_codes_start_indices = context_plus_audio_lens - min_context_len # (B,) + predicted_lens = [ + end_indices.get(idx, max_decoder_steps) for idx in range(context_embedding.size(0)) + ] # Ensure that the codec is atleast of length 4 predicted_codes_lens = torch.tensor(predicted_lens, device=context_embedding.device).long() - predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices # (B,) + predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices # (B,) predicted_codes = torch.stack(all_predictions, dim=-1) # (B, num_codebooks, T) predicted_codes = self.slice_pred_embeddings( @@ -1707,9 +1909,11 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us ) predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) predicted_audio, predicted_audio_lens = self.codes_to_audio(predicted_codes, predicted_codes_lens) - + end_time = time.time() - total_audio_duration_generated = (predicted_audio_lens.max().item() * predicted_audio_lens.shape[0])/self.sample_rate + total_audio_duration_generated = ( + predicted_audio_lens.max().item() * predicted_audio_lens.shape[0] + ) / self.sample_rate rtf = total_audio_duration_generated / (end_time - start_time) rtf_metrics = { @@ -1723,9 +1927,6 @@ def infer_batch(self, batch, max_decoder_steps=500, temperature=0.7, topk=80, us return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics - - @classmethod def 
list_available_models(cls) -> List[PretrainedModelInfo]: return [] - diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index cc3083f30e2c..19e5793a892b 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -34,7 +34,7 @@ from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset -from nemo.collections.tts.models import MagpieTTSModel, MagpieTTSDecoderModel +from nemo.collections.tts.models import MagpieTTSDecoderModel, MagpieTTSModel from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging @@ -133,7 +133,7 @@ class MagpieInferenceRunner: """ def __init__( - self,# model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel + self, # model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel model: Union[MagpieTTSModel, MagpieTTSDecoderModel], config: InferenceConfig, ): @@ -157,7 +157,9 @@ def _configure_tokenizer(self) -> None: """Configure the tokenizer for inference (phoneme prob = 1.0).""" g2p = None if isinstance(self.model.tokenizer, AggregatedTTSTokenizer): - if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr(self.model.tokenizer.tokenizers["english_phoneme"], "g2p"): + if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr( + self.model.tokenizer.tokenizers["english_phoneme"], "g2p" + ): g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p elif isinstance(self.model.tokenizer, IPATokenizer): g2p = self.model.tokenizer.g2p diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py 
b/nemo/collections/tts/modules/magpietts_inference/utils.py index e0cd4c2714be..cce2855dd82b 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -28,7 +28,7 @@ import torch from omegaconf import DictConfig, OmegaConf, open_dict -from nemo.collections.tts.models import MagpieTTSModel, MagpieTTSDecoderModel +from nemo.collections.tts.models import MagpieTTSDecoderModel, MagpieTTSModel from nemo.utils import logging @@ -253,7 +253,9 @@ def update_checkpoint_state_dict(state_dict: dict) -> dict: return new_state_dict -def load_magpie_model(config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False) -> Tuple[Union[MagpieTTSModel, MagpieTTSDecoderModel], str]: +def load_magpie_model( + config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False +) -> Tuple[Union[MagpieTTSModel, MagpieTTSDecoderModel], str]: """Load a MagpieTTS model from checkpoint or NeMo archive. 
Supports two loading modes: From 94fcf032f98dac042c7d37add69c66272f3cdfd2 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 8 Jan 2026 22:57:12 +0000 Subject: [PATCH 04/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index f5f5be0522a6..7cecfce31573 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -74,6 +74,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) self.sample_rate = codec_model.sample_rate self.output_sample_rate = codec_model.output_sample_rate + if hasattr(codec_model, "discriminator"): # del codec discriminator to free memory del codec_model.discriminator @@ -1924,7 +1925,7 @@ def infer_batch( 'tts_generation_time_per_frame': tts_generation_time_per_frame, 'batch_size': context_embedding.size(0), } - + return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics @classmethod From ae8f800c2d69196b4886316ed06b9e648e47130b Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Fri, 9 Jan 2026 16:43:13 -0500 Subject: [PATCH 05/94] handling changes in dataloader Signed-off-by: Paarth Neekhara --- .../tts/models/magpietts_decoder_only.py | 164 +++++++++++------- 1 file changed, 99 insertions(+), 65 deletions(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 7cecfce31573..aa3588426281 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -281,69 +281,82 @@ def load_state_dict(self, state_dict, strict=True): 
new_state_dict[key[len(name_with_dot) :]] = state_dict[key] child.load_state_dict(new_state_dict) - def audio_to_codes(self, audio, audio_len, audio_type='target'): - # audio: (B, T) - # audio_len: (B,) - if audio_type == 'target': - audio_eos_id = self.audio_eos_id - audio_bos_id = self.audio_bos_id - elif audio_type == 'context': - audio_eos_id = self.context_audio_eos_id - audio_bos_id = self.context_audio_bos_id - else: - raise ValueError(f"Received audio_type of {audio_type}. Must be `target` or `context`") + def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): + # codes: (B, C, T') + # codes_len: (B,) + codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) + codes_len = codes_len + num_eos_tokens + # Insert EOS token at new final token entry + for idx in range(codes.size(0)): + codes[idx, :, codes_len[idx] - 1] = eos_id + + return codes, codes_len + + def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): + # codes: (B, C, T') + # codes_len: (B,) + codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) + codes_len = codes_len + num_bos_tokens + codes, codes_len = self.add_eos_token( + codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens + ) + return codes, codes_len + + def remove_bos_token(self, codes, codes_len, num_tokens=1): + # codes: (B, C, T') + # codes_len: (B,) + codes = codes[:, :, num_tokens:] + codes_len = codes_len - num_tokens + return codes, codes_len + def remove_embedded_bos_token(self, embedded, embedded_len): + # codes: (B, T', C) + # codes_len: (B,) + embedded = embedded[:, 1:, :] + embedded_len = embedded_len - 1 + return embedded, embedded_len + + def remove_eos_token(self, codes, codes_len): + # codes: (B, C, T') + # codes_len: (B,) + codes_len = codes_len - 1 + codes = codes[:, :, :-1] + mask = get_mask_from_lengths(lengths=codes_len) + codes = codes * mask.unsqueeze(1) + return codes, codes_len + 
+ def remove_embedded_eos_token(self, embedded, embedded_len): + # embedded: (B, T', D) + # embedded_len: (B,) + embedded_len = embedded_len - 1 + embedded = embedded[:, :-1, :] + mask = get_mask_from_lengths(lengths=embedded_len) + embedded = embedded * mask.unsqueeze(2) + return embedded, embedded_len + + def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): + codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) + codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) + return codes, codes_len + + def audio_to_codes(self, audio, audio_len, sample_rate=None): self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): - codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len) - if self._codec_converter is not None: - codes = self._codec_converter.convert_original_to_new(audio_tokens=codes, audio_lens=codes_len) - # Add a timestep to begining and end of codes tensor - bos_tensor = torch.full( - (codes.size(0), codes.size(1), 1), audio_bos_id, dtype=codes.dtype, device=codes.device - ) - pad_tensor = torch.full( - (codes.size(0), codes.size(1), 1), 0, dtype=codes.dtype, device=codes.device - ) # 0 is the padding token in the audio codebook - codes = torch.cat([bos_tensor, codes, pad_tensor], dim=-1) - # codes: (B, C, T') - # codes_len: (B,) - for idx in range(codes.size(0)): - codes[idx, :, codes_len[idx] + 1] = audio_eos_id - codes_len = codes_len + 2 - - return codes.long(), codes_len.long() + codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) + return codes, codes_len def codes_to_audio(self, codes, codes_len): # codes: (B, C, T') # codes_len: (B,) - if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: - # Unstack the audio codes if they are stacked - codes, codes_len = self.unstack_codes(codes, 
codes_len, self.frame_stacking_factor) - - if codes.size(2) < 5: - # If the codes are too short, we need to pad them - codes = torch.cat( - [codes, torch.zeros(codes.size(0), codes.size(1), 5 - codes.size(2), device=codes.device)], dim=2 - ).long() - codes_len = codes_len + 5 - codes.size(2) - self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - # Make a copy to avoid modifying the original tensor if it's used elsewhere - codes_copy = codes.clone() - # Replace eos and bos tokens with padding in the copied tensor - codes_copy[codes == self.audio_bos_id] = 0 # zero is the padding token - codes_copy[codes == self.audio_eos_id] = 0 # Pass the modified integer token IDs if self._codec_converter is not None: - codes_copy = self._codec_converter.convert_new_to_original( - audio_tokens=codes_copy, audio_lens=codes_len - ) - audio, audio_len = self._codec_model.decode(tokens=codes_copy, tokens_len=codes_len) + codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) + audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) # audio: (B, T) # audio_len: (B,) - return audio, audio_len + return audio, audio_len, codes def embed_audio_tokens(self, audio_tokens): # audio_tokens: (B, C, T') @@ -502,7 +515,7 @@ def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=N codebook_logits = logits[:, :, si:ei] # (B, T', num_tokens_per_codebook) codebook_targets = audio_codes[:, codebook] # (B, T') codebook_loss = self.cross_entropy_loss( - codebook_logits.permute(0, 2, 1), codebook_targets # (B, num_tokens_per_codebook, T') + codebook_logits.permute(0, 2, 1), codebook_targets.long() # (B, num_tokens_per_codebook, T') ) # (B, T') codebook_loss = codebook_loss * loss_mask[:, codebook, :] codebook_loss = codebook_loss.sum() / loss_mask[:, codebook, :].sum() @@ -810,12 +823,24 @@ def log_val_audio_example( wandb_audio_log = {} pred_audio_codes 
= self.logits_to_audio_codes(logits, audio_codes_lens_target) - pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target) - target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target) + pred_audio_codes, _ = self.remove_eos_token( + codes=pred_audio_codes, + codes_len=audio_codes_lens_target, + ) + pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target-1) + target_audio_codes, _ = self.remove_eos_token( + codes=target_audio_codes, + codes_len=audio_codes_lens_target, + ) + target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target-1) context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: # > 3 ensures, it is a valid context audio tensor (and not dummy tensor used in text context) + context_audio_codes, context_audio_codes_lens = self.remove_special_tokens( + codes=context_audio_codes, + codes_len=context_audio_codes_lens, + ) context_audio, context_audio_lens = self.codes_to_audio(context_audio_codes, context_audio_codes_lens) for logger in self.loggers: @@ -963,8 +988,16 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): ).long() else: context_audio_codes, context_audio_codes_lens = self.audio_to_codes( - batch['context_audio'], batch['context_audio_lens'], audio_type='context' + batch['context_audio'], batch['context_audio_lens'] ) + + context_audio_codes, context_audio_codes_lens = self.add_special_tokens( + codes=context_audio_codes, + codes_len=context_audio_codes_lens, + bos_id=self.context_audio_bos_id, + eos_id=self.context_audio_eos_id, + ) + context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, @@ -1129,7 +1162,7 @@ def process_batch(self, batch, mode="train"): else False ) context_tensors = self.prepare_context_tensors(batch, dropout_text_input) - print("text lens", 
context_tensors['text_lens']) + # print("text lens", context_tensors['text_lens']) remaining_text_embedded = context_tensors['remaining_text_embedded'] context_embedding = context_tensors['context_embedding'] context_lens = context_tensors['context_lens'] @@ -1160,6 +1193,14 @@ def process_batch(self, batch, mode="train"): audio_tokens=audio_codes, audio_lens=audio_codes_lens ).long() + + audio_codes, audio_codes_lens = self.add_special_tokens( + codes=audio_codes, + codes_len=audio_codes_lens, + bos_id=self.audio_bos_id, + eos_id=self.audio_eos_id, + ) + audio_codes, audio_codes_lens = self.stack_codes( audio_codes, audio_codes_lens, @@ -1198,8 +1239,8 @@ def process_batch(self, batch, mode="train"): batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes ) ) - print("phoneme_tokens_lens", phoneme_tokens_lens) - print("audio_codes_lens", audio_codes_lens_input) + # print("phoneme_tokens_lens", phoneme_tokens_lens) + # print("audio_codes_lens", audio_codes_lens_input) if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: padding_tensor = torch.zeros( phoneme_channel_input.shape[0], @@ -1455,10 +1496,6 @@ def get_dataset(self, dataset_cfg, dataset_type): sample_rate=self.sample_rate, bos_id=None, eos_id=self.eos_id, - audio_bos_id=self.audio_bos_id, - audio_eos_id=self.audio_eos_id, - context_audio_bos_id=self.context_audio_bos_id, - context_audio_eos_id=self.context_audio_eos_id, num_audio_codebooks=self.data_num_audio_codebooks, codec_model_samples_per_frame=self.codec_model_samples_per_frame, prior_scaling_factor=0.0, @@ -1486,10 +1523,6 @@ def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.D sample_rate=self.sample_rate, volume_norm=dataset_cfg.volume_norm, codec_model_samples_per_frame=self.codec_model_samples_per_frame, - audio_bos_id=self.audio_bos_id, - audio_eos_id=self.audio_eos_id, - context_audio_bos_id=self.context_audio_bos_id, - context_audio_eos_id=self.context_audio_eos_id, 
num_audio_codebooks=self.data_num_audio_codebooks, prior_scaling_factor=0.0, load_cached_codes_if_available=self.cfg.load_cached_codes_if_available, @@ -1909,6 +1942,7 @@ def infer_batch( target_lens=predicted_codes_lens, ) predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) + predicted_codes, predicted_codes_lens = self.remove_eos_token(predicted_codes, predicted_codes_lens) predicted_audio, predicted_audio_lens = self.codes_to_audio(predicted_codes, predicted_codes_lens) end_time = time.time() From c2ee2490864df2597c9d3dfb53b83630cfb38cb2 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 10 Jan 2026 03:06:08 -0500 Subject: [PATCH 06/94] hack to avoid HF error Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/audio_codec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index 2cdd5f0f8c9c..86097e134849 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -183,7 +183,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # load pretrained model # self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") self.speaker_encoder.load_checkpoint( - "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", strict=False + "/gitrepos/checkpoints/pytorch_model.bin", strict=False ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() From 88a7576f534d37e532930b7bfa4b92877f38da7e Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Fri, 9 Jan 2026 21:44:13 +0000 Subject: [PATCH 07/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git 
a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index aa3588426281..4bec12088047 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -338,7 +338,7 @@ def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) return codes, codes_len - + def audio_to_codes(self, audio, audio_len, sample_rate=None): self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): @@ -827,12 +827,12 @@ def log_val_audio_example( codes=pred_audio_codes, codes_len=audio_codes_lens_target, ) - pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target-1) + pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target - 1) target_audio_codes, _ = self.remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens_target, ) - target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target-1) + target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target - 1) context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: @@ -990,7 +990,7 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): context_audio_codes, context_audio_codes_lens = self.audio_to_codes( batch['context_audio'], batch['context_audio_lens'] ) - + context_audio_codes, context_audio_codes_lens = self.add_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens, @@ -998,7 +998,6 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): eos_id=self.context_audio_eos_id, ) - context_audio_codes, 
context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, @@ -1193,7 +1192,6 @@ def process_batch(self, batch, mode="train"): audio_tokens=audio_codes, audio_lens=audio_codes_lens ).long() - audio_codes, audio_codes_lens = self.add_special_tokens( codes=audio_codes, codes_len=audio_codes_lens, From 76ce3d1545cff18507639f6d8e434331675e5458 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 10 Jan 2026 22:21:05 -0500 Subject: [PATCH 08/94] remove discriminatory temporarily Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/audio_codec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index 86097e134849..b91f32582ad4 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -110,7 +110,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.audio_decoder = instantiate(cfg.audio_decoder) # Discriminator setup - self.discriminator = instantiate(cfg.discriminator) + # self.discriminator = instantiate(cfg.discriminator) # Mel loss setup loss_resolutions = cfg.loss_resolutions From 6f3987ce82d7c969b1a79c4f37f09a2209c2792b Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Sat, 10 Jan 2026 08:07:24 +0000 Subject: [PATCH 09/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/audio_codec.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index b91f32582ad4..6ec1f8eb60e7 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -182,9 +182,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.speaker_encoder = ResNetSpeakerEncoder() # load pretrained model # 
self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") - self.speaker_encoder.load_checkpoint( - "/gitrepos/checkpoints/pytorch_model.bin", strict=False - ) + self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() logging.info("Speaker encoder loaded and frozen !!") From aefe97f06147e1e76cdfb310bcfaedeb4f257761 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 10 Jan 2026 22:42:48 -0500 Subject: [PATCH 10/94] fix errors Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 4bec12088047..b08f355d8e29 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -827,12 +827,12 @@ def log_val_audio_example( codes=pred_audio_codes, codes_len=audio_codes_lens_target, ) - pred_audio, pred_audio_lens = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target - 1) + pred_audio, pred_audio_lens, _ = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target - 1) target_audio_codes, _ = self.remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens_target, ) - target_audio, target_audio_lens = self.codes_to_audio(target_audio_codes, audio_codes_lens_target - 1) + target_audio, target_audio_lens, _ = self.codes_to_audio(target_audio_codes, audio_codes_lens_target - 1) context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: @@ -841,7 +841,7 @@ def log_val_audio_example( codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio, context_audio_lens = 
self.codes_to_audio(context_audio_codes, context_audio_codes_lens) + context_audio, context_audio_lens, _ = self.codes_to_audio(context_audio_codes, context_audio_codes_lens) for logger in self.loggers: is_wandb = isinstance(logger, WandbLogger) @@ -1941,7 +1941,7 @@ def infer_batch( ) predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) predicted_codes, predicted_codes_lens = self.remove_eos_token(predicted_codes, predicted_codes_lens) - predicted_audio, predicted_audio_lens = self.codes_to_audio(predicted_codes, predicted_codes_lens) + predicted_audio, predicted_audio_lens, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) end_time = time.time() total_audio_duration_generated = ( From 9d52822d9fbbad507fc981a631278383f2e5ab16 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 10 Jan 2026 23:47:24 -0500 Subject: [PATCH 11/94] bug fix Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index b08f355d8e29..d7d13d8e2310 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -349,6 +349,9 @@ def codes_to_audio(self, codes, codes_len): # codes: (B, C, T') # codes_len: (B,) self._codec_model.eval() + if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: + # Unstack the audio codes if they are stacked + codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): # Pass the modified integer token IDs if self._codec_converter is not None: From 90a6c541a187d7753c6ade72c8a8e07dc31c8bfb Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sun, 11 Jan 2026 03:13:31 -0500 Subject: [PATCH 12/94] add 
moe Signed-off-by: Paarth Neekhara --- .../tts/models/magpietts_decoder_only.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index d7d13d8e2310..c053a92309a2 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -169,16 +169,23 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - if cfg.transformer_hf_backend == "custom_qwen3_moe": - # from transformers.models import qwen3_moe - # config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(intermediate_size=3072, num_hidden_layers=5, num_experts=64) - # self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - from transformers.models import qwen2_moe - - config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 - ) - self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) + if cfg.transformer_hf_backend == "custom_qwen3_moe_5layer": + from transformers.models import qwen3_moe + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=64) + self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + elif cfg.transformer_hf_backend == "custom_qwen3_moe_10layer": + from transformers.models import qwen3_moe + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=10, num_experts=64) + self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + elif cfg.transformer_hf_backend == "custom_qwen3_moe_15layer": + from transformers.models import qwen3_moe + 
config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64) + self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + # from transformers.models import qwen2_moe + # config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( + # hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 + # ) + # self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) else: self.transformer_backend_config = AutoConfig.from_pretrained( cfg.transformer_hf_backend, @@ -352,6 +359,7 @@ def codes_to_audio(self, codes, codes_len): if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: # Unstack the audio codes if they are stacked codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) + with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): # Pass the modified integer token IDs if self._codec_converter is not None: From 324b8038f103ba9839489dd8ef6d1fcb9cd3361c Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Sun, 11 Jan 2026 08:14:21 +0000 Subject: [PATCH 13/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- .../tts/models/magpietts_decoder_only.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index c053a92309a2..4a16f01d5b12 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -171,15 +171,24 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): if cfg.transformer_hf_backend == "custom_qwen3_moe_5layer": from transformers.models import qwen3_moe - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=64) + + 
config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=64 + ) self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) elif cfg.transformer_hf_backend == "custom_qwen3_moe_10layer": from transformers.models import qwen3_moe - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=10, num_experts=64) + + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=10, num_experts=64 + ) self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) elif cfg.transformer_hf_backend == "custom_qwen3_moe_15layer": from transformers.models import qwen3_moe - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig(hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64) + + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64 + ) self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) # from transformers.models import qwen2_moe # config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( From 1c4a568d12833b7e5e75526dcd60b0cacd59bd1f Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sun, 11 Jan 2026 03:20:58 -0500 Subject: [PATCH 14/94] 20 layer moe Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/magpietts_decoder_only.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/magpietts_decoder_only.py index 4a16f01d5b12..b0b4e149bcf1 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/magpietts_decoder_only.py @@ -190,6 +190,13 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64 ) self.decoder 
= qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) + elif cfg.transformer_hf_backend == "custom_qwen3_moe_20layer": + from transformers.models import qwen3_moe + + config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( + hidden_size=1536, intermediate_size=3072, num_hidden_layers=20, num_experts=64 + ) + self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) # from transformers.models import qwen2_moe # config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( # hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 From a19012af4b283e98f56ba60f337866f54796ca63 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 22 Jan 2026 14:32:58 -0500 Subject: [PATCH 15/94] some refactoring and clean up Signed-off-by: Paarth Neekhara --- ..._decoder_only.yaml => easy_magpietts.yaml} | 0 ...lhotse.yaml => easy_magpietts_lhotse.yaml} | 0 ...etts_decoder_only.py => easy_magpietts.py} | 6 +- examples/tts/evalset_config.json | 36 ++- examples/tts/magpietts_inference.py | 10 + nemo/collections/tts/models/__init__.py | 4 +- nemo/collections/tts/models/audio_codec.py | 9 +- ...etts_decoder_only.py => easy_magpietts.py} | 245 +----------------- .../modules/magpietts_inference/inference.py | 4 +- .../tts/modules/magpietts_inference/utils.py | 6 +- 10 files changed, 71 insertions(+), 249 deletions(-) rename examples/tts/conf/magpietts/{magpietts_decoder_only.yaml => easy_magpietts.yaml} (100%) rename examples/tts/conf/magpietts/{magpietts_decoder_only_lhotse.yaml => easy_magpietts_lhotse.yaml} (100%) rename examples/tts/{magpietts_decoder_only.py => easy_magpietts.py} (91%) rename nemo/collections/tts/models/{magpietts_decoder_only.py => easy_magpietts.py} (87%) diff --git a/examples/tts/conf/magpietts/magpietts_decoder_only.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml similarity index 100% rename from examples/tts/conf/magpietts/magpietts_decoder_only.yaml rename to examples/tts/conf/magpietts/easy_magpietts.yaml diff --git 
a/examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml similarity index 100% rename from examples/tts/conf/magpietts/magpietts_decoder_only_lhotse.yaml rename to examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml diff --git a/examples/tts/magpietts_decoder_only.py b/examples/tts/easy_magpietts.py similarity index 91% rename from examples/tts/magpietts_decoder_only.py rename to examples/tts/easy_magpietts.py index 73bb87de7969..4195060b87ef 100644 --- a/examples/tts/magpietts_decoder_only.py +++ b/examples/tts/easy_magpietts.py @@ -16,13 +16,13 @@ import torch.multiprocessing as mp from omegaconf import OmegaConf -from nemo.collections.tts.models import MagpieTTSDecoderModel +from nemo.collections.tts.models import EasyMagpieTTSModel from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -@hydra_runner(config_path="conf/magpietts", config_name="magpietts_decoderonly_en") +@hydra_runner(config_path="conf/magpietts", config_name="easy_magpietts") def main(cfg): logging.info('\nConfig Params:\n%s', OmegaConf.to_yaml(cfg, resolve=True)) @@ -42,7 +42,7 @@ def main(cfg): trainer.callbacks.append(pl.callbacks.LearningRateMonitor(logging_interval='step', log_weight_decay=True)) exp_manager(trainer, cfg.get("exp_manager", None)) - model = MagpieTTSDecoderModel(cfg=cfg.model, trainer=trainer) + model = EasyMagpieTTSModel(cfg=cfg.model, trainer=trainer) model.maybe_init_from_pretrained_checkpoint(cfg=cfg) if cfg.get('mode', 'train') == 'train': diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 4ff4d12ad9eb..029f818ef53b 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -14,10 +14,44 @@ "audio_dir": "/", "feature_dir": null }, + "riva_multibpe": { + "manifest_path": "/Data/evaluation_manifests/riva_hard_multi_bpe.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": 
"/Data/RIVA-TTS" + }, "riva_hard_digits": { "manifest_path": "/Data/evaluation_manifests/hard-digits-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", "feature_dir": "/Data/RIVA-TTS" + }, + "riva_hard_letters": { + "manifest_path": "/Data/evaluation_manifests/hard-letters-path-corrected.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": "/Data/RIVA-TTS" + }, + "riva_hard_money": { + "manifest_path": "/Data/evaluation_manifests/hard-money-path-corrected.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": "/Data/RIVA-TTS" + }, + "riva_hard_short": { + "manifest_path": "/Data/evaluation_manifests/hard-short-path-corrected.ndjson", + "audio_dir": "/Data/RIVA-TTS", + "feature_dir": "/Data/RIVA-TTS" + }, + "vctk": { + "manifest_path": "/Data/evaluation_manifests/smallvctk__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withcontextaudiopaths_silence_trimmed.json", + "audio_dir": "/Data/VCTK-Corpus-0.92", + "feature_dir": "/Data/VCTK-Corpus-0.92" + }, + "libritts_seen": { + "manifest_path": "/Data/evaluation_manifests/LibriTTS_seen_evalset_from_testclean_v2.json", + "audio_dir": "/Data/LibriTTS", + "feature_dir": "/Data/LibriTTS" + }, + "libritts_test_clean": { + "manifest_path": "/Data/evaluation_manifests/LibriTTS_test_clean_withContextAudioPaths.jsonl", + "audio_dir": "/Data/LibriTTS", + "feature_dir": "/Data/LibriTTS" } } - diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index 3199f58e9970..1e7753798db4 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -193,6 +193,8 @@ def run_inference_and_evaluation( model, checkpoint_name = load_magpie_model( model_config, is_decoder_only_model=inference_config.is_decoder_only_model ) + # change model to fp32 for inference + model = model.float() # Log architecture summary and get MoE info + FLOPs metrics moe_info, flops_per_component = log_model_architecture_summary(model) @@ -551,6 +553,14 @@ def 
main(argv=None): else: model_inference_parameters[field_name] = arg_from_cmdline + if "max_decoder_steps" not in model_inference_parameters: + if args.longform_mode in {'always', 'auto'}: + model_inference_parameters["max_decoder_steps"] = args.longform_max_decoder_steps + elif args.is_decoder_only_model: + model_inference_parameters["max_decoder_steps"] = 220 + else: + model_inference_parameters["max_decoder_steps"] = 440 + inference_config = InferenceConfig( model_inference_parameters=ModelInferenceParameters.from_dict(model_inference_parameters), batch_size=args.batch_size, diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 6e781bed19ef..d9f406a3ba3d 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -18,7 +18,7 @@ from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL from nemo.collections.tts.models.hifigan import HifiGanModel from nemo.collections.tts.models.magpietts import InferBatchOutput, MagpieTTSModel -from nemo.collections.tts.models.magpietts_decoder_only import MagpieTTSDecoderModel +from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.models.magpietts_preference_optimization import ( MagpieTTSModelOfflinePO, MagpieTTSModelOfflinePODataGen, @@ -35,7 +35,7 @@ "HifiGanModel", "InferBatchOutput", "MagpieTTSModel", - "MagpieTTSDecoderModel", + "EasyMagpieTTSModel", "MagpieTTSModelOfflinePODataGen", "MagpieTTSModelOfflinePO", "MagpieTTSModelOnlinePO", diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index 6ec1f8eb60e7..d5c1afb3a5bf 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -182,7 +182,14 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.speaker_encoder = ResNetSpeakerEncoder() # load pretrained model # 
self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") - self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) + import os + # TODO: revert this + if os.path.exists("/gitrepos/checkpoints/pytorch_model.bin"): + self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) + else: + self.speaker_encoder.load_checkpoint( + "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", strict=False + ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() logging.info("Speaker encoder loaded and frozen !!") diff --git a/nemo/collections/tts/models/magpietts_decoder_only.py b/nemo/collections/tts/models/easy_magpietts.py similarity index 87% rename from nemo/collections/tts/models/magpietts_decoder_only.py rename to nemo/collections/tts/models/easy_magpietts.py index b0b4e149bcf1..bab703c242ad 100644 --- a/nemo/collections/tts/models/magpietts_decoder_only.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -59,7 +59,7 @@ def worker_init_fn(worker_id): dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) -class MagpieTTSDecoderModel(ModelPT): +class EasyMagpieTTSModel(ModelPT): """ Magpie-TTS Model Decoder Only Model audio/text @@ -107,7 +107,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.codec_model_samples_per_frame = codec_model.samples_per_frame # Our codebooks start with actual audio codec tokens, followed by special tokens. # The `forced_*` options are for backward compatibility for models trained with older code. - num_audio_tokens = codec_model.codebook_size # Our codebooks start with actual audio codec tokens, followed by special tokens. # The `forced_*` options are for backward compatibility for models trained with older code. 
get_token_index = partial(SpecialAudioToken.get_index, base_codebook_size=self.codebook_size) @@ -468,47 +467,6 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ return all_code_logits - def maskgit_create_random_mask(self, codes): - """ - Creates a mask where True indicates the positions that should be replaced with a MASK_TOKEN. - """ - # Codes: (B, C, T) - B, C, T = codes.shape - # get a uniform random vector uniformly sampled from [0,1) ## Todo does it need to be inclusive on the right? - rand_values = torch.rand(B, T, device=codes.device) - # apply the cosine schedule - frac_masked = cosine_schedule(rand_values) - # how many positions to mask - n_masked = torch.ceil(frac_masked * C).long() # B,T - # start from all unmasked - mask = torch.zeros_like(codes, dtype=torch.bool) - # The code further below is the vectorized version of this: - # for b in range(B): - # for t in range(T): - # if n_masked[b,t] > 0: - # # get a random permutation of the codebook indices - # perm = torch.randperm(C) - # # mask the top n_masked positions - # mask[b, perm[:n_masked[b,t]], t] = True - # - # Create random permutations - random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) - # Create a mask tensor where each position indicates if it should be masked - mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) - mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) - # Apply the random permutations to the mask - mask = torch.gather(mask, 1, random_permutations) - - return mask # (B, C, T) - - def maskgit_apply_random_mask(self, codes): - # Randomly replaces some codes with the MASK_TOKEN with a proportion following the cosine schedule. 
- # Codes: (B, C, T) - mask = self.maskgit_create_random_mask(codes) - ## replace some tokens with MASK_TOKEN - codes_with_mask = torch.where(mask, self.mask_token_id, codes) - return codes_with_mask, mask - def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=None): """ Computes the audio codebook loss. Used by @@ -601,128 +559,6 @@ def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): return all_preds - def local_transformer_sample_maskgit( - self, - dec_output, - temperature=0.7, - topk=80, - unfinished_items={}, - finished_items={}, - use_cfg=False, - cfg_scale=1.0, - n_steps=3, - ): - """ - Sample codes for one timestep from the local transformer using MaskGit. - """ - if self.frame_stacking_factor > 1: - raise NotImplementedError("MaskGit sampling is not implemented for frame stacking factor > 1") - # dec_output: (B, E) - device = dec_output.device - # disable KV cache since our transformer is not causal - self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input_init = self.local_transformer_in_projection( - dec_output - ) # (B, 1, D) where D is the dimension of the local transformer - C = self.num_audio_codebooks - B = dec_output.size(0) - - min_confidence = float("-inf") - max_confidence = 10000 # this needs to be large enough that unmasked items will always remain unmasked. # TODO @rfejgin: use float('inf')? - confidences = min_confidence * torch.ones(B, C, device=device) - # initialize to all masked - codes = self.mask_token_id * torch.ones((B, C), device=device, dtype=torch.long) - sampled_codes = codes.clone() - for step in range(n_steps): - # get mask fraction - frac_masked = cosine_schedule(torch.tensor(step / (n_steps))) - # how many codebooks to mask - n_masked = torch.ceil( - C * frac_masked - ).long() # TODO @rfejgin: should we force this to be initialized to exactly `C` (to avoid numerical issues)? 
- n_unmasked = C - n_masked - # pick top-confidence codebooks up to n_unmasked - _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) - - # replace masks of the top-k confident codebooks with the the codes that were sampled for them - unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) - codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - - # build transformer input - local_transformer_input = local_transformer_input_init - for codebook_num in range(C): - next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze( - 1 - ) # (B, 1, 768) - next_local_transformer_input = self.local_transformer_in_projection( - next_local_transformer_input - ) # (B, 1, d_local) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, codebook_num+1, d_local) - - # run transformer - _mask = torch.ones(B, C + 1, device=device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)[ - 'output' - ] # (B, C+1, d_local) - - # get logits - logits = [] - for codebook_num in range(C): - # The `codebook_num+1` is to drop first position which corresponds to the magpie latent - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, codebook_num + 1, :] - ) # (B, num_audio_tokens_per_codebook) - logits.append(codebook_logits) - logits = torch.stack(logits, dim=1) # (B, C, num_audio_tokens_per_codebook) - - # apply CFG - if use_cfg: - actual_batch_size = logits.size(0) // 2 - conditional_logits = logits[:actual_batch_size] - unconditional_logits = logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - logits[:actual_batch_size] = cfg_logits - - # handle unfinished and finished items - for item_idx in unfinished_items: - logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - logits[item_idx, 
:, :] = float('-inf') - logits[item_idx, :, self.audio_eos_id] = 0.0 - - # sample with top-k - logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) - indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) - logits_rescored = logits.clone() - logits_rescored[indices_to_remove] = float('-inf') - probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) - sampled_codes = torch.multinomial(probs.view(B * C, -1), 1).view(B, C) - if use_cfg: - # TODO @rfejgin: why do we need to keep second half of the batch? can probably optimize this - sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] - probs[actual_batch_size:] = probs[:actual_batch_size] - confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) - - # set confidence to max for unmasked codebooks so that they will remain unmasked - confidences.scatter_( - index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) - ) - - # replace entries in sampled_codes with previously unmasked codebooks - sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - # optionally: add noise to confidences here (as in token-critic paper) (not implemented) - - codes = sampled_codes - assert not ( - codes == self.mask_token_id - ).any(), f"Codes contain mask tokens after completion of MaskGit sampling" - if use_cfg: - codes = codes[:actual_batch_size] - return codes - def local_transformer_sample_autoregressive( self, dec_output, @@ -1300,25 +1136,13 @@ def process_batch(self, batch, mode="train"): local_transformer_loss = None local_transformer_logits = None if self.local_transformer_type != LocalTransformerType.NO_LT: - if self.local_transformer_type == LocalTransformerType.MASKGIT: - # randomly replace some positions with MASK_TOKEN - audio_codes_masked, mask_tokens_mask = self.maskgit_apply_random_mask(audio_codes_target) - 
local_transformer_logits = self.compute_local_transformer_logits( - pred_embeddings, audio_codes_masked, targets_offset_by_one=True - ) - # audio_codes_masked = audio_codes_masked[:, 1:, :] - local_transformer_loss, _ = self.compute_loss( - local_transformer_logits, audio_codes_target, audio_codes_lens_target, mask_tokens_mask - ) - else: - # autoregressive - assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" - local_transformer_logits = self.compute_local_transformer_logits( - pred_embeddings, audio_codes_target, targets_offset_by_one=False - ) - local_transformer_loss, _ = self.compute_loss( - local_transformer_logits, audio_codes_target, audio_codes_lens_target, None - ) + assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" + local_transformer_logits = self.compute_local_transformer_logits( + pred_embeddings, audio_codes_target, targets_offset_by_one=False + ) + local_transformer_loss, _ = self.compute_loss( + local_transformer_logits, audio_codes_target, audio_codes_lens_target, None + ) local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss @@ -1438,50 +1262,6 @@ def validation_step(self, batch, batch_idx): if isinstance(logger, WandbLogger) and wandb_log_dict: logger.experiment.log(wandb_log_dict) - # infer_output_no_cfg_noLT = self.infer_batch( - # batch, - # max_decoder_steps=500, - # temperature=0.7, - # topk=80, - # use_local_transformer_for_inference=False, - # maskgit_n_steps=3, - # use_cfg=False, - # cfg_scale=1.0 - # ) - # infer_output_cfg_withLT = self.infer_batch( - # batch, - # max_decoder_steps=500, - # temperature=0.7, - # topk=80, - # use_local_transformer_for_inference=self.local_transformer_type != LocalTransformerType.NO_LT, - # maskgit_n_steps=3, - # use_cfg=True, - # cfg_scale=2.5 - # ) - # pred_audio_no_cfg_noLT, pred_audio_no_cfg_noLT_lens = 
infer_output_no_cfg_noLT[0], infer_output_no_cfg_noLT[1] - # pred_audio_cfg_withLT, pred_audio_cfg_withLT_lens = infer_output_cfg_withLT[0], infer_output_cfg_withLT[1] - - # for logger in self.loggers: - # is_wandb = isinstance(logger, WandbLogger) - # is_tb = isinstance(logger, TensorBoardLogger) - # if not is_wandb and not is_tb: - # raise ValueError(f"Invalid logger type for audio logging: {type(logger)}. Only `WandbLogger` and `TensorBoardLogger` are supported.") - # for idx in range(pred_audio_no_cfg_noLT.size(0)): - # pred_audio_no_cfg_noLT_idx = pred_audio_no_cfg_noLT[idx][:pred_audio_no_cfg_noLT_lens[idx]].float().cpu().numpy() - # pred_audio_cfg_withLT_idx = pred_audio_cfg_withLT[idx][:pred_audio_cfg_withLT_lens[idx]].float().cpu().numpy() - # if is_wandb: - # logger.experiment.log({ - # "val/pred_audio_no_cfg_noLT": wandb.Audio(pred_audio_no_cfg_noLT_idx, sample_rate=self.sample_rate, caption="Inference No CFG, No LT"), - # "val/pred_audio_cfg_withLT": wandb.Audio(pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, caption="Inference CFG, With LT"), - # }) - # if is_tb: - # logger.experiment.add_audio( - # "val/pred_audio_no_cfg_noLT", pred_audio_no_cfg_noLT_idx, sample_rate=self.sample_rate, global_step=batch_idx - # ) - # logger.experiment.add_audio( - # "val/pred_audio_cfg_withLT", pred_audio_cfg_withLT_idx, sample_rate=self.sample_rate, global_step=batch_idx - # ) - local_transformer_loss = batch_output['local_transformer_loss'] val_output = { 'val_loss': loss, @@ -1780,15 +1560,6 @@ def infer_batch( use_cfg=use_cfg, cfg_scale=cfg_scale, ) - elif self.local_transformer_type == LocalTransformerType.MASKGIT: - audio_codes_next = self.local_transformer_sample_maskgit( - dec_output=last_hidden[:, -1, :], - temperature=temperature, - topk=topk, - n_steps=maskgit_n_steps, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - ) else: raise ValueError( f"Local transformer inference requested by but local transformer type is {self.local_transformer_type}" diff 
--git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 19e5793a892b..34ba8d62c730 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -34,7 +34,7 @@ from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset -from nemo.collections.tts.models import MagpieTTSDecoderModel, MagpieTTSModel +from nemo.collections.tts.models import EasyMagpieTTSModel, MagpieTTSModel from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging @@ -134,7 +134,7 @@ class MagpieInferenceRunner: def __init__( self, # model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel - model: Union[MagpieTTSModel, MagpieTTSDecoderModel], + model: Union[MagpieTTSModel, EasyMagpieTTSModel], config: InferenceConfig, ): """Initialize the inference runner. 
diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index cce2855dd82b..d7dd672867c3 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -28,7 +28,7 @@ import torch from omegaconf import DictConfig, OmegaConf, open_dict -from nemo.collections.tts.models import MagpieTTSDecoderModel, MagpieTTSModel +from nemo.collections.tts.models import EasyMagpieTTSModel, MagpieTTSModel from nemo.utils import logging @@ -255,7 +255,7 @@ def update_checkpoint_state_dict(state_dict: dict) -> dict: def load_magpie_model( config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False -) -> Tuple[Union[MagpieTTSModel, MagpieTTSDecoderModel], str]: +) -> Tuple[Union[MagpieTTSModel, EasyMagpieTTSModel], str]: """Load a MagpieTTS model from checkpoint or NeMo archive. Supports two loading modes: @@ -273,7 +273,7 @@ def load_magpie_model( ValueError: If configuration is invalid or sample rates don't match. 
""" config.validate() - model_cls = MagpieTTSDecoderModel if is_decoder_only_model else MagpieTTSModel + model_cls = EasyMagpieTTSModel if is_decoder_only_model else MagpieTTSModel if config.hparams_file is not None and config.checkpoint_file is not None: # Mode 1: Load from hparams + checkpoint model_cfg = OmegaConf.load(config.hparams_file) From d88eda2ad5e7c24b838c5c23161c24ede71919f5 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 22 Jan 2026 19:33:55 +0000 Subject: [PATCH 16/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/__init__.py | 2 +- nemo/collections/tts/models/audio_codec.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index d9f406a3ba3d..20984cfccc6a 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -14,11 +14,11 @@ from nemo.collections.tts.models.aligner import AlignerModel from nemo.collections.tts.models.audio_codec import AudioCodecModel +from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.models.fastpitch import FastPitchModel from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL from nemo.collections.tts.models.hifigan import HifiGanModel from nemo.collections.tts.models.magpietts import InferBatchOutput, MagpieTTSModel -from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.models.magpietts_preference_optimization import ( MagpieTTSModelOfflinePO, MagpieTTSModelOfflinePODataGen, diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index d5c1afb3a5bf..de11bb4f9229 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -183,13 +183,15 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # load pretrained 
model # self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") import os + # TODO: revert this if os.path.exists("/gitrepos/checkpoints/pytorch_model.bin"): self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) else: self.speaker_encoder.load_checkpoint( - "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", strict=False - ) + "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", + strict=False, + ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() logging.info("Speaker encoder loaded and frozen !!") From 122af0ab96dac2129aa53e69ea9196ff7fb8c773 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 27 Jan 2026 19:29:58 -0500 Subject: [PATCH 17/94] bug fix related to spectral codec Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index bab703c242ad..120b63aef46c 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -845,14 +845,15 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): if 'context_audio_codes' in batch: context_audio_codes = batch['context_audio_codes'] context_audio_codes_lens = batch['context_audio_codes_lens'] - if self._codec_converter is not None: - context_audio_codes = self._codec_converter.convert_original_to_new( - audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens - ).long() else: context_audio_codes, context_audio_codes_lens = self.audio_to_codes( batch['context_audio'], batch['context_audio_lens'] ) + + if self._codec_converter is not None: + context_audio_codes = self._codec_converter.convert_original_to_new( + 
audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens + ).long() context_audio_codes, context_audio_codes_lens = self.add_special_tokens( codes=context_audio_codes, @@ -1050,10 +1051,11 @@ def process_batch(self, batch, mode="train"): else: audio_codes = batch['audio_codes'] audio_codes_lens = batch['audio_codes_lens'] - if self._codec_converter is not None: - audio_codes = self._codec_converter.convert_original_to_new( - audio_tokens=audio_codes, audio_lens=audio_codes_lens - ).long() + + if self._codec_converter is not None: + audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=audio_codes, audio_lens=audio_codes_lens + ).long() audio_codes, audio_codes_lens = self.add_special_tokens( codes=audio_codes, From 59208f1913b47df67d1973c2a9b621222e81035f Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Wed, 28 Jan 2026 00:31:15 +0000 Subject: [PATCH 18/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 120b63aef46c..421d80c453fa 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -849,7 +849,7 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): context_audio_codes, context_audio_codes_lens = self.audio_to_codes( batch['context_audio'], batch['context_audio_lens'] ) - + if self._codec_converter is not None: context_audio_codes = self._codec_converter.convert_original_to_new( audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens @@ -1051,7 +1051,7 @@ def process_batch(self, batch, mode="train"): else: audio_codes = batch['audio_codes'] audio_codes_lens = batch['audio_codes_lens'] - + if self._codec_converter is not None: audio_codes = self._codec_converter.convert_original_to_new( 
audio_tokens=audio_codes, audio_lens=audio_codes_lens From 3c8bb40067c4c98304c0929f613e339e0dc0850b Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 28 Jan 2026 15:32:36 -0500 Subject: [PATCH 19/94] some clean up Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 77 ++++--------------- 1 file changed, 14 insertions(+), 63 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 421d80c453fa..35d7c73d54fa 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -135,7 +135,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) num_tokens_tokenizer = len(self.tokenizer.tokens) - num_tokens = num_tokens_tokenizer + 3 # +2 for BOS and EOS + num_tokens = num_tokens_tokenizer + 3 # +3 for BOS, EOS, CFG_UNK self.bos_id = num_tokens - 3 self.eos_id = num_tokens - 2 self.cfg_unk_token_id = num_tokens - 1 @@ -168,48 +168,14 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - if cfg.transformer_hf_backend == "custom_qwen3_moe_5layer": - from transformers.models import qwen3_moe - - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=64 - ) - self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - elif cfg.transformer_hf_backend == "custom_qwen3_moe_10layer": - from transformers.models import qwen3_moe - - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=10, num_experts=64 - ) - self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - elif cfg.transformer_hf_backend == "custom_qwen3_moe_15layer": - from transformers.models import qwen3_moe 
- - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=15, num_experts=64 - ) - self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - elif cfg.transformer_hf_backend == "custom_qwen3_moe_20layer": - from transformers.models import qwen3_moe - - config = qwen3_moe.configuration_qwen3_moe.Qwen3MoeConfig( - hidden_size=1536, intermediate_size=3072, num_hidden_layers=20, num_experts=64 - ) - self.decoder = qwen3_moe.modeling_qwen3_moe.Qwen3MoeModel(config) - # from transformers.models import qwen2_moe - # config_qwen2 = qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig( - # hidden_size=1536, intermediate_size=3072, num_hidden_layers=5, num_experts=32 - # ) - # self.decoder = qwen2_moe.modeling_qwen2_moe.Qwen2MoeModel(config_qwen2) - else: - self.transformer_backend_config = AutoConfig.from_pretrained( - cfg.transformer_hf_backend, - trust_remote_code=True, - ) + self.transformer_backend_config = AutoConfig.from_pretrained( + cfg.transformer_hf_backend, + trust_remote_code=True, + ) - hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) - self.decoder = hf_transformer.model - self.lm_text_head = hf_transformer.lm_head + hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) + self.decoder = hf_transformer.model + self.lm_text_head = hf_transformer.lm_head self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) @@ -467,32 +433,18 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ return all_code_logits - def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=None): + def compute_loss(self, logits, audio_codes, audio_codes_lens): """ Computes the audio codebook loss. 
Used by (1) The main Magpie-TTS transformer - (2) The local transformer, for both autoregressive and MaskGit methods + (2) The local transformer logits: (B, T', num_codebooks * num_tokens_per_codebook) audio_codes: (B, C, T') audio_codes_lens: (B,) - mask_tokens_mask: (B, C, T') True for tokens that were replaced with the MASK_TOKEN and should - therefore be the only ones included in the loss computation. """ loss_mask = get_mask_from_lengths(audio_codes_lens) - if mask_tokens_mask is not None: - # For MaskGit we only compute loss for the masked tokens. - # *Both* conditions must be true: - # 1. the token is masked - # 2. the token is not padding - loss_mask = loss_mask.unsqueeze(1) * mask_tokens_mask - if not loss_mask.any(): - # Without this we were very rarely getting NaNs in the loss - logging.warning("No tokens valid were found in compute_loss()!") - return torch.tensor(0.0, device=loss_mask.device), loss_mask - else: - # repeat loss mask for each codebook to simplify code below - loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) + loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) total_codebook_loss = None for codebook in range(audio_codes.size(1)): si = codebook * self.num_all_tokens_per_codebook @@ -818,7 +770,6 @@ def join_embeddings_temporally( return joined, out_lengths def prepare_context_tensors(self, batch, dropout_text_input=False): - # Transcript text = batch['text'] text_lens = batch['text_lens'] text_embedded = self.decoder.get_input_embeddings()(text) @@ -1131,8 +1082,8 @@ def process_batch(self, batch, mode="train"): ) logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) - # import ipdb; ipdb.set_trace() - codebook_loss, loss_mask = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) + + codebook_loss, _ = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) loss = codebook_loss local_transformer_loss = None @@ -1143,7 +1094,7 @@ 
def process_batch(self, batch, mode="train"): pred_embeddings, audio_codes_target, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( - local_transformer_logits, audio_codes_target, audio_codes_lens_target, None + local_transformer_logits, audio_codes_target, audio_codes_lens_target ) local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss From 2067ae944ee027ee57b12112ea200632b2f53cbf Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 28 Jan 2026 16:01:52 -0500 Subject: [PATCH 20/94] add docstrings and data classes Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 429 ++++++++++++++---- 1 file changed, 348 insertions(+), 81 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 35d7c73d54fa..d9a6705d4a74 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -13,8 +13,9 @@ # limitations under the License. import random import time +from dataclasses import dataclass from functools import partial -from typing import List, Sequence, Tuple +from typing import List, Optional, Sequence, Tuple import torch import wandb @@ -47,6 +48,68 @@ from nemo.utils import logging +@dataclass +class ContextTensors: + """ + Output dataclass from prepare_context_tensors containing all context-related tensors. 
+ + Attributes: + context_embedding: Combined context embedding tensor (B, T_total, E) + context_lens: Length of context for each batch item (B,) + context_audio_codes: Audio codes for context audio (B, C, T') + context_audio_embedded: Embedded context audio codes (B, T', E) + context_audio_codes_lens: Length of context audio codes (B,) + text_embedded: Embedded text tokens (B, L, E) + text_lens: Length of text for each batch item (B,) + context_text_tokens: Context text token IDs (B, L) + context_text_lens: Length of context text (B,) + remaining_text_embedded: Embedded remaining text for streaming mode, None otherwise (B, T, E) + remaining_text_lens: Length of remaining text for streaming mode, None otherwise (B,) + """ + + context_embedding: torch.Tensor + context_lens: torch.Tensor + context_audio_codes: torch.Tensor + context_audio_embedded: torch.Tensor + context_audio_codes_lens: torch.Tensor + text_embedded: torch.Tensor + text_lens: torch.Tensor + context_text_tokens: torch.Tensor + context_text_lens: torch.Tensor + remaining_text_embedded: Optional[torch.Tensor] + remaining_text_lens: Optional[torch.Tensor] + + +@dataclass +class ProcessBatchOutput: + """ + Output dataclass from process_batch containing loss values and model predictions. 
+ + Attributes: + loss: Total combined loss (codebook_loss + phoneme_loss + local_transformer_loss) + codebook_loss: Loss for audio codebook prediction + phoneme_loss: Loss for phoneme prediction (None if phoneme_tokenizer is not used) + local_transformer_loss: Loss from local transformer (None if not using local transformer) + local_transformer_logits: Logits from local transformer, shape (B, T', num_codebooks * num_tokens_per_codebook) + logits: Predicted logits from the main decoder, shape (B, T', num_codebooks * num_tokens_per_codebook) + audio_codes_target: Target audio codes for the decoder, shape (B, C, T') + audio_codes_lens_target: Length of target audio codes for each batch item, shape (B,) + context_audio_codes: Audio codes extracted from context audio, shape (B, C, T') + context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) + """ + + loss: torch.Tensor + codebook_loss: torch.Tensor + phoneme_loss: Optional[torch.Tensor] + local_transformer_loss: Optional[torch.Tensor] + local_transformer_logits: Optional[torch.Tensor] + logits: torch.Tensor + audio_codes_target: torch.Tensor + audio_codes_lens_target: torch.Tensor + context_audio_codes: torch.Tensor + context_audio_codes_lens: torch.Tensor + + def worker_init_fn(worker_id): # For mp.set_start_method("spawn", force=True) # The dataset class should be picklable, so we initialize non-picklable objects here @@ -769,9 +832,58 @@ def join_embeddings_temporally( return joined, out_lengths - def prepare_context_tensors(self, batch, dropout_text_input=False): - text = batch['text'] - text_lens = batch['text_lens'] + def prepare_context_tensors( + self, + text: torch.Tensor, + text_lens: torch.Tensor, + context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + context_audio_codes: Optional[torch.Tensor] = None, + context_audio_codes_lens: Optional[torch.Tensor] = None, + context_audio: Optional[torch.Tensor] = None, + context_audio_lens: 
Optional[torch.Tensor] = None, + dropout_text_input: bool = False, + ) -> ContextTensors: + """ + Prepare context tensors for the EasyMagpieTTS model. + + This function processes the input text, context audio, and context text to create + the combined context embedding that will be fed to the transformer decoder. It handles + both 'full' and 'streaming' text input modes. + + Args: + text: Input text token IDs (B, L) + text_lens: Length of text for each batch item (B,) + context_text_tokens: Context text token IDs for speaker/style conditioning (B, L) + context_text_tokens_lens: Length of context text for each batch item (B,) + context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). + If None, will be computed from context_audio. + context_audio_codes_lens: Length of context audio codes (B,). + Required if context_audio_codes is provided. + context_audio: Raw context audio waveform (B, T). + Used to compute context_audio_codes if not provided. + context_audio_lens: Length of context audio (B,). + Required if context_audio is provided. + dropout_text_input: If True, zero out the text embedding for classifier-free guidance. 
+ + Returns: + ContextTensors: A dataclass containing all prepared context tensors including: + - context_embedding: Combined context embedding (B, T_total, E) + - context_lens: Total context length per batch item (B,) + - context_audio_codes: Processed audio codes with special tokens (B, C, T') + - context_audio_embedded: Embedded context audio (B, T', E) + - context_audio_codes_lens: Length of processed context audio codes (B,) + - text_embedded: Embedded text tokens (B, L, E) + - text_lens: Text length per batch item (B,) + - context_text_tokens: Context text token IDs (B, L) + - context_text_lens: Context text length per batch item (B,) + - remaining_text_embedded: For streaming mode, embedded remaining text (B, T, E) + - remaining_text_lens: For streaming mode, remaining text length (B,) + + Raises: + ValueError: If neither context_audio_codes nor context_audio is provided. + ValueError: If text_input_mode is not 'full' or 'streaming'. + """ text_embedded = self.decoder.get_input_embeddings()(text) if self.use_bpe_char_tokenizer: text_mask = get_mask_from_lengths(text_lens) @@ -793,13 +905,10 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): text_embedded = text_embedded * 0.0 # Context Audio - if 'context_audio_codes' in batch: - context_audio_codes = batch['context_audio_codes'] - context_audio_codes_lens = batch['context_audio_codes_lens'] - else: - context_audio_codes, context_audio_codes_lens = self.audio_to_codes( - batch['context_audio'], batch['context_audio_lens'] - ) + if context_audio_codes is None: + if context_audio is None: + raise ValueError("Either context_audio_codes or context_audio must be provided") + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) if self._codec_converter is not None: context_audio_codes = self._codec_converter.convert_original_to_new( @@ -824,8 +933,7 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): context_audio_embedded = 
self.embed_audio_tokens(context_audio_codes) # (B, T', E) # Context Text - context_text_tokens = batch['context_text_tokens'] - context_text_lens = batch['context_text_tokens_lens'] + context_text_lens = context_text_tokens_lens context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) remaining_text_embedded = None @@ -850,19 +958,19 @@ def prepare_context_tensors(self, batch, dropout_text_input=False): else: raise ValueError(f"Invalid text input mode: {self.text_input_mode}") - return { - 'context_embedding': context_embedding, # (B, T_total, E) - 'context_lens': context_lens, # (B,) - 'context_audio_codes': context_audio_codes, # (B, C, T') - 'context_audio_embedded': context_audio_embedded, # (B, T', E) - 'context_audio_codes_lens': context_audio_codes_lens, # (B,) - 'text_embedded': text_embedded, # (B, L, E) - 'text_lens': text_lens, # (B,) - 'context_text_tokens': context_text_tokens, # (B, L) - 'context_text_lens': context_text_lens, # (B,) - 'remaining_text_embedded': remaining_text_embedded, # (B, T, E) - 'remaining_text_lens': remaining_text_lens, # (B,) - } + return ContextTensors( + context_embedding=context_embedding, + context_lens=context_lens, + context_audio_codes=context_audio_codes, + context_audio_embedded=context_audio_embedded, + context_audio_codes_lens=context_audio_codes_lens, + text_embedded=text_embedded, + text_lens=text_lens, + context_text_tokens=context_text_tokens, + context_text_lens=context_text_lens, + remaining_text_embedded=remaining_text_embedded, + remaining_text_lens=remaining_text_lens, + ) def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): """ @@ -968,19 +1076,98 @@ def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, con ) return phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens - def process_batch(self, batch, mode="train"): + def process_batch( + self, + text: torch.Tensor, + text_lens: torch.Tensor, 
+ context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + audio: Optional[torch.Tensor] = None, + audio_lens: Optional[torch.Tensor] = None, + audio_codes: Optional[torch.Tensor] = None, + audio_codes_lens: Optional[torch.Tensor] = None, + context_audio: Optional[torch.Tensor] = None, + context_audio_lens: Optional[torch.Tensor] = None, + context_audio_codes: Optional[torch.Tensor] = None, + context_audio_codes_lens: Optional[torch.Tensor] = None, + phoneme_tokens: Optional[torch.Tensor] = None, + phoneme_tokens_lens: Optional[torch.Tensor] = None, + mode: str = "train", + ) -> ProcessBatchOutput: + """ + Process a batch of inputs to compute model outputs and losses. + + This function performs the following steps: + 1. Prepares context tensors from text and audio inputs + 2. Optionally applies dropout to text/phoneme inputs for regularization + 3. Optionally applies classifier-free guidance (CFG) unconditional training + 4. Converts audio to codes if not already provided + 5. Embeds audio codes and combines with context embeddings + 6. Runs the transformer forward pass + 7. 
Computes codebook loss, phoneme loss (if applicable), and local transformer loss (if applicable) + + Args: + text: Input text token IDs, shape (B, L) + text_lens: Length of text for each batch item, shape (B,) + context_text_tokens: Context text token IDs for conditioning, shape (B, L_ctx) + context_text_tokens_lens: Length of context text for each batch item, shape (B,) + audio: Raw audio waveform (used if audio_codes not provided), shape (B, T_audio) + audio_lens: Length of audio for each batch item, shape (B,) + audio_codes: Pre-computed audio codes (optional, computed from audio if not provided), shape (B, C, T) + audio_codes_lens: Length of audio codes for each batch item, shape (B,) + context_audio: Raw context audio waveform (optional), shape (B, T_ctx_audio) + context_audio_lens: Length of context audio for each batch item, shape (B,) + context_audio_codes: Pre-computed context audio codes (optional), shape (B, C, T_ctx) + context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) + phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, P, L_phoneme) + phoneme_tokens_lens: Length of phoneme tokens for each batch item, shape (B,) + mode: Training mode, either "train" or "val". Affects dropout behavior. 
+ + Returns: + ProcessBatchOutput: Dataclass containing: + - loss: Total combined loss + - codebook_loss: Loss for audio codebook prediction + - phoneme_loss: Loss for phoneme prediction (None if not using phonemes) + - local_transformer_loss: Loss from local transformer (None if not used) + - local_transformer_logits: Logits from local transformer + - logits: Predicted logits from the main decoder + - audio_codes_target: Target audio codes + - audio_codes_lens_target: Length of target audio codes + - context_audio_codes: Audio codes from context + - context_audio_codes_lens: Length of context audio codes + """ + # Determine whether to apply text/phoneme dropout for regularization during training + # Text dropout: randomly drop text input to encourage the model to rely on other signals dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False + # Phoneme dropout: randomly drop phoneme input, but only if text is not already dropped + # This ensures we don't drop both simultaneously dropout_phoneme_input = ( ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) if mode == 'train' else False ) - context_tensors = self.prepare_context_tensors(batch, dropout_text_input) - # print("text lens", context_tensors['text_lens']) - remaining_text_embedded = context_tensors['remaining_text_embedded'] - context_embedding = context_tensors['context_embedding'] - context_lens = context_tensors['context_lens'] + # Prepare context tensors by combining text and audio context information + context_tensors = self.prepare_context_tensors( + text=text, + text_lens=text_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_audio=context_audio, + context_audio_lens=context_audio_lens, + dropout_text_input=dropout_text_input, + ) + + # Extract context tensors for use in 
the forward pass + remaining_text_embedded = context_tensors.remaining_text_embedded + context_embedding = context_tensors.context_embedding + context_lens = context_tensors.context_lens + + # Classifier-Free Guidance (CFG) unconditional training: + # With some probability, replace the context with a special unconditional token + # This allows the model to generate without conditioning during inference dropout_conditional_input = False if mode == 'train' and self.cfg_unconditional_prob > 0.0: if torch.rand(1).item() < self.cfg_unconditional_prob: @@ -997,17 +1184,17 @@ def process_batch(self, batch, mode="train"): if self.text_input_mode == 'streaming': remaining_text_embedded = torch.zeros_like(remaining_text_embedded) - if 'audio_codes' not in batch: - audio_codes, audio_codes_lens = self.audio_to_codes(batch['audio'], batch['audio_lens']) - else: - audio_codes = batch['audio_codes'] - audio_codes_lens = batch['audio_codes_lens'] + # Convert raw audio to discrete codes if codes are not already provided + if audio_codes is None: + audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + # Apply codec conversion if a converter is configured (e.g., for different codec formats) if self._codec_converter is not None: audio_codes = self._codec_converter.convert_original_to_new( audio_tokens=audio_codes, audio_lens=audio_codes_lens ).long() + # Add BOS (beginning of sequence) and EOS (end of sequence) tokens to audio codes audio_codes, audio_codes_lens = self.add_special_tokens( codes=audio_codes, codes_len=audio_codes_lens, @@ -1015,6 +1202,8 @@ def process_batch(self, batch, mode="train"): eos_id=self.audio_eos_id, ) + # Stack audio codes across codebooks for multi-codebook processing + # This reshapes codes for parallel prediction of multiple codebooks audio_codes, audio_codes_lens = self.stack_codes( audio_codes, audio_codes_lens, @@ -1023,14 +1212,23 @@ def process_batch(self, batch, mode="train"): self.frame_stacking_factor, 
self.num_audio_codebooks, ) + + # Prepare input and target sequences for autoregressive training + # Input: all tokens except the last (teacher forcing) + # Target: all tokens except the first (shifted by one position) audio_codes_lens_input = audio_codes_lens_target = audio_codes_lens - 1 audio_codes_target = audio_codes[:, :, 1:] # (B, C, T') Target for the decoder audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder + + # Embed audio tokens to get continuous representations audio_codes_input_embedded = self.embed_audio_tokens( audio_codes_input - ) # (B, T, E) # Computing this to be use in the alignment encoder + ) # (B, T, E) + + # In streaming mode, add remaining text embeddings to audio embeddings + # This provides text information at each audio timestep if remaining_text_embedded is not None: - # Make remaining text embedded the same size as audio_codes_input_embedded by padding with zeros on the right + # Pad remaining text to match audio sequence length by adding zeros on the right padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) padding_tensor = torch.zeros( remaining_text_embedded.size(0), @@ -1039,23 +1237,32 @@ def process_batch(self, batch, mode="train"): device=remaining_text_embedded.device, ) remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) + # Add text information to audio embeddings (element-wise addition) audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded + # Concatenate context embeddings with audio embeddings along the time dimension + # Result: [context_embedding | audio_codes_input_embedded] context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( embeddings=[context_embedding, audio_codes_input_embedded], lengths=[context_lens, audio_codes_lens_input], ) + # Process phoneme input if phoneme tokenizer is configured if self.phoneme_tokenizer is not None: + # Compute context length offset 
for phoneme alignment + # This accounts for different delays in speech vs phoneme streams context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay - phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens = ( + + # Prepare phoneme channel input with proper alignment + phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens_processed, phoneme_tokens_lens_processed = ( self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes + phoneme_tokens, phoneme_tokens_lens, context_lens_for_phonemes ) ) - # print("phoneme_tokens_lens", phoneme_tokens_lens) - # print("audio_codes_lens", audio_codes_lens_input) + + # Align phoneme channel input to match the combined context+audio sequence length if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: + # Pad phoneme channel with zeros if shorter than context+audio padding_tensor = torch.zeros( phoneme_channel_input.shape[0], context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], @@ -1064,88 +1271,120 @@ def process_batch(self, batch, mode="train"): ) phoneme_channel_input = torch.cat([phoneme_channel_input, padding_tensor], dim=1) else: + # Truncate phoneme channel if longer than context+audio phoneme_channel_input = phoneme_channel_input[:, : context_plus_audio_embedded.shape[1], :] + # Add phoneme information unless doing unconditional or phoneme dropout training if (not dropout_conditional_input) and (not dropout_phoneme_input): context_plus_audio_embedded = context_plus_audio_embedded + phoneme_channel_input + # Run the transformer forward pass transformer_out = self.forward( inputs_embeds=context_plus_audio_embedded, attention_mask=get_mask_from_lengths(context_plus_audio_lens), ) transformer_hidden_states = transformer_out.last_hidden_state # (B, T_total, E) + # Extract prediction embeddings by slicing out the audio portion (excluding context) 
pred_embeddings = self.slice_pred_embeddings( transformer_hidden_states, context_lens=context_lens, target_lens=audio_codes_lens_target, ) + # Project embeddings to logits for each codebook logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) - + + # Compute the main codebook prediction loss codebook_loss, _ = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) loss = codebook_loss + # Compute local transformer loss if using local transformer architecture local_transformer_loss = None local_transformer_logits = None if self.local_transformer_type != LocalTransformerType.NO_LT: assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" + # Compute logits using the local (autoregressive) transformer local_transformer_logits = self.compute_local_transformer_logits( pred_embeddings, audio_codes_target, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( local_transformer_logits, audio_codes_target, audio_codes_lens_target ) + # Scale and add local transformer loss to total loss local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss + # Compute phoneme prediction loss if using phoneme tokenizer phoneme_loss = None if self.phoneme_tokenizer is not None: + # Extract phoneme prediction embeddings with proper alignment pred_embeddings_phoneme = self.slice_pred_embeddings( transformer_hidden_states, context_lens=context_lens_for_phonemes, - target_lens=phoneme_tokens_lens - 1, + target_lens=phoneme_tokens_lens_processed - 1, ) + # Project to phoneme logits phoneme_logits = self.phoneme_final_proj( pred_embeddings_phoneme ) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) + + # Only compute phoneme loss if not doing any dropout + # (unconditional, text dropout, or phoneme dropout) if not (dropout_conditional_input or dropout_text_input or 
dropout_phoneme_input): - # Only compute phoneme loss if not doing unconditional training or text dropout phoneme_loss, _ = self.compute_phoneme_loss( - phoneme_logits, phoneme_tokens[:, :, 1:].long(), phoneme_tokens_lens - 1 + phoneme_logits, phoneme_tokens_processed[:, :, 1:].long(), phoneme_tokens_lens_processed - 1 ) print("No Dropout - phoneme loss:", phoneme_loss.item()) else: + # Skip phoneme loss computation during dropout training phoneme_loss = torch.tensor(0.0, device=logits.device) print("Dropout - phoneme loss skipped", phoneme_loss.item()) loss = loss + phoneme_loss - return { - 'loss': loss, - 'codebook_loss': codebook_loss, - 'phoneme_loss': phoneme_loss, - 'local_transformer_loss': local_transformer_loss, - 'local_transformer_logits': local_transformer_logits, # (B, T', num_codebooks * num_tokens_per_codebook) - 'logits': logits, - 'audio_codes_target': audio_codes_target, # (B, C, T') - 'audio_codes_lens_target': audio_codes_lens_target, # (B,) - 'context_audio_codes': context_tensors['context_audio_codes'], # (B, C, T') - 'context_audio_codes_lens': context_tensors['context_audio_codes_lens'], # (B,) - } + return ProcessBatchOutput( + loss=loss, + codebook_loss=codebook_loss, + phoneme_loss=phoneme_loss, + local_transformer_loss=local_transformer_loss, + local_transformer_logits=local_transformer_logits, + logits=logits, + audio_codes_target=audio_codes_target, + audio_codes_lens_target=audio_codes_lens_target, + context_audio_codes=context_tensors.context_audio_codes, + context_audio_codes_lens=context_tensors.context_audio_codes_lens, + ) def training_step(self, batch, batch_idx): - batch_output = self.process_batch(batch) - loss = batch_output['loss'] - codebook_loss = batch_output['codebook_loss'] + # Extract inputs from batch and pass explicitly to process_batch + batch_output = self.process_batch( + text=batch['text'], + text_lens=batch['text_lens'], + context_text_tokens=batch['context_text_tokens'], + 
context_text_tokens_lens=batch['context_text_tokens_lens'], + audio=batch.get('audio'), + audio_lens=batch.get('audio_lens'), + audio_codes=batch.get('audio_codes'), + audio_codes_lens=batch.get('audio_codes_lens'), + context_audio=batch.get('context_audio'), + context_audio_lens=batch.get('context_audio_lens'), + context_audio_codes=batch.get('context_audio_codes'), + context_audio_codes_lens=batch.get('context_audio_codes_lens'), + phoneme_tokens=batch.get('phoneme_tokens'), + phoneme_tokens_lens=batch.get('phoneme_tokens_lens'), + mode="train", + ) + loss = batch_output.loss + codebook_loss = batch_output.codebook_loss self.log('train/codebook_loss', codebook_loss, prog_bar=True, sync_dist=True) self.log('train/loss', loss, prog_bar=True, sync_dist=True) if self.phoneme_tokenizer is not None: - phoneme_loss = batch_output['phoneme_loss'] + phoneme_loss = batch_output.phoneme_loss self.log('train/phoneme_loss', phoneme_loss, prog_bar=True, sync_dist=True) - local_transformer_loss = batch_output['local_transformer_loss'] + local_transformer_loss = batch_output.local_transformer_loss if local_transformer_loss is not None: self.log('train/local_transformer_loss', local_transformer_loss, prog_bar=True, sync_dist=True) @@ -1188,16 +1427,34 @@ def training_step(self, batch, batch_idx): return loss def validation_step(self, batch, batch_idx): - batch_output = self.process_batch(batch, mode="val") - # self.process_batch returns a dict. We currently only log "logits" which come from the parallel prediction - # head. 
If we use local_transformer, then the local_transformer returns "local_transformer_logits" - loss = batch_output['loss'] - codebook_loss = batch_output['codebook_loss'] - logits = batch_output['logits'] - audio_codes_target = batch_output['audio_codes_target'] - audio_codes_lens_target = batch_output['audio_codes_lens_target'] - context_audio_codes = batch_output['context_audio_codes'] - context_audio_codes_lens = batch_output['context_audio_codes_lens'] + # Extract inputs from batch and pass explicitly to process_batch + batch_output = self.process_batch( + text=batch['text'], + text_lens=batch['text_lens'], + context_text_tokens=batch['context_text_tokens'], + context_text_tokens_lens=batch['context_text_tokens_lens'], + audio=batch.get('audio'), + audio_lens=batch.get('audio_lens'), + audio_codes=batch.get('audio_codes'), + audio_codes_lens=batch.get('audio_codes_lens'), + context_audio=batch.get('context_audio'), + context_audio_lens=batch.get('context_audio_lens'), + context_audio_codes=batch.get('context_audio_codes'), + context_audio_codes_lens=batch.get('context_audio_codes_lens'), + phoneme_tokens=batch.get('phoneme_tokens'), + phoneme_tokens_lens=batch.get('phoneme_tokens_lens'), + mode="val", + ) + # Access ProcessBatchOutput dataclass attributes + # logits come from the parallel prediction head + # If using local_transformer, local_transformer_logits are also available + loss = batch_output.loss + codebook_loss = batch_output.codebook_loss + logits = batch_output.logits + audio_codes_target = batch_output.audio_codes_target + audio_codes_lens_target = batch_output.audio_codes_lens_target + context_audio_codes = batch_output.context_audio_codes + context_audio_codes_lens = batch_output.context_audio_codes_lens if batch_idx == 0 and self.global_rank == 0: # Prepare dictionary for aggregated wandb logging @@ -1215,7 +1472,7 @@ def validation_step(self, batch, batch_idx): if isinstance(logger, WandbLogger) and wandb_log_dict: 
logger.experiment.log(wandb_log_dict) - local_transformer_loss = batch_output['local_transformer_loss'] + local_transformer_loss = batch_output.local_transformer_loss val_output = { 'val_loss': loss, 'val_codebook_loss': codebook_loss, @@ -1223,7 +1480,7 @@ def validation_step(self, batch, batch_idx): } if self.phoneme_tokenizer is not None: - phoneme_loss = batch_output['phoneme_loss'] + phoneme_loss = batch_output.phoneme_loss val_output['val_phoneme_loss'] = phoneme_loss self.validation_step_outputs.append(val_output) @@ -1376,11 +1633,21 @@ def infer_batch( # TODO: Make this API same as MagpieTTS model. with torch.inference_mode(): start_time = time.time() - context_tensors = self.prepare_context_tensors(batch, dropout_text_input=dropout_text_input) - context_embedding = context_tensors['context_embedding'] # (B, T_total, E) - context_lens = context_tensors['context_lens'] # (B,) - remaining_text_embedded = context_tensors['remaining_text_embedded'] - remaining_text_lens = context_tensors['remaining_text_lens'] + context_tensors = self.prepare_context_tensors( + text=batch['text'], + text_lens=batch['text_lens'], + context_text_tokens=batch['context_text_tokens'], + context_text_tokens_lens=batch['context_text_tokens_lens'], + context_audio_codes=batch.get('context_audio_codes'), + context_audio_codes_lens=batch.get('context_audio_codes_lens'), + context_audio=batch.get('context_audio'), + context_audio_lens=batch.get('context_audio_lens'), + dropout_text_input=dropout_text_input, + ) + context_embedding = context_tensors.context_embedding # (B, T_total, E) + context_lens = context_tensors.context_lens # (B,) + remaining_text_embedded = context_tensors.remaining_text_embedded + remaining_text_lens = context_tensors.remaining_text_lens if self.phoneme_tokenizer is not None: context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay From ef6a0e0e86e6c345f2e8574cda72caa914b78d0f Mon Sep 17 00:00:00 2001 From: Paarth 
Neekhara Date: Wed, 28 Jan 2026 19:33:27 -0500 Subject: [PATCH 21/94] more doc strings Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 68 ++++++++++++++++++- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index d9a6705d4a74..7d6e8bccadd2 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1000,6 +1000,28 @@ def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): return sliced def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): + """ + Stack multiple time steps into the channel dimension to reduce sequence length. + + This function reshapes audio/phoneme codes by grouping consecutive time steps together + and placing them in the channel dimension. This allows the model to process multiple + frames in parallel while reducing the sequence length. + + Args: + codes: Input codes tensor of shape (B, C, T) where B is batch size, + C is number of codebooks, and T is sequence length. + codes_lens: Length of valid codes for each batch item, shape (B,). + bos_id: Beginning-of-sequence token ID used to detect and handle BOS tokens. + eos_id: End-of-sequence token ID used for padding. + stacking_factor: Number of time steps to stack together. If 1, no stacking is performed. + num_codebooks: Number of codebooks in the input. + + Returns: + Tuple of: + - stacked_codes: Reshaped codes of shape (B, C * stacking_factor, T // stacking_factor). + If input contains BOS tokens, they are preserved at the beginning. + - new_lens: Updated sequence lengths after stacking, shape (B,). 
+ """ if stacking_factor == 1: return codes, codes_lens @@ -1032,6 +1054,26 @@ def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_co return codes, new_lens def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): + """ + Reverse the stacking operation to recover the original time dimension. + + This is the inverse of `stack_codes`. It takes codes that have been stacked + in the channel dimension and expands them back into the time dimension. + + Args: + stacked_codes: Stacked codes tensor of shape (B, C * stacking_factor, T_stacked) + where T_stacked = T_original // stacking_factor. + stacked_lens: Length of valid stacked sequences for each batch item, shape (B,). + stacking_factor: The stacking factor used in the original `stack_codes` call. + If 1, no unstacking is performed. + + Returns: + Tuple of: + - unstacked_codes: Codes with restored time dimension, shape (B, C, T_stacked * stacking_factor). + - orig_lens: Recovered sequence lengths, shape (B,). Note that these are the + maximum possible lengths; actual valid lengths may be shorter due to + padding applied during stacking. + """ if stacking_factor == 1: return stacked_codes, stacked_lens @@ -1051,7 +1093,29 @@ def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): return x, orig_lens def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, context_lens): - # import ipdb; ipdb.set_trace() + """ + Prepare phoneme tokens as an auxiliary input channel for the decoder. + + This function processes phoneme tokens by stacking them (if configured), embedding them, + and prepending a zero-padded context region. The resulting tensor can be used as an + additional input channel to provide phoneme conditioning to the audio decoder. + + Args: + phoneme_tokens: Phoneme token IDs, shape (B, L) where B is batch size and + L is the phoneme sequence length. + phoneme_tokens_lens: Length of valid phoneme tokens for each batch item, shape (B,). 
+ context_lens: Length of the context region for each batch item, shape (B,). + Used to prepend zero-padding to align with audio context. + + Returns: + Tuple of: + - phoneme_channel_input: Embedded phoneme tokens with zero-padded context, + shape (B, T_context + T_phoneme, E) where E is the embedding dimension. + - phoneme_channel_input_lens: Total length of phoneme channel input for each + batch item (context_lens + phoneme_tokens_lens after stacking), shape (B,). + - phoneme_tokens: Stacked phoneme tokens, shape (B, phoneme_stacking_factor, T_stacked). + - phoneme_tokens_lens: Length of stacked phoneme tokens, shape (B,). + """ phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) phoneme_tokens, phoneme_tokens_lens = self.stack_codes( phoneme_tokens, @@ -1119,7 +1183,7 @@ def process_batch( context_audio_lens: Length of context audio for each batch item, shape (B,) context_audio_codes: Pre-computed context audio codes (optional), shape (B, C, T_ctx) context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) - phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, P, L_phoneme) + phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, L_phoneme) phoneme_tokens_lens: Length of phoneme tokens for each batch item, shape (B,) mode: Training mode, either "train" or "val". Affects dropout behavior. 
From 0101a1aaedc204521465b3b5763158207afc00e1 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 29 Jan 2026 01:07:43 +0000 Subject: [PATCH 22/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 7d6e8bccadd2..62f3aa99e46d 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1285,9 +1285,7 @@ def process_batch( audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder # Embed audio tokens to get continuous representations - audio_codes_input_embedded = self.embed_audio_tokens( - audio_codes_input - ) # (B, T, E) + audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) # In streaming mode, add remaining text embeddings to audio embeddings # This provides text information at each audio timestep @@ -1318,11 +1316,12 @@ def process_batch( context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay # Prepare phoneme channel input with proper alignment - phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens_processed, phoneme_tokens_lens_processed = ( - self.prepare_phoneme_channel_input( - phoneme_tokens, phoneme_tokens_lens, context_lens_for_phonemes - ) - ) + ( + phoneme_channel_input, + phoneme_channel_input_lens, + phoneme_tokens_processed, + phoneme_tokens_lens_processed, + ) = self.prepare_phoneme_channel_input(phoneme_tokens, phoneme_tokens_lens, context_lens_for_phonemes) # Align phoneme channel input to match the combined context+audio sequence length if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: From ce19ed6c1c80791a772f3d911a9cccd749f1ff7c Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 28 Jan 2026 20:43:56 
-0500 Subject: [PATCH 23/94] support multiple training modes Signed-off-by: Paarth Neekhara --- .../tts/conf/magpietts/easy_magpietts.yaml | 22 +- .../conf/magpietts/easy_magpietts_lhotse.yaml | 21 +- nemo/collections/tts/models/easy_magpietts.py | 215 +++++++++++++++--- 3 files changed, 225 insertions(+), 33 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 8518fa79060b..76f39121322e 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -36,11 +36,27 @@ model: cfg_unconditional_prob: 0.1 # To get special_tokens of the tokenzer, you can do: # model.tokenizer.first_tokenizer.additional_special_tokens - text_input_mode: "streaming" + + # Multi-mode training configuration + # The model will randomly select one of the modes for each batch during training. + # Each mode has its own task embedding that is prepended to the context. + # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
+ training_modes: + - name: "full" + text_input_mode: "full" + streaming_phonemes_delay: 0 # Not used in full mode + streaming_speech_delay: 0 # Not used in full mode + - name: "streaming_4_8" + text_input_mode: "streaming" + streaming_phonemes_delay: 4 + streaming_speech_delay: 8 + - name: "streaming_2_4" + text_input_mode: "streaming" + streaming_phonemes_delay: 2 + streaming_speech_delay: 4 + frame_stacking_factor: 1 phoneme_stacking_factor: 2 - streaming_phonemes_delay: 4 - streaming_speech_delay: 8 dropout_text_input_prob: 0.3 phoneme_tokenizer: diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 6ed9b529eac6..1683a27f4238 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -31,11 +31,26 @@ model: cfg_unconditional_prob: 0.1 - text_input_mode: "streaming" + # Multi-mode training configuration + # The model will randomly select one of the modes for each batch during training. + # Each mode has its own task embedding that is prepended to the context. + # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
+ training_modes: + - name: "full" + text_input_mode: "full" + streaming_phonemes_delay: 0 # Not used in full mode + streaming_speech_delay: 0 # Not used in full mode + - name: "streaming_4_8" + text_input_mode: "streaming" + streaming_phonemes_delay: 4 + streaming_speech_delay: 8 + - name: "streaming_2_4" + text_input_mode: "streaming" + streaming_phonemes_delay: 2 + streaming_speech_delay: 4 + frame_stacking_factor: 1 phoneme_stacking_factor: 2 - streaming_phonemes_delay: 4 - streaming_speech_delay: 8 dropout_text_input_prob: 0.3 phoneme_tokenizer: diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 62f3aa99e46d..07057a618d85 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -48,6 +48,26 @@ from nemo.utils import logging +@dataclass +class TrainingMode: + """ + Configuration for a training mode in multi-mode training. + + Attributes: + name: Unique identifier for this mode (e.g., "full", "streaming_4_8") + text_input_mode: Either "full" or "streaming" + streaming_phonemes_delay: Delay for phoneme stream (only used in streaming mode) + streaming_speech_delay: Delay for speech stream (only used in streaming mode) + mode_idx: Index of this mode in the list of modes (used for task embedding lookup) + """ + + name: str + text_input_mode: str + streaming_phonemes_delay: int + streaming_speech_delay: int + mode_idx: int + + @dataclass class ContextTensors: """ @@ -96,6 +116,7 @@ class ProcessBatchOutput: audio_codes_lens_target: Length of target audio codes for each batch item, shape (B,) context_audio_codes: Audio codes extracted from context audio, shape (B, C, T') context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) + selected_training_mode: Name of the selected training mode (None if multi_mode_training is disabled) """ loss: torch.Tensor @@ -108,6 +129,7 @@ class ProcessBatchOutput: audio_codes_lens_target: 
torch.Tensor context_audio_codes: torch.Tensor context_audio_codes_lens: torch.Tensor + selected_training_mode: Optional[str] = None def worker_init_fn(worker_id): @@ -187,9 +209,36 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.text_conditioning_tokenizer_name = list(cfg.text_tokenizers.keys())[0] self.cfg_unconditional_prob = cfg.get('cfg_unconditional_prob', 0.0) - self.text_input_mode = cfg.get('text_input_mode', 'full') - self.streaming_speech_delay = cfg.get('streaming_speech_delay', 3) - self.streaming_phonemes_delay = cfg.get('streaming_phonemes_delay', 2) + + # Multi-mode training configuration + # The model trains with multiple text input modes (full, streaming with various delays) + # Each mode has its own task embedding that is prepended to the context + training_modes_cfg = cfg.get('training_modes', None) + if training_modes_cfg is None: + raise ValueError("training_modes must be specified in the config") + + self.training_modes = [] + for mode_idx, mode_cfg in enumerate(training_modes_cfg): + mode = TrainingMode( + name=mode_cfg.name, + text_input_mode=mode_cfg.text_input_mode, + streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), + streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), + mode_idx=mode_idx, + ) + self.training_modes.append(mode) + + logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") + for mode in self.training_modes: + logging.info(f" - {mode.name}: text_input_mode={mode.text_input_mode}, " + f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " + f"streaming_speech_delay={mode.streaming_speech_delay}") + + # Create a mapping from mode name to mode object for easy lookup during inference + self.mode_name_to_mode = {mode.name: mode for mode in self.training_modes} + # Default mode for inference if not specified (first mode in the list) + self.default_inference_mode = self.training_modes[0].name + self.frame_stacking_factor = 
cfg.get('frame_stacking_factor', 1) self.tokenizer = setup_tokenizers( @@ -243,6 +292,17 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) + # Task embedding for multi-mode training + # Each mode has a unique task embedding that is prepended to the context + # Only create task embedding if there are multiple modes + num_modes = len(self.training_modes) + if num_modes > 1: + self.task_embedding = nn.Embedding(num_modes, cfg.embedding_dim) + logging.info(f"Created task embedding with {num_modes} modes, embedding_dim={cfg.embedding_dim}") + else: + self.task_embedding = None + logging.info(f"Single training mode '{self.training_modes[0].name}', skipping task embedding") + if self.use_bpe_char_tokenizer: # BPE char tokenizer assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" @@ -843,6 +903,7 @@ def prepare_context_tensors( context_audio: Optional[torch.Tensor] = None, context_audio_lens: Optional[torch.Tensor] = None, dropout_text_input: bool = False, + training_mode: Optional[TrainingMode] = None, ) -> ContextTensors: """ Prepare context tensors for the EasyMagpieTTS model. @@ -865,6 +926,8 @@ def prepare_context_tensors( context_audio_lens: Length of context audio (B,). Required if context_audio is provided. dropout_text_input: If True, zero out the text embedding for classifier-free guidance. + training_mode: Optional TrainingMode object specifying the mode to use. + If None, uses the first mode from training_modes as default. Returns: ContextTensors: A dataclass containing all prepared context tensors including: @@ -884,17 +947,27 @@ def prepare_context_tensors( ValueError: If neither context_audio_codes nor context_audio is provided. ValueError: If text_input_mode is not 'full' or 'streaming'. 
""" + # Determine the mode parameters to use + # If no mode is specified, use the first (default) mode + if training_mode is None: + training_mode = self.training_modes[0] + + current_text_input_mode = training_mode.text_input_mode + current_streaming_speech_delay = training_mode.streaming_speech_delay + current_streaming_phonemes_delay = training_mode.streaming_phonemes_delay + current_mode_idx = training_mode.mode_idx + text_embedded = self.decoder.get_input_embeddings()(text) if self.use_bpe_char_tokenizer: text_mask = get_mask_from_lengths(text_lens) cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) text_embedded = text_embedded + cas_embedding - if text_embedded.shape[1] < self.streaming_speech_delay + 1: + if text_embedded.shape[1] < current_streaming_speech_delay + 1: # If text is too short, pad it with zeros padding_tensor = torch.zeros( text_embedded.shape[0], - self.streaming_speech_delay + 1 - text_embedded.shape[1], + current_streaming_speech_delay + 1 - text_embedded.shape[1], text_embedded.shape[2], device=text_embedded.device, ) @@ -936,27 +1009,51 @@ def prepare_context_tensors( context_text_lens = context_text_tokens_lens context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) + # Prepare task embedding for multi-mode training + # Only use task embedding if there are multiple modes (task_embedding is not None) + task_embedding = None + task_embedding_lens = None + if self.task_embedding is not None and current_mode_idx is not None: + batch_size = text.size(0) + mode_idx_tensor = torch.full( + (batch_size,), current_mode_idx, dtype=torch.long, device=text.device + ) + task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) + task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=text.device) # (B,) + remaining_text_embedded = None remaining_text_lens = None - if self.text_input_mode == 'full': - context_embedding, context_lens = 
self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded, text_embedded], - lengths=[context_audio_codes_lens, context_text_lens, text_lens], - ) - elif self.text_input_mode == 'streaming': - prompt_text_embedded = text_embedded[:, : self.streaming_speech_delay, :] - prompt_text_lens = torch.ones_like(text_lens) * self.streaming_speech_delay - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], - lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], - ) - remaining_text_embedded = text_embedded[:, self.streaming_speech_delay :, :] - remaining_text_lens = text_lens - self.streaming_speech_delay + if current_text_input_mode == 'full': + if task_embedding is not None: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[task_embedding, context_audio_embedded, context_text_embedded, text_embedded], + lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens, text_lens], + ) + else: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded, text_embedded], + lengths=[context_audio_codes_lens, context_text_lens, text_lens], + ) + elif current_text_input_mode == 'streaming': + prompt_text_embedded = text_embedded[:, :current_streaming_speech_delay, :] + prompt_text_lens = torch.ones_like(text_lens) * current_streaming_speech_delay + if task_embedding is not None: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[task_embedding, context_audio_embedded, context_text_embedded, prompt_text_embedded], + lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens, prompt_text_lens], + ) + else: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], + 
lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], + ) + remaining_text_embedded = text_embedded[:, current_streaming_speech_delay:, :] + remaining_text_lens = text_lens - current_streaming_speech_delay remaining_text_lens = remaining_text_lens.clamp(min=0) remaining_text_mask = get_mask_from_lengths(remaining_text_lens) remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) else: - raise ValueError(f"Invalid text input mode: {self.text_input_mode}") + raise ValueError(f"Invalid text input mode: {current_text_input_mode}") return ContextTensors( context_embedding=context_embedding, @@ -1157,6 +1254,7 @@ def process_batch( phoneme_tokens: Optional[torch.Tensor] = None, phoneme_tokens_lens: Optional[torch.Tensor] = None, mode: str = "train", + training_mode: Optional[TrainingMode] = None, ) -> ProcessBatchOutput: """ Process a batch of inputs to compute model outputs and losses. @@ -1186,6 +1284,8 @@ def process_batch( phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, L_phoneme) phoneme_tokens_lens: Length of phoneme tokens for each batch item, shape (B,) mode: Training mode, either "train" or "val". Affects dropout behavior. + training_mode: Optional TrainingMode object specifying which mode to use. + If None and multi_mode_training is enabled, a random mode is selected during training. 
Returns: ProcessBatchOutput: Dataclass containing: @@ -1200,6 +1300,23 @@ def process_batch( - context_audio_codes: Audio codes from context - context_audio_codes_lens: Length of context audio codes """ + # Select training mode for multi-mode training + # During training, randomly select a mode if not specified + # During validation, use the first mode (default) if not specified + selected_training_mode = training_mode + if selected_training_mode is None: + if mode == 'train': + # Randomly select a mode during training + selected_training_mode = random.choice(self.training_modes) + else: + # Use the first mode during validation + selected_training_mode = self.training_modes[0] + + # Get the current mode's parameters + current_text_input_mode = selected_training_mode.text_input_mode + current_streaming_speech_delay = selected_training_mode.streaming_speech_delay + current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay + # Determine whether to apply text/phoneme dropout for regularization during training # Text dropout: randomly drop text input to encourage the model to rely on other signals dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False @@ -1222,6 +1339,7 @@ def process_batch( context_audio=context_audio, context_audio_lens=context_audio_lens, dropout_text_input=dropout_text_input, + training_mode=selected_training_mode, ) # Extract context tensors for use in the forward pass @@ -1245,7 +1363,7 @@ def process_batch( # inference easier especially with KV caching and using a duplicated batch. context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) # Make unconditional remaining text embedding all zeros. Simplifies the inference implementation. 
- if self.text_input_mode == 'streaming': + if current_text_input_mode == 'streaming': remaining_text_embedded = torch.zeros_like(remaining_text_embedded) # Convert raw audio to discrete codes if codes are not already provided @@ -1313,7 +1431,8 @@ def process_batch( if self.phoneme_tokenizer is not None: # Compute context length offset for phoneme alignment # This accounts for different delays in speech vs phoneme streams - context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay + # Use the selected mode's streaming delays + context_lens_for_phonemes = context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay # Prepare phoneme channel input with proper alignment ( @@ -1417,6 +1536,7 @@ def process_batch( audio_codes_lens_target=audio_codes_lens_target, context_audio_codes=context_tensors.context_audio_codes, context_audio_codes_lens=context_tensors.context_audio_codes_lens, + selected_training_mode=selected_training_mode.name if selected_training_mode is not None else None, ) def training_step(self, batch, batch_idx): @@ -1451,6 +1571,13 @@ def training_step(self, batch, batch_idx): if local_transformer_loss is not None: self.log('train/local_transformer_loss', local_transformer_loss, prog_bar=True, sync_dist=True) + # Log training mode info for multi-mode training + if batch_output.selected_training_mode is not None: + # Log which mode was selected for this batch + # Convert mode name to an index for logging + mode_idx = self.mode_name_to_mode[batch_output.selected_training_mode].mode_idx + self.log('train/training_mode_idx', float(mode_idx), on_step=True) + # Log batch info batch_size, text_token_max_len = batch["text"].shape text_token_total_num = batch["text_lens"].sum() @@ -1692,10 +1819,43 @@ def infer_batch( phoneme_input_type='gt', phoneme_sampling_method='argmax', dropout_text_input=False, + inference_mode: Optional[str] = None, ): - # TODO: Make this API same as MagpieTTS model. 
+ """ + Run inference on a batch of inputs. + + Args: + batch: Input batch containing text, context, etc. + max_decoder_steps: Maximum number of decoding steps. + temperature: Sampling temperature. + topk: Top-k sampling parameter. + use_local_transformer_for_inference: Whether to use local transformer. + maskgit_n_steps: Number of MaskGit steps. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + phoneme_input_type: 'gt' for ground truth or 'pred' for predicted phonemes. + phoneme_sampling_method: 'argmax' or 'sample'. + dropout_text_input: Whether to dropout text input. + inference_mode: Name of the inference mode to use (e.g., "full", "streaming_4_8"). + If None, uses the default inference mode (first mode in training_modes). + """ with torch.inference_mode(): start_time = time.time() + + # Resolve inference mode + mode_name = inference_mode if inference_mode is not None else self.default_inference_mode + if mode_name in self.mode_name_to_mode: + selected_training_mode = self.mode_name_to_mode[mode_name] + logging.info(f"Using inference mode: {selected_training_mode.name}") + else: + available_modes = list(self.mode_name_to_mode.keys()) + raise ValueError(f"Unknown inference mode '{mode_name}'. 
Available modes: {available_modes}") + + # Get current mode parameters + current_text_input_mode = selected_training_mode.text_input_mode + current_streaming_speech_delay = selected_training_mode.streaming_speech_delay + current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay + context_tensors = self.prepare_context_tensors( text=batch['text'], text_lens=batch['text_lens'], @@ -1706,6 +1866,7 @@ def infer_batch( context_audio=batch.get('context_audio'), context_audio_lens=batch.get('context_audio_lens'), dropout_text_input=dropout_text_input, + training_mode=selected_training_mode, ) context_embedding = context_tensors.context_embedding # (B, T_total, E) context_lens = context_tensors.context_lens # (B,) @@ -1713,7 +1874,7 @@ def infer_batch( remaining_text_lens = context_tensors.remaining_text_lens if self.phoneme_tokenizer is not None: - context_lens_for_phonemes = context_lens - self.streaming_speech_delay + self.streaming_phonemes_delay + context_lens_for_phonemes = context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = ( self.prepare_phoneme_channel_input( batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes @@ -1736,7 +1897,7 @@ def infer_batch( audio_codes_input = audio_codes_bos audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) - if self.text_input_mode == 'streaming': + if current_text_input_mode == 'streaming': remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 remaining_text_pad_tensor = torch.zeros( remaining_text_embedded.size(0), @@ -1756,7 +1917,7 @@ def infer_batch( min_context_len = context_plus_audio_lens.min().item() if self.phoneme_tokenizer is not None: min_context_len = ( - min_context_len - self.streaming_speech_delay + self.streaming_phonemes_delay - 1 + min_context_len - current_streaming_speech_delay 
+ current_streaming_phonemes_delay - 1 ) # 1 for audio BOS that we had added. actual_batch_size = context_embedding.size(0) @@ -1947,7 +2108,7 @@ def infer_batch( new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) new_emb_unconditional = new_emb * 1 - if self.text_input_mode == 'streaming': + if current_text_input_mode == 'streaming': _bs = context_embedding.size(0) remaining_text_embedded_current = remaining_text_embedded[ torch.arange(_bs), current_text_positions.clamp(min=0), : From 704a5c843a05fad96b8701681ca8296b516032d7 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 29 Jan 2026 01:46:11 +0000 Subject: [PATCH 24/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 07057a618d85..628808c50d92 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -230,9 +230,11 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") for mode in self.training_modes: - logging.info(f" - {mode.name}: text_input_mode={mode.text_input_mode}, " - f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " - f"streaming_speech_delay={mode.streaming_speech_delay}") + logging.info( + f" - {mode.name}: text_input_mode={mode.text_input_mode}, " + f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " + f"streaming_speech_delay={mode.streaming_speech_delay}" + ) # Create a mapping from mode name to mode object for easy lookup during inference self.mode_name_to_mode = {mode.name: mode for mode in self.training_modes} @@ -1015,9 +1017,7 @@ def prepare_context_tensors( task_embedding_lens = None if self.task_embedding is not None and current_mode_idx is not 
None: batch_size = text.size(0) - mode_idx_tensor = torch.full( - (batch_size,), current_mode_idx, dtype=torch.long, device=text.device - ) + mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=text.device) task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=text.device) # (B,) @@ -1432,7 +1432,9 @@ def process_batch( # Compute context length offset for phoneme alignment # This accounts for different delays in speech vs phoneme streams # Use the selected mode's streaming delays - context_lens_for_phonemes = context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + context_lens_for_phonemes = ( + context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + ) # Prepare phoneme channel input with proper alignment ( @@ -1874,7 +1876,9 @@ def infer_batch( remaining_text_lens = context_tensors.remaining_text_lens if self.phoneme_tokenizer is not None: - context_lens_for_phonemes = context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + context_lens_for_phonemes = ( + context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + ) phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = ( self.prepare_phoneme_channel_input( batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes From 038d224311ab8d8f2972dfdeb92288ecdc4a84f0 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 29 Jan 2026 12:02:58 -0500 Subject: [PATCH 25/94] default mode for backward compatibility Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 628808c50d92..e2f080903700 100644 --- 
a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -215,18 +215,28 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): # Each mode has its own task embedding that is prepended to the context training_modes_cfg = cfg.get('training_modes', None) if training_modes_cfg is None: - raise ValueError("training_modes must be specified in the config") - - self.training_modes = [] - for mode_idx, mode_cfg in enumerate(training_modes_cfg): - mode = TrainingMode( - name=mode_cfg.name, - text_input_mode=mode_cfg.text_input_mode, - streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), - streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), - mode_idx=mode_idx, - ) - self.training_modes.append(mode) + # Create a default training mode for backward compatibility + self.training_modes = [ + TrainingMode( + name="streaming_4_8", + text_input_mode="streaming", + streaming_phonemes_delay=4, + streaming_speech_delay=8, + mode_idx=0, + ) + ] + + else: + self.training_modes = [] + for mode_idx, mode_cfg in enumerate(training_modes_cfg): + mode = TrainingMode( + name=mode_cfg.name, + text_input_mode=mode_cfg.text_input_mode, + streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), + streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), + mode_idx=mode_idx, + ) + self.training_modes.append(mode) logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") for mode in self.training_modes: From 3f582023c964876c1afb85a15ca10cdf0ba68a0a Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 29 Jan 2026 17:03:54 +0000 Subject: [PATCH 26/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index e2f080903700..06313cd34ec4 100644 --- 
a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -225,7 +225,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): mode_idx=0, ) ] - + else: self.training_modes = [] for mode_idx, mode_cfg in enumerate(training_modes_cfg): From d58a560adcf4efeb98f062b4832ad0225d2a0d7c Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Thu, 29 Jan 2026 16:12:07 -0800 Subject: [PATCH 27/94] default config changes Signed-off-by: Shehzeen Hussain --- .../tts/conf/magpietts/easy_magpietts.yaml | 28 ++++--------------- .../conf/magpietts/easy_magpietts_lhotse.yaml | 28 ++++--------------- 2 files changed, 10 insertions(+), 46 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 76f39121322e..15ccfbba9f2a 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -42,18 +42,10 @@ model: # Each mode has its own task embedding that is prepended to the context. # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
training_modes: - - name: "full" - text_input_mode: "full" - streaming_phonemes_delay: 0 # Not used in full mode - streaming_speech_delay: 0 # Not used in full mode - name: "streaming_4_8" - text_input_mode: "streaming" + text_input_mode: "streaming" # Options: "full", "streaming" streaming_phonemes_delay: 4 streaming_speech_delay: 8 - - name: "streaming_2_4" - text_input_mode: "streaming" - streaming_phonemes_delay: 2 - streaming_speech_delay: 4 frame_stacking_factor: 1 phoneme_stacking_factor: 2 @@ -73,20 +65,10 @@ model: use_chars: true use_stresses: true - text_tokenizers: # Add more languages for multi-lingual TTS - english_phoneme: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" - heteronyms: "scripts/tts_dataset_files/heteronyms-052722" - phoneme_probability: 0.8 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true + text_tokenizers: + qwen2.5b: + _target_: AutoTokenizer + pretrained_model: "Qwen/Qwen2.5-1.5B-Instruct" train_ds: dataset: diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 1683a27f4238..cd4b314ee970 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -36,18 +36,10 @@ model: # Each mode has its own task embedding that is prepended to the context. # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
training_modes: - - name: "full" - text_input_mode: "full" - streaming_phonemes_delay: 0 # Not used in full mode - streaming_speech_delay: 0 # Not used in full mode - name: "streaming_4_8" - text_input_mode: "streaming" + text_input_mode: "streaming" # Options: "full", "streaming" streaming_phonemes_delay: 4 streaming_speech_delay: 8 - - name: "streaming_2_4" - text_input_mode: "streaming" - streaming_phonemes_delay: 2 - streaming_speech_delay: 4 frame_stacking_factor: 1 phoneme_stacking_factor: 2 @@ -67,20 +59,10 @@ model: use_chars: true use_stresses: true - text_tokenizers: # Add more languages for multi-lingual TTS - english_phoneme: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" - heteronyms: "scripts/tts_dataset_files/heteronyms-052722" - phoneme_probability: 0.8 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true + text_tokenizers: + qwen2.5b: + _target_: AutoTokenizer + pretrained_model: "Qwen/Qwen2.5-1.5B-Instruct" train_ds: use_lhotse: ${model.use_lhotse} From a7fa4781e6ac8c9ae406a0b54732708bfde1da5f Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sun, 1 Feb 2026 15:09:18 -0800 Subject: [PATCH 28/94] Magpietts decoderonly 2601 bpe ipa tokenizer (#57) * multilingual BPE IPA tokenizer Signed-off-by: Shehzeen Hussain * BPE IPA tokenizer, configurable audio embedding size and data processing scripts Signed-off-by: Shehzeen Hussain * remove unnecessary scripts Signed-off-by: Shehzeen Hussain * clean up scripts Signed-off-by: Shehzeen Hussain * simplify dropout logic Signed-off-by: Shehzeen Hussain * handle corner cases Signed-off-by: Shehzeen Hussain * trainer strategy ddp Signed-off-by: Shehzeen Hussain * trainer strategy undo ddp Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen 
Hussain --- .../tts/conf/magpietts/easy_magpietts.yaml | 32 +- .../conf/magpietts/easy_magpietts_lhotse.yaml | 32 +- examples/tts/easy_magpietts.py | 1 - .../text_to_speech/tts_tokenizers.py | 45 +- .../tts/data/text_to_speech_dataset.py | 14 +- .../tts/data/text_to_speech_dataset_lhotse.py | 16 +- nemo/collections/tts/models/easy_magpietts.py | 94 +- .../ipa_scripts/add_ipa_to_lhotse_shards.py | 359 + .../ipa_scripts/analyze_ipa_tokenization.py | 728 ++ .../ipa_scripts/cuts_dirs_config.json | 45 + .../ipa_scripts/train_ipa_bpe_tokenizer.py | 521 + ...okenizer_2048_en_de_es_fr_hi_it_vi_zh.json | 9954 +++++++++++++++++ 12 files changed, 11767 insertions(+), 74 deletions(-) create mode 100644 scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py create mode 100644 scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py create mode 100644 scripts/magpietts/ipa_scripts/cuts_dirs_config.json create mode 100644 scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py create mode 100644 scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 15ccfbba9f2a..eea075870b07 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -21,17 +21,18 @@ model: embedding_dim: 1536 hidden_dim: 1536 + audio_embedding_dim: 256 # Smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. codecmodel_path: ??? 
max_epochs: ${max_epochs} steps_per_epoch: ${weighted_sampling_steps_per_epoch} # Local transformer parameters for autoregressive codebook prediction within a frame - local_transformer_type: "none" # "none", "autoregressive", "maskgit" - # Below args are only relevant if use_local_transformer is autoregressive, maskgit + local_transformer_type: "autoregressive" # "none", "autoregressive" + # Below args are only relevant if use_local_transformer is autoregressive local_transformer_loss_scale: 1.0 local_transformer_n_layers: 3 - local_transformer_n_heads: 1 - local_transformer_hidden_dim: 256 + local_transformer_n_heads: 12 + local_transformer_hidden_dim: 1536 cfg_unconditional_prob: 0.1 # To get special_tokens of the tokenzer, you can do: @@ -47,28 +48,19 @@ model: streaming_phonemes_delay: 4 streaming_speech_delay: 8 - frame_stacking_factor: 1 - phoneme_stacking_factor: 2 + frame_stacking_factor: 2 + phoneme_stacking_factor: 1 dropout_text_input_prob: 0.3 + dropout_phoneme_input_prob: 0.3 phoneme_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" - heteronyms: "scripts/tts_dataset_files/heteronyms-052722" - phoneme_probability: 1.0 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer + tokenizer_path: "scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json" text_tokenizers: - qwen2.5b: + nemotron_nano_30b: _target_: AutoTokenizer - pretrained_model: "Qwen/Qwen2.5-1.5B-Instruct" + pretrained_model: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" train_ds: dataset: diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 
cd4b314ee970..2327820e44a4 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -19,15 +19,16 @@ model: embedding_dim: 1536 hidden_dim: 1536 + audio_embedding_dim: 256 # Smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. codecmodel_path: ??? # Local transformer parameters for autoregressive codebook prediction within a frame - local_transformer_type: "none" # "none", "autoregressive", "maskgit" - # Below args are only relevant if use_local_transformer is autoregressive, maskgit + local_transformer_type: "autoregressive" # "none", "autoregressive" + # Below args are only relevant if use_local_transformer is autoregressive local_transformer_loss_scale: 1.0 local_transformer_n_layers: 3 - local_transformer_n_heads: 1 - local_transformer_hidden_dim: 256 + local_transformer_n_heads: 12 + local_transformer_hidden_dim: 1536 cfg_unconditional_prob: 0.1 @@ -41,28 +42,19 @@ model: streaming_phonemes_delay: 4 streaming_speech_delay: 8 - frame_stacking_factor: 1 - phoneme_stacking_factor: 2 + frame_stacking_factor: 2 + phoneme_stacking_factor: 1 dropout_text_input_prob: 0.3 + dropout_phoneme_input_prob: 0.3 phoneme_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" - heteronyms: "scripts/tts_dataset_files/heteronyms-052722" - phoneme_probability: 1.0 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer + tokenizer_path: "scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json" text_tokenizers: - qwen2.5b: + nemotron_nano_30b: _target_: AutoTokenizer - pretrained_model: 
"Qwen/Qwen2.5-1.5B-Instruct" + pretrained_model: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" train_ds: use_lhotse: ${model.use_lhotse} diff --git a/examples/tts/easy_magpietts.py b/examples/tts/easy_magpietts.py index 4195060b87ef..705c4ab77134 100644 --- a/examples/tts/easy_magpietts.py +++ b/examples/tts/easy_magpietts.py @@ -21,7 +21,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager - @hydra_runner(config_path="conf/magpietts", config_name="easy_magpietts") def main(cfg): logging.info('\nConfig Params:\n%s', OmegaConf.to_yaml(cfg, resolve=True)) diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 4ecd544df81e..81f875750d64 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -1172,6 +1172,39 @@ def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None): return [self._token2id[p] for p in ps] +class IPABPETokenizer: + """Simple IPA BPE tokenizer wrapper around HuggingFace tokenizers. + + Args: + tokenizer_path: Path to the tokenizer.json file (or directory containing it). 
+ """ + + def __init__(self, tokenizer_path: str): + import os + + from tokenizers import Tokenizer + + if os.path.isdir(tokenizer_path): + tokenizer_file = os.path.join(tokenizer_path, "tokenizer.json") + else: + tokenizer_file = tokenizer_path + + if not os.path.exists(tokenizer_file): + raise ValueError(f"Tokenizer file not found: {tokenizer_file}") + + self._tokenizer = Tokenizer.from_file(tokenizer_file) + self.tokens = self._tokenizer.get_vocab() + self.pad = self.tokens.get("", None) + + def encode(self, text: str) -> List[int]: + """Encode IPA text to token IDs.""" + return self._tokenizer.encode(text).ids + + def decode(self, tokens: List[int]) -> str: + """Decode token IDs back to IPA text.""" + return self._tokenizer.decode(tokens) + + # TODO @xueyang: subclassing from `nemo/collections/common/tokenizers/tokenizer_spec.py::TokenizerSpec`, and/or # adjust to reuse `nemo/collections/common/tokenizers/aggregate_tokenizer.py::AggregateTokenizer` class AggregatedTTSTokenizer: @@ -1202,7 +1235,13 @@ def __init__(self, tokenizers: List[Union[BaseTokenizer, PreTrainedTokenizerBase _tokens = list(tokenizer.get_vocab().keys()) tokens.extend(_tokens) num_tokens = len(_tokens) - tokenizer_pad_ids[tokenizer_name] = tokenizer.pad_token_id + tokenizer_offset + pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.unk_token_id + if pad_token_id is None: + raise ValueError( + f"Tokenizer '{tokenizer_name}' has no pad_token_id or unk_token_id. " + "Please set one before using with AggregatedTTSTokenizer." 
+ ) + tokenizer_pad_ids[tokenizer_name] = pad_token_id + tokenizer_offset else: raise ValueError("Tokenizers must be either BaseTokenizer or HuggingFace PreTrainedTokenizerBase.") tokenizer_offset += num_tokens @@ -1217,8 +1256,10 @@ def __init__(self, tokenizers: List[Union[BaseTokenizer, PreTrainedTokenizerBase # Define aggregated token's pad value from the first tokenizer's pad value first_tokenizer = self.tokenizers[tokenizer_names[0]] self.first_tokenizer = first_tokenizer - if hasattr(first_tokenizer, "pad_token_id"): # Defined in PreTrainedTokenizerBase subclasses + if hasattr(first_tokenizer, "pad_token_id") and first_tokenizer.pad_token_id is not None: self.pad = first_tokenizer.pad_token_id + elif hasattr(first_tokenizer, "unk_token_id") and first_tokenizer.unk_token_id is not None: + self.pad = first_tokenizer.unk_token_id elif hasattr(first_tokenizer, "pad"): # Defined in BaseTokenizer subclasses self.pad = first_tokenizer.pad else: diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index 254169f621c6..e25e703f52ee 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -24,7 +24,7 @@ import torch.utils.data from nemo.collections.asr.parts.utils.manifest_utils import read_manifest -from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer +from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer, IPABPETokenizer from nemo.collections.tts.parts.preprocessing.feature_processors import FeatureProcessor from nemo.collections.tts.parts.preprocessing.features import Featurizer from nemo.collections.tts.parts.utils.tts_dataset_utils import ( @@ -436,7 +436,17 @@ def __getitem__(self, index): } if self.phoneme_tokenizer is not None: - phoneme_tokens = self.phoneme_tokenizer.encode(data.text) + # Use IPA text for IPABPETokenizer (required), otherwise use 
regular text + if isinstance(self.phoneme_tokenizer, IPABPETokenizer): + if 'ipa' not in data.manifest_entry: + raise ValueError( + f"IPABPETokenizer requires 'ipa' field but it is not available in the manifest entry. " + f"Text: {data.text}" + ) + phoneme_text = data.manifest_entry['ipa'] + else: + phoneme_text = data.text + phoneme_tokens = self.phoneme_tokenizer.encode(phoneme_text) phoneme_tokens = ( [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] ) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 9bad7a36e44a..480119202e28 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -24,7 +24,7 @@ from omegaconf import DictConfig from transformers import AutoTokenizer, T5Tokenizer -from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer +from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPABPETokenizer from nemo.collections.tts.parts.utils.tts_dataset_utils import ( beta_binomial_prior_distribution, normalize_volume, @@ -41,7 +41,7 @@ def setup_tokenizers(all_tokenizers_config, mode='train'): for tokenizer_name in all_tokenizers_config: tokenizer_config = all_tokenizers_config[tokenizer_name] if tokenizer_config._target_ == 'AutoTokenizer': - tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.pretrained_model) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.pretrained_model, trust_remote_code=True) elif tokenizer_config._target_ == 'T5Tokenizer': tokenizer = T5Tokenizer.from_pretrained(tokenizer_config.pretrained_model) else: @@ -411,7 +411,17 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: token_len_list.append(text_len) if self.phoneme_tokenizer is not None: - phoneme_tokens = 
self.phoneme_tokenizer.encode(text_str) + # Use IPA text for IPABPETokenizer (required), otherwise use regular text_str + if isinstance(self.phoneme_tokenizer, IPABPETokenizer): + if not cut.supervisions[0].has_custom("ipa"): + raise ValueError( + f"IPABPETokenizer requires 'ipa' field but it is not available in the cut. " + f"Cut ID: {cut.id}, Text: {text_str}" + ) + phoneme_text = cut.supervisions[0].ipa + else: + phoneme_text = text_str + phoneme_tokens = self.phoneme_tokenizer.encode(phoneme_text) phoneme_tokens = ( [self.phoneme_tokenizer.bos_token_id] + phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] ) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 06313cd34ec4..68a48ab9701c 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -280,11 +280,20 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() self._codec_converter = codec_converter + # Audio embedding dimension - can be smaller than hidden_dim to reduce parameters + self.audio_embedding_dim = cfg.get('audio_embedding_dim', cfg.hidden_dim) + audio_embeddings = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): - audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) + audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, self.audio_embedding_dim)) self.audio_embeddings = nn.ModuleList(audio_embeddings) + # Projection from audio_embedding_dim to embedding_dim (Identity if same) + if self.audio_embedding_dim != cfg.embedding_dim: + self.audio_in_projection = nn.Linear(self.audio_embedding_dim, cfg.embedding_dim) + else: + self.audio_in_projection = nn.Identity() + if self.phoneme_tokenizer is not None: phoneme_embeddings = [] for _ in range(self.phoneme_stacking_factor): @@ -299,6 +308,7 @@ def __init__(self, 
cfg: DictConfig, trainer: 'Trainer' = None): hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) self.decoder = hf_transformer.model + # self.decoder.to(torch.float32) self.lm_text_head = hf_transformer.lm_head self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) @@ -335,8 +345,14 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): special_vocab=special_vocab, ) + # Projection from hidden_dim to audio_embedding_dim before final_proj (Identity if same) + if self.audio_embedding_dim != cfg.hidden_dim: + self.audio_out_projection = nn.Linear(cfg.hidden_dim, self.audio_embedding_dim) + else: + self.audio_out_projection = nn.Identity() + self.final_proj = nn.Linear( - cfg.hidden_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor + self.audio_embedding_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor ) self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') @@ -358,11 +374,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): max_length_causal_mask=self.num_audio_codebooks * self.frame_stacking_factor + 2, use_learnable_pos_emb=True, ) + # Projection from local_transformer_hidden_dim to audio_embedding_dim (Identity if same) + if self.audio_embedding_dim != local_transformer_hidden_dim: + self.local_transformer_audio_out_projection = nn.Linear(local_transformer_hidden_dim, self.audio_embedding_dim) + else: + self.local_transformer_audio_out_projection = nn.Identity() local_transformer_out_projections = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): # Have a separate projection layer for each codebook, to distinguish between them local_transformer_out_projections.append( - nn.Linear(local_transformer_hidden_dim, self.num_all_tokens_per_codebook) + nn.Linear(self.audio_embedding_dim, self.num_all_tokens_per_codebook) ) self.local_transformer_out_projections = 
nn.ModuleList(local_transformer_out_projections) @@ -496,6 +517,8 @@ def embed_audio_tokens(self, audio_tokens): else: audio_embedding = audio_embedding + embedding audio_embedding = audio_embedding / audio_tokens.size(1) + # Project from audio_embedding_dim to embedding_dim + audio_embedding = self.audio_in_projection(audio_embedding) return audio_embedding def embed_phoneme_tokens(self, phoneme_tokens): @@ -532,12 +555,14 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. (MaskGit) """ - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) + dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', hidden_dim) local_transformer_input = [dec_out_all] for codebook_num in range(audio_codes_target.size(1)): codes = audio_codes_target[:, codebook_num] # (B, T') codes = codes.reshape(-1) # (B*T',) - codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', E) + codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', audio_embedding_dim) + # Project from audio_embedding_dim to embedding_dim + codebook_embedding = self.audio_in_projection(codebook_embedding) local_transformer_input.append(codebook_embedding) local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) @@ -552,6 +577,8 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ else: # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. 
local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) + # Project from local_transformer_hidden_dim to audio_embedding_dim + local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output) all_code_logits = [] for codebook_num in range(audio_codes_target.size(1)): # Using a separate projection layer for each codebook (to distinguish between them) @@ -666,8 +693,12 @@ def local_transformer_sample_autoregressive( local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device ) local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) - codebook_logits = self.local_transformer_out_projections[codebook_num]( + # Project from local_transformer_hidden_dim to audio_embedding_dim + local_transformer_output_projected = self.local_transformer_audio_out_projection( local_transformer_output[:, -1, :] + ) + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output_projected ) # (B, num_all_tokens_per_codebook) if use_cfg: actual_batch_size = codebook_logits.size(0) // 2 @@ -697,13 +728,15 @@ def local_transformer_sample_autoregressive( all_preds.append(codebook_preds) next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze( 1 - ) # (B, 1, 128) + ) # (B, 1, audio_embedding_dim) + # Project from audio_embedding_dim to embedding_dim, then to local_transformer_hidden_dim + next_local_transformer_input = self.audio_in_projection(next_local_transformer_input) next_local_transformer_input = self.local_transformer_in_projection( next_local_transformer_input - ) # (B, 1, 128) + ) # (B, 1, local_transformer_hidden_dim) local_transformer_input = torch.cat( [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, T+1, 128) + ) # (B, T+1, local_transformer_hidden_dim) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) if 
use_cfg: @@ -897,7 +930,8 @@ def join_embeddings_temporally( dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) # Assign embedding_i to the correct positions in joined - joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask] + # Ensure dtype matches to avoid errors during mixed-precision training + joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask].to(joined.dtype) # move cursor past this segment offset += len_i @@ -1330,13 +1364,11 @@ def process_batch( # Determine whether to apply text/phoneme dropout for regularization during training # Text dropout: randomly drop text input to encourage the model to rely on other signals dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False - # Phoneme dropout: randomly drop phoneme input, but only if text is not already dropped - # This ensures we don't drop both simultaneously - dropout_phoneme_input = ( - ((random.random() < self.dropout_phoneme_input_prob) and (not dropout_text_input)) - if mode == 'train' - else False - ) + dropout_phoneme_input = (random.random() < self.dropout_phoneme_input_prob) if mode == 'train' else False + if (dropout_phoneme_input and dropout_text_input): + # Only one of the two can be True, so choose randomly + dropout_phoneme_input = random.random() < 0.5 + dropout_text_input = not dropout_phoneme_input # Prepare context tensors by combining text and audio context information context_tensors = self.prepare_context_tensors( @@ -1420,13 +1452,18 @@ def process_batch( if remaining_text_embedded is not None: # Pad remaining text to match audio sequence length by adding zeros on the right padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) - padding_tensor = torch.zeros( - remaining_text_embedded.size(0), - padding_len, - remaining_text_embedded.size(2), - device=remaining_text_embedded.device, - ) - remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], 
dim=1) + if padding_len > 0: + padding_tensor = torch.zeros( + remaining_text_embedded.size(0), + padding_len, + remaining_text_embedded.size(2), + device=remaining_text_embedded.device, + ) + remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) + else: + # NOTE(review): use logging.warning instead of print for this warning + print(f"Warning: Remaining text length {remaining_text_embedded.size(1)} is greater than audio codes input length {audio_codes_input_embedded.size(1)}") + remaining_text_embedded = remaining_text_embedded[:, : audio_codes_input_embedded.size(1), :] # Add text information to audio embeddings (element-wise addition) audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded @@ -1487,7 +1524,9 @@ def process_batch( ) # Project embeddings to logits for each codebook - logits = self.final_proj(pred_embeddings) # (B, T', num_codebooks * num_tokens_per_codebook) + # First project from hidden_dim to audio_embedding_dim, then to logits + pred_embeddings_audio = self.audio_out_projection(pred_embeddings) + logits = self.final_proj(pred_embeddings_audio) # (B, T', num_codebooks * num_tokens_per_codebook) # Compute the main codebook prediction loss codebook_loss, _ = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) @@ -1553,6 +1592,7 @@ def process_batch( def training_step(self, batch, batch_idx): # Extract inputs from batch and pass explicitly to process_batch + # NOTE(review): leftover ipdb debugger breakpoint — remove before merging batch_output = self.process_batch( text=batch['text'], text_lens=batch['text_lens'],
a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py new file mode 100644 index 000000000000..61a124d56ccc --- /dev/null +++ b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python3 +""" +Add IPA strings (from espeak/espeak-ng) to Lhotse cuts jsonl.gz shards. + +For each cuts directory like: + /Data/.../de/.../cuts +creates: + /Data/.../de/.../cuts_with_ipa +and writes corresponding cuts.000000.jsonl.gz, etc. with an added IPA field. + +IPA is added to each supervision under: + cut["supervisions"][i]["custom"]["ipa"] + +Usage: + python add_ipa_to_lhotse_shards.py --lang de + python add_ipa_to_lhotse_shards.py --lang all # run all languages + +Cuts directories per language are loaded from a JSON config file (see --config; defaults to cuts_dirs_config.json next to this script). +""" + +from __future__ import annotations + +import argparse +import concurrent.futures as cf +import gzip +import json +import os +import re +import shutil +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Tuple + +# ------------------------- +# USER CONFIG +# ------------------------- + +# Default config file path (same directory as this script) +DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" + + +def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: + """Load CUTS_DIRS_BY_LANG from a JSON config file.""" + if config_path is None: + config_path = DEFAULT_CONFIG_PATH + + if not config_path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(config_path, "r", encoding="utf-8") as f: + return json.load(f) + +# Map your dataset language keys to espeak voice codes (adjust as needed). +# For German, espeak-ng uses "de" typically.
+ESPEAK_VOICE_BY_LANG: Dict[str, str] = { + "de": "de", + "en": "en", + "es": "es", + "fr": "fr", + "hi": "hi", + "it": "it", + "vi": "vi", + "zh": "zh", + "ru": "ru", + "ja": "ja", + "ko": "ko", + "ar": "ar", + "he": "he", + "nl": "nl", + "pl": "pl", + "pt": "pt", +} + +OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa +SHARD_GLOB = "cuts.*.jsonl.gz" + +# Parallelism +MAX_WORKERS = max(1, (os.cpu_count() or 4) - 1) +# MAX_WORKERS = 8 + +# If True, skip writing if output shard exists (basic resume) +SKIP_EXISTING_OUTPUT_SHARDS = False +# ------------------------- +# IMPLEMENTATION +# ------------------------- + +IPA_FLAG = "--ipa" # espeak-ng uses --ipa, espeak supports --ipa in many builds +# Use --quiet if available; safe to try. +COMMON_FLAGS = ["-q"] + +# Some espeak builds output extra spaces/newlines; we normalize. +_WS_RE = re.compile(r"\s+") + + +def _find_espeak_binary() -> str: + """Prefer espeak-ng if present, else espeak.""" + for exe in ("espeak-ng", "espeak"): + if shutil.which(exe): + return exe + raise RuntimeError( + "Neither 'espeak-ng' nor 'espeak' was found on PATH. " + "Install espeak-ng (recommended) or espeak." + ) + + +@dataclass(frozen=True) +class EspeakRunner: + exe: str + voice: str + + def text_to_ipa(self, text: str) -> str: + """ + Convert text -> IPA using espeak/espeak-ng. + """ + # Note: We pass text via stdin to avoid shell escaping issues. 
+ cmd = [self.exe, "-v", self.voice, IPA_FLAG] + COMMON_FLAGS + try: + proc = subprocess.run( + cmd, + input=text.encode("utf-8"), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + except Exception as e: + raise RuntimeError(f"Failed to run {cmd}: {e}") from e + + if proc.returncode != 0: + raise RuntimeError( + f"espeak command failed (rc={proc.returncode})\n" + f"cmd: {' '.join(cmd)}\n" + f"stderr: {proc.stderr.decode('utf-8', errors='replace')}" + ) + + out = proc.stdout.decode("utf-8", errors="replace").strip() + # Normalize whitespace to single spaces + out = _WS_RE.sub(" ", out).strip() + return out + + +def iter_shards(cuts_dir: Path) -> List[Path]: + return sorted(cuts_dir.glob(SHARD_GLOB)) + + +def derive_output_dir(cuts_dir: Path) -> Path: + # If dir name ends with "cuts", produce "cuts_with_ipa". + # Otherwise append suffix to the directory name. + name = cuts_dir.name + if name == "cuts": + out_name = f"cuts{OUTPUT_SUFFIX}" + else: + out_name = f"{name}{OUTPUT_SUFFIX}" + return cuts_dir.parent / out_name + + +def load_json_line(line: str) -> dict: + return json.loads(line) + + +def dump_json_line(obj: dict) -> str: + # compact, consistent output + return json.dumps(obj, ensure_ascii=False) + + +class IPACache: + """ + Process-local cache. Speeds up repeated identical texts. + """ + + def __init__(self) -> None: + self._cache: Dict[Tuple[str, str], str] = {} + + def get(self, voice: str, text: str) -> Optional[str]: + return self._cache.get((voice, text)) + + def set(self, voice: str, text: str, ipa: str) -> None: + self._cache[(voice, text)] = ipa + + +def add_ipa_to_cut( + cut: dict, + espeak: EspeakRunner, + cache: IPACache, +) -> dict: + """ + Adds IPA to each supervision custom field: custom["ipa"]. + Uses supervision["custom"]["normalized_text"] if available, otherwise supervision["text"] as source text. + For Vietnamese (vi), uses original_text and updates text/normalized_text fields. 
+ """ + sups = cut.get("supervisions") or [] + is_vietnamese = espeak.voice == "vi" + for sup in sups: + custom = sup.get("custom") + if custom is None: + custom = {} + sup["custom"] = custom + + # For Vietnamese, use original_text and fix the text fields + if is_vietnamese and custom.get("original_text"): + text = custom["original_text"] + sup["text"] = text + custom["normalized_text"] = text + else: + text = custom.get("normalized_text") or sup.get("text") + + if not text: + continue + + # If already has IPA, keep it + if "ipa" in custom and isinstance(custom["ipa"], str) and custom["ipa"].strip(): + continue + + cached = cache.get(espeak.voice, text) + if cached is None: + cached = espeak.text_to_ipa(text) + cache.set(espeak.voice, text, cached) + + custom["ipa"] = cached + + return cut + + +def process_shard( + shard_path: Path, + out_shard_path: Path, + espeak: EspeakRunner, +) -> Tuple[Path, int]: + """ + Read shard jsonl.gz, add IPA, write out shard jsonl.gz + Returns: (out_shard_path, num_lines) + """ + cache = IPACache() + n = 0 + + with gzip.open(shard_path, "rt", encoding="utf-8") as fin, gzip.open( + out_shard_path, "wt", encoding="utf-8" + ) as fout: + for line in fin: + line = line.strip() + if not line: + continue + cut = load_json_line(line) + cut = add_ipa_to_cut(cut, espeak=espeak, cache=cache) + fout.write(dump_json_line(cut)) + fout.write("\n") + n += 1 + + return out_shard_path, n + + +def process_cuts_dir(lang: str, cuts_dir: Path) -> None: + voice = ESPEAK_VOICE_BY_LANG.get(lang, lang) + exe = _find_espeak_binary() + espeak = EspeakRunner(exe=exe, voice=voice) + + out_dir = derive_output_dir(cuts_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + shards = iter_shards(cuts_dir) + if not shards: + print(f"[WARN] No shards matched {SHARD_GLOB} in {cuts_dir}", file=sys.stderr) + return + + print(f"[INFO] {lang}: {cuts_dir} -> {out_dir} (shards={len(shards)})") + + jobs: List[Tuple[Path, Path]] = [] + for shard in shards: + out_shard = out_dir 
/ shard.name + if SKIP_EXISTING_OUTPUT_SHARDS and out_shard.exists(): + continue + jobs.append((shard, out_shard)) + + if not jobs: + print(f"[INFO] {lang}: nothing to do in {cuts_dir} (all outputs exist).") + return + + # Parallelize per shard + with cf.ProcessPoolExecutor(max_workers=MAX_WORKERS) as ex: + futures = [] + for shard, out_shard in jobs: + futures.append(ex.submit(_process_shard_worker, shard, out_shard, espeak.exe, espeak.voice)) + + for fut in cf.as_completed(futures): + out_shard_path, n = fut.result() + print(f"[OK] wrote {out_shard_path} (lines={n})") + + +def _process_shard_worker(shard: Path, out_shard: Path, exe: str, voice: str) -> Tuple[Path, int]: + # Re-create runner in worker process + espeak = EspeakRunner(exe=exe, voice=voice) + return process_shard(shard, out_shard, espeak) + + +def get_available_languages(cuts_dirs: Dict[str, List[str]]) -> List[str]: + """Return list of all available language codes.""" + return list(cuts_dirs.keys()) + + +def process_language(lang: str, cuts_dirs: Dict[str, List[str]]) -> bool: + """ + Process all directories for a given language. + Returns True if successful, False if there was an issue. + """ + if lang not in cuts_dirs: + print(f"[ERROR] Unknown language: {lang}", file=sys.stderr) + print(f"[ERROR] Available languages: {get_available_languages(cuts_dirs)}", file=sys.stderr) + return False + + dirs = cuts_dirs[lang] + for d in dirs: + cuts_dir = Path(d) + if not cuts_dir.exists(): + print(f"[WARN] missing dir: {cuts_dir}", file=sys.stderr) + continue + process_cuts_dir(lang, cuts_dir) + + return True + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Add IPA strings to Lhotse cuts jsonl.gz shards." + ) + parser.add_argument( + "--lang", + type=str, + required=True, + help="Language code to process (e.g., 'de', 'en', 'fr') or 'all' for all languages." + ) + parser.add_argument( + "--config", + type=str, + default=None, + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}" + ) + args = parser.parse_args() + + # Load config + config_path = Path(args.config) if args.config else None + cuts_dirs = load_cuts_dirs_config(config_path) + print(f"[INFO] Loaded config with languages: {get_available_languages(cuts_dirs)}") + + if args.lang == "all": + # Process all languages + for lang in cuts_dirs.keys(): + print(f"\n{'='*60}") + print(f"[INFO] Processing language: {lang}") + print(f"{'='*60}") + process_language(lang, cuts_dirs) + else: + success = process_language(args.lang, cuts_dirs) + if not success: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py new file mode 100644 index 000000000000..7032e1eeca0f --- /dev/null +++ b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py @@ -0,0 +1,728 @@ +#!/usr/bin/env python3 +""" +Analyze and compare tokenization (tokens per second of audio) between: +1. Qwen/Qwen2.5-1.5B-Instruct tokenizer on raw text +2. NVIDIA Nemotron Nano 30B tokenizer on raw text +3. IPABPETokenizer on phonemized IPA text at different vocab sizes + +This script: +1. Creates a balanced IPA corpus (equal samples per language) from train_langs +2. Trains IPA BPE tokenizers at vocab sizes 512, 1024, 2048, 4096 +3. For each test language, samples text pairs from cuts_with_ipa directories +4. Computes tokens per second (tokens / audio duration) for each tokenizer +5. 
Outputs comparison statistics showing tokens/second for each tokenizer + +Features: +- Reads data once and reuses across all vocab sizes (efficient) +- Balances training data across languages (uses min count across all train langs) +- Supports separate train and test language sets +- Computes tokens per second using audio duration from cuts + +Usage: + # Train and test on all languages + python analyze_ipa_tokenization.py --output_dir /path/to/output + + # Train on en,de,fr but test on all languages + python analyze_ipa_tokenization.py --output_dir /path/to/output --train_langs en,de,fr --test_langs all + + # Train on all, test on specific languages + python analyze_ipa_tokenization.py --output_dir /path/to/output --train_langs all --test_langs en,zh + + # Cap training samples per language + python analyze_ipa_tokenization.py --output_dir /path/to/output --max_samples_per_lang 50000 +""" + +import argparse +import gzip +import json +import os +import random +import sys +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Generator, List, Optional, Tuple + +import numpy as np +from tokenizers import Tokenizer +from tokenizers.models import BPE +from tokenizers.pre_tokenizers import ByteLevel +from tokenizers.trainers import BpeTrainer +from transformers import AutoTokenizer + +# ------------------------- +# CONFIGURATION +# ------------------------- + +VOCAB_SIZES = [512, 1024, 2048, 4096] + +# Default config file path (same directory as this script) +DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" + + +def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: + """Load CUTS_DIRS_BY_LANG from a JSON config file.""" + if config_path is None: + config_path = DEFAULT_CONFIG_PATH + + if not config_path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(config_path, "r", encoding="utf-8") as f: + return 
json.load(f) + +OUTPUT_SUFFIX = "_with_ipa" +SHARD_GLOB = "cuts.*.jsonl.gz" + + +@dataclass +class TextPair: + """A pair of raw text and its IPA phonemization with audio duration.""" + raw_text: str + ipa_text: str + lang: str + duration: float # audio duration in seconds + + +@dataclass +class TokenizationStats: + """Statistics for tokenization comparison (tokens per second).""" + lang: str + num_samples: int + total_duration: float # sum of all durations in seconds + qwen_tokens_per_second: float + nemotron_tokens_per_second: float + ipa_tokens_per_second: Dict[int, float] # vocab_size -> tokens/sec + + +def get_ipa_dir(cuts_dir: Path) -> Path: + """Convert a cuts directory path to its corresponding cuts_with_ipa path.""" + name = cuts_dir.name + if name == "cuts": + out_name = f"cuts{OUTPUT_SUFFIX}" + else: + out_name = f"{name}{OUTPUT_SUFFIX}" + return cuts_dir.parent / out_name + + +def iter_shards(ipa_dir: Path) -> List[Path]: + """Get all shard files in a directory.""" + return sorted(ipa_dir.glob(SHARD_GLOB)) + + +def extract_text_pairs_from_shard(shard_path: Path, lang: str) -> Generator[TextPair, None, None]: + """ + Extract text pairs (raw text + IPA) from a single shard file. 
+ + Yields: + TextPair objects with raw_text, ipa_text, and duration + """ + with gzip.open(shard_path, "rt", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + cut = json.loads(line) + # Get duration from the top-level cut object + duration = cut.get("duration", 0.0) + supervisions = cut.get("supervisions", []) + for sup in supervisions: + custom = sup.get("custom", {}) + ipa = custom.get("ipa") + # Get raw text - prefer normalized_text, fallback to text + raw_text = custom.get("normalized_text") or sup.get("text") + + if ipa and raw_text and isinstance(ipa, str) and isinstance(raw_text, str): + ipa = ipa.strip() + raw_text = raw_text.strip() + if ipa and raw_text and duration > 0: + yield TextPair(raw_text=raw_text, ipa_text=ipa, lang=lang, duration=duration) + except json.JSONDecodeError: + continue + + +def sample_text_pairs( + lang: str, + cuts_dirs: Dict[str, List[str]], + num_samples: int = 1000, + seed: int = 42, +) -> List[TextPair]: + """ + Sample text pairs from a language's cuts_with_ipa directories. 
+ + Args: + lang: Language code + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + num_samples: Number of samples to collect + seed: Random seed for reproducibility + + Returns: + List of TextPair objects + """ + random.seed(seed) + + if lang not in cuts_dirs: + raise ValueError(f"Unknown language: {lang}") + + # Collect all text pairs from all directories + all_pairs = [] + for cuts_dir_str in cuts_dirs[lang]: + cuts_dir = Path(cuts_dir_str) + ipa_dir = get_ipa_dir(cuts_dir) + + if not ipa_dir.exists(): + print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) + continue + + shards = iter_shards(ipa_dir) + for shard in shards: + for pair in extract_text_pairs_from_shard(shard, lang): + all_pairs.append(pair) + # Early exit if we have way more than needed + if len(all_pairs) >= num_samples * 10: + break + if len(all_pairs) >= num_samples * 10: + break + if len(all_pairs) >= num_samples * 10: + break + + # Sample + if len(all_pairs) <= num_samples: + print(f"[INFO] {lang}: Only found {len(all_pairs)} pairs, using all") + return all_pairs + + return random.sample(all_pairs, num_samples) + + +def iter_ipa_strings_for_lang( + lang: str, + cuts_dirs: Dict[str, List[str]], +) -> Generator[str, None, None]: + """Iterate over all IPA strings for a single language (memory-efficient).""" + if lang not in cuts_dirs: + return + + for cuts_dir_str in cuts_dirs[lang]: + cuts_dir = Path(cuts_dir_str) + ipa_dir = get_ipa_dir(cuts_dir) + + if not ipa_dir.exists(): + continue + + shards = iter_shards(ipa_dir) + for shard in shards: + with gzip.open(shard, "rt", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + cut = json.loads(line) + for sup in cut.get("supervisions", []): + ipa = sup.get("custom", {}).get("ipa") + if ipa and isinstance(ipa, str) and ipa.strip(): + yield ipa.strip() + except json.JSONDecodeError: + continue + + +def count_ipa_strings_for_lang(lang: str, cuts_dirs: Dict[str, 
List[str]], max_count: int = 100000) -> int: + """Count IPA strings for a language without loading into memory.""" + count = 0 + for _ in iter_ipa_strings_for_lang(lang, cuts_dirs): + count += 1 + if count >= max_count: + break + return count + + +def simple_sample_ipa_strings( + lang: str, + cuts_dirs: Dict[str, List[str]], + k: int, + max_collect: int = 100000, + seed: int = 42, +) -> List[str]: + """ + Simple sampling: collect up to max_collect IPA strings, then randomly sample k. + + This avoids reading through all data like reservoir sampling does. + + Args: + lang: Language code + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + k: Number of samples to select + max_collect: Maximum number of strings to collect before sampling + seed: Random seed for reproducibility + + Returns: + List of up to k sampled IPA strings + """ + rng = random.Random(seed) + collected: List[str] = [] + + for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): + collected.append(ipa) + if len(collected) >= max_collect: + break + + # If we have fewer than k, return all + if len(collected) <= k: + return collected + + # Otherwise, randomly sample k + return rng.sample(collected, k) + + +def create_balanced_corpus( + train_langs: List[str], + cuts_dirs: Dict[str, List[str]], + output_file: str, + max_samples_per_lang: Optional[int] = None, + max_count_per_lang: int = 100000, + seed: int = 42, +) -> Tuple[str, Dict[str, int]]: + """ + Create a balanced IPA corpus file with equal samples from each language. + + Uses a memory-efficient two-pass approach: + 1. First pass: Count sentences per language (up to max_count_per_lang) + 2. 
Second pass: Use simple sampling to select samples + + Args: + train_langs: List of language codes to include + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + output_file: Path to write the balanced corpus + max_samples_per_lang: Optional cap on samples per language + max_count_per_lang: Max count per language when counting IPA strings + seed: Random seed for reproducibility + + Returns: + Tuple of (corpus_file_path, dict of lang -> actual_count) + """ + # First pass: Count sentences per language + print("[INFO] Pass 1: Counting IPA strings per language...") + lang_counts: Dict[str, int] = {} + + for lang in train_langs: + if lang not in cuts_dirs: + print(f"[WARN] Language {lang} not in config, skipping") + continue + print(f"[INFO] Counting {lang}...", end=" ", flush=True) + count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) + lang_counts[lang] = count + print(f"{count} IPA strings") + + if not lang_counts: + raise ValueError("No IPA strings found for any language") + + # Find minimum count across languages + min_count = min(lang_counts.values()) + print(f"[INFO] Minimum count across languages: {min_count}") + + # Apply max_samples_per_lang cap if specified + samples_per_lang = min_count + if max_samples_per_lang is not None and max_samples_per_lang < min_count: + samples_per_lang = max_samples_per_lang + print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") + + # Second pass: Sample from each language using simple sampling + print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") + actual_counts: Dict[str, int] = {} + total_written = 0 + + with open(output_file, "w", encoding="utf-8") as f: + for lang in lang_counts.keys(): + print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) + # Per-language seed for variety. NOTE(review): hash(str) is salted per process (PYTHONHASHSEED), so this is NOT reproducible across runs — use e.g. zlib.crc32(lang.encode()) % 10000 for a stable offset + lang_seed = seed + hash(lang) % 10000 + sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang,
max_count_per_lang, lang_seed) + + for ipa in sampled: + f.write(ipa + "\n") + total_written += 1 + + actual_counts[lang] = len(sampled) + print(f"sampled {len(sampled)} strings") + + print(f"[INFO] Total IPA strings written to corpus: {total_written}") + print(f"[INFO] Balanced corpus saved to: {output_file}") + + return output_file, actual_counts + + +def train_ipa_bpe_tokenizer( + output_dir: str, + vocab_size: int, + corpus_file: str, + min_frequency: int = 2, +) -> Tokenizer: + """ + Train a byte-level BPE tokenizer on IPA strings from a pre-built corpus file. + + Args: + output_dir: Directory to save tokenizer files + vocab_size: Target vocabulary size + corpus_file: Path to the IPA corpus file (one IPA string per line) + min_frequency: Minimum frequency for a token to be included + + Returns: + Trained Tokenizer object + """ + tokenizer_dir = os.path.join(output_dir, f"ipa_bpe_v{vocab_size}") + os.makedirs(tokenizer_dir, exist_ok=True) + + tokenizer_file = os.path.join(tokenizer_dir, "tokenizer.json") + + # Check if already trained + if os.path.exists(tokenizer_file): + print(f"[INFO] Loading existing tokenizer from {tokenizer_file}") + return Tokenizer.from_file(tokenizer_file) + + # Initialize tokenizer + tokenizer = Tokenizer(BPE(unk_token="<unk>")) + tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) + + special_tokens = ["<unk>", "<s>", "</s>"]  # NOTE(review): angle-bracket tokens appear stripped by extraction; confirm exact tokens against the original source + + trainer = BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + special_tokens=special_tokens, + show_progress=True, + ) + + print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}...") + tokenizer.train(files=[corpus_file], trainer=trainer) + + # Save + tokenizer.save(tokenizer_file) + tokenizer.model.save(tokenizer_dir) + + print(f"[INFO] Saved tokenizer to {tokenizer_dir}") + + return tokenizer + + +def compute_stats( + text_pairs: List[TextPair], + qwen_tokenizer: AutoTokenizer, + nemotron_tokenizer: AutoTokenizer, + ipa_tokenizers: Dict[int, Tokenizer], + lang: str, +) ->
TokenizationStats: + """ + Compute tokenization statistics (tokens per second) for a set of text pairs. + """ + qwen_counts = [] + nemotron_counts = [] + ipa_counts = {vs: [] for vs in ipa_tokenizers.keys()} + + for pair in text_pairs: + # Qwen tokenizer on raw text + qwen_tokens = qwen_tokenizer.encode(pair.raw_text) + qwen_counts.append(len(qwen_tokens)) + + # Nemotron tokenizer on raw text + nemotron_tokens = nemotron_tokenizer.encode(pair.raw_text) + nemotron_counts.append(len(nemotron_tokens)) + + # IPA tokenizers on IPA text + for vocab_size, tokenizer in ipa_tokenizers.items(): + ipa_tokens = tokenizer.encode(pair.ipa_text) + ipa_counts[vocab_size].append(len(ipa_tokens.ids)) + + # Calculate total duration and token counts + total_duration = sum(pair.duration for pair in text_pairs) + qwen_total = sum(qwen_counts) + nemotron_total = sum(nemotron_counts) + + # Compute tokens per second + qwen_tps = qwen_total / total_duration if total_duration > 0 else 0.0 + nemotron_tps = nemotron_total / total_duration if total_duration > 0 else 0.0 + + ipa_tps = {} + for vocab_size in ipa_tokenizers.keys(): + ipa_total = sum(ipa_counts[vocab_size]) + ipa_tps[vocab_size] = ipa_total / total_duration if total_duration > 0 else 0.0 + + return TokenizationStats( + lang=lang, + num_samples=len(text_pairs), + total_duration=total_duration, + qwen_tokens_per_second=qwen_tps, + nemotron_tokens_per_second=nemotron_tps, + ipa_tokens_per_second=ipa_tps, + ) + + +def print_stats_table(all_stats: List[TokenizationStats], vocab_sizes: List[int]): + """Print a formatted table of tokens per second statistics.""" + print("\n" + "=" * 120) + print("TOKENS PER SECOND: Qwen2.5-1.5B-Instruct & Nemotron Nano 30B (raw text) vs IPA BPE (phonemized)") + print("=" * 120) + + # Header + header = f"{'Lang':<6} {'Samples':>8} {'Duration(s)':>12} {'Qwen tok/s':>12} {'Nemo tok/s':>12}" + for vs in vocab_sizes: + header += f" {'IPA-' + str(vs):>10}" + print(header) + print("-" * 120) + + # Data rows + 
for stats in all_stats: + row = f"{stats.lang:<6} {stats.num_samples:>8} {stats.total_duration:>12.2f} {stats.qwen_tokens_per_second:>12.2f} {stats.nemotron_tokens_per_second:>12.2f}" + for vs in vocab_sizes: + row += f" {stats.ipa_tokens_per_second[vs]:>10.2f}" + print(row) + + # Aggregated stats + print("-" * 120) + total_samples = sum(s.num_samples for s in all_stats) + total_duration = sum(s.total_duration for s in all_stats) + + # Compute overall tokens per second (weighted by duration) + total_qwen_tokens = sum(s.qwen_tokens_per_second * s.total_duration for s in all_stats) + total_nemotron_tokens = sum(s.nemotron_tokens_per_second * s.total_duration for s in all_stats) + overall_qwen_tps = total_qwen_tokens / total_duration if total_duration > 0 else 0 + overall_nemotron_tps = total_nemotron_tokens / total_duration if total_duration > 0 else 0 + + agg_row = f"{'TOTAL':<6} {total_samples:>8} {total_duration:>12.2f} {overall_qwen_tps:>12.2f} {overall_nemotron_tps:>12.2f}" + for vs in vocab_sizes: + total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) + overall_ipa_tps = total_ipa_tokens / total_duration if total_duration > 0 else 0 + agg_row += f" {overall_ipa_tps:>10.2f}" + print(agg_row) + print("=" * 120) + + # Summary + print("\nSUMMARY:") + print(f" - Total samples analyzed: {total_samples}") + print(f" - Total audio duration: {total_duration:.2f} seconds ({total_duration/3600:.2f} hours)") + print(f" - Qwen tokens/second: {overall_qwen_tps:.2f}") + print(f" - Nemotron tokens/second: {overall_nemotron_tps:.2f}") + for vs in vocab_sizes: + total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) + overall_ipa_tps = total_ipa_tokens / total_duration if total_duration > 0 else 0 + print(f" - IPA-{vs} tokens/second: {overall_ipa_tps:.2f}") + print() + + +def save_results_json( + all_stats: List[TokenizationStats], + output_path: str, + train_langs: Optional[List[str]] = None, + test_langs: 
Optional[List[str]] = None, +): + """Save results to JSON file with metadata.""" + output = { + "metadata": { + "train_langs": train_langs or [], + "test_langs": test_langs or [], + }, + "results": [], + } + + for stats in all_stats: + output["results"].append({ + "lang": stats.lang, + "num_samples": stats.num_samples, + "total_duration_seconds": stats.total_duration, + "qwen_tokens_per_second": stats.qwen_tokens_per_second, + "nemotron_tokens_per_second": stats.nemotron_tokens_per_second, + "ipa_tokens_per_second": { + str(vs): stats.ipa_tokens_per_second[vs] + for vs in stats.ipa_tokens_per_second.keys() + } + }) + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(output, f, indent=2) + print(f"[INFO] Saved results to {output_path}") + + +def parse_lang_arg(arg: str, available_langs: List[str]) -> List[str]: + """Parse a language argument (comma-separated or 'all').""" + if arg == "all": + return available_langs + langs = [l.strip() for l in arg.split(",") if l.strip()] + # Validate languages + for lang in langs: + if lang not in available_langs: + raise ValueError(f"Unknown language: {lang}. Available: {available_langs}") + return langs + + +def main(): + parser = argparse.ArgumentParser( + description="Compare tokenization between Qwen and IPA BPE tokenizers." 
+ ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Directory to save tokenizers and results", + ) + parser.add_argument( + "--samples_per_lang", + type=int, + default=1000, + help="Number of samples per language for testing (default: 1000)", + ) + parser.add_argument( + "--train_langs", + type=str, + default="all", + help="Comma-separated languages for training tokenizer, or 'all' (default: all)", + ) + parser.add_argument( + "--test_langs", + type=str, + default="all", + help="Comma-separated languages for testing/analysis, or 'all' (default: all)", + ) + parser.add_argument( + "--max_samples_per_lang", + type=int, + default=None, + help="Optional cap on training samples per language (default: use min count across langs)", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for sampling (default: 42)", + ) + parser.add_argument( + "--config", + type=str, + default=None, + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}" + ) + parser.add_argument( + "--max_count_per_lang", + type=int, + default=100000, + help="Max count per language when counting IPA strings (default: 100000)", + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + # Load config + config_path = Path(args.config) if args.config else None + cuts_dirs = load_cuts_dirs_config(config_path) + available_langs = list(cuts_dirs.keys()) + print(f"[INFO] Loaded config with languages: {available_langs}") + + # Parse train and test languages + try: + train_langs = parse_lang_arg(args.train_langs, available_langs) + test_langs = parse_lang_arg(args.test_langs, available_langs) + except ValueError as e: + print(f"[ERROR] {e}") + sys.exit(1) + + print(f"[INFO] Training languages: {train_langs}") + print(f"[INFO] Testing languages: {test_langs}") + print(f"[INFO] Samples per language for testing: {args.samples_per_lang}") + print(f"[INFO] Max samples per language for training: {args.max_samples_per_lang or 'auto (min across langs)'}") + print(f"[INFO] Vocab sizes: {VOCAB_SIZES}") + + # Step 1: Create balanced IPA corpus once + print("\n" + "=" * 60) + print("STEP 1: Creating balanced IPA corpus") + print("=" * 60) + + corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") + + # Check if corpus already exists + if os.path.exists(corpus_file): + print(f"[INFO] Using existing corpus file: {corpus_file}") + with open(corpus_file, "r", encoding="utf-8") as f: + line_count = sum(1 for _ in f) + print(f"[INFO] Corpus contains {line_count} IPA strings") + else: + corpus_file, lang_counts = create_balanced_corpus( + train_langs=train_langs, + cuts_dirs=cuts_dirs, + output_file=corpus_file, + max_samples_per_lang=args.max_samples_per_lang, + max_count_per_lang=args.max_count_per_lang, + seed=args.seed, + ) + + # Step 2: Train IPA BPE tokenizers at different vocab sizes (reusing corpus) + print("\n" + "=" * 60) + print("STEP 2: Training IPA BPE tokenizers") + 
print("=" * 60) + + ipa_tokenizers = {} + for vocab_size in VOCAB_SIZES: + print(f"\n[INFO] Training tokenizer with vocab_size={vocab_size}") + ipa_tokenizers[vocab_size] = train_ipa_bpe_tokenizer( + output_dir=args.output_dir, + vocab_size=vocab_size, + corpus_file=corpus_file, + min_frequency=2, + ) + + # Step 3: Load Qwen and Nemotron tokenizers + print("\n" + "=" * 60) + print("STEP 3: Loading Qwen and Nemotron tokenizers") + print("=" * 60) + + print("[INFO] Loading Qwen/Qwen2.5-1.5B-Instruct tokenizer...") + qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") + print(f"[INFO] Qwen tokenizer vocab size: {qwen_tokenizer.vocab_size}") + + print("[INFO] Loading nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 tokenizer...") + + nemotron_tokenizer = AutoTokenizer.from_pretrained("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", trust_remote_code=True) + + print(f"[INFO] Nemotron tokenizer vocab size: {nemotron_tokenizer.vocab_size}") + + # Step 4: Sample text pairs and compute statistics (on test languages) + print("\n" + "=" * 60) + print("STEP 4: Sampling and analyzing (test languages)") + print("=" * 60) + + all_stats = [] + for lang in test_langs: + print(f"\n[INFO] Processing language: {lang}") + + # Sample text pairs + text_pairs = sample_text_pairs(lang, cuts_dirs, args.samples_per_lang, args.seed) + + if not text_pairs: + print(f"[WARN] No text pairs found for {lang}, skipping") + continue + + print(f"[INFO] Sampled {len(text_pairs)} text pairs for {lang}") + + # Compute stats + stats = compute_stats(text_pairs, qwen_tokenizer, nemotron_tokenizer, ipa_tokenizers, lang) + all_stats.append(stats) + + # Print intermediate results + print(f"[INFO] {lang}: duration={stats.total_duration:.2f}s, Qwen={stats.qwen_tokens_per_second:.2f} tok/s, Nemotron={stats.nemotron_tokens_per_second:.2f} tok/s") + for vs in VOCAB_SIZES: + print(f" IPA-{vs}={stats.ipa_tokens_per_second[vs]:.2f} tok/s") + + # Step 5: Print and save results + print("\n" + "=" * 60) + 
print("STEP 5: Results") + print("=" * 60) + + print_stats_table(all_stats, VOCAB_SIZES) + + # Save to JSON with metadata + results_path = os.path.join(args.output_dir, "tokenization_comparison.json") + save_results_json(all_stats, results_path, train_langs, test_langs) + + print("[INFO] Done!") + + +if __name__ == "__main__": + main() diff --git a/scripts/magpietts/ipa_scripts/cuts_dirs_config.json b/scripts/magpietts/ipa_scripts/cuts_dirs_config.json new file mode 100644 index 000000000000..8785de53211e --- /dev/null +++ b/scripts/magpietts/ipa_scripts/cuts_dirs_config.json @@ -0,0 +1,45 @@ +{ + "de": ["/Data/tts_lhotse_datasets/speech_data/de/cmltts_de_train/cuts"], + "es": [ + "/Data/tts_lhotse_datasets/speech_data/es/cmltts_es_train/cuts", + "/Data/tts_lhotse_datasets/speech_data/es/riva_ES_RubbyCarlos/cuts", + "/Data/tts_lhotse_datasets/speech_data/es/riva_ES_RubbyCarlos/cuts_textContext" + ], + "fr": [ + "/Data/tts_lhotse_datasets/speech_data/fr/cmltts_fr_train/cuts", + "/Data/tts_lhotse_datasets/speech_data/fr/riva_FR_VirginieSamy/cuts", + "/Data/tts_lhotse_datasets/speech_data/fr/riva_FR_VirginieSamy/cuts_textContext" + ], + "hi": [ + "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi/filter_1/cuts", + "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi/filter_2/cuts", + "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi_2/filter_1/cuts", + "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi_2/filter_2/cuts" + ], + "it": ["/Data/tts_lhotse_datasets/speech_data/it/cmltts_it_train/cuts"], + "vi": [ + "/Data/tts_lhotse_datasets/speech_data/vi/Infore1_2_lsvsc/cuts", + "/Data/tts_lhotse_datasets/speech_data/vi/Long_ContextAudio/cuts", + "/Data/tts_lhotse_datasets/speech_data/vi/Long_ContextAudio/cuts_textContext", + "/Data/tts_lhotse_datasets/speech_data/vi/NorthFemale/cuts", + "/Data/tts_lhotse_datasets/speech_data/vi/nvyt_vi/nvyt_yt2025/cuts" + ], + "zh": [ + "/Data/tts_lhotse_datasets/speech_data/zh/riva_ZH_SiweiHouZhen/cuts", + 
"/Data/tts_lhotse_datasets/speech_data/zh/riva_ZH_SiweiHouZhen/cuts_textContext", + "/Data/tts_lhotse_datasets/speech_data/zh/nvyt_zh/filter_1/cuts", + "/Data/tts_lhotse_datasets/speech_data/zh/nvyt_zh/filter_2/cuts" + ], + "en": [ + "/Data/tts_lhotse_datasets/speech_data/en/nvyt2505/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/hifitts/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/hifitts2/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/jhsdGtc20Amp20Keynote/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/libritts/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/rivaLindyRodney/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/rivaLindyRodney/lhotse_shar_shuffle_shardSize256/cuts_textContext", + "/Data/tts_lhotse_datasets/speech_data/en/rivaEmmaMeganSeanTom/lhotse_shar_shuffle_shardSize256/cuts", + "/Data/tts_lhotse_datasets/speech_data/en/rivaEmmaMeganSeanTom/lhotse_shar_shuffle_shardSize256/cuts_textContext", + "/Data/tts_lhotse_datasets/speech_data/en/jhsdGtc20Amp20Keynote/lhotse_shar_shuffle_shardSize256/cuts_textContext" + ] +} diff --git a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py new file mode 100644 index 000000000000..c6098d93839a --- /dev/null +++ b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py @@ -0,0 +1,521 @@ +#!/usr/bin/env python3 +""" +Train a byte-level BPE tokenizer on IPA strings from Lhotse cuts_with_ipa shards. + +This script: +1. Reads IPA strings from cuts_with_ipa directories (output of add_ipa_to_lhotse_shards.py) +2. Optionally balances data across languages (samples equal amounts from each) +3. Trains a HuggingFace ByteLevelBPETokenizer on all extracted IPA strings +4. 
Saves vocab.json and merges.txt to the specified output directory + +Features: +- Language balancing: uses the same number of samples from each language +- Configurable max samples per language + +Usage: + python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --vocab_size 1024 + python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --train_langs en,de --vocab_size 2048 + python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --train_langs all --max_samples_per_lang 50000 + +The trained tokenizer can be loaded using the IPABPETokenizer class in: + nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +""" + +from __future__ import annotations + +import argparse +import gzip +import json +import os +import random +import sys +from pathlib import Path +from typing import Dict, Generator, List, Optional, Tuple + +from tokenizers import Tokenizer +from tokenizers.decoders import ByteLevel as ByteLevelDecoder +from tokenizers.models import BPE +from tokenizers.pre_tokenizers import ByteLevel +from tokenizers.trainers import BpeTrainer + +# ------------------------- +# USER CONFIG - Same structure as add_ipa_to_lhotse_shards.py +# ------------------------- + +# Default config file path (same directory as this script) +DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" + + +def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: + """Load CUTS_DIRS_BY_LANG from a JSON config file.""" + if config_path is None: + config_path = DEFAULT_CONFIG_PATH + + if not config_path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(config_path, "r", encoding="utf-8") as f: + return json.load(f) + +OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa +SHARD_GLOB = "cuts.*.jsonl.gz" + + +def get_ipa_dir(cuts_dir: Path) -> Path: + """Convert a cuts directory path to its corresponding cuts_with_ipa path.""" + name = cuts_dir.name + if name == "cuts": + out_name = 
f"cuts{OUTPUT_SUFFIX}" + elif name.endswith("_textContext"): + # Handle cuts_textContext -> cuts_textContext_with_ipa + out_name = f"{name}{OUTPUT_SUFFIX}" + else: + out_name = f"{name}{OUTPUT_SUFFIX}" + return cuts_dir.parent / out_name + + +def iter_shards(ipa_dir: Path) -> List[Path]: + """Get all shard files in a directory.""" + return sorted(ipa_dir.glob(SHARD_GLOB)) + + +def extract_ipa_from_shard(shard_path: Path) -> Generator[str, None, None]: + """ + Extract all IPA strings from a single shard file. + + Yields: + IPA strings from cut["supervisions"][i]["custom"]["ipa"] + """ + with gzip.open(shard_path, "rt", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + cut = json.loads(line) + supervisions = cut.get("supervisions", []) + for sup in supervisions: + custom = sup.get("custom", {}) + ipa = custom.get("ipa") + if ipa and isinstance(ipa, str) and ipa.strip(): + yield ipa.strip() + except json.JSONDecodeError: + continue + + +def extract_ipa_from_dir(ipa_dir: Path) -> Generator[str, None, None]: + """Extract all IPA strings from all shards in a directory.""" + shards = iter_shards(ipa_dir) + for shard in shards: + yield from extract_ipa_from_shard(shard) + + +def get_available_languages(cuts_dirs: Dict[str, List[str]]) -> List[str]: + """Return list of all available language codes.""" + return list(cuts_dirs.keys()) + + +def collect_ipa_strings( + cuts_dirs: Dict[str, List[str]], + lang: Optional[str] = None, +) -> Generator[str, None, None]: + """ + Collect all IPA strings from the specified language(s). + + Args: + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + lang: Language code or None for all languages. + + Yields: + IPA strings + """ + if lang is None or lang == "all": + langs_to_process = list(cuts_dirs.keys()) + else: + if lang not in cuts_dirs: + raise ValueError(f"Unknown language: {lang}. 
Available: {get_available_languages(cuts_dirs)}") + langs_to_process = [lang] + + for lang_code in langs_to_process: + print(f"[INFO] Processing language: {lang_code}") + for cuts_dir_str in cuts_dirs[lang_code]: + cuts_dir = Path(cuts_dir_str) + ipa_dir = get_ipa_dir(cuts_dir) + + if not ipa_dir.exists(): + print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) + continue + + print(f"[INFO] Reading from: {ipa_dir}") + count = 0 + for ipa in extract_ipa_from_dir(ipa_dir): + yield ipa + count += 1 + print(f"[INFO] Extracted {count} IPA strings from {ipa_dir}") + + +def iter_ipa_strings_for_lang( + lang: str, + cuts_dirs: Dict[str, List[str]], +) -> Generator[str, None, None]: + """Iterate over all IPA strings for a single language (memory-efficient).""" + if lang not in cuts_dirs: + return + + for cuts_dir_str in cuts_dirs[lang]: + cuts_dir = Path(cuts_dir_str) + ipa_dir = get_ipa_dir(cuts_dir) + + if not ipa_dir.exists(): + continue + + for ipa in extract_ipa_from_dir(ipa_dir): + yield ipa + + +def count_ipa_strings_for_lang(lang: str, cuts_dirs: Dict[str, List[str]], max_count: int = 100000) -> int: + """Count IPA strings for a language without loading into memory.""" + count = 0 + for _ in iter_ipa_strings_for_lang(lang, cuts_dirs): + count += 1 + if count >= max_count: + break + return count + + +def simple_sample_ipa_strings( + lang: str, + cuts_dirs: Dict[str, List[str]], + k: int, + max_collect: int = 100000, + seed: int = 42, +) -> List[str]: + """ + Simple sampling: collect up to max_collect IPA strings, then randomly sample k. + + This avoids reading through all data like reservoir sampling does. 
+ + Args: + lang: Language code + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + k: Number of samples to select + max_collect: Maximum number of strings to collect before sampling + seed: Random seed for reproducibility + + Returns: + List of up to k sampled IPA strings + """ + rng = random.Random(seed) + collected: List[str] = [] + + for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): + collected.append(ipa) + if len(collected) >= max_collect: + break + + # If we have fewer than k, return all + if len(collected) <= k: + return collected + + # Otherwise, randomly sample k + return rng.sample(collected, k) + + +def parse_langs_arg(arg: str, available_langs: List[str]) -> List[str]: + """Parse a language argument (comma-separated or 'all').""" + if arg == "all": + return available_langs + langs = [l.strip() for l in arg.split(",") if l.strip()] + for lang in langs: + if lang not in available_langs: + raise ValueError(f"Unknown language: {lang}. Available: {available_langs}") + return langs + + +def create_balanced_corpus( + train_langs: List[str], + cuts_dirs: Dict[str, List[str]], + output_file: str, + max_samples_per_lang: Optional[int] = None, + max_count_per_lang: int = 100000, + seed: int = 42, +) -> Tuple[str, Dict[str, int]]: + """ + Create a balanced IPA corpus file with equal samples from each language. + + Uses a memory-efficient two-pass approach: + 1. First pass: Count sentences per language (up to max_count_per_lang) + 2. 
Second pass: Use simple sampling to select samples + + Args: + train_langs: List of language codes to include + cuts_dirs: Dictionary mapping language codes to lists of cuts directories + output_file: Path to write the balanced corpus + max_samples_per_lang: Optional cap on samples per language + max_count_per_lang: Max count per language when counting IPA strings + seed: Random seed for reproducibility + + Returns: + Tuple of (corpus_file_path, dict of lang -> actual_count) + """ + # First pass: Count sentences per language + print("[INFO] Pass 1: Counting IPA strings per language...") + lang_counts: Dict[str, int] = {} + + for lang in train_langs: + if lang not in cuts_dirs: + print(f"[WARN] Language {lang} not in config, skipping") + continue + print(f"[INFO] Counting {lang}...", end=" ", flush=True) + count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) + lang_counts[lang] = count + print(f"{count} IPA strings") + + if not lang_counts: + raise ValueError("No IPA strings found for any language") + + # Find minimum count across languages + min_count = min(lang_counts.values()) + print(f"[INFO] Minimum count across languages: {min_count}") + + # Apply max_samples_per_lang cap if specified + samples_per_lang = min_count + if max_samples_per_lang is not None and max_samples_per_lang < min_count: + samples_per_lang = max_samples_per_lang + print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") + + # Second pass: Sample from each language using simple sampling + print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") + actual_counts: Dict[str, int] = {} + total_written = 0 + + with open(output_file, "w", encoding="utf-8") as f: + for lang in lang_counts.keys(): + print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) + # Use different seed per language for variety, but reproducible + lang_seed = seed + hash(lang) % 10000 + sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, 
max_count_per_lang, lang_seed) + + for ipa in sampled: + f.write(ipa + "\n") + total_written += 1 + + actual_counts[lang] = len(sampled) + print(f"sampled {len(sampled)} strings") + + print(f"[INFO] Total IPA strings written to corpus: {total_written}") + print(f"[INFO] Balanced corpus saved to: {output_file}") + + return output_file, actual_counts + + +def train_bpe_tokenizer( + corpus_file: str, + vocab_size: int = 1024, + min_frequency: int = 2, + output_dir: str = "./ipa_bpe_tokenizer", +) -> Tokenizer: + """ + Train a byte-level BPE tokenizer on IPA strings from a corpus file. + + Args: + corpus_file: Path to the IPA corpus file (one IPA string per line) + vocab_size: Target vocabulary size + min_frequency: Minimum frequency for a token to be included + output_dir: Directory to save the tokenizer files + + Returns: + Trained Tokenizer object + """ + # Create output directory + os.makedirs(output_dir, exist_ok=True) + + # Check if tokenizer already exists + tokenizer_path = os.path.join(output_dir, "tokenizer.json") + if os.path.exists(tokenizer_path): + print(f"[INFO] Loading existing tokenizer from {tokenizer_path}") + return Tokenizer.from_file(tokenizer_path) + + # Count lines in corpus + with open(corpus_file, "r", encoding="utf-8") as f: + total_count = sum(1 for _ in f) + print(f"[INFO] Corpus contains {total_count} IPA strings") + + if total_count == 0: + raise ValueError("Corpus file is empty. 
Make sure the cuts_with_ipa directories exist.") + + # Initialize a byte-level BPE tokenizer + tokenizer = Tokenizer(BPE(unk_token="")) + + # Use byte-level pre-tokenization (like GPT-2) + tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) + + # Add byte-level decoder to properly convert back to original text + tokenizer.decoder = ByteLevelDecoder() + + # Define special tokens + special_tokens = ["", "", ""] + + # Create trainer + trainer = BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + special_tokens=special_tokens, + show_progress=True, + ) + + # Train the tokenizer + print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}, min_frequency={min_frequency}") + tokenizer.train(files=[corpus_file], trainer=trainer) + + # Save the tokenizer + vocab_path = os.path.join(output_dir, "vocab.json") + merges_path = os.path.join(output_dir, "merges.txt") + + # Save using the tokenizer's model save method + tokenizer.model.save(output_dir) + + # Also save the full tokenizer for easy loading + tokenizer.save(tokenizer_path) + + print(f"[INFO] Tokenizer saved to: {output_dir}") + print(f"[INFO] - vocab.json: {vocab_path}") + print(f"[INFO] - merges.txt: {merges_path}") + print(f"[INFO] - tokenizer.json: {tokenizer_path}") + print(f"[INFO] Vocabulary size: {tokenizer.get_vocab_size()}") + + return tokenizer + + +def main(): + parser = argparse.ArgumentParser( + description="Train a byte-level BPE tokenizer on IPA strings from Lhotse cuts_with_ipa shards." 
+ ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Directory to save the trained tokenizer files (vocab.json, merges.txt, tokenizer.json)", + ) + parser.add_argument( + "--vocab_size", + type=int, + default=1024, + help="Vocabulary size for the BPE tokenizer (default: 1024)", + ) + parser.add_argument( + "--min_frequency", + type=int, + default=2, + help="Minimum frequency for a token to be included in vocabulary (default: 2)", + ) + parser.add_argument( + "--train_langs", + type=str, + default="all", + help="Comma-separated language codes for training, or 'all' (default: all)", + ) + parser.add_argument( + "--max_samples_per_lang", + type=int, + default=None, + help="Optional cap on samples per language (default: use min count across langs for balance)", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for sampling (default: 42)", + ) + parser.add_argument( + "--config", + type=str, + default=None, + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}" + ) + parser.add_argument( + "--max_count_per_lang", + type=int, + default=100000, + help="Max count per language when counting IPA strings (default: 100000)", + ) + args = parser.parse_args() + + # Load config + config_path = Path(args.config) if args.config else None + cuts_dirs = load_cuts_dirs_config(config_path) + available_langs = get_available_languages(cuts_dirs) + + # Parse train_langs + try: + train_langs = parse_langs_arg(args.train_langs, available_langs) + except ValueError as e: + print(f"[ERROR] {e}") + sys.exit(1) + + print(f"[INFO] Training IPA BPE tokenizer") + print(f"[INFO] Output directory: {args.output_dir}") + print(f"[INFO] Vocabulary size: {args.vocab_size}") + print(f"[INFO] Min frequency: {args.min_frequency}") + print(f"[INFO] Training languages: {train_langs}") + print(f"[INFO] Max samples per lang: {args.max_samples_per_lang or 'auto (min across langs)'}") + print(f"[INFO] Max count per lang: {args.max_count_per_lang}") + print(f"[INFO] Available languages: {available_langs}") + + os.makedirs(args.output_dir, exist_ok=True) + + # Step 1: Create balanced corpus + print("\n" + "=" * 60) + print("STEP 1: Creating balanced IPA corpus") + print("=" * 60) + + corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") + + if os.path.exists(corpus_file): + print(f"[INFO] Using existing corpus file: {corpus_file}") + with open(corpus_file, "r", encoding="utf-8") as f: + line_count = sum(1 for _ in f) + print(f"[INFO] Corpus contains {line_count} IPA strings") + else: + corpus_file, lang_counts = create_balanced_corpus( + train_langs=train_langs, + cuts_dirs=cuts_dirs, + output_file=corpus_file, + max_samples_per_lang=args.max_samples_per_lang, + max_count_per_lang=args.max_count_per_lang, + seed=args.seed, + ) + + # Step 2: Train tokenizer + print("\n" + "=" * 60) + print("STEP 2: Training BPE tokenizer") + print("=" * 60) + + tokenizer = train_bpe_tokenizer( + corpus_file=corpus_file, + 
vocab_size=args.vocab_size, + min_frequency=args.min_frequency, + output_dir=args.output_dir, + ) + + # Test the tokenizer + print("\n[INFO] Testing tokenizer with sample IPA strings:") + test_strings = [ + "həˈloʊ wɜːld", # hello world + "ˈaɪ pʰiː eɪ", # IPA + "ˈtɛstɪŋ wʌn tuː θriː", # testing one two three + ] + for test_str in test_strings: + encoded = tokenizer.encode(test_str) + decoded = tokenizer.decode(encoded.ids) + print(f" Input: '{test_str}'") + print(f" Tokens: {encoded.tokens}") + print(f" IDs: {encoded.ids}") + print(f" Decoded: '{decoded}'") + print() + + print("[INFO] Done!") + + +if __name__ == "__main__": + main() diff --git a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json new file mode 100644 index 000000000000..6d7e35116405 --- /dev/null +++ b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json @@ -0,0 +1,9954 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": null, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + 
"": 0, + "": 1, + "": 2, + "(": 3, + ")": 4, + "-": 5, + ".": 6, + "1": 7, + "2": 8, + "4": 9, + "5": 10, + "6": 11, + "7": 12, + "F": 13, + "a": 14, + "b": 15, + "c": 16, + "d": 17, + "e": 18, + "f": 19, + "h": 20, + "i": 21, + "j": 22, + "k": 23, + "l": 24, + "m": 25, + "n": 26, + "o": 27, + "p": 28, + "q": 29, + "r": 30, + "s": 31, + "t": 32, + "u": 33, + "v": 34, + "w": 35, + "x": 36, + "y": 37, + "z": 38, + "¡": 39, + "£": 40, + "¦": 41, + "§": 42, + "©": 43, + "ª": 44, + "¬": 45, + "°": 46, + "²": 47, + "³": 48, + "¸": 49, + "¹": 50, + "¾": 51, + "Ã": 52, + "Å": 53, + "É": 54, + "Ê": 55, + "Ë": 56, + "Ì": 57, + "Î": 58, + "Ï": 59, + "Ċ": 60, + "Ġ": 61, + "Ģ": 62, + "ģ": 63, + "Ĥ": 64, + "ĥ": 65, + "ĩ": 66, + "Ī": 67, + "Ĭ": 68, + "ĭ": 69, + "Į": 70, + "į": 71, + "İ": 72, + "ı": 73, + "IJ": 74, + "ij": 75, + "Ĵ": 76, + "ĵ": 77, + "Ķ": 78, + "ķ": 79, + "ĸ": 80, + "Ĺ": 81, + "Ļ": 82, + "Ľ": 83, + "ľ": 84, + "Ŀ": 85, + "Ł": 86, + "ËĪ": 87, + "ËIJ": 88, + "ËĪÉ": 89, + "ËĮ": 90, + "ÉĻ": 91, + "ËĪa": 92, + "ËĪi": 93, + "Ġt": 94, + "ɪ": 95, + "ɾ": 96, + "ĠÉ": 97, + "Ġk": 98, + "Éľ": 99, + "Ġs": 100, + "ËĪe": 101, + "ÉĽ": 102, + "ËĪo": 103, + "Ġl": 104, + "ËĪÉĽ": 105, + "Ġd": 106, + "ÊĬ": 107, + "ËĪaËIJ": 108, + "Ġp": 109, + "Ìĥ": 110, + "Ġm": 111, + "ËĪu": 112, + "Åĭ": 113, + "ð": 114, + "ËĪÉĶ": 115, + "ÊĮ": 116, + "ËĮa": 117, + "Ġh": 118, + "ËĪÊĮ": 119, + "Ġn": 120, + "Êģ": 121, + "ËĪÉij": 122, + "Êĥ": 123, + "eËIJ": 124, + "Ġa": 125, + "Ġb": 126, + "ÉĶ": 127, + "ËĪÉĻ": 128, + "ÉĻn": 129, + "Ġf": 130, + "ËĪɪ": 131, + "É¡": 132, + "ËĪeËIJ": 133, + "Ġj": 134, + "nt": 135, + "Ġð": 136, + "ĠËĮ": 137, + "Ġts": 138, + "ĠÉ¡": 139, + "Éķ": 140, + "ËĪoËIJ": 141, + "ʰ": 142, + "aËIJ": 143, + "ËĪy": 144, + "ĠtÉķ": 145, + "ËĪiËIJ": 146, + "ĠÊ": 147, + "Ġv": 148, + "Ġw": 149, + "st": 150, + "Éij": 151, + "nd": 152, + "ËĮi": 153, + "̪": 154, + "ËĮe": 155, + "Ġz": 156, + "ËĪaɪ": 157, + "ËĪiÉĽ": 158, + "β": 159, + "ɹ": 160, + "ĠËĮa": 161, + "θ": 162, + "ĠhÉĽ": 163, + "ÊĪ": 164, + 
"iËIJ": 165, + "ËĮo": 166, + "Ġɪ": 167, + "Éľn": 168, + "Ġx": 169, + "ĠtÉĻ": 170, + "ËĪuËIJ": 171, + "ËĮÉĻ": 172, + "ĠjËĪi": 173, + "ËĮÉĽ": 174, + "ĠÉĽ": 175, + "ĠËĪa": 176, + "ËĮaËIJ": 177, + "Ġla": 178, + "Ġðe": 179, + "ĠhÉĽËIJ": 180, + "Ġe": 181, + "ç": 182, + "ÉĻl": 183, + "oËIJ": 184, + "ËĪÉiju": 185, + "ÊĴ": 186, + "uËIJ": 187, + "ĠÉĹ": 188, + "ĠÉķ": 189, + "ËĮeËIJ": 190, + "ĠtÉķËĪi": 191, + "os": 192, + "ËĪÉĶËIJ": 193, + "as": 194, + "ËĪÊĬ": 195, + "Ġi": 196, + "ËĪai": 197, + "ɲ": 198, + "ɪn": 199, + "ts": 200, + "ÉľÅĭ": 201, + "ĠÉŁ": 202, + "ĠÊĥ": 203, + "ËĪeɪ": 204, + "ÉĽÉ¾": 205, + "ËĪÉĽËIJ": 206, + "ËĪÉĽÉ¾": 207, + "Ġr": 208, + "tÊĥ": 209, + "ËĮÉĶ": 210, + "ĠdÉĻ": 211, + "tÉĻ": 212, + "ou": 213, + "ËĪyÉĻ": 214, + "ĠËĮi": 215, + "ÉĻɾ": 216, + "ËĪÉĻÊĬ": 217, + "ËĪÊĮɾ": 218, + "ËĪÉĴ": 219, + "Ġth": 220, + "ËĪon": 221, + "Êĭ": 222, + "ËĪÉijËIJ": 223, + "ËĪÊĮh": 224, + "wËĪa": 225, + "ËĪei": 226, + "ll": 227, + "ĠÉIJ": 228, + "ÉijËIJ": 229, + "an": 230, + "ÉŁ": 231, + "ĠÊĭ": 232, + "Ġko": 233, + "kh": 234, + "ɪÅĭ": 235, + "ËĪaËIJɪ": 236, + "ĠtÊĥ": 237, + "ËĪaËIJt": 238, + "ĠËĮe": 239, + "ĠtÉķh": 240, + "ËĪuo": 241, + "ËĪonÉ¡": 242, + "Éĸ": 243, + "at": 244, + "Ġke": 245, + "ÉĴ": 246, + "ĠÉķËĪi": 247, + "ø": 248, + "ĠÉij": 249, + "ËĪeËIJk": 250, + "Åĵ": 251, + "re": 252, + "Ġɾ": 253, + "ĠkÉĶ": 254, + "ËĮÊĬ": 255, + "sk": 256, + "ĠÊĬ": 257, + "Ġand": 258, + "ɪç": 259, + "Ġme": 260, + "ËĪaɾ": 261, + "ĠËĪɪ": 262, + "na": 263, + "Ġβ": 264, + "ĠlËĪi": 265, + "jaËIJ": 266, + "li": 267, + "no": 268, + "Ġɪn": 269, + "ĠdËĮi": 270, + "Ġɲ": 271, + "tËIJ": 272, + "ÉĻm": 273, + "ĠlÉĻ": 274, + "ĠðÉĻ": 275, + "ɪk": 276, + "ËĪÉĽl": 277, + "Éľt": 278, + "Ġse": 279, + "es": 280, + "ËĪou": 281, + "ËĪaÊĬ": 282, + "ĠÉĶ": 283, + "ɪt": 284, + "ĠÅĭ": 285, + "ËĪÉĽn": 286, + "Êİ": 287, + "Ġkh": 288, + "ËĪÉĽnt": 289, + "ËĪaËIJɾ": 290, + "Ġki": 291, + "mp": 292, + "lt": 293, + "É£": 294, + "Ġpa": 295, + "ËĪÉĻËIJ": 296, + "ɪs": 297, + "ĠÉĴ": 298, + "Ġle": 299, + "ÉªÉľ": 300, + "ËĪÉĽt": 
301, + "Ġde": 302, + "Ġɹ": 303, + "ĠtËĪoËIJ": 304, + "ĠÊģ": 305, + "ÊĥÉĻn": 306, + "ĠÊĬnt": 307, + "ËĪÉĶɾ": 308, + "ËĪað": 309, + "Ġaɪ": 310, + "ĠÊIJ": 311, + "ĠmËĪa": 312, + "ra": 313, + "ĠkËĪɪ": 314, + "kt": 315, + "ËIJp": 316, + "ĠÊĪ": 317, + "ËĪaËIJÊĬ": 318, + "ĠkËĪÊĮɾ": 319, + "ĠËĪÊĮ": 320, + "ĠÉĴv": 321, + "Ġel": 322, + "ks": 323, + "Ġkw": 324, + "ÉĻt": 325, + "ndo": 326, + "ei": 327, + "ĠËĮaËIJp": 328, + "se": 329, + "ÉĻɹ": 330, + "ËĪuei": 331, + "ÉĻs": 332, + "ĠkËĮo": 333, + "ĠÊĤ": 334, + "ĠËĮÊĬ": 335, + "Ġc": 336, + "ĠÉĽn": 337, + "ËĪant": 338, + "θj": 339, + "ËĮoËIJ": 340, + "ĠËĪaËIJ": 341, + "Ġpɾ": 342, + "si": 343, + "ĠËĪe": 344, + "ĠjuËIJ": 345, + "ĠkËĮe": 346, + "ËĮɪ": 347, + "ÉĶn": 348, + "ĠsËĪÊĮ": 349, + "ĠËĪu": 350, + "ni": 351, + "Ġst": 352, + "ĠdiËIJ": 353, + "ĠkeËIJ": 354, + "ĠjËĪiou": 355, + "ËĪaiÉľ": 356, + "ĠdÊĴ": 357, + "ĠËĪÉĶ": 358, + "va": 359, + "ËIJɾ": 360, + "ËĪø": 361, + "ËĮÉĻÊĬ": 362, + "ĠpËĪu": 363, + "Ġsu": 364, + "Ġma": 365, + "ĠÉĻ": 366, + "dÊĴ": 367, + "Ġpʰ": 368, + "le": 369, + "in": 370, + "ĠtÉķhËĪi": 371, + "ĠwËĪo": 372, + "ro": 373, + "ËĮy": 374, + "ɾa": 375, + "ĠsËĪi": 376, + "ðÉĻ": 377, + "ĠseËIJ": 378, + "la": 379, + "ĠÊĴ": 380, + "mb": 381, + "ĠhËĪoËIJ": 382, + "Ġbʰ": 383, + "ĠÉĽÉ¾": 384, + "Ġðat": 385, + "sp": 386, + "ÉĶɾ": 387, + "en": 388, + "ĠsÉĻ": 389, + "ËĪÉĶÉľ": 390, + "ĠlËĮa": 391, + "ĠËĮÉĽ": 392, + "ĠËĪy": 393, + "É¡aËIJ": 394, + "ĠdÉĽÉ¾": 395, + "ËĪÉĽÊģ": 396, + "Éľkh": 397, + "ËĪiÉĻ": 398, + "ËĪan": 399, + "ĠmËĪo": 400, + "ËĪaβ": 401, + "Ġal": 402, + "ĠËĪeËIJ": 403, + "Ġθ": 404, + "ĠnËĪi": 405, + "pʰ": 406, + "lla": 407, + "Ġpl": 408, + "ËĪÅĵ": 409, + "jËĪÉiju": 410, + "Ġav": 411, + "ĠmËĪi": 412, + "ĠfËĪa": 413, + "ËĪÉľ": 414, + "me": 415, + "ËĮÉĻh": 416, + "ËĪuÉĻ": 417, + "it": 418, + "jËĪe": 419, + "Ġo": 420, + "ËĪÉľËIJ": 421, + "ĠtÉķËĪiou": 422, + "ÉĶËIJ": 423, + "ĠnÉĻ": 424, + "ËĪÉĻÉľn": 425, + "ĠmÉĻ": 426, + "ĠdeËIJ": 427, + "mo": 428, + "sa": 429, + "jËĪÉĶ": 430, + "ËĪal": 431, + "ĠtÉķËĪiÉĽ": 432, + 
"ĠÉ¡ÉĻ": 433, + "ða": 434, + "Ġɪz": 435, + "Ġsa": 436, + "ri": 437, + "ĠËĮil": 438, + "ËĮu": 439, + "ĠkaËIJ": 440, + "ĠÉĻËIJ": 441, + "ĠÉĸ": 442, + "Ġka": 443, + "ËĪÊĮhi": 444, + "ĠjeËIJ": 445, + "Ġtʰ": 446, + "ne": 447, + "kËIJ": 448, + "ĠtsËĪai": 449, + "ĠËĪeËIJk": 450, + "nk": 451, + "ti": 452, + "ËĪaÉľn": 453, + "ĠkËIJ": 454, + "É¡ÉĻn": 455, + "ËĪia": 456, + "ĠÉĶËIJɾ": 457, + "Êı": 458, + "ĠËĮÊĮ": 459, + "ĠzËĪaËIJ": 460, + "Ġlos": 461, + "ÉĽs": 462, + "ËĪÉĶn": 463, + "ÉĽnt": 464, + "ÉĽn": 465, + "ĠÉŁËĪoËIJ": 466, + "çt": 467, + "Ġdas": 468, + "ĠxËĮo": 469, + "ËĪuÉľ": 470, + "ËĪas": 471, + "ĠbËĪÊĮ": 472, + "ËĪiÉĽÉľn": 473, + "ÉIJ": 474, + "ĠtsuËIJ": 475, + "ĠpËĮÉĽ": 476, + "ĠnËĪÉĶ": 477, + "ÊĬt": 478, + "ma": 479, + "ĠnËĪo": 480, + "ĠlËĪɪ": 481, + "ËĪÉĽs": 482, + "ɪl": 483, + "ĠÉķËĪiÉĽ": 484, + "ĠËĪÊĬ": 485, + "ÉĴt": 486, + "to": 487, + "ĠËĪo": 488, + "ËĮon": 489, + "ĠkwËĪa": 490, + "Ġɪt": 491, + "ĠhoËIJ": 492, + "ËĪiËIJk": 493, + "ĠËĮaËIJpk": 494, + "ËĪaɪn": 495, + "æ": 496, + "ÉĻnt": 497, + "ta": 498, + "lo": 499, + "ĠnËĪÉij": 500, + "ĠlËĪa": 501, + "ËĪiÉľ": 502, + "ĠwËĪei": 503, + "ÉĽÊģ": 504, + "ĠtËĪa": 505, + "ĠɾËĮÉĻh": 506, + "ĠÉķËĪiÉij": 507, + "ËĮiËIJ": 508, + "ËĮÉĽl": 509, + "ĠtÉĻÉľ": 510, + "ĠkËĪuo": 511, + "ĠtËĪu": 512, + "jËĪÉĽ": 513, + "ĠËĮin": 514, + "ɾe": 515, + "ĠkoËIJ": 516, + "ĠkËĪa": 517, + "ɾi": 518, + "ĠtÉķËĪiÉij": 519, + "lÉĻ": 520, + "ĠkÉĻ": 521, + "ĠtËĪi": 522, + "ĠÅĭËĪyÉĻ": 523, + "Ġtsh": 524, + "er": 525, + "av": 526, + "ĠkÉĶn": 527, + "ËĪÉĻÉľÅĭ": 528, + "ðo": 529, + "ËĪaËIJn": 530, + "ĠbʰËĪi": 531, + "ĠkËIJjaËIJ": 532, + "ÉĻz": 533, + "ĠpÊģ": 534, + "ĠdËĪɪ": 535, + "ĠziËIJ": 536, + "É¡eËIJ": 537, + "ĠtËĪÉĻ": 538, + "ɪz": 539, + "ĠnËĮon": 540, + "taËIJ": 541, + "bl": 542, + "te": 543, + "nËĮeËIJ": 544, + "ËĪɪl": 545, + "so": 546, + "ko": 547, + "uÊģ": 548, + "ĠÉ£": 549, + "ĠpaÊģ": 550, + "ĠËĪÉĽ": 551, + "jËĪuËIJ": 552, + "ËĮÊĮ": 553, + "yn": 554, + "ËĪiËIJn": 555, + "ĠlËĪaɪ": 556, + "ËĪɪÅĭ": 557, + "ĠtÉķhËĪy": 558, + "ĠnËĪÊĮhi": 559, + 
"ĠdËĮe": 560, + "ĠjËĪÉiju": 561, + "ĠtËĪÉiju": 562, + "ĠhËĪo": 563, + "ɪd": 564, + "ĠthËĪÉij": 565, + "mËĪe": 566, + "ĠËĪÉĻ": 567, + "ja": 568, + "Ġph": 569, + "ÉĽt": 570, + "ĠkËĪÊĮ": 571, + "tÉĻn": 572, + "mËĪÉij": 573, + "wËĪe": 574, + "ĠËĮaɪn": 575, + "Ġðɪs": 576, + "É¡ÉĻ": 577, + "ĠnËĪaËIJ": 578, + "ĠbËĪaËIJ": 579, + "Ġaθ": 580, + "ĠmËĮa": 581, + "ËĪÊĮha": 582, + "ĠdËĮa": 583, + "ËĪÊı": 584, + "ĠɲËĮy": 585, + "ĠpËĪa": 586, + "ËĪaðo": 587, + "di": 588, + "bÉľ": 589, + "ɳ": 590, + "ĠwiËIJ": 591, + "ĠnËĪɪ": 592, + "ĠÉ¡ËĪÉĶÉľ": 593, + "tËIJo": 594, + "ËĮÉĻm": 595, + "ËĪaËIJr": 596, + "ĠmÉĽ": 597, + "ËĪeËIJÉ¡aËIJ": 598, + "ĠsËĮi": 599, + "ĠlËĮaËIJ": 600, + "nËĮaËIJ": 601, + "Ġsp": 602, + "tÊģ": 603, + "ĠÊİ": 604, + "ËĮÉijËIJ": 605, + "Ġkl": 606, + "kʰ": 607, + "il": 608, + "ĠÊĥt": 609, + "ĠËĮÊĬn": 610, + "al": 611, + "ĠsËĪÉĽ": 612, + "ĠmËĪaËIJ": 613, + "ĠÅĵ": 614, + "ĠÉ¡ËĪÊĮ": 615, + "ĠpËĮÉĽr": 616, + "ɾËĪa": 617, + "ËIJÊĪ": 618, + "ËĪaβa": 619, + "ĠwËĪÉĴ": 620, + "ĠxËĪuei": 621, + "ĠkhËĪo": 622, + "Ġlas": 623, + "ĠÉĹËĪo": 624, + "ĠfÉĽÉ¾": 625, + "ĠjËĪiÉĽ": 626, + "ĠtËĪe": 627, + "ĠkËĮÉĶ": 628, + "ĠdeËIJn": 629, + "Ġmo": 630, + "ĠpËĪi": 631, + "ĠtËĪÉij": 632, + "ËĪÉĽst": 633, + "wËĪÉij": 634, + "ËĪaɪt": 635, + "ÉĻÊĬ": 636, + "ĠËĪi": 637, + "ɪj": 638, + "aɪ": 639, + "ËĪaËIJÉľ": 640, + "ĠËĪɪs": 641, + "ĠpÉĶɾ": 642, + "Ã¦Éľn": 643, + "ka": 644, + "ÅĭÉ¡": 645, + "bÉĻn": 646, + "ÊĬf": 647, + "Ġpɹ": 648, + "ĠlËĮe": 649, + "ËĪiËIJd": 650, + "ËĪaËIJre": 651, + "ĠmËĪÊĮ": 652, + "ÉĻr": 653, + "ĠdÉij": 654, + "ËĪaËIJto": 655, + "ĠpËĪeËIJ": 656, + "ĠdËĪoËIJ": 657, + "ĠsËĮÊĬ": 658, + "ĠhËĪi": 659, + "ĠsËĪa": 660, + "ËĪeËIJn": 661, + "dÉĻ": 662, + "Ġpj": 663, + "ËĪÅĵÊģ": 664, + "lɪç": 665, + "ÉĴn": 666, + "ĠËĪÉĻr": 667, + "tËĪe": 668, + "Ġil": 669, + "ËĪaËIJl": 670, + "ĠsËĮÉĻÊĬ": 671, + "sÊĪ": 672, + "ĠdËĪuËIJ": 673, + "hËĪÉij": 674, + "ĠxËĪou": 675, + "ĠlËĪaiÉľ": 676, + "wËĪo": 677, + "ËĪÉĽnte": 678, + "Ġsy": 679, + "Ġzɪç": 680, + "ĠÉ¡ËĪu": 681, + "ĠÉķËĪy": 682, + "ËĪÉĶËIJl": 
683, + "ÉĶl": 684, + "ĠtËĪo": 685, + "ĠÊĭoËIJ": 686, + "ĠiËIJ": 687, + "wËĪaða": 688, + "ËĪando": 689, + "Ġaθɼnt": 690, + "ĠaθɼntwËĪaða": 691, + "ĠtËĪiÉĽ": 692, + "ËĪeiÉľ": 693, + "ĠpËĮa": 694, + "ĠnËĪaɪ": 695, + "wa": 696, + "Ġfr": 697, + "ĠÊIJËĪÉĻÉľn": 698, + "ËĪua": 699, + "mi": 700, + "ĠmËĪÉĽ": 701, + "ËĪeËIJkʰ": 702, + "cʰ": 703, + "ĠwËĪÉij": 704, + "sta": 705, + "Ġtu": 706, + "Ġsk": 707, + "ËĪÉĶl": 708, + "ËĪeËIJÊĪ": 709, + "ĠlËĪaËIJɪ": 710, + "ĠlËĪaËIJ": 711, + "ËĪÉĽËIJs": 712, + "ËĪÉĽÉ¾a": 713, + "ËĪÉĻÉľt": 714, + "Ġyn": 715, + "dÉĻn": 716, + "Ġdi": 717, + "ËĪiËIJs": 718, + "Ġðel": 719, + "ËĪÊĮr": 720, + "ĠhËĪaËIJ": 721, + "ĠbÉĻ": 722, + "ĠjËĪuËIJ": 723, + "lle": 724, + "sto": 725, + "ËĪɪt": 726, + "ËĪoËIJɾ": 727, + "bʰ": 728, + "mÉĻn": 729, + "ËĮuÉĻ": 730, + "ËĮÉĻɾ": 731, + "ËĪÊĮn": 732, + "ĠlËĪaɪk": 733, + "ĠbËĪa": 734, + "ɪð": 735, + "Ġlo": 736, + "zi": 737, + "ËĪÊĮst": 738, + "mËĪi": 739, + "ÉĶÊģ": 740, + "ĠnËĪɪçt": 741, + "Ġtɾ": 742, + "ĠdËĪeËIJkʰ": 743, + "ĠsËĮe": 744, + "ĠnËĪÉĻÊĬ": 745, + "Ġu": 746, + "Ġsi": 747, + "Ġɪç": 748, + "Ġpr": 749, + "ĠtÉķËĪy": 750, + "ĠmËĪu": 751, + "za": 752, + "ĠtÊģ": 753, + "Ġwɪð": 754, + "tËĪÉĽ": 755, + "ĠpËĪÊĮɾ": 756, + "ĠkËĪÉĶ": 757, + "ËĪoËIJr": 758, + "ĠhËĮa": 759, + "ĠkËĪonÉ¡": 760, + "ĠpuÊģ": 761, + "Ġdy": 762, + "ËĪɪn": 763, + "nte": 764, + "ĠkËĮa": 765, + "ËĪÉĻɪ": 766, + "Ġmi": 767, + "ĠÉ¡ËĮuÉĻ": 768, + "Ġʲ": 769, + "ĠfËĪÉij": 770, + "ĠvÉijËIJ": 771, + "ĠËĮaÊĬ": 772, + "ËĮuËIJ": 773, + "ĠËĪun": 774, + "ĠjËĪÊĮha": 775, + "juËIJ": 776, + "Ġmɪt": 777, + "ĠlËĪÉĽ": 778, + "ËĪeËIJÊĥ": 779, + "ĠfÉĶËIJ": 780, + "mÉĻ": 781, + "ɾt": 782, + "ĠkËĮon": 783, + "ĠlËĪÉĶ": 784, + "ĠxËĪÉiju": 785, + "pl": 786, + "ĠdËĪi": 787, + "ĠlËĪoËIJ": 788, + "sÉĻ": 789, + "ËĪaËIJva": 790, + "ĠlËĪu": 791, + "ĠÉ¡ËĮÉĻÊĬ": 792, + "Ġhav": 793, + "ĠËĮaËIJpkËĮoËIJ": 794, + "ɾËĪi": 795, + "ĠfËĪÉĻ": 796, + "ĠhËĮÉĻm": 797, + "ËĪonÉ¡Éľ": 798, + "jo": 799, + "ĠsÉĶ": 800, + "ËĪaËIJd": 801, + "wËĪiÉĻ": 802, + "ËĪand": 803, + "ËĮaɪn": 804, + "tɾ": 805, + 
"ĠËĮɪ": 806, + "ĠËĪuna": 807, + "ĠxwËĪÉij": 808, + "ĠjÉĶËIJ": 809, + "ÊģËĪi": 810, + "ĠkËĪuoÉľ": 811, + "Ġaβ": 812, + "ĠÉ¡ËĪaËIJ": 813, + "ano": 814, + "tÉĻl": 815, + "ĠrËĮe": 816, + "ËĮÊĮt": 817, + "ĠjËĪiÉij": 818, + "ĠɾËĮÉĻhaËIJ": 819, + "ĠmËĪe": 820, + "ĠËĪyÃ¦Éľn": 821, + "ĠfËĪu": 822, + "Ġbl": 823, + "nËĪi": 824, + "sÉĻn": 825, + "Ġaɪn": 826, + "ËĪiÊĬ": 827, + "Ġðeɪ": 828, + "Ġɪts": 829, + "Ġ(": 830, + "ËĪyËIJ": 831, + "ÉĻd": 832, + "ĠËĮo": 833, + "ĠÉĽs": 834, + "ĠviËIJ": 835, + "ËIJÉ¡eËIJ": 836, + "kËĪe": 837, + "ĠËĪal": 838, + "ÉĽl": 839, + "ĠÊĮ": 840, + "ËIJo": 841, + "ĠkËĪo": 842, + "ĠÊĪËĪuËIJ": 843, + "ĠsËĪɪ": 844, + "ËĪeËIJɾ": 845, + "Éľm": 846, + "ËĮÉĻn": 847, + "ËĪaËIJi": 848, + "ËĪoËIJl": 849, + "ɪËĮeËIJ": 850, + "ĠʲËĪy": 851, + "ĠkËĪÉĶËIJ": 852, + "sËĪi": 853, + "ĠlËĪe": 854, + "ËĮÉĴt": 855, + "ËĪiËIJp": 856, + "aÊģ": 857, + "ĠθËĪɪÅĭ": 858, + "ËĪÉĻËIJɪ": 859, + "ËĪÊĮl": 860, + "ĠhËĪoËIJtaËIJ": 861, + "ËĪoɪ": 862, + "nto": 863, + "zh": 864, + "ĠdeËIJm": 865, + "ĠkÉĶm": 866, + "ʰËĪiËIJk": 867, + "ĠdÊĴËĪÊĮst": 868, + "pɾ": 869, + "Ġly": 870, + "hËĪu": 871, + "ËĪÉĶø": 872, + "ËĪaËIJs": 873, + "ĠËĪan": 874, + "ĠËĪÉĴ": 875, + "Ġkan": 876, + "ĠtsËĪuo": 877, + "ËĪeËIJva": 878, + "Ġɡɾ": 879, + "Ġpo": 880, + "ĠtÊĥËĪÉĶ": 881, + "Êİa": 882, + "ĠmËĮi": 883, + "Êĥt": 884, + "tËĪi": 885, + "ĠhËĪÊĮ": 886, + "tÊĥe": 887, + "ĠfÉĶn": 888, + "ve": 889, + "ĠnËĮe": 890, + "ËĪÉĶÊģ": 891, + "iz": 892, + "ĠsËĪuo": 893, + "ËĪÉĽËIJr": 894, + "wËĪaÊģ": 895, + "ËĪaða": 896, + "Åĭk": 897, + "po": 898, + "ĠkËĪi": 899, + "ËĪad": 900, + "ĠvËĪi": 901, + "tÉķ": 902, + "ĠkËĪÉĻ": 903, + "ĠwËĪu": 904, + "ÉĴz": 905, + "ĠvÉijËIJɾ": 906, + "ÊģËĪÉĽ": 907, + "ĠkËĪaËIJ": 908, + "ke": 909, + "nÉĻ": 910, + "ËĪÊĮb": 911, + "ËĪuËIJɾ": 912, + "ËĮÉĻËIJ": 913, + "ĠÊĪʰËĪiËIJk": 914, + "ĠkËĪu": 915, + "ĠbËĮÊĮt": 916, + "Ġat": 917, + "Ġfɹ": 918, + "ËĪax": 919, + "ĠzoËIJ": 920, + "ĠtËĪaËIJ": 921, + "ĠðËĮe": 922, + "neËIJ": 923, + "ĠÉijËIJ": 924, + "ĠaÊĬf": 925, + "am": 926, + "ÊĬÅĭ": 927, + "ĠÉĶËIJ": 928, 
+ "ĠÉķËĪiÉľÅĭ": 929, + "ĠËĪÉĶËIJl": 930, + "ɪm": 931, + "jËĪo": 932, + "ËĪiËIJÉŁ": 933, + "ĠkwËĮÉĽ": 934, + "ĠmËĪas": 935, + "ÉĻh": 936, + "ĠËĪaÊĬ": 937, + "ËĪÉĶɪ": 938, + "É¡ÉĻɾ": 939, + "rÉĻn": 940, + "ËĪɪk": 941, + "sse": 942, + "ĠpËĪÉij": 943, + "ĠÉĹËĮe": 944, + "ĠÉĹËĪi": 945, + "Ġaz": 946, + "ĠÉ¡ËĪÊĮjaËIJ": 947, + "ze": 948, + "ĠÉĹËĮaËIJ": 949, + "ĠfËĪi": 950, + "ĠËĮÉĴn": 951, + "ĠxËĪo": 952, + "ĠËĮÊĬna": 953, + "ĠtʰaËIJ": 954, + "ĠsÉij": 955, + "ËĪeɪÊĥÉĻn": 956, + "ĠtÉķËĪiÉľ": 957, + "ĠÉŁaËIJ": 958, + "pËIJ": 959, + "Ġply": 960, + "θËĪi": 961, + "ËIJÉĸ": 962, + "ĠtËĪuei": 963, + "ĠlËĪÉĻ": 964, + "ĠdÉijËIJ": 965, + "ft": 966, + "ËĪam": 967, + "ĠsËĪÊĮkt": 968, + "ĠtËĪou": 969, + "ĠpËĪiÉĽ": 970, + "ĠËĪai": 971, + "ĠwËĪÉĴn": 972, + "ĠzËĮaɪn": 973, + "Ġest": 974, + "ĠmÉĶ": 975, + "ĠtÉķjËĪÉiju": 976, + "Éľp": 977, + "ËĪÊĮz": 978, + "bi": 979, + "ËĪÉĽËIJseËIJ": 980, + "ĠlËĪy": 981, + "ĠmËĮe": 982, + "ĠdËĮÉĽl": 983, + "ËĪiËIJl": 984, + "ĠkËĮomo": 985, + "ĠhËĪaÉľn": 986, + "ËĪoËIJne": 987, + "ĠkËĪÊĮɾt": 988, + "ĠsyÊģ": 989, + "ËĮÉĶɾ": 990, + "Ġɪf": 991, + "uv": 992, + "zÉĻn": 993, + "ol": 994, + "Ïĩ": 995, + "im": 996, + "ĠmËĪiÉĽ": 997, + "Ġðɪ": 998, + "ĠvËĪÉĽ": 999, + "ÊĬd": 1000, + "Ġtr": 1001, + "ËĪeËIJs": 1002, + "ðe": 1003, + "de": 1004, + "ʰÏĩ": 1005, + "ÉŁÊ°": 1006, + "ËĮÉĻËIJÉªÉľ": 1007, + "bËIJ": 1008, + "ËĪÊĬk": 1009, + "ĠnËĪÉĶÉªÉľ": 1010, + "ĠËĮiËIJ": 1011, + "ËĪÉijËIJt": 1012, + "ËĪiËIJɾ": 1013, + "Ġtɹ": 1014, + "ɾÉĶ": 1015, + "ĠwÉĴz": 1016, + "Ġvu": 1017, + "bÉĻl": 1018, + "bÉĻ": 1019, + "ɹi": 1020, + "nts": 1021, + "ĠsËĪaËIJ": 1022, + "dʰ": 1023, + "ĠtÊĬ": 1024, + "ĠÊİËĮi": 1025, + "βa": 1026, + "hËĪÉĻÉľÅĭ": 1027, + "ĠsËĪiËIJ": 1028, + "ĠpËĮaɾa": 1029, + "ËĪÉĽÉ¾ÉĶ": 1030, + "ËĪɪs": 1031, + "É£o": 1032, + "ĠËĮal": 1033, + "or": 1034, + "ĠbËĪÊĮh": 1035, + "ĠkËĪoËIJ": 1036, + "ĠtËĪÉĽ": 1037, + "ĠpËĪo": 1038, + "ĠÊĴÉĻ": 1039, + "pÊģ": 1040, + "ĠËĪaɪ": 1041, + "hËĪÉijÉľÅĭ": 1042, + "ÉĻli": 1043, + "ËĪeɪt": 1044, + "ĠjËĪiouÉľ": 1045, + "ĠdËĪÉĻ": 1046, + 
"ĠmËĪÉĶËIJ": 1047, + "lËĪi": 1048, + "ËĮyÉĻ": 1049, + "ĠlËĪoËIJÉ¡": 1050, + "ĠnËĪÊĮ": 1051, + "ĠhËĪÊĬ": 1052, + "ĠnËĪÉĻÉľÅĭ": 1053, + "ĠÊģÉĻ": 1054, + "zËĪi": 1055, + "ĠtËĪuËIJ": 1056, + "ĠkËĮome": 1057, + "ĠlËĪeËIJ": 1058, + "ËĪaËIJtaËIJ": 1059, + "Ġan": 1060, + "ĠËĪyu": 1061, + "ĠËĮÊĮÉ¡ÉĻɾ": 1062, + "ĠËĪɪn": 1063, + "ĠhËĪoÉĻ": 1064, + "vÉĻ": 1065, + "ËĪøËIJ": 1066, + "θja": 1067, + "ËĪuÉĻÉľn": 1068, + "ĠkÉĻɾ": 1069, + "ËĪat": 1070, + "jËĪø": 1071, + "ËĪÉĽtÊģ": 1072, + "ĠpËĪÉiju": 1073, + "stÉĻ": 1074, + "ĠwÉĴt": 1075, + "ËĪeËIJl": 1076, + "ÊĪi": 1077, + "ĠxËĪaiÉľ": 1078, + "ËĪyÊģ": 1079, + "ĠhËĪoËIJÉ¡aËIJ": 1080, + "ĠtsËĪi": 1081, + "ĠËĪÊĮp": 1082, + "ĠnËĮÉĴt": 1083, + "ĠlËĪɪeËIJ": 1084, + "ĠhËĪa": 1085, + "Ġfl": 1086, + "ĠnËĪeËIJ": 1087, + "ËĮaËIJɪ": 1088, + "ĠtËĪuo": 1089, + "tÊĥËIJ": 1090, + "sËĪe": 1091, + "bʰi": 1092, + "ĠbËĪÊĮhÊĬt": 1093, + "ËĪÉĽnd": 1094, + "ĠsËĪÉĶ": 1095, + "ÉĻns": 1096, + "ËĮÉĻl": 1097, + "ÉĽÉľ": 1098, + "ĠÉ¡l": 1099, + "ËĪɪɾ": 1100, + "ËĪaËIJta": 1101, + "ÉľËIJ": 1102, + "ËĪÉĽnto": 1103, + "skËĮoËIJ": 1104, + "ËĪÉĽk": 1105, + "tsi": 1106, + "ĠtËĪonÉ¡": 1107, + "ĠbiËIJ": 1108, + "ĠhËĪaËIJɪ": 1109, + "ĠbËĪi": 1110, + "jj": 1111, + "Êİi": 1112, + "Ġkʰ": 1113, + "ĠsËĪo": 1114, + "llo": 1115, + "Ġbaɪ": 1116, + "ĠÉĽnt": 1117, + "ĠËĪiËIJ": 1118, + "ĠÉ¡ËĪo": 1119, + "ɾeËIJ": 1120, + "ĠkÊĭ": 1121, + "ĠmËĪeiÉľ": 1122, + "ÊĬËĪÉĶËIJ": 1123, + "ĠtËĪaɪ": 1124, + "Ġsus": 1125, + "Ġri": 1126, + "ĠvËĮÉĽ": 1127, + "ËĪiËIJno": 1128, + "vano": 1129, + "ĠdËĮiËIJ": 1130, + "ĠÊIJËĪaÉľn": 1131, + "ÊĤ": 1132, + "ĠÉIJb": 1133, + "ËĪaËIJh": 1134, + "ɪÊĥ": 1135, + "ĠdËĮella": 1136, + "tËIJi": 1137, + "ĠËĪÊĬn": 1138, + "ĠhiËIJ": 1139, + "ĠbËĪaËIJt": 1140, + "ĠthËĪi": 1141, + "Ġam": 1142, + "ĠËĪoËIJ": 1143, + "Ġhu": 1144, + "ĠkËĪÊĮh": 1145, + "ĠzËĪÉijËIJ": 1146, + "ĠÉ¡ËĮÉĶ": 1147, + "ĠËĪÉĻÊĬ": 1148, + "yËĪi": 1149, + "ĠlËĪÊĮ": 1150, + "ĠdËĪeËIJ": 1151, + "ĠsËĪÉĶËIJ": 1152, + "skËĮeËIJ": 1153, + "ɾo": 1154, + "ÊģËĪÉij": 1155, + "tËĪa": 1156, + "ĠkËĪÊĬ": 1157, + 
"ËĪante": 1158, + "ĠdÉĶ": 1159, + "ĠsËĪeɪ": 1160, + "ĠsÉĽt": 1161, + "ɹɪ": 1162, + "ĠÉ¡ËĮÉĻÊĬɪÅĭ": 1163, + "zo": 1164, + "ĠjËĪaËIJ": 1165, + "ĠÉĴvðÉĻ": 1166, + "ĠÊĿ": 1167, + "ĠÉĽl": 1168, + "ĠsËĪoËIJ": 1169, + "ĠthËĪiÉľ": 1170, + "ĠËĪÉĽl": 1171, + "ĠlyËĮi": 1172, + "ndÊĴ": 1173, + "ĠÉķjËĪÉiju": 1174, + "θa": 1175, + "ĠɾËĮÉĻheËIJ": 1176, + "Ġmaɪ": 1177, + "jÉĻ": 1178, + "ĠËĪÊĮb": 1179, + "asjËĪÉĶ": 1180, + "dÊģ": 1181, + "ĠkhËĪa": 1182, + "ĠËĪes": 1183, + "vi": 1184, + "fi": 1185, + "ËĮÉĻb": 1186, + "Ġre": 1187, + "ĠavËĮÉĽ": 1188, + "ĠtËĮi": 1189, + "Ġkɾ": 1190, + "Ġbɪk": 1191, + "ste": 1192, + "ËĪeËIJÊĥc": 1193, + "pt": 1194, + "zÉĻ": 1195, + "ĠwËĪaËIJ": 1196, + "kl": 1197, + "ĠsËĪÊĮm": 1198, + "ɪÊĪ": 1199, + "dz": 1200, + "vo": 1201, + "ËĮaÊĬt": 1202, + "nde": 1203, + "ĠdÉĽs": 1204, + "ĠÉŁËĪaËIJ": 1205, + "ĠrËĮi": 1206, + "sËĮeËIJ": 1207, + "É¡i": 1208, + "Ġals": 1209, + "ËĪiðo": 1210, + "ĠnËĪiÉľn": 1211, + "ÊĬl": 1212, + "tsËIJ": 1213, + "ËĪanto": 1214, + "ĠÉĹËĪÉĻÊĬ": 1215, + "kËIJi": 1216, + "ĠsËĪÊĮb": 1217, + "ĠnËĪa": 1218, + "ĠlËĮo": 1219, + "ĠphËĪi": 1220, + "mËĮe": 1221, + "Ġfa": 1222, + "kÉĻ": 1223, + "ĠzËĪu": 1224, + "ns": 1225, + "ĠÊģe": 1226, + "ĠbËĪo": 1227, + "ËĪaËIJti": 1228, + "Ġman": 1229, + "ĠlËĪiÉij": 1230, + "ĠÉĹËĮyÉĻ": 1231, + "ĠfËĪÉĶËIJ": 1232, + "ĠkÊĭËĪeËIJÊĥc": 1233, + "ĠxËĪÉij": 1234, + "ĠtÉķËĪu": 1235, + "jÉĻɾ": 1236, + "Ġɪst": 1237, + "wËĪi": 1238, + "ĠËĮaɪnÉĻ": 1239, + "ɪɡ": 1240, + "ĠsÊĪ": 1241, + "ËĪiÉĻl": 1242, + "ĠnËĪiÉĽÉľn": 1243, + "ĠËĮÉĽËIJ": 1244, + "ËĪaɪnd": 1245, + "ĠzËĪi": 1246, + "vÉĻn": 1247, + "mz": 1248, + "ðos": 1249, + "dÊĴËIJ": 1250, + "jËĪa": 1251, + "ɾËĪÉĶ": 1252, + "lËĪe": 1253, + "ʲ": 1254, + "ĠvËĪÉĶ": 1255, + "ĠlËĪiÉĽ": 1256, + "θe": 1257, + "mËĪente": 1258, + "ĠɪnðÉĻ": 1259, + "Ġaɪm": 1260, + "nÉĻn": 1261, + "ĠhÉĻm": 1262, + "ɾaËIJ": 1263, + "ĠsËĪuoÉľ": 1264, + "ĠɲËĪi": 1265, + "ĠɹËĪiÉĻl": 1266, + "lËĪa": 1267, + "ĠbËĪÉĶ": 1268, + "ĠkËĪai": 1269, + "ÊģËĪa": 1270, + "ĠwËĪÉľËIJ": 1271, + "ĠaËIJ": 1272, + "Ġpas": 
1273, + "ËĪÊĮs": 1274, + "wËĪÉĽÉ¾": 1275, + "ĠÉĹËĪe": 1276, + "ĠhËĮatÉĻ": 1277, + "aɪn": 1278, + "ĠËĪÉĶpʰ": 1279, + "ÊģËĪe": 1280, + "ĠÉŁaËIJËĪeËIJÉ¡aËIJ": 1281, + "ĠËĪÊĬs": 1282, + "ĠtÉķhËĪiÉľ": 1283, + "ntÊĥ": 1284, + "ĠxËĪuo": 1285, + "ËĪuÊģ": 1286, + "Ġɪm": 1287, + "ɳÉĸ": 1288, + "ËĪyÉĻÉľkh": 1289, + "ĠËĪyÉĽ": 1290, + "ĠmËĮaËIJ": 1291, + "ÅĵÊģ": 1292, + "ĠËĪalt": 1293, + "ĠkÉĻm": 1294, + "Êİo": 1295, + "ĠÉIJn": 1296, + "Ġfy": 1297, + "ĠËĮÉĽra": 1298, + "ĠÉ¡ËĪÊĬ": 1299, + "ĠpËĪÊĮ": 1300, + "ls": 1301, + "ĠlËĪiËIJ": 1302, + "ĠÊĤËĪy": 1303, + "ĠbɪkËĪÊĮz": 1304, + "ĠÉ¡ÉĽt": 1305, + "Ġbɾ": 1306, + "tʰ": 1307, + "tÉĻlËĮÉĻb": 1308, + "xo": 1309, + "skËĮaËIJ": 1310, + "ɲʲ": 1311, + "ËĪeËIJkÊĪ": 1312, + "rÉĻ": 1313, + "tÊĥo": 1314, + "ĠpÊģÉĶ": 1315, + "ĠɹËĪaɪt": 1316, + "ĠpËĪei": 1317, + "ËĮɪç": 1318, + "jËĪÉĽÉ¾": 1319, + "tËIJa": 1320, + "ĠÉIJbËĮaÊĬt": 1321, + "ĠkÊĭËĪeËIJÊĥcÉĻn": 1322, + "ĠvËĪe": 1323, + "ÊĬÉľ": 1324, + "ĠakËĪe": 1325, + "ĠpËĪai": 1326, + "vËĪÉĽ": 1327, + "Ġθɹ": 1328, + "ɪf": 1329, + "ĠavËĪÉĽ": 1330, + "ĠkËĪe": 1331, + "dËĪi": 1332, + "ËĪeËIJÉĸ": 1333, + "ĠbÉĻt": 1334, + "ÊĪʰ": 1335, + "teËIJ": 1336, + "θjËĪÉĶn": 1337, + "dÉľ": 1338, + "ĠjËĪiÉľ": 1339, + "Ġve": 1340, + "É£ËĪu": 1341, + "ËĪÊĮhÉĻl": 1342, + "ĠpÉĶ": 1343, + "ĠÉ¡r": 1344, + "Ġða": 1345, + "ĠvËĪiËIJ": 1346, + "ĠËĮÉijËIJ": 1347, + "ËĪÉĻÊĬnt": 1348, + "ĠbËĪaËIJɾ": 1349, + "ĠmËĪÊĮtÉĻlËĮÉĻb": 1350, + "ld": 1351, + "ĠtÉķËĮÉĶ": 1352, + "pa": 1353, + "ðËĪad": 1354, + "ËĪiɾ": 1355, + "ĠxËĪu": 1356, + "ĠlËĪiÉľÅĭ": 1357, + "ËĪeɪs": 1358, + "ĠÉĹËĮeÉľn": 1359, + "ĠthËĪiÉĽ": 1360, + "tËIJe": 1361, + "ĠavËĮÉĽk": 1362, + "ĠËĮÉĶ": 1363, + "ĠkËĪÉiju": 1364, + "ɪv": 1365, + "iËIJz": 1366, + "ËĪos": 1367, + "Ġɡɹ": 1368, + "and": 1369, + "ĠlËĪiou": 1370, + "ĠËĪoÉľ": 1371, + "É¡l": 1372, + "ĠpËĪÉĶËIJ": 1373, + "ĠmËĮeËIJ": 1374, + "ĠkËĪÉĴ": 1375, + "nos": 1376, + "çÉĻn": 1377, + "fÉĻn": 1378, + "ĠsËĪÊĮktËĮeËIJ": 1379, + "ĠËĪaɪn": 1380, + "ËĪoËIJre": 1381, + "jËĪÉĽn": 1382, + "ĠðËĪÉĽn": 1383, + "ĠtÉķhËĪiÉĽÉľn": 
1384, + "ĠhËĪaɪ": 1385, + "ɾËĪÉĽ": 1386, + "ĠsËĪu": 1387, + "ĠkËĪɪjaËIJ": 1388, + "ĠpjËĮÊĬ": 1389, + "ĠhÉĻmËĮaËIJ": 1390, + "ĠËĮÊĮp": 1391, + "ĠpËĪÊĮhÉĻl": 1392, + "ĠxËĪÉĻ": 1393, + "dËĪe": 1394, + "ĠmÉij": 1395, + "ĠÊĬm": 1396, + "ndÉĻ": 1397, + "ĠdËĪÉĻÊĬnt": 1398, + "ËĪeËIJÊĥÉĻn": 1399, + "Ġðats": 1400, + "is": 1401, + "ĠcËĪaËIJh": 1402, + "pe": 1403, + "ĠsËĮo": 1404, + "ĠðËĪe": 1405, + "ĠsËĪaËIJt": 1406, + "ËĪaÊģ": 1407, + "ĠsËĪe": 1408, + "ÉĻk": 1409, + "ɪÊĭ": 1410, + "ĠkËĪoËIJi": 1411, + "kÉĶ": 1412, + "ĠvËĪaËIJÊĬ": 1413, + "ĠfËĪei": 1414, + "ĠlËĪeËIJk": 1415, + "ĠhËĪiÉĻ": 1416, + "ĠaÊĬ": 1417, + "ËĪÉĽndo": 1418, + "ËĪes": 1419, + "ĠzËĪÉĶ": 1420, + "ĠËĪÉĽÉ¾a": 1421, + "nËĪiÉľn": 1422, + "ĠkËĪÊĮm": 1423, + "ĠlËĪÉĴ": 1424, + "ɪst": 1425, + "ĠpÉij": 1426, + "ĠfËĪÉĶ": 1427, + "ĠthËĪonÉ¡": 1428, + "nke": 1429, + "ËĮɪk": 1430, + "ĠɲËĪÉĻ": 1431, + "ËĮÊĮm": 1432, + "ËĪiËIJt": 1433, + "ĠwËĪÉĴnt": 1434, + "ËĪaβan": 1435, + "ĠbËĪÊĮr": 1436, + "ÉĽnd": 1437, + "ĠËĮÉijËIJbÉľ": 1438, + "ĠvËĪaɪ": 1439, + "ĠtÊĥËĮi": 1440, + "ĠθËĪɪÅĭk": 1441, + "sti": 1442, + "Ġkɹ": 1443, + "ĠËĪaÊĬt": 1444, + "stÉĻn": 1445, + "ĠÊĭËĪÊĮn": 1446, + "ĠÉ¡ËĮaËIJ": 1447, + "ËĪaËIJÉľÉ²": 1448, + "Êģi": 1449, + "ĠnËĪÉĶx": 1450, + "ĠɹËĪiÉĻlɪ": 1451, + "ĠvËĮi": 1452, + "ĠðeÉĻ": 1453, + "ËĮɪtÊĥ": 1454, + "ĠvËĪyÉĻ": 1455, + "ĠËĮaËIJpkËĮaËIJ": 1456, + "ĠfËĮaËIJɪ": 1457, + "ĠpËĪÉĶ": 1458, + "ĠnËĪÊĮmb": 1459, + "θes": 1460, + "jËĪÉĽÊģ": 1461, + "ĠkËĪÊĬcʰ": 1462, + "mËĪÉĽ": 1463, + "ĠvËĪu": 1464, + "ĠlÅĵÊģ": 1465, + "ĠiËIJm": 1466, + "ÊĪÉĻɾ": 1467, + "tÊĥi": 1468, + "ËIJs": 1469, + "ĠtËĪy": 1470, + "ĠmËĪiÉľÅĭ": 1471, + "ɾËĪe": 1472, + "mËĮa": 1473, + "ĠmËĮiËIJ": 1474, + "ĠÉĽks": 1475, + "ɪp": 1476, + "ĠkËĪÊĮɾnËĮaËIJ": 1477, + "ĠËĮaÊĬx": 1478, + "rËĪiËIJ": 1479, + "ĠcËĪÊĮl": 1480, + "mos": 1481, + "ĠkËĪÊĮɾtËĮeËIJ": 1482, + "iËIJɾ": 1483, + "kÉĻn": 1484, + "ĠdËĪu": 1485, + "naËIJ": 1486, + "ĠpwËĪe": 1487, + "ËĮÉĶɪ": 1488, + "ĠtÉķhËĪiÉĽ": 1489, + "ĠβËĪi": 1490, + "ËĪiÉĽÉľt": 1491, + "Ġte": 1492, + "ËĪaðos": 1493, 
+ "mËĪa": 1494, + "ĠvËĪo": 1495, + "ĠmËĪɪ": 1496, + "ĠbËĮi": 1497, + "ad": 1498, + "do": 1499, + "ĠnËĪaÊĬ": 1500, + "ĠʲËĪyÉľ": 1501, + "wËĪÉĽ": 1502, + "ËĪis": 1503, + "el": 1504, + "Ġpar": 1505, + "ĠtËĪai": 1506, + "ĠdËĪɪjaËIJ": 1507, + "hËĪi": 1508, + "ĠɾËĪÊĮ": 1509, + "ĠdËĪe": 1510, + "ËĪaɪd": 1511, + "Ġper": 1512, + "ĠsËĮÉĶ": 1513, + "we": 1514, + "ÊĬm": 1515, + "Ġin": 1516, + "ĠjËĪuËIJz": 1517, + "ËĪiËIJpÉĻl": 1518, + "ĠÊĭËĪaËIJl": 1519, + "ĠetËĪÉĽ": 1520, + "ËĮÉĽm": 1521, + "ĠnËĪu": 1522, + "ËĪÉĽkt": 1523, + "ĠiËIJɾ": 1524, + "Ġbɹ": 1525, + "ĠtshËĪi": 1526, + "ĠÉĹËĪÉĶÉľ": 1527, + "ĠkwËĮa": 1528, + "ĠfËĪuÉľ": 1529, + "wËĮa": 1530, + "ĠdËĪiËIJ": 1531, + "ĠÉ¡ËĪyÉĻ": 1532, + "ËĮÉĽËIJ": 1533, + "rËĪa": 1534, + "Ġne": 1535, + "ĠzËĪyÉĻ": 1536, + "ĠbËĪaɪ": 1537, + "ĠÉŁËĪÊĮb": 1538, + "ËĪuËIJto": 1539, + "ÊĬnt": 1540, + "Ġcʰ": 1541, + "ËĪÉĽnti": 1542, + "ËĪoÉĻ": 1543, + "ĠsËĮÊĮm": 1544, + "ĠlÉij": 1545, + "ËĮeva": 1546, + "É¾ÉĽ": 1547, + "ntÉľ": 1548, + "ĠmËĪÉĽn": 1549, + "ËĪÉijËIJk": 1550, + "Ġkil": 1551, + "ËĪones": 1552, + "ff": 1553, + "ĠmËĪÉĽËIJ": 1554, + "ĠvËĪÉĻɪ": 1555, + "ĠËĪÉĶËIJ": 1556, + "ĠËĮɪnt": 1557, + "ÊĬn": 1558, + "Ġwɪl": 1559, + "Ġsin": 1560, + "ĠËĮalla": 1561, + "ĠaβËĪia": 1562, + "pi": 1563, + "ËĪoÉľ": 1564, + "ɪjËĮaËIJ": 1565, + "ku": 1566, + "ĠvËĪɪ": 1567, + "Ġtut": 1568, + "ĠtËĪeÉľ": 1569, + "ĠhËĪÉĶ": 1570, + "βɾe": 1571, + "sÉĻɾ": 1572, + "ĠkhËĪai": 1573, + "ĠmËĪÉĶ": 1574, + "Ġta": 1575, + "ĠɲËĪaËIJ": 1576, + "Ġnu": 1577, + "ËĪuËIJn": 1578, + "ĠÉĻËIJÉľ": 1579, + "ĠËĪaÊĬf": 1580, + "ËĪiËIJdÉľ": 1581, + "nti": 1582, + "ĠpËĪiËIJpÉĻl": 1583, + "Ġkj": 1584, + "Ġpe": 1585, + "ĠmËĪÉij": 1586, + "ËĮaɪ": 1587, + "ËĪaËIJle": 1588, + "ĠvËĮÉĻËIJÉªÉľ": 1589, + "mpo": 1590, + "ĠkËĪɪt": 1591, + "ĠnËĮÉĽ": 1592, + "ĠÉŁËĪaËIJtaËIJ": 1593, + "ĠsËĪaËIJtʰ": 1594, + "ĠÉŁËĪi": 1595, + "Ġso": 1596, + "ĠbËĪÉĽ": 1597, + "kËĪi": 1598, + "ɪti": 1599, + "Ġtsi": 1600, + "ĠkÊģ": 1601, + "ËĮÉĴ": 1602, + "É¡ÉĻl": 1603, + "kst": 1604, + "ĠmËĪÉĻËIJ": 1605, + "ËĪÊĮk": 1606, + 
"ĠnËĪaËIJÊĬ": 1607, + "Ġap": 1608, + "ĠlËĪɪkʰ": 1609, + "lli": 1610, + "ĠkwËĪal": 1611, + "ĠËĪÉĻËIJ": 1612, + "ĠtsËĪuei": 1613, + "Ġdo": 1614, + "ĠkËIJjËĪo": 1615, + "ÊĬz": 1616, + "ĠpËĪaËIJ": 1617, + "ĠmËĪuËIJ": 1618, + "ĠÉ¡ÉĻv": 1619, + "rËĪi": 1620, + "Ġtw": 1621, + "ËĮɪn": 1622, + "dËĪÉij": 1623, + "ĠðËĪi": 1624, + "ĠËĪaËIJi": 1625, + "ĠhËĪiÉĽ": 1626, + "ĠðËĮÉĽm": 1627, + "ĠpʰËĪɪɾ": 1628, + "ÉĴm": 1629, + "ĠËĮeËIJ": 1630, + "ĠthËĪaiÉľ": 1631, + "ĠvËĪas": 1632, + "ĠnÉijËIJ": 1633, + "pÉĻn": 1634, + "ĠpËĮÉĻɾ": 1635, + "ĠÉĹËĪaËIJɪ": 1636, + "ËĪouÉľ": 1637, + "ĠÊIJËĪuÉľ": 1638, + "ĠmËĪan": 1639, + "ĠtËĪÉĻÉªÉľ": 1640, + "ĠlËĪaËIJÊĬ": 1641, + "mËĪÉĽnte": 1642, + "ĠfËĪam": 1643, + "sjËĪÉĶ": 1644, + "ĠpËĪÉĻ": 1645, + "ËĪeËIJm": 1646, + "ĠpËĪÊĮr": 1647, + "jËĪi": 1648, + "ĠlÉĽ": 1649, + "Ġten": 1650, + "ËĪoËIJra": 1651, + "ki": 1652, + "ĠÊĤËĪaËIJÊĬ": 1653, + "kɪ": 1654, + "bËIJe": 1655, + "ËĪalt": 1656, + "ðɪ": 1657, + "pËĪi": 1658, + "ĠËĮÉĽnt": 1659, + "ĠmËĪei": 1660, + "ĠhËĪÉĻÊĬ": 1661, + "ĠhËĪÉĽÉ¾": 1662, + "jËĪÉij": 1663, + "ĠhËĪÊĬaËIJ": 1664, + "mÉľ": 1665, + "Ġdʰ": 1666, + "ĠtÊĥËĪe": 1667, + "lËĪÉĽ": 1668, + "ËĪaËIJte": 1669, + "ĠpËĪuËIJ": 1670, + "ĠmËĪÊĬ": 1671, + "ËĪaËIJɪÊĪ": 1672, + "diËIJ": 1673, + "ĠfɹÉĴm": 1674, + "ĠhËĪÉijËIJ": 1675, + "βo": 1676, + "ĠmËĪiÉľn": 1677, + "ĠðiËIJz": 1678, + "ĠkËĪou": 1679, + "ËĪiËIJna": 1680, + "ĠavËĮeva": 1681, + "ĠËĪaËIJɾ": 1682, + "ĠnËĪuËIJɾ": 1683, + "ĠβËĪe": 1684, + "Ġzaɪn": 1685, + "ËĪÉĽd": 1686, + "ÉĹ": 1687, + "ËĪeɪk": 1688, + "sËĮÉĻÊĬ": 1689, + "ËĪeËIJÉŁ": 1690, + "ĠÊĤËĪÉĻËIJ": 1691, + "je": 1692, + "cʰËIJ": 1693, + "ËĪÉĶr": 1694, + "ÉĽËIJ": 1695, + "ĠtÉķhËĪyÃ¦Éľn": 1696, + "ĠËĮaɪnÉĻn": 1697, + "ĠiËIJn": 1698, + "ĠbËĪÊĮc": 1699, + "ËĪiËIJm": 1700, + "ɾas": 1701, + "ËĮÉĻs": 1702, + "ĠvËĪeËIJ": 1703, + "ĠËĪÉĻrÉľ": 1704, + "ĠduËIJ": 1705, + "ntÉĻ": 1706, + "ĠpɹËĪÉĴ": 1707, + "ĠbËĪɪ": 1708, + "ĠwËĪoÉľ": 1709, + "nËĮi": 1710, + "ĠhÉIJ": 1711, + "ĠkËĪÉĽ": 1712, + "Ġet": 1713, + "jËĪÉĽndo": 1714, + "ĠËĪaiÉľ": 1715, + "Ġli": 
1716, + "ĠËĪaÊĬs": 1717, + "kËIJo": 1718, + "ĠÉĹËĪyÉĻ": 1719, + "keËIJ": 1720, + "ĠfËĪiËIJl": 1721, + "ĠbʰËĪaËIJi": 1722, + "ĠÉ¡ÉĻÊĥ": 1723, + "ÊĴËĪe": 1724, + "ĠnjËĪuËIJ": 1725, + "ĠËĪak": 1726, + "ĠÉĹËĪaËIJ": 1727, + "zËĪa": 1728, + "vËĪe": 1729, + "ĠhËĮaÊĬ": 1730, + "ÉIJç": 1731, + "ĠɾËĪÊĮkʰ": 1732, + "pËĪe": 1733, + "ĠtÉĻbi": 1734, + "ĠpËĪÊĮhÉĻlËĮeËIJ": 1735, + "ĠfËĪÉĽ": 1736, + "ĠwËĮɪtÊĥ": 1737, + "ĠtÉķËĪyÉĽÉľ": 1738, + "wËĮe": 1739, + "ËĮaɪt": 1740, + "ĠnÉijËIJx": 1741, + "ĠkËĪÉĶËIJn": 1742, + "ÊĬk": 1743, + "ĠbËĪaËIJd": 1744, + "ÅĭÉĻn": 1745, + "Ġni": 1746, + "ĠbËĪe": 1747, + "ĠmËĮÊĬ": 1748, + "ËĪar": 1749, + "ĠmËĮeɪk": 1750, + "ĠsËĪaËIJɾ": 1751, + "βe": 1752, + "ĠtÉķhËĪiÉľÅĭ": 1753, + "itËĪe": 1754, + "kËĮe": 1755, + "ËĪÉĽËIJl": 1756, + "ËĮÉĴn": 1757, + "ËĮÉij": 1758, + "ĠbËĪɪl": 1759, + "ĠwÊĬd": 1760, + "ĠbËĪoËIJl": 1761, + "rd": 1762, + "iÉĻ": 1763, + "Ġda": 1764, + "ĠbËĪaËIJÊĬ": 1765, + "ĠnËĪÊĮmbÉĻɾ": 1766, + "ËĪaËIJÉªÉľ": 1767, + "ĠÉĽm": 1768, + "ĠmiËIJɾ": 1769, + "ËĪeɪm": 1770, + "los": 1771, + "ËĮÉĽt": 1772, + "ĠËĮaÊĬs": 1773, + "ĠmËĪaÉľt": 1774, + "ĠwËĪuÉĻ": 1775, + "ĠwËĪeɪ": 1776, + "Ġseɲ": 1777, + "ĠbjËĪÉĽ": 1778, + "ĠwÉĽn": 1779, + "fl": 1780, + "ĠkhwËĪa": 1781, + "dËĪÉĽ": 1782, + "vɹɪ": 1783, + "ĠËĪaɾ": 1784, + "jËĪÉijuÉľ": 1785, + "ĠËĮaËIJpkËĮeËIJ": 1786, + "bÊģ": 1787, + "ĠtËĪaɪm": 1788, + "ĠËĪÉij": 1789, + "ĠsËĮa": 1790, + "ĠzËĪoɪ": 1791, + "ËĪÉĶɾa": 1792, + "ĠdËĪø": 1793, + "ËĪÉĶɾt": 1794, + "ĠÅĭËĪÉĶ": 1795, + "min": 1796, + "ĠlËĪÊĬk": 1797, + "ËĪÉĶËIJt": 1798, + "ĠËĪÉĶtɾ": 1799, + "ĠfËĪaɪ": 1800, + "ĠÉ¡ÉĴt": 1801, + "ËĪeËIJÉĻn": 1802, + "kËĪÉĶ": 1803, + "ĠvËĪÉĽÉ¹i": 1804, + "mÉĽ": 1805, + "ËĪaɪz": 1806, + "Ġesp": 1807, + "ɲa": 1808, + "ĠlËĪo": 1809, + "ËĪÉĽËIJra": 1810, + "βËĪi": 1811, + "ouÉľ": 1812, + "ËĮÉĻk": 1813, + "tÊĥuËIJ": 1814, + "ĠnËĪyÉĻ": 1815, + "ÊĪɾ": 1816, + "ĠÉ¡ËĪy": 1817, + "ĠtËĪoðo": 1818, + "ËĪɪçt": 1819, + "Ġmɪç": 1820, + "ĠËĪand": 1821, + "ĠkwËĮÉĽl": 1822, + "ĠÊĤËĪaËIJ": 1823, + "ĠnËĪiÉľ": 1824, + "ËĪÉĶp": 1825, + 
"ËĪiËIJz": 1826, + "ĠÊĤËĪaÊĬ": 1827, + "ĠɾËĮÉĻhi": 1828, + "ĠsËĮÊĬo": 1829, + "ĠÉĽÉ¡": 1830, + "ĠdÅĵ": 1831, + "ĠÉ¡ËĮaËIJÉªÉľ": 1832, + "dɪ": 1833, + "lËĮa": 1834, + "stËĪi": 1835, + "ĠdËĮiËIJz": 1836, + "ĠtËĮÊĬ": 1837, + "θi": 1838, + "ĠËĪɪskËĮoËIJ": 1839, + "ndÉĻn": 1840, + "Ġtsv": 1841, + "ĠhËĪÉĻËIJ": 1842, + "ĠÊĥËĪÊĬ": 1843, + "ÉĻtËĮeËIJ": 1844, + "pËĮÉĽ": 1845, + "ËĪaɾÉĶn": 1846, + "ĠpÉĽÊģ": 1847, + "Ġy": 1848, + "mnËĮeËIJ": 1849, + "ËĪÉĽllo": 1850, + "ĠÉ¡ËĪÉĻ": 1851, + "ĠËĮad": 1852, + "ĠÊĥv": 1853, + "ËĪÊıɾ": 1854, + "rËĪe": 1855, + "yËIJ": 1856, + "ĠpËĪaËIJs": 1857, + "ĠËĪÉĽn": 1858, + "ɪdÊĴ": 1859, + "ËĪuai": 1860, + "Ġfi": 1861, + "ĠtËĪyÉĻ": 1862, + "ËĪaËIJÉŁ": 1863, + "ĠtjËĪe": 1864, + "ËĪaËIJnaËIJ": 1865, + "stɾ": 1866, + "Êİe": 1867, + "ËĮeɪt": 1868, + "ba": 1869, + "ðas": 1870, + "vÊģ": 1871, + "ĠzËĪÉĻËIJ": 1872, + "ËĪaËIJli": 1873, + "ÉŁÊ°eËIJ": 1874, + "ËĪaËIJteËIJ": 1875, + "ĠvËĪa": 1876, + "Ġsal": 1877, + "ËĪaËIJno": 1878, + "ĠÉ¡ÉĻz": 1879, + "ĠhËĪoËIJti": 1880, + "ĠɲËĪiÉĽ": 1881, + "tÉľ": 1882, + "ĠËĪaËIJp": 1883, + "ĠwËĪÉĽl": 1884, + "ĠmËĪɪl": 1885, + "ĠfyËIJɾ": 1886, + "ËĪÉĽËIJsaËIJ": 1887, + "ĠbËĮiËIJ": 1888, + "ËĪaËIJjaËIJ": 1889, + "ËĪɪp": 1890, + "ĠfÊģ": 1891, + "tsiËĪoËIJne": 1892, + "ĠwËĪuÉľ": 1893, + "Ġvi": 1894, + "ĠwËĪÉijÉľn": 1895, + "ËĪoËIJn": 1896, + "ĠÉĹËĪÉĻɪ": 1897, + "ĠÊĿËĪo": 1898, + "Ġra": 1899, + "mÉĻnt": 1900, + "ËĪaÊĬnd": 1901, + "ĠpÉĽÉ¾": 1902, + "ĠÉĹËĪaËIJÊĬ": 1903, + "oËIJɾ": 1904, + "hËĪo": 1905, + "ĠÉĴn": 1906, + "ĠÊİe": 1907, + "ĠsËĪɪks": 1908, + "É¡n": 1909, + "ĠÉ¡ËĪa": 1910, + "Ġθj": 1911, + "ĠpËĪe": 1912, + "spe": 1913, + "ĠvËĪÉĻ": 1914, + "ĠfËĪɪ": 1915, + "ĠËĮɪntÊĬ": 1916, + "lÉĻn": 1917, + "ĠnËĪiËIJd": 1918, + "ĠsËĮÊĬa": 1919, + "ĠËĪum": 1920, + "ĠdËĪeɪ": 1921, + "ĠËĪÊĮbʰi": 1922, + "ËĪÉijËIJɾ": 1923, + "ĠbËĪiÉĽÉľt": 1924, + "Êİos": 1925, + "ĠtshËĪaiÉľ": 1926, + "ĠËĮɪskËĮaËIJ": 1927, + "ĠaÊĬÉĻ": 1928, + "ĠËĪyæ": 1929, + "Ġdyn": 1930, + "ĠmËĪiËIJn": 1931, + "ĠËĪÊĮcʰËIJ": 1932, + "ĠsÉĽ": 1933, + "ĠnËĪy": 1934, + 
"ĠnËĮÉĽl": 1935, + "ɡɾ": 1936, + "ÊĥËĪe": 1937, + "ĠÊĤËĮÉĽ": 1938, + "ĠËĪÉĽvɹɪ": 1939, + "ËĪÉĽlp": 1940, + "ĠbËĪak": 1941, + "ĠeËIJ": 1942, + "ĠfËĪaËIJ": 1943, + "ĠkÉĽl": 1944, + "ĠËĪeËIJs": 1945, + "jËĪaËIJd": 1946, + "ĠlËĮi": 1947, + "mbɾe": 1948, + "ktÉĻ": 1949, + "nta": 1950, + "tËĪu": 1951, + "ĠðËĪat": 1952, + "ĠËĪaβ": 1953, + "ÉĻɹi": 1954, + "ĠkwËĮÉĽlla": 1955, + "ĠbÉĻn": 1956, + "rËĮÉĽ": 1957, + "ĠnÉĶ": 1958, + "ĠÉ¡ËĪɪ": 1959, + "ĠËĪap": 1960, + "ɹÉĻ": 1961, + "ËĪaÉľkh": 1962, + "ĠÊIJËĪi": 1963, + "ĠËĪÉijËIJ": 1964, + "ɪɡÉĻn": 1965, + "ĠwËĪai": 1966, + "ĠpÉĻt": 1967, + "kËIJa": 1968, + "ĠbËĪÉĽËIJ": 1969, + "ËĪeËIJÊĭ": 1970, + "lsÉĻÊĬ": 1971, + "ĠcËĪaËIJhɪËĮeËIJ": 1972, + "ĠkÉĻn": 1973, + "ĠËĮaɪnÉĻm": 1974, + "ËĪuËIJt": 1975, + "ĠhËĪaÊĬ": 1976, + "ĠtËĪanto": 1977, + "ĠhÉIJz": 1978, + "ĠsËĪÊĮɾ": 1979, + "Ġno": 1980, + "ĠtËĪÉĶËIJ": 1981, + "ĠzËĪaɪ": 1982, + "ĠtÉķËĪiÉĽÉľ": 1983, + "ĠkozËĪi": 1984, + "ĠkËĪei": 1985, + "ðËĪÉĶɾ": 1986, + "ËĮÉĶÊģ": 1987, + "ĠtËĪÊĮɾ": 1988, + "ĠÊIJËĪÉĻ": 1989, + "ĠÉķËĪyÉĽÉľ": 1990, + "ĠmËĮÊĬÉŁÊ°eËIJ": 1991, + "mf": 1992, + "ĠvËĪiËIJdÉľ": 1993, + "kËĪa": 1994, + "ĠÉIJÉ¡": 1995, + "kw": 1996, + "ĠÊģÉĽ": 1997, + "xÉĻn": 1998, + "ĠdÊĬ": 1999, + "ĠkËĪÊĮɾnËĮeËIJ": 2000, + "jËĪaËIJdaËIJ": 2001, + "ĠfÉĻ": 2002, + "ĠËĮimp": 2003, + "Ġhɪz": 2004, + "ĠʰÏĩ": 2005, + "ËĪoËIJni": 2006, + "ĠxËĪiÉľ": 2007, + "ËĪeËIJsÊĪ": 2008, + "ÊıbÉľ": 2009, + "ËĮÉĶɾke": 2010, + "ĠÉ¡ËĪÉĻÊĬ": 2011, + "ËĪɪÊĥÉĻn": 2012, + "les": 2013, + "ĠfËĪiËIJ": 2014, + "É¡tÉĻ": 2015, + "ËĪeËIJre": 2016, + "ĠvËĮaËIJ": 2017, + "ĠËĪeɪ": 2018, + "ĠmËĪuÉĻÉľn": 2019, + "ĠÉ¡ËĪÊĬd": 2020, + "ĠmËĮaɪn": 2021, + "zËĪe": 2022, + "ĠlËĪiÉľ": 2023, + "Ġmu": 2024, + "ĠkËĮÉĽl": 2025, + "ĠjËĮÉĻh": 2026, + "ĠfËĮÉĶɾ": 2027, + "fɹ": 2028, + "ĠkËĪaɪn": 2029, + "ĠËĪÉĴlsÉĻÊĬ": 2030, + "θɪÅĭ": 2031, + "ĠthËĪonÉ¡Éľ": 2032, + "tËĪÉij": 2033, + "θjo": 2034, + "mËĪÉĶ": 2035, + "Ġos": 2036, + "ĠsÊĬ": 2037, + "ĠsËĪÊĮmÉĻ": 2038, + "ĠvËĮÉĽn": 2039, + "nËĪo": 2040, + "ĠËĪaktÊĥuËIJ": 2041, + "É£a": 2042, + "Ġtʰi": 
2043, + "ĠfËĮi": 2044, + "ĠvËĪÉĽl": 2045, + "ĠtËĪutËIJi": 2046, + "xos": 2047 + }, + "merges": [ + [ + "Ë", + "Ī" + ], + [ + "Ë", + "IJ" + ], + [ + "ËĪ", + "É" + ], + [ + "Ë", + "Į" + ], + [ + "É", + "Ļ" + ], + [ + "ËĪ", + "a" + ], + [ + "ËĪ", + "i" + ], + [ + "Ġ", + "t" + ], + [ + "É", + "ª" + ], + [ + "É", + "¾" + ], + [ + "Ġ", + "É" + ], + [ + "Ġ", + "k" + ], + [ + "É", + "ľ" + ], + [ + "Ġ", + "s" + ], + [ + "ËĪ", + "e" + ], + [ + "É", + "Ľ" + ], + [ + "ËĪ", + "o" + ], + [ + "Ġ", + "l" + ], + [ + "ËĪÉ", + "Ľ" + ], + [ + "Ġ", + "d" + ], + [ + "Ê", + "Ĭ" + ], + [ + "ËĪa", + "ËIJ" + ], + [ + "Ġ", + "p" + ], + [ + "Ì", + "ĥ" + ], + [ + "Ġ", + "m" + ], + [ + "ËĪ", + "u" + ], + [ + "Å", + "ĭ" + ], + [ + "Ã", + "°" + ], + [ + "ËĪÉ", + "Ķ" + ], + [ + "Ê", + "Į" + ], + [ + "ËĮ", + "a" + ], + [ + "Ġ", + "h" + ], + [ + "ËĪ", + "ÊĮ" + ], + [ + "Ġ", + "n" + ], + [ + "Ê", + "ģ" + ], + [ + "ËĪÉ", + "ij" + ], + [ + "Ê", + "ĥ" + ], + [ + "e", + "ËIJ" + ], + [ + "Ġ", + "a" + ], + [ + "Ġ", + "b" + ], + [ + "É", + "Ķ" + ], + [ + "ËĪÉ", + "Ļ" + ], + [ + "ÉĻ", + "n" + ], + [ + "Ġ", + "f" + ], + [ + "ËĪÉ", + "ª" + ], + [ + "É", + "¡" + ], + [ + "ËĪe", + "ËIJ" + ], + [ + "Ġ", + "j" + ], + [ + "n", + "t" + ], + [ + "Ġ", + "ð" + ], + [ + "Ġ", + "ËĮ" + ], + [ + "Ġt", + "s" + ], + [ + "ĠÉ", + "¡" + ], + [ + "É", + "ķ" + ], + [ + "ËĪo", + "ËIJ" + ], + [ + "Ê", + "°" + ], + [ + "a", + "ËIJ" + ], + [ + "ËĪ", + "y" + ], + [ + "Ġt", + "Éķ" + ], + [ + "ËĪi", + "ËIJ" + ], + [ + "Ġ", + "Ê" + ], + [ + "Ġ", + "v" + ], + [ + "Ġ", + "w" + ], + [ + "s", + "t" + ], + [ + "É", + "ij" + ], + [ + "n", + "d" + ], + [ + "ËĮ", + "i" + ], + [ + "Ì", + "ª" + ], + [ + "ËĮ", + "e" + ], + [ + "Ġ", + "z" + ], + [ + "ËĪa", + "ɪ" + ], + [ + "ËĪi", + "ÉĽ" + ], + [ + "Î", + "²" + ], + [ + "É", + "¹" + ], + [ + "Ġ", + "ËĮa" + ], + [ + "Î", + "¸" + ], + [ + "Ġh", + "ÉĽ" + ], + [ + "Ê", + "Ī" + ], + [ + "i", + "ËIJ" + ], + [ + "ËĮ", + "o" + ], + [ + "Ġ", + "ɪ" + ], + [ + "Éľ", + "n" + ], + [ + "Ġ", + "x" + ], + [ + "Ġt", 
+ "ÉĻ" + ], + [ + "ËĪu", + "ËIJ" + ], + [ + "ËĮ", + "ÉĻ" + ], + [ + "Ġj", + "ËĪi" + ], + [ + "ËĮ", + "ÉĽ" + ], + [ + "ĠÉ", + "Ľ" + ], + [ + "Ġ", + "ËĪa" + ], + [ + "ËĮa", + "ËIJ" + ], + [ + "Ġl", + "a" + ], + [ + "Ġð", + "e" + ], + [ + "ĠhÉĽ", + "ËIJ" + ], + [ + "Ġ", + "e" + ], + [ + "Ã", + "§" + ], + [ + "ÉĻ", + "l" + ], + [ + "o", + "ËIJ" + ], + [ + "ËĪÉij", + "u" + ], + [ + "Ê", + "Ĵ" + ], + [ + "u", + "ËIJ" + ], + [ + "ĠÉ", + "Ĺ" + ], + [ + "ĠÉ", + "ķ" + ], + [ + "ËĮ", + "eËIJ" + ], + [ + "ĠtÉķ", + "ËĪi" + ], + [ + "o", + "s" + ], + [ + "ËĪÉĶ", + "ËIJ" + ], + [ + "a", + "s" + ], + [ + "ËĪ", + "ÊĬ" + ], + [ + "Ġ", + "i" + ], + [ + "ËĪa", + "i" + ], + [ + "É", + "²" + ], + [ + "ɪ", + "n" + ], + [ + "t", + "s" + ], + [ + "Éľ", + "Åĭ" + ], + [ + "ĠÉ", + "Ł" + ], + [ + "Ġ", + "Êĥ" + ], + [ + "ËĪe", + "ɪ" + ], + [ + "ÉĽ", + "ɾ" + ], + [ + "ËĪÉĽ", + "ËIJ" + ], + [ + "ËĪÉĽ", + "ɾ" + ], + [ + "Ġ", + "r" + ], + [ + "t", + "Êĥ" + ], + [ + "ËĮ", + "ÉĶ" + ], + [ + "Ġd", + "ÉĻ" + ], + [ + "t", + "ÉĻ" + ], + [ + "o", + "u" + ], + [ + "ËĪy", + "ÉĻ" + ], + [ + "ĠËĮ", + "i" + ], + [ + "ÉĻ", + "ɾ" + ], + [ + "ËĪÉĻ", + "ÊĬ" + ], + [ + "ËĪÊĮ", + "ɾ" + ], + [ + "ËĪÉ", + "Ĵ" + ], + [ + "Ġt", + "h" + ], + [ + "ËĪo", + "n" + ], + [ + "Ê", + "ĭ" + ], + [ + "ËĪÉij", + "ËIJ" + ], + [ + "ËĪÊĮ", + "h" + ], + [ + "w", + "ËĪa" + ], + [ + "ËĪe", + "i" + ], + [ + "l", + "l" + ], + [ + "ĠÉ", + "IJ" + ], + [ + "Éij", + "ËIJ" + ], + [ + "a", + "n" + ], + [ + "É", + "Ł" + ], + [ + "ĠÊ", + "ĭ" + ], + [ + "Ġk", + "o" + ], + [ + "k", + "h" + ], + [ + "ɪ", + "Åĭ" + ], + [ + "ËĪaËIJ", + "ɪ" + ], + [ + "Ġt", + "Êĥ" + ], + [ + "ËĪaËIJ", + "t" + ], + [ + "ĠËĮ", + "e" + ], + [ + "ĠtÉķ", + "h" + ], + [ + "ËĪu", + "o" + ], + [ + "ËĪon", + "É¡" + ], + [ + "É", + "ĸ" + ], + [ + "a", + "t" + ], + [ + "Ġk", + "e" + ], + [ + "É", + "Ĵ" + ], + [ + "ĠÉķ", + "ËĪi" + ], + [ + "Ã", + "¸" + ], + [ + "ĠÉ", + "ij" + ], + [ + "ËĪeËIJ", + "k" + ], + [ + "Å", + "ĵ" + ], + [ + "r", + "e" + ], + [ + "Ġ", + "ɾ" + ], + [ + "Ġk", 
+ "ÉĶ" + ], + [ + "ËĮ", + "ÊĬ" + ], + [ + "s", + "k" + ], + [ + "Ġ", + "ÊĬ" + ], + [ + "Ġa", + "nd" + ], + [ + "ɪ", + "ç" + ], + [ + "Ġm", + "e" + ], + [ + "ËĪa", + "ɾ" + ], + [ + "Ġ", + "ËĪɪ" + ], + [ + "n", + "a" + ], + [ + "Ġ", + "β" + ], + [ + "Ġl", + "ËĪi" + ], + [ + "j", + "aËIJ" + ], + [ + "l", + "i" + ], + [ + "n", + "o" + ], + [ + "Ġɪ", + "n" + ], + [ + "Ġd", + "ËĮi" + ], + [ + "ĠÉ", + "²" + ], + [ + "t", + "ËIJ" + ], + [ + "ÉĻ", + "m" + ], + [ + "Ġl", + "ÉĻ" + ], + [ + "Ġð", + "ÉĻ" + ], + [ + "ɪ", + "k" + ], + [ + "ËĪÉĽ", + "l" + ], + [ + "Éľ", + "t" + ], + [ + "Ġs", + "e" + ], + [ + "e", + "s" + ], + [ + "ËĪo", + "u" + ], + [ + "ËĪa", + "ÊĬ" + ], + [ + "ĠÉ", + "Ķ" + ], + [ + "ɪ", + "t" + ], + [ + "Ġ", + "Åĭ" + ], + [ + "ËĪÉĽ", + "n" + ], + [ + "Ê", + "İ" + ], + [ + "Ġk", + "h" + ], + [ + "ËĪÉĽ", + "nt" + ], + [ + "ËĪaËIJ", + "ɾ" + ], + [ + "Ġk", + "i" + ], + [ + "m", + "p" + ], + [ + "l", + "t" + ], + [ + "É", + "£" + ], + [ + "Ġp", + "a" + ], + [ + "ËĪÉĻ", + "ËIJ" + ], + [ + "ɪ", + "s" + ], + [ + "ĠÉ", + "Ĵ" + ], + [ + "Ġl", + "e" + ], + [ + "ɪ", + "Éľ" + ], + [ + "ËĪÉĽ", + "t" + ], + [ + "Ġd", + "e" + ], + [ + "ĠÉ", + "¹" + ], + [ + "Ġt", + "ËĪoËIJ" + ], + [ + "Ġ", + "Êģ" + ], + [ + "Êĥ", + "ÉĻn" + ], + [ + "ĠÊĬ", + "nt" + ], + [ + "ËĪÉĶ", + "ɾ" + ], + [ + "ËĪa", + "ð" + ], + [ + "Ġa", + "ɪ" + ], + [ + "ĠÊ", + "IJ" + ], + [ + "Ġm", + "ËĪa" + ], + [ + "r", + "a" + ], + [ + "Ġk", + "ËĪɪ" + ], + [ + "k", + "t" + ], + [ + "ËIJ", + "p" + ], + [ + "ĠÊ", + "Ī" + ], + [ + "ËĪaËIJ", + "ÊĬ" + ], + [ + "Ġk", + "ËĪÊĮɾ" + ], + [ + "Ġ", + "ËĪÊĮ" + ], + [ + "ĠÉĴ", + "v" + ], + [ + "Ġe", + "l" + ], + [ + "k", + "s" + ], + [ + "Ġk", + "w" + ], + [ + "ÉĻ", + "t" + ], + [ + "nd", + "o" + ], + [ + "e", + "i" + ], + [ + "ĠËĮa", + "ËIJp" + ], + [ + "s", + "e" + ], + [ + "ÉĻ", + "ɹ" + ], + [ + "ËĪu", + "ei" + ], + [ + "ÉĻ", + "s" + ], + [ + "Ġk", + "ËĮo" + ], + [ + "ĠÊ", + "Ĥ" + ], + [ + "ĠËĮ", + "ÊĬ" + ], + [ + "Ġ", + "c" + ], + [ + "ĠÉĽ", + "n" + ], + [ + "ËĪa", + "nt" + 
], + [ + "θ", + "j" + ], + [ + "ËĮo", + "ËIJ" + ], + [ + "Ġ", + "ËĪaËIJ" + ], + [ + "Ġp", + "ɾ" + ], + [ + "s", + "i" + ], + [ + "Ġ", + "ËĪe" + ], + [ + "Ġj", + "uËIJ" + ], + [ + "Ġk", + "ËĮe" + ], + [ + "ËĮ", + "ɪ" + ], + [ + "ÉĶ", + "n" + ], + [ + "Ġs", + "ËĪÊĮ" + ], + [ + "Ġ", + "ËĪu" + ], + [ + "n", + "i" + ], + [ + "Ġs", + "t" + ], + [ + "Ġd", + "iËIJ" + ], + [ + "Ġk", + "eËIJ" + ], + [ + "ĠjËĪi", + "ou" + ], + [ + "ËĪai", + "Éľ" + ], + [ + "Ġd", + "ÊĴ" + ], + [ + "Ġ", + "ËĪÉĶ" + ], + [ + "v", + "a" + ], + [ + "ËIJ", + "ɾ" + ], + [ + "ËĪ", + "ø" + ], + [ + "ËĮÉĻ", + "ÊĬ" + ], + [ + "Ġp", + "ËĪu" + ], + [ + "Ġs", + "u" + ], + [ + "Ġm", + "a" + ], + [ + "Ġ", + "ÉĻ" + ], + [ + "d", + "ÊĴ" + ], + [ + "Ġp", + "ʰ" + ], + [ + "l", + "e" + ], + [ + "i", + "n" + ], + [ + "ĠtÉķh", + "ËĪi" + ], + [ + "Ġw", + "ËĪo" + ], + [ + "r", + "o" + ], + [ + "ËĮ", + "y" + ], + [ + "ɾ", + "a" + ], + [ + "Ġs", + "ËĪi" + ], + [ + "ð", + "ÉĻ" + ], + [ + "Ġs", + "eËIJ" + ], + [ + "l", + "a" + ], + [ + "ĠÊ", + "Ĵ" + ], + [ + "m", + "b" + ], + [ + "Ġh", + "ËĪoËIJ" + ], + [ + "Ġb", + "ʰ" + ], + [ + "ĠÉĽ", + "ɾ" + ], + [ + "Ġð", + "at" + ], + [ + "s", + "p" + ], + [ + "ÉĶ", + "ɾ" + ], + [ + "e", + "n" + ], + [ + "Ġs", + "ÉĻ" + ], + [ + "ËĪÉĶ", + "Éľ" + ], + [ + "Ġl", + "ËĮa" + ], + [ + "ĠËĮ", + "ÉĽ" + ], + [ + "Ġ", + "ËĪy" + ], + [ + "É¡", + "aËIJ" + ], + [ + "Ġd", + "ÉĽÉ¾" + ], + [ + "ËĪÉĽ", + "Êģ" + ], + [ + "Éľ", + "kh" + ], + [ + "ËĪi", + "ÉĻ" + ], + [ + "ËĪa", + "n" + ], + [ + "Ġm", + "ËĪo" + ], + [ + "ËĪa", + "β" + ], + [ + "Ġa", + "l" + ], + [ + "Ġ", + "ËĪeËIJ" + ], + [ + "Ġ", + "θ" + ], + [ + "Ġn", + "ËĪi" + ], + [ + "p", + "ʰ" + ], + [ + "ll", + "a" + ], + [ + "Ġp", + "l" + ], + [ + "ËĪ", + "Åĵ" + ], + [ + "j", + "ËĪÉiju" + ], + [ + "Ġa", + "v" + ], + [ + "Ġm", + "ËĪi" + ], + [ + "Ġf", + "ËĪa" + ], + [ + "ËĪÉ", + "ľ" + ], + [ + "m", + "e" + ], + [ + "ËĮÉĻ", + "h" + ], + [ + "ËĪu", + "ÉĻ" + ], + [ + "i", + "t" + ], + [ + "j", + "ËĪe" + ], + [ + "Ġ", + "o" + ], + [ + "ËĪÉľ", + "ËIJ" + 
], + [ + "ĠtÉķËĪi", + "ou" + ], + [ + "ÉĶ", + "ËIJ" + ], + [ + "Ġn", + "ÉĻ" + ], + [ + "ËĪÉĻ", + "Éľn" + ], + [ + "Ġm", + "ÉĻ" + ], + [ + "Ġd", + "eËIJ" + ], + [ + "m", + "o" + ], + [ + "s", + "a" + ], + [ + "j", + "ËĪÉĶ" + ], + [ + "ËĪa", + "l" + ], + [ + "ĠtÉķ", + "ËĪiÉĽ" + ], + [ + "ĠÉ¡", + "ÉĻ" + ], + [ + "ð", + "a" + ], + [ + "Ġɪ", + "z" + ], + [ + "Ġs", + "a" + ], + [ + "r", + "i" + ], + [ + "ĠËĮi", + "l" + ], + [ + "ËĮ", + "u" + ], + [ + "Ġk", + "aËIJ" + ], + [ + "ĠÉĻ", + "ËIJ" + ], + [ + "ĠÉ", + "ĸ" + ], + [ + "Ġk", + "a" + ], + [ + "ËĪÊĮh", + "i" + ], + [ + "Ġj", + "eËIJ" + ], + [ + "Ġt", + "ʰ" + ], + [ + "n", + "e" + ], + [ + "k", + "ËIJ" + ], + [ + "Ġts", + "ËĪai" + ], + [ + "Ġ", + "ËĪeËIJk" + ], + [ + "n", + "k" + ], + [ + "t", + "i" + ], + [ + "ËĪa", + "Éľn" + ], + [ + "Ġk", + "ËIJ" + ], + [ + "É¡", + "ÉĻn" + ], + [ + "ËĪi", + "a" + ], + [ + "ĠÉĶ", + "ËIJɾ" + ], + [ + "Ê", + "ı" + ], + [ + "ĠËĮ", + "ÊĮ" + ], + [ + "Ġz", + "ËĪaËIJ" + ], + [ + "Ġl", + "os" + ], + [ + "ÉĽ", + "s" + ], + [ + "ËĪÉĶ", + "n" + ], + [ + "ÉĽ", + "nt" + ], + [ + "ÉĽ", + "n" + ], + [ + "ĠÉŁ", + "ËĪoËIJ" + ], + [ + "ç", + "t" + ], + [ + "Ġd", + "as" + ], + [ + "Ġx", + "ËĮo" + ], + [ + "ËĪu", + "Éľ" + ], + [ + "ËĪa", + "s" + ], + [ + "Ġb", + "ËĪÊĮ" + ], + [ + "ËĪiÉĽ", + "Éľn" + ], + [ + "É", + "IJ" + ], + [ + "Ġts", + "uËIJ" + ], + [ + "Ġp", + "ËĮÉĽ" + ], + [ + "Ġn", + "ËĪÉĶ" + ], + [ + "ÊĬ", + "t" + ], + [ + "m", + "a" + ], + [ + "Ġn", + "ËĪo" + ], + [ + "Ġl", + "ËĪɪ" + ], + [ + "ËĪÉĽ", + "s" + ], + [ + "ɪ", + "l" + ], + [ + "ĠÉķ", + "ËĪiÉĽ" + ], + [ + "Ġ", + "ËĪÊĬ" + ], + [ + "ÉĴ", + "t" + ], + [ + "t", + "o" + ], + [ + "Ġ", + "ËĪo" + ], + [ + "ËĮo", + "n" + ], + [ + "Ġk", + "wËĪa" + ], + [ + "Ġɪ", + "t" + ], + [ + "Ġh", + "oËIJ" + ], + [ + "ËĪiËIJ", + "k" + ], + [ + "ĠËĮaËIJp", + "k" + ], + [ + "ËĪaɪ", + "n" + ], + [ + "Ã", + "¦" + ], + [ + "ÉĻn", + "t" + ], + [ + "t", + "a" + ], + [ + "l", + "o" + ], + [ + "Ġn", + "ËĪÉij" + ], + [ + "Ġl", + "ËĪa" + ], + [ + "ËĪi", + "Éľ" + ], + 
[ + "Ġw", + "ËĪei" + ], + [ + "ÉĽ", + "Êģ" + ], + [ + "Ġt", + "ËĪa" + ], + [ + "Ġɾ", + "ËĮÉĻh" + ], + [ + "ĠÉķËĪi", + "Éij" + ], + [ + "ËĮi", + "ËIJ" + ], + [ + "ËĮÉĽ", + "l" + ], + [ + "ĠtÉĻ", + "Éľ" + ], + [ + "Ġk", + "ËĪuo" + ], + [ + "Ġt", + "ËĪu" + ], + [ + "j", + "ËĪÉĽ" + ], + [ + "ĠËĮi", + "n" + ], + [ + "ɾ", + "e" + ], + [ + "Ġk", + "oËIJ" + ], + [ + "Ġk", + "ËĪa" + ], + [ + "ɾ", + "i" + ], + [ + "ĠtÉķËĪi", + "Éij" + ], + [ + "l", + "ÉĻ" + ], + [ + "Ġk", + "ÉĻ" + ], + [ + "Ġt", + "ËĪi" + ], + [ + "ĠÅĭ", + "ËĪyÉĻ" + ], + [ + "Ġts", + "h" + ], + [ + "e", + "r" + ], + [ + "a", + "v" + ], + [ + "ĠkÉĶ", + "n" + ], + [ + "ËĪÉĻ", + "ÉľÅĭ" + ], + [ + "ð", + "o" + ], + [ + "ËĪaËIJ", + "n" + ], + [ + "Ġbʰ", + "ËĪi" + ], + [ + "ĠkËIJ", + "jaËIJ" + ], + [ + "ÉĻ", + "z" + ], + [ + "Ġp", + "Êģ" + ], + [ + "Ġd", + "ËĪɪ" + ], + [ + "Ġz", + "iËIJ" + ], + [ + "É¡", + "eËIJ" + ], + [ + "Ġt", + "ËĪÉĻ" + ], + [ + "ɪ", + "z" + ], + [ + "Ġn", + "ËĮon" + ], + [ + "t", + "aËIJ" + ], + [ + "b", + "l" + ], + [ + "t", + "e" + ], + [ + "n", + "ËĮeËIJ" + ], + [ + "ËĪɪ", + "l" + ], + [ + "s", + "o" + ], + [ + "k", + "o" + ], + [ + "u", + "Êģ" + ], + [ + "ĠÉ", + "£" + ], + [ + "Ġpa", + "Êģ" + ], + [ + "Ġ", + "ËĪÉĽ" + ], + [ + "j", + "ËĪuËIJ" + ], + [ + "ËĮ", + "ÊĮ" + ], + [ + "y", + "n" + ], + [ + "ËĪiËIJ", + "n" + ], + [ + "Ġl", + "ËĪaɪ" + ], + [ + "ËĪɪ", + "Åĭ" + ], + [ + "ĠtÉķh", + "ËĪy" + ], + [ + "Ġn", + "ËĪÊĮhi" + ], + [ + "Ġd", + "ËĮe" + ], + [ + "Ġj", + "ËĪÉiju" + ], + [ + "Ġt", + "ËĪÉiju" + ], + [ + "Ġh", + "ËĪo" + ], + [ + "ɪ", + "d" + ], + [ + "Ġth", + "ËĪÉij" + ], + [ + "m", + "ËĪe" + ], + [ + "Ġ", + "ËĪÉĻ" + ], + [ + "j", + "a" + ], + [ + "Ġp", + "h" + ], + [ + "ÉĽ", + "t" + ], + [ + "Ġk", + "ËĪÊĮ" + ], + [ + "t", + "ÉĻn" + ], + [ + "m", + "ËĪÉij" + ], + [ + "w", + "ËĪe" + ], + [ + "ĠËĮa", + "ɪn" + ], + [ + "Ġð", + "ɪs" + ], + [ + "É¡", + "ÉĻ" + ], + [ + "Ġn", + "ËĪaËIJ" + ], + [ + "Ġb", + "ËĪaËIJ" + ], + [ + "Ġa", + "θ" + ], + [ + "Ġm", + "ËĮa" + ], + [ + "ËĪÊĮh", + "a" + ], 
+ [ + "Ġd", + "ËĮa" + ], + [ + "ËĪ", + "Êı" + ], + [ + "Ġɲ", + "ËĮy" + ], + [ + "Ġp", + "ËĪa" + ], + [ + "ËĪað", + "o" + ], + [ + "d", + "i" + ], + [ + "b", + "Éľ" + ], + [ + "É", + "³" + ], + [ + "Ġw", + "iËIJ" + ], + [ + "Ġn", + "ËĪɪ" + ], + [ + "ĠÉ¡", + "ËĪÉĶÉľ" + ], + [ + "tËIJ", + "o" + ], + [ + "ËĮÉĻ", + "m" + ], + [ + "ËĪaËIJ", + "r" + ], + [ + "Ġm", + "ÉĽ" + ], + [ + "ËĪeËIJ", + "É¡aËIJ" + ], + [ + "Ġs", + "ËĮi" + ], + [ + "Ġl", + "ËĮaËIJ" + ], + [ + "n", + "ËĮaËIJ" + ], + [ + "Ġs", + "p" + ], + [ + "t", + "Êģ" + ], + [ + "ĠÊ", + "İ" + ], + [ + "ËĮ", + "ÉijËIJ" + ], + [ + "Ġk", + "l" + ], + [ + "k", + "ʰ" + ], + [ + "i", + "l" + ], + [ + "ĠÊĥ", + "t" + ], + [ + "ĠËĮÊĬ", + "n" + ], + [ + "a", + "l" + ], + [ + "Ġs", + "ËĪÉĽ" + ], + [ + "Ġm", + "ËĪaËIJ" + ], + [ + "Ġ", + "Åĵ" + ], + [ + "ĠÉ¡", + "ËĪÊĮ" + ], + [ + "ĠpËĮÉĽ", + "r" + ], + [ + "ɾ", + "ËĪa" + ], + [ + "ËIJ", + "ÊĪ" + ], + [ + "ËĪaβ", + "a" + ], + [ + "Ġw", + "ËĪÉĴ" + ], + [ + "Ġx", + "ËĪuei" + ], + [ + "Ġkh", + "ËĪo" + ], + [ + "Ġla", + "s" + ], + [ + "ĠÉĹ", + "ËĪo" + ], + [ + "Ġf", + "ÉĽÉ¾" + ], + [ + "Ġj", + "ËĪiÉĽ" + ], + [ + "Ġt", + "ËĪe" + ], + [ + "Ġk", + "ËĮÉĶ" + ], + [ + "ĠdeËIJ", + "n" + ], + [ + "Ġm", + "o" + ], + [ + "Ġp", + "ËĪi" + ], + [ + "Ġt", + "ËĪÉij" + ], + [ + "ËĪÉĽ", + "st" + ], + [ + "w", + "ËĪÉij" + ], + [ + "ËĪaɪ", + "t" + ], + [ + "ÉĻ", + "ÊĬ" + ], + [ + "Ġ", + "ËĪi" + ], + [ + "ɪ", + "j" + ], + [ + "a", + "ɪ" + ], + [ + "ËĪaËIJ", + "Éľ" + ], + [ + "ĠËĪɪ", + "s" + ], + [ + "Ġp", + "ÉĶɾ" + ], + [ + "æ", + "Éľn" + ], + [ + "k", + "a" + ], + [ + "Åĭ", + "É¡" + ], + [ + "b", + "ÉĻn" + ], + [ + "ÊĬ", + "f" + ], + [ + "Ġp", + "ɹ" + ], + [ + "Ġl", + "ËĮe" + ], + [ + "ËĪiËIJ", + "d" + ], + [ + "ËĪaËIJ", + "re" + ], + [ + "Ġm", + "ËĪÊĮ" + ], + [ + "ÉĻ", + "r" + ], + [ + "Ġd", + "Éij" + ], + [ + "ËĪaËIJt", + "o" + ], + [ + "Ġp", + "ËĪeËIJ" + ], + [ + "Ġd", + "ËĪoËIJ" + ], + [ + "Ġs", + "ËĮÊĬ" + ], + [ + "Ġh", + "ËĪi" + ], + [ + "Ġs", + "ËĪa" + ], + [ + "ËĪeËIJ", + "n" + ], + [ + "d", 
+ "ÉĻ" + ], + [ + "Ġp", + "j" + ], + [ + "ËĪÅĵ", + "Êģ" + ], + [ + "l", + "ɪç" + ], + [ + "ÉĴ", + "n" + ], + [ + "ĠËĪÉĻ", + "r" + ], + [ + "t", + "ËĪe" + ], + [ + "Ġi", + "l" + ], + [ + "ËĪaËIJ", + "l" + ], + [ + "Ġs", + "ËĮÉĻÊĬ" + ], + [ + "s", + "ÊĪ" + ], + [ + "Ġd", + "ËĪuËIJ" + ], + [ + "h", + "ËĪÉij" + ], + [ + "Ġx", + "ËĪou" + ], + [ + "Ġl", + "ËĪaiÉľ" + ], + [ + "w", + "ËĪo" + ], + [ + "ËĪÉĽnt", + "e" + ], + [ + "Ġs", + "y" + ], + [ + "Ġz", + "ɪç" + ], + [ + "ĠÉ¡", + "ËĪu" + ], + [ + "ĠÉķ", + "ËĪy" + ], + [ + "ËĪÉĶËIJ", + "l" + ], + [ + "ÉĶ", + "l" + ], + [ + "Ġt", + "ËĪo" + ], + [ + "ĠÊĭ", + "oËIJ" + ], + [ + "Ġ", + "iËIJ" + ], + [ + "wËĪa", + "ða" + ], + [ + "ËĪa", + "ndo" + ], + [ + "Ġaθ", + "ÉĽnt" + ], + [ + "Ġaθɼnt", + "wËĪaða" + ], + [ + "Ġt", + "ËĪiÉĽ" + ], + [ + "ËĪei", + "Éľ" + ], + [ + "Ġp", + "ËĮa" + ], + [ + "Ġn", + "ËĪaɪ" + ], + [ + "w", + "a" + ], + [ + "Ġf", + "r" + ], + [ + "ĠÊIJ", + "ËĪÉĻÉľn" + ], + [ + "ËĪu", + "a" + ], + [ + "m", + "i" + ], + [ + "Ġm", + "ËĪÉĽ" + ], + [ + "ËĪeËIJk", + "ʰ" + ], + [ + "c", + "ʰ" + ], + [ + "Ġw", + "ËĪÉij" + ], + [ + "st", + "a" + ], + [ + "Ġt", + "u" + ], + [ + "Ġs", + "k" + ], + [ + "ËĪÉĶ", + "l" + ], + [ + "ËĪeËIJ", + "ÊĪ" + ], + [ + "Ġl", + "ËĪaËIJɪ" + ], + [ + "Ġl", + "ËĪaËIJ" + ], + [ + "ËĪÉĽËIJ", + "s" + ], + [ + "ËĪÉĽÉ¾", + "a" + ], + [ + "ËĪÉĻ", + "Éľt" + ], + [ + "Ġ", + "yn" + ], + [ + "d", + "ÉĻn" + ], + [ + "Ġd", + "i" + ], + [ + "ËĪiËIJ", + "s" + ], + [ + "Ġðe", + "l" + ], + [ + "ËĪÊĮ", + "r" + ], + [ + "Ġh", + "ËĪaËIJ" + ], + [ + "Ġb", + "ÉĻ" + ], + [ + "Ġj", + "ËĪuËIJ" + ], + [ + "ll", + "e" + ], + [ + "st", + "o" + ], + [ + "ËĪɪ", + "t" + ], + [ + "ËĪoËIJ", + "ɾ" + ], + [ + "b", + "ʰ" + ], + [ + "m", + "ÉĻn" + ], + [ + "ËĮu", + "ÉĻ" + ], + [ + "ËĮÉĻ", + "ɾ" + ], + [ + "ËĪÊĮ", + "n" + ], + [ + "ĠlËĪaɪ", + "k" + ], + [ + "Ġb", + "ËĪa" + ], + [ + "ɪ", + "ð" + ], + [ + "Ġl", + "o" + ], + [ + "z", + "i" + ], + [ + "ËĪÊĮ", + "st" + ], + [ + "m", + "ËĪi" + ], + [ + "ÉĶ", + "Êģ" + ], + [ + "ĠnËĪɪ", + 
"çt" + ], + [ + "Ġt", + "ɾ" + ], + [ + "Ġd", + "ËĪeËIJkʰ" + ], + [ + "Ġs", + "ËĮe" + ], + [ + "Ġn", + "ËĪÉĻÊĬ" + ], + [ + "Ġ", + "u" + ], + [ + "Ġs", + "i" + ], + [ + "Ġɪ", + "ç" + ], + [ + "Ġp", + "r" + ], + [ + "ĠtÉķ", + "ËĪy" + ], + [ + "Ġm", + "ËĪu" + ], + [ + "z", + "a" + ], + [ + "Ġt", + "Êģ" + ], + [ + "Ġw", + "ɪð" + ], + [ + "t", + "ËĪÉĽ" + ], + [ + "Ġp", + "ËĪÊĮɾ" + ], + [ + "Ġk", + "ËĪÉĶ" + ], + [ + "ËĪoËIJ", + "r" + ], + [ + "Ġh", + "ËĮa" + ], + [ + "Ġk", + "ËĪonÉ¡" + ], + [ + "Ġp", + "uÊģ" + ], + [ + "Ġd", + "y" + ], + [ + "ËĪɪ", + "n" + ], + [ + "nt", + "e" + ], + [ + "Ġk", + "ËĮa" + ], + [ + "ËĪÉĻ", + "ɪ" + ], + [ + "Ġm", + "i" + ], + [ + "ĠÉ¡", + "ËĮuÉĻ" + ], + [ + "ĠÊ", + "²" + ], + [ + "Ġf", + "ËĪÉij" + ], + [ + "Ġv", + "ÉijËIJ" + ], + [ + "ĠËĮa", + "ÊĬ" + ], + [ + "ËĮ", + "uËIJ" + ], + [ + "ĠËĪu", + "n" + ], + [ + "Ġj", + "ËĪÊĮha" + ], + [ + "j", + "uËIJ" + ], + [ + "Ġm", + "ɪt" + ], + [ + "Ġl", + "ËĪÉĽ" + ], + [ + "ËĪeËIJ", + "Êĥ" + ], + [ + "Ġf", + "ÉĶËIJ" + ], + [ + "m", + "ÉĻ" + ], + [ + "ɾ", + "t" + ], + [ + "ĠkËĮo", + "n" + ], + [ + "Ġl", + "ËĪÉĶ" + ], + [ + "Ġx", + "ËĪÉiju" + ], + [ + "p", + "l" + ], + [ + "Ġd", + "ËĪi" + ], + [ + "Ġl", + "ËĪoËIJ" + ], + [ + "s", + "ÉĻ" + ], + [ + "ËĪaËIJ", + "va" + ], + [ + "Ġl", + "ËĪu" + ], + [ + "ĠÉ¡", + "ËĮÉĻÊĬ" + ], + [ + "Ġh", + "av" + ], + [ + "ĠËĮaËIJpk", + "ËĮoËIJ" + ], + [ + "ɾ", + "ËĪi" + ], + [ + "Ġf", + "ËĪÉĻ" + ], + [ + "Ġh", + "ËĮÉĻm" + ], + [ + "ËĪonÉ¡", + "Éľ" + ], + [ + "j", + "o" + ], + [ + "Ġs", + "ÉĶ" + ], + [ + "ËĪaËIJ", + "d" + ], + [ + "w", + "ËĪiÉĻ" + ], + [ + "ËĪa", + "nd" + ], + [ + "ËĮa", + "ɪn" + ], + [ + "t", + "ɾ" + ], + [ + "ĠËĮ", + "ɪ" + ], + [ + "ĠËĪu", + "na" + ], + [ + "Ġx", + "wËĪÉij" + ], + [ + "Ġj", + "ÉĶËIJ" + ], + [ + "Êģ", + "ËĪi" + ], + [ + "ĠkËĪuo", + "Éľ" + ], + [ + "Ġa", + "β" + ], + [ + "ĠÉ¡", + "ËĪaËIJ" + ], + [ + "an", + "o" + ], + [ + "t", + "ÉĻl" + ], + [ + "Ġr", + "ËĮe" + ], + [ + "ËĮÊĮ", + "t" + ], + [ + "ĠjËĪi", + "Éij" + ], + [ + "ĠɾËĮÉĻh", + "aËIJ" + 
], + [ + "Ġm", + "ËĪe" + ], + [ + "ĠËĪy", + "Ã¦Éľn" + ], + [ + "Ġf", + "ËĪu" + ], + [ + "Ġb", + "l" + ], + [ + "n", + "ËĪi" + ], + [ + "s", + "ÉĻn" + ], + [ + "Ġa", + "ɪn" + ], + [ + "ËĪi", + "ÊĬ" + ], + [ + "Ġðe", + "ɪ" + ], + [ + "Ġɪ", + "ts" + ], + [ + "Ġ", + "(" + ], + [ + "ËĪy", + "ËIJ" + ], + [ + "ÉĻ", + "d" + ], + [ + "ĠËĮ", + "o" + ], + [ + "ĠÉĽ", + "s" + ], + [ + "Ġv", + "iËIJ" + ], + [ + "ËIJ", + "É¡eËIJ" + ], + [ + "k", + "ËĪe" + ], + [ + "ĠËĪa", + "l" + ], + [ + "ÉĽ", + "l" + ], + [ + "Ġ", + "ÊĮ" + ], + [ + "ËIJ", + "o" + ], + [ + "Ġk", + "ËĪo" + ], + [ + "ĠÊĪ", + "ËĪuËIJ" + ], + [ + "Ġs", + "ËĪɪ" + ], + [ + "ËĪeËIJ", + "ɾ" + ], + [ + "Éľ", + "m" + ], + [ + "ËĮ", + "ÉĻn" + ], + [ + "ËĪaËIJ", + "i" + ], + [ + "ËĪoËIJ", + "l" + ], + [ + "ɪ", + "ËĮeËIJ" + ], + [ + "Ġʲ", + "ËĪy" + ], + [ + "Ġk", + "ËĪÉĶËIJ" + ], + [ + "s", + "ËĪi" + ], + [ + "Ġl", + "ËĪe" + ], + [ + "ËĮ", + "ÉĴt" + ], + [ + "ËĪiËIJ", + "p" + ], + [ + "a", + "Êģ" + ], + [ + "Ġθ", + "ËĪɪÅĭ" + ], + [ + "ËĪÉĻËIJ", + "ɪ" + ], + [ + "ËĪÊĮ", + "l" + ], + [ + "ĠhËĪoËIJ", + "taËIJ" + ], + [ + "ËĪo", + "ɪ" + ], + [ + "nt", + "o" + ], + [ + "z", + "h" + ], + [ + "ĠdeËIJ", + "m" + ], + [ + "ĠkÉĶ", + "m" + ], + [ + "ʰ", + "ËĪiËIJk" + ], + [ + "ĠdÊĴ", + "ËĪÊĮst" + ], + [ + "p", + "ɾ" + ], + [ + "Ġl", + "y" + ], + [ + "h", + "ËĪu" + ], + [ + "ËĪÉĶ", + "ø" + ], + [ + "ËĪaËIJ", + "s" + ], + [ + "ĠËĪa", + "n" + ], + [ + "Ġ", + "ËĪÉĴ" + ], + [ + "Ġk", + "an" + ], + [ + "Ġts", + "ËĪuo" + ], + [ + "ËĪeËIJ", + "va" + ], + [ + "ĠÉ¡", + "ɾ" + ], + [ + "Ġp", + "o" + ], + [ + "ĠtÊĥ", + "ËĪÉĶ" + ], + [ + "Êİ", + "a" + ], + [ + "Ġm", + "ËĮi" + ], + [ + "Êĥ", + "t" + ], + [ + "t", + "ËĪi" + ], + [ + "Ġh", + "ËĪÊĮ" + ], + [ + "tÊĥ", + "e" + ], + [ + "Ġf", + "ÉĶn" + ], + [ + "v", + "e" + ], + [ + "Ġn", + "ËĮe" + ], + [ + "ËĪÉĶ", + "Êģ" + ], + [ + "i", + "z" + ], + [ + "Ġs", + "ËĪuo" + ], + [ + "ËĪÉĽËIJ", + "r" + ], + [ + "wËĪa", + "Êģ" + ], + [ + "ËĪað", + "a" + ], + [ + "Åĭ", + "k" + ], + [ + "p", + "o" + ], + [ + "Ġk", 
+ "ËĪi" + ], + [ + "ËĪa", + "d" + ], + [ + "Ġv", + "ËĪi" + ], + [ + "t", + "Éķ" + ], + [ + "Ġk", + "ËĪÉĻ" + ], + [ + "Ġw", + "ËĪu" + ], + [ + "ÉĴ", + "z" + ], + [ + "ĠvÉijËIJ", + "ɾ" + ], + [ + "Êģ", + "ËĪÉĽ" + ], + [ + "Ġk", + "ËĪaËIJ" + ], + [ + "k", + "e" + ], + [ + "n", + "ÉĻ" + ], + [ + "ËĪÊĮ", + "b" + ], + [ + "ËĪuËIJ", + "ɾ" + ], + [ + "ËĮÉĻ", + "ËIJ" + ], + [ + "ĠÊĪ", + "ʰËĪiËIJk" + ], + [ + "Ġk", + "ËĪu" + ], + [ + "Ġb", + "ËĮÊĮt" + ], + [ + "Ġa", + "t" + ], + [ + "Ġf", + "ɹ" + ], + [ + "ËĪa", + "x" + ], + [ + "Ġz", + "oËIJ" + ], + [ + "Ġt", + "ËĪaËIJ" + ], + [ + "Ġð", + "ËĮe" + ], + [ + "n", + "eËIJ" + ], + [ + "ĠÉij", + "ËIJ" + ], + [ + "Ġa", + "ÊĬf" + ], + [ + "a", + "m" + ], + [ + "ÊĬ", + "Åĭ" + ], + [ + "ĠÉĶ", + "ËIJ" + ], + [ + "ĠÉķËĪi", + "ÉľÅĭ" + ], + [ + "Ġ", + "ËĪÉĶËIJl" + ], + [ + "ɪ", + "m" + ], + [ + "j", + "ËĪo" + ], + [ + "ËĪiËIJ", + "ÉŁ" + ], + [ + "Ġkw", + "ËĮÉĽ" + ], + [ + "ĠmËĪa", + "s" + ], + [ + "ÉĻ", + "h" + ], + [ + "ĠËĪa", + "ÊĬ" + ], + [ + "ËĪÉĶ", + "ɪ" + ], + [ + "É¡", + "ÉĻɾ" + ], + [ + "r", + "ÉĻn" + ], + [ + "ËĪɪ", + "k" + ], + [ + "s", + "se" + ], + [ + "Ġp", + "ËĪÉij" + ], + [ + "ĠÉĹ", + "ËĮe" + ], + [ + "ĠÉĹ", + "ËĪi" + ], + [ + "Ġa", + "z" + ], + [ + "ĠÉ¡ËĪÊĮ", + "jaËIJ" + ], + [ + "z", + "e" + ], + [ + "ĠÉĹ", + "ËĮaËIJ" + ], + [ + "Ġf", + "ËĪi" + ], + [ + "ĠËĮ", + "ÉĴn" + ], + [ + "Ġx", + "ËĪo" + ], + [ + "ĠËĮÊĬ", + "na" + ], + [ + "Ġtʰ", + "aËIJ" + ], + [ + "Ġs", + "Éij" + ], + [ + "ËĪeɪ", + "ÊĥÉĻn" + ], + [ + "ĠtÉķËĪi", + "Éľ" + ], + [ + "ĠÉŁ", + "aËIJ" + ], + [ + "p", + "ËIJ" + ], + [ + "Ġpl", + "y" + ], + [ + "θ", + "ËĪi" + ], + [ + "ËIJ", + "Éĸ" + ], + [ + "Ġt", + "ËĪuei" + ], + [ + "Ġl", + "ËĪÉĻ" + ], + [ + "Ġd", + "ÉijËIJ" + ], + [ + "f", + "t" + ], + [ + "ËĪa", + "m" + ], + [ + "ĠsËĪÊĮ", + "kt" + ], + [ + "Ġt", + "ËĪou" + ], + [ + "Ġp", + "ËĪiÉĽ" + ], + [ + "ĠËĪa", + "i" + ], + [ + "ĠwËĪÉĴ", + "n" + ], + [ + "Ġz", + "ËĮaɪn" + ], + [ + "Ġe", + "st" + ], + [ + "Ġm", + "ÉĶ" + ], + [ + "ĠtÉķ", + "jËĪÉiju" + ], + [ + 
"Éľ", + "p" + ], + [ + "ËĪÊĮ", + "z" + ], + [ + "b", + "i" + ], + [ + "ËĪÉĽËIJs", + "eËIJ" + ], + [ + "Ġl", + "ËĪy" + ], + [ + "Ġm", + "ËĮe" + ], + [ + "Ġd", + "ËĮÉĽl" + ], + [ + "ËĪiËIJ", + "l" + ], + [ + "ĠkËĮo", + "mo" + ], + [ + "Ġh", + "ËĪaÉľn" + ], + [ + "ËĪoËIJ", + "ne" + ], + [ + "ĠkËĪÊĮɾ", + "t" + ], + [ + "Ġsy", + "Êģ" + ], + [ + "ËĮÉĶ", + "ɾ" + ], + [ + "Ġɪ", + "f" + ], + [ + "u", + "v" + ], + [ + "z", + "ÉĻn" + ], + [ + "o", + "l" + ], + [ + "Ï", + "ĩ" + ], + [ + "i", + "m" + ], + [ + "Ġm", + "ËĪiÉĽ" + ], + [ + "Ġð", + "ɪ" + ], + [ + "Ġv", + "ËĪÉĽ" + ], + [ + "ÊĬ", + "d" + ], + [ + "Ġt", + "r" + ], + [ + "ËĪeËIJ", + "s" + ], + [ + "ð", + "e" + ], + [ + "d", + "e" + ], + [ + "ʰ", + "Ïĩ" + ], + [ + "ÉŁ", + "ʰ" + ], + [ + "ËĮÉĻËIJ", + "ÉªÉľ" + ], + [ + "b", + "ËIJ" + ], + [ + "ËĪÊĬ", + "k" + ], + [ + "ĠnËĪÉĶ", + "ÉªÉľ" + ], + [ + "ĠËĮ", + "iËIJ" + ], + [ + "ËĪÉijËIJ", + "t" + ], + [ + "ËĪiËIJ", + "ɾ" + ], + [ + "Ġt", + "ɹ" + ], + [ + "ɾ", + "ÉĶ" + ], + [ + "Ġw", + "ÉĴz" + ], + [ + "Ġv", + "u" + ], + [ + "b", + "ÉĻl" + ], + [ + "b", + "ÉĻ" + ], + [ + "ɹ", + "i" + ], + [ + "nt", + "s" + ], + [ + "Ġs", + "ËĪaËIJ" + ], + [ + "d", + "ʰ" + ], + [ + "Ġt", + "ÊĬ" + ], + [ + "ĠÊİ", + "ËĮi" + ], + [ + "β", + "a" + ], + [ + "h", + "ËĪÉĻÉľÅĭ" + ], + [ + "Ġs", + "ËĪiËIJ" + ], + [ + "ĠpËĮa", + "ɾa" + ], + [ + "ËĪÉĽÉ¾", + "ÉĶ" + ], + [ + "ËĪɪ", + "s" + ], + [ + "É£", + "o" + ], + [ + "ĠËĮa", + "l" + ], + [ + "o", + "r" + ], + [ + "Ġb", + "ËĪÊĮh" + ], + [ + "Ġk", + "ËĪoËIJ" + ], + [ + "Ġt", + "ËĪÉĽ" + ], + [ + "Ġp", + "ËĪo" + ], + [ + "ĠÊĴ", + "ÉĻ" + ], + [ + "p", + "Êģ" + ], + [ + "Ġ", + "ËĪaɪ" + ], + [ + "hËĪÉij", + "ÉľÅĭ" + ], + [ + "ÉĻl", + "i" + ], + [ + "ËĪeɪ", + "t" + ], + [ + "ĠjËĪiou", + "Éľ" + ], + [ + "Ġd", + "ËĪÉĻ" + ], + [ + "Ġm", + "ËĪÉĶËIJ" + ], + [ + "l", + "ËĪi" + ], + [ + "ËĮy", + "ÉĻ" + ], + [ + "ĠlËĪoËIJ", + "É¡" + ], + [ + "Ġn", + "ËĪÊĮ" + ], + [ + "Ġh", + "ËĪÊĬ" + ], + [ + "Ġn", + "ËĪÉĻÉľÅĭ" + ], + [ + "ĠÊģ", + "ÉĻ" + ], + [ + "z", + "ËĪi" + ], + [ + 
"Ġt", + "ËĪuËIJ" + ], + [ + "ĠkËĮo", + "me" + ], + [ + "Ġl", + "ËĪeËIJ" + ], + [ + "ËĪaËIJt", + "aËIJ" + ], + [ + "Ġa", + "n" + ], + [ + "ĠËĪy", + "u" + ], + [ + "ĠËĮÊĮ", + "É¡ÉĻɾ" + ], + [ + "ĠËĪɪ", + "n" + ], + [ + "ĠhËĪo", + "ÉĻ" + ], + [ + "v", + "ÉĻ" + ], + [ + "ËĪø", + "ËIJ" + ], + [ + "θj", + "a" + ], + [ + "ËĪuÉĻ", + "Éľn" + ], + [ + "Ġk", + "ÉĻɾ" + ], + [ + "ËĪa", + "t" + ], + [ + "j", + "ËĪø" + ], + [ + "ËĪÉĽt", + "Êģ" + ], + [ + "Ġp", + "ËĪÉiju" + ], + [ + "st", + "ÉĻ" + ], + [ + "Ġw", + "ÉĴt" + ], + [ + "ËĪeËIJ", + "l" + ], + [ + "ÊĪ", + "i" + ], + [ + "Ġx", + "ËĪaiÉľ" + ], + [ + "ËĪy", + "Êģ" + ], + [ + "ĠhËĪoËIJ", + "É¡aËIJ" + ], + [ + "Ġts", + "ËĪi" + ], + [ + "ĠËĪÊĮ", + "p" + ], + [ + "Ġn", + "ËĮÉĴt" + ], + [ + "ĠlËĪɪ", + "eËIJ" + ], + [ + "Ġh", + "ËĪa" + ], + [ + "Ġf", + "l" + ], + [ + "Ġn", + "ËĪeËIJ" + ], + [ + "ËĮaËIJ", + "ɪ" + ], + [ + "Ġt", + "ËĪuo" + ], + [ + "tÊĥ", + "ËIJ" + ], + [ + "s", + "ËĪe" + ], + [ + "bʰ", + "i" + ], + [ + "ĠbËĪÊĮh", + "ÊĬt" + ], + [ + "ËĪÉĽ", + "nd" + ], + [ + "Ġs", + "ËĪÉĶ" + ], + [ + "ÉĻn", + "s" + ], + [ + "ËĮÉĻ", + "l" + ], + [ + "ÉĽ", + "Éľ" + ], + [ + "ĠÉ¡", + "l" + ], + [ + "ËĪɪ", + "ɾ" + ], + [ + "ËĪaËIJt", + "a" + ], + [ + "Éľ", + "ËIJ" + ], + [ + "ËĪÉĽnt", + "o" + ], + [ + "sk", + "ËĮoËIJ" + ], + [ + "ËĪÉĽ", + "k" + ], + [ + "ts", + "i" + ], + [ + "Ġt", + "ËĪonÉ¡" + ], + [ + "Ġb", + "iËIJ" + ], + [ + "Ġh", + "ËĪaËIJɪ" + ], + [ + "Ġb", + "ËĪi" + ], + [ + "j", + "j" + ], + [ + "Êİ", + "i" + ], + [ + "Ġk", + "ʰ" + ], + [ + "Ġs", + "ËĪo" + ], + [ + "ll", + "o" + ], + [ + "Ġb", + "aɪ" + ], + [ + "ĠÉĽ", + "nt" + ], + [ + "Ġ", + "ËĪiËIJ" + ], + [ + "ĠÉ¡", + "ËĪo" + ], + [ + "ɾ", + "eËIJ" + ], + [ + "Ġk", + "Êĭ" + ], + [ + "Ġm", + "ËĪeiÉľ" + ], + [ + "ÊĬ", + "ËĪÉĶËIJ" + ], + [ + "Ġt", + "ËĪaɪ" + ], + [ + "Ġsu", + "s" + ], + [ + "Ġr", + "i" + ], + [ + "Ġv", + "ËĮÉĽ" + ], + [ + "ËĪiËIJ", + "no" + ], + [ + "v", + "ano" + ], + [ + "ĠdËĮi", + "ËIJ" + ], + [ + "ĠÊIJ", + "ËĪaÉľn" + ], + [ + "Ê", + "Ĥ" + ], + [ + "ĠÉIJ", + 
"b" + ], + [ + "ËĪaËIJ", + "h" + ], + [ + "ɪ", + "Êĥ" + ], + [ + "ĠdËĮe", + "lla" + ], + [ + "tËIJ", + "i" + ], + [ + "ĠËĪÊĬ", + "n" + ], + [ + "Ġh", + "iËIJ" + ], + [ + "Ġb", + "ËĪaËIJt" + ], + [ + "Ġth", + "ËĪi" + ], + [ + "Ġa", + "m" + ], + [ + "Ġ", + "ËĪoËIJ" + ], + [ + "Ġh", + "u" + ], + [ + "Ġk", + "ËĪÊĮh" + ], + [ + "Ġz", + "ËĪÉijËIJ" + ], + [ + "ĠÉ¡", + "ËĮÉĶ" + ], + [ + "Ġ", + "ËĪÉĻÊĬ" + ], + [ + "y", + "ËĪi" + ], + [ + "Ġl", + "ËĪÊĮ" + ], + [ + "Ġd", + "ËĪeËIJ" + ], + [ + "Ġs", + "ËĪÉĶËIJ" + ], + [ + "sk", + "ËĮeËIJ" + ], + [ + "ɾ", + "o" + ], + [ + "Êģ", + "ËĪÉij" + ], + [ + "t", + "ËĪa" + ], + [ + "Ġk", + "ËĪÊĬ" + ], + [ + "ËĪant", + "e" + ], + [ + "Ġd", + "ÉĶ" + ], + [ + "Ġs", + "ËĪeɪ" + ], + [ + "Ġs", + "ÉĽt" + ], + [ + "ɹ", + "ɪ" + ], + [ + "ĠÉ¡ËĮÉĻÊĬ", + "ɪÅĭ" + ], + [ + "z", + "o" + ], + [ + "Ġj", + "ËĪaËIJ" + ], + [ + "ĠÉĴv", + "ðÉĻ" + ], + [ + "ĠÊ", + "Ŀ" + ], + [ + "ĠÉĽ", + "l" + ], + [ + "Ġs", + "ËĪoËIJ" + ], + [ + "Ġth", + "ËĪiÉľ" + ], + [ + "Ġ", + "ËĪÉĽl" + ], + [ + "Ġly", + "ËĮi" + ], + [ + "nd", + "ÊĴ" + ], + [ + "ĠÉķ", + "jËĪÉiju" + ], + [ + "θ", + "a" + ], + [ + "ĠɾËĮÉĻh", + "eËIJ" + ], + [ + "Ġma", + "ɪ" + ], + [ + "j", + "ÉĻ" + ], + [ + "ĠËĪÊĮ", + "b" + ], + [ + "as", + "jËĪÉĶ" + ], + [ + "d", + "Êģ" + ], + [ + "Ġkh", + "ËĪa" + ], + [ + "ĠËĪe", + "s" + ], + [ + "v", + "i" + ], + [ + "f", + "i" + ], + [ + "ËĮÉĻ", + "b" + ], + [ + "Ġr", + "e" + ], + [ + "Ġav", + "ËĮÉĽ" + ], + [ + "Ġt", + "ËĮi" + ], + [ + "Ġk", + "ɾ" + ], + [ + "Ġb", + "ɪk" + ], + [ + "st", + "e" + ], + [ + "ËĪeËIJÊĥ", + "c" + ], + [ + "p", + "t" + ], + [ + "z", + "ÉĻ" + ], + [ + "Ġw", + "ËĪaËIJ" + ], + [ + "k", + "l" + ], + [ + "ĠsËĪÊĮ", + "m" + ], + [ + "ɪ", + "ÊĪ" + ], + [ + "d", + "z" + ], + [ + "v", + "o" + ], + [ + "ËĮa", + "ÊĬt" + ], + [ + "nd", + "e" + ], + [ + "Ġd", + "ÉĽs" + ], + [ + "ĠÉŁ", + "ËĪaËIJ" + ], + [ + "Ġr", + "ËĮi" + ], + [ + "s", + "ËĮeËIJ" + ], + [ + "É¡", + "i" + ], + [ + "Ġal", + "s" + ], + [ + "ËĪi", + "ðo" + ], + [ + "ĠnËĪi", + "Éľn" + ], + [ + 
"ÊĬ", + "l" + ], + [ + "ts", + "ËIJ" + ], + [ + "ËĪant", + "o" + ], + [ + "ĠÉĹ", + "ËĪÉĻÊĬ" + ], + [ + "kËIJ", + "i" + ], + [ + "ĠsËĪÊĮ", + "b" + ], + [ + "Ġn", + "ËĪa" + ], + [ + "Ġl", + "ËĮo" + ], + [ + "Ġph", + "ËĪi" + ], + [ + "m", + "ËĮe" + ], + [ + "Ġf", + "a" + ], + [ + "k", + "ÉĻ" + ], + [ + "Ġz", + "ËĪu" + ], + [ + "n", + "s" + ], + [ + "ĠÊģ", + "e" + ], + [ + "Ġb", + "ËĪo" + ], + [ + "ËĪaËIJt", + "i" + ], + [ + "Ġm", + "an" + ], + [ + "ĠlËĪi", + "Éij" + ], + [ + "ĠÉĹ", + "ËĮyÉĻ" + ], + [ + "Ġf", + "ËĪÉĶËIJ" + ], + [ + "ĠkÊĭ", + "ËĪeËIJÊĥc" + ], + [ + "Ġx", + "ËĪÉij" + ], + [ + "ĠtÉķ", + "ËĪu" + ], + [ + "j", + "ÉĻɾ" + ], + [ + "Ġɪ", + "st" + ], + [ + "w", + "ËĪi" + ], + [ + "ĠËĮaɪn", + "ÉĻ" + ], + [ + "ɪ", + "É¡" + ], + [ + "Ġs", + "ÊĪ" + ], + [ + "ËĪi", + "ÉĻl" + ], + [ + "Ġn", + "ËĪiÉĽÉľn" + ], + [ + "ĠËĮÉĽ", + "ËIJ" + ], + [ + "ËĪaɪ", + "nd" + ], + [ + "Ġz", + "ËĪi" + ], + [ + "v", + "ÉĻn" + ], + [ + "m", + "z" + ], + [ + "ð", + "os" + ], + [ + "dÊĴ", + "ËIJ" + ], + [ + "j", + "ËĪa" + ], + [ + "ɾ", + "ËĪÉĶ" + ], + [ + "l", + "ËĪe" + ], + [ + "Ê", + "²" + ], + [ + "Ġv", + "ËĪÉĶ" + ], + [ + "Ġl", + "ËĪiÉĽ" + ], + [ + "θ", + "e" + ], + [ + "mËĪe", + "nte" + ], + [ + "Ġɪn", + "ðÉĻ" + ], + [ + "Ġaɪ", + "m" + ], + [ + "n", + "ÉĻn" + ], + [ + "Ġh", + "ÉĻm" + ], + [ + "ɾ", + "aËIJ" + ], + [ + "ĠsËĪuo", + "Éľ" + ], + [ + "Ġɲ", + "ËĪi" + ], + [ + "Ġɹ", + "ËĪiÉĻl" + ], + [ + "l", + "ËĪa" + ], + [ + "Ġb", + "ËĪÉĶ" + ], + [ + "Ġk", + "ËĪai" + ], + [ + "Êģ", + "ËĪa" + ], + [ + "Ġw", + "ËĪÉľËIJ" + ], + [ + "Ġa", + "ËIJ" + ], + [ + "Ġp", + "as" + ], + [ + "ËĪÊĮ", + "s" + ], + [ + "w", + "ËĪÉĽÉ¾" + ], + [ + "ĠÉĹ", + "ËĪe" + ], + [ + "ĠhËĮa", + "tÉĻ" + ], + [ + "a", + "ɪn" + ], + [ + "ĠËĪÉĶ", + "pʰ" + ], + [ + "Êģ", + "ËĪe" + ], + [ + "ĠÉŁaËIJ", + "ËĪeËIJÉ¡aËIJ" + ], + [ + "ĠËĪÊĬ", + "s" + ], + [ + "ĠtÉķhËĪi", + "Éľ" + ], + [ + "nt", + "Êĥ" + ], + [ + "Ġx", + "ËĪuo" + ], + [ + "ËĪu", + "Êģ" + ], + [ + "Ġɪ", + "m" + ], + [ + "ɳ", + "Éĸ" + ], + [ + "ËĪyÉĻ", + "Éľkh" + ], 
+ [ + "ĠËĪy", + "ÉĽ" + ], + [ + "Ġm", + "ËĮaËIJ" + ], + [ + "Åĵ", + "Êģ" + ], + [ + "ĠËĪa", + "lt" + ], + [ + "Ġk", + "ÉĻm" + ], + [ + "Êİ", + "o" + ], + [ + "ĠÉIJ", + "n" + ], + [ + "Ġf", + "y" + ], + [ + "ĠËĮÉĽ", + "ra" + ], + [ + "ĠÉ¡", + "ËĪÊĬ" + ], + [ + "Ġp", + "ËĪÊĮ" + ], + [ + "l", + "s" + ], + [ + "Ġl", + "ËĪiËIJ" + ], + [ + "ĠÊĤ", + "ËĪy" + ], + [ + "Ġbɪk", + "ËĪÊĮz" + ], + [ + "ĠÉ¡", + "ÉĽt" + ], + [ + "Ġb", + "ɾ" + ], + [ + "t", + "ʰ" + ], + [ + "tÉĻl", + "ËĮÉĻb" + ], + [ + "x", + "o" + ], + [ + "sk", + "ËĮaËIJ" + ], + [ + "ɲ", + "ʲ" + ], + [ + "ËĪeËIJk", + "ÊĪ" + ], + [ + "r", + "ÉĻ" + ], + [ + "tÊĥ", + "o" + ], + [ + "ĠpÊģ", + "ÉĶ" + ], + [ + "Ġɹ", + "ËĪaɪt" + ], + [ + "Ġp", + "ËĪei" + ], + [ + "ËĮ", + "ɪç" + ], + [ + "j", + "ËĪÉĽÉ¾" + ], + [ + "tËIJ", + "a" + ], + [ + "ĠÉIJb", + "ËĮaÊĬt" + ], + [ + "ĠkÊĭËĪeËIJÊĥc", + "ÉĻn" + ], + [ + "Ġv", + "ËĪe" + ], + [ + "ÊĬ", + "Éľ" + ], + [ + "Ġa", + "kËĪe" + ], + [ + "Ġp", + "ËĪai" + ], + [ + "v", + "ËĪÉĽ" + ], + [ + "Ġθ", + "ɹ" + ], + [ + "ɪ", + "f" + ], + [ + "Ġav", + "ËĪÉĽ" + ], + [ + "Ġk", + "ËĪe" + ], + [ + "d", + "ËĪi" + ], + [ + "ËĪeËIJ", + "Éĸ" + ], + [ + "Ġb", + "ÉĻt" + ], + [ + "ÊĪ", + "ʰ" + ], + [ + "t", + "eËIJ" + ], + [ + "θj", + "ËĪÉĶn" + ], + [ + "d", + "Éľ" + ], + [ + "ĠjËĪi", + "Éľ" + ], + [ + "Ġv", + "e" + ], + [ + "É£", + "ËĪu" + ], + [ + "ËĪÊĮh", + "ÉĻl" + ], + [ + "Ġp", + "ÉĶ" + ], + [ + "ĠÉ¡", + "r" + ], + [ + "Ġð", + "a" + ], + [ + "Ġv", + "ËĪiËIJ" + ], + [ + "ĠËĮ", + "ÉijËIJ" + ], + [ + "ËĪÉĻÊĬ", + "nt" + ], + [ + "Ġb", + "ËĪaËIJɾ" + ], + [ + "ĠmËĪÊĮ", + "tÉĻlËĮÉĻb" + ], + [ + "l", + "d" + ], + [ + "ĠtÉķ", + "ËĮÉĶ" + ], + [ + "p", + "a" + ], + [ + "ð", + "ËĪad" + ], + [ + "ËĪi", + "ɾ" + ], + [ + "Ġx", + "ËĪu" + ], + [ + "ĠlËĪi", + "ÉľÅĭ" + ], + [ + "ËĪeɪ", + "s" + ], + [ + "ĠÉĹËĮe", + "Éľn" + ], + [ + "Ġth", + "ËĪiÉĽ" + ], + [ + "tËIJ", + "e" + ], + [ + "ĠavËĮÉĽ", + "k" + ], + [ + "ĠËĮ", + "ÉĶ" + ], + [ + "Ġk", + "ËĪÉiju" + ], + [ + "ɪ", + "v" + ], + [ + "iËIJ", + "z" + ], + [ + "ËĪo", 
+ "s" + ], + [ + "ĠÉ¡", + "ɹ" + ], + [ + "a", + "nd" + ], + [ + "ĠlËĪi", + "ou" + ], + [ + "ĠËĪo", + "Éľ" + ], + [ + "É¡", + "l" + ], + [ + "Ġp", + "ËĪÉĶËIJ" + ], + [ + "Ġm", + "ËĮeËIJ" + ], + [ + "Ġk", + "ËĪÉĴ" + ], + [ + "n", + "os" + ], + [ + "ç", + "ÉĻn" + ], + [ + "f", + "ÉĻn" + ], + [ + "ĠsËĪÊĮkt", + "ËĮeËIJ" + ], + [ + "Ġ", + "ËĪaɪn" + ], + [ + "ËĪoËIJ", + "re" + ], + [ + "j", + "ËĪÉĽn" + ], + [ + "Ġð", + "ËĪÉĽn" + ], + [ + "ĠtÉķh", + "ËĪiÉĽÉľn" + ], + [ + "Ġh", + "ËĪaɪ" + ], + [ + "ɾ", + "ËĪÉĽ" + ], + [ + "Ġs", + "ËĪu" + ], + [ + "ĠkËĪɪ", + "jaËIJ" + ], + [ + "Ġpj", + "ËĮÊĬ" + ], + [ + "ĠhÉĻm", + "ËĮaËIJ" + ], + [ + "ĠËĮÊĮ", + "p" + ], + [ + "Ġp", + "ËĪÊĮhÉĻl" + ], + [ + "Ġx", + "ËĪÉĻ" + ], + [ + "d", + "ËĪe" + ], + [ + "Ġm", + "Éij" + ], + [ + "ĠÊĬ", + "m" + ], + [ + "nd", + "ÉĻ" + ], + [ + "Ġd", + "ËĪÉĻÊĬnt" + ], + [ + "ËĪeËIJ", + "ÊĥÉĻn" + ], + [ + "Ġða", + "ts" + ], + [ + "i", + "s" + ], + [ + "Ġc", + "ËĪaËIJh" + ], + [ + "p", + "e" + ], + [ + "Ġs", + "ËĮo" + ], + [ + "Ġð", + "ËĪe" + ], + [ + "Ġs", + "ËĪaËIJt" + ], + [ + "ËĪa", + "Êģ" + ], + [ + "Ġs", + "ËĪe" + ], + [ + "ÉĻ", + "k" + ], + [ + "ɪ", + "Êĭ" + ], + [ + "ĠkËĪoËIJ", + "i" + ], + [ + "k", + "ÉĶ" + ], + [ + "Ġv", + "ËĪaËIJÊĬ" + ], + [ + "Ġf", + "ËĪei" + ], + [ + "Ġl", + "ËĪeËIJk" + ], + [ + "Ġh", + "ËĪiÉĻ" + ], + [ + "Ġa", + "ÊĬ" + ], + [ + "ËĪÉĽ", + "ndo" + ], + [ + "ËĪe", + "s" + ], + [ + "Ġz", + "ËĪÉĶ" + ], + [ + "Ġ", + "ËĪÉĽÉ¾a" + ], + [ + "nËĪi", + "Éľn" + ], + [ + "ĠkËĪÊĮ", + "m" + ], + [ + "Ġl", + "ËĪÉĴ" + ], + [ + "ɪ", + "st" + ], + [ + "Ġp", + "Éij" + ], + [ + "Ġf", + "ËĪÉĶ" + ], + [ + "Ġth", + "ËĪonÉ¡" + ], + [ + "nk", + "e" + ], + [ + "ËĮ", + "ɪk" + ], + [ + "Ġɲ", + "ËĪÉĻ" + ], + [ + "ËĮÊĮ", + "m" + ], + [ + "ËĪiËIJ", + "t" + ], + [ + "ĠwËĪÉĴ", + "nt" + ], + [ + "ËĪaβ", + "an" + ], + [ + "ĠbËĪÊĮ", + "r" + ], + [ + "ÉĽ", + "nd" + ], + [ + "ĠËĮÉijËIJ", + "bÉľ" + ], + [ + "Ġv", + "ËĪaɪ" + ], + [ + "ĠtÊĥ", + "ËĮi" + ], + [ + "ĠθËĪɪÅĭ", + "k" + ], + [ + "st", + "i" + ], + [ + "Ġk", + "ɹ" 
+ ], + [ + "ĠËĪa", + "ÊĬt" + ], + [ + "st", + "ÉĻn" + ], + [ + "ĠÊĭ", + "ËĪÊĮn" + ], + [ + "ĠÉ¡", + "ËĮaËIJ" + ], + [ + "ËĪaËIJÉľ", + "ɲ" + ], + [ + "Êģ", + "i" + ], + [ + "ĠnËĪÉĶ", + "x" + ], + [ + "ĠɹËĪiÉĻl", + "ɪ" + ], + [ + "Ġv", + "ËĮi" + ], + [ + "Ġðe", + "ÉĻ" + ], + [ + "ËĮɪ", + "tÊĥ" + ], + [ + "Ġv", + "ËĪyÉĻ" + ], + [ + "ĠËĮaËIJpk", + "ËĮaËIJ" + ], + [ + "Ġf", + "ËĮaËIJɪ" + ], + [ + "Ġp", + "ËĪÉĶ" + ], + [ + "ĠnËĪÊĮ", + "mb" + ], + [ + "θ", + "es" + ], + [ + "j", + "ËĪÉĽÊģ" + ], + [ + "ĠkËĪÊĬ", + "cʰ" + ], + [ + "m", + "ËĪÉĽ" + ], + [ + "Ġv", + "ËĪu" + ], + [ + "Ġl", + "ÅĵÊģ" + ], + [ + "ĠiËIJ", + "m" + ], + [ + "ÊĪ", + "ÉĻɾ" + ], + [ + "tÊĥ", + "i" + ], + [ + "ËIJ", + "s" + ], + [ + "Ġt", + "ËĪy" + ], + [ + "ĠmËĪi", + "ÉľÅĭ" + ], + [ + "ɾ", + "ËĪe" + ], + [ + "m", + "ËĮa" + ], + [ + "Ġm", + "ËĮiËIJ" + ], + [ + "ĠÉĽ", + "ks" + ], + [ + "ɪ", + "p" + ], + [ + "ĠkËĪÊĮɾ", + "nËĮaËIJ" + ], + [ + "ĠËĮaÊĬ", + "x" + ], + [ + "r", + "ËĪiËIJ" + ], + [ + "Ġc", + "ËĪÊĮl" + ], + [ + "m", + "os" + ], + [ + "ĠkËĪÊĮɾt", + "ËĮeËIJ" + ], + [ + "iËIJ", + "ɾ" + ], + [ + "k", + "ÉĻn" + ], + [ + "Ġd", + "ËĪu" + ], + [ + "n", + "aËIJ" + ], + [ + "Ġp", + "wËĪe" + ], + [ + "ËĮÉĶ", + "ɪ" + ], + [ + "ĠtÉķh", + "ËĪiÉĽ" + ], + [ + "Ġβ", + "ËĪi" + ], + [ + "ËĪiÉĽ", + "Éľt" + ], + [ + "Ġt", + "e" + ], + [ + "ËĪað", + "os" + ], + [ + "m", + "ËĪa" + ], + [ + "Ġv", + "ËĪo" + ], + [ + "Ġm", + "ËĪɪ" + ], + [ + "Ġb", + "ËĮi" + ], + [ + "a", + "d" + ], + [ + "d", + "o" + ], + [ + "Ġn", + "ËĪaÊĬ" + ], + [ + "ĠʲËĪy", + "Éľ" + ], + [ + "w", + "ËĪÉĽ" + ], + [ + "ËĪi", + "s" + ], + [ + "e", + "l" + ], + [ + "Ġpa", + "r" + ], + [ + "Ġt", + "ËĪai" + ], + [ + "ĠdËĪɪ", + "jaËIJ" + ], + [ + "h", + "ËĪi" + ], + [ + "Ġɾ", + "ËĪÊĮ" + ], + [ + "Ġd", + "ËĪe" + ], + [ + "ËĪaɪ", + "d" + ], + [ + "Ġp", + "er" + ], + [ + "Ġs", + "ËĮÉĶ" + ], + [ + "w", + "e" + ], + [ + "ÊĬ", + "m" + ], + [ + "Ġi", + "n" + ], + [ + "ĠjËĪuËIJ", + "z" + ], + [ + "ËĪiËIJp", + "ÉĻl" + ], + [ + "ĠÊĭ", + "ËĪaËIJl" + ], + [ + "Ġe", + 
"tËĪÉĽ" + ], + [ + "ËĮÉĽ", + "m" + ], + [ + "Ġn", + "ËĪu" + ], + [ + "ËĪÉĽ", + "kt" + ], + [ + "ĠiËIJ", + "ɾ" + ], + [ + "Ġb", + "ɹ" + ], + [ + "Ġtsh", + "ËĪi" + ], + [ + "ĠÉĹ", + "ËĪÉĶÉľ" + ], + [ + "Ġkw", + "ËĮa" + ], + [ + "Ġf", + "ËĪuÉľ" + ], + [ + "w", + "ËĮa" + ], + [ + "Ġd", + "ËĪiËIJ" + ], + [ + "ĠÉ¡", + "ËĪyÉĻ" + ], + [ + "ËĮÉĽ", + "ËIJ" + ], + [ + "r", + "ËĪa" + ], + [ + "Ġn", + "e" + ], + [ + "Ġz", + "ËĪyÉĻ" + ], + [ + "Ġb", + "ËĪaɪ" + ], + [ + "ĠÉŁ", + "ËĪÊĮb" + ], + [ + "ËĪuËIJ", + "to" + ], + [ + "ÊĬ", + "nt" + ], + [ + "Ġc", + "ʰ" + ], + [ + "ËĪÉĽnt", + "i" + ], + [ + "ËĪo", + "ÉĻ" + ], + [ + "Ġs", + "ËĮÊĮm" + ], + [ + "Ġl", + "Éij" + ], + [ + "ËĮe", + "va" + ], + [ + "ɾ", + "ÉĽ" + ], + [ + "nt", + "Éľ" + ], + [ + "Ġm", + "ËĪÉĽn" + ], + [ + "ËĪÉijËIJ", + "k" + ], + [ + "Ġki", + "l" + ], + [ + "ËĪon", + "es" + ], + [ + "f", + "f" + ], + [ + "Ġm", + "ËĪÉĽËIJ" + ], + [ + "Ġv", + "ËĪÉĻɪ" + ], + [ + "Ġ", + "ËĪÉĶËIJ" + ], + [ + "ĠËĮɪ", + "nt" + ], + [ + "ÊĬ", + "n" + ], + [ + "Ġw", + "ɪl" + ], + [ + "Ġs", + "in" + ], + [ + "ĠËĮa", + "lla" + ], + [ + "Ġaβ", + "ËĪia" + ], + [ + "p", + "i" + ], + [ + "ËĪo", + "Éľ" + ], + [ + "ɪj", + "ËĮaËIJ" + ], + [ + "k", + "u" + ], + [ + "Ġv", + "ËĪɪ" + ], + [ + "Ġtu", + "t" + ], + [ + "ĠtËĪe", + "Éľ" + ], + [ + "Ġh", + "ËĪÉĶ" + ], + [ + "β", + "ɾe" + ], + [ + "s", + "ÉĻɾ" + ], + [ + "Ġkh", + "ËĪai" + ], + [ + "Ġm", + "ËĪÉĶ" + ], + [ + "Ġt", + "a" + ], + [ + "Ġɲ", + "ËĪaËIJ" + ], + [ + "Ġn", + "u" + ], + [ + "ËĪuËIJ", + "n" + ], + [ + "ĠÉĻËIJ", + "Éľ" + ], + [ + "ĠËĪa", + "ÊĬf" + ], + [ + "ËĪiËIJd", + "Éľ" + ], + [ + "nt", + "i" + ], + [ + "Ġp", + "ËĪiËIJpÉĻl" + ], + [ + "Ġk", + "j" + ], + [ + "Ġp", + "e" + ], + [ + "Ġm", + "ËĪÉij" + ], + [ + "ËĮa", + "ɪ" + ], + [ + "ËĪaËIJ", + "le" + ], + [ + "Ġv", + "ËĮÉĻËIJÉªÉľ" + ], + [ + "mp", + "o" + ], + [ + "ĠkËĪɪ", + "t" + ], + [ + "Ġn", + "ËĮÉĽ" + ], + [ + "ĠÉŁ", + "ËĪaËIJtaËIJ" + ], + [ + "ĠsËĪaËIJt", + "ʰ" + ], + [ + "ĠÉŁ", + "ËĪi" + ], + [ + "Ġs", + "o" + ], + [ + "Ġb", + 
"ËĪÉĽ" + ], + [ + "k", + "ËĪi" + ], + [ + "ɪt", + "i" + ], + [ + "Ġts", + "i" + ], + [ + "Ġk", + "Êģ" + ], + [ + "ËĮ", + "ÉĴ" + ], + [ + "É¡", + "ÉĻl" + ], + [ + "k", + "st" + ], + [ + "Ġm", + "ËĪÉĻËIJ" + ], + [ + "ËĪÊĮ", + "k" + ], + [ + "Ġn", + "ËĪaËIJÊĬ" + ], + [ + "Ġa", + "p" + ], + [ + "ĠlËĪɪ", + "kʰ" + ], + [ + "ll", + "i" + ], + [ + "ĠkwËĪa", + "l" + ], + [ + "Ġ", + "ËĪÉĻËIJ" + ], + [ + "Ġts", + "ËĪuei" + ], + [ + "Ġd", + "o" + ], + [ + "ĠkËIJ", + "jËĪo" + ], + [ + "ÊĬ", + "z" + ], + [ + "Ġp", + "ËĪaËIJ" + ], + [ + "Ġm", + "ËĪuËIJ" + ], + [ + "ĠÉ¡ÉĻ", + "v" + ], + [ + "r", + "ËĪi" + ], + [ + "Ġt", + "w" + ], + [ + "ËĮ", + "ɪn" + ], + [ + "d", + "ËĪÉij" + ], + [ + "Ġð", + "ËĪi" + ], + [ + "ĠËĪaËIJ", + "i" + ], + [ + "Ġh", + "ËĪiÉĽ" + ], + [ + "Ġð", + "ËĮÉĽm" + ], + [ + "Ġpʰ", + "ËĪɪɾ" + ], + [ + "ÉĴ", + "m" + ], + [ + "ĠËĮ", + "eËIJ" + ], + [ + "Ġth", + "ËĪaiÉľ" + ], + [ + "Ġv", + "ËĪas" + ], + [ + "Ġn", + "ÉijËIJ" + ], + [ + "p", + "ÉĻn" + ], + [ + "Ġp", + "ËĮÉĻɾ" + ], + [ + "ĠÉĹ", + "ËĪaËIJɪ" + ], + [ + "ËĪou", + "Éľ" + ], + [ + "ĠÊIJ", + "ËĪuÉľ" + ], + [ + "ĠmËĪa", + "n" + ], + [ + "ĠtËĪÉĻ", + "ÉªÉľ" + ], + [ + "Ġl", + "ËĪaËIJÊĬ" + ], + [ + "m", + "ËĪÉĽnte" + ], + [ + "ĠfËĪa", + "m" + ], + [ + "s", + "jËĪÉĶ" + ], + [ + "Ġp", + "ËĪÉĻ" + ], + [ + "ËĪeËIJ", + "m" + ], + [ + "Ġp", + "ËĪÊĮr" + ], + [ + "j", + "ËĪi" + ], + [ + "Ġl", + "ÉĽ" + ], + [ + "Ġt", + "en" + ], + [ + "ËĪoËIJ", + "ra" + ], + [ + "k", + "i" + ], + [ + "ĠÊĤ", + "ËĪaËIJÊĬ" + ], + [ + "k", + "ɪ" + ], + [ + "bËIJ", + "e" + ], + [ + "ËĪa", + "lt" + ], + [ + "ð", + "ɪ" + ], + [ + "p", + "ËĪi" + ], + [ + "ĠËĮÉĽ", + "nt" + ], + [ + "Ġm", + "ËĪei" + ], + [ + "Ġh", + "ËĪÉĻÊĬ" + ], + [ + "Ġh", + "ËĪÉĽÉ¾" + ], + [ + "j", + "ËĪÉij" + ], + [ + "ĠhËĪÊĬ", + "aËIJ" + ], + [ + "m", + "Éľ" + ], + [ + "Ġd", + "ʰ" + ], + [ + "ĠtÊĥ", + "ËĪe" + ], + [ + "l", + "ËĪÉĽ" + ], + [ + "ËĪaËIJt", + "e" + ], + [ + "Ġp", + "ËĪuËIJ" + ], + [ + "Ġm", + "ËĪÊĬ" + ], + [ + "ËĪaËIJɪ", + "ÊĪ" + ], + [ + "d", + "iËIJ" + ], + [ + 
"Ġfɹ", + "ÉĴm" + ], + [ + "Ġh", + "ËĪÉijËIJ" + ], + [ + "β", + "o" + ], + [ + "ĠmËĪi", + "Éľn" + ], + [ + "Ġð", + "iËIJz" + ], + [ + "Ġk", + "ËĪou" + ], + [ + "ËĪiËIJ", + "na" + ], + [ + "Ġav", + "ËĮeva" + ], + [ + "Ġ", + "ËĪaËIJɾ" + ], + [ + "Ġn", + "ËĪuËIJɾ" + ], + [ + "Ġβ", + "ËĪe" + ], + [ + "Ġz", + "aɪn" + ], + [ + "ËĪÉĽ", + "d" + ], + [ + "É", + "Ĺ" + ], + [ + "ËĪeɪ", + "k" + ], + [ + "s", + "ËĮÉĻÊĬ" + ], + [ + "ËĪeËIJ", + "ÉŁ" + ], + [ + "ĠÊĤ", + "ËĪÉĻËIJ" + ], + [ + "j", + "e" + ], + [ + "cʰ", + "ËIJ" + ], + [ + "ËĪÉĶ", + "r" + ], + [ + "ÉĽ", + "ËIJ" + ], + [ + "ĠtÉķhËĪy", + "Ã¦Éľn" + ], + [ + "ĠËĮaɪn", + "ÉĻn" + ], + [ + "ĠiËIJ", + "n" + ], + [ + "ĠbËĪÊĮ", + "c" + ], + [ + "ËĪiËIJ", + "m" + ], + [ + "ɾ", + "as" + ], + [ + "ËĮÉĻ", + "s" + ], + [ + "Ġv", + "ËĪeËIJ" + ], + [ + "ĠËĪÉĻr", + "Éľ" + ], + [ + "Ġd", + "uËIJ" + ], + [ + "nt", + "ÉĻ" + ], + [ + "Ġpɹ", + "ËĪÉĴ" + ], + [ + "Ġb", + "ËĪɪ" + ], + [ + "ĠwËĪo", + "Éľ" + ], + [ + "n", + "ËĮi" + ], + [ + "Ġh", + "ÉIJ" + ], + [ + "Ġk", + "ËĪÉĽ" + ], + [ + "Ġe", + "t" + ], + [ + "jËĪÉĽ", + "ndo" + ], + [ + "ĠËĪai", + "Éľ" + ], + [ + "Ġl", + "i" + ], + [ + "ĠËĪaÊĬ", + "s" + ], + [ + "kËIJ", + "o" + ], + [ + "ĠÉĹ", + "ËĪyÉĻ" + ], + [ + "k", + "eËIJ" + ], + [ + "Ġf", + "ËĪiËIJl" + ], + [ + "Ġbʰ", + "ËĪaËIJi" + ], + [ + "ĠÉ¡ÉĻ", + "Êĥ" + ], + [ + "ÊĴ", + "ËĪe" + ], + [ + "Ġn", + "jËĪuËIJ" + ], + [ + "ĠËĪa", + "k" + ], + [ + "ĠÉĹ", + "ËĪaËIJ" + ], + [ + "z", + "ËĪa" + ], + [ + "v", + "ËĪe" + ], + [ + "ĠhËĮa", + "ÊĬ" + ], + [ + "ÉIJ", + "ç" + ], + [ + "ĠɾËĪÊĮ", + "kʰ" + ], + [ + "p", + "ËĪe" + ], + [ + "ĠtÉĻ", + "bi" + ], + [ + "ĠpËĪÊĮhÉĻl", + "ËĮeËIJ" + ], + [ + "Ġf", + "ËĪÉĽ" + ], + [ + "Ġw", + "ËĮɪtÊĥ" + ], + [ + "ĠtÉķËĪy", + "ÉĽÉľ" + ], + [ + "w", + "ËĮe" + ], + [ + "ËĮa", + "ɪt" + ], + [ + "ĠnÉijËIJ", + "x" + ], + [ + "ĠkËĪÉĶËIJ", + "n" + ], + [ + "ÊĬ", + "k" + ], + [ + "ĠbËĪaËIJ", + "d" + ], + [ + "Åĭ", + "ÉĻn" + ], + [ + "Ġn", + "i" + ], + [ + "Ġb", + "ËĪe" + ], + [ + "Ġm", + "ËĮÊĬ" + ], + [ + "ËĪa", + "r" + 
], + [ + "ĠmËĮe", + "ɪk" + ], + [ + "Ġs", + "ËĪaËIJɾ" + ], + [ + "β", + "e" + ], + [ + "ĠtÉķhËĪi", + "ÉľÅĭ" + ], + [ + "it", + "ËĪe" + ], + [ + "k", + "ËĮe" + ], + [ + "ËĪÉĽËIJ", + "l" + ], + [ + "ËĮ", + "ÉĴn" + ], + [ + "ËĮ", + "Éij" + ], + [ + "Ġb", + "ËĪɪl" + ], + [ + "Ġw", + "ÊĬd" + ], + [ + "Ġb", + "ËĪoËIJl" + ], + [ + "r", + "d" + ], + [ + "i", + "ÉĻ" + ], + [ + "Ġd", + "a" + ], + [ + "Ġb", + "ËĪaËIJÊĬ" + ], + [ + "ĠnËĪÊĮmb", + "ÉĻɾ" + ], + [ + "ËĪaËIJɪ", + "Éľ" + ], + [ + "ĠÉĽ", + "m" + ], + [ + "Ġm", + "iËIJɾ" + ], + [ + "ËĪeɪ", + "m" + ], + [ + "l", + "os" + ], + [ + "ËĮÉĽ", + "t" + ], + [ + "ĠËĮaÊĬ", + "s" + ], + [ + "ĠmËĪa", + "Éľt" + ], + [ + "Ġw", + "ËĪuÉĻ" + ], + [ + "Ġw", + "ËĪeɪ" + ], + [ + "Ġse", + "ɲ" + ], + [ + "Ġb", + "jËĪÉĽ" + ], + [ + "Ġw", + "ÉĽn" + ], + [ + "f", + "l" + ], + [ + "Ġkh", + "wËĪa" + ], + [ + "d", + "ËĪÉĽ" + ], + [ + "v", + "ɹɪ" + ], + [ + "ĠËĪa", + "ɾ" + ], + [ + "jËĪÉiju", + "Éľ" + ], + [ + "ĠËĮaËIJpk", + "ËĮeËIJ" + ], + [ + "b", + "Êģ" + ], + [ + "ĠtËĪaɪ", + "m" + ], + [ + "Ġ", + "ËĪÉij" + ], + [ + "Ġs", + "ËĮa" + ], + [ + "Ġz", + "ËĪoɪ" + ], + [ + "ËĪÉĶɾ", + "a" + ], + [ + "Ġd", + "ËĪø" + ], + [ + "ËĪÉĶɾ", + "t" + ], + [ + "ĠÅĭ", + "ËĪÉĶ" + ], + [ + "m", + "in" + ], + [ + "Ġl", + "ËĪÊĬk" + ], + [ + "ËĪÉĶËIJ", + "t" + ], + [ + "ĠËĪÉĶ", + "tɾ" + ], + [ + "Ġf", + "ËĪaɪ" + ], + [ + "ĠÉ¡", + "ÉĴt" + ], + [ + "ËĪeËIJ", + "ÉĻn" + ], + [ + "k", + "ËĪÉĶ" + ], + [ + "ĠvËĪÉĽ", + "ɹi" + ], + [ + "m", + "ÉĽ" + ], + [ + "ËĪaɪ", + "z" + ], + [ + "Ġe", + "sp" + ], + [ + "ɲ", + "a" + ], + [ + "Ġl", + "ËĪo" + ], + [ + "ËĪÉĽËIJ", + "ra" + ], + [ + "β", + "ËĪi" + ], + [ + "ou", + "Éľ" + ], + [ + "ËĮÉĻ", + "k" + ], + [ + "tÊĥ", + "uËIJ" + ], + [ + "Ġn", + "ËĪyÉĻ" + ], + [ + "ÊĪ", + "ɾ" + ], + [ + "ĠÉ¡", + "ËĪy" + ], + [ + "ĠtËĪo", + "ðo" + ], + [ + "ËĪɪ", + "çt" + ], + [ + "Ġm", + "ɪç" + ], + [ + "ĠËĪa", + "nd" + ], + [ + "Ġkw", + "ËĮÉĽl" + ], + [ + "ĠÊĤ", + "ËĪaËIJ" + ], + [ + "ĠnËĪi", + "Éľ" + ], + [ + "ËĪÉĶ", + "p" + ], + [ + "ËĪiËIJ", + "z" 
+ ], + [ + "ĠÊĤ", + "ËĪaÊĬ" + ], + [ + "ĠɾËĮÉĻh", + "i" + ], + [ + "ĠsËĮÊĬ", + "o" + ], + [ + "ĠÉĽ", + "É¡" + ], + [ + "Ġd", + "Åĵ" + ], + [ + "ĠÉ¡ËĮaËIJ", + "ÉªÉľ" + ], + [ + "d", + "ɪ" + ], + [ + "l", + "ËĮa" + ], + [ + "st", + "ËĪi" + ], + [ + "ĠdËĮiËIJ", + "z" + ], + [ + "Ġt", + "ËĮÊĬ" + ], + [ + "θ", + "i" + ], + [ + "ĠËĪɪ", + "skËĮoËIJ" + ], + [ + "nd", + "ÉĻn" + ], + [ + "Ġts", + "v" + ], + [ + "Ġh", + "ËĪÉĻËIJ" + ], + [ + "ĠÊĥ", + "ËĪÊĬ" + ], + [ + "ÉĻt", + "ËĮeËIJ" + ], + [ + "p", + "ËĮÉĽ" + ], + [ + "ËĪaɾ", + "ÉĶn" + ], + [ + "Ġp", + "ÉĽÊģ" + ], + [ + "Ġ", + "y" + ], + [ + "m", + "nËĮeËIJ" + ], + [ + "ËĪÉĽ", + "llo" + ], + [ + "ĠÉ¡", + "ËĪÉĻ" + ], + [ + "ĠËĮa", + "d" + ], + [ + "ĠÊĥ", + "v" + ], + [ + "ËĪÊı", + "ɾ" + ], + [ + "r", + "ËĪe" + ], + [ + "y", + "ËIJ" + ], + [ + "Ġp", + "ËĪaËIJs" + ], + [ + "Ġ", + "ËĪÉĽn" + ], + [ + "ɪ", + "dÊĴ" + ], + [ + "ËĪua", + "i" + ], + [ + "Ġf", + "i" + ], + [ + "Ġt", + "ËĪyÉĻ" + ], + [ + "ËĪaËIJ", + "ÉŁ" + ], + [ + "Ġt", + "jËĪe" + ], + [ + "ËĪaËIJn", + "aËIJ" + ], + [ + "st", + "ɾ" + ], + [ + "Êİ", + "e" + ], + [ + "ËĮe", + "ɪt" + ], + [ + "b", + "a" + ], + [ + "ð", + "as" + ], + [ + "v", + "Êģ" + ], + [ + "Ġz", + "ËĪÉĻËIJ" + ], + [ + "ËĪaËIJ", + "li" + ], + [ + "ÉŁÊ°", + "eËIJ" + ], + [ + "ËĪaËIJt", + "eËIJ" + ], + [ + "Ġv", + "ËĪa" + ], + [ + "Ġsa", + "l" + ], + [ + "ËĪaËIJ", + "no" + ], + [ + "ĠÉ¡ÉĻ", + "z" + ], + [ + "ĠhËĪoËIJ", + "ti" + ], + [ + "Ġɲ", + "ËĪiÉĽ" + ], + [ + "t", + "Éľ" + ], + [ + "ĠËĪaËIJ", + "p" + ], + [ + "Ġw", + "ËĪÉĽl" + ], + [ + "Ġm", + "ËĪɪl" + ], + [ + "Ġfy", + "ËIJɾ" + ], + [ + "ËĪÉĽËIJs", + "aËIJ" + ], + [ + "Ġb", + "ËĮiËIJ" + ], + [ + "ËĪaËIJ", + "jaËIJ" + ], + [ + "ËĪɪ", + "p" + ], + [ + "Ġf", + "Êģ" + ], + [ + "tsi", + "ËĪoËIJne" + ], + [ + "Ġw", + "ËĪuÉľ" + ], + [ + "Ġv", + "i" + ], + [ + "ĠwËĪÉij", + "Éľn" + ], + [ + "ËĪoËIJ", + "n" + ], + [ + "ĠÉĹ", + "ËĪÉĻɪ" + ], + [ + "ĠÊĿ", + "ËĪo" + ], + [ + "Ġr", + "a" + ], + [ + "m", + "ÉĻnt" + ], + [ + "ËĪaÊĬ", + "nd" + ], + [ + "Ġp", + "ÉĽÉ¾" 
+ ], + [ + "ĠÉĹ", + "ËĪaËIJÊĬ" + ], + [ + "oËIJ", + "ɾ" + ], + [ + "h", + "ËĪo" + ], + [ + "ĠÉĴ", + "n" + ], + [ + "ĠÊİ", + "e" + ], + [ + "ĠsËĪɪ", + "ks" + ], + [ + "É¡", + "n" + ], + [ + "ĠÉ¡", + "ËĪa" + ], + [ + "Ġ", + "θj" + ], + [ + "Ġp", + "ËĪe" + ], + [ + "sp", + "e" + ], + [ + "Ġv", + "ËĪÉĻ" + ], + [ + "Ġf", + "ËĪɪ" + ], + [ + "ĠËĮɪnt", + "ÊĬ" + ], + [ + "l", + "ÉĻn" + ], + [ + "Ġn", + "ËĪiËIJd" + ], + [ + "ĠsËĮÊĬ", + "a" + ], + [ + "ĠËĪu", + "m" + ], + [ + "Ġd", + "ËĪeɪ" + ], + [ + "ĠËĪÊĮ", + "bʰi" + ], + [ + "ËĪÉijËIJ", + "ɾ" + ], + [ + "Ġb", + "ËĪiÉĽÉľt" + ], + [ + "Êİ", + "os" + ], + [ + "Ġtsh", + "ËĪaiÉľ" + ], + [ + "ĠËĮɪ", + "skËĮaËIJ" + ], + [ + "ĠaÊĬ", + "ÉĻ" + ], + [ + "ĠËĪy", + "æ" + ], + [ + "Ġd", + "yn" + ], + [ + "Ġm", + "ËĪiËIJn" + ], + [ + "ĠËĪÊĮ", + "cʰËIJ" + ], + [ + "Ġs", + "ÉĽ" + ], + [ + "Ġn", + "ËĪy" + ], + [ + "Ġn", + "ËĮÉĽl" + ], + [ + "É¡", + "ɾ" + ], + [ + "Êĥ", + "ËĪe" + ], + [ + "ĠÊĤ", + "ËĮÉĽ" + ], + [ + "ĠËĪÉĽ", + "vɹɪ" + ], + [ + "ËĪÉĽl", + "p" + ], + [ + "ĠbËĪa", + "k" + ], + [ + "Ġ", + "eËIJ" + ], + [ + "Ġf", + "ËĪaËIJ" + ], + [ + "Ġk", + "ÉĽl" + ], + [ + "ĠËĪeËIJ", + "s" + ], + [ + "j", + "ËĪaËIJd" + ], + [ + "Ġl", + "ËĮi" + ], + [ + "mb", + "ɾe" + ], + [ + "k", + "tÉĻ" + ], + [ + "nt", + "a" + ], + [ + "t", + "ËĪu" + ], + [ + "Ġð", + "ËĪat" + ], + [ + "ĠËĪa", + "β" + ], + [ + "ÉĻɹ", + "i" + ], + [ + "ĠkwËĮÉĽ", + "lla" + ], + [ + "Ġb", + "ÉĻn" + ], + [ + "r", + "ËĮÉĽ" + ], + [ + "Ġn", + "ÉĶ" + ], + [ + "ĠÉ¡", + "ËĪɪ" + ], + [ + "ĠËĪa", + "p" + ], + [ + "ɹ", + "ÉĻ" + ], + [ + "ËĪa", + "Éľkh" + ], + [ + "ĠÊIJ", + "ËĪi" + ], + [ + "Ġ", + "ËĪÉijËIJ" + ], + [ + "ɪ", + "É¡ÉĻn" + ], + [ + "Ġw", + "ËĪai" + ], + [ + "Ġp", + "ÉĻt" + ], + [ + "kËIJ", + "a" + ], + [ + "Ġb", + "ËĪÉĽËIJ" + ], + [ + "ËĪeËIJ", + "Êĭ" + ], + [ + "ls", + "ÉĻÊĬ" + ], + [ + "ĠcËĪaËIJh", + "ɪËĮeËIJ" + ], + [ + "Ġk", + "ÉĻn" + ], + [ + "ĠËĮaɪn", + "ÉĻm" + ], + [ + "ËĪuËIJ", + "t" + ], + [ + "Ġh", + "ËĪaÊĬ" + ], + [ + "Ġt", + "ËĪanto" + ], + [ + "ĠhÉIJ", + "z" + 
], + [ + "Ġs", + "ËĪÊĮɾ" + ], + [ + "Ġn", + "o" + ], + [ + "Ġt", + "ËĪÉĶËIJ" + ], + [ + "Ġz", + "ËĪaɪ" + ], + [ + "ĠtÉķËĪiÉĽ", + "Éľ" + ], + [ + "Ġko", + "zËĪi" + ], + [ + "Ġk", + "ËĪei" + ], + [ + "ð", + "ËĪÉĶɾ" + ], + [ + "ËĮÉĶ", + "Êģ" + ], + [ + "Ġt", + "ËĪÊĮɾ" + ], + [ + "ĠÊIJ", + "ËĪÉĻ" + ], + [ + "ĠÉķËĪy", + "ÉĽÉľ" + ], + [ + "ĠmËĮÊĬ", + "ÉŁÊ°eËIJ" + ], + [ + "m", + "f" + ], + [ + "Ġv", + "ËĪiËIJdÉľ" + ], + [ + "k", + "ËĪa" + ], + [ + "ĠÉIJ", + "É¡" + ], + [ + "k", + "w" + ], + [ + "ĠÊģ", + "ÉĽ" + ], + [ + "x", + "ÉĻn" + ], + [ + "Ġd", + "ÊĬ" + ], + [ + "ĠkËĪÊĮɾ", + "nËĮeËIJ" + ], + [ + "jËĪaËIJd", + "aËIJ" + ], + [ + "Ġf", + "ÉĻ" + ], + [ + "ĠËĮi", + "mp" + ], + [ + "Ġh", + "ɪz" + ], + [ + "Ġ", + "ʰÏĩ" + ], + [ + "ËĪoËIJ", + "ni" + ], + [ + "Ġx", + "ËĪiÉľ" + ], + [ + "ËĪeËIJ", + "sÊĪ" + ], + [ + "Êı", + "bÉľ" + ], + [ + "ËĮÉĶɾ", + "ke" + ], + [ + "ĠÉ¡", + "ËĪÉĻÊĬ" + ], + [ + "ËĪɪ", + "ÊĥÉĻn" + ], + [ + "l", + "es" + ], + [ + "Ġf", + "ËĪiËIJ" + ], + [ + "É¡", + "tÉĻ" + ], + [ + "ËĪeËIJ", + "re" + ], + [ + "Ġv", + "ËĮaËIJ" + ], + [ + "Ġ", + "ËĪeɪ" + ], + [ + "Ġm", + "ËĪuÉĻÉľn" + ], + [ + "ĠÉ¡ËĪÊĬ", + "d" + ], + [ + "ĠmËĮa", + "ɪn" + ], + [ + "z", + "ËĪe" + ], + [ + "ĠlËĪi", + "Éľ" + ], + [ + "Ġm", + "u" + ], + [ + "Ġk", + "ËĮÉĽl" + ], + [ + "Ġj", + "ËĮÉĻh" + ], + [ + "Ġf", + "ËĮÉĶɾ" + ], + [ + "f", + "ɹ" + ], + [ + "Ġk", + "ËĪaɪn" + ], + [ + "ĠËĪÉĴ", + "lsÉĻÊĬ" + ], + [ + "θ", + "ɪÅĭ" + ], + [ + "Ġth", + "ËĪonÉ¡Éľ" + ], + [ + "t", + "ËĪÉij" + ], + [ + "θj", + "o" + ], + [ + "m", + "ËĪÉĶ" + ], + [ + "Ġ", + "os" + ], + [ + "Ġs", + "ÊĬ" + ], + [ + "ĠsËĪÊĮ", + "mÉĻ" + ], + [ + "ĠvËĮÉĽ", + "n" + ], + [ + "n", + "ËĪo" + ], + [ + "ĠËĪak", + "tÊĥuËIJ" + ], + [ + "É£", + "a" + ], + [ + "Ġtʰ", + "i" + ], + [ + "Ġf", + "ËĮi" + ], + [ + "Ġv", + "ËĪÉĽl" + ], + [ + "ĠtËĪu", + "tËIJi" + ], + [ + "x", + "os" + ] + ] + } +} \ No newline at end of file From 5e78e46b35602b519dde97c8d0e3b24176c6a42b Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Sun, 1 Feb 2026 23:10:02 
+0000 Subject: [PATCH 29/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- examples/tts/easy_magpietts.py | 1 + nemo/collections/tts/models/easy_magpietts.py | 17 +- .../ipa_scripts/add_ipa_to_lhotse_shards.py | 27 ++- .../ipa_scripts/analyze_ipa_tokenization.py | 208 +++++++++--------- .../ipa_scripts/train_ipa_bpe_tokenizer.py | 119 +++++----- 5 files changed, 191 insertions(+), 181 deletions(-) diff --git a/examples/tts/easy_magpietts.py b/examples/tts/easy_magpietts.py index 705c4ab77134..4195060b87ef 100644 --- a/examples/tts/easy_magpietts.py +++ b/examples/tts/easy_magpietts.py @@ -21,6 +21,7 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager + @hydra_runner(config_path="conf/magpietts", config_name="easy_magpietts") def main(cfg): logging.info('\nConfig Params:\n%s', OmegaConf.to_yaml(cfg, resolve=True)) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 68a48ab9701c..f6dd9c7728b3 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -352,7 +352,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.audio_out_projection = nn.Identity() self.final_proj = nn.Linear( - self.audio_embedding_dim, self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor + self.audio_embedding_dim, + self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor, ) self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') @@ -376,7 +377,9 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) # Projection from local_transformer_hidden_dim to audio_embedding_dim (Identity if same) if self.audio_embedding_dim != local_transformer_hidden_dim: - self.local_transformer_audio_out_projection = nn.Linear(local_transformer_hidden_dim, self.audio_embedding_dim) + self.local_transformer_audio_out_projection = nn.Linear( + 
local_transformer_hidden_dim, self.audio_embedding_dim + ) else: self.local_transformer_audio_out_projection = nn.Identity() local_transformer_out_projections = [] @@ -1365,7 +1368,7 @@ def process_batch( # Text dropout: randomly drop text input to encourage the model to rely on other signals dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False dropout_phoneme_input = (random.random() < self.dropout_phoneme_input_prob) if mode == 'train' else False - if (dropout_phoneme_input and dropout_text_input): + if dropout_phoneme_input and dropout_text_input: # Only one of the two can be True, so choose randomly dropout_phoneme_input = random.random() < 0.5 dropout_text_input = not dropout_phoneme_input @@ -1462,7 +1465,9 @@ def process_batch( remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) else: # Log Warning - print(f"Warning: Remaining text length {remaining_text_embedded.size(1)} is greater than audio codes input length {audio_codes_input_embedded.size(1)}") + print( + f"Warning: Remaining text length {remaining_text_embedded.size(1)} is greater than audio codes input length {audio_codes_input_embedded.size(1)}" + ) remaining_text_embedded = remaining_text_embedded[:, : audio_codes_input_embedded.size(1), :] # Add text information to audio embeddings (element-wise addition) audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded @@ -2035,9 +2040,7 @@ def infer_batch( # Project from hidden_dim to audio_embedding_dim, then to logits last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) - all_code_logits_t = self.final_proj( - last_hidden_audio - ) # (B, num_codebooks * num_tokens_per_codebook) + all_code_logits_t = self.final_proj(last_hidden_audio) # (B, num_codebooks * num_tokens_per_codebook) if self.phoneme_tokenizer is not None: all_code_logits_t_phoneme = self.phoneme_final_proj( diff --git 
a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py index 61a124d56ccc..10972d1bdc6a 100644 --- a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py +++ b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py @@ -45,13 +45,14 @@ def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[ """Load CUTS_DIRS_BY_LANG from a JSON config file.""" if config_path is None: config_path = DEFAULT_CONFIG_PATH - + if not config_path.exists(): raise FileNotFoundError(f"Config file not found: {config_path}") - + with open(config_path, "r", encoding="utf-8") as f: return json.load(f) + # Map your dataset language keys to espeak voice codes (adjust as needed). # For German, espeak-ng uses "de" typically. ESPEAK_VOICE_BY_LANG: Dict[str, str] = { @@ -100,8 +101,7 @@ def _find_espeak_binary() -> str: if shutil.which(exe): return exe raise RuntimeError( - "Neither 'espeak-ng' nor 'espeak' was found on PATH. " - "Install espeak-ng (recommended) or espeak." + "Neither 'espeak-ng' nor 'espeak' was found on PATH. " "Install espeak-ng (recommended) or espeak." 
) @@ -204,7 +204,7 @@ def add_ipa_to_cut( custom["normalized_text"] = text else: text = custom.get("normalized_text") or sup.get("text") - + if not text: continue @@ -234,9 +234,10 @@ def process_shard( cache = IPACache() n = 0 - with gzip.open(shard_path, "rt", encoding="utf-8") as fin, gzip.open( - out_shard_path, "wt", encoding="utf-8" - ) as fout: + with ( + gzip.open(shard_path, "rt", encoding="utf-8") as fin, + gzip.open(out_shard_path, "wt", encoding="utf-8") as fout, + ): for line in fin: line = line.strip() if not line: @@ -315,25 +316,23 @@ def process_language(lang: str, cuts_dirs: Dict[str, List[str]]) -> bool: print(f"[WARN] missing dir: {cuts_dir}", file=sys.stderr) continue process_cuts_dir(lang, cuts_dir) - + return True def main() -> None: - parser = argparse.ArgumentParser( - description="Add IPA strings to Lhotse cuts jsonl.gz shards." - ) + parser = argparse.ArgumentParser(description="Add IPA strings to Lhotse cuts jsonl.gz shards.") parser.add_argument( "--lang", type=str, required=True, - help="Language code to process (e.g., 'de', 'en', 'fr') or 'all' for all languages." + help="Language code to process (e.g., 'de', 'en', 'fr') or 'all' for all languages.", ) parser.add_argument( "--config", type=str, default=None, - help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}" + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}", ) args = parser.parse_args() diff --git a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py index 7032e1eeca0f..e2d53c3099d3 100644 --- a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py +++ b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py @@ -64,13 +64,14 @@ def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[ """Load CUTS_DIRS_BY_LANG from a JSON config file.""" if config_path is None: config_path = DEFAULT_CONFIG_PATH - + if not config_path.exists(): raise FileNotFoundError(f"Config file not found: {config_path}") - + with open(config_path, "r", encoding="utf-8") as f: return json.load(f) + OUTPUT_SUFFIX = "_with_ipa" SHARD_GLOB = "cuts.*.jsonl.gz" @@ -78,6 +79,7 @@ def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[ @dataclass class TextPair: """A pair of raw text and its IPA phonemization with audio duration.""" + raw_text: str ipa_text: str lang: str @@ -87,6 +89,7 @@ class TextPair: @dataclass class TokenizationStats: """Statistics for tokenization comparison (tokens per second).""" + lang: str num_samples: int total_duration: float # sum of all durations in seconds @@ -113,7 +116,7 @@ def iter_shards(ipa_dir: Path) -> List[Path]: def extract_text_pairs_from_shard(shard_path: Path, lang: str) -> Generator[TextPair, None, None]: """ Extract text pairs (raw text + IPA) from a single shard file. 
- + Yields: TextPair objects with raw_text, ipa_text, and duration """ @@ -132,7 +135,7 @@ def extract_text_pairs_from_shard(shard_path: Path, lang: str) -> Generator[Text ipa = custom.get("ipa") # Get raw text - prefer normalized_text, fallback to text raw_text = custom.get("normalized_text") or sup.get("text") - + if ipa and raw_text and isinstance(ipa, str) and isinstance(raw_text, str): ipa = ipa.strip() raw_text = raw_text.strip() @@ -150,31 +153,31 @@ def sample_text_pairs( ) -> List[TextPair]: """ Sample text pairs from a language's cuts_with_ipa directories. - + Args: lang: Language code cuts_dirs: Dictionary mapping language codes to lists of cuts directories num_samples: Number of samples to collect seed: Random seed for reproducibility - + Returns: List of TextPair objects """ random.seed(seed) - + if lang not in cuts_dirs: raise ValueError(f"Unknown language: {lang}") - + # Collect all text pairs from all directories all_pairs = [] for cuts_dir_str in cuts_dirs[lang]: cuts_dir = Path(cuts_dir_str) ipa_dir = get_ipa_dir(cuts_dir) - + if not ipa_dir.exists(): print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) continue - + shards = iter_shards(ipa_dir) for shard in shards: for pair in extract_text_pairs_from_shard(shard, lang): @@ -186,12 +189,12 @@ def sample_text_pairs( break if len(all_pairs) >= num_samples * 10: break - + # Sample if len(all_pairs) <= num_samples: print(f"[INFO] {lang}: Only found {len(all_pairs)} pairs, using all") return all_pairs - + return random.sample(all_pairs, num_samples) @@ -202,14 +205,14 @@ def iter_ipa_strings_for_lang( """Iterate over all IPA strings for a single language (memory-efficient).""" if lang not in cuts_dirs: return - + for cuts_dir_str in cuts_dirs[lang]: cuts_dir = Path(cuts_dir_str) ipa_dir = get_ipa_dir(cuts_dir) - + if not ipa_dir.exists(): continue - + shards = iter_shards(ipa_dir) for shard in shards: with gzip.open(shard, "rt", encoding="utf-8") as f: @@ -246,31 +249,31 @@ def 
simple_sample_ipa_strings( ) -> List[str]: """ Simple sampling: collect up to max_collect IPA strings, then randomly sample k. - + This avoids reading through all data like reservoir sampling does. - + Args: lang: Language code cuts_dirs: Dictionary mapping language codes to lists of cuts directories k: Number of samples to select max_collect: Maximum number of strings to collect before sampling seed: Random seed for reproducibility - + Returns: List of up to k sampled IPA strings """ rng = random.Random(seed) collected: List[str] = [] - + for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): collected.append(ipa) if len(collected) >= max_collect: break - + # If we have fewer than k, return all if len(collected) <= k: return collected - + # Otherwise, randomly sample k return rng.sample(collected, k) @@ -285,11 +288,11 @@ def create_balanced_corpus( ) -> Tuple[str, Dict[str, int]]: """ Create a balanced IPA corpus file with equal samples from each language. - + Uses a memory-efficient two-pass approach: 1. First pass: Count sentences per language (up to max_count_per_lang) 2. 
Second pass: Use simple sampling to select samples - + Args: train_langs: List of language codes to include cuts_dirs: Dictionary mapping language codes to lists of cuts directories @@ -297,14 +300,14 @@ def create_balanced_corpus( max_samples_per_lang: Optional cap on samples per language max_count_per_lang: Max count per language when counting IPA strings seed: Random seed for reproducibility - + Returns: Tuple of (corpus_file_path, dict of lang -> actual_count) """ # First pass: Count sentences per language print("[INFO] Pass 1: Counting IPA strings per language...") lang_counts: Dict[str, int] = {} - + for lang in train_langs: if lang not in cuts_dirs: print(f"[WARN] Language {lang} not in config, skipping") @@ -313,42 +316,42 @@ def create_balanced_corpus( count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) lang_counts[lang] = count print(f"{count} IPA strings") - + if not lang_counts: raise ValueError("No IPA strings found for any language") - + # Find minimum count across languages min_count = min(lang_counts.values()) print(f"[INFO] Minimum count across languages: {min_count}") - + # Apply max_samples_per_lang cap if specified samples_per_lang = min_count if max_samples_per_lang is not None and max_samples_per_lang < min_count: samples_per_lang = max_samples_per_lang print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") - + # Second pass: Sample from each language using simple sampling print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") actual_counts: Dict[str, int] = {} total_written = 0 - + with open(output_file, "w", encoding="utf-8") as f: for lang in lang_counts.keys(): print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) # Use different seed per language for variety, but reproducible lang_seed = seed + hash(lang) % 10000 sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, max_count_per_lang, lang_seed) - + for ipa in sampled: f.write(ipa + "\n") total_written += 1 - 
+ actual_counts[lang] = len(sampled) print(f"sampled {len(sampled)} strings") - + print(f"[INFO] Total IPA strings written to corpus: {total_written}") print(f"[INFO] Balanced corpus saved to: {output_file}") - + return output_file, actual_counts @@ -360,48 +363,48 @@ def train_ipa_bpe_tokenizer( ) -> Tokenizer: """ Train a byte-level BPE tokenizer on IPA strings from a pre-built corpus file. - + Args: output_dir: Directory to save tokenizer files vocab_size: Target vocabulary size corpus_file: Path to the IPA corpus file (one IPA string per line) min_frequency: Minimum frequency for a token to be included - + Returns: Trained Tokenizer object """ tokenizer_dir = os.path.join(output_dir, f"ipa_bpe_v{vocab_size}") os.makedirs(tokenizer_dir, exist_ok=True) - + tokenizer_file = os.path.join(tokenizer_dir, "tokenizer.json") - + # Check if already trained if os.path.exists(tokenizer_file): print(f"[INFO] Loading existing tokenizer from {tokenizer_file}") return Tokenizer.from_file(tokenizer_file) - + # Initialize tokenizer tokenizer = Tokenizer(BPE(unk_token="")) tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) - + special_tokens = ["", "", ""] - + trainer = BpeTrainer( vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=special_tokens, show_progress=True, ) - + print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}...") tokenizer.train(files=[corpus_file], trainer=trainer) - + # Save tokenizer.save(tokenizer_file) tokenizer.model.save(tokenizer_dir) - + print(f"[INFO] Saved tokenizer to {tokenizer_dir}") - + return tokenizer @@ -418,35 +421,35 @@ def compute_stats( qwen_counts = [] nemotron_counts = [] ipa_counts = {vs: [] for vs in ipa_tokenizers.keys()} - + for pair in text_pairs: # Qwen tokenizer on raw text qwen_tokens = qwen_tokenizer.encode(pair.raw_text) qwen_counts.append(len(qwen_tokens)) - + # Nemotron tokenizer on raw text nemotron_tokens = nemotron_tokenizer.encode(pair.raw_text) 
nemotron_counts.append(len(nemotron_tokens)) - + # IPA tokenizers on IPA text for vocab_size, tokenizer in ipa_tokenizers.items(): ipa_tokens = tokenizer.encode(pair.ipa_text) ipa_counts[vocab_size].append(len(ipa_tokens.ids)) - + # Calculate total duration and token counts total_duration = sum(pair.duration for pair in text_pairs) qwen_total = sum(qwen_counts) nemotron_total = sum(nemotron_counts) - + # Compute tokens per second qwen_tps = qwen_total / total_duration if total_duration > 0 else 0.0 nemotron_tps = nemotron_total / total_duration if total_duration > 0 else 0.0 - + ipa_tps = {} for vocab_size in ipa_tokenizers.keys(): ipa_total = sum(ipa_counts[vocab_size]) ipa_tps[vocab_size] = ipa_total / total_duration if total_duration > 0 else 0.0 - + return TokenizationStats( lang=lang, num_samples=len(text_pairs), @@ -462,32 +465,32 @@ def print_stats_table(all_stats: List[TokenizationStats], vocab_sizes: List[int] print("\n" + "=" * 120) print("TOKENS PER SECOND: Qwen2.5-1.5B-Instruct & Nemotron Nano 30B (raw text) vs IPA BPE (phonemized)") print("=" * 120) - + # Header header = f"{'Lang':<6} {'Samples':>8} {'Duration(s)':>12} {'Qwen tok/s':>12} {'Nemo tok/s':>12}" for vs in vocab_sizes: header += f" {'IPA-' + str(vs):>10}" print(header) print("-" * 120) - + # Data rows for stats in all_stats: row = f"{stats.lang:<6} {stats.num_samples:>8} {stats.total_duration:>12.2f} {stats.qwen_tokens_per_second:>12.2f} {stats.nemotron_tokens_per_second:>12.2f}" for vs in vocab_sizes: row += f" {stats.ipa_tokens_per_second[vs]:>10.2f}" print(row) - + # Aggregated stats print("-" * 120) total_samples = sum(s.num_samples for s in all_stats) total_duration = sum(s.total_duration for s in all_stats) - + # Compute overall tokens per second (weighted by duration) total_qwen_tokens = sum(s.qwen_tokens_per_second * s.total_duration for s in all_stats) total_nemotron_tokens = sum(s.nemotron_tokens_per_second * s.total_duration for s in all_stats) overall_qwen_tps = total_qwen_tokens 
/ total_duration if total_duration > 0 else 0 overall_nemotron_tps = total_nemotron_tokens / total_duration if total_duration > 0 else 0 - + agg_row = f"{'TOTAL':<6} {total_samples:>8} {total_duration:>12.2f} {overall_qwen_tps:>12.2f} {overall_nemotron_tps:>12.2f}" for vs in vocab_sizes: total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) @@ -495,7 +498,7 @@ def print_stats_table(all_stats: List[TokenizationStats], vocab_sizes: List[int] agg_row += f" {overall_ipa_tps:>10.2f}" print(agg_row) print("=" * 120) - + # Summary print("\nSUMMARY:") print(f" - Total samples analyzed: {total_samples}") @@ -523,20 +526,21 @@ def save_results_json( }, "results": [], } - + for stats in all_stats: - output["results"].append({ - "lang": stats.lang, - "num_samples": stats.num_samples, - "total_duration_seconds": stats.total_duration, - "qwen_tokens_per_second": stats.qwen_tokens_per_second, - "nemotron_tokens_per_second": stats.nemotron_tokens_per_second, - "ipa_tokens_per_second": { - str(vs): stats.ipa_tokens_per_second[vs] - for vs in stats.ipa_tokens_per_second.keys() + output["results"].append( + { + "lang": stats.lang, + "num_samples": stats.num_samples, + "total_duration_seconds": stats.total_duration, + "qwen_tokens_per_second": stats.qwen_tokens_per_second, + "nemotron_tokens_per_second": stats.nemotron_tokens_per_second, + "ipa_tokens_per_second": { + str(vs): stats.ipa_tokens_per_second[vs] for vs in stats.ipa_tokens_per_second.keys() + }, } - }) - + ) + with open(output_path, "w", encoding="utf-8") as f: json.dump(output, f, indent=2) print(f"[INFO] Saved results to {output_path}") @@ -555,9 +559,7 @@ def parse_lang_arg(arg: str, available_langs: List[str]) -> List[str]: def main(): - parser = argparse.ArgumentParser( - description="Compare tokenization between Qwen and IPA BPE tokenizers." 
- ) + parser = argparse.ArgumentParser(description="Compare tokenization between Qwen and IPA BPE tokenizers.") parser.add_argument( "--output_dir", type=str, @@ -598,7 +600,7 @@ def main(): "--config", type=str, default=None, - help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}" + help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}", ) parser.add_argument( "--max_count_per_lang", @@ -607,15 +609,15 @@ def main(): help="Max count per language when counting IPA strings (default: 100000)", ) args = parser.parse_args() - + os.makedirs(args.output_dir, exist_ok=True) - + # Load config config_path = Path(args.config) if args.config else None cuts_dirs = load_cuts_dirs_config(config_path) available_langs = list(cuts_dirs.keys()) print(f"[INFO] Loaded config with languages: {available_langs}") - + # Parse train and test languages try: train_langs = parse_lang_arg(args.train_langs, available_langs) @@ -623,20 +625,20 @@ def main(): except ValueError as e: print(f"[ERROR] {e}") sys.exit(1) - + print(f"[INFO] Training languages: {train_langs}") print(f"[INFO] Testing languages: {test_langs}") print(f"[INFO] Samples per language for testing: {args.samples_per_lang}") print(f"[INFO] Max samples per language for training: {args.max_samples_per_lang or 'auto (min across langs)'}") print(f"[INFO] Vocab sizes: {VOCAB_SIZES}") - + # Step 1: Create balanced IPA corpus once print("\n" + "=" * 60) print("STEP 1: Creating balanced IPA corpus") print("=" * 60) - + corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") - + # Check if corpus already exists if os.path.exists(corpus_file): print(f"[INFO] Using existing corpus file: {corpus_file}") @@ -652,12 +654,12 @@ def main(): max_count_per_lang=args.max_count_per_lang, seed=args.seed, ) - + # Step 2: Train IPA BPE tokenizers at different vocab sizes (reusing corpus) print("\n" + "=" * 60) print("STEP 2: Training IPA BPE tokenizers") print("=" * 60) - + 
ipa_tokenizers = {} for vocab_size in VOCAB_SIZES: print(f"\n[INFO] Training tokenizer with vocab_size={vocab_size}") @@ -667,60 +669,64 @@ def main(): corpus_file=corpus_file, min_frequency=2, ) - + # Step 3: Load Qwen and Nemotron tokenizers print("\n" + "=" * 60) print("STEP 3: Loading Qwen and Nemotron tokenizers") print("=" * 60) - + print("[INFO] Loading Qwen/Qwen2.5-1.5B-Instruct tokenizer...") qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") print(f"[INFO] Qwen tokenizer vocab size: {qwen_tokenizer.vocab_size}") - + print("[INFO] Loading nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 tokenizer...") - - nemotron_tokenizer = AutoTokenizer.from_pretrained("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", trust_remote_code=True) - + + nemotron_tokenizer = AutoTokenizer.from_pretrained( + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", trust_remote_code=True + ) + print(f"[INFO] Nemotron tokenizer vocab size: {nemotron_tokenizer.vocab_size}") - + # Step 4: Sample text pairs and compute statistics (on test languages) print("\n" + "=" * 60) print("STEP 4: Sampling and analyzing (test languages)") print("=" * 60) - + all_stats = [] for lang in test_langs: print(f"\n[INFO] Processing language: {lang}") - + # Sample text pairs text_pairs = sample_text_pairs(lang, cuts_dirs, args.samples_per_lang, args.seed) - + if not text_pairs: print(f"[WARN] No text pairs found for {lang}, skipping") continue - + print(f"[INFO] Sampled {len(text_pairs)} text pairs for {lang}") - + # Compute stats stats = compute_stats(text_pairs, qwen_tokenizer, nemotron_tokenizer, ipa_tokenizers, lang) all_stats.append(stats) - + # Print intermediate results - print(f"[INFO] {lang}: duration={stats.total_duration:.2f}s, Qwen={stats.qwen_tokens_per_second:.2f} tok/s, Nemotron={stats.nemotron_tokens_per_second:.2f} tok/s") + print( + f"[INFO] {lang}: duration={stats.total_duration:.2f}s, Qwen={stats.qwen_tokens_per_second:.2f} tok/s, Nemotron={stats.nemotron_tokens_per_second:.2f} 
tok/s" + ) for vs in VOCAB_SIZES: print(f" IPA-{vs}={stats.ipa_tokens_per_second[vs]:.2f} tok/s") - + # Step 5: Print and save results print("\n" + "=" * 60) print("STEP 5: Results") print("=" * 60) - + print_stats_table(all_stats, VOCAB_SIZES) - + # Save to JSON with metadata results_path = os.path.join(args.output_dir, "tokenization_comparison.json") save_results_json(all_stats, results_path, train_langs, test_langs) - + print("[INFO] Done!") diff --git a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py index c6098d93839a..825129d2c928 100644 --- a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py +++ b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py @@ -50,13 +50,14 @@ def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[ """Load CUTS_DIRS_BY_LANG from a JSON config file.""" if config_path is None: config_path = DEFAULT_CONFIG_PATH - + if not config_path.exists(): raise FileNotFoundError(f"Config file not found: {config_path}") - + with open(config_path, "r", encoding="utf-8") as f: return json.load(f) + OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa SHARD_GLOB = "cuts.*.jsonl.gz" @@ -82,7 +83,7 @@ def iter_shards(ipa_dir: Path) -> List[Path]: def extract_ipa_from_shard(shard_path: Path) -> Generator[str, None, None]: """ Extract all IPA strings from a single shard file. - + Yields: IPA strings from cut["supervisions"][i]["custom"]["ipa"] """ @@ -121,11 +122,11 @@ def collect_ipa_strings( ) -> Generator[str, None, None]: """ Collect all IPA strings from the specified language(s). - + Args: cuts_dirs: Dictionary mapping language codes to lists of cuts directories lang: Language code or None for all languages. - + Yields: IPA strings """ @@ -135,17 +136,17 @@ def collect_ipa_strings( if lang not in cuts_dirs: raise ValueError(f"Unknown language: {lang}. 
Available: {get_available_languages(cuts_dirs)}") langs_to_process = [lang] - + for lang_code in langs_to_process: print(f"[INFO] Processing language: {lang_code}") for cuts_dir_str in cuts_dirs[lang_code]: cuts_dir = Path(cuts_dir_str) ipa_dir = get_ipa_dir(cuts_dir) - + if not ipa_dir.exists(): print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) continue - + print(f"[INFO] Reading from: {ipa_dir}") count = 0 for ipa in extract_ipa_from_dir(ipa_dir): @@ -161,14 +162,14 @@ def iter_ipa_strings_for_lang( """Iterate over all IPA strings for a single language (memory-efficient).""" if lang not in cuts_dirs: return - + for cuts_dir_str in cuts_dirs[lang]: cuts_dir = Path(cuts_dir_str) ipa_dir = get_ipa_dir(cuts_dir) - + if not ipa_dir.exists(): continue - + for ipa in extract_ipa_from_dir(ipa_dir): yield ipa @@ -192,31 +193,31 @@ def simple_sample_ipa_strings( ) -> List[str]: """ Simple sampling: collect up to max_collect IPA strings, then randomly sample k. - + This avoids reading through all data like reservoir sampling does. - + Args: lang: Language code cuts_dirs: Dictionary mapping language codes to lists of cuts directories k: Number of samples to select max_collect: Maximum number of strings to collect before sampling seed: Random seed for reproducibility - + Returns: List of up to k sampled IPA strings """ rng = random.Random(seed) collected: List[str] = [] - + for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): collected.append(ipa) if len(collected) >= max_collect: break - + # If we have fewer than k, return all if len(collected) <= k: return collected - + # Otherwise, randomly sample k return rng.sample(collected, k) @@ -242,11 +243,11 @@ def create_balanced_corpus( ) -> Tuple[str, Dict[str, int]]: """ Create a balanced IPA corpus file with equal samples from each language. - + Uses a memory-efficient two-pass approach: 1. First pass: Count sentences per language (up to max_count_per_lang) 2. 
Second pass: Use simple sampling to select samples - + Args: train_langs: List of language codes to include cuts_dirs: Dictionary mapping language codes to lists of cuts directories @@ -254,14 +255,14 @@ def create_balanced_corpus( max_samples_per_lang: Optional cap on samples per language max_count_per_lang: Max count per language when counting IPA strings seed: Random seed for reproducibility - + Returns: Tuple of (corpus_file_path, dict of lang -> actual_count) """ # First pass: Count sentences per language print("[INFO] Pass 1: Counting IPA strings per language...") lang_counts: Dict[str, int] = {} - + for lang in train_langs: if lang not in cuts_dirs: print(f"[WARN] Language {lang} not in config, skipping") @@ -270,42 +271,42 @@ def create_balanced_corpus( count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) lang_counts[lang] = count print(f"{count} IPA strings") - + if not lang_counts: raise ValueError("No IPA strings found for any language") - + # Find minimum count across languages min_count = min(lang_counts.values()) print(f"[INFO] Minimum count across languages: {min_count}") - + # Apply max_samples_per_lang cap if specified samples_per_lang = min_count if max_samples_per_lang is not None and max_samples_per_lang < min_count: samples_per_lang = max_samples_per_lang print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") - + # Second pass: Sample from each language using simple sampling print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") actual_counts: Dict[str, int] = {} total_written = 0 - + with open(output_file, "w", encoding="utf-8") as f: for lang in lang_counts.keys(): print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) # Use different seed per language for variety, but reproducible lang_seed = seed + hash(lang) % 10000 sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, max_count_per_lang, lang_seed) - + for ipa in sampled: f.write(ipa + "\n") total_written += 1 - 
+ actual_counts[lang] = len(sampled) print(f"sampled {len(sampled)} strings") - + print(f"[INFO] Total IPA strings written to corpus: {total_written}") print(f"[INFO] Balanced corpus saved to: {output_file}") - + return output_file, actual_counts @@ -317,45 +318,45 @@ def train_bpe_tokenizer( ) -> Tokenizer: """ Train a byte-level BPE tokenizer on IPA strings from a corpus file. - + Args: corpus_file: Path to the IPA corpus file (one IPA string per line) vocab_size: Target vocabulary size min_frequency: Minimum frequency for a token to be included output_dir: Directory to save the tokenizer files - + Returns: Trained Tokenizer object """ # Create output directory os.makedirs(output_dir, exist_ok=True) - + # Check if tokenizer already exists tokenizer_path = os.path.join(output_dir, "tokenizer.json") if os.path.exists(tokenizer_path): print(f"[INFO] Loading existing tokenizer from {tokenizer_path}") return Tokenizer.from_file(tokenizer_path) - + # Count lines in corpus with open(corpus_file, "r", encoding="utf-8") as f: total_count = sum(1 for _ in f) print(f"[INFO] Corpus contains {total_count} IPA strings") - + if total_count == 0: raise ValueError("Corpus file is empty. 
Make sure the cuts_with_ipa directories exist.") - + # Initialize a byte-level BPE tokenizer tokenizer = Tokenizer(BPE(unk_token="")) - + # Use byte-level pre-tokenization (like GPT-2) tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) - + # Add byte-level decoder to properly convert back to original text tokenizer.decoder = ByteLevelDecoder() - + # Define special tokens special_tokens = ["", "", ""] - + # Create trainer trainer = BpeTrainer( vocab_size=vocab_size, @@ -363,27 +364,27 @@ def train_bpe_tokenizer( special_tokens=special_tokens, show_progress=True, ) - + # Train the tokenizer print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}, min_frequency={min_frequency}") tokenizer.train(files=[corpus_file], trainer=trainer) - + # Save the tokenizer vocab_path = os.path.join(output_dir, "vocab.json") merges_path = os.path.join(output_dir, "merges.txt") - + # Save using the tokenizer's model save method tokenizer.model.save(output_dir) - + # Also save the full tokenizer for easy loading tokenizer.save(tokenizer_path) - + print(f"[INFO] Tokenizer saved to: {output_dir}") print(f"[INFO] - vocab.json: {vocab_path}") print(f"[INFO] - merges.txt: {merges_path}") print(f"[INFO] - tokenizer.json: {tokenizer_path}") print(f"[INFO] Vocabulary size: {tokenizer.get_vocab_size()}") - + return tokenizer @@ -431,7 +432,7 @@ def main(): "--config", type=str, default=None, - help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}" + help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}", ) parser.add_argument( "--max_count_per_lang", @@ -440,19 +441,19 @@ def main(): help="Max count per language when counting IPA strings (default: 100000)", ) args = parser.parse_args() - + # Load config config_path = Path(args.config) if args.config else None cuts_dirs = load_cuts_dirs_config(config_path) available_langs = get_available_languages(cuts_dirs) - + # Parse train_langs try: train_langs = parse_langs_arg(args.train_langs, available_langs) except ValueError as e: print(f"[ERROR] {e}") sys.exit(1) - + print(f"[INFO] Training IPA BPE tokenizer") print(f"[INFO] Output directory: {args.output_dir}") print(f"[INFO] Vocabulary size: {args.vocab_size}") @@ -461,16 +462,16 @@ def main(): print(f"[INFO] Max samples per lang: {args.max_samples_per_lang or 'auto (min across langs)'}") print(f"[INFO] Max count per lang: {args.max_count_per_lang}") print(f"[INFO] Available languages: {available_langs}") - + os.makedirs(args.output_dir, exist_ok=True) - + # Step 1: Create balanced corpus print("\n" + "=" * 60) print("STEP 1: Creating balanced IPA corpus") print("=" * 60) - + corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") - + if os.path.exists(corpus_file): print(f"[INFO] Using existing corpus file: {corpus_file}") with open(corpus_file, "r", encoding="utf-8") as f: @@ -485,24 +486,24 @@ def main(): max_count_per_lang=args.max_count_per_lang, seed=args.seed, ) - + # Step 2: Train tokenizer print("\n" + "=" * 60) print("STEP 2: Training BPE tokenizer") print("=" * 60) - + tokenizer = train_bpe_tokenizer( corpus_file=corpus_file, vocab_size=args.vocab_size, min_frequency=args.min_frequency, output_dir=args.output_dir, ) - + # Test the tokenizer print("\n[INFO] Testing tokenizer with sample IPA strings:") test_strings = [ "həˈloʊ wɜːld", # hello world - "ˈaɪ pʰiː eɪ", # IPA + "ˈaɪ pʰiː eɪ", # IPA "ˈtɛstɪŋ wʌn tuː θriː", # testing one two three ] for test_str in test_strings: @@ -513,7 +514,7 @@ def main(): print(f" 
IDs: {encoded.ids}") print(f" Decoded: '{decoded}'") print() - + print("[INFO] Done!") From 91f71c8347097587208ab4a266826287ffe33230 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 2 Feb 2026 11:12:13 -0800 Subject: [PATCH 30/94] nemotron mamba model (#58) * nemotron mamba model Signed-off-by: Paarth Neekhara * lhotse config update Signed-off-by: Paarth Neekhara * mamba inference working Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- .../tts/conf/magpietts/easy_magpietts.yaml | 39 + .../conf/magpietts/easy_magpietts_lhotse.yaml | 40 + examples/tts/evalset_config.json | 40 +- nemo/collections/tts/models/easy_magpietts.py | 87 +- nemo/collections/tts/modules/__init__.py | 10 + .../tts/modules/nemotron_h_decoder.py | 1456 +++++++++++++++++ .../tts/test_nemotron_h_decoder.py | 745 +++++++++ 7 files changed, 2381 insertions(+), 36 deletions(-) create mode 100644 nemo/collections/tts/modules/nemotron_h_decoder.py create mode 100644 tests/collections/tts/test_nemotron_h_decoder.py diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index eea075870b07..6166fd68968f 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -13,7 +13,46 @@ train_ds_meta: ??? val_ds_meta: ??? model: + # Decoder backend selection + # Options: "huggingface" (default), "nemotron_h" + decoder_type: "huggingface" + + # HuggingFace backend config (used when decoder_type: "huggingface") transformer_hf_backend: "Qwen/Qwen2.5-1.5B" + + # NemotronH config (used when decoder_type: "nemotron_h") + # This is a hybrid Mamba2/Attention model. 
Layer types are specified via hybrid_override_pattern: + # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer + nemotron_h_config: + hidden_size: 1536 # Should match embedding_dim + num_hidden_layers: 24 + vocab_size: 131072 + # Attention config + num_attention_heads: 12 + num_key_value_heads: 4 + attention_dropout: 0.0 + attention_bias: false + max_position_embeddings: 8192 + # Mamba config + mamba_num_heads: 64 + mamba_head_dim: 24 + ssm_state_size: 128 + conv_kernel: 4 + n_groups: 8 + chunk_size: 256 + mamba_hidden_act: "silu" + use_conv_bias: true + use_bias: false + # MLP config + intermediate_size: 4096 + mlp_hidden_act: "silu" + mlp_bias: false + # Layer pattern: alternating Mamba and Attention + hybrid_override_pattern: "M*M*M*M*M*M*M*M*M*M*M*M*" + # Normalization + layer_norm_epsilon: 1e-5 + residual_in_fp32: true + use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided. context_duration_min: 5.0 context_duration_max: 5.0 diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 2327820e44a4..5461af8d6ee5 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -11,7 +11,47 @@ quadratic_duration: 20 model: use_lhotse: true + + # Decoder backend selection + # Options: "huggingface" (default), "nemotron_h" + decoder_type: "huggingface" + + # HuggingFace backend config (used when decoder_type: "huggingface") transformer_hf_backend: "Qwen/Qwen2.5-1.5B" + + # NemotronH config (used when decoder_type: "nemotron_h") + # This is a hybrid Mamba2/Attention model. 
Layer types are specified via hybrid_override_pattern: + # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer + nemotron_h_config: + hidden_size: 1536 # Should match embedding_dim + num_hidden_layers: 24 + vocab_size: 131072 + # Attention config + num_attention_heads: 12 + num_key_value_heads: 4 + attention_dropout: 0.0 + attention_bias: false + max_position_embeddings: 8192 + # Mamba config + mamba_num_heads: 64 + mamba_head_dim: 24 + ssm_state_size: 128 + conv_kernel: 4 + n_groups: 8 + chunk_size: 256 + mamba_hidden_act: "silu" + use_conv_bias: true + use_bias: false + # MLP config + intermediate_size: 4096 + mlp_hidden_act: "silu" + mlp_bias: false + # Layer pattern: alternating Mamba and Attention + hybrid_override_pattern: "M*M*M*M*M*M*M*M*M*M*M*M*" + # Normalization + layer_norm_epsilon: 1e-5 + residual_in_fp32: true + use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided. context_duration_min: 5.0 context_duration_max: 5.0 diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 029f818ef53b..49822ce9cf25 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -15,43 +15,51 @@ "feature_dir": null }, "riva_multibpe": { - "manifest_path": "/Data/evaluation_manifests/riva_hard_multi_bpe.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/riva_hard_multi_bpe.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "riva_hard_digits": { - "manifest_path": "/Data/evaluation_manifests/hard-digits-path-corrected.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-digits-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "riva_hard_letters": { - "manifest_path": 
"/Data/evaluation_manifests/hard-letters-path-corrected.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-letters-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "riva_hard_money": { - "manifest_path": "/Data/evaluation_manifests/hard-money-path-corrected.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-money-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "riva_hard_short": { - "manifest_path": "/Data/evaluation_manifests/hard-short-path-corrected.ndjson", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-short-path-corrected.ndjson", "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS" + "feature_dir": "/Data/RIVA-TTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "vctk": { - "manifest_path": "/Data/evaluation_manifests/smallvctk__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withcontextaudiopaths_silence_trimmed.json", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/smallvctk__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withcontextaudiopaths_silence_trimmed.json", "audio_dir": "/Data/VCTK-Corpus-0.92", - "feature_dir": "/Data/VCTK-Corpus-0.92" + "feature_dir": "/Data/VCTK-Corpus-0.92", + "tokenizer_names": ["nemotron_nano_30b"] }, "libritts_seen": { - "manifest_path": "/Data/evaluation_manifests/LibriTTS_seen_evalset_from_testclean_v2.json", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/LibriTTS_seen_evalset_from_testclean_v2.json", "audio_dir": "/Data/LibriTTS", - "feature_dir": "/Data/LibriTTS" + "feature_dir": "/Data/LibriTTS", + "tokenizer_names": ["nemotron_nano_30b"] }, "libritts_test_clean": { - "manifest_path": 
"/Data/evaluation_manifests/LibriTTS_test_clean_withContextAudioPaths.jsonl", + "manifest_path": "/Data/evaluation_manifests/ipa_manifests/LibriTTS_test_clean_withContextAudioPaths.jsonl", "audio_dir": "/Data/LibriTTS", - "feature_dir": "/Data/LibriTTS" + "feature_dir": "/Data/LibriTTS", + "tokenizer_names": ["nemotron_nano_30b"] } } diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index f6dd9c7728b3..dabdd0ae6f30 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -301,15 +301,37 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - self.transformer_backend_config = AutoConfig.from_pretrained( - cfg.transformer_hf_backend, - trust_remote_code=True, - ) + # Decoder backend selection - supports HuggingFace models or NemotronH + self.decoder_type = cfg.get('decoder_type', 'huggingface') # backward compatible default + logging.info(f"Using decoder type: {self.decoder_type}") + + if self.decoder_type == 'huggingface': + # Existing HuggingFace path + self.transformer_backend_config = AutoConfig.from_pretrained( + cfg.transformer_hf_backend, + trust_remote_code=True, + ) + hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) + self.decoder = hf_transformer.model + self.lm_text_head = hf_transformer.lm_head + + elif self.decoder_type == 'nemotron_h': + # NemotronH hybrid Mamba2/Attention backend + from nemo.collections.tts.modules.nemotron_h_decoder import NemotronHConfig, NemotronHForCausalLM + + # Build config from YAML parameters + nemotron_h_config_dict = dict(cfg.get('nemotron_h_config', {})) + # Ensure hidden_size matches embedding_dim for compatibility + if 'hidden_size' not in nemotron_h_config_dict: + 
nemotron_h_config_dict['hidden_size'] = cfg.embedding_dim + nemotron_config = NemotronHConfig(**nemotron_h_config_dict) + nemotron_model = NemotronHForCausalLM(nemotron_config) + self.decoder = nemotron_model.backbone + self.lm_text_head = nemotron_model.lm_head + logging.info(f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}...") - hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) - self.decoder = hf_transformer.model - # self.decoder.to(torch.float32) - self.lm_text_head = hf_transformer.lm_head + else: + raise ValueError(f"Unknown decoder_type: {self.decoder_type}. Supported: 'huggingface', 'nemotron_h'") self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) @@ -647,13 +669,23 @@ def compute_phoneme_loss(self, logits, phoneme_tokens, phoneme_tokens_lens): total_phoneme_loss = total_phoneme_loss / self.phoneme_stacking_factor return total_phoneme_loss, loss_mask - def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None): - backend_out = self.decoder( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - use_cache=use_cache, - past_key_values=past_key_values, - ) + def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None, cache_position=None): + # Only pass cache_position for NemotronH (HF transformers may not accept it) + if self.decoder_type == 'nemotron_h': + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + cache_position=cache_position, + ) + else: + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + ) # hidden_states = backend_out.last_hidden_state # (B, T_total, H) return backend_out @@ -1999,17 +2031,25 @@ def 
infer_batch( ] # (2B, T_min, E) else: first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) + + # Initialize cache_position for tracking sequence position (needed for NemotronH) + cache_position = torch.arange(min_context_len, device=context_embedding.device) + # First forward pass to get the initial hidden state and past key values transformer_out = self.forward( inputs_embeds=first_inference_input, attention_mask=None, use_cache=True, past_key_values=None, # No past key values for the first step + cache_position=cache_position, ) time_to_first_prediction = time.time() - start_time last_hidden = transformer_out.last_hidden_state # (B, T_total, E) past_kv = transformer_out.past_key_values + + # Track the current sequence length for cache_position updates + current_cache_seq_len = min_context_len all_predictions = [] end_indices = {} @@ -2034,7 +2074,7 @@ def infer_batch( current_text_positions += 1 if self.phoneme_tokenizer is not None: current_phoneme_positions += 1 - print("current_phoneme_positions", current_phoneme_positions) + # print("current_phoneme_positions", current_phoneme_positions) if idx % 20 == 0: print(f"Decoding timestep {idx}") @@ -2098,15 +2138,15 @@ def infer_batch( device=context_embedding.device, ).long() # (B, phoneme_stacking_factor) use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() - print("use_bos_phoneme", use_bos_phoneme) + # print("use_bos_phoneme", use_bos_phoneme) pred_phoneme_tokens = ( use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens ).long() # (B, phoneme_stacking_factor) - print("pred_phoneme_tokens", pred_phoneme_tokens) + # print("pred_phoneme_tokens", pred_phoneme_tokens) gt_phoneme_idx = min(idx, gt_phoneme_tokens.size(2) - 1) gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) - print("gt_phoneme_tokens_current", gt_phoneme_tokens_current) + # print("gt_phoneme_tokens_current", 
gt_phoneme_tokens_current) input_phoneme_tokens_current = ( gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens @@ -2126,7 +2166,7 @@ def infer_batch( phoneme_channel_input_t = ( use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding ) - print("use_phoneme_input", use_phoneme_input) + # print("use_phoneme_input", use_phoneme_input) for item_idx in range(actual_batch_size): if use_phoneme_input[item_idx, 0, 0] > 0: for phoneme_channel_idx in range(self.phoneme_stacking_factor): @@ -2202,14 +2242,21 @@ def infer_batch( if use_cfg: next_input = torch.cat([next_input, new_emb_unconditional], dim=0) # (2B, 1, E) + # Update cache_position for current step (needed for NemotronH cached forward) + cache_position = torch.tensor([current_cache_seq_len], device=context_embedding.device) + transformer_out = self.forward( inputs_embeds=next_input, attention_mask=None, use_cache=True, past_key_values=past_kv, + cache_position=cache_position, ) last_hidden = transformer_out.last_hidden_state past_kv = transformer_out.past_key_values + + # Increment sequence length for next iteration + current_cache_seq_len += 1 if len(end_indices) == audio_codes_next.size(0): print("All items finished at timestep {}".format(idx)) break diff --git a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index c4dffba34215..0c9a8c182b71 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -15,3 +15,13 @@ import nemo.collections.tts.modules.adapters import nemo.collections.tts.modules.ffn_modules import nemo.collections.tts.modules.moe_modules +from nemo.collections.tts.modules.nemotron_h_decoder import ( + HybridMambaAttentionDynamicCache, + NemotronHConfig, + NemotronHForCausalLM, + NemotronHModel, +) +from nemo.collections.tts.modules.tacotron2 import Decoder as Taco2Decoder +from nemo.collections.tts.modules.tacotron2 import Encoder as Taco2Encoder 
+from nemo.collections.tts.modules.tacotron2 import Postnet as Taco2Postnet +from nemo.collections.tts.modules.waveglow import WaveGlowModule diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py new file mode 100644 index 000000000000..b33c1ecba663 --- /dev/null +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -0,0 +1,1456 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +NemotronH model implementation for use as a decoder backbone in TTS models. +Ported from: https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/blob/main/modeling_nemotron_h.py + +This is a hybrid Mamba2/Attention model that can be configured with different +layer types (Mamba, Attention, MLP, MoE) via the hybrid_override_pattern config. 
+""" + +import math +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from nemo.utils import logging + + +# Try to import optimized kernels, fall back to pure PyTorch if unavailable +try: + from mamba_ssm.ops.triton.selective_state_update import selective_state_update + from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined + MAMBA_SSM_AVAILABLE = True +except ImportError: + selective_state_update = None + mamba_chunk_scan_combined = None + mamba_split_conv1d_scan_combined = None + MAMBA_SSM_AVAILABLE = False + +try: + from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn + RMSNORM_FN_AVAILABLE = True +except ImportError: + rmsnorm_fn = None + RMSNORM_FN_AVAILABLE = False + +try: + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update + CAUSAL_CONV1D_AVAILABLE = True +except ImportError: + causal_conv1d_fn = None + causal_conv1d_update = None + CAUSAL_CONV1D_AVAILABLE = False + +try: + from flash_attn import flash_attn_func + FLASH_ATTN_AVAILABLE = True +except ImportError: + flash_attn_func = None + FLASH_ATTN_AVAILABLE = False + + +# Check if fast path is available (all optimized kernels present) +IS_FAST_PATH_AVAILABLE = all([ + MAMBA_SSM_AVAILABLE, + CAUSAL_CONV1D_AVAILABLE, + selective_state_update is not None, + mamba_chunk_scan_combined is not None, + causal_conv1d_fn is not None, +]) + + +def get_activation_fn(activation: str): + """Get activation function by name.""" + if activation == "silu" or activation == "swish": + return F.silu + elif activation == "gelu": + return F.gelu + elif activation == "relu": + return F.relu + else: + raise ValueError(f"Unsupported activation: {activation}") + + +@dataclass +class NemotronHConfig: + """ + Configuration class for NemotronH model. 
+ + This configuration controls the hybrid Mamba2/Attention architecture. + The layer types are specified via hybrid_override_pattern where: + - 'M' = Mamba2 layer + - '*' = Attention layer + - '-' = MLP layer + - 'E' = MoE layer + """ + # Model dimensions + hidden_size: int = 1536 + num_hidden_layers: int = 24 + vocab_size: int = 131072 + + # Attention config + num_attention_heads: int = 12 + num_key_value_heads: int = 4 + head_dim: Optional[int] = None + attention_dropout: float = 0.0 + attention_bias: bool = False + max_position_embeddings: int = 4096 + + # Mamba config + mamba_num_heads: int = 64 + mamba_head_dim: int = 64 + ssm_state_size: int = 128 + conv_kernel: int = 4 + n_groups: int = 8 + chunk_size: int = 256 + time_step_min: float = 0.001 + time_step_max: float = 0.1 + time_step_floor: float = 1e-4 + time_step_limit: Tuple[float, float] = (0.0, float("inf")) + mamba_hidden_act: str = "silu" + use_conv_bias: bool = True + use_bias: bool = False + + # MLP config + intermediate_size: int = 4096 + mlp_hidden_act: str = "silu" + mlp_bias: bool = False + + # MoE config (if using MoE layers) + n_routed_experts: int = 8 + num_experts_per_tok: int = 2 + moe_intermediate_size: int = 1024 + moe_shared_expert_intermediate_size: int = 2048 + n_group: int = 1 + topk_group: int = 1 + routed_scaling_factor: float = 1.0 + norm_topk_prob: bool = True + + # Layer pattern: M=Mamba, *=Attention, -=MLP, E=MoE + # Example: "M*M*M*M*" = alternating Mamba and Attention + hybrid_override_pattern: str = "M*M*M*M*M*M*M*M*M*M*M*M*" + + # Normalization + layer_norm_epsilon: float = 1e-5 + residual_in_fp32: bool = True + + # Initialization + initializer_range: float = 0.02 + rescale_prenorm_residual: bool = True + + # Output + use_cache: bool = True + use_return_dict: bool = True + output_attentions: bool = False + output_hidden_states: bool = False + num_logits_to_keep: int = 1 + + # Attention implementation + _attn_implementation: str = "sdpa" # "eager", "sdpa", or 
"flash_attention_2" + + def __post_init__(self): + # Derive layers_block_type from hybrid_override_pattern + pattern_map = {'M': 'mamba', '*': 'attention', '-': 'mlp', 'E': 'moe'} + self.layers_block_type = [pattern_map.get(c, 'mamba') for c in self.hybrid_override_pattern] + + # Ensure num_hidden_layers matches pattern length + if len(self.layers_block_type) != self.num_hidden_layers: + # Extend or truncate pattern to match num_hidden_layers + if len(self.layers_block_type) < self.num_hidden_layers: + # Repeat pattern + full_pattern = self.hybrid_override_pattern * (self.num_hidden_layers // len(self.hybrid_override_pattern) + 1) + self.hybrid_override_pattern = full_pattern[:self.num_hidden_layers] + self.layers_block_type = [pattern_map.get(c, 'mamba') for c in self.hybrid_override_pattern] + else: + self.layers_block_type = self.layers_block_type[:self.num_hidden_layers] + self.hybrid_override_pattern = self.hybrid_override_pattern[:self.num_hidden_layers] + + # Set head_dim if not specified + if self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + + +@dataclass +class NemotronHOutput: + """Output class for NemotronH model.""" + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Any] = None # HybridMambaAttentionDynamicCache + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class NemotronHCausalLMOutput: + """Output class for NemotronH causal LM.""" + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Any] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class HybridMambaAttentionDynamicCache: + """ + A dynamic cache that handles both attention cache (with seq_len dimension) + and mamba cache (with constant shape regardless of seq_len). 
+ """ + + def __init__(self, config: NemotronHConfig, batch_size: int, dtype=torch.float16, device=None): + self.dtype = dtype + self.has_previous_state = False + self.conv_kernel_size = config.conv_kernel + + intermediate_size = config.mamba_num_heads * config.mamba_head_dim + ssm_state_size = config.ssm_state_size + conv_kernel_size = config.conv_kernel + + self.conv_states = [] + self.ssm_states = [] + self.key_cache = [] + self.value_cache = [] + self.transformer_layers = [] + + for i in range(config.num_hidden_layers): + if config.layers_block_type[i] == "mamba": + self.conv_states.append( + torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype) + ) + self.ssm_states.append( + torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype) + ) + else: + self.conv_states.append(torch.tensor([[]] * batch_size, device=device)) + self.ssm_states.append(torch.tensor([[]] * batch_size, device=device)) + self.transformer_layers.append(i) + + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if self.key_cache[layer_idx].shape[-1] == 0: + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + if len(self.key_cache) 
<= layer_idx: + return 0 + return self.key_cache[layer_idx].shape[-2] if self.key_cache[layer_idx].dim() > 2 else 0 + + def update_conv_state(self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False): + if cache_init: + self.conv_states[layer_idx] = new_conv_state.to(self.conv_states[layer_idx].device) + else: + self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1) + self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states[layer_idx].device) + return self.conv_states[layer_idx] + + def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor): + self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states[layer_idx].device) + return self.ssm_states[layer_idx] + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + device = self.key_cache[layer_idx].device + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) + device = self.value_cache[layer_idx].device + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) + + device = self.conv_states[layer_idx].device + self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device)) + device = self.ssm_states[layer_idx].device + self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device)) + + def reset(self): + """Reset all cache states to zero.""" + for i in range(len(self.conv_states)): + if self.conv_states[i].numel() > 0: + self.conv_states[i].zero_() + if self.ssm_states[i].numel() > 0: + self.ssm_states[i].zero_() + for i in range(len(self.key_cache)): + if self.key_cache[i].numel() > 0: + self.key_cache[i].zero_() + if self.value_cache[i].numel() > 0: + self.value_cache[i].zero_() + + +class NemotronHRMSNorm(nn.Module): + """RMSNorm implementation for NemotronH.""" + + 
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        # Learnable per-channel scale; no bias (RMSNorm has no mean centering).
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Normalize in fp32 for numerical stability, then cast back.
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight.to(torch.float32) * hidden_states).to(input_dtype)


class MambaRMSNormGated(nn.Module):
    """Gated RMSNorm for Mamba layers: RMSNorm followed by SiLU gating."""

    def __init__(self, hidden_size: int, group_size: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        # Group size passed to the Triton kernel for grouped normalization.
        self.group_size = group_size

    def forward(self, hidden_states: torch.Tensor, gate: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Only use Triton kernel if available AND tensors are on CUDA
        use_triton = (
            RMSNORM_FN_AVAILABLE
            and rmsnorm_fn is not None
            and hidden_states.is_cuda
        )

        if use_triton:
            # norm_before_gate=False: gate is applied inside, after the norm.
            return rmsnorm_fn(
                x=hidden_states,
                weight=self.weight,
                bias=None,
                z=gate,
                eps=self.variance_epsilon,
                group_size=self.group_size,
                norm_before_gate=False
            )
        else:
            # Fallback: simple RMSNorm + gating (works on CPU and GPU)
            # NOTE(review): this fallback normalizes over the full hidden dim,
            # not per group_size groups like the Triton path — confirm parity.
            input_dtype = hidden_states.dtype
            hidden_states = hidden_states.to(torch.float32)
            variance = hidden_states.pow(2).mean(-1, keepdim=True)
            hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
            hidden_states = (self.weight.to(torch.float32) * hidden_states).to(input_dtype)
            if gate is not None:
                hidden_states = hidden_states * F.silu(gate)
            return hidden_states


def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int):
    """Pad tensor with zeros at the end of the seq_len dim (dim=1)."""
    # F.pad pads last dims first; both shapes target dim=1 of a 3-D or 4-D input.
    pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0)
    return F.pad(input_tensor, pad_shape, mode="constant", value=0)


def reshape_into_chunks(input_tensor, pad_size, chunk_size):
    """Pad seq_len to a multiple of chunk_size and fold it into (num_chunks, chunk_size)."""
    input_tensor = pad_tensor_by_size(input_tensor, pad_size)
    if len(input_tensor.shape) == 3:
        return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2])
    else:
        return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3])


def segment_sum(input_tensor):
    """Compute segment sum for SSM.

    Produces a (..., chunk, chunk) lower-triangular matrix of cumulative sums;
    entries above the diagonal are set to -inf so exp() maps them to 0.
    """
    chunk_size = input_tensor.size(-1)
    input_tensor = input_tensor[..., None].expand(*input_tensor.size(), chunk_size)
    # Strictly-lower-triangular mask: keep contributions from earlier positions only.
    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=-1)
    input_tensor = input_tensor.masked_fill(~mask, 0)
    tensor_segsum = torch.cumsum(input_tensor, dim=-2)
    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=0)
    tensor_segsum = tensor_segsum.masked_fill(~mask, -torch.inf)
    return tensor_segsum


def apply_mask_to_padding_states(hidden_states, attention_mask):
    """Zero out hidden states for padding tokens.

    Skipped when batch size is 1 or the mask covers a single step (decode),
    where padding cannot occur.
    """
    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
        dtype = hidden_states.dtype
        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
    return hidden_states


class NemotronHMamba2Mixer(nn.Module):
    """
    Mamba2 mixer layer implementation.
    Computes state space model operations for sequence modeling.
+ """ + + def __init__(self, config: NemotronHConfig, layer_idx: int): + super().__init__() + self.num_heads = config.mamba_num_heads + self.hidden_size = config.hidden_size + self.ssm_state_size = config.ssm_state_size + self.conv_kernel_size = config.conv_kernel + self.intermediate_size = config.mamba_num_heads * config.mamba_head_dim + self.layer_idx = layer_idx + self.use_conv_bias = config.use_conv_bias + self.activation = config.mamba_hidden_act + self.act = get_activation_fn(config.mamba_hidden_act) + self.layer_norm_epsilon = config.layer_norm_epsilon + self.n_groups = config.n_groups + self.head_dim = config.mamba_head_dim + self.chunk_size = config.chunk_size + self.time_step_limit = config.time_step_limit + self.time_step_min = config.time_step_min + self.time_step_max = config.time_step_max + + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.conv1d = nn.Conv1d( + in_channels=self.conv_dim, + out_channels=self.conv_dim, + bias=config.use_conv_bias, + kernel_size=config.conv_kernel, + groups=self.conv_dim, + padding=config.conv_kernel - 1, + ) + + projection_size = self.intermediate_size + self.conv_dim + self.num_heads + self.in_proj = nn.Linear(self.hidden_size, projection_size, bias=config.use_bias) + + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) + + A = torch.arange(1, self.num_heads + 1) + self.A_log = nn.Parameter(torch.log(A)) + self.A_log._no_weight_decay = True + + self.norm = MambaRMSNormGated( + self.intermediate_size, + eps=self.layer_norm_epsilon, + group_size=self.intermediate_size // self.n_groups + ) + self.D = nn.Parameter(torch.ones(self.num_heads)) + self.D._no_weight_decay = True + + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias) + self.use_bias = config.use_bias + + def forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + 
attention_mask: Optional[torch.Tensor] = None, + ): + # Only use CUDA kernels if available AND tensors are on CUDA + if IS_FAST_PATH_AVAILABLE and hidden_states.is_cuda: + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) + return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) + + def cuda_kernels_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + projected_states = self.in_proj(hidden_states) + + batch_size, seq_len, _ = hidden_states.shape + groups_time_state_size = self.n_groups * self.ssm_state_size + d_mlp = ( + projected_states.shape[-1] + - 2 * self.intermediate_size + - 2 * self.n_groups * self.ssm_state_size + - self.num_heads + ) // 2 + + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + # Cached forward (single token) + _, _, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + hidden_states_B_C = causal_conv1d_update( + hidden_states_B_C, + cache_params.conv_states[self.layer_idx], + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + A = -torch.exp(self.A_log.float()) + A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dt = dt[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D = self.D[:, None, ...].expand(-1, self.head_dim) + B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) + C = 
C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) + hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) + + hidden_states = selective_state_update( + cache_params.ssm_states[self.layer_idx], + hidden_states_reshaped, + dt, A, B, C, D, + z=None, + dt_bias=dt_bias, + dt_softplus=True, + ) + hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) + hidden_states = self.norm(hidden_states, gate) + out = self.out_proj(hidden_states)[:, None, ...] + else: + # Full sequence forward + A = -torch.exp(self.A_log.float()) + dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} + + if self.training and cache_params is None: + out = mamba_split_conv1d_scan_combined( + projected_states, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.dt_bias, + A, + D=self.D, + chunk_size=self.chunk_size, + seq_idx=None, + activation=self.activation, + rmsnorm_weight=self.norm.weight, + rmsnorm_eps=self.norm.variance_epsilon, + outproj_weight=self.out_proj.weight, + outproj_bias=self.out_proj.bias, + headdim=self.head_dim, + ngroups=self.n_groups, + norm_before_gate=False, + return_final_states=False, + **dt_limit_kwargs, + ) + else: + _, _, gate, hidden_states_B_C, dt = projected_states.split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = F.pad( + hidden_states_B_C_transposed, + (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), + ) + cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) + + if self.activation not in ["silu", "swish"]: + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + else: + hidden_states_B_C = causal_conv1d_fn( + x=hidden_states_B_C.transpose(1, 2), + 
weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + ).transpose(1, 2) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + scan_output, ssm_state = mamba_chunk_scan_combined( + hidden_states.view(batch_size, seq_len, -1, self.head_dim), + dt, A, + B.view(batch_size, seq_len, self.n_groups, -1), + C.view(batch_size, seq_len, self.n_groups, -1), + chunk_size=self.chunk_size, + D=self.D, + z=None, + seq_idx=None, + return_final_states=True, + dt_bias=self.dt_bias, + dt_softplus=True, + **dt_limit_kwargs, + ) + + if ssm_state is not None and cache_params is not None: + cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) + + scan_output = scan_output.view(batch_size, seq_len, -1) + scan_output = self.norm(scan_output, gate) + out = self.out_proj(scan_output) + + return out + + def torch_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + """Pure PyTorch implementation (slower but works without CUDA kernels).""" + batch_size, seq_len, _ = hidden_states.shape + dtype = hidden_states.dtype + + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + projected_states = self.in_proj(hidden_states) + + d_mlp = ( + projected_states.shape[-1] - 2 * self.intermediate_size + - 2 * self.n_groups * self.ssm_state_size - self.num_heads + ) // 2 + _, _, gate, hidden_states_B_C, dt = projected_states.split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # Convolution + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + 
cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False) + conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) + hidden_states_B_C = torch.sum(conv_states * self.conv1d.weight.squeeze(1), dim=-1) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = F.pad( + hidden_states_B_C_transposed, + (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # SSM + A = -torch.exp(self.A_log.float()) + + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + # Single step SSM update + cache_device = cache_params.ssm_states[self.layer_idx].device + dt = dt[:, 0, :][:, None, ...] 
+ dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) + dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) + dt = F.softplus(dt + dt_bias.to(dt.dtype)) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + + A_expanded = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dA = (torch.exp(dt[..., None] * A_expanded)).to(device=cache_device) + + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] + B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() + B = B.reshape(batch_size, -1, B.shape[-1]) + dB = dt[..., None] * B[..., None, :] + + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) + dBx = (dB * hidden_states[..., None]).to(device=cache_device) + + cache_params.update_ssm_state( + layer_idx=self.layer_idx, + new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx + ) + + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] + C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() + C = C.reshape(batch_size, -1, C.shape[-1]) + + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) + ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) + C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) + y = torch.bmm(ssm_states_reshaped, C_reshaped) + y = y.view(batch_size, self.num_heads, self.head_dim) + + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) + y = (y + hidden_states * D).to(y.dtype) + y = y.reshape(batch_size, -1)[:, None, ...] 
+ else: + # Full sequence SSM (chunked) + dt = F.softplus(dt + self.dt_bias) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size + D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) + + hidden_states = hidden_states * dt[..., None] + A_dt = A.to(hidden_states.dtype) * dt + + hidden_states, A_dt, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A_dt, B, C)] + + A_dt = A_dt.permute(0, 3, 1, 2) + A_cumsum = torch.cumsum(A_dt, dim=-1) + L = torch.exp(segment_sum(A_dt)) + + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] + G = G_intermediate.sum(dim=-1) + M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] + M = M_intermediate.sum(dim=-1) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) + + decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum)) + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) + + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) + else: + previous_states = torch.zeros_like(states[:, :1]) + + states = torch.cat([previous_states, states], dim=1) + decay_chunk = torch.exp(segment_sum(F.pad(A_cumsum[:, :, :, -1], (1, 0)))) + decay_chunk = decay_chunk.transpose(1, 3) + new_states = (decay_chunk[..., None, None] * 
states[:, :, None, ...]).sum(dim=1) + states, ssm_state = new_states[:, :-1], new_states[:, -1] + + state_decay_out = torch.exp(A_cumsum) + C_times_states = (C[..., None, :] * states[:, :, None, ...]) + state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) + Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) + + y = Y_diag + Y_off + y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) + y = y + D_residual + + if pad_size > 0: + y = y[:, :seq_len, :, :] + y = y.reshape(batch_size, seq_len, -1) + + if ssm_state is not None and cache_params is not None: + cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) + + scan_output = self.norm(y, gate) + contextualized_states = self.out_proj(scan_output.to(dtype)) + return contextualized_states + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """Repeat key/value heads for multi-query attention.""" + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class NemotronHAttention(nn.Module): + """Multi-headed attention for NemotronH.""" + + def __init__(self, config: NemotronHConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = 
nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.head_dim * self.num_heads, self.hidden_size, bias=config.attention_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # Returns (attn_output, None, cache); attention weights are never
        # materialized because SDPA is used (output_attentions is ignored).
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            # Appends along the sequence dim and returns full K/V history.
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)

        # Expand KV heads to match query heads (GQA).
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:
            # Crop the mask to the current KV length.
            causal_mask = attention_mask[:, :, :, :key_states.shape[-2]]

        # SDPA with a custom mask requires contiguous tensors on CUDA.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # Only let SDPA build its own causal mask when no explicit mask is
        # given and we are prefilling (q_len > 1).
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


class NemotronHMLP(nn.Module):
    """MLP layer for NemotronH (up-project, activation, down-project)."""

    def __init__(self, config: NemotronHConfig, intermediate_size: Optional[int] = None, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        # intermediate_size override lets MoE reuse this class with a smaller
        # per-expert width.
        self.intermediate_size = intermediate_size or config.intermediate_size
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = get_activation_fn(config.mlp_hidden_act)

    def forward(self, x):
        return self.down_proj(self.act_fn(self.up_proj(x)))


class NemotronHTopkRouter(nn.Module):
    """
    Top-k router for Mixture of Experts.

    Routes tokens to the top-k experts based on learned routing weights.
    Supports grouped routing where experts are divided into groups and
    top-k groups are selected first, then top-k experts within those groups.
+ """ + + def __init__(self, config: NemotronHConfig): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.n_routed_experts = config.n_routed_experts + self.routed_scaling_factor = config.routed_scaling_factor + self.n_group = config.n_group + self.topk_group = config.topk_group + self.norm_topk_prob = config.norm_topk_prob + + self.weight = nn.Parameter( + torch.empty((self.n_routed_experts, config.hidden_size), dtype=torch.float32) + ) + self.register_buffer( + "e_score_correction_bias", + torch.zeros(self.n_routed_experts, dtype=torch.float32) + ) + + @torch.no_grad() + def get_topk_indices(self, scores: torch.Tensor) -> torch.Tensor: + """Get top-k expert indices using grouped routing.""" + scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) + + # Compute group scores by taking top-2 within each group and summing + group_scores = ( + scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) + .topk(2, dim=-1)[0] + .sum(dim=-1) + ) + + # Select top-k groups + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] + group_mask = torch.zeros_like(group_scores) + group_mask.scatter_(1, group_idx, 1) + + # Create mask for experts in selected groups + score_mask = ( + group_mask.unsqueeze(-1) + .expand(-1, self.n_group, self.n_routed_experts // self.n_group) + .reshape(-1, self.n_routed_experts) + ) + + # Zero out scores for experts not in selected groups + scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) + + # Select top-k experts from remaining + topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1] + return topk_indices + + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Route tokens to experts. 
+ + Args: + hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size) + + Returns: + topk_indices: Indices of selected experts (batch_size * seq_len, top_k) + topk_weights: Weights for selected experts (batch_size * seq_len, top_k) + """ + hidden_states = hidden_states.view(-1, self.config.hidden_size) + + # Compute router logits and convert to probabilities via sigmoid + router_logits = F.linear(hidden_states.float(), self.weight.float()) + scores = router_logits.sigmoid() + + # Get top-k expert indices + topk_indices = self.get_topk_indices(scores) + + # Gather weights for selected experts + topk_weights = scores.gather(1, topk_indices) + + # Optionally normalize weights + if self.norm_topk_prob: + denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 + topk_weights = topk_weights / denominator + + # Apply routing scaling factor + topk_weights = topk_weights * self.routed_scaling_factor + + return topk_indices, topk_weights + + +class NemotronHMOE(nn.Module): + """ + Mixture of Experts layer for NemotronH. + + Combines multiple expert MLPs with a router that selects which experts + to use for each token. Also includes shared experts that are always used. 
+ """ + + def __init__(self, config: NemotronHConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + + # Create routed experts + self.experts = nn.ModuleList([ + NemotronHMLP( + config, + intermediate_size=config.moe_intermediate_size, + layer_idx=layer_idx + ) + for _ in range(config.n_routed_experts) + ]) + + # Router for selecting experts + self.gate = NemotronHTopkRouter(config) + + # Shared experts (always used) + self.shared_experts = NemotronHMLP( + config=config, + intermediate_size=config.moe_shared_expert_intermediate_size, + layer_idx=layer_idx + ) + + def moe( + self, + hidden_states: torch.Tensor, + topk_indices: torch.Tensor, + topk_weights: torch.Tensor + ) -> torch.Tensor: + """ + Apply mixture of experts to hidden states. + + Args: + hidden_states: Input tensor of shape (batch_size * seq_len, hidden_size) + topk_indices: Expert indices of shape (batch_size * seq_len, top_k) + topk_weights: Expert weights of shape (batch_size * seq_len, top_k) + + Returns: + Output tensor of shape (batch_size * seq_len, hidden_size) + """ + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + + # Create one-hot mask for expert selection + expert_mask = F.one_hot(topk_indices, num_classes=len(self.experts)) + expert_mask = expert_mask.permute(2, 0, 1) # (num_experts, batch*seq, top_k) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + + if token_indices.numel() > 0: + # Get weights and inputs for this expert + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + + # Apply expert and weight the output + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + + # Accumulate weighted outputs + final_hidden_states.index_add_(0, token_indices, weighted_output) + 
else: + # No-op compute to mark params as used (for distributed training) + expert_dtype = expert.down_proj.weight.dtype + dummy_input = torch.zeros_like(hidden_states[0]).unsqueeze(0).to(expert_dtype) + dummy_out = expert(dummy_input) + final_hidden_states = final_hidden_states + dummy_out * 0 + + return final_hidden_states.to(hidden_states.dtype) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass through MoE layer. + + Args: + hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size) + + Returns: + Output tensor of shape (batch_size, seq_len, hidden_size) + """ + residuals = hidden_states + orig_shape = hidden_states.shape + + # Route tokens to experts + topk_indices, topk_weights = self.gate(hidden_states) + + # Flatten for expert processing + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + # Apply mixture of experts + hidden_states = self.moe(hidden_states, topk_indices, topk_weights) + + # Reshape back to original shape + hidden_states = hidden_states.view(*orig_shape) + + # Add shared expert output + hidden_states = hidden_states + self.shared_experts(residuals) + + return hidden_states + + +class NemotronHBlock(nn.Module): + """A single block in NemotronH - can be Mamba, Attention, MLP, or MoE.""" + + def __init__(self, config: NemotronHConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.residual_in_fp32 = config.residual_in_fp32 + self.norm = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.block_type = config.layers_block_type[layer_idx] + if self.block_type == "mamba": + self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx) + elif self.block_type == "attention": + self.mixer = NemotronHAttention(config, layer_idx=layer_idx) + elif self.block_type == "mlp": + self.mixer = NemotronHMLP(config, layer_idx=layer_idx) + elif self.block_type == "moe": + self.mixer = NemotronHMOE(config, 
layer_idx=layer_idx) + else: + raise ValueError(f"Invalid block type: {self.block_type}") + + def forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + # Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs + if hidden_states.is_cuda: + with torch.cuda.stream(torch.cuda.default_stream(hidden_states.device)): + return self._forward_impl(hidden_states, cache_params, cache_position, attention_mask) + else: + return self._forward_impl(hidden_states, cache_params, cache_position, attention_mask) + + def _forward_impl( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + residual = hidden_states + hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + + if self.block_type == "mamba": + hidden_states = self.mixer( + hidden_states, cache_params=cache_params, cache_position=cache_position + ) + elif self.block_type == "attention": + hidden_states = self.mixer( + hidden_states, cache_position=cache_position, past_key_value=cache_params + ) + hidden_states = hidden_states[0] + elif self.block_type in ("mlp", "moe"): + hidden_states = self.mixer(hidden_states) + + hidden_states = residual + hidden_states + return hidden_states + + +class NemotronHModel(nn.Module): + """ + NemotronH backbone model. + + This is the main backbone that can be used as a decoder in TTS models. + It exposes the same interface as HuggingFace transformer models. 
+ """ + + def __init__(self, config: NemotronHConfig): + super().__init__() + self.config = config + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList([NemotronHBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]) + self.norm_f = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.gradient_checkpointing = False + self._init_weights() + + def _init_weights(self): + """Initialize weights with special handling for Mamba components.""" + for name, module in self.named_modules(): + if isinstance(module, NemotronHMamba2Mixer): + # Mark parameters that should not have weight decay + module.A_log._no_weight_decay = True + module.D._no_weight_decay = True + + # Special initialization for dt_bias using inverse softplus + # This follows the Mamba2 initialization scheme + dt = torch.exp( + torch.rand(module.num_heads) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clamp(min=self.config.time_step_floor) + + # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + module.dt_bias.copy_(inv_dt) + module.dt_bias._no_reinit = True + + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=self.config.initializer_range) + + # Rescale prenorm residual weights for better training stability + # Following GPT-2 paper: scale by 1/sqrt(2 * n_layer) + if self.config.rescale_prenorm_residual: + for name, p in self.named_parameters(): + if "out_proj.weight" in name: + # Special Scaled Initialization for residual projections + # Scale by 1/sqrt(num_hidden_layers) + with 
torch.no_grad(): + p /= math.sqrt(self.config.num_hidden_layers) + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[Tuple, NemotronHOutput]: + # Support both cache_params and past_key_values for compatibility + if past_key_values is not None and cache_params is None: + cache_params = past_key_values + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + hidden_states = inputs_embeds + + # Create cache if use_cache=True but no cache provided + if use_cache and cache_params is None: + cache_params = HybridMambaAttentionDynamicCache( + self.config, + batch_size=hidden_states.shape[0], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + + if cache_position is None: + cache_position = 
torch.arange(hidden_states.shape[1], device=hidden_states.device) + + # Create causal mask for attention layers + causal_mask = self._create_causal_mask(attention_mask, inputs_embeds, cache_position) + mamba_mask = self._update_mamba_mask(attention_mask, cache_position) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for layer_idx, layer in enumerate(self.layers): + if layer.block_type == "mamba": + layer_mask = mamba_mask + elif layer.block_type == "attention": + layer_mask = causal_mask + else: + layer_mask = None + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + hidden_states = torch.utils.checkpoint.checkpoint( + layer.__call__, hidden_states, cache_params, cache_position, layer_mask + ) + else: + hidden_states = layer( + hidden_states, + cache_params=cache_params, + cache_position=cache_position, + attention_mask=layer_mask, + ) + + hidden_states = self.norm_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) + + return NemotronHOutput( + last_hidden_state=hidden_states, + past_key_values=cache_params if use_cache else None, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _create_causal_mask(self, attention_mask, input_tensor, cache_position): + """Create causal attention mask.""" + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + target_length = cache_position[-1] + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > 
cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + + if attention_mask is not None: + causal_mask = causal_mask.clone() + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) + + return causal_mask + + def _update_mamba_mask(self, attention_mask, cache_position): + """ + Update Mamba mask with optimization. + + No need for zeroing states when: + 1. Cached forward (cache_position[0] > 0) + 2. Attending to all inputs (all mask values are 1) + """ + mamba_mask = attention_mask + if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): + mamba_mask = None + return mamba_mask + + +class NemotronHForCausalLM(nn.Module): + """ + NemotronH model with a language modeling head. + + This is the full model that matches the AutoModelForCausalLM interface. 
+ """ + + def __init__(self, config: NemotronHConfig): + super().__init__() + self.config = config + self.backbone = NemotronHModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self._init_weights() + + def _init_weights(self): + """Initialize weights.""" + nn.init.normal_(self.lm_head.weight, mean=0.0, std=self.config.initializer_range) + + def get_input_embeddings(self): + return self.backbone.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.backbone.set_input_embeddings(new_embeddings) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @property + def model(self): + """Alias for backbone, for HuggingFace compatibility.""" + return self.backbone + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[Tuple, NemotronHCausalLMOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.backbone( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + cache_params=cache_params, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + 
cache_position=cache_position, + ) + + hidden_states = outputs.last_hidden_state if return_dict else outputs[0] + logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float() + + loss = None + if labels is not None: + labels = labels.to(logits.device) + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return NemotronHCausalLMOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + """Prepare inputs for generation.""" + empty_past_kv = past_key_values is None + + # If we have cache: slice input_ids through cache_position to keep only unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids + # Exception 3: with synced GPUs cache_position may go out of bounds + if not empty_past_kv: + if ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0]:] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case + input_ids = input_ids[:, cache_position] + else: + past_key_values = HybridMambaAttentionDynamicCache( + self.config, input_ids.shape[0], self.backbone.embeddings.weight.dtype, device=input_ids.device + ) + + # Create position_ids on the fly for batch generation if not provided + if attention_mask is not None and position_ids is None: + 
position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1]:] + + # If inputs_embeds are passed, only use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} + + model_inputs.update({ + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "cache_position": cache_position, + }) + return model_inputs diff --git a/tests/collections/tts/test_nemotron_h_decoder.py b/tests/collections/tts/test_nemotron_h_decoder.py new file mode 100644 index 000000000000..4b21dc1ae716 --- /dev/null +++ b/tests/collections/tts/test_nemotron_h_decoder.py @@ -0,0 +1,745 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Test script for NemotronH decoder module. + +This script tests: +1. NemotronHConfig initialization +2. NemotronHModel forward pass +3. NemotronHForCausalLM forward pass +4. KV caching for inference +5. 
Interface compatibility with EasyMagpieTTSModel requirements
"""

try:
    import pytest
    PYTEST_AVAILABLE = True
except ImportError:
    PYTEST_AVAILABLE = False
    # Create a dummy pytest fixture decorator for standalone execution
    class pytest:
        @staticmethod
        def fixture(func):
            return func

import torch

from nemo.collections.tts.modules.nemotron_h_decoder import (
    HybridMambaAttentionDynamicCache,
    NemotronHConfig,
    NemotronHForCausalLM,
    NemotronHMLP,
    NemotronHMOE,
    NemotronHModel,
    NemotronHTopkRouter,
)


class TestNemotronHConfig:
    """Test NemotronHConfig initialization and defaults."""

    def test_default_config(self):
        """Test default config initialization."""
        config = NemotronHConfig()
        assert config.hidden_size == 1536
        assert config.num_hidden_layers == 24
        assert len(config.layers_block_type) == config.num_hidden_layers

    def test_custom_pattern(self):
        """Test custom hybrid_override_pattern."""
        # 'M' = mamba, '*' = attention in the override pattern string.
        config = NemotronHConfig(
            num_hidden_layers=8,
            hybrid_override_pattern="M*M*M*M*"
        )
        assert config.layers_block_type == ['mamba', 'attention'] * 4

    def test_pattern_extension(self):
        """Test that short patterns are extended to match num_hidden_layers."""
        config = NemotronHConfig(
            num_hidden_layers=8,
            hybrid_override_pattern="M*"
        )
        assert len(config.layers_block_type) == 8


class TestNemotronHModel:
    """Test NemotronHModel backbone."""

    @pytest.fixture
    def small_config(self):
        """Create a small config for testing."""
        return NemotronHConfig(
            hidden_size=64,
            num_hidden_layers=4,
            vocab_size=1000,
            num_attention_heads=4,
            num_key_value_heads=2,
            mamba_num_heads=8,
            mamba_head_dim=8,
            ssm_state_size=16,
            n_groups=2,
            intermediate_size=128,
            hybrid_override_pattern="M*M*",
        )

    @pytest.fixture
    def model(self, small_config):
        """Create a small model for testing."""
        return NemotronHModel(small_config)

    def test_model_creation(self, model, small_config):
        """Test model can be created."""
        assert model is not None
        assert len(model.layers) == small_config.num_hidden_layers

    def test_forward_with_input_ids(self, model):
        """Test forward pass with input_ids."""
        batch_size, seq_len = 2, 16
        input_ids = torch.randint(0, 1000, (batch_size, seq_len))

        output = model(input_ids=input_ids)

        assert output.last_hidden_state is not None
        assert output.last_hidden_state.shape == (batch_size, seq_len, 64)

    def test_forward_with_inputs_embeds(self, model):
        """Test forward pass with inputs_embeds (required for TTS)."""
        batch_size, seq_len, hidden_size = 2, 16, 64
        inputs_embeds = torch.randn(batch_size, seq_len, hidden_size)

        output = model(inputs_embeds=inputs_embeds)

        assert output.last_hidden_state is not None
        assert output.last_hidden_state.shape == (batch_size, seq_len, hidden_size)

    def test_get_set_input_embeddings(self, model):
        """Test get/set input embeddings interface."""
        original_embeddings = model.get_input_embeddings()
        assert original_embeddings is not None

        new_embeddings = torch.nn.Embedding(100, 64)
        model.set_input_embeddings(new_embeddings)

        assert model.get_input_embeddings() is new_embeddings


class TestNemotronHForCausalLM:
    """Test NemotronHForCausalLM full model."""

    @pytest.fixture
    def small_config(self):
        """Create a small config for testing."""
        return NemotronHConfig(
            hidden_size=64,
            num_hidden_layers=4,
            vocab_size=1000,
            num_attention_heads=4,
            num_key_value_heads=2,
            mamba_num_heads=8,
            mamba_head_dim=8,
            ssm_state_size=16,
            n_groups=2,
            intermediate_size=128,
            hybrid_override_pattern="M*M*",
        )

    @pytest.fixture
    def model(self, small_config):
        """Create a small model for testing."""
        return NemotronHForCausalLM(small_config)

    def test_model_creation(self, model, small_config):
        """Test model can be created."""
        assert model is not None
        assert model.backbone is not None
        assert model.lm_head is not None

    def test_model_alias(self, model):
        """Test that model.model returns backbone (HF compatibility)."""
        assert model.model is model.backbone

    def test_forward_with_inputs_embeds(self, model):
        """Test forward pass with inputs_embeds."""
        batch_size, seq_len, hidden_size = 2, 16, 64
        inputs_embeds = torch.randn(batch_size, seq_len, hidden_size)

        output = model(inputs_embeds=inputs_embeds)

        assert output.logits is not None
        assert output.logits.shape == (batch_size, seq_len, 1000)  # vocab_size

    def test_interface_compatibility(self, model):
        """Test that model satisfies EasyMagpieTTSModel interface requirements."""
        # Test 1: decoder.get_input_embeddings()
        embeddings = model.backbone.get_input_embeddings()
        assert embeddings is not None

        # Test 2: decoder.set_input_embeddings()
        new_emb = torch.nn.Embedding(100, 64)
        model.backbone.set_input_embeddings(new_emb)
        assert model.backbone.get_input_embeddings() is new_emb

        # Reset for next tests
        model.backbone.set_input_embeddings(embeddings)

        # Test 3: decoder(inputs_embeds, attention_mask, use_cache, past_key_values)
        batch_size, seq_len, hidden_size = 2, 16, 64
        inputs_embeds = torch.randn(batch_size, seq_len, hidden_size)
        attention_mask = torch.ones(batch_size, seq_len)

        output = model.backbone(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            use_cache=False,
            past_key_values=None,
        )

        # Test 4: Return .last_hidden_state
        assert hasattr(output, 'last_hidden_state')
        assert output.last_hidden_state is not None

        # Test 5: Return .past_key_values (when use_cache=True not tested here as it requires more setup)
        assert hasattr(output, 'past_key_values')


class TestHybridCache:
    """Test HybridMambaAttentionDynamicCache."""

    def test_cache_creation(self):
        """Test cache can be created."""
        config = NemotronHConfig(
            hidden_size=64,
            num_hidden_layers=4,
            mamba_num_heads=8,
            mamba_head_dim=8,
            ssm_state_size=16,
            conv_kernel=4,
            hybrid_override_pattern="M*M*",
        )

        batch_size = 2
        cache = HybridMambaAttentionDynamicCache(config, batch_size, dtype=torch.float32)

        # Cache keeps per-layer state lists for both mamba and attention layers.
        assert len(cache.conv_states) == config.num_hidden_layers
        assert len(cache.ssm_states) == config.num_hidden_layers
        assert len(cache.key_cache) == config.num_hidden_layers
        assert len(cache.value_cache) == config.num_hidden_layers


class TestNemotronHCausality:
    """Test that NemotronH model is causal (future timesteps don't affect previous ones)."""

    @pytest.fixture
    def small_config(self):
        """Create a small config for testing causality."""
        return NemotronHConfig(
            hidden_size=64,
            num_hidden_layers=4,
            vocab_size=1000,
            num_attention_heads=4,
            num_key_value_heads=2,
            mamba_num_heads=8,
            mamba_head_dim=8,
            ssm_state_size=16,
            n_groups=2,
            intermediate_size=128,
            hybrid_override_pattern="M*M*",
        )

    @pytest.fixture
    def model(self, small_config):
        """Create a small model for testing."""
        model = NemotronHModel(small_config)
        model.eval()  # Set to eval mode for deterministic behavior
        return model

    def test_causality_with_input_modification(self, model, small_config):
        """
        Test causality by modifying future timesteps and checking that earlier outputs are unchanged.

        The test:
        1. Pass sequence through the model
        2. Modify a future timestep in the input
        3. Verify outputs at earlier timesteps remain exactly the same
        """
        batch_size, seq_len = 2, 16
        hidden_size = small_config.hidden_size

        # Create a base input
        torch.manual_seed(42)
        inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size)

        # Get output with original input
        with torch.no_grad():
            output_original = model(inputs_embeds=inputs_embeds_original.clone())

        # Test at different positions
        test_positions = [seq_len // 4, seq_len // 2, 3 * seq_len // 4]

        for modify_pos in test_positions:
            # Create modified input where we change timesteps from modify_pos onwards
            inputs_embeds_modified = inputs_embeds_original.clone()
            # Add random noise to all positions from modify_pos onwards
            inputs_embeds_modified[:, modify_pos:, :] += torch.randn(
                batch_size, seq_len - modify_pos, hidden_size
            ) * 10.0  # Large modification to ensure it would affect outputs if not causal

            # Get output with modified input
            with torch.no_grad():
                output_modified = model(inputs_embeds=inputs_embeds_modified)

            # Check that outputs BEFORE modify_pos are unchanged
            outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :]
            outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :]

            # Should be exactly equal (within floating point tolerance)
            assert torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5), \
                f"Causality violation: modifying position {modify_pos} affected earlier positions"

            # Verify that outputs AT and AFTER modify_pos are different (sanity check)
            outputs_after_original = output_original.last_hidden_state[:, modify_pos:, :]
            outputs_after_modified = output_modified.last_hidden_state[:, modify_pos:, :]

            assert not torch.allclose(outputs_after_original, outputs_after_modified, atol=1e-3), \
                f"Sanity check failed: modifying position {modify_pos} should affect outputs at/after that position"

    def test_causality_incremental_vs_full(self, model, small_config):
        """
        Test causality by comparing incremental (token-by-token) vs full sequence processing.

        A causal model should produce the same output whether we:
        1. Process the full sequence at once
        2. Process tokens incrementally one at a time
        """
        batch_size, seq_len = 1, 8  # Smaller seq for incremental test
        hidden_size = small_config.hidden_size

        torch.manual_seed(123)
        inputs_embeds = torch.randn(batch_size, seq_len, hidden_size)

        # Get output from full sequence
        with torch.no_grad():
            output_full = model(inputs_embeds=inputs_embeds)

        # Get outputs incrementally (one token at a time)
        # For a causal model, output at each position should match
        incremental_outputs = []
        for t in range(1, seq_len + 1):
            with torch.no_grad():
                partial_output = model(inputs_embeds=inputs_embeds[:, :t, :])
                # Take only the last timestep output for comparison
                incremental_outputs.append(partial_output.last_hidden_state[:, -1:, :])

        # Stack incremental outputs
        output_incremental = torch.cat(incremental_outputs, dim=1)

        # Compare: the full sequence output should match the incrementally computed outputs
        assert torch.allclose(output_full.last_hidden_state, output_incremental, atol=1e-4), \
            "Causality violation: incremental processing produces different results than full sequence"

    def test_causality_causal_lm(self, small_config):
        """Test causality for NemotronHForCausalLM."""
        model = NemotronHForCausalLM(small_config)
        model.eval()

        batch_size, seq_len = 2, 12
        hidden_size = small_config.hidden_size

        torch.manual_seed(456)
        inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size)

        modify_pos = seq_len // 2

        # Get logits with original input
        with torch.no_grad():
            output_original = model(inputs_embeds=inputs_embeds_original.clone())

        # Modify future positions
        inputs_embeds_modified = inputs_embeds_original.clone()
        inputs_embeds_modified[:, modify_pos:, :] += torch.randn(
            batch_size, seq_len - modify_pos, hidden_size
        ) * 10.0

        with torch.no_grad():
            output_modified = model(inputs_embeds=inputs_embeds_modified)

        # Check logits before modify_pos are unchanged
        logits_before_original = output_original.logits[:, :modify_pos, :]
        logits_before_modified = output_modified.logits[:, :modify_pos, :]

        assert torch.allclose(logits_before_original, logits_before_modified, atol=1e-5), \
            "Causality violation in CausalLM: modifying future positions affected earlier logits"

    def test_causality_different_layer_types(self):
        """Test causality with different hybrid patterns (Mamba-only, Attention-only, mixed)."""
        patterns = [
            "MMMM",  # Mamba only
            "****",  # Attention only
            "M*M*",  # Alternating
            "MM**",  # Mixed blocks
        ]

        for pattern in patterns:
            config = NemotronHConfig(
                hidden_size=64,
                num_hidden_layers=4,
                vocab_size=1000,
                num_attention_heads=4,
                num_key_value_heads=2,
                mamba_num_heads=8,
                mamba_head_dim=8,
                ssm_state_size=16,
                n_groups=2,
                intermediate_size=128,
                hybrid_override_pattern=pattern,
            )

            model = NemotronHModel(config)
            model.eval()

            batch_size, seq_len = 2, 8
            hidden_size = config.hidden_size

            torch.manual_seed(789)
            inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size)

            modify_pos = 4

            with torch.no_grad():
                output_original = model(inputs_embeds=inputs_embeds_original.clone())

            inputs_embeds_modified = inputs_embeds_original.clone()
            inputs_embeds_modified[:, modify_pos:, :] += torch.randn(
                batch_size, seq_len - modify_pos, hidden_size
            ) * 10.0

            with torch.no_grad():
                output_modified = model(inputs_embeds=inputs_embeds_modified)

            outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :]
            outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :]

            assert torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5), \
                f"Causality violation for pattern '{pattern}': modifying future positions affected 
earlier outputs" + + +class TestMoELayer: + """Test Mixture of Experts layer.""" + + @pytest.fixture + def moe_config(self): + """Create a config for MoE testing.""" + return NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + # MoE config + n_routed_experts=4, + num_experts_per_tok=2, + moe_intermediate_size=64, + moe_shared_expert_intermediate_size=128, + n_group=1, + topk_group=1, + routed_scaling_factor=1.0, + norm_topk_prob=True, + hybrid_override_pattern="M*ME", # Includes MoE layer + ) + + def test_topk_router_creation(self, moe_config): + """Test NemotronHTopkRouter creation.""" + router = NemotronHTopkRouter(moe_config) + assert router.weight.shape == (moe_config.n_routed_experts, moe_config.hidden_size) + assert router.top_k == moe_config.num_experts_per_tok + + def test_topk_router_forward(self, moe_config): + """Test NemotronHTopkRouter forward pass.""" + router = NemotronHTopkRouter(moe_config) + batch_size, seq_len = 2, 8 + hidden_states = torch.randn(batch_size, seq_len, moe_config.hidden_size) + + topk_indices, topk_weights = router(hidden_states) + + # Check shapes + assert topk_indices.shape == (batch_size * seq_len, moe_config.num_experts_per_tok) + assert topk_weights.shape == (batch_size * seq_len, moe_config.num_experts_per_tok) + + # Check indices are valid + assert topk_indices.min() >= 0 + assert topk_indices.max() < moe_config.n_routed_experts + + def test_moe_layer_creation(self, moe_config): + """Test NemotronHMOE creation.""" + moe = NemotronHMOE(moe_config, layer_idx=0) + + assert len(moe.experts) == moe_config.n_routed_experts + assert moe.gate is not None + assert moe.shared_experts is not None + + def test_moe_layer_forward(self, moe_config): + """Test NemotronHMOE forward pass.""" + moe = NemotronHMOE(moe_config, layer_idx=0) + batch_size, seq_len = 2, 8 + 
hidden_states = torch.randn(batch_size, seq_len, moe_config.hidden_size) + + output = moe(hidden_states) + + assert output.shape == hidden_states.shape + + def test_model_with_moe_pattern(self, moe_config): + """Test full model with MoE layer.""" + model = NemotronHModel(moe_config) + + # Check that MoE layer was created + assert model.layers[3].block_type == "moe" + + # Test forward pass + batch_size, seq_len = 2, 8 + inputs_embeds = torch.randn(batch_size, seq_len, moe_config.hidden_size) + + output = model(inputs_embeds=inputs_embeds) + + assert output.last_hidden_state is not None + assert output.last_hidden_state.shape == (batch_size, seq_len, moe_config.hidden_size) + + +if __name__ == "__main__": + """Run basic tests without pytest.""" + print("Testing NemotronH Decoder Module...") + + # Test 1: Config + print("\n1. Testing NemotronHConfig...") + config = NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + hybrid_override_pattern="M*M*", + ) + print(f" Config created: {config.num_hidden_layers} layers, pattern={config.hybrid_override_pattern}") + print(f" Layer types: {config.layers_block_type}") + + # Test 2: Model creation + print("\n2. Testing NemotronHModel creation...") + model = NemotronHModel(config) + print(f" Model created with {len(model.layers)} layers") + + # Test 3: Forward pass with inputs_embeds + print("\n3. Testing forward pass with inputs_embeds...") + batch_size, seq_len, hidden_size = 2, 16, 64 + inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) + output = model(inputs_embeds=inputs_embeds) + print(f" Input shape: {inputs_embeds.shape}") + print(f" Output shape: {output.last_hidden_state.shape}") + + # Test 4: Full model + print("\n4. 
Testing NemotronHForCausalLM...") + full_model = NemotronHForCausalLM(config) + output = full_model(inputs_embeds=inputs_embeds) + print(f" Logits shape: {output.logits.shape}") + + # Test 5: Interface compatibility + print("\n5. Testing interface compatibility for EasyMagpieTTSModel...") + decoder = full_model.backbone + + # get_input_embeddings + emb = decoder.get_input_embeddings() + print(f" get_input_embeddings(): {type(emb).__name__}") + + # set_input_embeddings + new_emb = torch.nn.Embedding(100, 64) + decoder.set_input_embeddings(new_emb) + print(f" set_input_embeddings(): OK") + decoder.set_input_embeddings(emb) # Reset + + # forward with expected args + output = decoder( + inputs_embeds=inputs_embeds, + attention_mask=torch.ones(batch_size, seq_len), + use_cache=False, + past_key_values=None, + ) + print(f" forward(inputs_embeds, attention_mask, use_cache, past_key_values): OK") + print(f" .last_hidden_state: {output.last_hidden_state.shape}") + print(f" .past_key_values: {output.past_key_values}") + + # Test 6: MoE layer + print("\n6. 
Testing MoE (Mixture of Experts) layer...") + moe_config = NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + # MoE config + n_routed_experts=4, + num_experts_per_tok=2, + moe_intermediate_size=64, + moe_shared_expert_intermediate_size=128, + n_group=1, + topk_group=1, + routed_scaling_factor=1.0, + norm_topk_prob=True, + hybrid_override_pattern="M*ME", # Includes MoE layer + ) + print(f" Config: pattern={moe_config.hybrid_override_pattern}, block_types={moe_config.layers_block_type}") + + # Test router + router = NemotronHTopkRouter(moe_config) + test_input = torch.randn(2, 8, 64) + topk_indices, topk_weights = router(test_input) + print(f" Router: topk_indices shape={topk_indices.shape}, topk_weights shape={topk_weights.shape}") + + # Test MoE layer + moe = NemotronHMOE(moe_config, layer_idx=0) + moe_output = moe(test_input) + print(f" MoE layer: input={test_input.shape}, output={moe_output.shape}") + + # Test full model with MoE + moe_model = NemotronHModel(moe_config) + moe_model_output = moe_model(inputs_embeds=test_input) + print(f" Full model with MoE: output={moe_model_output.last_hidden_state.shape}") + + # Test 7: Causality test + print("\n7. 
Testing model causality (future timesteps don't affect previous ones)...") + + # Create model for causality test + causality_config = NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + hybrid_override_pattern="M*M*", + ) + causality_model = NemotronHModel(causality_config) + causality_model.eval() + + batch_size, seq_len = 2, 16 + hidden_size = 64 + + # Create base input + torch.manual_seed(42) + inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) + + # Get output with original input + with torch.no_grad(): + output_original = causality_model(inputs_embeds=inputs_embeds_original.clone()) + + # Test at different positions + test_positions = [4, 8, 12] + causality_passed = True + + for modify_pos in test_positions: + # Create modified input where we change timesteps from modify_pos onwards + inputs_embeds_modified = inputs_embeds_original.clone() + inputs_embeds_modified[:, modify_pos:, :] += torch.randn( + batch_size, seq_len - modify_pos, hidden_size + ) * 10.0 + + # Get output with modified input + with torch.no_grad(): + output_modified = causality_model(inputs_embeds=inputs_embeds_modified) + + # Check that outputs BEFORE modify_pos are unchanged + outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :] + outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :] + + if torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5): + print(f" Position {modify_pos}: PASS (earlier outputs unchanged)") + else: + print(f" Position {modify_pos}: FAIL (causality violation!)") + causality_passed = False + + # Verify outputs at/after modify_pos are different (sanity check) + outputs_after_original = output_original.last_hidden_state[:, modify_pos:, :] + outputs_after_modified = output_modified.last_hidden_state[:, 
modify_pos:, :] + + if not torch.allclose(outputs_after_original, outputs_after_modified, atol=1e-3): + print(f" Position {modify_pos}: Sanity check PASS (later outputs changed)") + else: + print(f" Position {modify_pos}: Sanity check FAIL (later outputs should change)") + causality_passed = False + + # Test with different layer patterns + print("\n Testing causality with different layer patterns...") + patterns = ["MMMM", "****", "M*M*", "MM**"] + for pattern in patterns: + pattern_config = NemotronHConfig( + hidden_size=64, + num_hidden_layers=4, + vocab_size=1000, + num_attention_heads=4, + num_key_value_heads=2, + mamba_num_heads=8, + mamba_head_dim=8, + ssm_state_size=16, + n_groups=2, + intermediate_size=128, + hybrid_override_pattern=pattern, + ) + pattern_model = NemotronHModel(pattern_config) + pattern_model.eval() + + torch.manual_seed(789) + test_input = torch.randn(2, 8, 64) + modify_pos = 4 + + with torch.no_grad(): + out_orig = pattern_model(inputs_embeds=test_input.clone()) + + test_input_mod = test_input.clone() + test_input_mod[:, modify_pos:, :] += torch.randn(2, 4, 64) * 10.0 + + with torch.no_grad(): + out_mod = pattern_model(inputs_embeds=test_input_mod) + + if torch.allclose(out_orig.last_hidden_state[:, :modify_pos, :], + out_mod.last_hidden_state[:, :modify_pos, :], atol=1e-5): + print(f" Pattern '{pattern}': PASS") + else: + print(f" Pattern '{pattern}': FAIL (causality violation!)") + causality_passed = False + + if causality_passed: + print(" All causality tests PASSED!") + else: + print(" WARNING: Some causality tests FAILED!") + + print("\n" + "="*50) + print("All tests passed!") + print("="*50) From 3c055497da83ab3fd4be151dbba404e530c0ceff Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Mon, 2 Feb 2026 19:13:01 +0000 Subject: [PATCH 31/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 14 +- .../tts/modules/nemotron_h_decoder.py | 568 +++++++++--------- 
.../tts/test_nemotron_h_decoder.py | 309 +++++----- 3 files changed, 450 insertions(+), 441 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index dabdd0ae6f30..3f107736604a 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -328,7 +328,9 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): nemotron_model = NemotronHForCausalLM(nemotron_config) self.decoder = nemotron_model.backbone self.lm_text_head = nemotron_model.lm_head - logging.info(f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}...") + logging.info( + f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}..." + ) else: raise ValueError(f"Unknown decoder_type: {self.decoder_type}. Supported: 'huggingface', 'nemotron_h'") @@ -2031,10 +2033,10 @@ def infer_batch( ] # (2B, T_min, E) else: first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) - + # Initialize cache_position for tracking sequence position (needed for NemotronH) cache_position = torch.arange(min_context_len, device=context_embedding.device) - + # First forward pass to get the initial hidden state and past key values transformer_out = self.forward( inputs_embeds=first_inference_input, @@ -2047,7 +2049,7 @@ def infer_batch( time_to_first_prediction = time.time() - start_time last_hidden = transformer_out.last_hidden_state # (B, T_total, E) past_kv = transformer_out.past_key_values - + # Track the current sequence length for cache_position updates current_cache_seq_len = min_context_len @@ -2244,7 +2246,7 @@ def infer_batch( # Update cache_position for current step (needed for NemotronH cached forward) cache_position = torch.tensor([current_cache_seq_len], device=context_embedding.device) - + transformer_out = self.forward( 
inputs_embeds=next_input, attention_mask=None, @@ -2254,7 +2256,7 @@ def infer_batch( ) last_hidden = transformer_out.last_hidden_state past_kv = transformer_out.past_key_values - + # Increment sequence length for next iteration current_cache_seq_len += 1 if len(end_indices) == audio_codes_next.size(0): diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py index b33c1ecba663..f89e0a8fd326 100644 --- a/nemo/collections/tts/modules/nemotron_h_decoder.py +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -37,6 +37,7 @@ try: from mamba_ssm.ops.triton.selective_state_update import selective_state_update from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined + MAMBA_SSM_AVAILABLE = True except ImportError: selective_state_update = None @@ -46,6 +47,7 @@ try: from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn + RMSNORM_FN_AVAILABLE = True except ImportError: rmsnorm_fn = None @@ -53,6 +55,7 @@ try: from causal_conv1d import causal_conv1d_fn, causal_conv1d_update + CAUSAL_CONV1D_AVAILABLE = True except ImportError: causal_conv1d_fn = None @@ -61,6 +64,7 @@ try: from flash_attn import flash_attn_func + FLASH_ATTN_AVAILABLE = True except ImportError: flash_attn_func = None @@ -68,13 +72,15 @@ # Check if fast path is available (all optimized kernels present) -IS_FAST_PATH_AVAILABLE = all([ - MAMBA_SSM_AVAILABLE, - CAUSAL_CONV1D_AVAILABLE, - selective_state_update is not None, - mamba_chunk_scan_combined is not None, - causal_conv1d_fn is not None, -]) +IS_FAST_PATH_AVAILABLE = all( + [ + MAMBA_SSM_AVAILABLE, + CAUSAL_CONV1D_AVAILABLE, + selective_state_update is not None, + mamba_chunk_scan_combined is not None, + causal_conv1d_fn is not None, + ] +) def get_activation_fn(activation: str): @@ -93,19 +99,20 @@ def get_activation_fn(activation: str): class NemotronHConfig: """ Configuration class for NemotronH model. 
- + This configuration controls the hybrid Mamba2/Attention architecture. The layer types are specified via hybrid_override_pattern where: - 'M' = Mamba2 layer - - '*' = Attention layer + - '*' = Attention layer - '-' = MLP layer - 'E' = MoE layer """ + # Model dimensions hidden_size: int = 1536 num_hidden_layers: int = 24 vocab_size: int = 131072 - + # Attention config num_attention_heads: int = 12 num_key_value_heads: int = 4 @@ -113,7 +120,7 @@ class NemotronHConfig: attention_dropout: float = 0.0 attention_bias: bool = False max_position_embeddings: int = 4096 - + # Mamba config mamba_num_heads: int = 64 mamba_head_dim: int = 64 @@ -128,12 +135,12 @@ class NemotronHConfig: mamba_hidden_act: str = "silu" use_conv_bias: bool = True use_bias: bool = False - + # MLP config intermediate_size: int = 4096 mlp_hidden_act: str = "silu" mlp_bias: bool = False - + # MoE config (if using MoE layers) n_routed_experts: int = 8 num_experts_per_tok: int = 2 @@ -143,46 +150,48 @@ class NemotronHConfig: topk_group: int = 1 routed_scaling_factor: float = 1.0 norm_topk_prob: bool = True - + # Layer pattern: M=Mamba, *=Attention, -=MLP, E=MoE # Example: "M*M*M*M*" = alternating Mamba and Attention hybrid_override_pattern: str = "M*M*M*M*M*M*M*M*M*M*M*M*" - + # Normalization layer_norm_epsilon: float = 1e-5 residual_in_fp32: bool = True - + # Initialization initializer_range: float = 0.02 rescale_prenorm_residual: bool = True - + # Output use_cache: bool = True use_return_dict: bool = True output_attentions: bool = False output_hidden_states: bool = False num_logits_to_keep: int = 1 - + # Attention implementation _attn_implementation: str = "sdpa" # "eager", "sdpa", or "flash_attention_2" - + def __post_init__(self): # Derive layers_block_type from hybrid_override_pattern pattern_map = {'M': 'mamba', '*': 'attention', '-': 'mlp', 'E': 'moe'} self.layers_block_type = [pattern_map.get(c, 'mamba') for c in self.hybrid_override_pattern] - + # Ensure num_hidden_layers matches pattern 
length if len(self.layers_block_type) != self.num_hidden_layers: # Extend or truncate pattern to match num_hidden_layers if len(self.layers_block_type) < self.num_hidden_layers: # Repeat pattern - full_pattern = self.hybrid_override_pattern * (self.num_hidden_layers // len(self.hybrid_override_pattern) + 1) - self.hybrid_override_pattern = full_pattern[:self.num_hidden_layers] + full_pattern = self.hybrid_override_pattern * ( + self.num_hidden_layers // len(self.hybrid_override_pattern) + 1 + ) + self.hybrid_override_pattern = full_pattern[: self.num_hidden_layers] self.layers_block_type = [pattern_map.get(c, 'mamba') for c in self.hybrid_override_pattern] else: - self.layers_block_type = self.layers_block_type[:self.num_hidden_layers] - self.hybrid_override_pattern = self.hybrid_override_pattern[:self.num_hidden_layers] - + self.layers_block_type = self.layers_block_type[: self.num_hidden_layers] + self.hybrid_override_pattern = self.hybrid_override_pattern[: self.num_hidden_layers] + # Set head_dim if not specified if self.head_dim is None: self.head_dim = self.hidden_size // self.num_attention_heads @@ -191,6 +200,7 @@ def __post_init__(self): @dataclass class NemotronHOutput: """Output class for NemotronH model.""" + last_hidden_state: Optional[torch.FloatTensor] = None past_key_values: Optional[Any] = None # HybridMambaAttentionDynamicCache hidden_states: Optional[Tuple[torch.FloatTensor]] = None @@ -200,6 +210,7 @@ class NemotronHOutput: @dataclass class NemotronHCausalLMOutput: """Output class for NemotronH causal LM.""" + loss: Optional[torch.FloatTensor] = None logits: Optional[torch.FloatTensor] = None past_key_values: Optional[Any] = None @@ -209,25 +220,25 @@ class NemotronHCausalLMOutput: class HybridMambaAttentionDynamicCache: """ - A dynamic cache that handles both attention cache (with seq_len dimension) + A dynamic cache that handles both attention cache (with seq_len dimension) and mamba cache (with constant shape regardless of seq_len). 
""" - + def __init__(self, config: NemotronHConfig, batch_size: int, dtype=torch.float16, device=None): self.dtype = dtype self.has_previous_state = False self.conv_kernel_size = config.conv_kernel - + intermediate_size = config.mamba_num_heads * config.mamba_head_dim ssm_state_size = config.ssm_state_size conv_kernel_size = config.conv_kernel - + self.conv_states = [] self.ssm_states = [] self.key_cache = [] self.value_cache = [] self.transformer_layers = [] - + for i in range(config.num_hidden_layers): if config.layers_block_type[i] == "mamba": self.conv_states.append( @@ -240,10 +251,10 @@ def __init__(self, config: NemotronHConfig, batch_size: int, dtype=torch.float16 self.conv_states.append(torch.tensor([[]] * batch_size, device=device)) self.ssm_states.append(torch.tensor([[]] * batch_size, device=device)) self.transformer_layers.append(i) - + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] - + def update( self, key_states: torch.Tensor, @@ -258,13 +269,13 @@ def update( self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) return self.key_cache[layer_idx], self.value_cache[layer_idx] - + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx if len(self.key_cache) <= layer_idx: return 0 return self.key_cache[layer_idx].shape[-2] if self.key_cache[layer_idx].dim() > 2 else 0 - + def update_conv_state(self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False): if cache_init: self.conv_states[layer_idx] = new_conv_state.to(self.conv_states[layer_idx].device) @@ -272,11 +283,11 @@ def update_conv_state(self, layer_idx: int, new_conv_state: torch.Tensor, 
cache_ self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1) self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states[layer_idx].device) return self.conv_states[layer_idx] - + def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor): self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states[layer_idx].device) return self.ssm_states[layer_idx] - + def reorder_cache(self, beam_idx: torch.LongTensor): """Reorders the cache for beam search, given the selected beam indices.""" for layer_idx in range(len(self.key_cache)): @@ -284,12 +295,12 @@ def reorder_cache(self, beam_idx: torch.LongTensor): self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) device = self.value_cache[layer_idx].device self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) - + device = self.conv_states[layer_idx].device self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device)) device = self.ssm_states[layer_idx].device self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device)) - + def reset(self): """Reset all cache states to zero.""" for i in range(len(self.conv_states)): @@ -306,12 +317,12 @@ def reset(self): class NemotronHRMSNorm(nn.Module): """RMSNorm implementation for NemotronH.""" - + def __init__(self, hidden_size: int, eps: float = 1e-6): super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps - + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) @@ -322,21 +333,17 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class MambaRMSNormGated(nn.Module): """Gated RMSNorm for Mamba layers.""" - + def __init__(self, hidden_size: int, group_size: int, eps: float = 1e-5): super().__init__() self.weight = 
nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps self.group_size = group_size - + def forward(self, hidden_states: torch.Tensor, gate: Optional[torch.Tensor] = None) -> torch.Tensor: # Only use Triton kernel if available AND tensors are on CUDA - use_triton = ( - RMSNORM_FN_AVAILABLE - and rmsnorm_fn is not None - and hidden_states.is_cuda - ) - + use_triton = RMSNORM_FN_AVAILABLE and rmsnorm_fn is not None and hidden_states.is_cuda + if use_triton: return rmsnorm_fn( x=hidden_states, @@ -345,7 +352,7 @@ def forward(self, hidden_states: torch.Tensor, gate: Optional[torch.Tensor] = No z=gate, eps=self.variance_epsilon, group_size=self.group_size, - norm_before_gate=False + norm_before_gate=False, ) else: # Fallback: simple RMSNorm + gating (works on CPU and GPU) @@ -371,7 +378,9 @@ def reshape_into_chunks(input_tensor, pad_size, chunk_size): if len(input_tensor.shape) == 3: return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2]) else: - return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3]) + return input_tensor.reshape( + input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3] + ) def segment_sum(input_tensor): @@ -399,7 +408,7 @@ class NemotronHMamba2Mixer(nn.Module): Mamba2 mixer layer implementation. Computes state space model operations for sequence modeling. 
""" - + def __init__(self, config: NemotronHConfig, layer_idx: int): super().__init__() self.num_heads = config.mamba_num_heads @@ -418,7 +427,7 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): self.time_step_limit = config.time_step_limit self.time_step_min = config.time_step_min self.time_step_max = config.time_step_max - + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size self.conv1d = nn.Conv1d( in_channels=self.conv_dim, @@ -428,27 +437,25 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): groups=self.conv_dim, padding=config.conv_kernel - 1, ) - + projection_size = self.intermediate_size + self.conv_dim + self.num_heads self.in_proj = nn.Linear(self.hidden_size, projection_size, bias=config.use_bias) - + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) - + A = torch.arange(1, self.num_heads + 1) self.A_log = nn.Parameter(torch.log(A)) self.A_log._no_weight_decay = True - + self.norm = MambaRMSNormGated( - self.intermediate_size, - eps=self.layer_norm_epsilon, - group_size=self.intermediate_size // self.n_groups + self.intermediate_size, eps=self.layer_norm_epsilon, group_size=self.intermediate_size // self.n_groups ) self.D = nn.Parameter(torch.ones(self.num_heads)) self.D._no_weight_decay = True - + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias) self.use_bias = config.use_bias - + def forward( self, hidden_states: torch.Tensor, @@ -460,7 +467,7 @@ def forward( if IS_FAST_PATH_AVAILABLE and hidden_states.is_cuda: return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) - + def cuda_kernels_forward( self, hidden_states: torch.Tensor, @@ -470,7 +477,7 @@ def cuda_kernels_forward( ): hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) projected_states = self.in_proj(hidden_states) - + batch_size, seq_len, _ = 
hidden_states.shape groups_time_state_size = self.n_groups * self.ssm_state_size d_mlp = ( @@ -479,13 +486,13 @@ def cuda_kernels_forward( - 2 * self.n_groups * self.ssm_state_size - self.num_heads ) // 2 - + if cache_params is not None and cache_position is not None and cache_position[0] > 0: # Cached forward (single token) _, _, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 ) - + hidden_states_B_C = causal_conv1d_update( hidden_states_B_C, cache_params.conv_states[self.layer_idx], @@ -493,13 +500,13 @@ def cuda_kernels_forward( self.conv1d.bias, self.activation, ) - + hidden_states, B, C = torch.split( hidden_states_B_C, [self.intermediate_size, groups_time_state_size, groups_time_state_size], dim=-1, ) - + A = -torch.exp(self.A_log.float()) A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) dt = dt[:, :, None].expand(-1, -1, self.head_dim) @@ -508,11 +515,15 @@ def cuda_kernels_forward( B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) - + hidden_states = selective_state_update( cache_params.ssm_states[self.layer_idx], hidden_states_reshaped, - dt, A, B, C, D, + dt, + A, + B, + C, + D, z=None, dt_bias=dt_bias, dt_softplus=True, @@ -524,7 +535,7 @@ def cuda_kernels_forward( # Full sequence forward A = -torch.exp(self.A_log.float()) dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} - + if self.training and cache_params is None: out = mamba_split_conv1d_scan_combined( projected_states, @@ -550,17 +561,21 @@ def cuda_kernels_forward( _, _, gate, hidden_states_B_C, dt = projected_states.split( [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 ) - + if cache_params is 
not None: hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) conv_states = F.pad( hidden_states_B_C_transposed, (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), ) - cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) - + cache_params.update_conv_state( + layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True + ) + if self.activation not in ["silu", "swish"]: - hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + hidden_states_B_C = self.act( + self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2) + ) else: hidden_states_B_C = causal_conv1d_fn( x=hidden_states_B_C.transpose(1, 2), @@ -568,17 +583,18 @@ def cuda_kernels_forward( bias=self.conv1d.bias, activation=self.activation, ).transpose(1, 2) - + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) hidden_states, B, C = torch.split( hidden_states_B_C, [self.intermediate_size, groups_time_state_size, groups_time_state_size], dim=-1, ) - + scan_output, ssm_state = mamba_chunk_scan_combined( hidden_states.view(batch_size, seq_len, -1, self.head_dim), - dt, A, + dt, + A, B.view(batch_size, seq_len, self.n_groups, -1), C.view(batch_size, seq_len, self.n_groups, -1), chunk_size=self.chunk_size, @@ -590,16 +606,16 @@ def cuda_kernels_forward( dt_softplus=True, **dt_limit_kwargs, ) - + if ssm_state is not None and cache_params is not None: cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) - + scan_output = scan_output.view(batch_size, seq_len, -1) scan_output = self.norm(scan_output, gate) out = self.out_proj(scan_output) - + return out - + def torch_forward( self, hidden_states: torch.Tensor, @@ -610,21 +626,25 @@ def torch_forward( """Pure PyTorch implementation (slower but works without CUDA kernels).""" batch_size, seq_len, _ = hidden_states.shape dtype = 
hidden_states.dtype - + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) projected_states = self.in_proj(hidden_states) - + d_mlp = ( - projected_states.shape[-1] - 2 * self.intermediate_size - - 2 * self.n_groups * self.ssm_state_size - self.num_heads + projected_states.shape[-1] + - 2 * self.intermediate_size + - 2 * self.n_groups * self.ssm_state_size + - self.num_heads ) // 2 _, _, gate, hidden_states_B_C, dt = projected_states.split( [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 ) - + # Convolution if cache_params is not None and cache_position is not None and cache_position[0] > 0: - cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False) + cache_params.update_conv_state( + layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False + ) conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) hidden_states_B_C = torch.sum(conv_states * self.conv1d.weight.squeeze(1), dim=-1) if self.use_conv_bias: @@ -634,22 +654,22 @@ def torch_forward( if cache_params is not None: hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) conv_states = F.pad( - hidden_states_B_C_transposed, - (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + hidden_states_B_C_transposed, + (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), ) cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) - + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) hidden_states, B, C = torch.split( hidden_states_B_C, [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], - dim=-1 + dim=-1, ) - + # SSM A = -torch.exp(self.A_log.float()) - + if cache_params is 
not None and cache_position is not None and cache_position[0] > 0: # Single step SSM update cache_device = cache_params.ssm_states[self.layer_idx].device @@ -658,33 +678,34 @@ def torch_forward( dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) dt = F.softplus(dt + dt_bias.to(dt.dtype)) dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) - - A_expanded = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + + A_expanded = ( + A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + ) dA = (torch.exp(dt[..., None] * A_expanded)).to(device=cache_device) - + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() B = B.reshape(batch_size, -1, B.shape[-1]) dB = dt[..., None] * B[..., None, :] - + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) dBx = (dB * hidden_states[..., None]).to(device=cache_device) - + cache_params.update_ssm_state( - layer_idx=self.layer_idx, - new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx + layer_idx=self.layer_idx, new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx ) - + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() C = C.reshape(batch_size, -1, C.shape[-1]) - + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) y = torch.bmm(ssm_states_reshaped, C_reshaped) y = y.view(batch_size, self.num_heads, self.head_dim) - + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) y = (y + hidden_states * D).to(y.dtype) y = 
y.reshape(batch_size, -1)[:, None, ...] @@ -697,56 +718,58 @@ def torch_forward( C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) - + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) - + hidden_states = hidden_states * dt[..., None] A_dt = A.to(hidden_states.dtype) * dt - - hidden_states, A_dt, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A_dt, B, C)] - + + hidden_states, A_dt, B, C = [ + reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A_dt, B, C) + ] + A_dt = A_dt.permute(0, 3, 1, 2) A_cumsum = torch.cumsum(A_dt, dim=-1) L = torch.exp(segment_sum(A_dt)) - + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] G = G_intermediate.sum(dim=-1) M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] M = M_intermediate.sum(dim=-1) Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) - + decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum)) B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) - + if cache_params is not None and cache_position is not None and cache_position[0] > 0: previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) else: previous_states = torch.zeros_like(states[:, :1]) - + states = torch.cat([previous_states, states], dim=1) decay_chunk = torch.exp(segment_sum(F.pad(A_cumsum[:, :, :, -1], (1, 0)))) decay_chunk = decay_chunk.transpose(1, 3) new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) states, ssm_state = new_states[:, :-1], new_states[:, -1] - + state_decay_out = torch.exp(A_cumsum) - 
C_times_states = (C[..., None, :] * states[:, :, None, ...]) + C_times_states = C[..., None, :] * states[:, :, None, ...] state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) - Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) - + Y_off = C_times_states.sum(-1) * state_decay_out_permuted[..., None] + y = Y_diag + Y_off y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) y = y + D_residual - + if pad_size > 0: y = y[:, :seq_len, :, :] y = y.reshape(batch_size, seq_len, -1) - + if ssm_state is not None and cache_params is not None: cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) - + scan_output = self.norm(y, gate) contextualized_states = self.out_proj(scan_output.to(dtype)) return contextualized_states @@ -763,7 +786,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class NemotronHAttention(nn.Module): """Multi-headed attention for NemotronH.""" - + def __init__(self, config: NemotronHConfig, layer_idx: int): super().__init__() self.config = config @@ -776,12 +799,12 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings self.is_causal = True - + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.head_dim * self.num_heads, self.hidden_size, bias=config.attention_bias) - + def forward( self, hidden_states: torch.Tensor, @@ -793,32 +816,32 @@ def forward( cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - + 
query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - + if past_key_value is not None: key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) - + key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - + causal_mask = attention_mask if attention_mask is not None: - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] - + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + if query_states.device.type == "cuda" and attention_mask is not None: query_states = query_states.contiguous() key_states = key_states.contiguous() value_states = value_states.contiguous() - + is_causal = True if causal_mask is None and q_len > 1 else False - + attn_output = F.scaled_dot_product_attention( query_states, key_states, @@ -827,18 +850,20 @@ def forward( dropout_p=self.attention_dropout if self.training else 0.0, is_causal=is_causal, ) - + attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.view(bsz, q_len, self.num_heads * self.head_dim) attn_output = self.o_proj(attn_output) - + return attn_output, None, past_key_value class NemotronHMLP(nn.Module): """MLP layer for NemotronH.""" - - def __init__(self, config: NemotronHConfig, intermediate_size: Optional[int] = None, layer_idx: Optional[int] = None): + + def __init__( + self, config: NemotronHConfig, intermediate_size: Optional[int] = None, layer_idx: Optional[int] = None + ): super().__init__() self.config = config self.layer_idx = layer_idx @@ -847,7 +872,7 @@ def __init__(self, config: NemotronHConfig, 
intermediate_size: Optional[int] = N self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) self.act_fn = get_activation_fn(config.mlp_hidden_act) - + def forward(self, x): return self.down_proj(self.act_fn(self.up_proj(x))) @@ -855,12 +880,12 @@ def forward(self, x): class NemotronHTopkRouter(nn.Module): """ Top-k router for Mixture of Experts. - + Routes tokens to the top-k experts based on learned routing weights. Supports grouped routing where experts are divided into groups and top-k groups are selected first, then top-k experts within those groups. """ - + def __init__(self, config: NemotronHConfig): super().__init__() self.config = config @@ -870,150 +895,136 @@ def __init__(self, config: NemotronHConfig): self.n_group = config.n_group self.topk_group = config.topk_group self.norm_topk_prob = config.norm_topk_prob - - self.weight = nn.Parameter( - torch.empty((self.n_routed_experts, config.hidden_size), dtype=torch.float32) - ) - self.register_buffer( - "e_score_correction_bias", - torch.zeros(self.n_routed_experts, dtype=torch.float32) - ) - + + self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size), dtype=torch.float32)) + self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts, dtype=torch.float32)) + @torch.no_grad() def get_topk_indices(self, scores: torch.Tensor) -> torch.Tensor: """Get top-k expert indices using grouped routing.""" scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) - + # Compute group scores by taking top-2 within each group and summing group_scores = ( scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) .topk(2, dim=-1)[0] .sum(dim=-1) ) - + # Select top-k groups group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] group_mask = 
torch.zeros_like(group_scores) group_mask.scatter_(1, group_idx, 1) - + # Create mask for experts in selected groups score_mask = ( group_mask.unsqueeze(-1) .expand(-1, self.n_group, self.n_routed_experts // self.n_group) .reshape(-1, self.n_routed_experts) ) - + # Zero out scores for experts not in selected groups scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) - + # Select top-k experts from remaining topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1] return topk_indices - + def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Route tokens to experts. - + Args: hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size) - + Returns: topk_indices: Indices of selected experts (batch_size * seq_len, top_k) topk_weights: Weights for selected experts (batch_size * seq_len, top_k) """ hidden_states = hidden_states.view(-1, self.config.hidden_size) - + # Compute router logits and convert to probabilities via sigmoid router_logits = F.linear(hidden_states.float(), self.weight.float()) scores = router_logits.sigmoid() - + # Get top-k expert indices topk_indices = self.get_topk_indices(scores) - + # Gather weights for selected experts topk_weights = scores.gather(1, topk_indices) - + # Optionally normalize weights if self.norm_topk_prob: denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 topk_weights = topk_weights / denominator - + # Apply routing scaling factor topk_weights = topk_weights * self.routed_scaling_factor - + return topk_indices, topk_weights class NemotronHMOE(nn.Module): """ Mixture of Experts layer for NemotronH. - + Combines multiple expert MLPs with a router that selects which experts to use for each token. Also includes shared experts that are always used. 
""" - + def __init__(self, config: NemotronHConfig, layer_idx: Optional[int] = None): super().__init__() self.config = config self.layer_idx = layer_idx - + # Create routed experts - self.experts = nn.ModuleList([ - NemotronHMLP( - config, - intermediate_size=config.moe_intermediate_size, - layer_idx=layer_idx - ) - for _ in range(config.n_routed_experts) - ]) - + self.experts = nn.ModuleList( + [ + NemotronHMLP(config, intermediate_size=config.moe_intermediate_size, layer_idx=layer_idx) + for _ in range(config.n_routed_experts) + ] + ) + # Router for selecting experts self.gate = NemotronHTopkRouter(config) - + # Shared experts (always used) self.shared_experts = NemotronHMLP( - config=config, - intermediate_size=config.moe_shared_expert_intermediate_size, - layer_idx=layer_idx + config=config, intermediate_size=config.moe_shared_expert_intermediate_size, layer_idx=layer_idx ) - - def moe( - self, - hidden_states: torch.Tensor, - topk_indices: torch.Tensor, - topk_weights: torch.Tensor - ) -> torch.Tensor: + + def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor) -> torch.Tensor: """ Apply mixture of experts to hidden states. 
- + Args: hidden_states: Input tensor of shape (batch_size * seq_len, hidden_size) topk_indices: Expert indices of shape (batch_size * seq_len, top_k) topk_weights: Expert weights of shape (batch_size * seq_len, top_k) - + Returns: Output tensor of shape (batch_size * seq_len, hidden_size) """ final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) - + # Create one-hot mask for expert selection expert_mask = F.one_hot(topk_indices, num_classes=len(self.experts)) expert_mask = expert_mask.permute(2, 0, 1) # (num_experts, batch*seq, top_k) - + for expert_idx in range(len(self.experts)): expert = self.experts[expert_idx] mask = expert_mask[expert_idx] token_indices, weight_indices = torch.where(mask) - + if token_indices.numel() > 0: # Get weights and inputs for this expert expert_weights = topk_weights[token_indices, weight_indices] expert_input = hidden_states[token_indices] - + # Apply expert and weight the output expert_output = expert(expert_input) weighted_output = expert_output * expert_weights.unsqueeze(-1) - + # Accumulate weighted outputs final_hidden_states.index_add_(0, token_indices, weighted_output) else: @@ -1022,50 +1033,50 @@ def moe( dummy_input = torch.zeros_like(hidden_states[0]).unsqueeze(0).to(expert_dtype) dummy_out = expert(dummy_input) final_hidden_states = final_hidden_states + dummy_out * 0 - + return final_hidden_states.to(hidden_states.dtype) - + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ Forward pass through MoE layer. 
- + Args: hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size) - + Returns: Output tensor of shape (batch_size, seq_len, hidden_size) """ residuals = hidden_states orig_shape = hidden_states.shape - + # Route tokens to experts topk_indices, topk_weights = self.gate(hidden_states) - + # Flatten for expert processing hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - + # Apply mixture of experts hidden_states = self.moe(hidden_states, topk_indices, topk_weights) - + # Reshape back to original shape hidden_states = hidden_states.view(*orig_shape) - + # Add shared expert output hidden_states = hidden_states + self.shared_experts(residuals) - + return hidden_states class NemotronHBlock(nn.Module): """A single block in NemotronH - can be Mamba, Attention, MLP, or MoE.""" - + def __init__(self, config: NemotronHConfig, layer_idx: int): super().__init__() self.config = config self.layer_idx = layer_idx self.residual_in_fp32 = config.residual_in_fp32 self.norm = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - + self.block_type = config.layers_block_type[layer_idx] if self.block_type == "mamba": self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx) @@ -1077,7 +1088,7 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): self.mixer = NemotronHMOE(config, layer_idx=layer_idx) else: raise ValueError(f"Invalid block type: {self.block_type}") - + def forward( self, hidden_states: torch.Tensor, @@ -1091,7 +1102,7 @@ def forward( return self._forward_impl(hidden_states, cache_params, cache_position, attention_mask) else: return self._forward_impl(hidden_states, cache_params, cache_position, attention_mask) - + def _forward_impl( self, hidden_states: torch.Tensor, @@ -1103,19 +1114,15 @@ def _forward_impl( hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) if self.residual_in_fp32: residual = residual.to(torch.float32) - + if self.block_type == "mamba": - hidden_states = self.mixer( - 
hidden_states, cache_params=cache_params, cache_position=cache_position - ) + hidden_states = self.mixer(hidden_states, cache_params=cache_params, cache_position=cache_position) elif self.block_type == "attention": - hidden_states = self.mixer( - hidden_states, cache_position=cache_position, past_key_value=cache_params - ) + hidden_states = self.mixer(hidden_states, cache_position=cache_position, past_key_value=cache_params) hidden_states = hidden_states[0] elif self.block_type in ("mlp", "moe"): hidden_states = self.mixer(hidden_states) - + hidden_states = residual + hidden_states return hidden_states @@ -1123,22 +1130,22 @@ def _forward_impl( class NemotronHModel(nn.Module): """ NemotronH backbone model. - + This is the main backbone that can be used as a decoder in TTS models. It exposes the same interface as HuggingFace transformer models. """ - + def __init__(self, config: NemotronHConfig): super().__init__() self.config = config - + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) self.layers = nn.ModuleList([NemotronHBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]) self.norm_f = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - + self.gradient_checkpointing = False self._init_weights() - + def _init_weights(self): """Initialize weights with special handling for Mamba components.""" for name, module in self.named_modules(): @@ -1146,7 +1153,7 @@ def _init_weights(self): # Mark parameters that should not have weight decay module.A_log._no_weight_decay = True module.D._no_weight_decay = True - + # Special initialization for dt_bias using inverse softplus # This follows the Mamba2 initialization scheme dt = torch.exp( @@ -1154,13 +1161,13 @@ def _init_weights(self): * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + math.log(self.config.time_step_min) ).clamp(min=self.config.time_step_floor) - + # Inverse of softplus: 
https://github.com/pytorch/pytorch/issues/72759 inv_dt = dt + torch.log(-torch.expm1(-dt)) with torch.no_grad(): module.dt_bias.copy_(inv_dt) module.dt_bias._no_reinit = True - + elif isinstance(module, nn.Linear): nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) if module.bias is not None: @@ -1168,7 +1175,7 @@ def _init_weights(self): nn.init.zeros_(module.bias) elif isinstance(module, nn.Embedding): nn.init.normal_(module.weight, std=self.config.initializer_range) - + # Rescale prenorm residual weights for better training stability # Following GPT-2 paper: scale by 1/sqrt(2 * n_layer) if self.config.rescale_prenorm_residual: @@ -1178,13 +1185,13 @@ def _init_weights(self): # Scale by 1/sqrt(num_hidden_layers) with torch.no_grad(): p /= math.sqrt(self.config.num_hidden_layers) - + def get_input_embeddings(self): return self.embeddings - + def set_input_embeddings(self, new_embeddings): self.embeddings = new_embeddings - + def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1203,20 +1210,22 @@ def forward( # Support both cache_params and past_key_values for compatibility if past_key_values is not None and cache_params is None: cache_params = past_key_values - + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - + if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) 
- + hidden_states = inputs_embeds - + # Create cache if use_cache=True but no cache provided if use_cache and cache_params is None: cache_params = HybridMambaAttentionDynamicCache( @@ -1225,17 +1234,17 @@ def forward( dtype=hidden_states.dtype, device=hidden_states.device, ) - + if cache_position is None: cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) - + # Create causal mask for attention layers causal_mask = self._create_causal_mask(attention_mask, inputs_embeds, cache_position) mamba_mask = self._update_mamba_mask(attention_mask, cache_position) - + all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - + for layer_idx, layer in enumerate(self.layers): if layer.block_type == "mamba": layer_mask = mamba_mask @@ -1243,10 +1252,10 @@ def forward( layer_mask = causal_mask else: layer_mask = None - + if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - + if self.gradient_checkpointing and self.training: hidden_states = torch.utils.checkpoint.checkpoint( layer.__call__, hidden_states, cache_params, cache_position, layer_mask @@ -1258,48 +1267,48 @@ def forward( cache_position=cache_position, attention_mask=layer_mask, ) - + hidden_states = self.norm_f(hidden_states) - + if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - + if not return_dict: return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) - + return NemotronHOutput( last_hidden_state=hidden_states, past_key_values=cache_params if use_cache else None, hidden_states=all_hidden_states, attentions=all_self_attns, ) - + def _create_causal_mask(self, attention_mask, input_tensor, cache_position): """Create causal attention mask.""" dtype, device = input_tensor.dtype, input_tensor.device min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] target_length = cache_position[-1] + 1 - + causal_mask = 
torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) if sequence_length != 1: causal_mask = torch.triu(causal_mask, diagonal=1) causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) - + if attention_mask is not None: causal_mask = causal_mask.clone() if attention_mask.dim() == 2: mask_length = attention_mask.shape[-1] padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) - + return causal_mask - + def _update_mamba_mask(self, attention_mask, cache_position): """ Update Mamba mask with optimization. - + No need for zeroing states when: 1. Cached forward (cache_position[0] > 0) 2. Attending to all inputs (all mask values are 1) @@ -1313,40 +1322,40 @@ def _update_mamba_mask(self, attention_mask, cache_position): class NemotronHForCausalLM(nn.Module): """ NemotronH model with a language modeling head. - + This is the full model that matches the AutoModelForCausalLM interface. 
""" - + def __init__(self, config: NemotronHConfig): super().__init__() self.config = config self.backbone = NemotronHModel(config) self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - + self._init_weights() - + def _init_weights(self): """Initialize weights.""" nn.init.normal_(self.lm_head.weight, mean=0.0, std=self.config.initializer_range) - + def get_input_embeddings(self): return self.backbone.get_input_embeddings() - + def set_input_embeddings(self, new_embeddings): self.backbone.set_input_embeddings(new_embeddings) - + def get_output_embeddings(self): return self.lm_head - + def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - + @property def model(self): """Alias for backbone, for HuggingFace compatibility.""" return self.backbone - + def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1364,7 +1373,7 @@ def forward( **kwargs, ) -> Union[Tuple, NemotronHCausalLMOutput]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + outputs = self.backbone( input_ids=input_ids, inputs_embeds=inputs_embeds, @@ -1378,10 +1387,10 @@ def forward( return_dict=return_dict, cache_position=cache_position, ) - + hidden_states = outputs.last_hidden_state if return_dict else outputs[0] logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float() - + loss = None if labels is not None: labels = labels.to(logits.device) @@ -1389,11 +1398,11 @@ def forward( shift_labels = labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - + if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output - + return NemotronHCausalLMOutput( loss=loss, logits=logits, @@ -1401,7 +1410,7 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - + def prepare_inputs_for_generation( 
self, input_ids, @@ -1415,42 +1424,41 @@ def prepare_inputs_for_generation( ): """Prepare inputs for generation.""" empty_past_kv = past_key_values is None - + # If we have cache: slice input_ids through cache_position to keep only unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids # Exception 3: with synced GPUs cache_position may go out of bounds if not empty_past_kv: - if ( - inputs_embeds is not None # Exception 1 - or cache_position[-1] >= input_ids.shape[1] # Exception 3 - ): - input_ids = input_ids[:, -cache_position.shape[0]:] + if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 # Exception 3 + input_ids = input_ids[:, -cache_position.shape[0] :] elif input_ids.shape[1] != cache_position.shape[0]: # Default case input_ids = input_ids[:, cache_position] else: past_key_values = HybridMambaAttentionDynamicCache( self.config, input_ids.shape[0], self.backbone.embeddings.weight.dtype, device=input_ids.device ) - + # Create position_ids on the fly for batch generation if not provided if attention_mask is not None and position_ids is None: position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if not empty_past_kv: - position_ids = position_ids[:, -input_ids.shape[1]:] - + position_ids = position_ids[:, -input_ids.shape[1] :] + # If inputs_embeds are passed, only use them in the 1st generation step if inputs_embeds is not None and empty_past_kv: model_inputs = {"inputs_embeds": inputs_embeds} else: model_inputs = {"input_ids": input_ids.contiguous()} - - model_inputs.update({ - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - "cache_position": cache_position, - }) + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + 
"attention_mask": attention_mask, + "cache_position": cache_position, + } + ) return model_inputs diff --git a/tests/collections/tts/test_nemotron_h_decoder.py b/tests/collections/tts/test_nemotron_h_decoder.py index 4b21dc1ae716..943abe1d9046 100644 --- a/tests/collections/tts/test_nemotron_h_decoder.py +++ b/tests/collections/tts/test_nemotron_h_decoder.py @@ -25,15 +25,18 @@ try: import pytest + PYTEST_AVAILABLE = True except ImportError: PYTEST_AVAILABLE = False + # Create a dummy pytest fixture decorator for standalone execution class pytest: @staticmethod def fixture(func): return func + import torch from nemo.collections.tts.modules.nemotron_h_decoder import ( @@ -41,42 +44,36 @@ def fixture(func): NemotronHConfig, NemotronHForCausalLM, NemotronHMLP, - NemotronHMOE, NemotronHModel, + NemotronHMOE, NemotronHTopkRouter, ) class TestNemotronHConfig: """Test NemotronHConfig initialization and defaults.""" - + def test_default_config(self): """Test default config initialization.""" config = NemotronHConfig() assert config.hidden_size == 1536 assert config.num_hidden_layers == 24 assert len(config.layers_block_type) == config.num_hidden_layers - + def test_custom_pattern(self): """Test custom hybrid_override_pattern.""" - config = NemotronHConfig( - num_hidden_layers=8, - hybrid_override_pattern="M*M*M*M*" - ) + config = NemotronHConfig(num_hidden_layers=8, hybrid_override_pattern="M*M*M*M*") assert config.layers_block_type == ['mamba', 'attention'] * 4 - + def test_pattern_extension(self): """Test that short patterns are extended to match num_hidden_layers.""" - config = NemotronHConfig( - num_hidden_layers=8, - hybrid_override_pattern="M*" - ) + config = NemotronHConfig(num_hidden_layers=8, hybrid_override_pattern="M*") assert len(config.layers_block_type) == 8 class TestNemotronHModel: """Test NemotronHModel backbone.""" - + @pytest.fixture def small_config(self): """Create a small config for testing.""" @@ -93,51 +90,51 @@ def small_config(self): 
intermediate_size=128, hybrid_override_pattern="M*M*", ) - + @pytest.fixture def model(self, small_config): """Create a small model for testing.""" return NemotronHModel(small_config) - + def test_model_creation(self, model, small_config): """Test model can be created.""" assert model is not None assert len(model.layers) == small_config.num_hidden_layers - + def test_forward_with_input_ids(self, model): """Test forward pass with input_ids.""" batch_size, seq_len = 2, 16 input_ids = torch.randint(0, 1000, (batch_size, seq_len)) - + output = model(input_ids=input_ids) - + assert output.last_hidden_state is not None assert output.last_hidden_state.shape == (batch_size, seq_len, 64) - + def test_forward_with_inputs_embeds(self, model): """Test forward pass with inputs_embeds (required for TTS).""" batch_size, seq_len, hidden_size = 2, 16, 64 inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) - + output = model(inputs_embeds=inputs_embeds) - + assert output.last_hidden_state is not None assert output.last_hidden_state.shape == (batch_size, seq_len, hidden_size) - + def test_get_set_input_embeddings(self, model): """Test get/set input embeddings interface.""" original_embeddings = model.get_input_embeddings() assert original_embeddings is not None - + new_embeddings = torch.nn.Embedding(100, 64) model.set_input_embeddings(new_embeddings) - + assert model.get_input_embeddings() is new_embeddings class TestNemotronHForCausalLM: """Test NemotronHForCausalLM full model.""" - + @pytest.fixture def small_config(self): """Create a small config for testing.""" @@ -154,69 +151,69 @@ def small_config(self): intermediate_size=128, hybrid_override_pattern="M*M*", ) - + @pytest.fixture def model(self, small_config): """Create a small model for testing.""" return NemotronHForCausalLM(small_config) - + def test_model_creation(self, model, small_config): """Test model can be created.""" assert model is not None assert model.backbone is not None assert model.lm_head is not 
None - + def test_model_alias(self, model): """Test that model.model returns backbone (HF compatibility).""" assert model.model is model.backbone - + def test_forward_with_inputs_embeds(self, model): """Test forward pass with inputs_embeds.""" batch_size, seq_len, hidden_size = 2, 16, 64 inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) - + output = model(inputs_embeds=inputs_embeds) - + assert output.logits is not None assert output.logits.shape == (batch_size, seq_len, 1000) # vocab_size - + def test_interface_compatibility(self, model): """Test that model satisfies EasyMagpieTTSModel interface requirements.""" # Test 1: decoder.get_input_embeddings() embeddings = model.backbone.get_input_embeddings() assert embeddings is not None - + # Test 2: decoder.set_input_embeddings() new_emb = torch.nn.Embedding(100, 64) model.backbone.set_input_embeddings(new_emb) assert model.backbone.get_input_embeddings() is new_emb - + # Reset for next tests model.backbone.set_input_embeddings(embeddings) - + # Test 3: decoder(inputs_embeds, attention_mask, use_cache, past_key_values) batch_size, seq_len, hidden_size = 2, 16, 64 inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) attention_mask = torch.ones(batch_size, seq_len) - + output = model.backbone( inputs_embeds=inputs_embeds, attention_mask=attention_mask, use_cache=False, past_key_values=None, ) - + # Test 4: Return .last_hidden_state assert hasattr(output, 'last_hidden_state') assert output.last_hidden_state is not None - + # Test 5: Return .past_key_values (when use_cache=True not tested here as it requires more setup) assert hasattr(output, 'past_key_values') class TestHybridCache: """Test HybridMambaAttentionDynamicCache.""" - + def test_cache_creation(self): """Test cache can be created.""" config = NemotronHConfig( @@ -228,10 +225,10 @@ def test_cache_creation(self): conv_kernel=4, hybrid_override_pattern="M*M*", ) - + batch_size = 2 cache = HybridMambaAttentionDynamicCache(config, batch_size, 
dtype=torch.float32) - + assert len(cache.conv_states) == config.num_hidden_layers assert len(cache.ssm_states) == config.num_hidden_layers assert len(cache.key_cache) == config.num_hidden_layers @@ -240,7 +237,7 @@ def test_cache_creation(self): class TestNemotronHCausality: """Test that NemotronH model is causal (future timesteps don't affect previous ones).""" - + @pytest.fixture def small_config(self): """Create a small config for testing causality.""" @@ -257,18 +254,18 @@ def small_config(self): intermediate_size=128, hybrid_override_pattern="M*M*", ) - + @pytest.fixture def model(self, small_config): """Create a small model for testing.""" model = NemotronHModel(small_config) model.eval() # Set to eval mode for deterministic behavior return model - + def test_causality_with_input_modification(self, model, small_config): """ Test causality by modifying future timesteps and checking that earlier outputs are unchanged. - + The test: 1. Pass sequence through the model 2. Modify a future timestep in the input @@ -276,63 +273,65 @@ def test_causality_with_input_modification(self, model, small_config): """ batch_size, seq_len = 2, 16 hidden_size = small_config.hidden_size - + # Create a base input torch.manual_seed(42) inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) - + # Get output with original input with torch.no_grad(): output_original = model(inputs_embeds=inputs_embeds_original.clone()) - + # Test at different positions test_positions = [seq_len // 4, seq_len // 2, 3 * seq_len // 4] - + for modify_pos in test_positions: # Create modified input where we change timesteps from modify_pos onwards inputs_embeds_modified = inputs_embeds_original.clone() # Add random noise to all positions from modify_pos onwards - inputs_embeds_modified[:, modify_pos:, :] += torch.randn( - batch_size, seq_len - modify_pos, hidden_size - ) * 10.0 # Large modification to ensure it would affect outputs if not causal - + inputs_embeds_modified[:, modify_pos:, :] 
+= ( + torch.randn(batch_size, seq_len - modify_pos, hidden_size) * 10.0 + ) # Large modification to ensure it would affect outputs if not causal + # Get output with modified input with torch.no_grad(): output_modified = model(inputs_embeds=inputs_embeds_modified) - + # Check that outputs BEFORE modify_pos are unchanged outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :] outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :] - + # Should be exactly equal (within floating point tolerance) - assert torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5), \ - f"Causality violation: modifying position {modify_pos} affected earlier positions" - + assert torch.allclose( + outputs_before_original, outputs_before_modified, atol=1e-5 + ), f"Causality violation: modifying position {modify_pos} affected earlier positions" + # Verify that outputs AT and AFTER modify_pos are different (sanity check) outputs_after_original = output_original.last_hidden_state[:, modify_pos:, :] outputs_after_modified = output_modified.last_hidden_state[:, modify_pos:, :] - - assert not torch.allclose(outputs_after_original, outputs_after_modified, atol=1e-3), \ - f"Sanity check failed: modifying position {modify_pos} should affect outputs at/after that position" - + + assert not torch.allclose( + outputs_after_original, outputs_after_modified, atol=1e-3 + ), f"Sanity check failed: modifying position {modify_pos} should affect outputs at/after that position" + def test_causality_incremental_vs_full(self, model, small_config): """ Test causality by comparing incremental (token-by-token) vs full sequence processing. - + A causal model should produce the same output whether we: 1. Process the full sequence at once 2. 
Process tokens incrementally one at a time """ batch_size, seq_len = 1, 8 # Smaller seq for incremental test hidden_size = small_config.hidden_size - + torch.manual_seed(123) inputs_embeds = torch.randn(batch_size, seq_len, hidden_size) - + # Get output from full sequence with torch.no_grad(): output_full = model(inputs_embeds=inputs_embeds) - + # Get outputs incrementally (one token at a time) # For a causal model, output at each position should match incremental_outputs = [] @@ -341,47 +340,47 @@ def test_causality_incremental_vs_full(self, model, small_config): partial_output = model(inputs_embeds=inputs_embeds[:, :t, :]) # Take only the last timestep output for comparison incremental_outputs.append(partial_output.last_hidden_state[:, -1:, :]) - + # Stack incremental outputs output_incremental = torch.cat(incremental_outputs, dim=1) - + # Compare: the full sequence output should match the incrementally computed outputs - assert torch.allclose(output_full.last_hidden_state, output_incremental, atol=1e-4), \ - "Causality violation: incremental processing produces different results than full sequence" - + assert torch.allclose( + output_full.last_hidden_state, output_incremental, atol=1e-4 + ), "Causality violation: incremental processing produces different results than full sequence" + def test_causality_causal_lm(self, small_config): """Test causality for NemotronHForCausalLM.""" model = NemotronHForCausalLM(small_config) model.eval() - + batch_size, seq_len = 2, 12 hidden_size = small_config.hidden_size - + torch.manual_seed(456) inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) - + modify_pos = seq_len // 2 - + # Get logits with original input with torch.no_grad(): output_original = model(inputs_embeds=inputs_embeds_original.clone()) - + # Modify future positions inputs_embeds_modified = inputs_embeds_original.clone() - inputs_embeds_modified[:, modify_pos:, :] += torch.randn( - batch_size, seq_len - modify_pos, hidden_size - ) * 10.0 - + 
inputs_embeds_modified[:, modify_pos:, :] += torch.randn(batch_size, seq_len - modify_pos, hidden_size) * 10.0 + with torch.no_grad(): output_modified = model(inputs_embeds=inputs_embeds_modified) - + # Check logits before modify_pos are unchanged logits_before_original = output_original.logits[:, :modify_pos, :] logits_before_modified = output_modified.logits[:, :modify_pos, :] - - assert torch.allclose(logits_before_original, logits_before_modified, atol=1e-5), \ - "Causality violation in CausalLM: modifying future positions affected earlier logits" - + + assert torch.allclose( + logits_before_original, logits_before_modified, atol=1e-5 + ), "Causality violation in CausalLM: modifying future positions affected earlier logits" + def test_causality_different_layer_types(self): """Test causality with different hybrid patterns (Mamba-only, Attention-only, mixed).""" patterns = [ @@ -390,7 +389,7 @@ def test_causality_different_layer_types(self): "M*M*", # Alternating "MM**", # Mixed blocks ] - + for pattern in patterns: config = NemotronHConfig( hidden_size=64, @@ -405,39 +404,40 @@ def test_causality_different_layer_types(self): intermediate_size=128, hybrid_override_pattern=pattern, ) - + model = NemotronHModel(config) model.eval() - + batch_size, seq_len = 2, 8 hidden_size = config.hidden_size - + torch.manual_seed(789) inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) - + modify_pos = 4 - + with torch.no_grad(): output_original = model(inputs_embeds=inputs_embeds_original.clone()) - + inputs_embeds_modified = inputs_embeds_original.clone() - inputs_embeds_modified[:, modify_pos:, :] += torch.randn( - batch_size, seq_len - modify_pos, hidden_size - ) * 10.0 - + inputs_embeds_modified[:, modify_pos:, :] += ( + torch.randn(batch_size, seq_len - modify_pos, hidden_size) * 10.0 + ) + with torch.no_grad(): output_modified = model(inputs_embeds=inputs_embeds_modified) - + outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :] 
outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :] - - assert torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5), \ - f"Causality violation for pattern '{pattern}': modifying future positions affected earlier outputs" + + assert torch.allclose( + outputs_before_original, outputs_before_modified, atol=1e-5 + ), f"Causality violation for pattern '{pattern}': modifying future positions affected earlier outputs" class TestMoELayer: """Test Mixture of Experts layer.""" - + @pytest.fixture def moe_config(self): """Create a config for MoE testing.""" @@ -463,60 +463,60 @@ def moe_config(self): norm_topk_prob=True, hybrid_override_pattern="M*ME", # Includes MoE layer ) - + def test_topk_router_creation(self, moe_config): """Test NemotronHTopkRouter creation.""" router = NemotronHTopkRouter(moe_config) assert router.weight.shape == (moe_config.n_routed_experts, moe_config.hidden_size) assert router.top_k == moe_config.num_experts_per_tok - + def test_topk_router_forward(self, moe_config): """Test NemotronHTopkRouter forward pass.""" router = NemotronHTopkRouter(moe_config) batch_size, seq_len = 2, 8 hidden_states = torch.randn(batch_size, seq_len, moe_config.hidden_size) - + topk_indices, topk_weights = router(hidden_states) - + # Check shapes assert topk_indices.shape == (batch_size * seq_len, moe_config.num_experts_per_tok) assert topk_weights.shape == (batch_size * seq_len, moe_config.num_experts_per_tok) - + # Check indices are valid assert topk_indices.min() >= 0 assert topk_indices.max() < moe_config.n_routed_experts - + def test_moe_layer_creation(self, moe_config): """Test NemotronHMOE creation.""" moe = NemotronHMOE(moe_config, layer_idx=0) - + assert len(moe.experts) == moe_config.n_routed_experts assert moe.gate is not None assert moe.shared_experts is not None - + def test_moe_layer_forward(self, moe_config): """Test NemotronHMOE forward pass.""" moe = NemotronHMOE(moe_config, layer_idx=0) batch_size, seq_len 
= 2, 8 hidden_states = torch.randn(batch_size, seq_len, moe_config.hidden_size) - + output = moe(hidden_states) - + assert output.shape == hidden_states.shape - + def test_model_with_moe_pattern(self, moe_config): """Test full model with MoE layer.""" model = NemotronHModel(moe_config) - + # Check that MoE layer was created assert model.layers[3].block_type == "moe" - + # Test forward pass batch_size, seq_len = 2, 8 inputs_embeds = torch.randn(batch_size, seq_len, moe_config.hidden_size) - + output = model(inputs_embeds=inputs_embeds) - + assert output.last_hidden_state is not None assert output.last_hidden_state.shape == (batch_size, seq_len, moe_config.hidden_size) @@ -524,7 +524,7 @@ def test_model_with_moe_pattern(self, moe_config): if __name__ == "__main__": """Run basic tests without pytest.""" print("Testing NemotronH Decoder Module...") - + # Test 1: Config print("\n1. Testing NemotronHConfig...") config = NemotronHConfig( @@ -542,12 +542,12 @@ def test_model_with_moe_pattern(self, moe_config): ) print(f" Config created: {config.num_hidden_layers} layers, pattern={config.hybrid_override_pattern}") print(f" Layer types: {config.layers_block_type}") - + # Test 2: Model creation print("\n2. Testing NemotronHModel creation...") model = NemotronHModel(config) print(f" Model created with {len(model.layers)} layers") - + # Test 3: Forward pass with inputs_embeds print("\n3. Testing forward pass with inputs_embeds...") batch_size, seq_len, hidden_size = 2, 16, 64 @@ -555,27 +555,27 @@ def test_model_with_moe_pattern(self, moe_config): output = model(inputs_embeds=inputs_embeds) print(f" Input shape: {inputs_embeds.shape}") print(f" Output shape: {output.last_hidden_state.shape}") - + # Test 4: Full model print("\n4. Testing NemotronHForCausalLM...") full_model = NemotronHForCausalLM(config) output = full_model(inputs_embeds=inputs_embeds) print(f" Logits shape: {output.logits.shape}") - + # Test 5: Interface compatibility print("\n5. 
Testing interface compatibility for EasyMagpieTTSModel...") decoder = full_model.backbone - + # get_input_embeddings emb = decoder.get_input_embeddings() print(f" get_input_embeddings(): {type(emb).__name__}") - + # set_input_embeddings new_emb = torch.nn.Embedding(100, 64) decoder.set_input_embeddings(new_emb) print(f" set_input_embeddings(): OK") decoder.set_input_embeddings(emb) # Reset - + # forward with expected args output = decoder( inputs_embeds=inputs_embeds, @@ -586,7 +586,7 @@ def test_model_with_moe_pattern(self, moe_config): print(f" forward(inputs_embeds, attention_mask, use_cache, past_key_values): OK") print(f" .last_hidden_state: {output.last_hidden_state.shape}") print(f" .past_key_values: {output.past_key_values}") - + # Test 6: MoE layer print("\n6. Testing MoE (Mixture of Experts) layer...") moe_config = NemotronHConfig( @@ -612,26 +612,26 @@ def test_model_with_moe_pattern(self, moe_config): hybrid_override_pattern="M*ME", # Includes MoE layer ) print(f" Config: pattern={moe_config.hybrid_override_pattern}, block_types={moe_config.layers_block_type}") - + # Test router router = NemotronHTopkRouter(moe_config) test_input = torch.randn(2, 8, 64) topk_indices, topk_weights = router(test_input) print(f" Router: topk_indices shape={topk_indices.shape}, topk_weights shape={topk_weights.shape}") - + # Test MoE layer moe = NemotronHMOE(moe_config, layer_idx=0) moe_output = moe(test_input) print(f" MoE layer: input={test_input.shape}, output={moe_output.shape}") - + # Test full model with MoE moe_model = NemotronHModel(moe_config) moe_model_output = moe_model(inputs_embeds=test_input) print(f" Full model with MoE: output={moe_model_output.last_hidden_state.shape}") - + # Test 7: Causality test print("\n7. 
Testing model causality (future timesteps don't affect previous ones)...") - + # Create model for causality test causality_config = NemotronHConfig( hidden_size=64, @@ -648,53 +648,51 @@ def test_model_with_moe_pattern(self, moe_config): ) causality_model = NemotronHModel(causality_config) causality_model.eval() - + batch_size, seq_len = 2, 16 hidden_size = 64 - + # Create base input torch.manual_seed(42) inputs_embeds_original = torch.randn(batch_size, seq_len, hidden_size) - + # Get output with original input with torch.no_grad(): output_original = causality_model(inputs_embeds=inputs_embeds_original.clone()) - + # Test at different positions test_positions = [4, 8, 12] causality_passed = True - + for modify_pos in test_positions: # Create modified input where we change timesteps from modify_pos onwards inputs_embeds_modified = inputs_embeds_original.clone() - inputs_embeds_modified[:, modify_pos:, :] += torch.randn( - batch_size, seq_len - modify_pos, hidden_size - ) * 10.0 - + inputs_embeds_modified[:, modify_pos:, :] += torch.randn(batch_size, seq_len - modify_pos, hidden_size) * 10.0 + # Get output with modified input with torch.no_grad(): output_modified = causality_model(inputs_embeds=inputs_embeds_modified) - + # Check that outputs BEFORE modify_pos are unchanged outputs_before_original = output_original.last_hidden_state[:, :modify_pos, :] outputs_before_modified = output_modified.last_hidden_state[:, :modify_pos, :] - + if torch.allclose(outputs_before_original, outputs_before_modified, atol=1e-5): print(f" Position {modify_pos}: PASS (earlier outputs unchanged)") else: print(f" Position {modify_pos}: FAIL (causality violation!)") causality_passed = False - + # Verify outputs at/after modify_pos are different (sanity check) outputs_after_original = output_original.last_hidden_state[:, modify_pos:, :] outputs_after_modified = output_modified.last_hidden_state[:, modify_pos:, :] - + if not torch.allclose(outputs_after_original, outputs_after_modified, 
atol=1e-3): print(f" Position {modify_pos}: Sanity check PASS (later outputs changed)") else: print(f" Position {modify_pos}: Sanity check FAIL (later outputs should change)") causality_passed = False - + # Test with different layer patterns print("\n Testing causality with different layer patterns...") patterns = ["MMMM", "****", "M*M*", "MM**"] @@ -714,32 +712,33 @@ def test_model_with_moe_pattern(self, moe_config): ) pattern_model = NemotronHModel(pattern_config) pattern_model.eval() - + torch.manual_seed(789) test_input = torch.randn(2, 8, 64) modify_pos = 4 - + with torch.no_grad(): out_orig = pattern_model(inputs_embeds=test_input.clone()) - + test_input_mod = test_input.clone() test_input_mod[:, modify_pos:, :] += torch.randn(2, 4, 64) * 10.0 - + with torch.no_grad(): out_mod = pattern_model(inputs_embeds=test_input_mod) - - if torch.allclose(out_orig.last_hidden_state[:, :modify_pos, :], - out_mod.last_hidden_state[:, :modify_pos, :], atol=1e-5): + + if torch.allclose( + out_orig.last_hidden_state[:, :modify_pos, :], out_mod.last_hidden_state[:, :modify_pos, :], atol=1e-5 + ): print(f" Pattern '{pattern}': PASS") else: print(f" Pattern '{pattern}': FAIL (causality violation!)") causality_passed = False - + if causality_passed: print(" All causality tests PASSED!") else: print(" WARNING: Some causality tests FAILED!") - - print("\n" + "="*50) + + print("\n" + "=" * 50) print("All tests passed!") - print("="*50) + print("=" * 50) From 067a6e808e402bb166b58d7878c1ccafba8a151d Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 2 Feb 2026 17:39:33 -0500 Subject: [PATCH 32/94] inference function refactoring Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 698 ++++++++++++------ 1 file changed, 456 insertions(+), 242 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 3f107736604a..508a1332c31e 100644 --- 
a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -15,7 +15,7 @@ import time from dataclasses import dataclass from functools import partial -from typing import List, Optional, Sequence, Tuple +from typing import Dict, List, Optional, Sequence, Tuple import torch import wandb @@ -1897,6 +1897,274 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) + def _log_phoneme_predictions( + self, + pred_phoneme_token_lists: List[List[int]], + gt_phoneme_token_lists: List[List[int]], + batch_size: int, + ) -> None: + """Log predicted vs ground truth phoneme tokens for debugging.""" + for item_idx in range(batch_size): + logging.info(f"Predicted phoneme tokens for item {item_idx}: {pred_phoneme_token_lists[item_idx]}") + logging.info(f"GT phoneme tokens for item {item_idx}: {gt_phoneme_token_lists[item_idx]}") + predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) + gt_phoneme_text = self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) + logging.info(f"Predicted phoneme text for item {item_idx}: {predicted_phoneme_text}") + logging.info(f"GT phoneme text for item {item_idx}: {gt_phoneme_text}") + + def _collect_phoneme_tokens_for_logging( + self, + pred_phoneme_tokens: torch.Tensor, + gt_phoneme_tokens_current: torch.Tensor, + use_phoneme_input: torch.Tensor, + pred_phoneme_token_lists: List[List[int]], + gt_phoneme_token_lists: List[List[int]], + batch_size: int, + ) -> None: + """Collect phoneme tokens into lists for later logging (does not print).""" + special_tokens = { + self.phoneme_tokenizer.eos_token_id, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.pad, + } + for item_idx in range(batch_size): + if use_phoneme_input[item_idx, 0, 0] > 0: + for phoneme_channel_idx in range(self.phoneme_stacking_factor): + pred_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() + if 
pred_token not in special_tokens: + pred_phoneme_token_lists[item_idx].append(pred_token) + + gt_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() + if gt_token not in special_tokens: + gt_phoneme_token_lists[item_idx].append(gt_token) + + def _sample_audio_codes( + self, + last_hidden: torch.Tensor, + all_code_logits_t: torch.Tensor, + temperature: float, + topk: int, + use_local_transformer_for_inference: bool, + use_cfg: bool, + cfg_scale: float, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Sample audio codes from logits using either local transformer or parallel sampling. + + Returns: + audio_codes_next: Sampled codes with temperature/topk (B, num_codebooks) + all_codes_next_argmax: Argmax sampled codes for EOS detection (B, num_codebooks) + """ + if use_local_transformer_for_inference: + if self.local_transformer_type == LocalTransformerType.AR: + audio_codes_next = self.local_transformer_sample_autoregressive( + dec_output=last_hidden[:, -1, :], + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + else: + raise ValueError( + f"Local transformer inference requested but local transformer type is {self.local_transformer_type}" + ) + # TODO @rfejgin: should we add argmax sampling for EOS here too? 
+ all_codes_next_argmax = audio_codes_next + else: + # Parallel sampling from all codebook logits + audio_codes_next = self.sample_codes_from_logits( + all_code_logits_t, temperature=temperature, topk=topk + ) + # Argmax sampling for reliable EOS detection + all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) + + return audio_codes_next, all_codes_next_argmax + + def _process_phoneme_predictions( + self, + last_hidden: torch.Tensor, + actual_batch_size: int, + current_phoneme_positions: torch.Tensor, + gt_phoneme_tokens: torch.Tensor, + phoneme_input_type: str, + phoneme_sampling_method: str, + temperature: float, + topk: int, + timestep_idx: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Process phoneme predictions for the current timestep. + + Returns: + pred_phoneme_tokens: Predicted phoneme tokens (B, phoneme_stacking_factor) + gt_phoneme_tokens_current: GT phoneme tokens for current timestep (B, phoneme_stacking_factor) + input_phoneme_tokens_current: Tokens to use as input (GT or predicted) + input_phoneme_embedding: Embedded phoneme tokens (B, phoneme_stacking_factor, E) + """ + # Get phoneme logits and sample + all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) + all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + + all_codes_next_phoneme = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=temperature, topk=topk + ) + all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=0.01 + ) + + # Select predicted tokens based on sampling method + pred_phoneme_tokens = ( + all_codes_next_phoneme_argmax if phoneme_sampling_method == 'argmax' else all_codes_next_phoneme + ) + + # Handle BOS token at position 0 + phoneme_bos_tensor = torch.full( + (actual_batch_size, self.phoneme_stacking_factor), + self.phoneme_tokenizer.bos_token_id, + 
device=device, + ).long() + use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() + pred_phoneme_tokens = ( + use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens + ).long() + + # Get ground truth phoneme tokens for current timestep + gt_phoneme_idx = min(timestep_idx, gt_phoneme_tokens.size(2) - 1) + gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] + + # Select input tokens (GT or predicted) and embed + input_phoneme_tokens_current = ( + gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens + ) + input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) + + return pred_phoneme_tokens, gt_phoneme_tokens_current, input_phoneme_tokens_current, input_phoneme_embedding + + def _compute_phoneme_channel_input( + self, + input_phoneme_embedding: torch.Tensor, + current_phoneme_positions: torch.Tensor, + phoneme_stream_ended: torch.Tensor, + actual_batch_size: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Compute the phoneme channel input embedding with masking. 
+ + Returns: + phoneme_channel_input_t: Masked phoneme embedding (B, 1, E) + use_phoneme_input: Mask indicating which items should use phoneme input (B, 1, 1) + """ + # Determine which items should use phoneme input + use_phoneme_input = (current_phoneme_positions >= 0) & (~phoneme_stream_ended) + use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() + + # Create zero embedding for items not using phoneme input + zero_phoneme_embedding = torch.zeros( + actual_batch_size, 1, self.cfg.embedding_dim, device=device + ) + + # Combine: use phoneme embedding where active, zero otherwise + phoneme_channel_input_t = ( + use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + ) + + return phoneme_channel_input_t, use_phoneme_input + + def _prepare_next_decoder_input( + self, + audio_codes_next: torch.Tensor, + context_plus_audio_embedded: torch.Tensor, + context_plus_audio_lens: torch.Tensor, + min_context_len: int, + idx: int, + current_text_input_mode: str, + remaining_text_embedded: Optional[torch.Tensor], + current_text_positions: torch.Tensor, + phoneme_channel_input_t: Optional[torch.Tensor], + use_cfg: bool, + dummy_context_embedding_unconditional: Optional[torch.Tensor], + ) -> torch.Tensor: + """ + Prepare the input embedding for the next decoder step. 
+ + Handles: + - Mixing context embeddings with generated audio embeddings based on context completeness + - Adding streaming text embeddings if in streaming mode + - Adding phoneme channel input if available + - Duplicating for CFG if enabled + """ + batch_size = audio_codes_next.size(0) + device = audio_codes_next.device + + # Embed the newly generated audio codes + new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) + new_emb_unconditional = new_emb.clone() + + # Add streaming text embeddings if in streaming mode + if current_text_input_mode == 'streaming': + remaining_text_idx = current_text_positions.clamp(min=0) + remaining_text_embedded_current = remaining_text_embedded[ + torch.arange(batch_size, device=device), remaining_text_idx, : + ].unsqueeze(1) + new_emb = new_emb + remaining_text_embedded_current + + # Check which items still have context to process + context_incomplete_mask = context_plus_audio_lens > idx + min_context_len + + if context_incomplete_mask.any(): + # Some items still processing context - blend context with generated embeddings + context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() + context_embedding_slice = context_plus_audio_embedded[ + :, min_context_len + idx : min_context_len + idx + 1, : + ] + next_input = context_incomplete_mask * context_embedding_slice + (1 - context_incomplete_mask) * new_emb + + if phoneme_channel_input_t is not None: + next_input = next_input + phoneme_channel_input_t + + if use_cfg: + next_input_unconditional = ( + context_incomplete_mask * dummy_context_embedding_unconditional + + (1 - context_incomplete_mask) * new_emb_unconditional + ) + next_input = torch.cat([next_input, next_input_unconditional], dim=0) + else: + # All items finished context - use generated embeddings + next_input = new_emb + if phoneme_channel_input_t is not None: + next_input = next_input + phoneme_channel_input_t + + if use_cfg: + next_input = torch.cat([next_input, 
new_emb_unconditional], dim=0) + + return next_input + + def _check_eos_and_update_end_indices( + self, + all_codes_next_argmax: torch.Tensor, + audio_codes_next: torch.Tensor, + end_indices: Dict[int, int], + context_plus_audio_lens: torch.Tensor, + min_context_len: int, + idx: int, + verbose: bool = False, + ) -> None: + """Check for EOS tokens and update end indices for completed items.""" + for item_idx in range(all_codes_next_argmax.size(0)): + # Only check items that haven't ended and have passed their context + if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: + pred_tokens = all_codes_next_argmax[item_idx] + pred_tokens_multinomial = audio_codes_next[item_idx] + + if torch.any(pred_tokens == self.audio_eos_id) or torch.any( + pred_tokens_multinomial == self.audio_eos_id + ): + if verbose: + logging.info(f"EOS detected for item {item_idx} at timestep {idx}") + end_indices[item_idx] = idx + def infer_batch( self, batch, @@ -1911,42 +2179,56 @@ def infer_batch( phoneme_sampling_method='argmax', dropout_text_input=False, inference_mode: Optional[str] = None, + verbose: bool = False, ): """ - Run inference on a batch of inputs. + Run inference on a batch of inputs to generate audio from text. Args: - batch: Input batch containing text, context, etc. + batch: Input batch containing: + - text, text_lens: Input text tokens and lengths + - context_text_tokens, context_text_tokens_lens: Context text for speaker/style + - context_audio_codes/context_audio (optional): Audio context for speaker cloning max_decoder_steps: Maximum number of decoding steps. - temperature: Sampling temperature. + temperature: Sampling temperature for audio codes. topk: Top-k sampling parameter. - use_local_transformer_for_inference: Whether to use local transformer. - maskgit_n_steps: Number of MaskGit steps. + use_local_transformer_for_inference: Whether to use local transformer for AR sampling. 
+ maskgit_n_steps: Number of MaskGit steps (unused in AR mode). use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor. + cfg_scale: CFG scale factor (higher = stronger conditioning). phoneme_input_type: 'gt' for ground truth or 'pred' for predicted phonemes. - phoneme_sampling_method: 'argmax' or 'sample'. - dropout_text_input: Whether to dropout text input. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + dropout_text_input: Whether to dropout text input for CFG training. inference_mode: Name of the inference mode to use (e.g., "full", "streaming_4_8"). - If None, uses the default inference mode (first mode in training_modes). + If None, uses the default inference mode. + verbose: If True, enables detailed logging of decoding progress, EOS detection, + and phoneme predictions. Default False for cleaner output. + + Returns: + predicted_audio: Generated audio waveforms (B, max_audio_len) + predicted_audio_lens: Lengths of generated audio (B,) + predicted_codes: Generated audio codes (B, num_codebooks, T) + predicted_codes_lens: Lengths of generated codes (B,) + rtf_metrics: Dictionary with timing metrics (rtf, time_to_first_prediction, etc.) """ with torch.inference_mode(): start_time = time.time() # Resolve inference mode mode_name = inference_mode if inference_mode is not None else self.default_inference_mode - if mode_name in self.mode_name_to_mode: - selected_training_mode = self.mode_name_to_mode[mode_name] - logging.info(f"Using inference mode: {selected_training_mode.name}") - else: + if mode_name not in self.mode_name_to_mode: available_modes = list(self.mode_name_to_mode.keys()) raise ValueError(f"Unknown inference mode '{mode_name}'. 
Available modes: {available_modes}") - # Get current mode parameters + selected_training_mode = self.mode_name_to_mode[mode_name] + if verbose: + logging.info(f"Using inference mode: {selected_training_mode.name}") + current_text_input_mode = selected_training_mode.text_input_mode current_streaming_speech_delay = selected_training_mode.streaming_speech_delay current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay + # Prepare context embeddings (text + audio context) context_tensors = self.prepare_context_tensors( text=batch['text'], text_lens=batch['text_lens'], @@ -1964,289 +2246,216 @@ def infer_batch( remaining_text_embedded = context_tensors.remaining_text_embedded remaining_text_lens = context_tensors.remaining_text_lens + actual_batch_size = context_embedding.size(0) + device = context_embedding.device + + # Prepare phoneme channel input if phoneme tokenizer is available + gt_phoneme_tokens = None if self.phoneme_tokenizer is not None: context_lens_for_phonemes = ( context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay ) - phoneme_channel_input, phoneme_channel_input_lens, gt_phoneme_tokens, gt_phoneme_token_lens = ( - self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes - ) - ) - phoneme_channel_input_pad_tensor = torch.zeros( - phoneme_channel_input.size(0), - max_decoder_steps, - phoneme_channel_input.size(2), - device=phoneme_channel_input.device, + _, _, gt_phoneme_tokens, _ = self.prepare_phoneme_channel_input( + batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes ) - phoneme_channel_input = torch.cat([phoneme_channel_input, phoneme_channel_input_pad_tensor], dim=1) + # Initialize audio codes with BOS token audio_codes_bos = torch.full( - (context_embedding.size(0), self.num_audio_codebooks * self.frame_stacking_factor, 1), + (actual_batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), 
self.audio_bos_id, - device=context_embedding.device, + device=device, ).long() - audio_codes_lens = torch.full((context_embedding.size(0),), 1, device=context_embedding.device).long() - audio_codes_input = audio_codes_bos + audio_codes_lens = torch.ones(actual_batch_size, device=device).long() - audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) + audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_bos) # (B, 1, E) + + # For streaming mode, add text embeddings to audio BOS if current_text_input_mode == 'streaming': remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 remaining_text_pad_tensor = torch.zeros( - remaining_text_embedded.size(0), - remaining_text_pad_length, - remaining_text_embedded.size(2), - device=remaining_text_embedded.device, + actual_batch_size, remaining_text_pad_length, remaining_text_embedded.size(2), device=device ) remaining_text_embedded = torch.cat([remaining_text_embedded, remaining_text_pad_tensor], dim=1) - audio_codes_input_embedded = ( - audio_codes_input_embedded + remaining_text_embedded[:, :1, :] - ) # :1 corresponds to audio BOS. + audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded[:, :1, :] + # Combine context and audio embeddings context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( embeddings=[context_embedding, audio_codes_input_embedded], lengths=[context_lens, audio_codes_lens], ) min_context_len = context_plus_audio_lens.min().item() + + # Adjust min_context_len for phoneme delay if using phoneme tokenizer if self.phoneme_tokenizer is not None: min_context_len = ( min_context_len - current_streaming_speech_delay + current_streaming_phonemes_delay - 1 - ) # 1 for audio BOS that we had added. 
+ ) - actual_batch_size = context_embedding.size(0) + # Setup classifier-free guidance if enabled + dummy_context_embedding_unconditional = None if use_cfg: + # Create unconditional context embedding (all UNK tokens) dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( - torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=context_embedding.device) - ) # (B, 1, E) + torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=device) + ) dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand( -1, context_embedding.size(1), -1 - ) # (B, T_total, E) + ) dummy_context_plus_audio_embedded, _ = self.join_embeddings_temporally( embeddings=[dummy_context_embedding_unconditional_expanded, audio_codes_input_embedded], lengths=[context_lens, audio_codes_lens], ) + # Concatenate conditional and unconditional inputs: (2B, T_min, E) first_inference_input = torch.cat( [context_plus_audio_embedded, dummy_context_plus_audio_embedded], dim=0 - )[ - :, :min_context_len, : - ] # (2B, T_min, E) + )[:, :min_context_len, :] else: - first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] # (B, T_min, E) + first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] - # Initialize cache_position for tracking sequence position (needed for NemotronH) - cache_position = torch.arange(min_context_len, device=context_embedding.device) - - # First forward pass to get the initial hidden state and past key values + # First forward pass to process all context at once + cache_position = torch.arange(min_context_len, device=device) transformer_out = self.forward( inputs_embeds=first_inference_input, attention_mask=None, use_cache=True, - past_key_values=None, # No past key values for the first step + past_key_values=None, cache_position=cache_position, ) time_to_first_prediction = time.time() - start_time - last_hidden = transformer_out.last_hidden_state # (B, T_total, E) + last_hidden = 
transformer_out.last_hidden_state past_kv = transformer_out.past_key_values - - # Track the current sequence length for cache_position updates current_cache_seq_len = min_context_len + # Initialize decoding state all_predictions = [] - end_indices = {} + end_indices = {} # Maps item_idx -> timestep when EOS was detected - current_text_positions = [] - for item_idx in range(context_embedding.size(0)): - # 0 if we have started reading the remaining text otherwise negative (indicating how far we are before we start reading the remaining text) - current_text_positions.append(min_context_len - context_plus_audio_lens[item_idx]) - current_text_positions = torch.tensor(current_text_positions, device=context_embedding.device).long() - if self.phoneme_tokenizer is not None: - current_phoneme_positions = ( - current_text_positions - current_text_positions.max() - 1 - ) # Make it 0-indexed. - # current_text_positions = current_text_positions - self.streaming_speech_delay + self.streaming_phonemes_delay + # Track text position for each item in batch + # Negative values indicate we haven't started reading remaining text yet + current_text_positions = torch.tensor( + [min_context_len - context_plus_audio_lens[i] for i in range(actual_batch_size)], + device=device, + ).long() + + # Initialize phoneme tracking state + current_phoneme_positions = None pred_phoneme_token_lists = [[] for _ in range(actual_batch_size)] gt_phoneme_token_lists = [[] for _ in range(actual_batch_size)] - phoneme_stream_ended = torch.zeros( - actual_batch_size, device=context_embedding.device - ).bool() # (B,) Whether phoneme stream has ended for this item. 
+ phoneme_stream_ended = torch.zeros(actual_batch_size, device=device).bool() + + if self.phoneme_tokenizer is not None: + current_phoneme_positions = current_text_positions - current_text_positions.max() - 1 + + # Main autoregressive decoding loop for idx in range(max_decoder_steps): - # import ipdb; ipdb.set_trace() + # Update position trackers current_text_positions += 1 if self.phoneme_tokenizer is not None: current_phoneme_positions += 1 - # print("current_phoneme_positions", current_phoneme_positions) - if idx % 20 == 0: - print(f"Decoding timestep {idx}") - # Project from hidden_dim to audio_embedding_dim, then to logits - last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) - all_code_logits_t = self.final_proj(last_hidden_audio) # (B, num_codebooks * num_tokens_per_codebook) + if verbose and idx % 20 == 0: + logging.info(f"Decoding timestep {idx}") - if self.phoneme_tokenizer is not None: - all_code_logits_t_phoneme = self.phoneme_final_proj( - last_hidden[:, -1, :] - ) # (B, phoneme_stacking_factor * phoneme_vocab_size) - all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + # Compute audio logits from last hidden state + last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) + all_code_logits_t = self.final_proj(last_hidden_audio) + # Apply CFG to logits if enabled if use_cfg: conditional_logits = all_code_logits_t[:actual_batch_size] unconditional_logits = all_code_logits_t[actual_batch_size:] all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - if use_local_transformer_for_inference: - if self.local_transformer_type == LocalTransformerType.AR: - # Autoregressive sampling with local transformer - audio_codes_next = self.local_transformer_sample_autoregressive( - dec_output=last_hidden[:, -1, :], - temperature=temperature, - topk=topk, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - ) - else: - raise ValueError( - f"Local transformer inference requested by but 
local transformer type is {self.local_transformer_type}" - ) - # TODO @rfejgin: should we add argmax sampling for EOS here too? - all_codes_next_argmax = audio_codes_next - else: - # Parallel sampling from logits - audio_codes_next = self.sample_codes_from_logits( - all_code_logits_t, temperature=temperature, topk=topk - ) # (B, num_codebooks) - all_codes_next_argmax = self.sample_codes_from_logits( - all_code_logits_t, temperature=0.01 - ) # (B, num_codebooks) + # Sample audio codes + audio_codes_next, all_codes_next_argmax = self._sample_audio_codes( + last_hidden=last_hidden, + all_code_logits_t=all_code_logits_t, + temperature=temperature, + topk=topk, + use_local_transformer_for_inference=use_local_transformer_for_inference, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + # Process phoneme predictions if phoneme tokenizer exists phoneme_channel_input_t = None - if self.phoneme_tokenizer is not None: - all_codes_next_phoneme = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=temperature, topk=topk - ) # (B, phoneme_stacking_factor) - all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=0.01 - ) # (B, phoneme_stacking_factor) - pred_phoneme_tokens = ( - all_codes_next_phoneme_argmax - if phoneme_sampling_method == 'argmax' - else all_codes_next_phoneme - ) # B, phoneme_stacking_factor - phoneme_bos_tensor = torch.full( - (actual_batch_size, self.phoneme_stacking_factor), - self.phoneme_tokenizer.bos_token_id, - device=context_embedding.device, - ).long() # (B, phoneme_stacking_factor) - use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() - # print("use_bos_phoneme", use_bos_phoneme) - pred_phoneme_tokens = ( - use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens - ).long() # (B, phoneme_stacking_factor) - - # print("pred_phoneme_tokens", pred_phoneme_tokens) - gt_phoneme_idx = min(idx, gt_phoneme_tokens.size(2) - 1) - 
gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # (B, phoneme_stacking_factor) - # print("gt_phoneme_tokens_current", gt_phoneme_tokens_current) - - input_phoneme_tokens_current = ( - gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens + ( + pred_phoneme_tokens, + gt_phoneme_tokens_current, + input_phoneme_tokens_current, + input_phoneme_embedding, + ) = self._process_phoneme_predictions( + last_hidden=last_hidden, + actual_batch_size=actual_batch_size, + current_phoneme_positions=current_phoneme_positions, + gt_phoneme_tokens=gt_phoneme_tokens, + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + temperature=temperature, + topk=topk, + timestep_idx=idx, + device=device, ) - input_phoneme_embedding = self.embed_phoneme_tokens( - input_phoneme_tokens_current.unsqueeze(2) - ) # (B, phoneme_stacking_factor, E) - - use_phoneme_input = (current_phoneme_positions >= 0) * (~phoneme_stream_ended) # (B,) - use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) - zero_phoneme_embedding = torch.zeros( - actual_batch_size, self.cfg.embedding_dim, device=all_codes_next_phoneme.device - ).unsqueeze( - 1 - ) # (B, 1, E) - # phoneme_channel_input_t = phoneme_channel_input[torch.arange(actual_batch_size), current_phoneme_positions.clamp(min=0) + min_context_len, :].unsqueeze(1) # (B, 1, E) - phoneme_channel_input_t = ( - use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding + + # Compute masked phoneme channel input + phoneme_channel_input_t, use_phoneme_input = self._compute_phoneme_channel_input( + input_phoneme_embedding=input_phoneme_embedding, + current_phoneme_positions=current_phoneme_positions, + phoneme_stream_ended=phoneme_stream_ended, + actual_batch_size=actual_batch_size, + device=device, ) - # print("use_phoneme_input", use_phoneme_input) - for item_idx in range(actual_batch_size): - if 
use_phoneme_input[item_idx, 0, 0] > 0: - for phoneme_channel_idx in range(self.phoneme_stacking_factor): - _phoneme_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() - if _phoneme_token not in [ - self.phoneme_tokenizer.eos_token_id, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.pad, - ]: - pred_phoneme_token_lists[item_idx].append(_phoneme_token) - - _gt_phoneme_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() - if _gt_phoneme_token not in [ - self.phoneme_tokenizer.eos_token_id, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.pad, - ]: - gt_phoneme_token_lists[item_idx].append(_gt_phoneme_token) + # Collect phoneme tokens for logging (no printing here) + self._collect_phoneme_tokens_for_logging( + pred_phoneme_tokens=pred_phoneme_tokens, + gt_phoneme_tokens_current=gt_phoneme_tokens_current, + use_phoneme_input=use_phoneme_input, + pred_phoneme_token_lists=pred_phoneme_token_lists, + gt_phoneme_token_lists=gt_phoneme_token_lists, + batch_size=actual_batch_size, + ) + + # Check for phoneme EOS + for item_idx in range(actual_batch_size): if torch.any(input_phoneme_tokens_current[item_idx] == self.phoneme_tokenizer.eos_token_id): - print("Phoneme end detected for item {} at timestep {}".format(item_idx, idx)) + if verbose and not phoneme_stream_ended[item_idx]: + logging.info(f"Phoneme EOS detected for item {item_idx} at timestep {idx}") phoneme_stream_ended[item_idx] = True - all_codes_next_phoneme = all_codes_next_phoneme.unsqueeze(1) - # import ipdb; ipdb.set_trace() - - for item_idx in range(all_codes_next_argmax.size(0)): - if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: - pred_tokens = all_codes_next_argmax[item_idx] - pred_tokens_multinomial = audio_codes_next[item_idx] - if torch.any(pred_tokens == self.audio_eos_id) or torch.any( - pred_tokens_multinomial == self.audio_eos_id - ): - print("End detected for item {} at timestep 
{}".format(item_idx, idx)) - end_indices[item_idx] = idx - - all_predictions.append(audio_codes_next) - new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) - new_emb_unconditional = new_emb * 1 + # Check for audio EOS + self._check_eos_and_update_end_indices( + all_codes_next_argmax=all_codes_next_argmax, + audio_codes_next=audio_codes_next, + end_indices=end_indices, + context_plus_audio_lens=context_plus_audio_lens, + min_context_len=min_context_len, + idx=idx, + verbose=verbose, + ) - if current_text_input_mode == 'streaming': - _bs = context_embedding.size(0) - remaining_text_embedded_current = remaining_text_embedded[ - torch.arange(_bs), current_text_positions.clamp(min=0), : - ].unsqueeze( - 1 - ) # (B, 1, E) - new_emb = new_emb + remaining_text_embedded_current - - context_incomplete_mask = context_plus_audio_lens > idx + min_context_len # (B,) - # import ipdb; ipdb.set_trace() - # True if we have not yet reached the end of the context for this item - # import ipdb; ipdb.set_trace() - if context_incomplete_mask.any(): - # If some contexts are not yet complete. 
- context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() # (B, 1, 1) - context_embedding = context_plus_audio_embedded[ - :, min_context_len + idx : min_context_len + idx + 1, : - ] # (B, 1, E) - next_input = context_incomplete_mask * context_embedding + (1 - context_incomplete_mask) * new_emb - if phoneme_channel_input_t is not None: - next_input += phoneme_channel_input_t - if use_cfg: - next_input_unconditional = ( - context_incomplete_mask * dummy_context_embedding_unconditional - + (1 - context_incomplete_mask) * new_emb_unconditional - ) - next_input = torch.cat([next_input, next_input_unconditional], dim=0) # (2B, 1, E) - else: - next_input = new_emb - if phoneme_channel_input_t is not None: - next_input += phoneme_channel_input_t - if use_cfg: - next_input = torch.cat([next_input, new_emb_unconditional], dim=0) # (2B, 1, E) + all_predictions.append(audio_codes_next) - # Update cache_position for current step (needed for NemotronH cached forward) - cache_position = torch.tensor([current_cache_seq_len], device=context_embedding.device) + # Prepare input for next decoder step + next_input = self._prepare_next_decoder_input( + audio_codes_next=audio_codes_next, + context_plus_audio_embedded=context_plus_audio_embedded, + context_plus_audio_lens=context_plus_audio_lens, + min_context_len=min_context_len, + idx=idx, + current_text_input_mode=current_text_input_mode, + remaining_text_embedded=remaining_text_embedded, + current_text_positions=current_text_positions, + phoneme_channel_input_t=phoneme_channel_input_t, + use_cfg=use_cfg, + dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, + ) + # Forward pass for next token + cache_position = torch.tensor([current_cache_seq_len], device=device) transformer_out = self.forward( inputs_embeds=next_input, attention_mask=None, @@ -2256,43 +2465,48 @@ def infer_batch( ) last_hidden = transformer_out.last_hidden_state past_kv = transformer_out.past_key_values - - # 
Increment sequence length for next iteration current_cache_seq_len += 1 - if len(end_indices) == audio_codes_next.size(0): - print("All items finished at timestep {}".format(idx)) + + # Check if all items have finished + if len(end_indices) == actual_batch_size: + if verbose: + logging.info(f"All items finished at timestep {idx}") break - if self.phoneme_tokenizer is not None: - for item_idx in range(actual_batch_size): - print( - "Predicted phoneme tokens for item {}: {}".format(item_idx, pred_phoneme_token_lists[item_idx]) - ) - print("GT phoneme tokens for item {}: {}".format(item_idx, gt_phoneme_token_lists[item_idx])) - predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) - gt_phoneme_text = self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) - print("Predicted phoneme text for item {}: {}".format(item_idx, predicted_phoneme_text)) - print("GT phoneme text for item {}: {}".format(item_idx, gt_phoneme_text)) + # Log phoneme predictions if verbose + if verbose and self.phoneme_tokenizer is not None: + self._log_phoneme_predictions( + pred_phoneme_token_lists=pred_phoneme_token_lists, + gt_phoneme_token_lists=gt_phoneme_token_lists, + batch_size=actual_batch_size, + ) + # Post-process predictions tts_generation_time = time.time() - start_time tts_generation_time_per_frame = tts_generation_time / len(all_predictions) - pred_codes_start_indices = context_plus_audio_lens - min_context_len # (B,) + + # Calculate predicted lengths, accounting for context offset + pred_codes_start_indices = context_plus_audio_lens - min_context_len predicted_lens = [ - end_indices.get(idx, max_decoder_steps) for idx in range(context_embedding.size(0)) - ] # Ensure that the codec is atleast of length 4 - predicted_codes_lens = torch.tensor(predicted_lens, device=context_embedding.device).long() - predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices # (B,) + end_indices.get(i, max_decoder_steps) for i in 
range(actual_batch_size) + ] + predicted_codes_lens = torch.tensor(predicted_lens, device=device).long() + predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices + # Stack and slice predictions to remove context portion predicted_codes = torch.stack(all_predictions, dim=-1) # (B, num_codebooks, T) predicted_codes = self.slice_pred_embeddings( predicted_codes.permute(0, 2, 1), context_lens=pred_codes_start_indices, target_lens=predicted_codes_lens, ) - predicted_codes = predicted_codes.permute(0, 2, 1) # (B, num_codebooks, T) + predicted_codes = predicted_codes.permute(0, 2, 1) + + # Remove EOS tokens and convert codes to audio predicted_codes, predicted_codes_lens = self.remove_eos_token(predicted_codes, predicted_codes_lens) predicted_audio, predicted_audio_lens, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) + # Compute RTF metrics end_time = time.time() total_audio_duration_generated = ( predicted_audio_lens.max().item() * predicted_audio_lens.shape[0] @@ -2305,7 +2519,7 @@ def infer_batch( 'tts_generation_time': tts_generation_time, 'max_frames_generated': len(all_predictions), 'tts_generation_time_per_frame': tts_generation_time_per_frame, - 'batch_size': context_embedding.size(0), + 'batch_size': actual_batch_size, } return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics From 79457c6817fc2ebe6e11be825b9f0057c56cf7d7 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 4 Feb 2026 04:29:55 -0500 Subject: [PATCH 33/94] revert some changes and remove scripts Signed-off-by: Paarth Neekhara --- examples/tts/evalset_config.json | 48 - nemo/collections/tts/models/audio_codec.py | 16 +- .../ipa_scripts/add_ipa_to_lhotse_shards.py | 358 - .../ipa_scripts/analyze_ipa_tokenization.py | 734 -- .../ipa_scripts/cuts_dirs_config.json | 45 - .../ipa_scripts/train_ipa_bpe_tokenizer.py | 522 - ...okenizer_2048_en_de_es_fr_hi_it_vi_zh.json | 9954 ----------------- 7 files changed, 5 insertions(+), 
11672 deletions(-) delete mode 100644 scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py delete mode 100644 scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py delete mode 100644 scripts/magpietts/ipa_scripts/cuts_dirs_config.json delete mode 100644 scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py delete mode 100644 scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 49822ce9cf25..4be3056020ce 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -13,53 +13,5 @@ "manifest_path": "/home/TestData/an4_dataset/an4_val_context_v1_longform_tiny.json", "audio_dir": "/", "feature_dir": null - }, - "riva_multibpe": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/riva_hard_multi_bpe.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "riva_hard_digits": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-digits-path-corrected.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "riva_hard_letters": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-letters-path-corrected.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "riva_hard_money": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-money-path-corrected.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "riva_hard_short": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/hard-short-path-corrected.ndjson", - "audio_dir": "/Data/RIVA-TTS", - "feature_dir": "/Data/RIVA-TTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "vctk": { - "manifest_path": 
"/Data/evaluation_manifests/ipa_manifests/smallvctk__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withcontextaudiopaths_silence_trimmed.json", - "audio_dir": "/Data/VCTK-Corpus-0.92", - "feature_dir": "/Data/VCTK-Corpus-0.92", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "libritts_seen": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/LibriTTS_seen_evalset_from_testclean_v2.json", - "audio_dir": "/Data/LibriTTS", - "feature_dir": "/Data/LibriTTS", - "tokenizer_names": ["nemotron_nano_30b"] - }, - "libritts_test_clean": { - "manifest_path": "/Data/evaluation_manifests/ipa_manifests/LibriTTS_test_clean_withContextAudioPaths.jsonl", - "audio_dir": "/Data/LibriTTS", - "feature_dir": "/Data/LibriTTS", - "tokenizer_names": ["nemotron_nano_30b"] } } diff --git a/nemo/collections/tts/models/audio_codec.py b/nemo/collections/tts/models/audio_codec.py index de11bb4f9229..42b6c81f0f0b 100644 --- a/nemo/collections/tts/models/audio_codec.py +++ b/nemo/collections/tts/models/audio_codec.py @@ -110,7 +110,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.audio_decoder = instantiate(cfg.audio_decoder) # Discriminator setup - # self.discriminator = instantiate(cfg.discriminator) + self.discriminator = instantiate(cfg.discriminator) # Mel loss setup loss_resolutions = cfg.loss_resolutions @@ -182,16 +182,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.speaker_encoder = ResNetSpeakerEncoder() # load pretrained model # self.speaker_encoder.load_checkpoint("https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar") - import os - - # TODO: revert this - if os.path.exists("/gitrepos/checkpoints/pytorch_model.bin"): - self.speaker_encoder.load_checkpoint("/gitrepos/checkpoints/pytorch_model.bin", strict=False) - else: - self.speaker_encoder.load_checkpoint( - "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", - 
strict=False, - ) + self.speaker_encoder.load_checkpoint( + "https://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.bin", + strict=False, + ) # freeze the pretrained speaker encoder self.speaker_encoder.freeze() logging.info("Speaker encoder loaded and frozen !!") diff --git a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py b/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py deleted file mode 100644 index 10972d1bdc6a..000000000000 --- a/scripts/magpietts/ipa_scripts/add_ipa_to_lhotse_shards.py +++ /dev/null @@ -1,358 +0,0 @@ -#!/usr/bin/env python3 -""" -Add IPA strings (from espeak/espeak-ng) to Lhotse cuts jsonl.gz shards. - -For each cuts directory like: - /Data/.../de/.../cuts -creates: - /Data/.../de/.../cuts_with_ipa -and writes corresponding cuts.000000.jsonl.gz, etc. with an added IPA field. - -IPA is added to each supervision under: - cut["supervisions"][i]["custom"]["ipa"] - -Usage: - python add_ipa_to_cuts.py --lang de - python add_ipa_to_cuts.py --lang all # run all languages - -Edit the `CUTS_DIRS_BY_LANG` dict below (or replace with argparse/config as desired). 
-""" - -from __future__ import annotations - -import argparse -import concurrent.futures as cf -import gzip -import json -import os -import re -import shutil -import subprocess -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Iterable, List, Optional, Tuple - -# ------------------------- -# USER CONFIG -# ------------------------- - -# Default config file path (same directory as this script) -DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" - - -def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: - """Load CUTS_DIRS_BY_LANG from a JSON config file.""" - if config_path is None: - config_path = DEFAULT_CONFIG_PATH - - if not config_path.exists(): - raise FileNotFoundError(f"Config file not found: {config_path}") - - with open(config_path, "r", encoding="utf-8") as f: - return json.load(f) - - -# Map your dataset language keys to espeak voice codes (adjust as needed). -# For German, espeak-ng uses "de" typically. -ESPEAK_VOICE_BY_LANG: Dict[str, str] = { - "de": "de", - "en": "en", - "es": "es", - "fr": "fr", - "hi": "hi", - "it": "it", - "vi": "vi", - "zh": "zh", - "ru": "ru", - "ja": "ja", - "ko": "ko", - "ar": "ar", - "he": "he", - "nl": "nl", - "pl": "pl", - "pt": "pt", -} - -OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa -SHARD_GLOB = "cuts.*.jsonl.gz" - -# Parallelism -MAX_WORKERS = max(1, (os.cpu_count() or 4) - 1) -# MAX_WORKERS = 8 - -# If True, skip writing if output shard exists (basic resume) -SKIP_EXISTING_OUTPUT_SHARDS = False -# ------------------------- -# IMPLEMENTATION -# ------------------------- - -IPA_FLAG = "--ipa" # espeak-ng uses --ipa, espeak supports --ipa in many builds -# Use --quiet if available; safe to try. -COMMON_FLAGS = ["-q"] - -# Some espeak builds output extra spaces/newlines; we normalize. 
-_WS_RE = re.compile(r"\s+") - - -def _find_espeak_binary() -> str: - """Prefer espeak-ng if present, else espeak.""" - for exe in ("espeak-ng", "espeak"): - if shutil.which(exe): - return exe - raise RuntimeError( - "Neither 'espeak-ng' nor 'espeak' was found on PATH. " "Install espeak-ng (recommended) or espeak." - ) - - -@dataclass(frozen=True) -class EspeakRunner: - exe: str - voice: str - - def text_to_ipa(self, text: str) -> str: - """ - Convert text -> IPA using espeak/espeak-ng. - """ - # Note: We pass text via stdin to avoid shell escaping issues. - cmd = [self.exe, "-v", self.voice, IPA_FLAG] + COMMON_FLAGS - try: - proc = subprocess.run( - cmd, - input=text.encode("utf-8"), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=False, - ) - except Exception as e: - raise RuntimeError(f"Failed to run {cmd}: {e}") from e - - if proc.returncode != 0: - raise RuntimeError( - f"espeak command failed (rc={proc.returncode})\n" - f"cmd: {' '.join(cmd)}\n" - f"stderr: {proc.stderr.decode('utf-8', errors='replace')}" - ) - - out = proc.stdout.decode("utf-8", errors="replace").strip() - # Normalize whitespace to single spaces - out = _WS_RE.sub(" ", out).strip() - return out - - -def iter_shards(cuts_dir: Path) -> List[Path]: - return sorted(cuts_dir.glob(SHARD_GLOB)) - - -def derive_output_dir(cuts_dir: Path) -> Path: - # If dir name ends with "cuts", produce "cuts_with_ipa". - # Otherwise append suffix to the directory name. - name = cuts_dir.name - if name == "cuts": - out_name = f"cuts{OUTPUT_SUFFIX}" - else: - out_name = f"{name}{OUTPUT_SUFFIX}" - return cuts_dir.parent / out_name - - -def load_json_line(line: str) -> dict: - return json.loads(line) - - -def dump_json_line(obj: dict) -> str: - # compact, consistent output - return json.dumps(obj, ensure_ascii=False) - - -class IPACache: - """ - Process-local cache. Speeds up repeated identical texts. 
- """ - - def __init__(self) -> None: - self._cache: Dict[Tuple[str, str], str] = {} - - def get(self, voice: str, text: str) -> Optional[str]: - return self._cache.get((voice, text)) - - def set(self, voice: str, text: str, ipa: str) -> None: - self._cache[(voice, text)] = ipa - - -def add_ipa_to_cut( - cut: dict, - espeak: EspeakRunner, - cache: IPACache, -) -> dict: - """ - Adds IPA to each supervision custom field: custom["ipa"]. - Uses supervision["custom"]["normalized_text"] if available, otherwise supervision["text"] as source text. - For Vietnamese (vi), uses original_text and updates text/normalized_text fields. - """ - sups = cut.get("supervisions") or [] - is_vietnamese = espeak.voice == "vi" - for sup in sups: - custom = sup.get("custom") - if custom is None: - custom = {} - sup["custom"] = custom - - # For Vietnamese, use original_text and fix the text fields - if is_vietnamese and custom.get("original_text"): - text = custom["original_text"] - sup["text"] = text - custom["normalized_text"] = text - else: - text = custom.get("normalized_text") or sup.get("text") - - if not text: - continue - - # If already has IPA, keep it - if "ipa" in custom and isinstance(custom["ipa"], str) and custom["ipa"].strip(): - continue - - cached = cache.get(espeak.voice, text) - if cached is None: - cached = espeak.text_to_ipa(text) - cache.set(espeak.voice, text, cached) - - custom["ipa"] = cached - - return cut - - -def process_shard( - shard_path: Path, - out_shard_path: Path, - espeak: EspeakRunner, -) -> Tuple[Path, int]: - """ - Read shard jsonl.gz, add IPA, write out shard jsonl.gz - Returns: (out_shard_path, num_lines) - """ - cache = IPACache() - n = 0 - - with ( - gzip.open(shard_path, "rt", encoding="utf-8") as fin, - gzip.open(out_shard_path, "wt", encoding="utf-8") as fout, - ): - for line in fin: - line = line.strip() - if not line: - continue - cut = load_json_line(line) - cut = add_ipa_to_cut(cut, espeak=espeak, cache=cache) - 
fout.write(dump_json_line(cut)) - fout.write("\n") - n += 1 - - return out_shard_path, n - - -def process_cuts_dir(lang: str, cuts_dir: Path) -> None: - voice = ESPEAK_VOICE_BY_LANG.get(lang, lang) - exe = _find_espeak_binary() - espeak = EspeakRunner(exe=exe, voice=voice) - - out_dir = derive_output_dir(cuts_dir) - out_dir.mkdir(parents=True, exist_ok=True) - - shards = iter_shards(cuts_dir) - if not shards: - print(f"[WARN] No shards matched {SHARD_GLOB} in {cuts_dir}", file=sys.stderr) - return - - print(f"[INFO] {lang}: {cuts_dir} -> {out_dir} (shards={len(shards)})") - - jobs: List[Tuple[Path, Path]] = [] - for shard in shards: - out_shard = out_dir / shard.name - if SKIP_EXISTING_OUTPUT_SHARDS and out_shard.exists(): - continue - jobs.append((shard, out_shard)) - - if not jobs: - print(f"[INFO] {lang}: nothing to do in {cuts_dir} (all outputs exist).") - return - - # Parallelize per shard - with cf.ProcessPoolExecutor(max_workers=MAX_WORKERS) as ex: - futures = [] - for shard, out_shard in jobs: - futures.append(ex.submit(_process_shard_worker, shard, out_shard, espeak.exe, espeak.voice)) - - for fut in cf.as_completed(futures): - out_shard_path, n = fut.result() - print(f"[OK] wrote {out_shard_path} (lines={n})") - - -def _process_shard_worker(shard: Path, out_shard: Path, exe: str, voice: str) -> Tuple[Path, int]: - # Re-create runner in worker process - espeak = EspeakRunner(exe=exe, voice=voice) - return process_shard(shard, out_shard, espeak) - - -def get_available_languages(cuts_dirs: Dict[str, List[str]]) -> List[str]: - """Return list of all available language codes.""" - return list(cuts_dirs.keys()) - - -def process_language(lang: str, cuts_dirs: Dict[str, List[str]]) -> bool: - """ - Process all directories for a given language. - Returns True if successful, False if there was an issue. 
- """ - if lang not in cuts_dirs: - print(f"[ERROR] Unknown language: {lang}", file=sys.stderr) - print(f"[ERROR] Available languages: {get_available_languages(cuts_dirs)}", file=sys.stderr) - return False - - dirs = cuts_dirs[lang] - for d in dirs: - cuts_dir = Path(d) - if not cuts_dir.exists(): - print(f"[WARN] missing dir: {cuts_dir}", file=sys.stderr) - continue - process_cuts_dir(lang, cuts_dir) - - return True - - -def main() -> None: - parser = argparse.ArgumentParser(description="Add IPA strings to Lhotse cuts jsonl.gz shards.") - parser.add_argument( - "--lang", - type=str, - required=True, - help="Language code to process (e.g., 'de', 'en', 'fr') or 'all' for all languages.", - ) - parser.add_argument( - "--config", - type=str, - default=None, - help=f"Path to JSON config file with cuts directories. Default: {DEFAULT_CONFIG_PATH}", - ) - args = parser.parse_args() - - # Load config - config_path = Path(args.config) if args.config else None - cuts_dirs = load_cuts_dirs_config(config_path) - print(f"[INFO] Loaded config with languages: {get_available_languages(cuts_dirs)}") - - if args.lang == "all": - # Process all languages - for lang in cuts_dirs.keys(): - print(f"\n{'='*60}") - print(f"[INFO] Processing language: {lang}") - print(f"{'='*60}") - process_language(lang, cuts_dirs) - else: - success = process_language(args.lang, cuts_dirs) - if not success: - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py b/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py deleted file mode 100644 index e2d53c3099d3..000000000000 --- a/scripts/magpietts/ipa_scripts/analyze_ipa_tokenization.py +++ /dev/null @@ -1,734 +0,0 @@ -#!/usr/bin/env python3 -""" -Analyze and compare tokenization (tokens per second of audio) between: -1. Qwen/Qwen2.5-1.5B-Instruct tokenizer on raw text -2. NVIDIA Nemotron Nano 30B tokenizer on raw text -3. 
IPABPETokenizer on phonemized IPA text at different vocab sizes - -This script: -1. Creates a balanced IPA corpus (equal samples per language) from train_langs -2. Trains IPA BPE tokenizers at vocab sizes 512, 1024, 2048, 4096 -3. For each test language, samples text pairs from cuts_with_ipa directories -4. Computes tokens per second (tokens / audio duration) for each tokenizer -5. Outputs comparison statistics showing tokens/second for each tokenizer - -Features: -- Reads data once and reuses across all vocab sizes (efficient) -- Balances training data across languages (uses min count across all train langs) -- Supports separate train and test language sets -- Computes tokens per second using audio duration from cuts - -Usage: - # Train and test on all languages - python analyze_ipa_tokenization.py --output_dir /path/to/output - - # Train on en,de,fr but test on all languages - python analyze_ipa_tokenization.py --output_dir /path/to/output --train_langs en,de,fr --test_langs all - - # Train on all, test on specific languages - python analyze_ipa_tokenization.py --output_dir /path/to/output --train_langs all --test_langs en,zh - - # Cap training samples per language - python analyze_ipa_tokenization.py --output_dir /path/to/output --max_samples_per_lang 50000 -""" - -import argparse -import gzip -import json -import os -import random -import sys -from collections import defaultdict -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Generator, List, Optional, Tuple - -import numpy as np -from tokenizers import Tokenizer -from tokenizers.models import BPE -from tokenizers.pre_tokenizers import ByteLevel -from tokenizers.trainers import BpeTrainer -from transformers import AutoTokenizer - -# ------------------------- -# CONFIGURATION -# ------------------------- - -VOCAB_SIZES = [512, 1024, 2048, 4096] - -# Default config file path (same directory as this script) -DEFAULT_CONFIG_PATH = Path(__file__).parent / 
"cuts_dirs_config.json" - - -def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: - """Load CUTS_DIRS_BY_LANG from a JSON config file.""" - if config_path is None: - config_path = DEFAULT_CONFIG_PATH - - if not config_path.exists(): - raise FileNotFoundError(f"Config file not found: {config_path}") - - with open(config_path, "r", encoding="utf-8") as f: - return json.load(f) - - -OUTPUT_SUFFIX = "_with_ipa" -SHARD_GLOB = "cuts.*.jsonl.gz" - - -@dataclass -class TextPair: - """A pair of raw text and its IPA phonemization with audio duration.""" - - raw_text: str - ipa_text: str - lang: str - duration: float # audio duration in seconds - - -@dataclass -class TokenizationStats: - """Statistics for tokenization comparison (tokens per second).""" - - lang: str - num_samples: int - total_duration: float # sum of all durations in seconds - qwen_tokens_per_second: float - nemotron_tokens_per_second: float - ipa_tokens_per_second: Dict[int, float] # vocab_size -> tokens/sec - - -def get_ipa_dir(cuts_dir: Path) -> Path: - """Convert a cuts directory path to its corresponding cuts_with_ipa path.""" - name = cuts_dir.name - if name == "cuts": - out_name = f"cuts{OUTPUT_SUFFIX}" - else: - out_name = f"{name}{OUTPUT_SUFFIX}" - return cuts_dir.parent / out_name - - -def iter_shards(ipa_dir: Path) -> List[Path]: - """Get all shard files in a directory.""" - return sorted(ipa_dir.glob(SHARD_GLOB)) - - -def extract_text_pairs_from_shard(shard_path: Path, lang: str) -> Generator[TextPair, None, None]: - """ - Extract text pairs (raw text + IPA) from a single shard file. 
- - Yields: - TextPair objects with raw_text, ipa_text, and duration - """ - with gzip.open(shard_path, "rt", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - cut = json.loads(line) - # Get duration from the top-level cut object - duration = cut.get("duration", 0.0) - supervisions = cut.get("supervisions", []) - for sup in supervisions: - custom = sup.get("custom", {}) - ipa = custom.get("ipa") - # Get raw text - prefer normalized_text, fallback to text - raw_text = custom.get("normalized_text") or sup.get("text") - - if ipa and raw_text and isinstance(ipa, str) and isinstance(raw_text, str): - ipa = ipa.strip() - raw_text = raw_text.strip() - if ipa and raw_text and duration > 0: - yield TextPair(raw_text=raw_text, ipa_text=ipa, lang=lang, duration=duration) - except json.JSONDecodeError: - continue - - -def sample_text_pairs( - lang: str, - cuts_dirs: Dict[str, List[str]], - num_samples: int = 1000, - seed: int = 42, -) -> List[TextPair]: - """ - Sample text pairs from a language's cuts_with_ipa directories. 
- - Args: - lang: Language code - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - num_samples: Number of samples to collect - seed: Random seed for reproducibility - - Returns: - List of TextPair objects - """ - random.seed(seed) - - if lang not in cuts_dirs: - raise ValueError(f"Unknown language: {lang}") - - # Collect all text pairs from all directories - all_pairs = [] - for cuts_dir_str in cuts_dirs[lang]: - cuts_dir = Path(cuts_dir_str) - ipa_dir = get_ipa_dir(cuts_dir) - - if not ipa_dir.exists(): - print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) - continue - - shards = iter_shards(ipa_dir) - for shard in shards: - for pair in extract_text_pairs_from_shard(shard, lang): - all_pairs.append(pair) - # Early exit if we have way more than needed - if len(all_pairs) >= num_samples * 10: - break - if len(all_pairs) >= num_samples * 10: - break - if len(all_pairs) >= num_samples * 10: - break - - # Sample - if len(all_pairs) <= num_samples: - print(f"[INFO] {lang}: Only found {len(all_pairs)} pairs, using all") - return all_pairs - - return random.sample(all_pairs, num_samples) - - -def iter_ipa_strings_for_lang( - lang: str, - cuts_dirs: Dict[str, List[str]], -) -> Generator[str, None, None]: - """Iterate over all IPA strings for a single language (memory-efficient).""" - if lang not in cuts_dirs: - return - - for cuts_dir_str in cuts_dirs[lang]: - cuts_dir = Path(cuts_dir_str) - ipa_dir = get_ipa_dir(cuts_dir) - - if not ipa_dir.exists(): - continue - - shards = iter_shards(ipa_dir) - for shard in shards: - with gzip.open(shard, "rt", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - cut = json.loads(line) - for sup in cut.get("supervisions", []): - ipa = sup.get("custom", {}).get("ipa") - if ipa and isinstance(ipa, str) and ipa.strip(): - yield ipa.strip() - except json.JSONDecodeError: - continue - - -def count_ipa_strings_for_lang(lang: str, cuts_dirs: Dict[str, 
List[str]], max_count: int = 100000) -> int: - """Count IPA strings for a language without loading into memory.""" - count = 0 - for _ in iter_ipa_strings_for_lang(lang, cuts_dirs): - count += 1 - if count >= max_count: - break - return count - - -def simple_sample_ipa_strings( - lang: str, - cuts_dirs: Dict[str, List[str]], - k: int, - max_collect: int = 100000, - seed: int = 42, -) -> List[str]: - """ - Simple sampling: collect up to max_collect IPA strings, then randomly sample k. - - This avoids reading through all data like reservoir sampling does. - - Args: - lang: Language code - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - k: Number of samples to select - max_collect: Maximum number of strings to collect before sampling - seed: Random seed for reproducibility - - Returns: - List of up to k sampled IPA strings - """ - rng = random.Random(seed) - collected: List[str] = [] - - for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): - collected.append(ipa) - if len(collected) >= max_collect: - break - - # If we have fewer than k, return all - if len(collected) <= k: - return collected - - # Otherwise, randomly sample k - return rng.sample(collected, k) - - -def create_balanced_corpus( - train_langs: List[str], - cuts_dirs: Dict[str, List[str]], - output_file: str, - max_samples_per_lang: Optional[int] = None, - max_count_per_lang: int = 100000, - seed: int = 42, -) -> Tuple[str, Dict[str, int]]: - """ - Create a balanced IPA corpus file with equal samples from each language. - - Uses a memory-efficient two-pass approach: - 1. First pass: Count sentences per language (up to max_count_per_lang) - 2. 
Second pass: Use simple sampling to select samples - - Args: - train_langs: List of language codes to include - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - output_file: Path to write the balanced corpus - max_samples_per_lang: Optional cap on samples per language - max_count_per_lang: Max count per language when counting IPA strings - seed: Random seed for reproducibility - - Returns: - Tuple of (corpus_file_path, dict of lang -> actual_count) - """ - # First pass: Count sentences per language - print("[INFO] Pass 1: Counting IPA strings per language...") - lang_counts: Dict[str, int] = {} - - for lang in train_langs: - if lang not in cuts_dirs: - print(f"[WARN] Language {lang} not in config, skipping") - continue - print(f"[INFO] Counting {lang}...", end=" ", flush=True) - count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) - lang_counts[lang] = count - print(f"{count} IPA strings") - - if not lang_counts: - raise ValueError("No IPA strings found for any language") - - # Find minimum count across languages - min_count = min(lang_counts.values()) - print(f"[INFO] Minimum count across languages: {min_count}") - - # Apply max_samples_per_lang cap if specified - samples_per_lang = min_count - if max_samples_per_lang is not None and max_samples_per_lang < min_count: - samples_per_lang = max_samples_per_lang - print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") - - # Second pass: Sample from each language using simple sampling - print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") - actual_counts: Dict[str, int] = {} - total_written = 0 - - with open(output_file, "w", encoding="utf-8") as f: - for lang in lang_counts.keys(): - print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) - # Use different seed per language for variety, but reproducible - lang_seed = seed + hash(lang) % 10000 - sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, 
max_count_per_lang, lang_seed) - - for ipa in sampled: - f.write(ipa + "\n") - total_written += 1 - - actual_counts[lang] = len(sampled) - print(f"sampled {len(sampled)} strings") - - print(f"[INFO] Total IPA strings written to corpus: {total_written}") - print(f"[INFO] Balanced corpus saved to: {output_file}") - - return output_file, actual_counts - - -def train_ipa_bpe_tokenizer( - output_dir: str, - vocab_size: int, - corpus_file: str, - min_frequency: int = 2, -) -> Tokenizer: - """ - Train a byte-level BPE tokenizer on IPA strings from a pre-built corpus file. - - Args: - output_dir: Directory to save tokenizer files - vocab_size: Target vocabulary size - corpus_file: Path to the IPA corpus file (one IPA string per line) - min_frequency: Minimum frequency for a token to be included - - Returns: - Trained Tokenizer object - """ - tokenizer_dir = os.path.join(output_dir, f"ipa_bpe_v{vocab_size}") - os.makedirs(tokenizer_dir, exist_ok=True) - - tokenizer_file = os.path.join(tokenizer_dir, "tokenizer.json") - - # Check if already trained - if os.path.exists(tokenizer_file): - print(f"[INFO] Loading existing tokenizer from {tokenizer_file}") - return Tokenizer.from_file(tokenizer_file) - - # Initialize tokenizer - tokenizer = Tokenizer(BPE(unk_token="")) - tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) - - special_tokens = ["", "", ""] - - trainer = BpeTrainer( - vocab_size=vocab_size, - min_frequency=min_frequency, - special_tokens=special_tokens, - show_progress=True, - ) - - print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}...") - tokenizer.train(files=[corpus_file], trainer=trainer) - - # Save - tokenizer.save(tokenizer_file) - tokenizer.model.save(tokenizer_dir) - - print(f"[INFO] Saved tokenizer to {tokenizer_dir}") - - return tokenizer - - -def compute_stats( - text_pairs: List[TextPair], - qwen_tokenizer: AutoTokenizer, - nemotron_tokenizer: AutoTokenizer, - ipa_tokenizers: Dict[int, Tokenizer], - lang: str, -) -> 
TokenizationStats: - """ - Compute tokenization statistics (tokens per second) for a set of text pairs. - """ - qwen_counts = [] - nemotron_counts = [] - ipa_counts = {vs: [] for vs in ipa_tokenizers.keys()} - - for pair in text_pairs: - # Qwen tokenizer on raw text - qwen_tokens = qwen_tokenizer.encode(pair.raw_text) - qwen_counts.append(len(qwen_tokens)) - - # Nemotron tokenizer on raw text - nemotron_tokens = nemotron_tokenizer.encode(pair.raw_text) - nemotron_counts.append(len(nemotron_tokens)) - - # IPA tokenizers on IPA text - for vocab_size, tokenizer in ipa_tokenizers.items(): - ipa_tokens = tokenizer.encode(pair.ipa_text) - ipa_counts[vocab_size].append(len(ipa_tokens.ids)) - - # Calculate total duration and token counts - total_duration = sum(pair.duration for pair in text_pairs) - qwen_total = sum(qwen_counts) - nemotron_total = sum(nemotron_counts) - - # Compute tokens per second - qwen_tps = qwen_total / total_duration if total_duration > 0 else 0.0 - nemotron_tps = nemotron_total / total_duration if total_duration > 0 else 0.0 - - ipa_tps = {} - for vocab_size in ipa_tokenizers.keys(): - ipa_total = sum(ipa_counts[vocab_size]) - ipa_tps[vocab_size] = ipa_total / total_duration if total_duration > 0 else 0.0 - - return TokenizationStats( - lang=lang, - num_samples=len(text_pairs), - total_duration=total_duration, - qwen_tokens_per_second=qwen_tps, - nemotron_tokens_per_second=nemotron_tps, - ipa_tokens_per_second=ipa_tps, - ) - - -def print_stats_table(all_stats: List[TokenizationStats], vocab_sizes: List[int]): - """Print a formatted table of tokens per second statistics.""" - print("\n" + "=" * 120) - print("TOKENS PER SECOND: Qwen2.5-1.5B-Instruct & Nemotron Nano 30B (raw text) vs IPA BPE (phonemized)") - print("=" * 120) - - # Header - header = f"{'Lang':<6} {'Samples':>8} {'Duration(s)':>12} {'Qwen tok/s':>12} {'Nemo tok/s':>12}" - for vs in vocab_sizes: - header += f" {'IPA-' + str(vs):>10}" - print(header) - print("-" * 120) - - # Data rows - 
for stats in all_stats: - row = f"{stats.lang:<6} {stats.num_samples:>8} {stats.total_duration:>12.2f} {stats.qwen_tokens_per_second:>12.2f} {stats.nemotron_tokens_per_second:>12.2f}" - for vs in vocab_sizes: - row += f" {stats.ipa_tokens_per_second[vs]:>10.2f}" - print(row) - - # Aggregated stats - print("-" * 120) - total_samples = sum(s.num_samples for s in all_stats) - total_duration = sum(s.total_duration for s in all_stats) - - # Compute overall tokens per second (weighted by duration) - total_qwen_tokens = sum(s.qwen_tokens_per_second * s.total_duration for s in all_stats) - total_nemotron_tokens = sum(s.nemotron_tokens_per_second * s.total_duration for s in all_stats) - overall_qwen_tps = total_qwen_tokens / total_duration if total_duration > 0 else 0 - overall_nemotron_tps = total_nemotron_tokens / total_duration if total_duration > 0 else 0 - - agg_row = f"{'TOTAL':<6} {total_samples:>8} {total_duration:>12.2f} {overall_qwen_tps:>12.2f} {overall_nemotron_tps:>12.2f}" - for vs in vocab_sizes: - total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) - overall_ipa_tps = total_ipa_tokens / total_duration if total_duration > 0 else 0 - agg_row += f" {overall_ipa_tps:>10.2f}" - print(agg_row) - print("=" * 120) - - # Summary - print("\nSUMMARY:") - print(f" - Total samples analyzed: {total_samples}") - print(f" - Total audio duration: {total_duration:.2f} seconds ({total_duration/3600:.2f} hours)") - print(f" - Qwen tokens/second: {overall_qwen_tps:.2f}") - print(f" - Nemotron tokens/second: {overall_nemotron_tps:.2f}") - for vs in vocab_sizes: - total_ipa_tokens = sum(s.ipa_tokens_per_second[vs] * s.total_duration for s in all_stats) - overall_ipa_tps = total_ipa_tokens / total_duration if total_duration > 0 else 0 - print(f" - IPA-{vs} tokens/second: {overall_ipa_tps:.2f}") - print() - - -def save_results_json( - all_stats: List[TokenizationStats], - output_path: str, - train_langs: Optional[List[str]] = None, - test_langs: 
Optional[List[str]] = None, -): - """Save results to JSON file with metadata.""" - output = { - "metadata": { - "train_langs": train_langs or [], - "test_langs": test_langs or [], - }, - "results": [], - } - - for stats in all_stats: - output["results"].append( - { - "lang": stats.lang, - "num_samples": stats.num_samples, - "total_duration_seconds": stats.total_duration, - "qwen_tokens_per_second": stats.qwen_tokens_per_second, - "nemotron_tokens_per_second": stats.nemotron_tokens_per_second, - "ipa_tokens_per_second": { - str(vs): stats.ipa_tokens_per_second[vs] for vs in stats.ipa_tokens_per_second.keys() - }, - } - ) - - with open(output_path, "w", encoding="utf-8") as f: - json.dump(output, f, indent=2) - print(f"[INFO] Saved results to {output_path}") - - -def parse_lang_arg(arg: str, available_langs: List[str]) -> List[str]: - """Parse a language argument (comma-separated or 'all').""" - if arg == "all": - return available_langs - langs = [l.strip() for l in arg.split(",") if l.strip()] - # Validate languages - for lang in langs: - if lang not in available_langs: - raise ValueError(f"Unknown language: {lang}. 
Available: {available_langs}") - return langs - - -def main(): - parser = argparse.ArgumentParser(description="Compare tokenization between Qwen and IPA BPE tokenizers.") - parser.add_argument( - "--output_dir", - type=str, - required=True, - help="Directory to save tokenizers and results", - ) - parser.add_argument( - "--samples_per_lang", - type=int, - default=1000, - help="Number of samples per language for testing (default: 1000)", - ) - parser.add_argument( - "--train_langs", - type=str, - default="all", - help="Comma-separated languages for training tokenizer, or 'all' (default: all)", - ) - parser.add_argument( - "--test_langs", - type=str, - default="all", - help="Comma-separated languages for testing/analysis, or 'all' (default: all)", - ) - parser.add_argument( - "--max_samples_per_lang", - type=int, - default=None, - help="Optional cap on training samples per language (default: use min count across langs)", - ) - parser.add_argument( - "--seed", - type=int, - default=42, - help="Random seed for sampling (default: 42)", - ) - parser.add_argument( - "--config", - type=str, - default=None, - help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}", - ) - parser.add_argument( - "--max_count_per_lang", - type=int, - default=100000, - help="Max count per language when counting IPA strings (default: 100000)", - ) - args = parser.parse_args() - - os.makedirs(args.output_dir, exist_ok=True) - - # Load config - config_path = Path(args.config) if args.config else None - cuts_dirs = load_cuts_dirs_config(config_path) - available_langs = list(cuts_dirs.keys()) - print(f"[INFO] Loaded config with languages: {available_langs}") - - # Parse train and test languages - try: - train_langs = parse_lang_arg(args.train_langs, available_langs) - test_langs = parse_lang_arg(args.test_langs, available_langs) - except ValueError as e: - print(f"[ERROR] {e}") - sys.exit(1) - - print(f"[INFO] Training languages: {train_langs}") - print(f"[INFO] Testing languages: {test_langs}") - print(f"[INFO] Samples per language for testing: {args.samples_per_lang}") - print(f"[INFO] Max samples per language for training: {args.max_samples_per_lang or 'auto (min across langs)'}") - print(f"[INFO] Vocab sizes: {VOCAB_SIZES}") - - # Step 1: Create balanced IPA corpus once - print("\n" + "=" * 60) - print("STEP 1: Creating balanced IPA corpus") - print("=" * 60) - - corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") - - # Check if corpus already exists - if os.path.exists(corpus_file): - print(f"[INFO] Using existing corpus file: {corpus_file}") - with open(corpus_file, "r", encoding="utf-8") as f: - line_count = sum(1 for _ in f) - print(f"[INFO] Corpus contains {line_count} IPA strings") - else: - corpus_file, lang_counts = create_balanced_corpus( - train_langs=train_langs, - cuts_dirs=cuts_dirs, - output_file=corpus_file, - max_samples_per_lang=args.max_samples_per_lang, - max_count_per_lang=args.max_count_per_lang, - seed=args.seed, - ) - - # Step 2: Train IPA BPE tokenizers at different vocab sizes (reusing corpus) - print("\n" + "=" * 60) - print("STEP 2: Training IPA BPE tokenizers") - 
print("=" * 60) - - ipa_tokenizers = {} - for vocab_size in VOCAB_SIZES: - print(f"\n[INFO] Training tokenizer with vocab_size={vocab_size}") - ipa_tokenizers[vocab_size] = train_ipa_bpe_tokenizer( - output_dir=args.output_dir, - vocab_size=vocab_size, - corpus_file=corpus_file, - min_frequency=2, - ) - - # Step 3: Load Qwen and Nemotron tokenizers - print("\n" + "=" * 60) - print("STEP 3: Loading Qwen and Nemotron tokenizers") - print("=" * 60) - - print("[INFO] Loading Qwen/Qwen2.5-1.5B-Instruct tokenizer...") - qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") - print(f"[INFO] Qwen tokenizer vocab size: {qwen_tokenizer.vocab_size}") - - print("[INFO] Loading nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 tokenizer...") - - nemotron_tokenizer = AutoTokenizer.from_pretrained( - "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", trust_remote_code=True - ) - - print(f"[INFO] Nemotron tokenizer vocab size: {nemotron_tokenizer.vocab_size}") - - # Step 4: Sample text pairs and compute statistics (on test languages) - print("\n" + "=" * 60) - print("STEP 4: Sampling and analyzing (test languages)") - print("=" * 60) - - all_stats = [] - for lang in test_langs: - print(f"\n[INFO] Processing language: {lang}") - - # Sample text pairs - text_pairs = sample_text_pairs(lang, cuts_dirs, args.samples_per_lang, args.seed) - - if not text_pairs: - print(f"[WARN] No text pairs found for {lang}, skipping") - continue - - print(f"[INFO] Sampled {len(text_pairs)} text pairs for {lang}") - - # Compute stats - stats = compute_stats(text_pairs, qwen_tokenizer, nemotron_tokenizer, ipa_tokenizers, lang) - all_stats.append(stats) - - # Print intermediate results - print( - f"[INFO] {lang}: duration={stats.total_duration:.2f}s, Qwen={stats.qwen_tokens_per_second:.2f} tok/s, Nemotron={stats.nemotron_tokens_per_second:.2f} tok/s" - ) - for vs in VOCAB_SIZES: - print(f" IPA-{vs}={stats.ipa_tokens_per_second[vs]:.2f} tok/s") - - # Step 5: Print and save results - print("\n" + 
"=" * 60) - print("STEP 5: Results") - print("=" * 60) - - print_stats_table(all_stats, VOCAB_SIZES) - - # Save to JSON with metadata - results_path = os.path.join(args.output_dir, "tokenization_comparison.json") - save_results_json(all_stats, results_path, train_langs, test_langs) - - print("[INFO] Done!") - - -if __name__ == "__main__": - main() diff --git a/scripts/magpietts/ipa_scripts/cuts_dirs_config.json b/scripts/magpietts/ipa_scripts/cuts_dirs_config.json deleted file mode 100644 index 8785de53211e..000000000000 --- a/scripts/magpietts/ipa_scripts/cuts_dirs_config.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "de": ["/Data/tts_lhotse_datasets/speech_data/de/cmltts_de_train/cuts"], - "es": [ - "/Data/tts_lhotse_datasets/speech_data/es/cmltts_es_train/cuts", - "/Data/tts_lhotse_datasets/speech_data/es/riva_ES_RubbyCarlos/cuts", - "/Data/tts_lhotse_datasets/speech_data/es/riva_ES_RubbyCarlos/cuts_textContext" - ], - "fr": [ - "/Data/tts_lhotse_datasets/speech_data/fr/cmltts_fr_train/cuts", - "/Data/tts_lhotse_datasets/speech_data/fr/riva_FR_VirginieSamy/cuts", - "/Data/tts_lhotse_datasets/speech_data/fr/riva_FR_VirginieSamy/cuts_textContext" - ], - "hi": [ - "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi/filter_1/cuts", - "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi/filter_2/cuts", - "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi_2/filter_1/cuts", - "/Data/tts_lhotse_datasets/speech_data/hi/nvyt_hi_2/filter_2/cuts" - ], - "it": ["/Data/tts_lhotse_datasets/speech_data/it/cmltts_it_train/cuts"], - "vi": [ - "/Data/tts_lhotse_datasets/speech_data/vi/Infore1_2_lsvsc/cuts", - "/Data/tts_lhotse_datasets/speech_data/vi/Long_ContextAudio/cuts", - "/Data/tts_lhotse_datasets/speech_data/vi/Long_ContextAudio/cuts_textContext", - "/Data/tts_lhotse_datasets/speech_data/vi/NorthFemale/cuts", - "/Data/tts_lhotse_datasets/speech_data/vi/nvyt_vi/nvyt_yt2025/cuts" - ], - "zh": [ - "/Data/tts_lhotse_datasets/speech_data/zh/riva_ZH_SiweiHouZhen/cuts", - 
"/Data/tts_lhotse_datasets/speech_data/zh/riva_ZH_SiweiHouZhen/cuts_textContext", - "/Data/tts_lhotse_datasets/speech_data/zh/nvyt_zh/filter_1/cuts", - "/Data/tts_lhotse_datasets/speech_data/zh/nvyt_zh/filter_2/cuts" - ], - "en": [ - "/Data/tts_lhotse_datasets/speech_data/en/nvyt2505/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/hifitts/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/hifitts2/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/jhsdGtc20Amp20Keynote/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/libritts/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/rivaLindyRodney/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/rivaLindyRodney/lhotse_shar_shuffle_shardSize256/cuts_textContext", - "/Data/tts_lhotse_datasets/speech_data/en/rivaEmmaMeganSeanTom/lhotse_shar_shuffle_shardSize256/cuts", - "/Data/tts_lhotse_datasets/speech_data/en/rivaEmmaMeganSeanTom/lhotse_shar_shuffle_shardSize256/cuts_textContext", - "/Data/tts_lhotse_datasets/speech_data/en/jhsdGtc20Amp20Keynote/lhotse_shar_shuffle_shardSize256/cuts_textContext" - ] -} diff --git a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py b/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py deleted file mode 100644 index 825129d2c928..000000000000 --- a/scripts/magpietts/ipa_scripts/train_ipa_bpe_tokenizer.py +++ /dev/null @@ -1,522 +0,0 @@ -#!/usr/bin/env python3 -""" -Train a byte-level BPE tokenizer on IPA strings from Lhotse cuts_with_ipa shards. - -This script: -1. Reads IPA strings from cuts_with_ipa directories (output of add_ipa_to_lhotse_shards.py) -2. Optionally balances data across languages (samples equal amounts from each) -3. Trains a HuggingFace ByteLevelBPETokenizer on all extracted IPA strings -4. 
Saves vocab.json and merges.txt to the specified output directory - -Features: -- Language balancing: uses the same number of samples from each language -- Configurable max samples per language - -Usage: - python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --vocab_size 1024 - python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --train_langs en,de --vocab_size 2048 - python train_ipa_bpe_tokenizer.py --output_dir /path/to/output --train_langs all --max_samples_per_lang 50000 - -The trained tokenizer can be loaded using the IPABPETokenizer class in: - nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py -""" - -from __future__ import annotations - -import argparse -import gzip -import json -import os -import random -import sys -from pathlib import Path -from typing import Dict, Generator, List, Optional, Tuple - -from tokenizers import Tokenizer -from tokenizers.decoders import ByteLevel as ByteLevelDecoder -from tokenizers.models import BPE -from tokenizers.pre_tokenizers import ByteLevel -from tokenizers.trainers import BpeTrainer - -# ------------------------- -# USER CONFIG - Same structure as add_ipa_to_lhotse_shards.py -# ------------------------- - -# Default config file path (same directory as this script) -DEFAULT_CONFIG_PATH = Path(__file__).parent / "cuts_dirs_config.json" - - -def load_cuts_dirs_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: - """Load CUTS_DIRS_BY_LANG from a JSON config file.""" - if config_path is None: - config_path = DEFAULT_CONFIG_PATH - - if not config_path.exists(): - raise FileNotFoundError(f"Config file not found: {config_path}") - - with open(config_path, "r", encoding="utf-8") as f: - return json.load(f) - - -OUTPUT_SUFFIX = "_with_ipa" # cuts -> cuts_with_ipa -SHARD_GLOB = "cuts.*.jsonl.gz" - - -def get_ipa_dir(cuts_dir: Path) -> Path: - """Convert a cuts directory path to its corresponding cuts_with_ipa path.""" - name = cuts_dir.name - if name == "cuts": - out_name = 
f"cuts{OUTPUT_SUFFIX}" - elif name.endswith("_textContext"): - # Handle cuts_textContext -> cuts_textContext_with_ipa - out_name = f"{name}{OUTPUT_SUFFIX}" - else: - out_name = f"{name}{OUTPUT_SUFFIX}" - return cuts_dir.parent / out_name - - -def iter_shards(ipa_dir: Path) -> List[Path]: - """Get all shard files in a directory.""" - return sorted(ipa_dir.glob(SHARD_GLOB)) - - -def extract_ipa_from_shard(shard_path: Path) -> Generator[str, None, None]: - """ - Extract all IPA strings from a single shard file. - - Yields: - IPA strings from cut["supervisions"][i]["custom"]["ipa"] - """ - with gzip.open(shard_path, "rt", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - cut = json.loads(line) - supervisions = cut.get("supervisions", []) - for sup in supervisions: - custom = sup.get("custom", {}) - ipa = custom.get("ipa") - if ipa and isinstance(ipa, str) and ipa.strip(): - yield ipa.strip() - except json.JSONDecodeError: - continue - - -def extract_ipa_from_dir(ipa_dir: Path) -> Generator[str, None, None]: - """Extract all IPA strings from all shards in a directory.""" - shards = iter_shards(ipa_dir) - for shard in shards: - yield from extract_ipa_from_shard(shard) - - -def get_available_languages(cuts_dirs: Dict[str, List[str]]) -> List[str]: - """Return list of all available language codes.""" - return list(cuts_dirs.keys()) - - -def collect_ipa_strings( - cuts_dirs: Dict[str, List[str]], - lang: Optional[str] = None, -) -> Generator[str, None, None]: - """ - Collect all IPA strings from the specified language(s). - - Args: - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - lang: Language code or None for all languages. - - Yields: - IPA strings - """ - if lang is None or lang == "all": - langs_to_process = list(cuts_dirs.keys()) - else: - if lang not in cuts_dirs: - raise ValueError(f"Unknown language: {lang}. 
Available: {get_available_languages(cuts_dirs)}") - langs_to_process = [lang] - - for lang_code in langs_to_process: - print(f"[INFO] Processing language: {lang_code}") - for cuts_dir_str in cuts_dirs[lang_code]: - cuts_dir = Path(cuts_dir_str) - ipa_dir = get_ipa_dir(cuts_dir) - - if not ipa_dir.exists(): - print(f"[WARN] IPA directory does not exist: {ipa_dir}", file=sys.stderr) - continue - - print(f"[INFO] Reading from: {ipa_dir}") - count = 0 - for ipa in extract_ipa_from_dir(ipa_dir): - yield ipa - count += 1 - print(f"[INFO] Extracted {count} IPA strings from {ipa_dir}") - - -def iter_ipa_strings_for_lang( - lang: str, - cuts_dirs: Dict[str, List[str]], -) -> Generator[str, None, None]: - """Iterate over all IPA strings for a single language (memory-efficient).""" - if lang not in cuts_dirs: - return - - for cuts_dir_str in cuts_dirs[lang]: - cuts_dir = Path(cuts_dir_str) - ipa_dir = get_ipa_dir(cuts_dir) - - if not ipa_dir.exists(): - continue - - for ipa in extract_ipa_from_dir(ipa_dir): - yield ipa - - -def count_ipa_strings_for_lang(lang: str, cuts_dirs: Dict[str, List[str]], max_count: int = 100000) -> int: - """Count IPA strings for a language without loading into memory.""" - count = 0 - for _ in iter_ipa_strings_for_lang(lang, cuts_dirs): - count += 1 - if count >= max_count: - break - return count - - -def simple_sample_ipa_strings( - lang: str, - cuts_dirs: Dict[str, List[str]], - k: int, - max_collect: int = 100000, - seed: int = 42, -) -> List[str]: - """ - Simple sampling: collect up to max_collect IPA strings, then randomly sample k. - - This avoids reading through all data like reservoir sampling does. 
- - Args: - lang: Language code - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - k: Number of samples to select - max_collect: Maximum number of strings to collect before sampling - seed: Random seed for reproducibility - - Returns: - List of up to k sampled IPA strings - """ - rng = random.Random(seed) - collected: List[str] = [] - - for ipa in iter_ipa_strings_for_lang(lang, cuts_dirs): - collected.append(ipa) - if len(collected) >= max_collect: - break - - # If we have fewer than k, return all - if len(collected) <= k: - return collected - - # Otherwise, randomly sample k - return rng.sample(collected, k) - - -def parse_langs_arg(arg: str, available_langs: List[str]) -> List[str]: - """Parse a language argument (comma-separated or 'all').""" - if arg == "all": - return available_langs - langs = [l.strip() for l in arg.split(",") if l.strip()] - for lang in langs: - if lang not in available_langs: - raise ValueError(f"Unknown language: {lang}. Available: {available_langs}") - return langs - - -def create_balanced_corpus( - train_langs: List[str], - cuts_dirs: Dict[str, List[str]], - output_file: str, - max_samples_per_lang: Optional[int] = None, - max_count_per_lang: int = 100000, - seed: int = 42, -) -> Tuple[str, Dict[str, int]]: - """ - Create a balanced IPA corpus file with equal samples from each language. - - Uses a memory-efficient two-pass approach: - 1. First pass: Count sentences per language (up to max_count_per_lang) - 2. 
Second pass: Use simple sampling to select samples - - Args: - train_langs: List of language codes to include - cuts_dirs: Dictionary mapping language codes to lists of cuts directories - output_file: Path to write the balanced corpus - max_samples_per_lang: Optional cap on samples per language - max_count_per_lang: Max count per language when counting IPA strings - seed: Random seed for reproducibility - - Returns: - Tuple of (corpus_file_path, dict of lang -> actual_count) - """ - # First pass: Count sentences per language - print("[INFO] Pass 1: Counting IPA strings per language...") - lang_counts: Dict[str, int] = {} - - for lang in train_langs: - if lang not in cuts_dirs: - print(f"[WARN] Language {lang} not in config, skipping") - continue - print(f"[INFO] Counting {lang}...", end=" ", flush=True) - count = count_ipa_strings_for_lang(lang, cuts_dirs, max_count_per_lang) - lang_counts[lang] = count - print(f"{count} IPA strings") - - if not lang_counts: - raise ValueError("No IPA strings found for any language") - - # Find minimum count across languages - min_count = min(lang_counts.values()) - print(f"[INFO] Minimum count across languages: {min_count}") - - # Apply max_samples_per_lang cap if specified - samples_per_lang = min_count - if max_samples_per_lang is not None and max_samples_per_lang < min_count: - samples_per_lang = max_samples_per_lang - print(f"[INFO] Using max_samples_per_lang cap: {samples_per_lang}") - - # Second pass: Sample from each language using simple sampling - print(f"[INFO] Pass 2: Sampling {samples_per_lang} strings per language...") - actual_counts: Dict[str, int] = {} - total_written = 0 - - with open(output_file, "w", encoding="utf-8") as f: - for lang in lang_counts.keys(): - print(f"[INFO] Sampling from {lang}...", end=" ", flush=True) - # Use different seed per language for variety, but reproducible - lang_seed = seed + hash(lang) % 10000 - sampled = simple_sample_ipa_strings(lang, cuts_dirs, samples_per_lang, 
max_count_per_lang, lang_seed) - - for ipa in sampled: - f.write(ipa + "\n") - total_written += 1 - - actual_counts[lang] = len(sampled) - print(f"sampled {len(sampled)} strings") - - print(f"[INFO] Total IPA strings written to corpus: {total_written}") - print(f"[INFO] Balanced corpus saved to: {output_file}") - - return output_file, actual_counts - - -def train_bpe_tokenizer( - corpus_file: str, - vocab_size: int = 1024, - min_frequency: int = 2, - output_dir: str = "./ipa_bpe_tokenizer", -) -> Tokenizer: - """ - Train a byte-level BPE tokenizer on IPA strings from a corpus file. - - Args: - corpus_file: Path to the IPA corpus file (one IPA string per line) - vocab_size: Target vocabulary size - min_frequency: Minimum frequency for a token to be included - output_dir: Directory to save the tokenizer files - - Returns: - Trained Tokenizer object - """ - # Create output directory - os.makedirs(output_dir, exist_ok=True) - - # Check if tokenizer already exists - tokenizer_path = os.path.join(output_dir, "tokenizer.json") - if os.path.exists(tokenizer_path): - print(f"[INFO] Loading existing tokenizer from {tokenizer_path}") - return Tokenizer.from_file(tokenizer_path) - - # Count lines in corpus - with open(corpus_file, "r", encoding="utf-8") as f: - total_count = sum(1 for _ in f) - print(f"[INFO] Corpus contains {total_count} IPA strings") - - if total_count == 0: - raise ValueError("Corpus file is empty. 
Make sure the cuts_with_ipa directories exist.") - - # Initialize a byte-level BPE tokenizer - tokenizer = Tokenizer(BPE(unk_token="")) - - # Use byte-level pre-tokenization (like GPT-2) - tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) - - # Add byte-level decoder to properly convert back to original text - tokenizer.decoder = ByteLevelDecoder() - - # Define special tokens - special_tokens = ["", "", ""] - - # Create trainer - trainer = BpeTrainer( - vocab_size=vocab_size, - min_frequency=min_frequency, - special_tokens=special_tokens, - show_progress=True, - ) - - # Train the tokenizer - print(f"[INFO] Training BPE tokenizer with vocab_size={vocab_size}, min_frequency={min_frequency}") - tokenizer.train(files=[corpus_file], trainer=trainer) - - # Save the tokenizer - vocab_path = os.path.join(output_dir, "vocab.json") - merges_path = os.path.join(output_dir, "merges.txt") - - # Save using the tokenizer's model save method - tokenizer.model.save(output_dir) - - # Also save the full tokenizer for easy loading - tokenizer.save(tokenizer_path) - - print(f"[INFO] Tokenizer saved to: {output_dir}") - print(f"[INFO] - vocab.json: {vocab_path}") - print(f"[INFO] - merges.txt: {merges_path}") - print(f"[INFO] - tokenizer.json: {tokenizer_path}") - print(f"[INFO] Vocabulary size: {tokenizer.get_vocab_size()}") - - return tokenizer - - -def main(): - parser = argparse.ArgumentParser( - description="Train a byte-level BPE tokenizer on IPA strings from Lhotse cuts_with_ipa shards." 
- ) - parser.add_argument( - "--output_dir", - type=str, - required=True, - help="Directory to save the trained tokenizer files (vocab.json, merges.txt, tokenizer.json)", - ) - parser.add_argument( - "--vocab_size", - type=int, - default=1024, - help="Vocabulary size for the BPE tokenizer (default: 1024)", - ) - parser.add_argument( - "--min_frequency", - type=int, - default=2, - help="Minimum frequency for a token to be included in vocabulary (default: 2)", - ) - parser.add_argument( - "--train_langs", - type=str, - default="all", - help="Comma-separated language codes for training, or 'all' (default: all)", - ) - parser.add_argument( - "--max_samples_per_lang", - type=int, - default=None, - help="Optional cap on samples per language (default: use min count across langs for balance)", - ) - parser.add_argument( - "--seed", - type=int, - default=42, - help="Random seed for sampling (default: 42)", - ) - parser.add_argument( - "--config", - type=str, - default=None, - help=f"Path to JSON config file with cuts directories. 
Default: {DEFAULT_CONFIG_PATH}", - ) - parser.add_argument( - "--max_count_per_lang", - type=int, - default=100000, - help="Max count per language when counting IPA strings (default: 100000)", - ) - args = parser.parse_args() - - # Load config - config_path = Path(args.config) if args.config else None - cuts_dirs = load_cuts_dirs_config(config_path) - available_langs = get_available_languages(cuts_dirs) - - # Parse train_langs - try: - train_langs = parse_langs_arg(args.train_langs, available_langs) - except ValueError as e: - print(f"[ERROR] {e}") - sys.exit(1) - - print(f"[INFO] Training IPA BPE tokenizer") - print(f"[INFO] Output directory: {args.output_dir}") - print(f"[INFO] Vocabulary size: {args.vocab_size}") - print(f"[INFO] Min frequency: {args.min_frequency}") - print(f"[INFO] Training languages: {train_langs}") - print(f"[INFO] Max samples per lang: {args.max_samples_per_lang or 'auto (min across langs)'}") - print(f"[INFO] Max count per lang: {args.max_count_per_lang}") - print(f"[INFO] Available languages: {available_langs}") - - os.makedirs(args.output_dir, exist_ok=True) - - # Step 1: Create balanced corpus - print("\n" + "=" * 60) - print("STEP 1: Creating balanced IPA corpus") - print("=" * 60) - - corpus_file = os.path.join(args.output_dir, "ipa_corpus_balanced.txt") - - if os.path.exists(corpus_file): - print(f"[INFO] Using existing corpus file: {corpus_file}") - with open(corpus_file, "r", encoding="utf-8") as f: - line_count = sum(1 for _ in f) - print(f"[INFO] Corpus contains {line_count} IPA strings") - else: - corpus_file, lang_counts = create_balanced_corpus( - train_langs=train_langs, - cuts_dirs=cuts_dirs, - output_file=corpus_file, - max_samples_per_lang=args.max_samples_per_lang, - max_count_per_lang=args.max_count_per_lang, - seed=args.seed, - ) - - # Step 2: Train tokenizer - print("\n" + "=" * 60) - print("STEP 2: Training BPE tokenizer") - print("=" * 60) - - tokenizer = train_bpe_tokenizer( - corpus_file=corpus_file, - 
vocab_size=args.vocab_size, - min_frequency=args.min_frequency, - output_dir=args.output_dir, - ) - - # Test the tokenizer - print("\n[INFO] Testing tokenizer with sample IPA strings:") - test_strings = [ - "həˈloʊ wɜːld", # hello world - "ˈaɪ pʰiː eɪ", # IPA - "ˈtɛstɪŋ wʌn tuː θriː", # testing one two three - ] - for test_str in test_strings: - encoded = tokenizer.encode(test_str) - decoded = tokenizer.decode(encoded.ids) - print(f" Input: '{test_str}'") - print(f" Tokens: {encoded.tokens}") - print(f" IDs: {encoded.ids}") - print(f" Decoded: '{decoded}'") - print() - - print("[INFO] Done!") - - -if __name__ == "__main__": - main() diff --git a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json deleted file mode 100644 index 6d7e35116405..000000000000 --- a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json +++ /dev/null @@ -1,9954 +0,0 @@ -{ - "version": "1.0", - "truncation": null, - "padding": null, - "added_tokens": [ - { - "id": 0, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 1, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 2, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - } - ], - "normalizer": null, - "pre_tokenizer": { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": true, - "use_regex": true - }, - "post_processor": null, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": true, - "use_regex": true - }, - "model": { - "type": "BPE", - "dropout": null, - "unk_token": "", - "continuing_subword_prefix": null, - "end_of_word_suffix": null, - "fuse_unk": false, - "byte_fallback": false, - "ignore_merges": false, - "vocab": 
{ - "": 0, - "": 1, - "": 2, - "(": 3, - ")": 4, - "-": 5, - ".": 6, - "1": 7, - "2": 8, - "4": 9, - "5": 10, - "6": 11, - "7": 12, - "F": 13, - "a": 14, - "b": 15, - "c": 16, - "d": 17, - "e": 18, - "f": 19, - "h": 20, - "i": 21, - "j": 22, - "k": 23, - "l": 24, - "m": 25, - "n": 26, - "o": 27, - "p": 28, - "q": 29, - "r": 30, - "s": 31, - "t": 32, - "u": 33, - "v": 34, - "w": 35, - "x": 36, - "y": 37, - "z": 38, - "¡": 39, - "£": 40, - "¦": 41, - "§": 42, - "©": 43, - "ª": 44, - "¬": 45, - "°": 46, - "²": 47, - "³": 48, - "¸": 49, - "¹": 50, - "¾": 51, - "Ã": 52, - "Å": 53, - "É": 54, - "Ê": 55, - "Ë": 56, - "Ì": 57, - "Î": 58, - "Ï": 59, - "Ċ": 60, - "Ġ": 61, - "Ģ": 62, - "ģ": 63, - "Ĥ": 64, - "ĥ": 65, - "ĩ": 66, - "Ī": 67, - "Ĭ": 68, - "ĭ": 69, - "Į": 70, - "į": 71, - "İ": 72, - "ı": 73, - "IJ": 74, - "ij": 75, - "Ĵ": 76, - "ĵ": 77, - "Ķ": 78, - "ķ": 79, - "ĸ": 80, - "Ĺ": 81, - "Ļ": 82, - "Ľ": 83, - "ľ": 84, - "Ŀ": 85, - "Ł": 86, - "ËĪ": 87, - "ËIJ": 88, - "ËĪÉ": 89, - "ËĮ": 90, - "ÉĻ": 91, - "ËĪa": 92, - "ËĪi": 93, - "Ġt": 94, - "ɪ": 95, - "ɾ": 96, - "ĠÉ": 97, - "Ġk": 98, - "Éľ": 99, - "Ġs": 100, - "ËĪe": 101, - "ÉĽ": 102, - "ËĪo": 103, - "Ġl": 104, - "ËĪÉĽ": 105, - "Ġd": 106, - "ÊĬ": 107, - "ËĪaËIJ": 108, - "Ġp": 109, - "Ìĥ": 110, - "Ġm": 111, - "ËĪu": 112, - "Åĭ": 113, - "ð": 114, - "ËĪÉĶ": 115, - "ÊĮ": 116, - "ËĮa": 117, - "Ġh": 118, - "ËĪÊĮ": 119, - "Ġn": 120, - "Êģ": 121, - "ËĪÉij": 122, - "Êĥ": 123, - "eËIJ": 124, - "Ġa": 125, - "Ġb": 126, - "ÉĶ": 127, - "ËĪÉĻ": 128, - "ÉĻn": 129, - "Ġf": 130, - "ËĪɪ": 131, - "É¡": 132, - "ËĪeËIJ": 133, - "Ġj": 134, - "nt": 135, - "Ġð": 136, - "ĠËĮ": 137, - "Ġts": 138, - "ĠÉ¡": 139, - "Éķ": 140, - "ËĪoËIJ": 141, - "ʰ": 142, - "aËIJ": 143, - "ËĪy": 144, - "ĠtÉķ": 145, - "ËĪiËIJ": 146, - "ĠÊ": 147, - "Ġv": 148, - "Ġw": 149, - "st": 150, - "Éij": 151, - "nd": 152, - "ËĮi": 153, - "̪": 154, - "ËĮe": 155, - "Ġz": 156, - "ËĪaɪ": 157, - "ËĪiÉĽ": 158, - "β": 159, - "ɹ": 160, - "ĠËĮa": 161, - "θ": 162, - "ĠhÉĽ": 163, - "ÊĪ": 164, 
- "iËIJ": 165, - "ËĮo": 166, - "Ġɪ": 167, - "Éľn": 168, - "Ġx": 169, - "ĠtÉĻ": 170, - "ËĪuËIJ": 171, - "ËĮÉĻ": 172, - "ĠjËĪi": 173, - "ËĮÉĽ": 174, - "ĠÉĽ": 175, - "ĠËĪa": 176, - "ËĮaËIJ": 177, - "Ġla": 178, - "Ġðe": 179, - "ĠhÉĽËIJ": 180, - "Ġe": 181, - "ç": 182, - "ÉĻl": 183, - "oËIJ": 184, - "ËĪÉiju": 185, - "ÊĴ": 186, - "uËIJ": 187, - "ĠÉĹ": 188, - "ĠÉķ": 189, - "ËĮeËIJ": 190, - "ĠtÉķËĪi": 191, - "os": 192, - "ËĪÉĶËIJ": 193, - "as": 194, - "ËĪÊĬ": 195, - "Ġi": 196, - "ËĪai": 197, - "ɲ": 198, - "ɪn": 199, - "ts": 200, - "ÉľÅĭ": 201, - "ĠÉŁ": 202, - "ĠÊĥ": 203, - "ËĪeɪ": 204, - "ÉĽÉ¾": 205, - "ËĪÉĽËIJ": 206, - "ËĪÉĽÉ¾": 207, - "Ġr": 208, - "tÊĥ": 209, - "ËĮÉĶ": 210, - "ĠdÉĻ": 211, - "tÉĻ": 212, - "ou": 213, - "ËĪyÉĻ": 214, - "ĠËĮi": 215, - "ÉĻɾ": 216, - "ËĪÉĻÊĬ": 217, - "ËĪÊĮɾ": 218, - "ËĪÉĴ": 219, - "Ġth": 220, - "ËĪon": 221, - "Êĭ": 222, - "ËĪÉijËIJ": 223, - "ËĪÊĮh": 224, - "wËĪa": 225, - "ËĪei": 226, - "ll": 227, - "ĠÉIJ": 228, - "ÉijËIJ": 229, - "an": 230, - "ÉŁ": 231, - "ĠÊĭ": 232, - "Ġko": 233, - "kh": 234, - "ɪÅĭ": 235, - "ËĪaËIJɪ": 236, - "ĠtÊĥ": 237, - "ËĪaËIJt": 238, - "ĠËĮe": 239, - "ĠtÉķh": 240, - "ËĪuo": 241, - "ËĪonÉ¡": 242, - "Éĸ": 243, - "at": 244, - "Ġke": 245, - "ÉĴ": 246, - "ĠÉķËĪi": 247, - "ø": 248, - "ĠÉij": 249, - "ËĪeËIJk": 250, - "Åĵ": 251, - "re": 252, - "Ġɾ": 253, - "ĠkÉĶ": 254, - "ËĮÊĬ": 255, - "sk": 256, - "ĠÊĬ": 257, - "Ġand": 258, - "ɪç": 259, - "Ġme": 260, - "ËĪaɾ": 261, - "ĠËĪɪ": 262, - "na": 263, - "Ġβ": 264, - "ĠlËĪi": 265, - "jaËIJ": 266, - "li": 267, - "no": 268, - "Ġɪn": 269, - "ĠdËĮi": 270, - "Ġɲ": 271, - "tËIJ": 272, - "ÉĻm": 273, - "ĠlÉĻ": 274, - "ĠðÉĻ": 275, - "ɪk": 276, - "ËĪÉĽl": 277, - "Éľt": 278, - "Ġse": 279, - "es": 280, - "ËĪou": 281, - "ËĪaÊĬ": 282, - "ĠÉĶ": 283, - "ɪt": 284, - "ĠÅĭ": 285, - "ËĪÉĽn": 286, - "Êİ": 287, - "Ġkh": 288, - "ËĪÉĽnt": 289, - "ËĪaËIJɾ": 290, - "Ġki": 291, - "mp": 292, - "lt": 293, - "É£": 294, - "Ġpa": 295, - "ËĪÉĻËIJ": 296, - "ɪs": 297, - "ĠÉĴ": 298, - "Ġle": 299, - "ÉªÉľ": 300, - "ËĪÉĽt": 
301, - "Ġde": 302, - "Ġɹ": 303, - "ĠtËĪoËIJ": 304, - "ĠÊģ": 305, - "ÊĥÉĻn": 306, - "ĠÊĬnt": 307, - "ËĪÉĶɾ": 308, - "ËĪað": 309, - "Ġaɪ": 310, - "ĠÊIJ": 311, - "ĠmËĪa": 312, - "ra": 313, - "ĠkËĪɪ": 314, - "kt": 315, - "ËIJp": 316, - "ĠÊĪ": 317, - "ËĪaËIJÊĬ": 318, - "ĠkËĪÊĮɾ": 319, - "ĠËĪÊĮ": 320, - "ĠÉĴv": 321, - "Ġel": 322, - "ks": 323, - "Ġkw": 324, - "ÉĻt": 325, - "ndo": 326, - "ei": 327, - "ĠËĮaËIJp": 328, - "se": 329, - "ÉĻɹ": 330, - "ËĪuei": 331, - "ÉĻs": 332, - "ĠkËĮo": 333, - "ĠÊĤ": 334, - "ĠËĮÊĬ": 335, - "Ġc": 336, - "ĠÉĽn": 337, - "ËĪant": 338, - "θj": 339, - "ËĮoËIJ": 340, - "ĠËĪaËIJ": 341, - "Ġpɾ": 342, - "si": 343, - "ĠËĪe": 344, - "ĠjuËIJ": 345, - "ĠkËĮe": 346, - "ËĮɪ": 347, - "ÉĶn": 348, - "ĠsËĪÊĮ": 349, - "ĠËĪu": 350, - "ni": 351, - "Ġst": 352, - "ĠdiËIJ": 353, - "ĠkeËIJ": 354, - "ĠjËĪiou": 355, - "ËĪaiÉľ": 356, - "ĠdÊĴ": 357, - "ĠËĪÉĶ": 358, - "va": 359, - "ËIJɾ": 360, - "ËĪø": 361, - "ËĮÉĻÊĬ": 362, - "ĠpËĪu": 363, - "Ġsu": 364, - "Ġma": 365, - "ĠÉĻ": 366, - "dÊĴ": 367, - "Ġpʰ": 368, - "le": 369, - "in": 370, - "ĠtÉķhËĪi": 371, - "ĠwËĪo": 372, - "ro": 373, - "ËĮy": 374, - "ɾa": 375, - "ĠsËĪi": 376, - "ðÉĻ": 377, - "ĠseËIJ": 378, - "la": 379, - "ĠÊĴ": 380, - "mb": 381, - "ĠhËĪoËIJ": 382, - "Ġbʰ": 383, - "ĠÉĽÉ¾": 384, - "Ġðat": 385, - "sp": 386, - "ÉĶɾ": 387, - "en": 388, - "ĠsÉĻ": 389, - "ËĪÉĶÉľ": 390, - "ĠlËĮa": 391, - "ĠËĮÉĽ": 392, - "ĠËĪy": 393, - "É¡aËIJ": 394, - "ĠdÉĽÉ¾": 395, - "ËĪÉĽÊģ": 396, - "Éľkh": 397, - "ËĪiÉĻ": 398, - "ËĪan": 399, - "ĠmËĪo": 400, - "ËĪaβ": 401, - "Ġal": 402, - "ĠËĪeËIJ": 403, - "Ġθ": 404, - "ĠnËĪi": 405, - "pʰ": 406, - "lla": 407, - "Ġpl": 408, - "ËĪÅĵ": 409, - "jËĪÉiju": 410, - "Ġav": 411, - "ĠmËĪi": 412, - "ĠfËĪa": 413, - "ËĪÉľ": 414, - "me": 415, - "ËĮÉĻh": 416, - "ËĪuÉĻ": 417, - "it": 418, - "jËĪe": 419, - "Ġo": 420, - "ËĪÉľËIJ": 421, - "ĠtÉķËĪiou": 422, - "ÉĶËIJ": 423, - "ĠnÉĻ": 424, - "ËĪÉĻÉľn": 425, - "ĠmÉĻ": 426, - "ĠdeËIJ": 427, - "mo": 428, - "sa": 429, - "jËĪÉĶ": 430, - "ËĪal": 431, - "ĠtÉķËĪiÉĽ": 432, - 
"ĠÉ¡ÉĻ": 433, - "ða": 434, - "Ġɪz": 435, - "Ġsa": 436, - "ri": 437, - "ĠËĮil": 438, - "ËĮu": 439, - "ĠkaËIJ": 440, - "ĠÉĻËIJ": 441, - "ĠÉĸ": 442, - "Ġka": 443, - "ËĪÊĮhi": 444, - "ĠjeËIJ": 445, - "Ġtʰ": 446, - "ne": 447, - "kËIJ": 448, - "ĠtsËĪai": 449, - "ĠËĪeËIJk": 450, - "nk": 451, - "ti": 452, - "ËĪaÉľn": 453, - "ĠkËIJ": 454, - "É¡ÉĻn": 455, - "ËĪia": 456, - "ĠÉĶËIJɾ": 457, - "Êı": 458, - "ĠËĮÊĮ": 459, - "ĠzËĪaËIJ": 460, - "Ġlos": 461, - "ÉĽs": 462, - "ËĪÉĶn": 463, - "ÉĽnt": 464, - "ÉĽn": 465, - "ĠÉŁËĪoËIJ": 466, - "çt": 467, - "Ġdas": 468, - "ĠxËĮo": 469, - "ËĪuÉľ": 470, - "ËĪas": 471, - "ĠbËĪÊĮ": 472, - "ËĪiÉĽÉľn": 473, - "ÉIJ": 474, - "ĠtsuËIJ": 475, - "ĠpËĮÉĽ": 476, - "ĠnËĪÉĶ": 477, - "ÊĬt": 478, - "ma": 479, - "ĠnËĪo": 480, - "ĠlËĪɪ": 481, - "ËĪÉĽs": 482, - "ɪl": 483, - "ĠÉķËĪiÉĽ": 484, - "ĠËĪÊĬ": 485, - "ÉĴt": 486, - "to": 487, - "ĠËĪo": 488, - "ËĮon": 489, - "ĠkwËĪa": 490, - "Ġɪt": 491, - "ĠhoËIJ": 492, - "ËĪiËIJk": 493, - "ĠËĮaËIJpk": 494, - "ËĪaɪn": 495, - "æ": 496, - "ÉĻnt": 497, - "ta": 498, - "lo": 499, - "ĠnËĪÉij": 500, - "ĠlËĪa": 501, - "ËĪiÉľ": 502, - "ĠwËĪei": 503, - "ÉĽÊģ": 504, - "ĠtËĪa": 505, - "ĠɾËĮÉĻh": 506, - "ĠÉķËĪiÉij": 507, - "ËĮiËIJ": 508, - "ËĮÉĽl": 509, - "ĠtÉĻÉľ": 510, - "ĠkËĪuo": 511, - "ĠtËĪu": 512, - "jËĪÉĽ": 513, - "ĠËĮin": 514, - "ɾe": 515, - "ĠkoËIJ": 516, - "ĠkËĪa": 517, - "ɾi": 518, - "ĠtÉķËĪiÉij": 519, - "lÉĻ": 520, - "ĠkÉĻ": 521, - "ĠtËĪi": 522, - "ĠÅĭËĪyÉĻ": 523, - "Ġtsh": 524, - "er": 525, - "av": 526, - "ĠkÉĶn": 527, - "ËĪÉĻÉľÅĭ": 528, - "ðo": 529, - "ËĪaËIJn": 530, - "ĠbʰËĪi": 531, - "ĠkËIJjaËIJ": 532, - "ÉĻz": 533, - "ĠpÊģ": 534, - "ĠdËĪɪ": 535, - "ĠziËIJ": 536, - "É¡eËIJ": 537, - "ĠtËĪÉĻ": 538, - "ɪz": 539, - "ĠnËĮon": 540, - "taËIJ": 541, - "bl": 542, - "te": 543, - "nËĮeËIJ": 544, - "ËĪɪl": 545, - "so": 546, - "ko": 547, - "uÊģ": 548, - "ĠÉ£": 549, - "ĠpaÊģ": 550, - "ĠËĪÉĽ": 551, - "jËĪuËIJ": 552, - "ËĮÊĮ": 553, - "yn": 554, - "ËĪiËIJn": 555, - "ĠlËĪaɪ": 556, - "ËĪɪÅĭ": 557, - "ĠtÉķhËĪy": 558, - "ĠnËĪÊĮhi": 559, - 
"ĠdËĮe": 560, - "ĠjËĪÉiju": 561, - "ĠtËĪÉiju": 562, - "ĠhËĪo": 563, - "ɪd": 564, - "ĠthËĪÉij": 565, - "mËĪe": 566, - "ĠËĪÉĻ": 567, - "ja": 568, - "Ġph": 569, - "ÉĽt": 570, - "ĠkËĪÊĮ": 571, - "tÉĻn": 572, - "mËĪÉij": 573, - "wËĪe": 574, - "ĠËĮaɪn": 575, - "Ġðɪs": 576, - "É¡ÉĻ": 577, - "ĠnËĪaËIJ": 578, - "ĠbËĪaËIJ": 579, - "Ġaθ": 580, - "ĠmËĮa": 581, - "ËĪÊĮha": 582, - "ĠdËĮa": 583, - "ËĪÊı": 584, - "ĠɲËĮy": 585, - "ĠpËĪa": 586, - "ËĪaðo": 587, - "di": 588, - "bÉľ": 589, - "ɳ": 590, - "ĠwiËIJ": 591, - "ĠnËĪɪ": 592, - "ĠÉ¡ËĪÉĶÉľ": 593, - "tËIJo": 594, - "ËĮÉĻm": 595, - "ËĪaËIJr": 596, - "ĠmÉĽ": 597, - "ËĪeËIJÉ¡aËIJ": 598, - "ĠsËĮi": 599, - "ĠlËĮaËIJ": 600, - "nËĮaËIJ": 601, - "Ġsp": 602, - "tÊģ": 603, - "ĠÊİ": 604, - "ËĮÉijËIJ": 605, - "Ġkl": 606, - "kʰ": 607, - "il": 608, - "ĠÊĥt": 609, - "ĠËĮÊĬn": 610, - "al": 611, - "ĠsËĪÉĽ": 612, - "ĠmËĪaËIJ": 613, - "ĠÅĵ": 614, - "ĠÉ¡ËĪÊĮ": 615, - "ĠpËĮÉĽr": 616, - "ɾËĪa": 617, - "ËIJÊĪ": 618, - "ËĪaβa": 619, - "ĠwËĪÉĴ": 620, - "ĠxËĪuei": 621, - "ĠkhËĪo": 622, - "Ġlas": 623, - "ĠÉĹËĪo": 624, - "ĠfÉĽÉ¾": 625, - "ĠjËĪiÉĽ": 626, - "ĠtËĪe": 627, - "ĠkËĮÉĶ": 628, - "ĠdeËIJn": 629, - "Ġmo": 630, - "ĠpËĪi": 631, - "ĠtËĪÉij": 632, - "ËĪÉĽst": 633, - "wËĪÉij": 634, - "ËĪaɪt": 635, - "ÉĻÊĬ": 636, - "ĠËĪi": 637, - "ɪj": 638, - "aɪ": 639, - "ËĪaËIJÉľ": 640, - "ĠËĪɪs": 641, - "ĠpÉĶɾ": 642, - "Ã¦Éľn": 643, - "ka": 644, - "ÅĭÉ¡": 645, - "bÉĻn": 646, - "ÊĬf": 647, - "Ġpɹ": 648, - "ĠlËĮe": 649, - "ËĪiËIJd": 650, - "ËĪaËIJre": 651, - "ĠmËĪÊĮ": 652, - "ÉĻr": 653, - "ĠdÉij": 654, - "ËĪaËIJto": 655, - "ĠpËĪeËIJ": 656, - "ĠdËĪoËIJ": 657, - "ĠsËĮÊĬ": 658, - "ĠhËĪi": 659, - "ĠsËĪa": 660, - "ËĪeËIJn": 661, - "dÉĻ": 662, - "Ġpj": 663, - "ËĪÅĵÊģ": 664, - "lɪç": 665, - "ÉĴn": 666, - "ĠËĪÉĻr": 667, - "tËĪe": 668, - "Ġil": 669, - "ËĪaËIJl": 670, - "ĠsËĮÉĻÊĬ": 671, - "sÊĪ": 672, - "ĠdËĪuËIJ": 673, - "hËĪÉij": 674, - "ĠxËĪou": 675, - "ĠlËĪaiÉľ": 676, - "wËĪo": 677, - "ËĪÉĽnte": 678, - "Ġsy": 679, - "Ġzɪç": 680, - "ĠÉ¡ËĪu": 681, - "ĠÉķËĪy": 682, - "ËĪÉĶËIJl": 
683, - "ÉĶl": 684, - "ĠtËĪo": 685, - "ĠÊĭoËIJ": 686, - "ĠiËIJ": 687, - "wËĪaða": 688, - "ËĪando": 689, - "Ġaθɼnt": 690, - "ĠaθɼntwËĪaða": 691, - "ĠtËĪiÉĽ": 692, - "ËĪeiÉľ": 693, - "ĠpËĮa": 694, - "ĠnËĪaɪ": 695, - "wa": 696, - "Ġfr": 697, - "ĠÊIJËĪÉĻÉľn": 698, - "ËĪua": 699, - "mi": 700, - "ĠmËĪÉĽ": 701, - "ËĪeËIJkʰ": 702, - "cʰ": 703, - "ĠwËĪÉij": 704, - "sta": 705, - "Ġtu": 706, - "Ġsk": 707, - "ËĪÉĶl": 708, - "ËĪeËIJÊĪ": 709, - "ĠlËĪaËIJɪ": 710, - "ĠlËĪaËIJ": 711, - "ËĪÉĽËIJs": 712, - "ËĪÉĽÉ¾a": 713, - "ËĪÉĻÉľt": 714, - "Ġyn": 715, - "dÉĻn": 716, - "Ġdi": 717, - "ËĪiËIJs": 718, - "Ġðel": 719, - "ËĪÊĮr": 720, - "ĠhËĪaËIJ": 721, - "ĠbÉĻ": 722, - "ĠjËĪuËIJ": 723, - "lle": 724, - "sto": 725, - "ËĪɪt": 726, - "ËĪoËIJɾ": 727, - "bʰ": 728, - "mÉĻn": 729, - "ËĮuÉĻ": 730, - "ËĮÉĻɾ": 731, - "ËĪÊĮn": 732, - "ĠlËĪaɪk": 733, - "ĠbËĪa": 734, - "ɪð": 735, - "Ġlo": 736, - "zi": 737, - "ËĪÊĮst": 738, - "mËĪi": 739, - "ÉĶÊģ": 740, - "ĠnËĪɪçt": 741, - "Ġtɾ": 742, - "ĠdËĪeËIJkʰ": 743, - "ĠsËĮe": 744, - "ĠnËĪÉĻÊĬ": 745, - "Ġu": 746, - "Ġsi": 747, - "Ġɪç": 748, - "Ġpr": 749, - "ĠtÉķËĪy": 750, - "ĠmËĪu": 751, - "za": 752, - "ĠtÊģ": 753, - "Ġwɪð": 754, - "tËĪÉĽ": 755, - "ĠpËĪÊĮɾ": 756, - "ĠkËĪÉĶ": 757, - "ËĪoËIJr": 758, - "ĠhËĮa": 759, - "ĠkËĪonÉ¡": 760, - "ĠpuÊģ": 761, - "Ġdy": 762, - "ËĪɪn": 763, - "nte": 764, - "ĠkËĮa": 765, - "ËĪÉĻɪ": 766, - "Ġmi": 767, - "ĠÉ¡ËĮuÉĻ": 768, - "Ġʲ": 769, - "ĠfËĪÉij": 770, - "ĠvÉijËIJ": 771, - "ĠËĮaÊĬ": 772, - "ËĮuËIJ": 773, - "ĠËĪun": 774, - "ĠjËĪÊĮha": 775, - "juËIJ": 776, - "Ġmɪt": 777, - "ĠlËĪÉĽ": 778, - "ËĪeËIJÊĥ": 779, - "ĠfÉĶËIJ": 780, - "mÉĻ": 781, - "ɾt": 782, - "ĠkËĮon": 783, - "ĠlËĪÉĶ": 784, - "ĠxËĪÉiju": 785, - "pl": 786, - "ĠdËĪi": 787, - "ĠlËĪoËIJ": 788, - "sÉĻ": 789, - "ËĪaËIJva": 790, - "ĠlËĪu": 791, - "ĠÉ¡ËĮÉĻÊĬ": 792, - "Ġhav": 793, - "ĠËĮaËIJpkËĮoËIJ": 794, - "ɾËĪi": 795, - "ĠfËĪÉĻ": 796, - "ĠhËĮÉĻm": 797, - "ËĪonÉ¡Éľ": 798, - "jo": 799, - "ĠsÉĶ": 800, - "ËĪaËIJd": 801, - "wËĪiÉĻ": 802, - "ËĪand": 803, - "ËĮaɪn": 804, - "tɾ": 805, - 
"ĠËĮɪ": 806, - "ĠËĪuna": 807, - "ĠxwËĪÉij": 808, - "ĠjÉĶËIJ": 809, - "ÊģËĪi": 810, - "ĠkËĪuoÉľ": 811, - "Ġaβ": 812, - "ĠÉ¡ËĪaËIJ": 813, - "ano": 814, - "tÉĻl": 815, - "ĠrËĮe": 816, - "ËĮÊĮt": 817, - "ĠjËĪiÉij": 818, - "ĠɾËĮÉĻhaËIJ": 819, - "ĠmËĪe": 820, - "ĠËĪyÃ¦Éľn": 821, - "ĠfËĪu": 822, - "Ġbl": 823, - "nËĪi": 824, - "sÉĻn": 825, - "Ġaɪn": 826, - "ËĪiÊĬ": 827, - "Ġðeɪ": 828, - "Ġɪts": 829, - "Ġ(": 830, - "ËĪyËIJ": 831, - "ÉĻd": 832, - "ĠËĮo": 833, - "ĠÉĽs": 834, - "ĠviËIJ": 835, - "ËIJÉ¡eËIJ": 836, - "kËĪe": 837, - "ĠËĪal": 838, - "ÉĽl": 839, - "ĠÊĮ": 840, - "ËIJo": 841, - "ĠkËĪo": 842, - "ĠÊĪËĪuËIJ": 843, - "ĠsËĪɪ": 844, - "ËĪeËIJɾ": 845, - "Éľm": 846, - "ËĮÉĻn": 847, - "ËĪaËIJi": 848, - "ËĪoËIJl": 849, - "ɪËĮeËIJ": 850, - "ĠʲËĪy": 851, - "ĠkËĪÉĶËIJ": 852, - "sËĪi": 853, - "ĠlËĪe": 854, - "ËĮÉĴt": 855, - "ËĪiËIJp": 856, - "aÊģ": 857, - "ĠθËĪɪÅĭ": 858, - "ËĪÉĻËIJɪ": 859, - "ËĪÊĮl": 860, - "ĠhËĪoËIJtaËIJ": 861, - "ËĪoɪ": 862, - "nto": 863, - "zh": 864, - "ĠdeËIJm": 865, - "ĠkÉĶm": 866, - "ʰËĪiËIJk": 867, - "ĠdÊĴËĪÊĮst": 868, - "pɾ": 869, - "Ġly": 870, - "hËĪu": 871, - "ËĪÉĶø": 872, - "ËĪaËIJs": 873, - "ĠËĪan": 874, - "ĠËĪÉĴ": 875, - "Ġkan": 876, - "ĠtsËĪuo": 877, - "ËĪeËIJva": 878, - "Ġɡɾ": 879, - "Ġpo": 880, - "ĠtÊĥËĪÉĶ": 881, - "Êİa": 882, - "ĠmËĮi": 883, - "Êĥt": 884, - "tËĪi": 885, - "ĠhËĪÊĮ": 886, - "tÊĥe": 887, - "ĠfÉĶn": 888, - "ve": 889, - "ĠnËĮe": 890, - "ËĪÉĶÊģ": 891, - "iz": 892, - "ĠsËĪuo": 893, - "ËĪÉĽËIJr": 894, - "wËĪaÊģ": 895, - "ËĪaða": 896, - "Åĭk": 897, - "po": 898, - "ĠkËĪi": 899, - "ËĪad": 900, - "ĠvËĪi": 901, - "tÉķ": 902, - "ĠkËĪÉĻ": 903, - "ĠwËĪu": 904, - "ÉĴz": 905, - "ĠvÉijËIJɾ": 906, - "ÊģËĪÉĽ": 907, - "ĠkËĪaËIJ": 908, - "ke": 909, - "nÉĻ": 910, - "ËĪÊĮb": 911, - "ËĪuËIJɾ": 912, - "ËĮÉĻËIJ": 913, - "ĠÊĪʰËĪiËIJk": 914, - "ĠkËĪu": 915, - "ĠbËĮÊĮt": 916, - "Ġat": 917, - "Ġfɹ": 918, - "ËĪax": 919, - "ĠzoËIJ": 920, - "ĠtËĪaËIJ": 921, - "ĠðËĮe": 922, - "neËIJ": 923, - "ĠÉijËIJ": 924, - "ĠaÊĬf": 925, - "am": 926, - "ÊĬÅĭ": 927, - "ĠÉĶËIJ": 928, 
- "ĠÉķËĪiÉľÅĭ": 929, - "ĠËĪÉĶËIJl": 930, - "ɪm": 931, - "jËĪo": 932, - "ËĪiËIJÉŁ": 933, - "ĠkwËĮÉĽ": 934, - "ĠmËĪas": 935, - "ÉĻh": 936, - "ĠËĪaÊĬ": 937, - "ËĪÉĶɪ": 938, - "É¡ÉĻɾ": 939, - "rÉĻn": 940, - "ËĪɪk": 941, - "sse": 942, - "ĠpËĪÉij": 943, - "ĠÉĹËĮe": 944, - "ĠÉĹËĪi": 945, - "Ġaz": 946, - "ĠÉ¡ËĪÊĮjaËIJ": 947, - "ze": 948, - "ĠÉĹËĮaËIJ": 949, - "ĠfËĪi": 950, - "ĠËĮÉĴn": 951, - "ĠxËĪo": 952, - "ĠËĮÊĬna": 953, - "ĠtʰaËIJ": 954, - "ĠsÉij": 955, - "ËĪeɪÊĥÉĻn": 956, - "ĠtÉķËĪiÉľ": 957, - "ĠÉŁaËIJ": 958, - "pËIJ": 959, - "Ġply": 960, - "θËĪi": 961, - "ËIJÉĸ": 962, - "ĠtËĪuei": 963, - "ĠlËĪÉĻ": 964, - "ĠdÉijËIJ": 965, - "ft": 966, - "ËĪam": 967, - "ĠsËĪÊĮkt": 968, - "ĠtËĪou": 969, - "ĠpËĪiÉĽ": 970, - "ĠËĪai": 971, - "ĠwËĪÉĴn": 972, - "ĠzËĮaɪn": 973, - "Ġest": 974, - "ĠmÉĶ": 975, - "ĠtÉķjËĪÉiju": 976, - "Éľp": 977, - "ËĪÊĮz": 978, - "bi": 979, - "ËĪÉĽËIJseËIJ": 980, - "ĠlËĪy": 981, - "ĠmËĮe": 982, - "ĠdËĮÉĽl": 983, - "ËĪiËIJl": 984, - "ĠkËĮomo": 985, - "ĠhËĪaÉľn": 986, - "ËĪoËIJne": 987, - "ĠkËĪÊĮɾt": 988, - "ĠsyÊģ": 989, - "ËĮÉĶɾ": 990, - "Ġɪf": 991, - "uv": 992, - "zÉĻn": 993, - "ol": 994, - "Ïĩ": 995, - "im": 996, - "ĠmËĪiÉĽ": 997, - "Ġðɪ": 998, - "ĠvËĪÉĽ": 999, - "ÊĬd": 1000, - "Ġtr": 1001, - "ËĪeËIJs": 1002, - "ðe": 1003, - "de": 1004, - "ʰÏĩ": 1005, - "ÉŁÊ°": 1006, - "ËĮÉĻËIJÉªÉľ": 1007, - "bËIJ": 1008, - "ËĪÊĬk": 1009, - "ĠnËĪÉĶÉªÉľ": 1010, - "ĠËĮiËIJ": 1011, - "ËĪÉijËIJt": 1012, - "ËĪiËIJɾ": 1013, - "Ġtɹ": 1014, - "ɾÉĶ": 1015, - "ĠwÉĴz": 1016, - "Ġvu": 1017, - "bÉĻl": 1018, - "bÉĻ": 1019, - "ɹi": 1020, - "nts": 1021, - "ĠsËĪaËIJ": 1022, - "dʰ": 1023, - "ĠtÊĬ": 1024, - "ĠÊİËĮi": 1025, - "βa": 1026, - "hËĪÉĻÉľÅĭ": 1027, - "ĠsËĪiËIJ": 1028, - "ĠpËĮaɾa": 1029, - "ËĪÉĽÉ¾ÉĶ": 1030, - "ËĪɪs": 1031, - "É£o": 1032, - "ĠËĮal": 1033, - "or": 1034, - "ĠbËĪÊĮh": 1035, - "ĠkËĪoËIJ": 1036, - "ĠtËĪÉĽ": 1037, - "ĠpËĪo": 1038, - "ĠÊĴÉĻ": 1039, - "pÊģ": 1040, - "ĠËĪaɪ": 1041, - "hËĪÉijÉľÅĭ": 1042, - "ÉĻli": 1043, - "ËĪeɪt": 1044, - "ĠjËĪiouÉľ": 1045, - "ĠdËĪÉĻ": 1046, - 
"ĠmËĪÉĶËIJ": 1047, - "lËĪi": 1048, - "ËĮyÉĻ": 1049, - "ĠlËĪoËIJÉ¡": 1050, - "ĠnËĪÊĮ": 1051, - "ĠhËĪÊĬ": 1052, - "ĠnËĪÉĻÉľÅĭ": 1053, - "ĠÊģÉĻ": 1054, - "zËĪi": 1055, - "ĠtËĪuËIJ": 1056, - "ĠkËĮome": 1057, - "ĠlËĪeËIJ": 1058, - "ËĪaËIJtaËIJ": 1059, - "Ġan": 1060, - "ĠËĪyu": 1061, - "ĠËĮÊĮÉ¡ÉĻɾ": 1062, - "ĠËĪɪn": 1063, - "ĠhËĪoÉĻ": 1064, - "vÉĻ": 1065, - "ËĪøËIJ": 1066, - "θja": 1067, - "ËĪuÉĻÉľn": 1068, - "ĠkÉĻɾ": 1069, - "ËĪat": 1070, - "jËĪø": 1071, - "ËĪÉĽtÊģ": 1072, - "ĠpËĪÉiju": 1073, - "stÉĻ": 1074, - "ĠwÉĴt": 1075, - "ËĪeËIJl": 1076, - "ÊĪi": 1077, - "ĠxËĪaiÉľ": 1078, - "ËĪyÊģ": 1079, - "ĠhËĪoËIJÉ¡aËIJ": 1080, - "ĠtsËĪi": 1081, - "ĠËĪÊĮp": 1082, - "ĠnËĮÉĴt": 1083, - "ĠlËĪɪeËIJ": 1084, - "ĠhËĪa": 1085, - "Ġfl": 1086, - "ĠnËĪeËIJ": 1087, - "ËĮaËIJɪ": 1088, - "ĠtËĪuo": 1089, - "tÊĥËIJ": 1090, - "sËĪe": 1091, - "bʰi": 1092, - "ĠbËĪÊĮhÊĬt": 1093, - "ËĪÉĽnd": 1094, - "ĠsËĪÉĶ": 1095, - "ÉĻns": 1096, - "ËĮÉĻl": 1097, - "ÉĽÉľ": 1098, - "ĠÉ¡l": 1099, - "ËĪɪɾ": 1100, - "ËĪaËIJta": 1101, - "ÉľËIJ": 1102, - "ËĪÉĽnto": 1103, - "skËĮoËIJ": 1104, - "ËĪÉĽk": 1105, - "tsi": 1106, - "ĠtËĪonÉ¡": 1107, - "ĠbiËIJ": 1108, - "ĠhËĪaËIJɪ": 1109, - "ĠbËĪi": 1110, - "jj": 1111, - "Êİi": 1112, - "Ġkʰ": 1113, - "ĠsËĪo": 1114, - "llo": 1115, - "Ġbaɪ": 1116, - "ĠÉĽnt": 1117, - "ĠËĪiËIJ": 1118, - "ĠÉ¡ËĪo": 1119, - "ɾeËIJ": 1120, - "ĠkÊĭ": 1121, - "ĠmËĪeiÉľ": 1122, - "ÊĬËĪÉĶËIJ": 1123, - "ĠtËĪaɪ": 1124, - "Ġsus": 1125, - "Ġri": 1126, - "ĠvËĮÉĽ": 1127, - "ËĪiËIJno": 1128, - "vano": 1129, - "ĠdËĮiËIJ": 1130, - "ĠÊIJËĪaÉľn": 1131, - "ÊĤ": 1132, - "ĠÉIJb": 1133, - "ËĪaËIJh": 1134, - "ɪÊĥ": 1135, - "ĠdËĮella": 1136, - "tËIJi": 1137, - "ĠËĪÊĬn": 1138, - "ĠhiËIJ": 1139, - "ĠbËĪaËIJt": 1140, - "ĠthËĪi": 1141, - "Ġam": 1142, - "ĠËĪoËIJ": 1143, - "Ġhu": 1144, - "ĠkËĪÊĮh": 1145, - "ĠzËĪÉijËIJ": 1146, - "ĠÉ¡ËĮÉĶ": 1147, - "ĠËĪÉĻÊĬ": 1148, - "yËĪi": 1149, - "ĠlËĪÊĮ": 1150, - "ĠdËĪeËIJ": 1151, - "ĠsËĪÉĶËIJ": 1152, - "skËĮeËIJ": 1153, - "ɾo": 1154, - "ÊģËĪÉij": 1155, - "tËĪa": 1156, - "ĠkËĪÊĬ": 1157, - 
"ËĪante": 1158, - "ĠdÉĶ": 1159, - "ĠsËĪeɪ": 1160, - "ĠsÉĽt": 1161, - "ɹɪ": 1162, - "ĠÉ¡ËĮÉĻÊĬɪÅĭ": 1163, - "zo": 1164, - "ĠjËĪaËIJ": 1165, - "ĠÉĴvðÉĻ": 1166, - "ĠÊĿ": 1167, - "ĠÉĽl": 1168, - "ĠsËĪoËIJ": 1169, - "ĠthËĪiÉľ": 1170, - "ĠËĪÉĽl": 1171, - "ĠlyËĮi": 1172, - "ndÊĴ": 1173, - "ĠÉķjËĪÉiju": 1174, - "θa": 1175, - "ĠɾËĮÉĻheËIJ": 1176, - "Ġmaɪ": 1177, - "jÉĻ": 1178, - "ĠËĪÊĮb": 1179, - "asjËĪÉĶ": 1180, - "dÊģ": 1181, - "ĠkhËĪa": 1182, - "ĠËĪes": 1183, - "vi": 1184, - "fi": 1185, - "ËĮÉĻb": 1186, - "Ġre": 1187, - "ĠavËĮÉĽ": 1188, - "ĠtËĮi": 1189, - "Ġkɾ": 1190, - "Ġbɪk": 1191, - "ste": 1192, - "ËĪeËIJÊĥc": 1193, - "pt": 1194, - "zÉĻ": 1195, - "ĠwËĪaËIJ": 1196, - "kl": 1197, - "ĠsËĪÊĮm": 1198, - "ɪÊĪ": 1199, - "dz": 1200, - "vo": 1201, - "ËĮaÊĬt": 1202, - "nde": 1203, - "ĠdÉĽs": 1204, - "ĠÉŁËĪaËIJ": 1205, - "ĠrËĮi": 1206, - "sËĮeËIJ": 1207, - "É¡i": 1208, - "Ġals": 1209, - "ËĪiðo": 1210, - "ĠnËĪiÉľn": 1211, - "ÊĬl": 1212, - "tsËIJ": 1213, - "ËĪanto": 1214, - "ĠÉĹËĪÉĻÊĬ": 1215, - "kËIJi": 1216, - "ĠsËĪÊĮb": 1217, - "ĠnËĪa": 1218, - "ĠlËĮo": 1219, - "ĠphËĪi": 1220, - "mËĮe": 1221, - "Ġfa": 1222, - "kÉĻ": 1223, - "ĠzËĪu": 1224, - "ns": 1225, - "ĠÊģe": 1226, - "ĠbËĪo": 1227, - "ËĪaËIJti": 1228, - "Ġman": 1229, - "ĠlËĪiÉij": 1230, - "ĠÉĹËĮyÉĻ": 1231, - "ĠfËĪÉĶËIJ": 1232, - "ĠkÊĭËĪeËIJÊĥc": 1233, - "ĠxËĪÉij": 1234, - "ĠtÉķËĪu": 1235, - "jÉĻɾ": 1236, - "Ġɪst": 1237, - "wËĪi": 1238, - "ĠËĮaɪnÉĻ": 1239, - "ɪɡ": 1240, - "ĠsÊĪ": 1241, - "ËĪiÉĻl": 1242, - "ĠnËĪiÉĽÉľn": 1243, - "ĠËĮÉĽËIJ": 1244, - "ËĪaɪnd": 1245, - "ĠzËĪi": 1246, - "vÉĻn": 1247, - "mz": 1248, - "ðos": 1249, - "dÊĴËIJ": 1250, - "jËĪa": 1251, - "ɾËĪÉĶ": 1252, - "lËĪe": 1253, - "ʲ": 1254, - "ĠvËĪÉĶ": 1255, - "ĠlËĪiÉĽ": 1256, - "θe": 1257, - "mËĪente": 1258, - "ĠɪnðÉĻ": 1259, - "Ġaɪm": 1260, - "nÉĻn": 1261, - "ĠhÉĻm": 1262, - "ɾaËIJ": 1263, - "ĠsËĪuoÉľ": 1264, - "ĠɲËĪi": 1265, - "ĠɹËĪiÉĻl": 1266, - "lËĪa": 1267, - "ĠbËĪÉĶ": 1268, - "ĠkËĪai": 1269, - "ÊģËĪa": 1270, - "ĠwËĪÉľËIJ": 1271, - "ĠaËIJ": 1272, - "Ġpas": 
1273, - "ËĪÊĮs": 1274, - "wËĪÉĽÉ¾": 1275, - "ĠÉĹËĪe": 1276, - "ĠhËĮatÉĻ": 1277, - "aɪn": 1278, - "ĠËĪÉĶpʰ": 1279, - "ÊģËĪe": 1280, - "ĠÉŁaËIJËĪeËIJÉ¡aËIJ": 1281, - "ĠËĪÊĬs": 1282, - "ĠtÉķhËĪiÉľ": 1283, - "ntÊĥ": 1284, - "ĠxËĪuo": 1285, - "ËĪuÊģ": 1286, - "Ġɪm": 1287, - "ɳÉĸ": 1288, - "ËĪyÉĻÉľkh": 1289, - "ĠËĪyÉĽ": 1290, - "ĠmËĮaËIJ": 1291, - "ÅĵÊģ": 1292, - "ĠËĪalt": 1293, - "ĠkÉĻm": 1294, - "Êİo": 1295, - "ĠÉIJn": 1296, - "Ġfy": 1297, - "ĠËĮÉĽra": 1298, - "ĠÉ¡ËĪÊĬ": 1299, - "ĠpËĪÊĮ": 1300, - "ls": 1301, - "ĠlËĪiËIJ": 1302, - "ĠÊĤËĪy": 1303, - "ĠbɪkËĪÊĮz": 1304, - "ĠÉ¡ÉĽt": 1305, - "Ġbɾ": 1306, - "tʰ": 1307, - "tÉĻlËĮÉĻb": 1308, - "xo": 1309, - "skËĮaËIJ": 1310, - "ɲʲ": 1311, - "ËĪeËIJkÊĪ": 1312, - "rÉĻ": 1313, - "tÊĥo": 1314, - "ĠpÊģÉĶ": 1315, - "ĠɹËĪaɪt": 1316, - "ĠpËĪei": 1317, - "ËĮɪç": 1318, - "jËĪÉĽÉ¾": 1319, - "tËIJa": 1320, - "ĠÉIJbËĮaÊĬt": 1321, - "ĠkÊĭËĪeËIJÊĥcÉĻn": 1322, - "ĠvËĪe": 1323, - "ÊĬÉľ": 1324, - "ĠakËĪe": 1325, - "ĠpËĪai": 1326, - "vËĪÉĽ": 1327, - "Ġθɹ": 1328, - "ɪf": 1329, - "ĠavËĪÉĽ": 1330, - "ĠkËĪe": 1331, - "dËĪi": 1332, - "ËĪeËIJÉĸ": 1333, - "ĠbÉĻt": 1334, - "ÊĪʰ": 1335, - "teËIJ": 1336, - "θjËĪÉĶn": 1337, - "dÉľ": 1338, - "ĠjËĪiÉľ": 1339, - "Ġve": 1340, - "É£ËĪu": 1341, - "ËĪÊĮhÉĻl": 1342, - "ĠpÉĶ": 1343, - "ĠÉ¡r": 1344, - "Ġða": 1345, - "ĠvËĪiËIJ": 1346, - "ĠËĮÉijËIJ": 1347, - "ËĪÉĻÊĬnt": 1348, - "ĠbËĪaËIJɾ": 1349, - "ĠmËĪÊĮtÉĻlËĮÉĻb": 1350, - "ld": 1351, - "ĠtÉķËĮÉĶ": 1352, - "pa": 1353, - "ðËĪad": 1354, - "ËĪiɾ": 1355, - "ĠxËĪu": 1356, - "ĠlËĪiÉľÅĭ": 1357, - "ËĪeɪs": 1358, - "ĠÉĹËĮeÉľn": 1359, - "ĠthËĪiÉĽ": 1360, - "tËIJe": 1361, - "ĠavËĮÉĽk": 1362, - "ĠËĮÉĶ": 1363, - "ĠkËĪÉiju": 1364, - "ɪv": 1365, - "iËIJz": 1366, - "ËĪos": 1367, - "Ġɡɹ": 1368, - "and": 1369, - "ĠlËĪiou": 1370, - "ĠËĪoÉľ": 1371, - "É¡l": 1372, - "ĠpËĪÉĶËIJ": 1373, - "ĠmËĮeËIJ": 1374, - "ĠkËĪÉĴ": 1375, - "nos": 1376, - "çÉĻn": 1377, - "fÉĻn": 1378, - "ĠsËĪÊĮktËĮeËIJ": 1379, - "ĠËĪaɪn": 1380, - "ËĪoËIJre": 1381, - "jËĪÉĽn": 1382, - "ĠðËĪÉĽn": 1383, - "ĠtÉķhËĪiÉĽÉľn": 
1384, - "ĠhËĪaɪ": 1385, - "ɾËĪÉĽ": 1386, - "ĠsËĪu": 1387, - "ĠkËĪɪjaËIJ": 1388, - "ĠpjËĮÊĬ": 1389, - "ĠhÉĻmËĮaËIJ": 1390, - "ĠËĮÊĮp": 1391, - "ĠpËĪÊĮhÉĻl": 1392, - "ĠxËĪÉĻ": 1393, - "dËĪe": 1394, - "ĠmÉij": 1395, - "ĠÊĬm": 1396, - "ndÉĻ": 1397, - "ĠdËĪÉĻÊĬnt": 1398, - "ËĪeËIJÊĥÉĻn": 1399, - "Ġðats": 1400, - "is": 1401, - "ĠcËĪaËIJh": 1402, - "pe": 1403, - "ĠsËĮo": 1404, - "ĠðËĪe": 1405, - "ĠsËĪaËIJt": 1406, - "ËĪaÊģ": 1407, - "ĠsËĪe": 1408, - "ÉĻk": 1409, - "ɪÊĭ": 1410, - "ĠkËĪoËIJi": 1411, - "kÉĶ": 1412, - "ĠvËĪaËIJÊĬ": 1413, - "ĠfËĪei": 1414, - "ĠlËĪeËIJk": 1415, - "ĠhËĪiÉĻ": 1416, - "ĠaÊĬ": 1417, - "ËĪÉĽndo": 1418, - "ËĪes": 1419, - "ĠzËĪÉĶ": 1420, - "ĠËĪÉĽÉ¾a": 1421, - "nËĪiÉľn": 1422, - "ĠkËĪÊĮm": 1423, - "ĠlËĪÉĴ": 1424, - "ɪst": 1425, - "ĠpÉij": 1426, - "ĠfËĪÉĶ": 1427, - "ĠthËĪonÉ¡": 1428, - "nke": 1429, - "ËĮɪk": 1430, - "ĠɲËĪÉĻ": 1431, - "ËĮÊĮm": 1432, - "ËĪiËIJt": 1433, - "ĠwËĪÉĴnt": 1434, - "ËĪaβan": 1435, - "ĠbËĪÊĮr": 1436, - "ÉĽnd": 1437, - "ĠËĮÉijËIJbÉľ": 1438, - "ĠvËĪaɪ": 1439, - "ĠtÊĥËĮi": 1440, - "ĠθËĪɪÅĭk": 1441, - "sti": 1442, - "Ġkɹ": 1443, - "ĠËĪaÊĬt": 1444, - "stÉĻn": 1445, - "ĠÊĭËĪÊĮn": 1446, - "ĠÉ¡ËĮaËIJ": 1447, - "ËĪaËIJÉľÉ²": 1448, - "Êģi": 1449, - "ĠnËĪÉĶx": 1450, - "ĠɹËĪiÉĻlɪ": 1451, - "ĠvËĮi": 1452, - "ĠðeÉĻ": 1453, - "ËĮɪtÊĥ": 1454, - "ĠvËĪyÉĻ": 1455, - "ĠËĮaËIJpkËĮaËIJ": 1456, - "ĠfËĮaËIJɪ": 1457, - "ĠpËĪÉĶ": 1458, - "ĠnËĪÊĮmb": 1459, - "θes": 1460, - "jËĪÉĽÊģ": 1461, - "ĠkËĪÊĬcʰ": 1462, - "mËĪÉĽ": 1463, - "ĠvËĪu": 1464, - "ĠlÅĵÊģ": 1465, - "ĠiËIJm": 1466, - "ÊĪÉĻɾ": 1467, - "tÊĥi": 1468, - "ËIJs": 1469, - "ĠtËĪy": 1470, - "ĠmËĪiÉľÅĭ": 1471, - "ɾËĪe": 1472, - "mËĮa": 1473, - "ĠmËĮiËIJ": 1474, - "ĠÉĽks": 1475, - "ɪp": 1476, - "ĠkËĪÊĮɾnËĮaËIJ": 1477, - "ĠËĮaÊĬx": 1478, - "rËĪiËIJ": 1479, - "ĠcËĪÊĮl": 1480, - "mos": 1481, - "ĠkËĪÊĮɾtËĮeËIJ": 1482, - "iËIJɾ": 1483, - "kÉĻn": 1484, - "ĠdËĪu": 1485, - "naËIJ": 1486, - "ĠpwËĪe": 1487, - "ËĮÉĶɪ": 1488, - "ĠtÉķhËĪiÉĽ": 1489, - "ĠβËĪi": 1490, - "ËĪiÉĽÉľt": 1491, - "Ġte": 1492, - "ËĪaðos": 1493, 
- "mËĪa": 1494, - "ĠvËĪo": 1495, - "ĠmËĪɪ": 1496, - "ĠbËĮi": 1497, - "ad": 1498, - "do": 1499, - "ĠnËĪaÊĬ": 1500, - "ĠʲËĪyÉľ": 1501, - "wËĪÉĽ": 1502, - "ËĪis": 1503, - "el": 1504, - "Ġpar": 1505, - "ĠtËĪai": 1506, - "ĠdËĪɪjaËIJ": 1507, - "hËĪi": 1508, - "ĠɾËĪÊĮ": 1509, - "ĠdËĪe": 1510, - "ËĪaɪd": 1511, - "Ġper": 1512, - "ĠsËĮÉĶ": 1513, - "we": 1514, - "ÊĬm": 1515, - "Ġin": 1516, - "ĠjËĪuËIJz": 1517, - "ËĪiËIJpÉĻl": 1518, - "ĠÊĭËĪaËIJl": 1519, - "ĠetËĪÉĽ": 1520, - "ËĮÉĽm": 1521, - "ĠnËĪu": 1522, - "ËĪÉĽkt": 1523, - "ĠiËIJɾ": 1524, - "Ġbɹ": 1525, - "ĠtshËĪi": 1526, - "ĠÉĹËĪÉĶÉľ": 1527, - "ĠkwËĮa": 1528, - "ĠfËĪuÉľ": 1529, - "wËĮa": 1530, - "ĠdËĪiËIJ": 1531, - "ĠÉ¡ËĪyÉĻ": 1532, - "ËĮÉĽËIJ": 1533, - "rËĪa": 1534, - "Ġne": 1535, - "ĠzËĪyÉĻ": 1536, - "ĠbËĪaɪ": 1537, - "ĠÉŁËĪÊĮb": 1538, - "ËĪuËIJto": 1539, - "ÊĬnt": 1540, - "Ġcʰ": 1541, - "ËĪÉĽnti": 1542, - "ËĪoÉĻ": 1543, - "ĠsËĮÊĮm": 1544, - "ĠlÉij": 1545, - "ËĮeva": 1546, - "É¾ÉĽ": 1547, - "ntÉľ": 1548, - "ĠmËĪÉĽn": 1549, - "ËĪÉijËIJk": 1550, - "Ġkil": 1551, - "ËĪones": 1552, - "ff": 1553, - "ĠmËĪÉĽËIJ": 1554, - "ĠvËĪÉĻɪ": 1555, - "ĠËĪÉĶËIJ": 1556, - "ĠËĮɪnt": 1557, - "ÊĬn": 1558, - "Ġwɪl": 1559, - "Ġsin": 1560, - "ĠËĮalla": 1561, - "ĠaβËĪia": 1562, - "pi": 1563, - "ËĪoÉľ": 1564, - "ɪjËĮaËIJ": 1565, - "ku": 1566, - "ĠvËĪɪ": 1567, - "Ġtut": 1568, - "ĠtËĪeÉľ": 1569, - "ĠhËĪÉĶ": 1570, - "βɾe": 1571, - "sÉĻɾ": 1572, - "ĠkhËĪai": 1573, - "ĠmËĪÉĶ": 1574, - "Ġta": 1575, - "ĠɲËĪaËIJ": 1576, - "Ġnu": 1577, - "ËĪuËIJn": 1578, - "ĠÉĻËIJÉľ": 1579, - "ĠËĪaÊĬf": 1580, - "ËĪiËIJdÉľ": 1581, - "nti": 1582, - "ĠpËĪiËIJpÉĻl": 1583, - "Ġkj": 1584, - "Ġpe": 1585, - "ĠmËĪÉij": 1586, - "ËĮaɪ": 1587, - "ËĪaËIJle": 1588, - "ĠvËĮÉĻËIJÉªÉľ": 1589, - "mpo": 1590, - "ĠkËĪɪt": 1591, - "ĠnËĮÉĽ": 1592, - "ĠÉŁËĪaËIJtaËIJ": 1593, - "ĠsËĪaËIJtʰ": 1594, - "ĠÉŁËĪi": 1595, - "Ġso": 1596, - "ĠbËĪÉĽ": 1597, - "kËĪi": 1598, - "ɪti": 1599, - "Ġtsi": 1600, - "ĠkÊģ": 1601, - "ËĮÉĴ": 1602, - "É¡ÉĻl": 1603, - "kst": 1604, - "ĠmËĪÉĻËIJ": 1605, - "ËĪÊĮk": 1606, - 
"ĠnËĪaËIJÊĬ": 1607, - "Ġap": 1608, - "ĠlËĪɪkʰ": 1609, - "lli": 1610, - "ĠkwËĪal": 1611, - "ĠËĪÉĻËIJ": 1612, - "ĠtsËĪuei": 1613, - "Ġdo": 1614, - "ĠkËIJjËĪo": 1615, - "ÊĬz": 1616, - "ĠpËĪaËIJ": 1617, - "ĠmËĪuËIJ": 1618, - "ĠÉ¡ÉĻv": 1619, - "rËĪi": 1620, - "Ġtw": 1621, - "ËĮɪn": 1622, - "dËĪÉij": 1623, - "ĠðËĪi": 1624, - "ĠËĪaËIJi": 1625, - "ĠhËĪiÉĽ": 1626, - "ĠðËĮÉĽm": 1627, - "ĠpʰËĪɪɾ": 1628, - "ÉĴm": 1629, - "ĠËĮeËIJ": 1630, - "ĠthËĪaiÉľ": 1631, - "ĠvËĪas": 1632, - "ĠnÉijËIJ": 1633, - "pÉĻn": 1634, - "ĠpËĮÉĻɾ": 1635, - "ĠÉĹËĪaËIJɪ": 1636, - "ËĪouÉľ": 1637, - "ĠÊIJËĪuÉľ": 1638, - "ĠmËĪan": 1639, - "ĠtËĪÉĻÉªÉľ": 1640, - "ĠlËĪaËIJÊĬ": 1641, - "mËĪÉĽnte": 1642, - "ĠfËĪam": 1643, - "sjËĪÉĶ": 1644, - "ĠpËĪÉĻ": 1645, - "ËĪeËIJm": 1646, - "ĠpËĪÊĮr": 1647, - "jËĪi": 1648, - "ĠlÉĽ": 1649, - "Ġten": 1650, - "ËĪoËIJra": 1651, - "ki": 1652, - "ĠÊĤËĪaËIJÊĬ": 1653, - "kɪ": 1654, - "bËIJe": 1655, - "ËĪalt": 1656, - "ðɪ": 1657, - "pËĪi": 1658, - "ĠËĮÉĽnt": 1659, - "ĠmËĪei": 1660, - "ĠhËĪÉĻÊĬ": 1661, - "ĠhËĪÉĽÉ¾": 1662, - "jËĪÉij": 1663, - "ĠhËĪÊĬaËIJ": 1664, - "mÉľ": 1665, - "Ġdʰ": 1666, - "ĠtÊĥËĪe": 1667, - "lËĪÉĽ": 1668, - "ËĪaËIJte": 1669, - "ĠpËĪuËIJ": 1670, - "ĠmËĪÊĬ": 1671, - "ËĪaËIJɪÊĪ": 1672, - "diËIJ": 1673, - "ĠfɹÉĴm": 1674, - "ĠhËĪÉijËIJ": 1675, - "βo": 1676, - "ĠmËĪiÉľn": 1677, - "ĠðiËIJz": 1678, - "ĠkËĪou": 1679, - "ËĪiËIJna": 1680, - "ĠavËĮeva": 1681, - "ĠËĪaËIJɾ": 1682, - "ĠnËĪuËIJɾ": 1683, - "ĠβËĪe": 1684, - "Ġzaɪn": 1685, - "ËĪÉĽd": 1686, - "ÉĹ": 1687, - "ËĪeɪk": 1688, - "sËĮÉĻÊĬ": 1689, - "ËĪeËIJÉŁ": 1690, - "ĠÊĤËĪÉĻËIJ": 1691, - "je": 1692, - "cʰËIJ": 1693, - "ËĪÉĶr": 1694, - "ÉĽËIJ": 1695, - "ĠtÉķhËĪyÃ¦Éľn": 1696, - "ĠËĮaɪnÉĻn": 1697, - "ĠiËIJn": 1698, - "ĠbËĪÊĮc": 1699, - "ËĪiËIJm": 1700, - "ɾas": 1701, - "ËĮÉĻs": 1702, - "ĠvËĪeËIJ": 1703, - "ĠËĪÉĻrÉľ": 1704, - "ĠduËIJ": 1705, - "ntÉĻ": 1706, - "ĠpɹËĪÉĴ": 1707, - "ĠbËĪɪ": 1708, - "ĠwËĪoÉľ": 1709, - "nËĮi": 1710, - "ĠhÉIJ": 1711, - "ĠkËĪÉĽ": 1712, - "Ġet": 1713, - "jËĪÉĽndo": 1714, - "ĠËĪaiÉľ": 1715, - "Ġli": 
1716, - "ĠËĪaÊĬs": 1717, - "kËIJo": 1718, - "ĠÉĹËĪyÉĻ": 1719, - "keËIJ": 1720, - "ĠfËĪiËIJl": 1721, - "ĠbʰËĪaËIJi": 1722, - "ĠÉ¡ÉĻÊĥ": 1723, - "ÊĴËĪe": 1724, - "ĠnjËĪuËIJ": 1725, - "ĠËĪak": 1726, - "ĠÉĹËĪaËIJ": 1727, - "zËĪa": 1728, - "vËĪe": 1729, - "ĠhËĮaÊĬ": 1730, - "ÉIJç": 1731, - "ĠɾËĪÊĮkʰ": 1732, - "pËĪe": 1733, - "ĠtÉĻbi": 1734, - "ĠpËĪÊĮhÉĻlËĮeËIJ": 1735, - "ĠfËĪÉĽ": 1736, - "ĠwËĮɪtÊĥ": 1737, - "ĠtÉķËĪyÉĽÉľ": 1738, - "wËĮe": 1739, - "ËĮaɪt": 1740, - "ĠnÉijËIJx": 1741, - "ĠkËĪÉĶËIJn": 1742, - "ÊĬk": 1743, - "ĠbËĪaËIJd": 1744, - "ÅĭÉĻn": 1745, - "Ġni": 1746, - "ĠbËĪe": 1747, - "ĠmËĮÊĬ": 1748, - "ËĪar": 1749, - "ĠmËĮeɪk": 1750, - "ĠsËĪaËIJɾ": 1751, - "βe": 1752, - "ĠtÉķhËĪiÉľÅĭ": 1753, - "itËĪe": 1754, - "kËĮe": 1755, - "ËĪÉĽËIJl": 1756, - "ËĮÉĴn": 1757, - "ËĮÉij": 1758, - "ĠbËĪɪl": 1759, - "ĠwÊĬd": 1760, - "ĠbËĪoËIJl": 1761, - "rd": 1762, - "iÉĻ": 1763, - "Ġda": 1764, - "ĠbËĪaËIJÊĬ": 1765, - "ĠnËĪÊĮmbÉĻɾ": 1766, - "ËĪaËIJÉªÉľ": 1767, - "ĠÉĽm": 1768, - "ĠmiËIJɾ": 1769, - "ËĪeɪm": 1770, - "los": 1771, - "ËĮÉĽt": 1772, - "ĠËĮaÊĬs": 1773, - "ĠmËĪaÉľt": 1774, - "ĠwËĪuÉĻ": 1775, - "ĠwËĪeɪ": 1776, - "Ġseɲ": 1777, - "ĠbjËĪÉĽ": 1778, - "ĠwÉĽn": 1779, - "fl": 1780, - "ĠkhwËĪa": 1781, - "dËĪÉĽ": 1782, - "vɹɪ": 1783, - "ĠËĪaɾ": 1784, - "jËĪÉijuÉľ": 1785, - "ĠËĮaËIJpkËĮeËIJ": 1786, - "bÊģ": 1787, - "ĠtËĪaɪm": 1788, - "ĠËĪÉij": 1789, - "ĠsËĮa": 1790, - "ĠzËĪoɪ": 1791, - "ËĪÉĶɾa": 1792, - "ĠdËĪø": 1793, - "ËĪÉĶɾt": 1794, - "ĠÅĭËĪÉĶ": 1795, - "min": 1796, - "ĠlËĪÊĬk": 1797, - "ËĪÉĶËIJt": 1798, - "ĠËĪÉĶtɾ": 1799, - "ĠfËĪaɪ": 1800, - "ĠÉ¡ÉĴt": 1801, - "ËĪeËIJÉĻn": 1802, - "kËĪÉĶ": 1803, - "ĠvËĪÉĽÉ¹i": 1804, - "mÉĽ": 1805, - "ËĪaɪz": 1806, - "Ġesp": 1807, - "ɲa": 1808, - "ĠlËĪo": 1809, - "ËĪÉĽËIJra": 1810, - "βËĪi": 1811, - "ouÉľ": 1812, - "ËĮÉĻk": 1813, - "tÊĥuËIJ": 1814, - "ĠnËĪyÉĻ": 1815, - "ÊĪɾ": 1816, - "ĠÉ¡ËĪy": 1817, - "ĠtËĪoðo": 1818, - "ËĪɪçt": 1819, - "Ġmɪç": 1820, - "ĠËĪand": 1821, - "ĠkwËĮÉĽl": 1822, - "ĠÊĤËĪaËIJ": 1823, - "ĠnËĪiÉľ": 1824, - "ËĪÉĶp": 1825, - 
"ËĪiËIJz": 1826, - "ĠÊĤËĪaÊĬ": 1827, - "ĠɾËĮÉĻhi": 1828, - "ĠsËĮÊĬo": 1829, - "ĠÉĽÉ¡": 1830, - "ĠdÅĵ": 1831, - "ĠÉ¡ËĮaËIJÉªÉľ": 1832, - "dɪ": 1833, - "lËĮa": 1834, - "stËĪi": 1835, - "ĠdËĮiËIJz": 1836, - "ĠtËĮÊĬ": 1837, - "θi": 1838, - "ĠËĪɪskËĮoËIJ": 1839, - "ndÉĻn": 1840, - "Ġtsv": 1841, - "ĠhËĪÉĻËIJ": 1842, - "ĠÊĥËĪÊĬ": 1843, - "ÉĻtËĮeËIJ": 1844, - "pËĮÉĽ": 1845, - "ËĪaɾÉĶn": 1846, - "ĠpÉĽÊģ": 1847, - "Ġy": 1848, - "mnËĮeËIJ": 1849, - "ËĪÉĽllo": 1850, - "ĠÉ¡ËĪÉĻ": 1851, - "ĠËĮad": 1852, - "ĠÊĥv": 1853, - "ËĪÊıɾ": 1854, - "rËĪe": 1855, - "yËIJ": 1856, - "ĠpËĪaËIJs": 1857, - "ĠËĪÉĽn": 1858, - "ɪdÊĴ": 1859, - "ËĪuai": 1860, - "Ġfi": 1861, - "ĠtËĪyÉĻ": 1862, - "ËĪaËIJÉŁ": 1863, - "ĠtjËĪe": 1864, - "ËĪaËIJnaËIJ": 1865, - "stɾ": 1866, - "Êİe": 1867, - "ËĮeɪt": 1868, - "ba": 1869, - "ðas": 1870, - "vÊģ": 1871, - "ĠzËĪÉĻËIJ": 1872, - "ËĪaËIJli": 1873, - "ÉŁÊ°eËIJ": 1874, - "ËĪaËIJteËIJ": 1875, - "ĠvËĪa": 1876, - "Ġsal": 1877, - "ËĪaËIJno": 1878, - "ĠÉ¡ÉĻz": 1879, - "ĠhËĪoËIJti": 1880, - "ĠɲËĪiÉĽ": 1881, - "tÉľ": 1882, - "ĠËĪaËIJp": 1883, - "ĠwËĪÉĽl": 1884, - "ĠmËĪɪl": 1885, - "ĠfyËIJɾ": 1886, - "ËĪÉĽËIJsaËIJ": 1887, - "ĠbËĮiËIJ": 1888, - "ËĪaËIJjaËIJ": 1889, - "ËĪɪp": 1890, - "ĠfÊģ": 1891, - "tsiËĪoËIJne": 1892, - "ĠwËĪuÉľ": 1893, - "Ġvi": 1894, - "ĠwËĪÉijÉľn": 1895, - "ËĪoËIJn": 1896, - "ĠÉĹËĪÉĻɪ": 1897, - "ĠÊĿËĪo": 1898, - "Ġra": 1899, - "mÉĻnt": 1900, - "ËĪaÊĬnd": 1901, - "ĠpÉĽÉ¾": 1902, - "ĠÉĹËĪaËIJÊĬ": 1903, - "oËIJɾ": 1904, - "hËĪo": 1905, - "ĠÉĴn": 1906, - "ĠÊİe": 1907, - "ĠsËĪɪks": 1908, - "É¡n": 1909, - "ĠÉ¡ËĪa": 1910, - "Ġθj": 1911, - "ĠpËĪe": 1912, - "spe": 1913, - "ĠvËĪÉĻ": 1914, - "ĠfËĪɪ": 1915, - "ĠËĮɪntÊĬ": 1916, - "lÉĻn": 1917, - "ĠnËĪiËIJd": 1918, - "ĠsËĮÊĬa": 1919, - "ĠËĪum": 1920, - "ĠdËĪeɪ": 1921, - "ĠËĪÊĮbʰi": 1922, - "ËĪÉijËIJɾ": 1923, - "ĠbËĪiÉĽÉľt": 1924, - "Êİos": 1925, - "ĠtshËĪaiÉľ": 1926, - "ĠËĮɪskËĮaËIJ": 1927, - "ĠaÊĬÉĻ": 1928, - "ĠËĪyæ": 1929, - "Ġdyn": 1930, - "ĠmËĪiËIJn": 1931, - "ĠËĪÊĮcʰËIJ": 1932, - "ĠsÉĽ": 1933, - "ĠnËĪy": 1934, - 
"ĠnËĮÉĽl": 1935, - "ɡɾ": 1936, - "ÊĥËĪe": 1937, - "ĠÊĤËĮÉĽ": 1938, - "ĠËĪÉĽvɹɪ": 1939, - "ËĪÉĽlp": 1940, - "ĠbËĪak": 1941, - "ĠeËIJ": 1942, - "ĠfËĪaËIJ": 1943, - "ĠkÉĽl": 1944, - "ĠËĪeËIJs": 1945, - "jËĪaËIJd": 1946, - "ĠlËĮi": 1947, - "mbɾe": 1948, - "ktÉĻ": 1949, - "nta": 1950, - "tËĪu": 1951, - "ĠðËĪat": 1952, - "ĠËĪaβ": 1953, - "ÉĻɹi": 1954, - "ĠkwËĮÉĽlla": 1955, - "ĠbÉĻn": 1956, - "rËĮÉĽ": 1957, - "ĠnÉĶ": 1958, - "ĠÉ¡ËĪɪ": 1959, - "ĠËĪap": 1960, - "ɹÉĻ": 1961, - "ËĪaÉľkh": 1962, - "ĠÊIJËĪi": 1963, - "ĠËĪÉijËIJ": 1964, - "ɪɡÉĻn": 1965, - "ĠwËĪai": 1966, - "ĠpÉĻt": 1967, - "kËIJa": 1968, - "ĠbËĪÉĽËIJ": 1969, - "ËĪeËIJÊĭ": 1970, - "lsÉĻÊĬ": 1971, - "ĠcËĪaËIJhɪËĮeËIJ": 1972, - "ĠkÉĻn": 1973, - "ĠËĮaɪnÉĻm": 1974, - "ËĪuËIJt": 1975, - "ĠhËĪaÊĬ": 1976, - "ĠtËĪanto": 1977, - "ĠhÉIJz": 1978, - "ĠsËĪÊĮɾ": 1979, - "Ġno": 1980, - "ĠtËĪÉĶËIJ": 1981, - "ĠzËĪaɪ": 1982, - "ĠtÉķËĪiÉĽÉľ": 1983, - "ĠkozËĪi": 1984, - "ĠkËĪei": 1985, - "ðËĪÉĶɾ": 1986, - "ËĮÉĶÊģ": 1987, - "ĠtËĪÊĮɾ": 1988, - "ĠÊIJËĪÉĻ": 1989, - "ĠÉķËĪyÉĽÉľ": 1990, - "ĠmËĮÊĬÉŁÊ°eËIJ": 1991, - "mf": 1992, - "ĠvËĪiËIJdÉľ": 1993, - "kËĪa": 1994, - "ĠÉIJÉ¡": 1995, - "kw": 1996, - "ĠÊģÉĽ": 1997, - "xÉĻn": 1998, - "ĠdÊĬ": 1999, - "ĠkËĪÊĮɾnËĮeËIJ": 2000, - "jËĪaËIJdaËIJ": 2001, - "ĠfÉĻ": 2002, - "ĠËĮimp": 2003, - "Ġhɪz": 2004, - "ĠʰÏĩ": 2005, - "ËĪoËIJni": 2006, - "ĠxËĪiÉľ": 2007, - "ËĪeËIJsÊĪ": 2008, - "ÊıbÉľ": 2009, - "ËĮÉĶɾke": 2010, - "ĠÉ¡ËĪÉĻÊĬ": 2011, - "ËĪɪÊĥÉĻn": 2012, - "les": 2013, - "ĠfËĪiËIJ": 2014, - "É¡tÉĻ": 2015, - "ËĪeËIJre": 2016, - "ĠvËĮaËIJ": 2017, - "ĠËĪeɪ": 2018, - "ĠmËĪuÉĻÉľn": 2019, - "ĠÉ¡ËĪÊĬd": 2020, - "ĠmËĮaɪn": 2021, - "zËĪe": 2022, - "ĠlËĪiÉľ": 2023, - "Ġmu": 2024, - "ĠkËĮÉĽl": 2025, - "ĠjËĮÉĻh": 2026, - "ĠfËĮÉĶɾ": 2027, - "fɹ": 2028, - "ĠkËĪaɪn": 2029, - "ĠËĪÉĴlsÉĻÊĬ": 2030, - "θɪÅĭ": 2031, - "ĠthËĪonÉ¡Éľ": 2032, - "tËĪÉij": 2033, - "θjo": 2034, - "mËĪÉĶ": 2035, - "Ġos": 2036, - "ĠsÊĬ": 2037, - "ĠsËĪÊĮmÉĻ": 2038, - "ĠvËĮÉĽn": 2039, - "nËĪo": 2040, - "ĠËĪaktÊĥuËIJ": 2041, - "É£a": 2042, - "Ġtʰi": 
2043, - "ĠfËĮi": 2044, - "ĠvËĪÉĽl": 2045, - "ĠtËĪutËIJi": 2046, - "xos": 2047 - }, - "merges": [ - [ - "Ë", - "Ī" - ], - [ - "Ë", - "IJ" - ], - [ - "ËĪ", - "É" - ], - [ - "Ë", - "Į" - ], - [ - "É", - "Ļ" - ], - [ - "ËĪ", - "a" - ], - [ - "ËĪ", - "i" - ], - [ - "Ġ", - "t" - ], - [ - "É", - "ª" - ], - [ - "É", - "¾" - ], - [ - "Ġ", - "É" - ], - [ - "Ġ", - "k" - ], - [ - "É", - "ľ" - ], - [ - "Ġ", - "s" - ], - [ - "ËĪ", - "e" - ], - [ - "É", - "Ľ" - ], - [ - "ËĪ", - "o" - ], - [ - "Ġ", - "l" - ], - [ - "ËĪÉ", - "Ľ" - ], - [ - "Ġ", - "d" - ], - [ - "Ê", - "Ĭ" - ], - [ - "ËĪa", - "ËIJ" - ], - [ - "Ġ", - "p" - ], - [ - "Ì", - "ĥ" - ], - [ - "Ġ", - "m" - ], - [ - "ËĪ", - "u" - ], - [ - "Å", - "ĭ" - ], - [ - "Ã", - "°" - ], - [ - "ËĪÉ", - "Ķ" - ], - [ - "Ê", - "Į" - ], - [ - "ËĮ", - "a" - ], - [ - "Ġ", - "h" - ], - [ - "ËĪ", - "ÊĮ" - ], - [ - "Ġ", - "n" - ], - [ - "Ê", - "ģ" - ], - [ - "ËĪÉ", - "ij" - ], - [ - "Ê", - "ĥ" - ], - [ - "e", - "ËIJ" - ], - [ - "Ġ", - "a" - ], - [ - "Ġ", - "b" - ], - [ - "É", - "Ķ" - ], - [ - "ËĪÉ", - "Ļ" - ], - [ - "ÉĻ", - "n" - ], - [ - "Ġ", - "f" - ], - [ - "ËĪÉ", - "ª" - ], - [ - "É", - "¡" - ], - [ - "ËĪe", - "ËIJ" - ], - [ - "Ġ", - "j" - ], - [ - "n", - "t" - ], - [ - "Ġ", - "ð" - ], - [ - "Ġ", - "ËĮ" - ], - [ - "Ġt", - "s" - ], - [ - "ĠÉ", - "¡" - ], - [ - "É", - "ķ" - ], - [ - "ËĪo", - "ËIJ" - ], - [ - "Ê", - "°" - ], - [ - "a", - "ËIJ" - ], - [ - "ËĪ", - "y" - ], - [ - "Ġt", - "Éķ" - ], - [ - "ËĪi", - "ËIJ" - ], - [ - "Ġ", - "Ê" - ], - [ - "Ġ", - "v" - ], - [ - "Ġ", - "w" - ], - [ - "s", - "t" - ], - [ - "É", - "ij" - ], - [ - "n", - "d" - ], - [ - "ËĮ", - "i" - ], - [ - "Ì", - "ª" - ], - [ - "ËĮ", - "e" - ], - [ - "Ġ", - "z" - ], - [ - "ËĪa", - "ɪ" - ], - [ - "ËĪi", - "ÉĽ" - ], - [ - "Î", - "²" - ], - [ - "É", - "¹" - ], - [ - "Ġ", - "ËĮa" - ], - [ - "Î", - "¸" - ], - [ - "Ġh", - "ÉĽ" - ], - [ - "Ê", - "Ī" - ], - [ - "i", - "ËIJ" - ], - [ - "ËĮ", - "o" - ], - [ - "Ġ", - "ɪ" - ], - [ - "Éľ", - "n" - ], - [ - "Ġ", - "x" - ], - [ - "Ġt", 
- "ÉĻ" - ], - [ - "ËĪu", - "ËIJ" - ], - [ - "ËĮ", - "ÉĻ" - ], - [ - "Ġj", - "ËĪi" - ], - [ - "ËĮ", - "ÉĽ" - ], - [ - "ĠÉ", - "Ľ" - ], - [ - "Ġ", - "ËĪa" - ], - [ - "ËĮa", - "ËIJ" - ], - [ - "Ġl", - "a" - ], - [ - "Ġð", - "e" - ], - [ - "ĠhÉĽ", - "ËIJ" - ], - [ - "Ġ", - "e" - ], - [ - "Ã", - "§" - ], - [ - "ÉĻ", - "l" - ], - [ - "o", - "ËIJ" - ], - [ - "ËĪÉij", - "u" - ], - [ - "Ê", - "Ĵ" - ], - [ - "u", - "ËIJ" - ], - [ - "ĠÉ", - "Ĺ" - ], - [ - "ĠÉ", - "ķ" - ], - [ - "ËĮ", - "eËIJ" - ], - [ - "ĠtÉķ", - "ËĪi" - ], - [ - "o", - "s" - ], - [ - "ËĪÉĶ", - "ËIJ" - ], - [ - "a", - "s" - ], - [ - "ËĪ", - "ÊĬ" - ], - [ - "Ġ", - "i" - ], - [ - "ËĪa", - "i" - ], - [ - "É", - "²" - ], - [ - "ɪ", - "n" - ], - [ - "t", - "s" - ], - [ - "Éľ", - "Åĭ" - ], - [ - "ĠÉ", - "Ł" - ], - [ - "Ġ", - "Êĥ" - ], - [ - "ËĪe", - "ɪ" - ], - [ - "ÉĽ", - "ɾ" - ], - [ - "ËĪÉĽ", - "ËIJ" - ], - [ - "ËĪÉĽ", - "ɾ" - ], - [ - "Ġ", - "r" - ], - [ - "t", - "Êĥ" - ], - [ - "ËĮ", - "ÉĶ" - ], - [ - "Ġd", - "ÉĻ" - ], - [ - "t", - "ÉĻ" - ], - [ - "o", - "u" - ], - [ - "ËĪy", - "ÉĻ" - ], - [ - "ĠËĮ", - "i" - ], - [ - "ÉĻ", - "ɾ" - ], - [ - "ËĪÉĻ", - "ÊĬ" - ], - [ - "ËĪÊĮ", - "ɾ" - ], - [ - "ËĪÉ", - "Ĵ" - ], - [ - "Ġt", - "h" - ], - [ - "ËĪo", - "n" - ], - [ - "Ê", - "ĭ" - ], - [ - "ËĪÉij", - "ËIJ" - ], - [ - "ËĪÊĮ", - "h" - ], - [ - "w", - "ËĪa" - ], - [ - "ËĪe", - "i" - ], - [ - "l", - "l" - ], - [ - "ĠÉ", - "IJ" - ], - [ - "Éij", - "ËIJ" - ], - [ - "a", - "n" - ], - [ - "É", - "Ł" - ], - [ - "ĠÊ", - "ĭ" - ], - [ - "Ġk", - "o" - ], - [ - "k", - "h" - ], - [ - "ɪ", - "Åĭ" - ], - [ - "ËĪaËIJ", - "ɪ" - ], - [ - "Ġt", - "Êĥ" - ], - [ - "ËĪaËIJ", - "t" - ], - [ - "ĠËĮ", - "e" - ], - [ - "ĠtÉķ", - "h" - ], - [ - "ËĪu", - "o" - ], - [ - "ËĪon", - "É¡" - ], - [ - "É", - "ĸ" - ], - [ - "a", - "t" - ], - [ - "Ġk", - "e" - ], - [ - "É", - "Ĵ" - ], - [ - "ĠÉķ", - "ËĪi" - ], - [ - "Ã", - "¸" - ], - [ - "ĠÉ", - "ij" - ], - [ - "ËĪeËIJ", - "k" - ], - [ - "Å", - "ĵ" - ], - [ - "r", - "e" - ], - [ - "Ġ", - "ɾ" - ], - [ - "Ġk", 
- "ÉĶ" - ], - [ - "ËĮ", - "ÊĬ" - ], - [ - "s", - "k" - ], - [ - "Ġ", - "ÊĬ" - ], - [ - "Ġa", - "nd" - ], - [ - "ɪ", - "ç" - ], - [ - "Ġm", - "e" - ], - [ - "ËĪa", - "ɾ" - ], - [ - "Ġ", - "ËĪɪ" - ], - [ - "n", - "a" - ], - [ - "Ġ", - "β" - ], - [ - "Ġl", - "ËĪi" - ], - [ - "j", - "aËIJ" - ], - [ - "l", - "i" - ], - [ - "n", - "o" - ], - [ - "Ġɪ", - "n" - ], - [ - "Ġd", - "ËĮi" - ], - [ - "ĠÉ", - "²" - ], - [ - "t", - "ËIJ" - ], - [ - "ÉĻ", - "m" - ], - [ - "Ġl", - "ÉĻ" - ], - [ - "Ġð", - "ÉĻ" - ], - [ - "ɪ", - "k" - ], - [ - "ËĪÉĽ", - "l" - ], - [ - "Éľ", - "t" - ], - [ - "Ġs", - "e" - ], - [ - "e", - "s" - ], - [ - "ËĪo", - "u" - ], - [ - "ËĪa", - "ÊĬ" - ], - [ - "ĠÉ", - "Ķ" - ], - [ - "ɪ", - "t" - ], - [ - "Ġ", - "Åĭ" - ], - [ - "ËĪÉĽ", - "n" - ], - [ - "Ê", - "İ" - ], - [ - "Ġk", - "h" - ], - [ - "ËĪÉĽ", - "nt" - ], - [ - "ËĪaËIJ", - "ɾ" - ], - [ - "Ġk", - "i" - ], - [ - "m", - "p" - ], - [ - "l", - "t" - ], - [ - "É", - "£" - ], - [ - "Ġp", - "a" - ], - [ - "ËĪÉĻ", - "ËIJ" - ], - [ - "ɪ", - "s" - ], - [ - "ĠÉ", - "Ĵ" - ], - [ - "Ġl", - "e" - ], - [ - "ɪ", - "Éľ" - ], - [ - "ËĪÉĽ", - "t" - ], - [ - "Ġd", - "e" - ], - [ - "ĠÉ", - "¹" - ], - [ - "Ġt", - "ËĪoËIJ" - ], - [ - "Ġ", - "Êģ" - ], - [ - "Êĥ", - "ÉĻn" - ], - [ - "ĠÊĬ", - "nt" - ], - [ - "ËĪÉĶ", - "ɾ" - ], - [ - "ËĪa", - "ð" - ], - [ - "Ġa", - "ɪ" - ], - [ - "ĠÊ", - "IJ" - ], - [ - "Ġm", - "ËĪa" - ], - [ - "r", - "a" - ], - [ - "Ġk", - "ËĪɪ" - ], - [ - "k", - "t" - ], - [ - "ËIJ", - "p" - ], - [ - "ĠÊ", - "Ī" - ], - [ - "ËĪaËIJ", - "ÊĬ" - ], - [ - "Ġk", - "ËĪÊĮɾ" - ], - [ - "Ġ", - "ËĪÊĮ" - ], - [ - "ĠÉĴ", - "v" - ], - [ - "Ġe", - "l" - ], - [ - "k", - "s" - ], - [ - "Ġk", - "w" - ], - [ - "ÉĻ", - "t" - ], - [ - "nd", - "o" - ], - [ - "e", - "i" - ], - [ - "ĠËĮa", - "ËIJp" - ], - [ - "s", - "e" - ], - [ - "ÉĻ", - "ɹ" - ], - [ - "ËĪu", - "ei" - ], - [ - "ÉĻ", - "s" - ], - [ - "Ġk", - "ËĮo" - ], - [ - "ĠÊ", - "Ĥ" - ], - [ - "ĠËĮ", - "ÊĬ" - ], - [ - "Ġ", - "c" - ], - [ - "ĠÉĽ", - "n" - ], - [ - "ËĪa", - "nt" - 
], - [ - "θ", - "j" - ], - [ - "ËĮo", - "ËIJ" - ], - [ - "Ġ", - "ËĪaËIJ" - ], - [ - "Ġp", - "ɾ" - ], - [ - "s", - "i" - ], - [ - "Ġ", - "ËĪe" - ], - [ - "Ġj", - "uËIJ" - ], - [ - "Ġk", - "ËĮe" - ], - [ - "ËĮ", - "ɪ" - ], - [ - "ÉĶ", - "n" - ], - [ - "Ġs", - "ËĪÊĮ" - ], - [ - "Ġ", - "ËĪu" - ], - [ - "n", - "i" - ], - [ - "Ġs", - "t" - ], - [ - "Ġd", - "iËIJ" - ], - [ - "Ġk", - "eËIJ" - ], - [ - "ĠjËĪi", - "ou" - ], - [ - "ËĪai", - "Éľ" - ], - [ - "Ġd", - "ÊĴ" - ], - [ - "Ġ", - "ËĪÉĶ" - ], - [ - "v", - "a" - ], - [ - "ËIJ", - "ɾ" - ], - [ - "ËĪ", - "ø" - ], - [ - "ËĮÉĻ", - "ÊĬ" - ], - [ - "Ġp", - "ËĪu" - ], - [ - "Ġs", - "u" - ], - [ - "Ġm", - "a" - ], - [ - "Ġ", - "ÉĻ" - ], - [ - "d", - "ÊĴ" - ], - [ - "Ġp", - "ʰ" - ], - [ - "l", - "e" - ], - [ - "i", - "n" - ], - [ - "ĠtÉķh", - "ËĪi" - ], - [ - "Ġw", - "ËĪo" - ], - [ - "r", - "o" - ], - [ - "ËĮ", - "y" - ], - [ - "ɾ", - "a" - ], - [ - "Ġs", - "ËĪi" - ], - [ - "ð", - "ÉĻ" - ], - [ - "Ġs", - "eËIJ" - ], - [ - "l", - "a" - ], - [ - "ĠÊ", - "Ĵ" - ], - [ - "m", - "b" - ], - [ - "Ġh", - "ËĪoËIJ" - ], - [ - "Ġb", - "ʰ" - ], - [ - "ĠÉĽ", - "ɾ" - ], - [ - "Ġð", - "at" - ], - [ - "s", - "p" - ], - [ - "ÉĶ", - "ɾ" - ], - [ - "e", - "n" - ], - [ - "Ġs", - "ÉĻ" - ], - [ - "ËĪÉĶ", - "Éľ" - ], - [ - "Ġl", - "ËĮa" - ], - [ - "ĠËĮ", - "ÉĽ" - ], - [ - "Ġ", - "ËĪy" - ], - [ - "É¡", - "aËIJ" - ], - [ - "Ġd", - "ÉĽÉ¾" - ], - [ - "ËĪÉĽ", - "Êģ" - ], - [ - "Éľ", - "kh" - ], - [ - "ËĪi", - "ÉĻ" - ], - [ - "ËĪa", - "n" - ], - [ - "Ġm", - "ËĪo" - ], - [ - "ËĪa", - "β" - ], - [ - "Ġa", - "l" - ], - [ - "Ġ", - "ËĪeËIJ" - ], - [ - "Ġ", - "θ" - ], - [ - "Ġn", - "ËĪi" - ], - [ - "p", - "ʰ" - ], - [ - "ll", - "a" - ], - [ - "Ġp", - "l" - ], - [ - "ËĪ", - "Åĵ" - ], - [ - "j", - "ËĪÉiju" - ], - [ - "Ġa", - "v" - ], - [ - "Ġm", - "ËĪi" - ], - [ - "Ġf", - "ËĪa" - ], - [ - "ËĪÉ", - "ľ" - ], - [ - "m", - "e" - ], - [ - "ËĮÉĻ", - "h" - ], - [ - "ËĪu", - "ÉĻ" - ], - [ - "i", - "t" - ], - [ - "j", - "ËĪe" - ], - [ - "Ġ", - "o" - ], - [ - "ËĪÉľ", - "ËIJ" - 
], - [ - "ĠtÉķËĪi", - "ou" - ], - [ - "ÉĶ", - "ËIJ" - ], - [ - "Ġn", - "ÉĻ" - ], - [ - "ËĪÉĻ", - "Éľn" - ], - [ - "Ġm", - "ÉĻ" - ], - [ - "Ġd", - "eËIJ" - ], - [ - "m", - "o" - ], - [ - "s", - "a" - ], - [ - "j", - "ËĪÉĶ" - ], - [ - "ËĪa", - "l" - ], - [ - "ĠtÉķ", - "ËĪiÉĽ" - ], - [ - "ĠÉ¡", - "ÉĻ" - ], - [ - "ð", - "a" - ], - [ - "Ġɪ", - "z" - ], - [ - "Ġs", - "a" - ], - [ - "r", - "i" - ], - [ - "ĠËĮi", - "l" - ], - [ - "ËĮ", - "u" - ], - [ - "Ġk", - "aËIJ" - ], - [ - "ĠÉĻ", - "ËIJ" - ], - [ - "ĠÉ", - "ĸ" - ], - [ - "Ġk", - "a" - ], - [ - "ËĪÊĮh", - "i" - ], - [ - "Ġj", - "eËIJ" - ], - [ - "Ġt", - "ʰ" - ], - [ - "n", - "e" - ], - [ - "k", - "ËIJ" - ], - [ - "Ġts", - "ËĪai" - ], - [ - "Ġ", - "ËĪeËIJk" - ], - [ - "n", - "k" - ], - [ - "t", - "i" - ], - [ - "ËĪa", - "Éľn" - ], - [ - "Ġk", - "ËIJ" - ], - [ - "É¡", - "ÉĻn" - ], - [ - "ËĪi", - "a" - ], - [ - "ĠÉĶ", - "ËIJɾ" - ], - [ - "Ê", - "ı" - ], - [ - "ĠËĮ", - "ÊĮ" - ], - [ - "Ġz", - "ËĪaËIJ" - ], - [ - "Ġl", - "os" - ], - [ - "ÉĽ", - "s" - ], - [ - "ËĪÉĶ", - "n" - ], - [ - "ÉĽ", - "nt" - ], - [ - "ÉĽ", - "n" - ], - [ - "ĠÉŁ", - "ËĪoËIJ" - ], - [ - "ç", - "t" - ], - [ - "Ġd", - "as" - ], - [ - "Ġx", - "ËĮo" - ], - [ - "ËĪu", - "Éľ" - ], - [ - "ËĪa", - "s" - ], - [ - "Ġb", - "ËĪÊĮ" - ], - [ - "ËĪiÉĽ", - "Éľn" - ], - [ - "É", - "IJ" - ], - [ - "Ġts", - "uËIJ" - ], - [ - "Ġp", - "ËĮÉĽ" - ], - [ - "Ġn", - "ËĪÉĶ" - ], - [ - "ÊĬ", - "t" - ], - [ - "m", - "a" - ], - [ - "Ġn", - "ËĪo" - ], - [ - "Ġl", - "ËĪɪ" - ], - [ - "ËĪÉĽ", - "s" - ], - [ - "ɪ", - "l" - ], - [ - "ĠÉķ", - "ËĪiÉĽ" - ], - [ - "Ġ", - "ËĪÊĬ" - ], - [ - "ÉĴ", - "t" - ], - [ - "t", - "o" - ], - [ - "Ġ", - "ËĪo" - ], - [ - "ËĮo", - "n" - ], - [ - "Ġk", - "wËĪa" - ], - [ - "Ġɪ", - "t" - ], - [ - "Ġh", - "oËIJ" - ], - [ - "ËĪiËIJ", - "k" - ], - [ - "ĠËĮaËIJp", - "k" - ], - [ - "ËĪaɪ", - "n" - ], - [ - "Ã", - "¦" - ], - [ - "ÉĻn", - "t" - ], - [ - "t", - "a" - ], - [ - "l", - "o" - ], - [ - "Ġn", - "ËĪÉij" - ], - [ - "Ġl", - "ËĪa" - ], - [ - "ËĪi", - "Éľ" - ], - 
[ - "Ġw", - "ËĪei" - ], - [ - "ÉĽ", - "Êģ" - ], - [ - "Ġt", - "ËĪa" - ], - [ - "Ġɾ", - "ËĮÉĻh" - ], - [ - "ĠÉķËĪi", - "Éij" - ], - [ - "ËĮi", - "ËIJ" - ], - [ - "ËĮÉĽ", - "l" - ], - [ - "ĠtÉĻ", - "Éľ" - ], - [ - "Ġk", - "ËĪuo" - ], - [ - "Ġt", - "ËĪu" - ], - [ - "j", - "ËĪÉĽ" - ], - [ - "ĠËĮi", - "n" - ], - [ - "ɾ", - "e" - ], - [ - "Ġk", - "oËIJ" - ], - [ - "Ġk", - "ËĪa" - ], - [ - "ɾ", - "i" - ], - [ - "ĠtÉķËĪi", - "Éij" - ], - [ - "l", - "ÉĻ" - ], - [ - "Ġk", - "ÉĻ" - ], - [ - "Ġt", - "ËĪi" - ], - [ - "ĠÅĭ", - "ËĪyÉĻ" - ], - [ - "Ġts", - "h" - ], - [ - "e", - "r" - ], - [ - "a", - "v" - ], - [ - "ĠkÉĶ", - "n" - ], - [ - "ËĪÉĻ", - "ÉľÅĭ" - ], - [ - "ð", - "o" - ], - [ - "ËĪaËIJ", - "n" - ], - [ - "Ġbʰ", - "ËĪi" - ], - [ - "ĠkËIJ", - "jaËIJ" - ], - [ - "ÉĻ", - "z" - ], - [ - "Ġp", - "Êģ" - ], - [ - "Ġd", - "ËĪɪ" - ], - [ - "Ġz", - "iËIJ" - ], - [ - "É¡", - "eËIJ" - ], - [ - "Ġt", - "ËĪÉĻ" - ], - [ - "ɪ", - "z" - ], - [ - "Ġn", - "ËĮon" - ], - [ - "t", - "aËIJ" - ], - [ - "b", - "l" - ], - [ - "t", - "e" - ], - [ - "n", - "ËĮeËIJ" - ], - [ - "ËĪɪ", - "l" - ], - [ - "s", - "o" - ], - [ - "k", - "o" - ], - [ - "u", - "Êģ" - ], - [ - "ĠÉ", - "£" - ], - [ - "Ġpa", - "Êģ" - ], - [ - "Ġ", - "ËĪÉĽ" - ], - [ - "j", - "ËĪuËIJ" - ], - [ - "ËĮ", - "ÊĮ" - ], - [ - "y", - "n" - ], - [ - "ËĪiËIJ", - "n" - ], - [ - "Ġl", - "ËĪaɪ" - ], - [ - "ËĪɪ", - "Åĭ" - ], - [ - "ĠtÉķh", - "ËĪy" - ], - [ - "Ġn", - "ËĪÊĮhi" - ], - [ - "Ġd", - "ËĮe" - ], - [ - "Ġj", - "ËĪÉiju" - ], - [ - "Ġt", - "ËĪÉiju" - ], - [ - "Ġh", - "ËĪo" - ], - [ - "ɪ", - "d" - ], - [ - "Ġth", - "ËĪÉij" - ], - [ - "m", - "ËĪe" - ], - [ - "Ġ", - "ËĪÉĻ" - ], - [ - "j", - "a" - ], - [ - "Ġp", - "h" - ], - [ - "ÉĽ", - "t" - ], - [ - "Ġk", - "ËĪÊĮ" - ], - [ - "t", - "ÉĻn" - ], - [ - "m", - "ËĪÉij" - ], - [ - "w", - "ËĪe" - ], - [ - "ĠËĮa", - "ɪn" - ], - [ - "Ġð", - "ɪs" - ], - [ - "É¡", - "ÉĻ" - ], - [ - "Ġn", - "ËĪaËIJ" - ], - [ - "Ġb", - "ËĪaËIJ" - ], - [ - "Ġa", - "θ" - ], - [ - "Ġm", - "ËĮa" - ], - [ - "ËĪÊĮh", - "a" - ], 
- [ - "Ġd", - "ËĮa" - ], - [ - "ËĪ", - "Êı" - ], - [ - "Ġɲ", - "ËĮy" - ], - [ - "Ġp", - "ËĪa" - ], - [ - "ËĪað", - "o" - ], - [ - "d", - "i" - ], - [ - "b", - "Éľ" - ], - [ - "É", - "³" - ], - [ - "Ġw", - "iËIJ" - ], - [ - "Ġn", - "ËĪɪ" - ], - [ - "ĠÉ¡", - "ËĪÉĶÉľ" - ], - [ - "tËIJ", - "o" - ], - [ - "ËĮÉĻ", - "m" - ], - [ - "ËĪaËIJ", - "r" - ], - [ - "Ġm", - "ÉĽ" - ], - [ - "ËĪeËIJ", - "É¡aËIJ" - ], - [ - "Ġs", - "ËĮi" - ], - [ - "Ġl", - "ËĮaËIJ" - ], - [ - "n", - "ËĮaËIJ" - ], - [ - "Ġs", - "p" - ], - [ - "t", - "Êģ" - ], - [ - "ĠÊ", - "İ" - ], - [ - "ËĮ", - "ÉijËIJ" - ], - [ - "Ġk", - "l" - ], - [ - "k", - "ʰ" - ], - [ - "i", - "l" - ], - [ - "ĠÊĥ", - "t" - ], - [ - "ĠËĮÊĬ", - "n" - ], - [ - "a", - "l" - ], - [ - "Ġs", - "ËĪÉĽ" - ], - [ - "Ġm", - "ËĪaËIJ" - ], - [ - "Ġ", - "Åĵ" - ], - [ - "ĠÉ¡", - "ËĪÊĮ" - ], - [ - "ĠpËĮÉĽ", - "r" - ], - [ - "ɾ", - "ËĪa" - ], - [ - "ËIJ", - "ÊĪ" - ], - [ - "ËĪaβ", - "a" - ], - [ - "Ġw", - "ËĪÉĴ" - ], - [ - "Ġx", - "ËĪuei" - ], - [ - "Ġkh", - "ËĪo" - ], - [ - "Ġla", - "s" - ], - [ - "ĠÉĹ", - "ËĪo" - ], - [ - "Ġf", - "ÉĽÉ¾" - ], - [ - "Ġj", - "ËĪiÉĽ" - ], - [ - "Ġt", - "ËĪe" - ], - [ - "Ġk", - "ËĮÉĶ" - ], - [ - "ĠdeËIJ", - "n" - ], - [ - "Ġm", - "o" - ], - [ - "Ġp", - "ËĪi" - ], - [ - "Ġt", - "ËĪÉij" - ], - [ - "ËĪÉĽ", - "st" - ], - [ - "w", - "ËĪÉij" - ], - [ - "ËĪaɪ", - "t" - ], - [ - "ÉĻ", - "ÊĬ" - ], - [ - "Ġ", - "ËĪi" - ], - [ - "ɪ", - "j" - ], - [ - "a", - "ɪ" - ], - [ - "ËĪaËIJ", - "Éľ" - ], - [ - "ĠËĪɪ", - "s" - ], - [ - "Ġp", - "ÉĶɾ" - ], - [ - "æ", - "Éľn" - ], - [ - "k", - "a" - ], - [ - "Åĭ", - "É¡" - ], - [ - "b", - "ÉĻn" - ], - [ - "ÊĬ", - "f" - ], - [ - "Ġp", - "ɹ" - ], - [ - "Ġl", - "ËĮe" - ], - [ - "ËĪiËIJ", - "d" - ], - [ - "ËĪaËIJ", - "re" - ], - [ - "Ġm", - "ËĪÊĮ" - ], - [ - "ÉĻ", - "r" - ], - [ - "Ġd", - "Éij" - ], - [ - "ËĪaËIJt", - "o" - ], - [ - "Ġp", - "ËĪeËIJ" - ], - [ - "Ġd", - "ËĪoËIJ" - ], - [ - "Ġs", - "ËĮÊĬ" - ], - [ - "Ġh", - "ËĪi" - ], - [ - "Ġs", - "ËĪa" - ], - [ - "ËĪeËIJ", - "n" - ], - [ - "d", 
- "ÉĻ" - ], - [ - "Ġp", - "j" - ], - [ - "ËĪÅĵ", - "Êģ" - ], - [ - "l", - "ɪç" - ], - [ - "ÉĴ", - "n" - ], - [ - "ĠËĪÉĻ", - "r" - ], - [ - "t", - "ËĪe" - ], - [ - "Ġi", - "l" - ], - [ - "ËĪaËIJ", - "l" - ], - [ - "Ġs", - "ËĮÉĻÊĬ" - ], - [ - "s", - "ÊĪ" - ], - [ - "Ġd", - "ËĪuËIJ" - ], - [ - "h", - "ËĪÉij" - ], - [ - "Ġx", - "ËĪou" - ], - [ - "Ġl", - "ËĪaiÉľ" - ], - [ - "w", - "ËĪo" - ], - [ - "ËĪÉĽnt", - "e" - ], - [ - "Ġs", - "y" - ], - [ - "Ġz", - "ɪç" - ], - [ - "ĠÉ¡", - "ËĪu" - ], - [ - "ĠÉķ", - "ËĪy" - ], - [ - "ËĪÉĶËIJ", - "l" - ], - [ - "ÉĶ", - "l" - ], - [ - "Ġt", - "ËĪo" - ], - [ - "ĠÊĭ", - "oËIJ" - ], - [ - "Ġ", - "iËIJ" - ], - [ - "wËĪa", - "ða" - ], - [ - "ËĪa", - "ndo" - ], - [ - "Ġaθ", - "ÉĽnt" - ], - [ - "Ġaθɼnt", - "wËĪaða" - ], - [ - "Ġt", - "ËĪiÉĽ" - ], - [ - "ËĪei", - "Éľ" - ], - [ - "Ġp", - "ËĮa" - ], - [ - "Ġn", - "ËĪaɪ" - ], - [ - "w", - "a" - ], - [ - "Ġf", - "r" - ], - [ - "ĠÊIJ", - "ËĪÉĻÉľn" - ], - [ - "ËĪu", - "a" - ], - [ - "m", - "i" - ], - [ - "Ġm", - "ËĪÉĽ" - ], - [ - "ËĪeËIJk", - "ʰ" - ], - [ - "c", - "ʰ" - ], - [ - "Ġw", - "ËĪÉij" - ], - [ - "st", - "a" - ], - [ - "Ġt", - "u" - ], - [ - "Ġs", - "k" - ], - [ - "ËĪÉĶ", - "l" - ], - [ - "ËĪeËIJ", - "ÊĪ" - ], - [ - "Ġl", - "ËĪaËIJɪ" - ], - [ - "Ġl", - "ËĪaËIJ" - ], - [ - "ËĪÉĽËIJ", - "s" - ], - [ - "ËĪÉĽÉ¾", - "a" - ], - [ - "ËĪÉĻ", - "Éľt" - ], - [ - "Ġ", - "yn" - ], - [ - "d", - "ÉĻn" - ], - [ - "Ġd", - "i" - ], - [ - "ËĪiËIJ", - "s" - ], - [ - "Ġðe", - "l" - ], - [ - "ËĪÊĮ", - "r" - ], - [ - "Ġh", - "ËĪaËIJ" - ], - [ - "Ġb", - "ÉĻ" - ], - [ - "Ġj", - "ËĪuËIJ" - ], - [ - "ll", - "e" - ], - [ - "st", - "o" - ], - [ - "ËĪɪ", - "t" - ], - [ - "ËĪoËIJ", - "ɾ" - ], - [ - "b", - "ʰ" - ], - [ - "m", - "ÉĻn" - ], - [ - "ËĮu", - "ÉĻ" - ], - [ - "ËĮÉĻ", - "ɾ" - ], - [ - "ËĪÊĮ", - "n" - ], - [ - "ĠlËĪaɪ", - "k" - ], - [ - "Ġb", - "ËĪa" - ], - [ - "ɪ", - "ð" - ], - [ - "Ġl", - "o" - ], - [ - "z", - "i" - ], - [ - "ËĪÊĮ", - "st" - ], - [ - "m", - "ËĪi" - ], - [ - "ÉĶ", - "Êģ" - ], - [ - "ĠnËĪɪ", - 
"çt" - ], - [ - "Ġt", - "ɾ" - ], - [ - "Ġd", - "ËĪeËIJkʰ" - ], - [ - "Ġs", - "ËĮe" - ], - [ - "Ġn", - "ËĪÉĻÊĬ" - ], - [ - "Ġ", - "u" - ], - [ - "Ġs", - "i" - ], - [ - "Ġɪ", - "ç" - ], - [ - "Ġp", - "r" - ], - [ - "ĠtÉķ", - "ËĪy" - ], - [ - "Ġm", - "ËĪu" - ], - [ - "z", - "a" - ], - [ - "Ġt", - "Êģ" - ], - [ - "Ġw", - "ɪð" - ], - [ - "t", - "ËĪÉĽ" - ], - [ - "Ġp", - "ËĪÊĮɾ" - ], - [ - "Ġk", - "ËĪÉĶ" - ], - [ - "ËĪoËIJ", - "r" - ], - [ - "Ġh", - "ËĮa" - ], - [ - "Ġk", - "ËĪonÉ¡" - ], - [ - "Ġp", - "uÊģ" - ], - [ - "Ġd", - "y" - ], - [ - "ËĪɪ", - "n" - ], - [ - "nt", - "e" - ], - [ - "Ġk", - "ËĮa" - ], - [ - "ËĪÉĻ", - "ɪ" - ], - [ - "Ġm", - "i" - ], - [ - "ĠÉ¡", - "ËĮuÉĻ" - ], - [ - "ĠÊ", - "²" - ], - [ - "Ġf", - "ËĪÉij" - ], - [ - "Ġv", - "ÉijËIJ" - ], - [ - "ĠËĮa", - "ÊĬ" - ], - [ - "ËĮ", - "uËIJ" - ], - [ - "ĠËĪu", - "n" - ], - [ - "Ġj", - "ËĪÊĮha" - ], - [ - "j", - "uËIJ" - ], - [ - "Ġm", - "ɪt" - ], - [ - "Ġl", - "ËĪÉĽ" - ], - [ - "ËĪeËIJ", - "Êĥ" - ], - [ - "Ġf", - "ÉĶËIJ" - ], - [ - "m", - "ÉĻ" - ], - [ - "ɾ", - "t" - ], - [ - "ĠkËĮo", - "n" - ], - [ - "Ġl", - "ËĪÉĶ" - ], - [ - "Ġx", - "ËĪÉiju" - ], - [ - "p", - "l" - ], - [ - "Ġd", - "ËĪi" - ], - [ - "Ġl", - "ËĪoËIJ" - ], - [ - "s", - "ÉĻ" - ], - [ - "ËĪaËIJ", - "va" - ], - [ - "Ġl", - "ËĪu" - ], - [ - "ĠÉ¡", - "ËĮÉĻÊĬ" - ], - [ - "Ġh", - "av" - ], - [ - "ĠËĮaËIJpk", - "ËĮoËIJ" - ], - [ - "ɾ", - "ËĪi" - ], - [ - "Ġf", - "ËĪÉĻ" - ], - [ - "Ġh", - "ËĮÉĻm" - ], - [ - "ËĪonÉ¡", - "Éľ" - ], - [ - "j", - "o" - ], - [ - "Ġs", - "ÉĶ" - ], - [ - "ËĪaËIJ", - "d" - ], - [ - "w", - "ËĪiÉĻ" - ], - [ - "ËĪa", - "nd" - ], - [ - "ËĮa", - "ɪn" - ], - [ - "t", - "ɾ" - ], - [ - "ĠËĮ", - "ɪ" - ], - [ - "ĠËĪu", - "na" - ], - [ - "Ġx", - "wËĪÉij" - ], - [ - "Ġj", - "ÉĶËIJ" - ], - [ - "Êģ", - "ËĪi" - ], - [ - "ĠkËĪuo", - "Éľ" - ], - [ - "Ġa", - "β" - ], - [ - "ĠÉ¡", - "ËĪaËIJ" - ], - [ - "an", - "o" - ], - [ - "t", - "ÉĻl" - ], - [ - "Ġr", - "ËĮe" - ], - [ - "ËĮÊĮ", - "t" - ], - [ - "ĠjËĪi", - "Éij" - ], - [ - "ĠɾËĮÉĻh", - "aËIJ" - 
], - [ - "Ġm", - "ËĪe" - ], - [ - "ĠËĪy", - "Ã¦Éľn" - ], - [ - "Ġf", - "ËĪu" - ], - [ - "Ġb", - "l" - ], - [ - "n", - "ËĪi" - ], - [ - "s", - "ÉĻn" - ], - [ - "Ġa", - "ɪn" - ], - [ - "ËĪi", - "ÊĬ" - ], - [ - "Ġðe", - "ɪ" - ], - [ - "Ġɪ", - "ts" - ], - [ - "Ġ", - "(" - ], - [ - "ËĪy", - "ËIJ" - ], - [ - "ÉĻ", - "d" - ], - [ - "ĠËĮ", - "o" - ], - [ - "ĠÉĽ", - "s" - ], - [ - "Ġv", - "iËIJ" - ], - [ - "ËIJ", - "É¡eËIJ" - ], - [ - "k", - "ËĪe" - ], - [ - "ĠËĪa", - "l" - ], - [ - "ÉĽ", - "l" - ], - [ - "Ġ", - "ÊĮ" - ], - [ - "ËIJ", - "o" - ], - [ - "Ġk", - "ËĪo" - ], - [ - "ĠÊĪ", - "ËĪuËIJ" - ], - [ - "Ġs", - "ËĪɪ" - ], - [ - "ËĪeËIJ", - "ɾ" - ], - [ - "Éľ", - "m" - ], - [ - "ËĮ", - "ÉĻn" - ], - [ - "ËĪaËIJ", - "i" - ], - [ - "ËĪoËIJ", - "l" - ], - [ - "ɪ", - "ËĮeËIJ" - ], - [ - "Ġʲ", - "ËĪy" - ], - [ - "Ġk", - "ËĪÉĶËIJ" - ], - [ - "s", - "ËĪi" - ], - [ - "Ġl", - "ËĪe" - ], - [ - "ËĮ", - "ÉĴt" - ], - [ - "ËĪiËIJ", - "p" - ], - [ - "a", - "Êģ" - ], - [ - "Ġθ", - "ËĪɪÅĭ" - ], - [ - "ËĪÉĻËIJ", - "ɪ" - ], - [ - "ËĪÊĮ", - "l" - ], - [ - "ĠhËĪoËIJ", - "taËIJ" - ], - [ - "ËĪo", - "ɪ" - ], - [ - "nt", - "o" - ], - [ - "z", - "h" - ], - [ - "ĠdeËIJ", - "m" - ], - [ - "ĠkÉĶ", - "m" - ], - [ - "ʰ", - "ËĪiËIJk" - ], - [ - "ĠdÊĴ", - "ËĪÊĮst" - ], - [ - "p", - "ɾ" - ], - [ - "Ġl", - "y" - ], - [ - "h", - "ËĪu" - ], - [ - "ËĪÉĶ", - "ø" - ], - [ - "ËĪaËIJ", - "s" - ], - [ - "ĠËĪa", - "n" - ], - [ - "Ġ", - "ËĪÉĴ" - ], - [ - "Ġk", - "an" - ], - [ - "Ġts", - "ËĪuo" - ], - [ - "ËĪeËIJ", - "va" - ], - [ - "ĠÉ¡", - "ɾ" - ], - [ - "Ġp", - "o" - ], - [ - "ĠtÊĥ", - "ËĪÉĶ" - ], - [ - "Êİ", - "a" - ], - [ - "Ġm", - "ËĮi" - ], - [ - "Êĥ", - "t" - ], - [ - "t", - "ËĪi" - ], - [ - "Ġh", - "ËĪÊĮ" - ], - [ - "tÊĥ", - "e" - ], - [ - "Ġf", - "ÉĶn" - ], - [ - "v", - "e" - ], - [ - "Ġn", - "ËĮe" - ], - [ - "ËĪÉĶ", - "Êģ" - ], - [ - "i", - "z" - ], - [ - "Ġs", - "ËĪuo" - ], - [ - "ËĪÉĽËIJ", - "r" - ], - [ - "wËĪa", - "Êģ" - ], - [ - "ËĪað", - "a" - ], - [ - "Åĭ", - "k" - ], - [ - "p", - "o" - ], - [ - "Ġk", 
- "ËĪi" - ], - [ - "ËĪa", - "d" - ], - [ - "Ġv", - "ËĪi" - ], - [ - "t", - "Éķ" - ], - [ - "Ġk", - "ËĪÉĻ" - ], - [ - "Ġw", - "ËĪu" - ], - [ - "ÉĴ", - "z" - ], - [ - "ĠvÉijËIJ", - "ɾ" - ], - [ - "Êģ", - "ËĪÉĽ" - ], - [ - "Ġk", - "ËĪaËIJ" - ], - [ - "k", - "e" - ], - [ - "n", - "ÉĻ" - ], - [ - "ËĪÊĮ", - "b" - ], - [ - "ËĪuËIJ", - "ɾ" - ], - [ - "ËĮÉĻ", - "ËIJ" - ], - [ - "ĠÊĪ", - "ʰËĪiËIJk" - ], - [ - "Ġk", - "ËĪu" - ], - [ - "Ġb", - "ËĮÊĮt" - ], - [ - "Ġa", - "t" - ], - [ - "Ġf", - "ɹ" - ], - [ - "ËĪa", - "x" - ], - [ - "Ġz", - "oËIJ" - ], - [ - "Ġt", - "ËĪaËIJ" - ], - [ - "Ġð", - "ËĮe" - ], - [ - "n", - "eËIJ" - ], - [ - "ĠÉij", - "ËIJ" - ], - [ - "Ġa", - "ÊĬf" - ], - [ - "a", - "m" - ], - [ - "ÊĬ", - "Åĭ" - ], - [ - "ĠÉĶ", - "ËIJ" - ], - [ - "ĠÉķËĪi", - "ÉľÅĭ" - ], - [ - "Ġ", - "ËĪÉĶËIJl" - ], - [ - "ɪ", - "m" - ], - [ - "j", - "ËĪo" - ], - [ - "ËĪiËIJ", - "ÉŁ" - ], - [ - "Ġkw", - "ËĮÉĽ" - ], - [ - "ĠmËĪa", - "s" - ], - [ - "ÉĻ", - "h" - ], - [ - "ĠËĪa", - "ÊĬ" - ], - [ - "ËĪÉĶ", - "ɪ" - ], - [ - "É¡", - "ÉĻɾ" - ], - [ - "r", - "ÉĻn" - ], - [ - "ËĪɪ", - "k" - ], - [ - "s", - "se" - ], - [ - "Ġp", - "ËĪÉij" - ], - [ - "ĠÉĹ", - "ËĮe" - ], - [ - "ĠÉĹ", - "ËĪi" - ], - [ - "Ġa", - "z" - ], - [ - "ĠÉ¡ËĪÊĮ", - "jaËIJ" - ], - [ - "z", - "e" - ], - [ - "ĠÉĹ", - "ËĮaËIJ" - ], - [ - "Ġf", - "ËĪi" - ], - [ - "ĠËĮ", - "ÉĴn" - ], - [ - "Ġx", - "ËĪo" - ], - [ - "ĠËĮÊĬ", - "na" - ], - [ - "Ġtʰ", - "aËIJ" - ], - [ - "Ġs", - "Éij" - ], - [ - "ËĪeɪ", - "ÊĥÉĻn" - ], - [ - "ĠtÉķËĪi", - "Éľ" - ], - [ - "ĠÉŁ", - "aËIJ" - ], - [ - "p", - "ËIJ" - ], - [ - "Ġpl", - "y" - ], - [ - "θ", - "ËĪi" - ], - [ - "ËIJ", - "Éĸ" - ], - [ - "Ġt", - "ËĪuei" - ], - [ - "Ġl", - "ËĪÉĻ" - ], - [ - "Ġd", - "ÉijËIJ" - ], - [ - "f", - "t" - ], - [ - "ËĪa", - "m" - ], - [ - "ĠsËĪÊĮ", - "kt" - ], - [ - "Ġt", - "ËĪou" - ], - [ - "Ġp", - "ËĪiÉĽ" - ], - [ - "ĠËĪa", - "i" - ], - [ - "ĠwËĪÉĴ", - "n" - ], - [ - "Ġz", - "ËĮaɪn" - ], - [ - "Ġe", - "st" - ], - [ - "Ġm", - "ÉĶ" - ], - [ - "ĠtÉķ", - "jËĪÉiju" - ], - [ - 
"Éľ", - "p" - ], - [ - "ËĪÊĮ", - "z" - ], - [ - "b", - "i" - ], - [ - "ËĪÉĽËIJs", - "eËIJ" - ], - [ - "Ġl", - "ËĪy" - ], - [ - "Ġm", - "ËĮe" - ], - [ - "Ġd", - "ËĮÉĽl" - ], - [ - "ËĪiËIJ", - "l" - ], - [ - "ĠkËĮo", - "mo" - ], - [ - "Ġh", - "ËĪaÉľn" - ], - [ - "ËĪoËIJ", - "ne" - ], - [ - "ĠkËĪÊĮɾ", - "t" - ], - [ - "Ġsy", - "Êģ" - ], - [ - "ËĮÉĶ", - "ɾ" - ], - [ - "Ġɪ", - "f" - ], - [ - "u", - "v" - ], - [ - "z", - "ÉĻn" - ], - [ - "o", - "l" - ], - [ - "Ï", - "ĩ" - ], - [ - "i", - "m" - ], - [ - "Ġm", - "ËĪiÉĽ" - ], - [ - "Ġð", - "ɪ" - ], - [ - "Ġv", - "ËĪÉĽ" - ], - [ - "ÊĬ", - "d" - ], - [ - "Ġt", - "r" - ], - [ - "ËĪeËIJ", - "s" - ], - [ - "ð", - "e" - ], - [ - "d", - "e" - ], - [ - "ʰ", - "Ïĩ" - ], - [ - "ÉŁ", - "ʰ" - ], - [ - "ËĮÉĻËIJ", - "ÉªÉľ" - ], - [ - "b", - "ËIJ" - ], - [ - "ËĪÊĬ", - "k" - ], - [ - "ĠnËĪÉĶ", - "ÉªÉľ" - ], - [ - "ĠËĮ", - "iËIJ" - ], - [ - "ËĪÉijËIJ", - "t" - ], - [ - "ËĪiËIJ", - "ɾ" - ], - [ - "Ġt", - "ɹ" - ], - [ - "ɾ", - "ÉĶ" - ], - [ - "Ġw", - "ÉĴz" - ], - [ - "Ġv", - "u" - ], - [ - "b", - "ÉĻl" - ], - [ - "b", - "ÉĻ" - ], - [ - "ɹ", - "i" - ], - [ - "nt", - "s" - ], - [ - "Ġs", - "ËĪaËIJ" - ], - [ - "d", - "ʰ" - ], - [ - "Ġt", - "ÊĬ" - ], - [ - "ĠÊİ", - "ËĮi" - ], - [ - "β", - "a" - ], - [ - "h", - "ËĪÉĻÉľÅĭ" - ], - [ - "Ġs", - "ËĪiËIJ" - ], - [ - "ĠpËĮa", - "ɾa" - ], - [ - "ËĪÉĽÉ¾", - "ÉĶ" - ], - [ - "ËĪɪ", - "s" - ], - [ - "É£", - "o" - ], - [ - "ĠËĮa", - "l" - ], - [ - "o", - "r" - ], - [ - "Ġb", - "ËĪÊĮh" - ], - [ - "Ġk", - "ËĪoËIJ" - ], - [ - "Ġt", - "ËĪÉĽ" - ], - [ - "Ġp", - "ËĪo" - ], - [ - "ĠÊĴ", - "ÉĻ" - ], - [ - "p", - "Êģ" - ], - [ - "Ġ", - "ËĪaɪ" - ], - [ - "hËĪÉij", - "ÉľÅĭ" - ], - [ - "ÉĻl", - "i" - ], - [ - "ËĪeɪ", - "t" - ], - [ - "ĠjËĪiou", - "Éľ" - ], - [ - "Ġd", - "ËĪÉĻ" - ], - [ - "Ġm", - "ËĪÉĶËIJ" - ], - [ - "l", - "ËĪi" - ], - [ - "ËĮy", - "ÉĻ" - ], - [ - "ĠlËĪoËIJ", - "É¡" - ], - [ - "Ġn", - "ËĪÊĮ" - ], - [ - "Ġh", - "ËĪÊĬ" - ], - [ - "Ġn", - "ËĪÉĻÉľÅĭ" - ], - [ - "ĠÊģ", - "ÉĻ" - ], - [ - "z", - "ËĪi" - ], - [ - 
"Ġt", - "ËĪuËIJ" - ], - [ - "ĠkËĮo", - "me" - ], - [ - "Ġl", - "ËĪeËIJ" - ], - [ - "ËĪaËIJt", - "aËIJ" - ], - [ - "Ġa", - "n" - ], - [ - "ĠËĪy", - "u" - ], - [ - "ĠËĮÊĮ", - "É¡ÉĻɾ" - ], - [ - "ĠËĪɪ", - "n" - ], - [ - "ĠhËĪo", - "ÉĻ" - ], - [ - "v", - "ÉĻ" - ], - [ - "ËĪø", - "ËIJ" - ], - [ - "θj", - "a" - ], - [ - "ËĪuÉĻ", - "Éľn" - ], - [ - "Ġk", - "ÉĻɾ" - ], - [ - "ËĪa", - "t" - ], - [ - "j", - "ËĪø" - ], - [ - "ËĪÉĽt", - "Êģ" - ], - [ - "Ġp", - "ËĪÉiju" - ], - [ - "st", - "ÉĻ" - ], - [ - "Ġw", - "ÉĴt" - ], - [ - "ËĪeËIJ", - "l" - ], - [ - "ÊĪ", - "i" - ], - [ - "Ġx", - "ËĪaiÉľ" - ], - [ - "ËĪy", - "Êģ" - ], - [ - "ĠhËĪoËIJ", - "É¡aËIJ" - ], - [ - "Ġts", - "ËĪi" - ], - [ - "ĠËĪÊĮ", - "p" - ], - [ - "Ġn", - "ËĮÉĴt" - ], - [ - "ĠlËĪɪ", - "eËIJ" - ], - [ - "Ġh", - "ËĪa" - ], - [ - "Ġf", - "l" - ], - [ - "Ġn", - "ËĪeËIJ" - ], - [ - "ËĮaËIJ", - "ɪ" - ], - [ - "Ġt", - "ËĪuo" - ], - [ - "tÊĥ", - "ËIJ" - ], - [ - "s", - "ËĪe" - ], - [ - "bʰ", - "i" - ], - [ - "ĠbËĪÊĮh", - "ÊĬt" - ], - [ - "ËĪÉĽ", - "nd" - ], - [ - "Ġs", - "ËĪÉĶ" - ], - [ - "ÉĻn", - "s" - ], - [ - "ËĮÉĻ", - "l" - ], - [ - "ÉĽ", - "Éľ" - ], - [ - "ĠÉ¡", - "l" - ], - [ - "ËĪɪ", - "ɾ" - ], - [ - "ËĪaËIJt", - "a" - ], - [ - "Éľ", - "ËIJ" - ], - [ - "ËĪÉĽnt", - "o" - ], - [ - "sk", - "ËĮoËIJ" - ], - [ - "ËĪÉĽ", - "k" - ], - [ - "ts", - "i" - ], - [ - "Ġt", - "ËĪonÉ¡" - ], - [ - "Ġb", - "iËIJ" - ], - [ - "Ġh", - "ËĪaËIJɪ" - ], - [ - "Ġb", - "ËĪi" - ], - [ - "j", - "j" - ], - [ - "Êİ", - "i" - ], - [ - "Ġk", - "ʰ" - ], - [ - "Ġs", - "ËĪo" - ], - [ - "ll", - "o" - ], - [ - "Ġb", - "aɪ" - ], - [ - "ĠÉĽ", - "nt" - ], - [ - "Ġ", - "ËĪiËIJ" - ], - [ - "ĠÉ¡", - "ËĪo" - ], - [ - "ɾ", - "eËIJ" - ], - [ - "Ġk", - "Êĭ" - ], - [ - "Ġm", - "ËĪeiÉľ" - ], - [ - "ÊĬ", - "ËĪÉĶËIJ" - ], - [ - "Ġt", - "ËĪaɪ" - ], - [ - "Ġsu", - "s" - ], - [ - "Ġr", - "i" - ], - [ - "Ġv", - "ËĮÉĽ" - ], - [ - "ËĪiËIJ", - "no" - ], - [ - "v", - "ano" - ], - [ - "ĠdËĮi", - "ËIJ" - ], - [ - "ĠÊIJ", - "ËĪaÉľn" - ], - [ - "Ê", - "Ĥ" - ], - [ - "ĠÉIJ", - 
"b" - ], - [ - "ËĪaËIJ", - "h" - ], - [ - "ɪ", - "Êĥ" - ], - [ - "ĠdËĮe", - "lla" - ], - [ - "tËIJ", - "i" - ], - [ - "ĠËĪÊĬ", - "n" - ], - [ - "Ġh", - "iËIJ" - ], - [ - "Ġb", - "ËĪaËIJt" - ], - [ - "Ġth", - "ËĪi" - ], - [ - "Ġa", - "m" - ], - [ - "Ġ", - "ËĪoËIJ" - ], - [ - "Ġh", - "u" - ], - [ - "Ġk", - "ËĪÊĮh" - ], - [ - "Ġz", - "ËĪÉijËIJ" - ], - [ - "ĠÉ¡", - "ËĮÉĶ" - ], - [ - "Ġ", - "ËĪÉĻÊĬ" - ], - [ - "y", - "ËĪi" - ], - [ - "Ġl", - "ËĪÊĮ" - ], - [ - "Ġd", - "ËĪeËIJ" - ], - [ - "Ġs", - "ËĪÉĶËIJ" - ], - [ - "sk", - "ËĮeËIJ" - ], - [ - "ɾ", - "o" - ], - [ - "Êģ", - "ËĪÉij" - ], - [ - "t", - "ËĪa" - ], - [ - "Ġk", - "ËĪÊĬ" - ], - [ - "ËĪant", - "e" - ], - [ - "Ġd", - "ÉĶ" - ], - [ - "Ġs", - "ËĪeɪ" - ], - [ - "Ġs", - "ÉĽt" - ], - [ - "ɹ", - "ɪ" - ], - [ - "ĠÉ¡ËĮÉĻÊĬ", - "ɪÅĭ" - ], - [ - "z", - "o" - ], - [ - "Ġj", - "ËĪaËIJ" - ], - [ - "ĠÉĴv", - "ðÉĻ" - ], - [ - "ĠÊ", - "Ŀ" - ], - [ - "ĠÉĽ", - "l" - ], - [ - "Ġs", - "ËĪoËIJ" - ], - [ - "Ġth", - "ËĪiÉľ" - ], - [ - "Ġ", - "ËĪÉĽl" - ], - [ - "Ġly", - "ËĮi" - ], - [ - "nd", - "ÊĴ" - ], - [ - "ĠÉķ", - "jËĪÉiju" - ], - [ - "θ", - "a" - ], - [ - "ĠɾËĮÉĻh", - "eËIJ" - ], - [ - "Ġma", - "ɪ" - ], - [ - "j", - "ÉĻ" - ], - [ - "ĠËĪÊĮ", - "b" - ], - [ - "as", - "jËĪÉĶ" - ], - [ - "d", - "Êģ" - ], - [ - "Ġkh", - "ËĪa" - ], - [ - "ĠËĪe", - "s" - ], - [ - "v", - "i" - ], - [ - "f", - "i" - ], - [ - "ËĮÉĻ", - "b" - ], - [ - "Ġr", - "e" - ], - [ - "Ġav", - "ËĮÉĽ" - ], - [ - "Ġt", - "ËĮi" - ], - [ - "Ġk", - "ɾ" - ], - [ - "Ġb", - "ɪk" - ], - [ - "st", - "e" - ], - [ - "ËĪeËIJÊĥ", - "c" - ], - [ - "p", - "t" - ], - [ - "z", - "ÉĻ" - ], - [ - "Ġw", - "ËĪaËIJ" - ], - [ - "k", - "l" - ], - [ - "ĠsËĪÊĮ", - "m" - ], - [ - "ɪ", - "ÊĪ" - ], - [ - "d", - "z" - ], - [ - "v", - "o" - ], - [ - "ËĮa", - "ÊĬt" - ], - [ - "nd", - "e" - ], - [ - "Ġd", - "ÉĽs" - ], - [ - "ĠÉŁ", - "ËĪaËIJ" - ], - [ - "Ġr", - "ËĮi" - ], - [ - "s", - "ËĮeËIJ" - ], - [ - "É¡", - "i" - ], - [ - "Ġal", - "s" - ], - [ - "ËĪi", - "ðo" - ], - [ - "ĠnËĪi", - "Éľn" - ], - [ - 
"ÊĬ", - "l" - ], - [ - "ts", - "ËIJ" - ], - [ - "ËĪant", - "o" - ], - [ - "ĠÉĹ", - "ËĪÉĻÊĬ" - ], - [ - "kËIJ", - "i" - ], - [ - "ĠsËĪÊĮ", - "b" - ], - [ - "Ġn", - "ËĪa" - ], - [ - "Ġl", - "ËĮo" - ], - [ - "Ġph", - "ËĪi" - ], - [ - "m", - "ËĮe" - ], - [ - "Ġf", - "a" - ], - [ - "k", - "ÉĻ" - ], - [ - "Ġz", - "ËĪu" - ], - [ - "n", - "s" - ], - [ - "ĠÊģ", - "e" - ], - [ - "Ġb", - "ËĪo" - ], - [ - "ËĪaËIJt", - "i" - ], - [ - "Ġm", - "an" - ], - [ - "ĠlËĪi", - "Éij" - ], - [ - "ĠÉĹ", - "ËĮyÉĻ" - ], - [ - "Ġf", - "ËĪÉĶËIJ" - ], - [ - "ĠkÊĭ", - "ËĪeËIJÊĥc" - ], - [ - "Ġx", - "ËĪÉij" - ], - [ - "ĠtÉķ", - "ËĪu" - ], - [ - "j", - "ÉĻɾ" - ], - [ - "Ġɪ", - "st" - ], - [ - "w", - "ËĪi" - ], - [ - "ĠËĮaɪn", - "ÉĻ" - ], - [ - "ɪ", - "É¡" - ], - [ - "Ġs", - "ÊĪ" - ], - [ - "ËĪi", - "ÉĻl" - ], - [ - "Ġn", - "ËĪiÉĽÉľn" - ], - [ - "ĠËĮÉĽ", - "ËIJ" - ], - [ - "ËĪaɪ", - "nd" - ], - [ - "Ġz", - "ËĪi" - ], - [ - "v", - "ÉĻn" - ], - [ - "m", - "z" - ], - [ - "ð", - "os" - ], - [ - "dÊĴ", - "ËIJ" - ], - [ - "j", - "ËĪa" - ], - [ - "ɾ", - "ËĪÉĶ" - ], - [ - "l", - "ËĪe" - ], - [ - "Ê", - "²" - ], - [ - "Ġv", - "ËĪÉĶ" - ], - [ - "Ġl", - "ËĪiÉĽ" - ], - [ - "θ", - "e" - ], - [ - "mËĪe", - "nte" - ], - [ - "Ġɪn", - "ðÉĻ" - ], - [ - "Ġaɪ", - "m" - ], - [ - "n", - "ÉĻn" - ], - [ - "Ġh", - "ÉĻm" - ], - [ - "ɾ", - "aËIJ" - ], - [ - "ĠsËĪuo", - "Éľ" - ], - [ - "Ġɲ", - "ËĪi" - ], - [ - "Ġɹ", - "ËĪiÉĻl" - ], - [ - "l", - "ËĪa" - ], - [ - "Ġb", - "ËĪÉĶ" - ], - [ - "Ġk", - "ËĪai" - ], - [ - "Êģ", - "ËĪa" - ], - [ - "Ġw", - "ËĪÉľËIJ" - ], - [ - "Ġa", - "ËIJ" - ], - [ - "Ġp", - "as" - ], - [ - "ËĪÊĮ", - "s" - ], - [ - "w", - "ËĪÉĽÉ¾" - ], - [ - "ĠÉĹ", - "ËĪe" - ], - [ - "ĠhËĮa", - "tÉĻ" - ], - [ - "a", - "ɪn" - ], - [ - "ĠËĪÉĶ", - "pʰ" - ], - [ - "Êģ", - "ËĪe" - ], - [ - "ĠÉŁaËIJ", - "ËĪeËIJÉ¡aËIJ" - ], - [ - "ĠËĪÊĬ", - "s" - ], - [ - "ĠtÉķhËĪi", - "Éľ" - ], - [ - "nt", - "Êĥ" - ], - [ - "Ġx", - "ËĪuo" - ], - [ - "ËĪu", - "Êģ" - ], - [ - "Ġɪ", - "m" - ], - [ - "ɳ", - "Éĸ" - ], - [ - "ËĪyÉĻ", - "Éľkh" - ], 
- [ - "ĠËĪy", - "ÉĽ" - ], - [ - "Ġm", - "ËĮaËIJ" - ], - [ - "Åĵ", - "Êģ" - ], - [ - "ĠËĪa", - "lt" - ], - [ - "Ġk", - "ÉĻm" - ], - [ - "Êİ", - "o" - ], - [ - "ĠÉIJ", - "n" - ], - [ - "Ġf", - "y" - ], - [ - "ĠËĮÉĽ", - "ra" - ], - [ - "ĠÉ¡", - "ËĪÊĬ" - ], - [ - "Ġp", - "ËĪÊĮ" - ], - [ - "l", - "s" - ], - [ - "Ġl", - "ËĪiËIJ" - ], - [ - "ĠÊĤ", - "ËĪy" - ], - [ - "Ġbɪk", - "ËĪÊĮz" - ], - [ - "ĠÉ¡", - "ÉĽt" - ], - [ - "Ġb", - "ɾ" - ], - [ - "t", - "ʰ" - ], - [ - "tÉĻl", - "ËĮÉĻb" - ], - [ - "x", - "o" - ], - [ - "sk", - "ËĮaËIJ" - ], - [ - "ɲ", - "ʲ" - ], - [ - "ËĪeËIJk", - "ÊĪ" - ], - [ - "r", - "ÉĻ" - ], - [ - "tÊĥ", - "o" - ], - [ - "ĠpÊģ", - "ÉĶ" - ], - [ - "Ġɹ", - "ËĪaɪt" - ], - [ - "Ġp", - "ËĪei" - ], - [ - "ËĮ", - "ɪç" - ], - [ - "j", - "ËĪÉĽÉ¾" - ], - [ - "tËIJ", - "a" - ], - [ - "ĠÉIJb", - "ËĮaÊĬt" - ], - [ - "ĠkÊĭËĪeËIJÊĥc", - "ÉĻn" - ], - [ - "Ġv", - "ËĪe" - ], - [ - "ÊĬ", - "Éľ" - ], - [ - "Ġa", - "kËĪe" - ], - [ - "Ġp", - "ËĪai" - ], - [ - "v", - "ËĪÉĽ" - ], - [ - "Ġθ", - "ɹ" - ], - [ - "ɪ", - "f" - ], - [ - "Ġav", - "ËĪÉĽ" - ], - [ - "Ġk", - "ËĪe" - ], - [ - "d", - "ËĪi" - ], - [ - "ËĪeËIJ", - "Éĸ" - ], - [ - "Ġb", - "ÉĻt" - ], - [ - "ÊĪ", - "ʰ" - ], - [ - "t", - "eËIJ" - ], - [ - "θj", - "ËĪÉĶn" - ], - [ - "d", - "Éľ" - ], - [ - "ĠjËĪi", - "Éľ" - ], - [ - "Ġv", - "e" - ], - [ - "É£", - "ËĪu" - ], - [ - "ËĪÊĮh", - "ÉĻl" - ], - [ - "Ġp", - "ÉĶ" - ], - [ - "ĠÉ¡", - "r" - ], - [ - "Ġð", - "a" - ], - [ - "Ġv", - "ËĪiËIJ" - ], - [ - "ĠËĮ", - "ÉijËIJ" - ], - [ - "ËĪÉĻÊĬ", - "nt" - ], - [ - "Ġb", - "ËĪaËIJɾ" - ], - [ - "ĠmËĪÊĮ", - "tÉĻlËĮÉĻb" - ], - [ - "l", - "d" - ], - [ - "ĠtÉķ", - "ËĮÉĶ" - ], - [ - "p", - "a" - ], - [ - "ð", - "ËĪad" - ], - [ - "ËĪi", - "ɾ" - ], - [ - "Ġx", - "ËĪu" - ], - [ - "ĠlËĪi", - "ÉľÅĭ" - ], - [ - "ËĪeɪ", - "s" - ], - [ - "ĠÉĹËĮe", - "Éľn" - ], - [ - "Ġth", - "ËĪiÉĽ" - ], - [ - "tËIJ", - "e" - ], - [ - "ĠavËĮÉĽ", - "k" - ], - [ - "ĠËĮ", - "ÉĶ" - ], - [ - "Ġk", - "ËĪÉiju" - ], - [ - "ɪ", - "v" - ], - [ - "iËIJ", - "z" - ], - [ - "ËĪo", 
- "s" - ], - [ - "ĠÉ¡", - "ɹ" - ], - [ - "a", - "nd" - ], - [ - "ĠlËĪi", - "ou" - ], - [ - "ĠËĪo", - "Éľ" - ], - [ - "É¡", - "l" - ], - [ - "Ġp", - "ËĪÉĶËIJ" - ], - [ - "Ġm", - "ËĮeËIJ" - ], - [ - "Ġk", - "ËĪÉĴ" - ], - [ - "n", - "os" - ], - [ - "ç", - "ÉĻn" - ], - [ - "f", - "ÉĻn" - ], - [ - "ĠsËĪÊĮkt", - "ËĮeËIJ" - ], - [ - "Ġ", - "ËĪaɪn" - ], - [ - "ËĪoËIJ", - "re" - ], - [ - "j", - "ËĪÉĽn" - ], - [ - "Ġð", - "ËĪÉĽn" - ], - [ - "ĠtÉķh", - "ËĪiÉĽÉľn" - ], - [ - "Ġh", - "ËĪaɪ" - ], - [ - "ɾ", - "ËĪÉĽ" - ], - [ - "Ġs", - "ËĪu" - ], - [ - "ĠkËĪɪ", - "jaËIJ" - ], - [ - "Ġpj", - "ËĮÊĬ" - ], - [ - "ĠhÉĻm", - "ËĮaËIJ" - ], - [ - "ĠËĮÊĮ", - "p" - ], - [ - "Ġp", - "ËĪÊĮhÉĻl" - ], - [ - "Ġx", - "ËĪÉĻ" - ], - [ - "d", - "ËĪe" - ], - [ - "Ġm", - "Éij" - ], - [ - "ĠÊĬ", - "m" - ], - [ - "nd", - "ÉĻ" - ], - [ - "Ġd", - "ËĪÉĻÊĬnt" - ], - [ - "ËĪeËIJ", - "ÊĥÉĻn" - ], - [ - "Ġða", - "ts" - ], - [ - "i", - "s" - ], - [ - "Ġc", - "ËĪaËIJh" - ], - [ - "p", - "e" - ], - [ - "Ġs", - "ËĮo" - ], - [ - "Ġð", - "ËĪe" - ], - [ - "Ġs", - "ËĪaËIJt" - ], - [ - "ËĪa", - "Êģ" - ], - [ - "Ġs", - "ËĪe" - ], - [ - "ÉĻ", - "k" - ], - [ - "ɪ", - "Êĭ" - ], - [ - "ĠkËĪoËIJ", - "i" - ], - [ - "k", - "ÉĶ" - ], - [ - "Ġv", - "ËĪaËIJÊĬ" - ], - [ - "Ġf", - "ËĪei" - ], - [ - "Ġl", - "ËĪeËIJk" - ], - [ - "Ġh", - "ËĪiÉĻ" - ], - [ - "Ġa", - "ÊĬ" - ], - [ - "ËĪÉĽ", - "ndo" - ], - [ - "ËĪe", - "s" - ], - [ - "Ġz", - "ËĪÉĶ" - ], - [ - "Ġ", - "ËĪÉĽÉ¾a" - ], - [ - "nËĪi", - "Éľn" - ], - [ - "ĠkËĪÊĮ", - "m" - ], - [ - "Ġl", - "ËĪÉĴ" - ], - [ - "ɪ", - "st" - ], - [ - "Ġp", - "Éij" - ], - [ - "Ġf", - "ËĪÉĶ" - ], - [ - "Ġth", - "ËĪonÉ¡" - ], - [ - "nk", - "e" - ], - [ - "ËĮ", - "ɪk" - ], - [ - "Ġɲ", - "ËĪÉĻ" - ], - [ - "ËĮÊĮ", - "m" - ], - [ - "ËĪiËIJ", - "t" - ], - [ - "ĠwËĪÉĴ", - "nt" - ], - [ - "ËĪaβ", - "an" - ], - [ - "ĠbËĪÊĮ", - "r" - ], - [ - "ÉĽ", - "nd" - ], - [ - "ĠËĮÉijËIJ", - "bÉľ" - ], - [ - "Ġv", - "ËĪaɪ" - ], - [ - "ĠtÊĥ", - "ËĮi" - ], - [ - "ĠθËĪɪÅĭ", - "k" - ], - [ - "st", - "i" - ], - [ - "Ġk", - "ɹ" 
- ], - [ - "ĠËĪa", - "ÊĬt" - ], - [ - "st", - "ÉĻn" - ], - [ - "ĠÊĭ", - "ËĪÊĮn" - ], - [ - "ĠÉ¡", - "ËĮaËIJ" - ], - [ - "ËĪaËIJÉľ", - "ɲ" - ], - [ - "Êģ", - "i" - ], - [ - "ĠnËĪÉĶ", - "x" - ], - [ - "ĠɹËĪiÉĻl", - "ɪ" - ], - [ - "Ġv", - "ËĮi" - ], - [ - "Ġðe", - "ÉĻ" - ], - [ - "ËĮɪ", - "tÊĥ" - ], - [ - "Ġv", - "ËĪyÉĻ" - ], - [ - "ĠËĮaËIJpk", - "ËĮaËIJ" - ], - [ - "Ġf", - "ËĮaËIJɪ" - ], - [ - "Ġp", - "ËĪÉĶ" - ], - [ - "ĠnËĪÊĮ", - "mb" - ], - [ - "θ", - "es" - ], - [ - "j", - "ËĪÉĽÊģ" - ], - [ - "ĠkËĪÊĬ", - "cʰ" - ], - [ - "m", - "ËĪÉĽ" - ], - [ - "Ġv", - "ËĪu" - ], - [ - "Ġl", - "ÅĵÊģ" - ], - [ - "ĠiËIJ", - "m" - ], - [ - "ÊĪ", - "ÉĻɾ" - ], - [ - "tÊĥ", - "i" - ], - [ - "ËIJ", - "s" - ], - [ - "Ġt", - "ËĪy" - ], - [ - "ĠmËĪi", - "ÉľÅĭ" - ], - [ - "ɾ", - "ËĪe" - ], - [ - "m", - "ËĮa" - ], - [ - "Ġm", - "ËĮiËIJ" - ], - [ - "ĠÉĽ", - "ks" - ], - [ - "ɪ", - "p" - ], - [ - "ĠkËĪÊĮɾ", - "nËĮaËIJ" - ], - [ - "ĠËĮaÊĬ", - "x" - ], - [ - "r", - "ËĪiËIJ" - ], - [ - "Ġc", - "ËĪÊĮl" - ], - [ - "m", - "os" - ], - [ - "ĠkËĪÊĮɾt", - "ËĮeËIJ" - ], - [ - "iËIJ", - "ɾ" - ], - [ - "k", - "ÉĻn" - ], - [ - "Ġd", - "ËĪu" - ], - [ - "n", - "aËIJ" - ], - [ - "Ġp", - "wËĪe" - ], - [ - "ËĮÉĶ", - "ɪ" - ], - [ - "ĠtÉķh", - "ËĪiÉĽ" - ], - [ - "Ġβ", - "ËĪi" - ], - [ - "ËĪiÉĽ", - "Éľt" - ], - [ - "Ġt", - "e" - ], - [ - "ËĪað", - "os" - ], - [ - "m", - "ËĪa" - ], - [ - "Ġv", - "ËĪo" - ], - [ - "Ġm", - "ËĪɪ" - ], - [ - "Ġb", - "ËĮi" - ], - [ - "a", - "d" - ], - [ - "d", - "o" - ], - [ - "Ġn", - "ËĪaÊĬ" - ], - [ - "ĠʲËĪy", - "Éľ" - ], - [ - "w", - "ËĪÉĽ" - ], - [ - "ËĪi", - "s" - ], - [ - "e", - "l" - ], - [ - "Ġpa", - "r" - ], - [ - "Ġt", - "ËĪai" - ], - [ - "ĠdËĪɪ", - "jaËIJ" - ], - [ - "h", - "ËĪi" - ], - [ - "Ġɾ", - "ËĪÊĮ" - ], - [ - "Ġd", - "ËĪe" - ], - [ - "ËĪaɪ", - "d" - ], - [ - "Ġp", - "er" - ], - [ - "Ġs", - "ËĮÉĶ" - ], - [ - "w", - "e" - ], - [ - "ÊĬ", - "m" - ], - [ - "Ġi", - "n" - ], - [ - "ĠjËĪuËIJ", - "z" - ], - [ - "ËĪiËIJp", - "ÉĻl" - ], - [ - "ĠÊĭ", - "ËĪaËIJl" - ], - [ - "Ġe", - 
"tËĪÉĽ" - ], - [ - "ËĮÉĽ", - "m" - ], - [ - "Ġn", - "ËĪu" - ], - [ - "ËĪÉĽ", - "kt" - ], - [ - "ĠiËIJ", - "ɾ" - ], - [ - "Ġb", - "ɹ" - ], - [ - "Ġtsh", - "ËĪi" - ], - [ - "ĠÉĹ", - "ËĪÉĶÉľ" - ], - [ - "Ġkw", - "ËĮa" - ], - [ - "Ġf", - "ËĪuÉľ" - ], - [ - "w", - "ËĮa" - ], - [ - "Ġd", - "ËĪiËIJ" - ], - [ - "ĠÉ¡", - "ËĪyÉĻ" - ], - [ - "ËĮÉĽ", - "ËIJ" - ], - [ - "r", - "ËĪa" - ], - [ - "Ġn", - "e" - ], - [ - "Ġz", - "ËĪyÉĻ" - ], - [ - "Ġb", - "ËĪaɪ" - ], - [ - "ĠÉŁ", - "ËĪÊĮb" - ], - [ - "ËĪuËIJ", - "to" - ], - [ - "ÊĬ", - "nt" - ], - [ - "Ġc", - "ʰ" - ], - [ - "ËĪÉĽnt", - "i" - ], - [ - "ËĪo", - "ÉĻ" - ], - [ - "Ġs", - "ËĮÊĮm" - ], - [ - "Ġl", - "Éij" - ], - [ - "ËĮe", - "va" - ], - [ - "ɾ", - "ÉĽ" - ], - [ - "nt", - "Éľ" - ], - [ - "Ġm", - "ËĪÉĽn" - ], - [ - "ËĪÉijËIJ", - "k" - ], - [ - "Ġki", - "l" - ], - [ - "ËĪon", - "es" - ], - [ - "f", - "f" - ], - [ - "Ġm", - "ËĪÉĽËIJ" - ], - [ - "Ġv", - "ËĪÉĻɪ" - ], - [ - "Ġ", - "ËĪÉĶËIJ" - ], - [ - "ĠËĮɪ", - "nt" - ], - [ - "ÊĬ", - "n" - ], - [ - "Ġw", - "ɪl" - ], - [ - "Ġs", - "in" - ], - [ - "ĠËĮa", - "lla" - ], - [ - "Ġaβ", - "ËĪia" - ], - [ - "p", - "i" - ], - [ - "ËĪo", - "Éľ" - ], - [ - "ɪj", - "ËĮaËIJ" - ], - [ - "k", - "u" - ], - [ - "Ġv", - "ËĪɪ" - ], - [ - "Ġtu", - "t" - ], - [ - "ĠtËĪe", - "Éľ" - ], - [ - "Ġh", - "ËĪÉĶ" - ], - [ - "β", - "ɾe" - ], - [ - "s", - "ÉĻɾ" - ], - [ - "Ġkh", - "ËĪai" - ], - [ - "Ġm", - "ËĪÉĶ" - ], - [ - "Ġt", - "a" - ], - [ - "Ġɲ", - "ËĪaËIJ" - ], - [ - "Ġn", - "u" - ], - [ - "ËĪuËIJ", - "n" - ], - [ - "ĠÉĻËIJ", - "Éľ" - ], - [ - "ĠËĪa", - "ÊĬf" - ], - [ - "ËĪiËIJd", - "Éľ" - ], - [ - "nt", - "i" - ], - [ - "Ġp", - "ËĪiËIJpÉĻl" - ], - [ - "Ġk", - "j" - ], - [ - "Ġp", - "e" - ], - [ - "Ġm", - "ËĪÉij" - ], - [ - "ËĮa", - "ɪ" - ], - [ - "ËĪaËIJ", - "le" - ], - [ - "Ġv", - "ËĮÉĻËIJÉªÉľ" - ], - [ - "mp", - "o" - ], - [ - "ĠkËĪɪ", - "t" - ], - [ - "Ġn", - "ËĮÉĽ" - ], - [ - "ĠÉŁ", - "ËĪaËIJtaËIJ" - ], - [ - "ĠsËĪaËIJt", - "ʰ" - ], - [ - "ĠÉŁ", - "ËĪi" - ], - [ - "Ġs", - "o" - ], - [ - "Ġb", - 
"ËĪÉĽ" - ], - [ - "k", - "ËĪi" - ], - [ - "ɪt", - "i" - ], - [ - "Ġts", - "i" - ], - [ - "Ġk", - "Êģ" - ], - [ - "ËĮ", - "ÉĴ" - ], - [ - "É¡", - "ÉĻl" - ], - [ - "k", - "st" - ], - [ - "Ġm", - "ËĪÉĻËIJ" - ], - [ - "ËĪÊĮ", - "k" - ], - [ - "Ġn", - "ËĪaËIJÊĬ" - ], - [ - "Ġa", - "p" - ], - [ - "ĠlËĪɪ", - "kʰ" - ], - [ - "ll", - "i" - ], - [ - "ĠkwËĪa", - "l" - ], - [ - "Ġ", - "ËĪÉĻËIJ" - ], - [ - "Ġts", - "ËĪuei" - ], - [ - "Ġd", - "o" - ], - [ - "ĠkËIJ", - "jËĪo" - ], - [ - "ÊĬ", - "z" - ], - [ - "Ġp", - "ËĪaËIJ" - ], - [ - "Ġm", - "ËĪuËIJ" - ], - [ - "ĠÉ¡ÉĻ", - "v" - ], - [ - "r", - "ËĪi" - ], - [ - "Ġt", - "w" - ], - [ - "ËĮ", - "ɪn" - ], - [ - "d", - "ËĪÉij" - ], - [ - "Ġð", - "ËĪi" - ], - [ - "ĠËĪaËIJ", - "i" - ], - [ - "Ġh", - "ËĪiÉĽ" - ], - [ - "Ġð", - "ËĮÉĽm" - ], - [ - "Ġpʰ", - "ËĪɪɾ" - ], - [ - "ÉĴ", - "m" - ], - [ - "ĠËĮ", - "eËIJ" - ], - [ - "Ġth", - "ËĪaiÉľ" - ], - [ - "Ġv", - "ËĪas" - ], - [ - "Ġn", - "ÉijËIJ" - ], - [ - "p", - "ÉĻn" - ], - [ - "Ġp", - "ËĮÉĻɾ" - ], - [ - "ĠÉĹ", - "ËĪaËIJɪ" - ], - [ - "ËĪou", - "Éľ" - ], - [ - "ĠÊIJ", - "ËĪuÉľ" - ], - [ - "ĠmËĪa", - "n" - ], - [ - "ĠtËĪÉĻ", - "ÉªÉľ" - ], - [ - "Ġl", - "ËĪaËIJÊĬ" - ], - [ - "m", - "ËĪÉĽnte" - ], - [ - "ĠfËĪa", - "m" - ], - [ - "s", - "jËĪÉĶ" - ], - [ - "Ġp", - "ËĪÉĻ" - ], - [ - "ËĪeËIJ", - "m" - ], - [ - "Ġp", - "ËĪÊĮr" - ], - [ - "j", - "ËĪi" - ], - [ - "Ġl", - "ÉĽ" - ], - [ - "Ġt", - "en" - ], - [ - "ËĪoËIJ", - "ra" - ], - [ - "k", - "i" - ], - [ - "ĠÊĤ", - "ËĪaËIJÊĬ" - ], - [ - "k", - "ɪ" - ], - [ - "bËIJ", - "e" - ], - [ - "ËĪa", - "lt" - ], - [ - "ð", - "ɪ" - ], - [ - "p", - "ËĪi" - ], - [ - "ĠËĮÉĽ", - "nt" - ], - [ - "Ġm", - "ËĪei" - ], - [ - "Ġh", - "ËĪÉĻÊĬ" - ], - [ - "Ġh", - "ËĪÉĽÉ¾" - ], - [ - "j", - "ËĪÉij" - ], - [ - "ĠhËĪÊĬ", - "aËIJ" - ], - [ - "m", - "Éľ" - ], - [ - "Ġd", - "ʰ" - ], - [ - "ĠtÊĥ", - "ËĪe" - ], - [ - "l", - "ËĪÉĽ" - ], - [ - "ËĪaËIJt", - "e" - ], - [ - "Ġp", - "ËĪuËIJ" - ], - [ - "Ġm", - "ËĪÊĬ" - ], - [ - "ËĪaËIJɪ", - "ÊĪ" - ], - [ - "d", - "iËIJ" - ], - [ - 
"Ġfɹ", - "ÉĴm" - ], - [ - "Ġh", - "ËĪÉijËIJ" - ], - [ - "β", - "o" - ], - [ - "ĠmËĪi", - "Éľn" - ], - [ - "Ġð", - "iËIJz" - ], - [ - "Ġk", - "ËĪou" - ], - [ - "ËĪiËIJ", - "na" - ], - [ - "Ġav", - "ËĮeva" - ], - [ - "Ġ", - "ËĪaËIJɾ" - ], - [ - "Ġn", - "ËĪuËIJɾ" - ], - [ - "Ġβ", - "ËĪe" - ], - [ - "Ġz", - "aɪn" - ], - [ - "ËĪÉĽ", - "d" - ], - [ - "É", - "Ĺ" - ], - [ - "ËĪeɪ", - "k" - ], - [ - "s", - "ËĮÉĻÊĬ" - ], - [ - "ËĪeËIJ", - "ÉŁ" - ], - [ - "ĠÊĤ", - "ËĪÉĻËIJ" - ], - [ - "j", - "e" - ], - [ - "cʰ", - "ËIJ" - ], - [ - "ËĪÉĶ", - "r" - ], - [ - "ÉĽ", - "ËIJ" - ], - [ - "ĠtÉķhËĪy", - "Ã¦Éľn" - ], - [ - "ĠËĮaɪn", - "ÉĻn" - ], - [ - "ĠiËIJ", - "n" - ], - [ - "ĠbËĪÊĮ", - "c" - ], - [ - "ËĪiËIJ", - "m" - ], - [ - "ɾ", - "as" - ], - [ - "ËĮÉĻ", - "s" - ], - [ - "Ġv", - "ËĪeËIJ" - ], - [ - "ĠËĪÉĻr", - "Éľ" - ], - [ - "Ġd", - "uËIJ" - ], - [ - "nt", - "ÉĻ" - ], - [ - "Ġpɹ", - "ËĪÉĴ" - ], - [ - "Ġb", - "ËĪɪ" - ], - [ - "ĠwËĪo", - "Éľ" - ], - [ - "n", - "ËĮi" - ], - [ - "Ġh", - "ÉIJ" - ], - [ - "Ġk", - "ËĪÉĽ" - ], - [ - "Ġe", - "t" - ], - [ - "jËĪÉĽ", - "ndo" - ], - [ - "ĠËĪai", - "Éľ" - ], - [ - "Ġl", - "i" - ], - [ - "ĠËĪaÊĬ", - "s" - ], - [ - "kËIJ", - "o" - ], - [ - "ĠÉĹ", - "ËĪyÉĻ" - ], - [ - "k", - "eËIJ" - ], - [ - "Ġf", - "ËĪiËIJl" - ], - [ - "Ġbʰ", - "ËĪaËIJi" - ], - [ - "ĠÉ¡ÉĻ", - "Êĥ" - ], - [ - "ÊĴ", - "ËĪe" - ], - [ - "Ġn", - "jËĪuËIJ" - ], - [ - "ĠËĪa", - "k" - ], - [ - "ĠÉĹ", - "ËĪaËIJ" - ], - [ - "z", - "ËĪa" - ], - [ - "v", - "ËĪe" - ], - [ - "ĠhËĮa", - "ÊĬ" - ], - [ - "ÉIJ", - "ç" - ], - [ - "ĠɾËĪÊĮ", - "kʰ" - ], - [ - "p", - "ËĪe" - ], - [ - "ĠtÉĻ", - "bi" - ], - [ - "ĠpËĪÊĮhÉĻl", - "ËĮeËIJ" - ], - [ - "Ġf", - "ËĪÉĽ" - ], - [ - "Ġw", - "ËĮɪtÊĥ" - ], - [ - "ĠtÉķËĪy", - "ÉĽÉľ" - ], - [ - "w", - "ËĮe" - ], - [ - "ËĮa", - "ɪt" - ], - [ - "ĠnÉijËIJ", - "x" - ], - [ - "ĠkËĪÉĶËIJ", - "n" - ], - [ - "ÊĬ", - "k" - ], - [ - "ĠbËĪaËIJ", - "d" - ], - [ - "Åĭ", - "ÉĻn" - ], - [ - "Ġn", - "i" - ], - [ - "Ġb", - "ËĪe" - ], - [ - "Ġm", - "ËĮÊĬ" - ], - [ - "ËĪa", - "r" - 
], - [ - "ĠmËĮe", - "ɪk" - ], - [ - "Ġs", - "ËĪaËIJɾ" - ], - [ - "β", - "e" - ], - [ - "ĠtÉķhËĪi", - "ÉľÅĭ" - ], - [ - "it", - "ËĪe" - ], - [ - "k", - "ËĮe" - ], - [ - "ËĪÉĽËIJ", - "l" - ], - [ - "ËĮ", - "ÉĴn" - ], - [ - "ËĮ", - "Éij" - ], - [ - "Ġb", - "ËĪɪl" - ], - [ - "Ġw", - "ÊĬd" - ], - [ - "Ġb", - "ËĪoËIJl" - ], - [ - "r", - "d" - ], - [ - "i", - "ÉĻ" - ], - [ - "Ġd", - "a" - ], - [ - "Ġb", - "ËĪaËIJÊĬ" - ], - [ - "ĠnËĪÊĮmb", - "ÉĻɾ" - ], - [ - "ËĪaËIJɪ", - "Éľ" - ], - [ - "ĠÉĽ", - "m" - ], - [ - "Ġm", - "iËIJɾ" - ], - [ - "ËĪeɪ", - "m" - ], - [ - "l", - "os" - ], - [ - "ËĮÉĽ", - "t" - ], - [ - "ĠËĮaÊĬ", - "s" - ], - [ - "ĠmËĪa", - "Éľt" - ], - [ - "Ġw", - "ËĪuÉĻ" - ], - [ - "Ġw", - "ËĪeɪ" - ], - [ - "Ġse", - "ɲ" - ], - [ - "Ġb", - "jËĪÉĽ" - ], - [ - "Ġw", - "ÉĽn" - ], - [ - "f", - "l" - ], - [ - "Ġkh", - "wËĪa" - ], - [ - "d", - "ËĪÉĽ" - ], - [ - "v", - "ɹɪ" - ], - [ - "ĠËĪa", - "ɾ" - ], - [ - "jËĪÉiju", - "Éľ" - ], - [ - "ĠËĮaËIJpk", - "ËĮeËIJ" - ], - [ - "b", - "Êģ" - ], - [ - "ĠtËĪaɪ", - "m" - ], - [ - "Ġ", - "ËĪÉij" - ], - [ - "Ġs", - "ËĮa" - ], - [ - "Ġz", - "ËĪoɪ" - ], - [ - "ËĪÉĶɾ", - "a" - ], - [ - "Ġd", - "ËĪø" - ], - [ - "ËĪÉĶɾ", - "t" - ], - [ - "ĠÅĭ", - "ËĪÉĶ" - ], - [ - "m", - "in" - ], - [ - "Ġl", - "ËĪÊĬk" - ], - [ - "ËĪÉĶËIJ", - "t" - ], - [ - "ĠËĪÉĶ", - "tɾ" - ], - [ - "Ġf", - "ËĪaɪ" - ], - [ - "ĠÉ¡", - "ÉĴt" - ], - [ - "ËĪeËIJ", - "ÉĻn" - ], - [ - "k", - "ËĪÉĶ" - ], - [ - "ĠvËĪÉĽ", - "ɹi" - ], - [ - "m", - "ÉĽ" - ], - [ - "ËĪaɪ", - "z" - ], - [ - "Ġe", - "sp" - ], - [ - "ɲ", - "a" - ], - [ - "Ġl", - "ËĪo" - ], - [ - "ËĪÉĽËIJ", - "ra" - ], - [ - "β", - "ËĪi" - ], - [ - "ou", - "Éľ" - ], - [ - "ËĮÉĻ", - "k" - ], - [ - "tÊĥ", - "uËIJ" - ], - [ - "Ġn", - "ËĪyÉĻ" - ], - [ - "ÊĪ", - "ɾ" - ], - [ - "ĠÉ¡", - "ËĪy" - ], - [ - "ĠtËĪo", - "ðo" - ], - [ - "ËĪɪ", - "çt" - ], - [ - "Ġm", - "ɪç" - ], - [ - "ĠËĪa", - "nd" - ], - [ - "Ġkw", - "ËĮÉĽl" - ], - [ - "ĠÊĤ", - "ËĪaËIJ" - ], - [ - "ĠnËĪi", - "Éľ" - ], - [ - "ËĪÉĶ", - "p" - ], - [ - "ËĪiËIJ", - "z" 
- ], - [ - "ĠÊĤ", - "ËĪaÊĬ" - ], - [ - "ĠɾËĮÉĻh", - "i" - ], - [ - "ĠsËĮÊĬ", - "o" - ], - [ - "ĠÉĽ", - "É¡" - ], - [ - "Ġd", - "Åĵ" - ], - [ - "ĠÉ¡ËĮaËIJ", - "ÉªÉľ" - ], - [ - "d", - "ɪ" - ], - [ - "l", - "ËĮa" - ], - [ - "st", - "ËĪi" - ], - [ - "ĠdËĮiËIJ", - "z" - ], - [ - "Ġt", - "ËĮÊĬ" - ], - [ - "θ", - "i" - ], - [ - "ĠËĪɪ", - "skËĮoËIJ" - ], - [ - "nd", - "ÉĻn" - ], - [ - "Ġts", - "v" - ], - [ - "Ġh", - "ËĪÉĻËIJ" - ], - [ - "ĠÊĥ", - "ËĪÊĬ" - ], - [ - "ÉĻt", - "ËĮeËIJ" - ], - [ - "p", - "ËĮÉĽ" - ], - [ - "ËĪaɾ", - "ÉĶn" - ], - [ - "Ġp", - "ÉĽÊģ" - ], - [ - "Ġ", - "y" - ], - [ - "m", - "nËĮeËIJ" - ], - [ - "ËĪÉĽ", - "llo" - ], - [ - "ĠÉ¡", - "ËĪÉĻ" - ], - [ - "ĠËĮa", - "d" - ], - [ - "ĠÊĥ", - "v" - ], - [ - "ËĪÊı", - "ɾ" - ], - [ - "r", - "ËĪe" - ], - [ - "y", - "ËIJ" - ], - [ - "Ġp", - "ËĪaËIJs" - ], - [ - "Ġ", - "ËĪÉĽn" - ], - [ - "ɪ", - "dÊĴ" - ], - [ - "ËĪua", - "i" - ], - [ - "Ġf", - "i" - ], - [ - "Ġt", - "ËĪyÉĻ" - ], - [ - "ËĪaËIJ", - "ÉŁ" - ], - [ - "Ġt", - "jËĪe" - ], - [ - "ËĪaËIJn", - "aËIJ" - ], - [ - "st", - "ɾ" - ], - [ - "Êİ", - "e" - ], - [ - "ËĮe", - "ɪt" - ], - [ - "b", - "a" - ], - [ - "ð", - "as" - ], - [ - "v", - "Êģ" - ], - [ - "Ġz", - "ËĪÉĻËIJ" - ], - [ - "ËĪaËIJ", - "li" - ], - [ - "ÉŁÊ°", - "eËIJ" - ], - [ - "ËĪaËIJt", - "eËIJ" - ], - [ - "Ġv", - "ËĪa" - ], - [ - "Ġsa", - "l" - ], - [ - "ËĪaËIJ", - "no" - ], - [ - "ĠÉ¡ÉĻ", - "z" - ], - [ - "ĠhËĪoËIJ", - "ti" - ], - [ - "Ġɲ", - "ËĪiÉĽ" - ], - [ - "t", - "Éľ" - ], - [ - "ĠËĪaËIJ", - "p" - ], - [ - "Ġw", - "ËĪÉĽl" - ], - [ - "Ġm", - "ËĪɪl" - ], - [ - "Ġfy", - "ËIJɾ" - ], - [ - "ËĪÉĽËIJs", - "aËIJ" - ], - [ - "Ġb", - "ËĮiËIJ" - ], - [ - "ËĪaËIJ", - "jaËIJ" - ], - [ - "ËĪɪ", - "p" - ], - [ - "Ġf", - "Êģ" - ], - [ - "tsi", - "ËĪoËIJne" - ], - [ - "Ġw", - "ËĪuÉľ" - ], - [ - "Ġv", - "i" - ], - [ - "ĠwËĪÉij", - "Éľn" - ], - [ - "ËĪoËIJ", - "n" - ], - [ - "ĠÉĹ", - "ËĪÉĻɪ" - ], - [ - "ĠÊĿ", - "ËĪo" - ], - [ - "Ġr", - "a" - ], - [ - "m", - "ÉĻnt" - ], - [ - "ËĪaÊĬ", - "nd" - ], - [ - "Ġp", - "ÉĽÉ¾" 
- ], - [ - "ĠÉĹ", - "ËĪaËIJÊĬ" - ], - [ - "oËIJ", - "ɾ" - ], - [ - "h", - "ËĪo" - ], - [ - "ĠÉĴ", - "n" - ], - [ - "ĠÊİ", - "e" - ], - [ - "ĠsËĪɪ", - "ks" - ], - [ - "É¡", - "n" - ], - [ - "ĠÉ¡", - "ËĪa" - ], - [ - "Ġ", - "θj" - ], - [ - "Ġp", - "ËĪe" - ], - [ - "sp", - "e" - ], - [ - "Ġv", - "ËĪÉĻ" - ], - [ - "Ġf", - "ËĪɪ" - ], - [ - "ĠËĮɪnt", - "ÊĬ" - ], - [ - "l", - "ÉĻn" - ], - [ - "Ġn", - "ËĪiËIJd" - ], - [ - "ĠsËĮÊĬ", - "a" - ], - [ - "ĠËĪu", - "m" - ], - [ - "Ġd", - "ËĪeɪ" - ], - [ - "ĠËĪÊĮ", - "bʰi" - ], - [ - "ËĪÉijËIJ", - "ɾ" - ], - [ - "Ġb", - "ËĪiÉĽÉľt" - ], - [ - "Êİ", - "os" - ], - [ - "Ġtsh", - "ËĪaiÉľ" - ], - [ - "ĠËĮɪ", - "skËĮaËIJ" - ], - [ - "ĠaÊĬ", - "ÉĻ" - ], - [ - "ĠËĪy", - "æ" - ], - [ - "Ġd", - "yn" - ], - [ - "Ġm", - "ËĪiËIJn" - ], - [ - "ĠËĪÊĮ", - "cʰËIJ" - ], - [ - "Ġs", - "ÉĽ" - ], - [ - "Ġn", - "ËĪy" - ], - [ - "Ġn", - "ËĮÉĽl" - ], - [ - "É¡", - "ɾ" - ], - [ - "Êĥ", - "ËĪe" - ], - [ - "ĠÊĤ", - "ËĮÉĽ" - ], - [ - "ĠËĪÉĽ", - "vɹɪ" - ], - [ - "ËĪÉĽl", - "p" - ], - [ - "ĠbËĪa", - "k" - ], - [ - "Ġ", - "eËIJ" - ], - [ - "Ġf", - "ËĪaËIJ" - ], - [ - "Ġk", - "ÉĽl" - ], - [ - "ĠËĪeËIJ", - "s" - ], - [ - "j", - "ËĪaËIJd" - ], - [ - "Ġl", - "ËĮi" - ], - [ - "mb", - "ɾe" - ], - [ - "k", - "tÉĻ" - ], - [ - "nt", - "a" - ], - [ - "t", - "ËĪu" - ], - [ - "Ġð", - "ËĪat" - ], - [ - "ĠËĪa", - "β" - ], - [ - "ÉĻɹ", - "i" - ], - [ - "ĠkwËĮÉĽ", - "lla" - ], - [ - "Ġb", - "ÉĻn" - ], - [ - "r", - "ËĮÉĽ" - ], - [ - "Ġn", - "ÉĶ" - ], - [ - "ĠÉ¡", - "ËĪɪ" - ], - [ - "ĠËĪa", - "p" - ], - [ - "ɹ", - "ÉĻ" - ], - [ - "ËĪa", - "Éľkh" - ], - [ - "ĠÊIJ", - "ËĪi" - ], - [ - "Ġ", - "ËĪÉijËIJ" - ], - [ - "ɪ", - "É¡ÉĻn" - ], - [ - "Ġw", - "ËĪai" - ], - [ - "Ġp", - "ÉĻt" - ], - [ - "kËIJ", - "a" - ], - [ - "Ġb", - "ËĪÉĽËIJ" - ], - [ - "ËĪeËIJ", - "Êĭ" - ], - [ - "ls", - "ÉĻÊĬ" - ], - [ - "ĠcËĪaËIJh", - "ɪËĮeËIJ" - ], - [ - "Ġk", - "ÉĻn" - ], - [ - "ĠËĮaɪn", - "ÉĻm" - ], - [ - "ËĪuËIJ", - "t" - ], - [ - "Ġh", - "ËĪaÊĬ" - ], - [ - "Ġt", - "ËĪanto" - ], - [ - "ĠhÉIJ", - "z" - 
], - [ - "Ġs", - "ËĪÊĮɾ" - ], - [ - "Ġn", - "o" - ], - [ - "Ġt", - "ËĪÉĶËIJ" - ], - [ - "Ġz", - "ËĪaɪ" - ], - [ - "ĠtÉķËĪiÉĽ", - "Éľ" - ], - [ - "Ġko", - "zËĪi" - ], - [ - "Ġk", - "ËĪei" - ], - [ - "ð", - "ËĪÉĶɾ" - ], - [ - "ËĮÉĶ", - "Êģ" - ], - [ - "Ġt", - "ËĪÊĮɾ" - ], - [ - "ĠÊIJ", - "ËĪÉĻ" - ], - [ - "ĠÉķËĪy", - "ÉĽÉľ" - ], - [ - "ĠmËĮÊĬ", - "ÉŁÊ°eËIJ" - ], - [ - "m", - "f" - ], - [ - "Ġv", - "ËĪiËIJdÉľ" - ], - [ - "k", - "ËĪa" - ], - [ - "ĠÉIJ", - "É¡" - ], - [ - "k", - "w" - ], - [ - "ĠÊģ", - "ÉĽ" - ], - [ - "x", - "ÉĻn" - ], - [ - "Ġd", - "ÊĬ" - ], - [ - "ĠkËĪÊĮɾ", - "nËĮeËIJ" - ], - [ - "jËĪaËIJd", - "aËIJ" - ], - [ - "Ġf", - "ÉĻ" - ], - [ - "ĠËĮi", - "mp" - ], - [ - "Ġh", - "ɪz" - ], - [ - "Ġ", - "ʰÏĩ" - ], - [ - "ËĪoËIJ", - "ni" - ], - [ - "Ġx", - "ËĪiÉľ" - ], - [ - "ËĪeËIJ", - "sÊĪ" - ], - [ - "Êı", - "bÉľ" - ], - [ - "ËĮÉĶɾ", - "ke" - ], - [ - "ĠÉ¡", - "ËĪÉĻÊĬ" - ], - [ - "ËĪɪ", - "ÊĥÉĻn" - ], - [ - "l", - "es" - ], - [ - "Ġf", - "ËĪiËIJ" - ], - [ - "É¡", - "tÉĻ" - ], - [ - "ËĪeËIJ", - "re" - ], - [ - "Ġv", - "ËĮaËIJ" - ], - [ - "Ġ", - "ËĪeɪ" - ], - [ - "Ġm", - "ËĪuÉĻÉľn" - ], - [ - "ĠÉ¡ËĪÊĬ", - "d" - ], - [ - "ĠmËĮa", - "ɪn" - ], - [ - "z", - "ËĪe" - ], - [ - "ĠlËĪi", - "Éľ" - ], - [ - "Ġm", - "u" - ], - [ - "Ġk", - "ËĮÉĽl" - ], - [ - "Ġj", - "ËĮÉĻh" - ], - [ - "Ġf", - "ËĮÉĶɾ" - ], - [ - "f", - "ɹ" - ], - [ - "Ġk", - "ËĪaɪn" - ], - [ - "ĠËĪÉĴ", - "lsÉĻÊĬ" - ], - [ - "θ", - "ɪÅĭ" - ], - [ - "Ġth", - "ËĪonÉ¡Éľ" - ], - [ - "t", - "ËĪÉij" - ], - [ - "θj", - "o" - ], - [ - "m", - "ËĪÉĶ" - ], - [ - "Ġ", - "os" - ], - [ - "Ġs", - "ÊĬ" - ], - [ - "ĠsËĪÊĮ", - "mÉĻ" - ], - [ - "ĠvËĮÉĽ", - "n" - ], - [ - "n", - "ËĪo" - ], - [ - "ĠËĪak", - "tÊĥuËIJ" - ], - [ - "É£", - "a" - ], - [ - "Ġtʰ", - "i" - ], - [ - "Ġf", - "ËĮi" - ], - [ - "Ġv", - "ËĪÉĽl" - ], - [ - "ĠtËĪu", - "tËIJi" - ], - [ - "x", - "os" - ] - ] - } -} \ No newline at end of file From 7c4a9d635cb5225ad1cac98f8c603bef7341a156 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Wed, 4 Feb 2026 09:30:48 
+0000 Subject: [PATCH 34/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 508a1332c31e..09a6491b364a 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1950,7 +1950,7 @@ def _sample_audio_codes( ) -> Tuple[torch.Tensor, torch.Tensor]: """ Sample audio codes from logits using either local transformer or parallel sampling. - + Returns: audio_codes_next: Sampled codes with temperature/topk (B, num_codebooks) all_codes_next_argmax: Argmax sampled codes for EOS detection (B, num_codebooks) @@ -1972,9 +1972,7 @@ def _sample_audio_codes( all_codes_next_argmax = audio_codes_next else: # Parallel sampling from all codebook logits - audio_codes_next = self.sample_codes_from_logits( - all_code_logits_t, temperature=temperature, topk=topk - ) + audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) # Argmax sampling for reliable EOS detection all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) @@ -1995,7 +1993,7 @@ def _process_phoneme_predictions( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ Process phoneme predictions for the current timestep. 
- + Returns: pred_phoneme_tokens: Predicted phoneme tokens (B, phoneme_stacking_factor) gt_phoneme_tokens_current: GT phoneme tokens for current timestep (B, phoneme_stacking_factor) @@ -2034,9 +2032,7 @@ def _process_phoneme_predictions( gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] # Select input tokens (GT or predicted) and embed - input_phoneme_tokens_current = ( - gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens - ) + input_phoneme_tokens_current = gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) return pred_phoneme_tokens, gt_phoneme_tokens_current, input_phoneme_tokens_current, input_phoneme_embedding @@ -2051,7 +2047,7 @@ def _compute_phoneme_channel_input( ) -> Tuple[torch.Tensor, torch.Tensor]: """ Compute the phoneme channel input embedding with masking. - + Returns: phoneme_channel_input_t: Masked phoneme embedding (B, 1, E) use_phoneme_input: Mask indicating which items should use phoneme input (B, 1, 1) @@ -2061,9 +2057,7 @@ def _compute_phoneme_channel_input( use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() # Create zero embedding for items not using phoneme input - zero_phoneme_embedding = torch.zeros( - actual_batch_size, 1, self.cfg.embedding_dim, device=device - ) + zero_phoneme_embedding = torch.zeros(actual_batch_size, 1, self.cfg.embedding_dim, device=device) # Combine: use phoneme embedding where active, zero otherwise phoneme_channel_input_t = ( @@ -2088,7 +2082,7 @@ def _prepare_next_decoder_input( ) -> torch.Tensor: """ Prepare the input embedding for the next decoder step. 
- + Handles: - Mixing context embeddings with generated audio embeddings based on context completeness - Adding streaming text embeddings if in streaming mode @@ -2487,9 +2481,7 @@ def infer_batch( # Calculate predicted lengths, accounting for context offset pred_codes_start_indices = context_plus_audio_lens - min_context_len - predicted_lens = [ - end_indices.get(i, max_decoder_steps) for i in range(actual_batch_size) - ] + predicted_lens = [end_indices.get(i, max_decoder_steps) for i in range(actual_batch_size)] predicted_codes_lens = torch.tensor(predicted_lens, device=device).long() predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices From 61b8afde37bc0b393e1b9d67a7ec8dfa656de2f2 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 5 Feb 2026 14:34:04 -0800 Subject: [PATCH 35/94] Magpietts decoderonly 2601 simplify code (#60) * adding streaming inference support Signed-off-by: Paarth Neekhara * some code cleanup Signed-off-by: Paarth Neekhara * update to get the right number of samples for context audio Signed-off-by: Paarth Neekhara * add text eos token Signed-off-by: Paarth Neekhara * fix sample rate issues Signed-off-by: Paarth Neekhara * simplifying streaming inference Signed-off-by: Paarth Neekhara * streaming batched inference working Signed-off-by: Paarth Neekhara * inference features added Signed-off-by: Paarth Neekhara * Inference function simplified Signed-off-by: Paarth Neekhara * correct handling of audio EOS Signed-off-by: Paarth Neekhara * bug fix Signed-off-by: Paarth Neekhara * simplify streaming init Signed-off-by: Paarth Neekhara * remove unnecessary line Signed-off-by: Paarth Neekhara * bug fix able to reproduce F2F presentation results with new inference Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- examples/tts/magpietts_streaming_inference.py | 1018 ++++++++ nemo/collections/tts/models/easy_magpietts.py | 2235 ++++++++++------- .../modules/magpietts_inference/inference.py | 20 +- 3 files 
changed, 2354 insertions(+), 919 deletions(-) create mode 100644 examples/tts/magpietts_streaming_inference.py diff --git a/examples/tts/magpietts_streaming_inference.py b/examples/tts/magpietts_streaming_inference.py new file mode 100644 index 000000000000..6e72ea77b8e6 --- /dev/null +++ b/examples/tts/magpietts_streaming_inference.py @@ -0,0 +1,1018 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MagpieTTS Streaming Inference Test Script. + +This script tests the streaming TTS inference functionality, supporting both +single sample (batch_size=1) and batched inference (batch_size>1). + +For batched inference, each item in the batch can have different context lengths +and be in different processing phases (context, prompt, phoneme-only, audio). + +Example usage: + # Single sample inference from checkpoint + python examples/tts/magpietts_streaming_inference.py \ + --hparams_file /path/to/hparams.yaml \ + --checkpoint_file /path/to/model.ckpt \ + --codecmodel_path /path/to/codec.nemo \ + --context_audio /path/to/context.wav \ + --text "Hello, this is a test of streaming TTS inference." 
\ + --output_path /path/to/output.wav + + # Batched inference with multiple context audios + python examples/tts/magpietts_streaming_inference.py \ + --nemo_file /path/to/model.nemo \ + --codecmodel_path /path/to/codec.nemo \ + --context_audio /path/to/context1.wav /path/to/context2.wav \ + --context_duration 3.0 5.0 \ + --text "First text to synthesize." "Second text to synthesize." \ + --output_path /path/to/output.wav +""" +from __future__ import annotations + +import argparse +import os +import time +from typing import Optional + +import numpy as np +import soundfile as sf +import torch +from omegaconf import OmegaConf, open_dict + +from nemo.collections.tts.models import EasyMagpieTTSModel +from nemo.utils import logging + + +def load_model( + hparams_file: Optional[str], + checkpoint_file: Optional[str], + nemo_file: Optional[str], + codecmodel_path: str, + device: str = "cuda", +) -> EasyMagpieTTSModel: + """ + Load an EasyMagpieTTSModel from checkpoint or .nemo file. + + Args: + hparams_file: Path to hparams.yaml (required with checkpoint_file). + checkpoint_file: Path to .ckpt file (required with hparams_file). + nemo_file: Path to .nemo file (alternative to hparams + checkpoint). + codecmodel_path: Path to the audio codec model. + device: Device to load model on. + + Returns: + Loaded model ready for inference. 
+ """ + if hparams_file is not None and checkpoint_file is not None: + # Load from hparams + checkpoint + logging.info(f"Loading model from checkpoint: {checkpoint_file}") + model_cfg = OmegaConf.load(hparams_file) + + # Handle different config structures + if "cfg" in model_cfg: + model_cfg = model_cfg.cfg + + with open_dict(model_cfg): + # Override codec model path + model_cfg.codecmodel_path = codecmodel_path + + # Disable training datasets + model_cfg.train_ds = None + model_cfg.validation_ds = None + + model = EasyMagpieTTSModel(cfg=model_cfg) + + # Load weights + ckpt = torch.load(checkpoint_file, weights_only=False) + state_dict = ckpt['state_dict'] + model.load_state_dict(state_dict) + + elif nemo_file is not None: + # Load from .nemo file + logging.info(f"Loading model from NeMo archive: {nemo_file}") + model_cfg = EasyMagpieTTSModel.restore_from(nemo_file, return_config=True) + + with open_dict(model_cfg): + model_cfg.codecmodel_path = codecmodel_path + model_cfg.train_ds = None + model_cfg.validation_ds = None + + model = EasyMagpieTTSModel.restore_from(nemo_file, override_config_path=model_cfg) + + else: + raise ValueError("Must provide either (hparams_file + checkpoint_file) or nemo_file") + + model.to(device) + model.eval() + logging.info("Model loaded and ready for streaming inference.") + + return model + + +def load_audio(audio_path: str, target_sample_rate: int) -> torch.Tensor: + """ + Load audio file and resample if needed. + + Args: + audio_path: Path to audio file. + target_sample_rate: Target sample rate. + + Returns: + Audio tensor of shape (1, num_samples). 
+ """ + audio, sr = sf.read(audio_path, dtype='float32') + + # Convert to mono if stereo + if len(audio.shape) > 1: + audio = audio.mean(axis=1) + + # Resample if needed + if sr != target_sample_rate: + import librosa + audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) + + return torch.from_numpy(audio).unsqueeze(0) # (1, num_samples) + + +def adjust_audio_to_duration( + audio: torch.Tensor, + sample_rate: int, + target_duration: float, + codec_model_samples_per_frame: int, +) -> torch.Tensor: + """ + Adjust audio to target_duration seconds, aligned to codec frame boundaries. + + The target number of samples is calculated to align with codec frame boundaries: + 1. Convert target_duration to number of codec frames + 2. Convert codec frames back to samples + + If audio is longer than target, take the first target_duration seconds. + If audio is shorter, repeat it until it reaches target_duration seconds. + + Args: + audio: Audio tensor of shape (1, num_samples). + sample_rate: Sample rate of the audio. + target_duration: Target duration in seconds. + codec_model_samples_per_frame: Number of audio samples per codec frame + (codec downsampling factor). + + Returns: + Audio tensor of shape (1, target_num_samples) where target_num_samples + is aligned to codec frame boundaries. 
+ """ + # Calculate target samples aligned to codec frame boundaries + # Same logic as text_to_speech_dataset.py + num_codec_frames = int(target_duration * sample_rate / codec_model_samples_per_frame) + target_num_samples = num_codec_frames * codec_model_samples_per_frame + current_num_samples = audio.size(1) + + if current_num_samples >= target_num_samples: + # Audio is longer than target - take the first target_duration seconds + audio = audio[:, :target_num_samples] + else: + # Audio is shorter - repeat until we have enough samples + num_repeats = int(np.ceil(target_num_samples / current_num_samples)) + audio_repeated = audio.repeat(1, num_repeats) + audio = audio_repeated[:, :target_num_samples] + + return audio + + +def run_streaming_inference( + model: EasyMagpieTTSModel, + context_audio: torch.Tensor, + context_audio_lens: torch.Tensor, + context_text: str, + text: str, + phoneme_text: Optional[str] = None, + use_gt_phonemes: bool = False, + inference_mode: Optional[str] = None, + use_cfg: bool = False, + cfg_scale: float = 1.5, + use_local_transformer: bool = False, + temperature: float = 0.7, + topk: int = 80, + max_steps: int = 500, + verbose: bool = True, + force_dropout_text: bool = False, +) -> tuple: + """ + Run streaming TTS inference. + + Args: + model: The loaded EasyMagpieTTSModel. + context_audio: Context audio tensor (1, num_samples). + context_audio_lens: Length of context audio (1,). + context_text: Context text for speaker conditioning. + text: Main text to synthesize. + phoneme_text: Optional phoneme text for GT conditioning. If None, uses text. + use_gt_phonemes: If True, use GT phonemes as decoder input (teacher forcing). + inference_mode: Inference mode name (e.g., "streaming_4_8"). + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + use_local_transformer: Whether to use local transformer. + temperature: Sampling temperature. + topk: Top-k sampling parameter. + max_steps: Maximum generation steps. 
+ verbose: Whether to print progress. + + Returns: + Tuple of (output, timing_info, context_audio_decoded, context_audio_decoded_lens). + output is StreamingFinalizeOutput with audio, codes, and phoneme predictions. + context_audio_decoded is the decoded context audio from the model's internal codes (for sanity checking). + """ + device = next(model.parameters()).device + + # Encode context audio to codes + context_audio = context_audio.to(device) + context_audio_lens = context_audio_lens.to(device) + + with torch.inference_mode(): + context_audio_codes, context_audio_codes_lens = model.audio_to_codes( + context_audio, context_audio_lens + ) + + # Tokenize context text + # Use the text conditioning tokenizer + tokenizer_name = model.text_conditioning_tokenizer_name + context_text_tokens = model.tokenizer.encode(context_text, tokenizer_name=tokenizer_name) + context_text_tokens = torch.tensor([context_text_tokens], dtype=torch.long, device=device) + context_text_tokens_lens = torch.tensor([context_text_tokens.size(1)], dtype=torch.long, device=device) + + # Tokenize main text + # Get the appropriate tokenizer name for main text + if hasattr(model.tokenizer, 'tokenizers') and 'english_phoneme' in model.tokenizer.tokenizers: + main_tokenizer_name = 'english_phoneme' + else: + main_tokenizer_name = tokenizer_name + + text_tokens = model.tokenizer.encode(text, tokenizer_name=main_tokenizer_name) + text_tokens = text_tokens + [model.eos_id] + text_tokens = torch.tensor(text_tokens, dtype=torch.long, device=device) + + # Tokenize phoneme text if provided (for GT phoneme conditioning) + gt_phoneme_tokens = None + gt_phoneme_tokens_lens = None + if model.phoneme_tokenizer is not None: + phoneme_source = phoneme_text if phoneme_text is not None else text + phoneme_tokens_list = model.phoneme_tokenizer.encode(phoneme_source) + # Add BOS and EOS + bos_id = model.phoneme_tokenizer.bos_token_id + eos_id = model.phoneme_tokenizer.eos_token_id + phoneme_tokens_list = [bos_id] + 
phoneme_tokens_list + [eos_id] + gt_phoneme_tokens = torch.tensor([phoneme_tokens_list], dtype=torch.long, device=device) + gt_phoneme_tokens_lens = torch.tensor([len(phoneme_tokens_list)], dtype=torch.long, device=device) + + phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' + + # Get streaming delays for logging + mode_name = inference_mode or model.default_inference_mode + training_mode = model.mode_name_to_mode.get(mode_name, model.training_modes[0]) + phoneme_delay = training_mode.streaming_phonemes_delay + speech_delay = training_mode.streaming_speech_delay + + if verbose: + logging.info(f"Context audio codes shape: {context_audio_codes.shape}") + logging.info(f"Context text tokens: {context_text_tokens.shape}") + logging.info(f"Main text tokens: {text_tokens.shape} ({len(text_tokens)} tokens)") + if gt_phoneme_tokens is not None: + logging.info(f"GT phoneme tokens: {gt_phoneme_tokens.shape} ({gt_phoneme_tokens_lens[0].item()} tokens)") + logging.info(f"Phoneme input type: {phoneme_input_type}") + logging.info(f"Using inference mode: {mode_name}") + logging.info(f"Phoneme delay: {phoneme_delay}, Speech delay: {speech_delay}") + logging.info("Phases: Prompt (0 to phoneme_delay) -> Phoneme-only (phoneme_delay to speech_delay) -> Audio") + + # Initialize streaming state + start_time = time.time() + + state = model.streaming_init( + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + inference_mode=inference_mode, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer, + temperature=temperature, + topk=topk, + phoneme_input_type=phoneme_input_type, + gt_phoneme_tokens=gt_phoneme_tokens, + gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, + ) + + init_time = time.time() - start_time + if verbose: + logging.info(f"Streaming init completed in {init_time:.3f}s") + + # Decode and return context 
audio for sanity check + # The context_audio_codes in state have special tokens and are stacked + # We need to remove special tokens and decode them + with torch.inference_mode(): + ctx_codes = state.context_audio_codes.clone() + ctx_codes_lens = state.context_audio_codes_lens.clone() + # Remove special tokens (BOS and EOS) + ctx_codes, ctx_codes_lens = model.remove_special_tokens( + codes=ctx_codes, + codes_len=ctx_codes_lens, + ) + # codes_to_audio will handle unstacking internally + context_audio_decoded, context_audio_decoded_lens, _ = model.codes_to_audio(ctx_codes, ctx_codes_lens) + + # Feed text tokens one at a time + generation_start = time.time() + num_audio_frames = 0 + num_phoneme_frames = 0 + prompt_phase_tokens = 0 + phoneme_only_phase_tokens = 0 + + for i, token in enumerate(text_tokens): + state, audio_codes, phoneme_tokens = model.streaming_step( + state, text_tokens=token.unsqueeze(0), force_dropout_text=force_dropout_text + ) + + # Track which phase we're in + if audio_codes is None and phoneme_tokens is None: + prompt_phase_tokens += 1 + elif audio_codes is None and phoneme_tokens is not None: + phoneme_only_phase_tokens += 1 + num_phoneme_frames += 1 + else: + if audio_codes is not None: + num_audio_frames += 1 + if phoneme_tokens is not None: + num_phoneme_frames += 1 + + if verbose and (i + 1) % 10 == 0: + phase = "prompt" if audio_codes is None and phoneme_tokens is None else ( + "phoneme-only" if audio_codes is None else "audio" + ) + logging.info( + f"Processed {i + 1}/{len(text_tokens)} text tokens (phase: {phase}), " + f"audio frames: {num_audio_frames}, phoneme frames: {num_phoneme_frames}" + ) + + if state.finished: + if verbose: + logging.info(f"EOS detected at text token {i + 1}") + break + + # Continue generating until finished (text has ended) + continuation_steps = 0 + while not state.finished and continuation_steps < max_steps: + state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=None, 
force_dropout_text=force_dropout_text) + + if audio_codes is not None: + num_audio_frames += 1 + if phoneme_tokens is not None: + num_phoneme_frames += 1 + + continuation_steps += 1 + + if verbose and continuation_steps % 20 == 0: + logging.info( + f"Continuation step {continuation_steps}, " + f"audio frames: {num_audio_frames}, phoneme frames: {num_phoneme_frames}" + ) + + generation_time = time.time() - generation_start + + if verbose: + logging.info(f"Generation completed in {generation_time:.3f}s") + logging.info(f"Prompt phase tokens: {prompt_phase_tokens}") + logging.info(f"Phoneme-only phase tokens: {phoneme_only_phase_tokens}") + logging.info(f"Audio frames generated: {num_audio_frames}") + logging.info(f"Phoneme frames generated: {num_phoneme_frames}") + logging.info(f"Continuation steps: {continuation_steps}") + + # Finalize and get complete audio + output = model.streaming_finalize(state) + + total_time = time.time() - start_time + + if verbose and output.phoneme_text: + logging.info(f"Predicted phoneme text: {output.phoneme_text[0]}") + + timing_info = { + 'init_time': init_time, + 'generation_time': generation_time, + 'total_time': total_time, + 'num_text_tokens': len(text_tokens), + 'prompt_phase_tokens': prompt_phase_tokens, + 'phoneme_only_phase_tokens': phoneme_only_phase_tokens, + 'num_audio_frames': num_audio_frames, + 'num_phoneme_frames': num_phoneme_frames, + 'continuation_steps': continuation_steps, + } + + return output, timing_info, context_audio_decoded, context_audio_decoded_lens + + +def run_batched_streaming_inference( + model: EasyMagpieTTSModel, + context_audios: list[torch.Tensor], + context_audio_lens_list: list[torch.Tensor], + context_texts: list[str], + texts: list[str], + phoneme_texts: Optional[list[str]] = None, + use_gt_phonemes: bool = False, + inference_mode: Optional[str] = None, + use_cfg: bool = False, + cfg_scale: float = 1.5, + use_local_transformer: bool = False, + temperature: float = 0.7, + topk: int = 80, + 
max_steps: int = 500, + verbose: bool = True, + force_dropout_text: bool = False, +) -> tuple: + """ + Run batched streaming TTS inference. + + Each batch item can have different context lengths. The streaming processes + only the minimum context length initially, then continues processing remaining + context per-item in the "context phase" before moving to prompt/audio phases. + + Args: + model: The loaded EasyMagpieTTSModel. + context_audios: List of context audio tensors, each (1, num_samples). + context_audio_lens_list: List of context audio lengths, each (1,). + context_texts: List of context texts for speaker conditioning. + texts: List of main texts to synthesize. + phoneme_texts: Optional list of phoneme texts for GT conditioning. If None, uses texts. + use_gt_phonemes: If True, use GT phonemes as decoder input (teacher forcing). + inference_mode: Inference mode name (e.g., "streaming_4_8"). + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + use_local_transformer: Whether to use local transformer. + temperature: Sampling temperature. + topk: Top-k sampling parameter. + max_steps: Maximum generation steps. + verbose: Whether to print progress. + + Returns: + Tuple of (output, timing_info) where output is StreamingFinalizeOutput. 
+ """ + device = next(model.parameters()).device + batch_size = len(context_audios) + + assert len(context_texts) == batch_size, "Number of context texts must match batch size" + assert len(texts) == batch_size, "Number of texts must match batch size" + + # Encode context audio to codes for each item + context_audio_codes_list = [] + context_audio_codes_lens_list = [] + + with torch.inference_mode(): + for i in range(batch_size): + context_audio = context_audios[i].to(device) + context_audio_lens = context_audio_lens_list[i].to(device) + codes, codes_lens = model.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes_list.append(codes) + context_audio_codes_lens_list.append(codes_lens) + + # Pad and batch context audio codes + max_context_len = max(c.size(-1) for c in context_audio_codes_list) + num_codebooks = context_audio_codes_list[0].size(1) + + context_audio_codes = torch.zeros(batch_size, num_codebooks, max_context_len, dtype=torch.long, device=device) + context_audio_codes_lens = torch.zeros(batch_size, dtype=torch.long, device=device) + + for i in range(batch_size): + codes = context_audio_codes_list[i] + codes_len = context_audio_codes_lens_list[i] + context_audio_codes[i, :, :codes.size(-1)] = codes[0] + context_audio_codes_lens[i] = codes_len[0] + + # Tokenize context texts + tokenizer_name = model.text_conditioning_tokenizer_name + context_text_tokens_list = [] + for ctx_text in context_texts: + tokens = model.tokenizer.encode(ctx_text, tokenizer_name=tokenizer_name) + context_text_tokens_list.append(tokens) + + # Pad and batch context text tokens + max_context_text_len = max(len(t) for t in context_text_tokens_list) + context_text_tokens = torch.zeros(batch_size, max_context_text_len, dtype=torch.long, device=device) + context_text_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) + + for i, tokens in enumerate(context_text_tokens_list): + context_text_tokens[i, :len(tokens)] = torch.tensor(tokens, 
dtype=torch.long, device=device) + context_text_tokens_lens[i] = len(tokens) + + # Tokenize main texts + if hasattr(model.tokenizer, 'tokenizers') and 'english_phoneme' in model.tokenizer.tokenizers: + main_tokenizer_name = 'english_phoneme' + else: + main_tokenizer_name = tokenizer_name + + text_tokens_list = [] + for text in texts: + tokens = model.tokenizer.encode(text, tokenizer_name=main_tokenizer_name) + tokens = tokens + [model.eos_id] + text_tokens_list.append(torch.tensor(tokens, dtype=torch.long, device=device)) + + max_text_len = max(len(t) for t in text_tokens_list) + + # Tokenize phoneme texts if model has phoneme tokenizer + gt_phoneme_tokens = None + gt_phoneme_tokens_lens = None + if model.phoneme_tokenizer is not None: + phoneme_sources = phoneme_texts if phoneme_texts is not None else texts + bos_id = model.phoneme_tokenizer.bos_token_id + eos_id = model.phoneme_tokenizer.eos_token_id + phoneme_tokens_lists = [] + for ptext in phoneme_sources: + tokens = model.phoneme_tokenizer.encode(ptext) + tokens = [bos_id] + tokens + [eos_id] + phoneme_tokens_lists.append(tokens) + max_phoneme_len = max(len(t) for t in phoneme_tokens_lists) + gt_phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) + gt_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) + for i, tokens in enumerate(phoneme_tokens_lists): + gt_phoneme_tokens[i, :len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) + gt_phoneme_tokens_lens[i] = len(tokens) + + phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' + + # Get streaming delays for logging + mode_name = inference_mode or model.default_inference_mode + training_mode = model.mode_name_to_mode.get(mode_name, model.training_modes[0]) + phoneme_delay = training_mode.streaming_phonemes_delay + speech_delay = training_mode.streaming_speech_delay + + if verbose: + logging.info(f"Batch size: {batch_size}") + logging.info(f"Context audio codes shape: 
{context_audio_codes.shape}") + logging.info(f"Context audio codes lens: {context_audio_codes_lens.tolist()}") + logging.info(f"Context text tokens shape: {context_text_tokens.shape}") + logging.info(f"Context text tokens lens: {context_text_tokens_lens.tolist()}") + logging.info(f"Max text tokens: {max_text_len}") + logging.info(f"Text tokens per item: {[len(t) for t in text_tokens_list]}") + if gt_phoneme_tokens is not None: + logging.info(f"GT phoneme tokens shape: {gt_phoneme_tokens.shape}") + logging.info(f"GT phoneme tokens lens: {gt_phoneme_tokens_lens.tolist()}") + logging.info(f"Phoneme input type: {phoneme_input_type}") + logging.info(f"Using inference mode: {mode_name}") + logging.info(f"Phoneme delay: {phoneme_delay}, Speech delay: {speech_delay}") + + # Initialize streaming state + start_time = time.time() + + state = model.streaming_init( + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + inference_mode=inference_mode, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer, + temperature=temperature, + topk=topk, + phoneme_input_type=phoneme_input_type, + gt_phoneme_tokens=gt_phoneme_tokens, + gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, + ) + + init_time = time.time() - start_time + if verbose: + logging.info(f"Streaming init completed in {init_time:.3f}s") + logging.info(f"Initial context_position: {state.context_position.tolist()}") + logging.info(f"Full context lens: {state.full_context_lens.tolist()}") + + # Feed text tokens one at a time + generation_start = time.time() + step_count = 0 + num_audio_frames = 0 + + # Track which items have finished their text + text_positions = torch.zeros(batch_size, dtype=torch.long, device=device) + text_finished_mask = torch.zeros(batch_size, dtype=torch.bool, device=device) + + # Main streaming loop + while not state.finished.all() 
and step_count < max_steps + max_text_len: + # Determine which items are in context phase + in_context_phase = state.context_position < state.full_context_lens + + # Prepare text tokens for this step + # Items in context phase: use 0 (will be ignored) + # Items not in context phase: use their next text token or 0 if text finished + text_tokens_batch = torch.zeros(batch_size, dtype=torch.long, device=device) + + for i in range(batch_size): + if not in_context_phase[i] and not text_finished_mask[i]: + if text_positions[i] < len(text_tokens_list[i]): + text_tokens_batch[i] = text_tokens_list[i][text_positions[i]] + text_positions[i] += 1 + else: + text_finished_mask[i] = True + + # Determine if we should pass None (all items have finished text and exited context) + all_text_done = text_finished_mask.all() and not in_context_phase.any() + + if all_text_done: + state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=None, force_dropout_text=force_dropout_text) + else: + state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=text_tokens_batch, force_dropout_text=force_dropout_text) + + if audio_codes is not None: + num_audio_frames += 1 + + step_count += 1 + + if verbose and step_count % 20 == 0: + in_ctx = state.context_position < state.full_context_lens + logging.info( + f"Step {step_count}: " + f"in_context_phase={in_ctx.tolist()}, " + f"text_positions={text_positions.tolist()}, " + f"audio_frames={num_audio_frames}, " + f"finished={state.finished.tolist()}" + ) + + generation_time = time.time() - generation_start + + if verbose: + logging.info(f"Generation completed in {generation_time:.3f}s") + logging.info(f"Total steps: {step_count}") + logging.info(f"Audio frames generated: {num_audio_frames}") + + # Finalize and get complete audio + output = model.streaming_finalize(state) + + total_time = time.time() - start_time + + if verbose and output.phoneme_text: + for i, ptext in enumerate(output.phoneme_text): + 
logging.info(f"Predicted phoneme text [{i}]: {ptext}") + + timing_info = { + 'init_time': init_time, + 'generation_time': generation_time, + 'total_time': total_time, + 'num_text_tokens': [len(t) for t in text_tokens_list], + 'num_audio_frames': num_audio_frames, + 'total_steps': step_count, + } + + return output, timing_info + + +def main(): + parser = argparse.ArgumentParser( + description="MagpieTTS Streaming Inference Test Script", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # Model loading arguments + model_group = parser.add_argument_group('Model Loading') + model_group.add_argument( + '--hparams_file', + type=str, + default=None, + help='Path to hparams.yaml file', + ) + model_group.add_argument( + '--checkpoint_file', + type=str, + default=None, + help='Path to .ckpt checkpoint file', + ) + model_group.add_argument( + '--nemo_file', + type=str, + default=None, + help='Path to .nemo model file', + ) + model_group.add_argument( + '--codecmodel_path', + type=str, + required=True, + help='Path to audio codec model (.nemo)', + ) + + # Input arguments + input_group = parser.add_argument_group('Input') + input_group.add_argument( + '--context_audio', + type=str, + nargs='+', + required=True, + help='Path(s) to context audio file(s) for speaker cloning. ' + 'Multiple files enable batched inference.', + ) + input_group.add_argument( + '--context_text', + type=str, + nargs='+', + default=["[NO TEXT CONTEXT]"], + help='Context text(s) for speaker conditioning. Provide one per context audio, ' + 'or a single value to use for all. (default: "[NO TEXT CONTEXT]")', + ) + input_group.add_argument( + '--context_duration', + type=float, + nargs='+', + default=[5.0], + help='Target duration(s) for context audio in seconds. Provide one per context audio, ' + 'or a single value to use for all. If audio is longer, ' + 'first N seconds are used. If shorter, audio is repeated. 
(default: 5.0)', + ) + input_group.add_argument( + '--text', + type=str, + nargs='+', + required=True, + help='Text(s) to synthesize. Provide one per context audio for batched inference.', + ) + input_group.add_argument( + '--phoneme_text', + type=str, + nargs='+', + default=None, + help='Phoneme text(s) for GT phoneme conditioning. If not provided, uses --text. ' + 'Provide one per context audio for batched inference.', + ) + input_group.add_argument( + '--use_gt_phonemes', + action='store_true', + help='Use ground-truth phonemes as decoder input (teacher forcing). ' + 'If not set, uses model-predicted phonemes.', + ) + + # Output arguments + output_group = parser.add_argument_group('Output') + output_group.add_argument( + '--output_path', + type=str, + default='streaming_output.wav', + help='Path for output audio file', + ) + + # Inference arguments + infer_group = parser.add_argument_group('Inference Parameters') + infer_group.add_argument( + '--inference_mode', + type=str, + default=None, + help='Inference mode name (e.g., "streaming_4_8"). 
Uses model default if not specified.', + ) + infer_group.add_argument( + '--use_cfg', + action='store_true', + help='Enable classifier-free guidance', + ) + infer_group.add_argument( + '--cfg_scale', + type=float, + default=1.5, + help='CFG scale factor (higher = stronger conditioning)', + ) + infer_group.add_argument( + '--use_local_transformer', + action='store_true', + help='Use local transformer for inference', + ) + infer_group.add_argument( + '--temperature', + type=float, + default=0.7, + help='Sampling temperature', + ) + infer_group.add_argument( + '--topk', + type=int, + default=80, + help='Top-k sampling parameter', + ) + infer_group.add_argument( + '--max_steps', + type=int, + default=500, + help='Maximum generation steps after text ends', + ) + infer_group.add_argument( + '--device', + type=str, + default='cuda', + choices=['cuda', 'cpu'], + help='Device to run inference on', + ) + infer_group.add_argument( + '--verbose', + action='store_true', + help='Print detailed progress information', + ) + infer_group.add_argument( + '--force_dropout_text', + action='store_true', + help='Force dropout of text embeddings (pass zeros) to test phoneme-only inference', + ) + + args = parser.parse_args() + + # Validate arguments + has_ckpt_mode = args.hparams_file is not None and args.checkpoint_file is not None + has_nemo_mode = args.nemo_file is not None + + if not (has_ckpt_mode or has_nemo_mode): + parser.error("Must provide either (--hparams_file and --checkpoint_file) or --nemo_file") + + # Load model + model = load_model( + hparams_file=args.hparams_file, + checkpoint_file=args.checkpoint_file, + nemo_file=args.nemo_file, + codecmodel_path=args.codecmodel_path, + device=args.device, + ) + + model = model.float() + + # Determine batch size from number of context audios + batch_size = len(args.context_audio) + + # Expand context_text, context_duration, and text to match batch_size + context_texts = args.context_text + if len(context_texts) == 1 and batch_size > 
1: + context_texts = context_texts * batch_size + elif len(context_texts) != batch_size: + parser.error(f"Number of context_texts ({len(context_texts)}) must match number of context_audios ({batch_size}) or be 1") + + context_durations = args.context_duration + if len(context_durations) == 1 and batch_size > 1: + context_durations = context_durations * batch_size + elif len(context_durations) != batch_size: + parser.error(f"Number of context_durations ({len(context_durations)}) must match number of context_audios ({batch_size}) or be 1") + + texts = args.text + if len(texts) == 1 and batch_size > 1: + texts = texts * batch_size + elif len(texts) != batch_size: + parser.error(f"Number of texts ({len(texts)}) must match number of context_audios ({batch_size}) or be 1") + + # Handle phoneme_text - default to text if not provided + phoneme_texts = args.phoneme_text + if phoneme_texts is None: + phoneme_texts = texts + elif len(phoneme_texts) == 1 and batch_size > 1: + phoneme_texts = phoneme_texts * batch_size + elif len(phoneme_texts) != batch_size: + parser.error(f"Number of phoneme_texts ({len(phoneme_texts)}) must match number of context_audios ({batch_size}) or be 1") + + # Load and process context audios + context_audios = [] + context_audio_lens_list = [] + + for i, (audio_path, duration) in enumerate(zip(args.context_audio, context_durations)): + logging.info(f"Loading context audio {i+1}/{batch_size} from: {audio_path}") + audio = load_audio(audio_path, model.sample_rate) + original_duration = audio.size(1) / model.sample_rate + logging.info(f" Original duration: {original_duration:.2f}s") + + # Adjust to target duration (aligned to codec frame boundaries) + audio = adjust_audio_to_duration(audio, model.sample_rate, duration, model.codec_model_samples_per_frame) + adjusted_duration = audio.size(1) / model.sample_rate + logging.info(f" Adjusted duration: {adjusted_duration:.2f}s (target: {duration}s, codec-aligned)") + + context_audios.append(audio) + 
context_audio_lens_list.append(torch.tensor([audio.size(1)], dtype=torch.long)) + + logging.info(f"\nBatch size: {batch_size}") + logging.info(f"Context texts: {context_texts}") + logging.info(f"Texts to synthesize: {texts}") + logging.info(f"Phoneme texts: {phoneme_texts}") + logging.info(f"Use GT phonemes: {args.use_gt_phonemes}") + + # Use single-sample or batched inference + if batch_size == 1: + logging.info("\n=== Running single-sample streaming inference ===") + output, timing_info, context_audio_decoded, context_audio_decoded_lens = run_streaming_inference( + model=model, + context_audio=context_audios[0], + context_audio_lens=context_audio_lens_list[0], + context_text=context_texts[0], + text=texts[0], + phoneme_text=phoneme_texts[0], + use_gt_phonemes=args.use_gt_phonemes, + inference_mode=args.inference_mode, + use_cfg=args.use_cfg, + cfg_scale=args.cfg_scale, + use_local_transformer=args.use_local_transformer, + temperature=args.temperature, + topk=args.topk, + max_steps=args.max_steps, + verbose=args.verbose, + force_dropout_text=args.force_dropout_text, + ) + + # Save output + output_dir = os.path.dirname(args.output_path) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir) + + audio_np = output.audio[0, :output.audio_len[0].item()].cpu().numpy() + sf.write(args.output_path, audio_np, model.output_sample_rate) + logging.info(f"Output saved to: {args.output_path}") + + # Save decoded context audio for sanity check + output_base, output_ext = os.path.splitext(args.output_path) + context_output_path = f"{output_base}_context_decoded{output_ext}" + context_audio_np = context_audio_decoded[0, :context_audio_decoded_lens[0].item()].cpu().numpy() + sf.write(context_output_path, context_audio_np, model.output_sample_rate) + + logging.info(f"Context audio (decoded from codes) saved to: {context_output_path}") + logging.info(f"Context audio duration: {context_audio_decoded_lens[0].item() / model.output_sample_rate:.2f}s") + 
logging.info(f"Audio duration: {output.audio_len[0].item() / model.output_sample_rate:.2f}s") + logging.info(f"Generated codes shape: {output.audio_codes.shape}") + if output.phoneme_text: + logging.info(f"Predicted phoneme text: {output.phoneme_text[0]}") + + # Print timing summary + logging.info("\n=== Timing Summary ===") + logging.info(f"Init time: {timing_info['init_time']:.3f}s") + logging.info(f"Generation time: {timing_info['generation_time']:.3f}s") + logging.info(f"Total time: {timing_info['total_time']:.3f}s") + logging.info(f"Text tokens processed: {timing_info['num_text_tokens']}") + logging.info(f" - Prompt phase tokens: {timing_info['prompt_phase_tokens']}") + logging.info(f" - Phoneme-only phase tokens: {timing_info['phoneme_only_phase_tokens']}") + logging.info(f"Audio frames generated: {timing_info['num_audio_frames']}") + logging.info(f"Phoneme frames generated: {timing_info['num_phoneme_frames']}") + logging.info(f"Continuation steps: {timing_info['continuation_steps']}") + + # Calculate RTF + audio_duration = output.audio_len[0].item() / model.output_sample_rate + rtf = audio_duration / timing_info['total_time'] + logging.info(f"Real-time factor (RTF): {rtf:.2f}x") + + else: + logging.info(f"\n=== Running batched streaming inference (batch_size={batch_size}) ===") + output, timing_info = run_batched_streaming_inference( + model=model, + context_audios=context_audios, + context_audio_lens_list=context_audio_lens_list, + context_texts=context_texts, + texts=texts, + phoneme_texts=phoneme_texts, + use_gt_phonemes=args.use_gt_phonemes, + inference_mode=args.inference_mode, + use_cfg=args.use_cfg, + cfg_scale=args.cfg_scale, + use_local_transformer=args.use_local_transformer, + temperature=args.temperature, + topk=args.topk, + max_steps=args.max_steps, + verbose=args.verbose, + force_dropout_text=args.force_dropout_text, + ) + + # Save outputs for each batch item + output_dir = os.path.dirname(args.output_path) + if output_dir and not 
os.path.exists(output_dir): + os.makedirs(output_dir) + + output_base, output_ext = os.path.splitext(args.output_path) + + for i in range(batch_size): + output_path_i = f"{output_base}_{i}{output_ext}" + audio_np = output.audio[i, :output.audio_len[i].item()].cpu().numpy() + sf.write(output_path_i, audio_np, model.output_sample_rate) + audio_duration_i = output.audio_len[i].item() / model.output_sample_rate + logging.info(f"Output {i+1}/{batch_size} saved to: {output_path_i} (duration: {audio_duration_i:.2f}s)") + if output.phoneme_text and i < len(output.phoneme_text): + logging.info(f" Predicted phoneme text: {output.phoneme_text[i]}") + + logging.info(f"\nGenerated codes shape: {output.audio_codes.shape}") + + # Print timing summary + logging.info("\n=== Timing Summary ===") + logging.info(f"Init time: {timing_info['init_time']:.3f}s") + logging.info(f"Generation time: {timing_info['generation_time']:.3f}s") + logging.info(f"Total time: {timing_info['total_time']:.3f}s") + logging.info(f"Text tokens per item: {timing_info['num_text_tokens']}") + logging.info(f"Audio frames generated: {timing_info['num_audio_frames']}") + logging.info(f"Total steps: {timing_info['total_steps']}") + + # Calculate average RTF + total_audio_duration = sum(output.audio_len[i].item() for i in range(batch_size)) / model.output_sample_rate + avg_rtf = total_audio_duration / timing_info['total_time'] + logging.info(f"Average real-time factor (RTF): {avg_rtf:.2f}x") + logging.info(f"Total audio duration (all items): {total_audio_duration:.2f}s") + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 09a6491b364a..1351b8409417 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -15,7 +15,7 @@ import time from dataclasses import dataclass from functools import partial -from typing import Dict, List, Optional, Sequence, Tuple +from 
typing import Any, Dict, List, Optional, Sequence, Tuple import torch import wandb @@ -68,38 +68,6 @@ class TrainingMode: mode_idx: int -@dataclass -class ContextTensors: - """ - Output dataclass from prepare_context_tensors containing all context-related tensors. - - Attributes: - context_embedding: Combined context embedding tensor (B, T_total, E) - context_lens: Length of context for each batch item (B,) - context_audio_codes: Audio codes for context audio (B, C, T') - context_audio_embedded: Embedded context audio codes (B, T', E) - context_audio_codes_lens: Length of context audio codes (B,) - text_embedded: Embedded text tokens (B, L, E) - text_lens: Length of text for each batch item (B,) - context_text_tokens: Context text token IDs (B, L) - context_text_lens: Length of context text (B,) - remaining_text_embedded: Embedded remaining text for streaming mode, None otherwise (B, T, E) - remaining_text_lens: Length of remaining text for streaming mode, None otherwise (B,) - """ - - context_embedding: torch.Tensor - context_lens: torch.Tensor - context_audio_codes: torch.Tensor - context_audio_embedded: torch.Tensor - context_audio_codes_lens: torch.Tensor - text_embedded: torch.Tensor - text_lens: torch.Tensor - context_text_tokens: torch.Tensor - context_text_lens: torch.Tensor - remaining_text_embedded: Optional[torch.Tensor] - remaining_text_lens: Optional[torch.Tensor] - - @dataclass class ProcessBatchOutput: """ @@ -132,6 +100,120 @@ class ProcessBatchOutput: selected_training_mode: Optional[str] = None +@dataclass +class StreamingState: + """ + State for streaming TTS inference with batch support. + + This dataclass maintains all the necessary state for autoregressive streaming + generation, allowing text tokens to be fed incrementally. Supports arbitrary + batch sizes where each batch item can have different context lengths and be + in different phases. + + The streaming operates in four phases (per batch item): + 1. 
Context phase (context_position < full_context_lens): Processing remaining context + 2. Prompt phase (text_tokens_seen < phoneme_delay): Only text, no predictions + 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): Phoneme predictions only + 4. Audio phase (text_tokens_seen >= speech_delay): Both phoneme and audio predictions + + Attributes: + batch_size: Number of items in the batch. + past_key_values: KV cache from the transformer for efficient autoregressive decoding. + cache_seq_len: Current sequence length in the cache. + all_predictions: List of predicted audio codes at each timestep, each tensor is (B, C, S) unstacked. + all_phoneme_predictions: List of predicted phoneme tokens at each timestep, each tensor is (B, phoneme_stacking_factor). + context_audio_codes: Processed context audio codes with special tokens. + context_audio_codes_lens: Length of context audio codes. + context_lens: Total context length (task_embedding + context_audio + context_text). + full_context_embedding: Full context embedding for each batch item (B, T_max_context, E). + full_context_lens: Full context length for each batch item (B,). + context_position: How much context has been processed per batch item (B,). + text_tokens_seen: Number of text tokens processed so far per batch item (B,). + phoneme_steps: Number of phoneme prediction steps taken per batch item (B,). + audio_steps: Number of audio prediction steps taken per batch item (B,). + phoneme_stream_ended: Whether the phoneme stream has ended per batch item (B,) bool tensor. + finished: Whether generation is complete per batch item (B,) bool tensor. + device: Device tensors are on. + training_mode: The training mode being used for inference. + use_cfg: Whether classifier-free guidance is enabled. + cfg_scale: CFG scale factor. + use_local_transformer: Whether to use local transformer for inference. + temperature: Sampling temperature. + topk: Top-k sampling parameter. 
+ dummy_context_embedding_unconditional: Unconditional embedding for CFG (if enabled). + last_hidden: Last hidden state from transformer. + text_finished: Whether text input has finished per batch item (B,) bool tensor. + phoneme_input_type: 'gt' or 'pred' for phoneme tokens. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + last_phoneme_tokens: Last predicted phoneme tokens (B, phoneme_stacking_factor). + last_audio_codes: Last predicted audio codes (B, num_codebooks). + audio_prediction_start_idx: Global frame index where audio predictions start per batch item (B,). + audio_prediction_end_idx: Global frame index where audio predictions end per batch item (B,), -1 if not ended. + phoneme_prediction_start_idx: Global step index where phoneme predictions start per batch item (B,). + phoneme_prediction_end_idx: Global step index where phoneme predictions end per batch item (B,), -1 if not ended. + """ + + batch_size: int + past_key_values: Optional[Tuple] + cache_seq_len: int + all_predictions: List[torch.Tensor] + all_phoneme_predictions: List[torch.Tensor] + context_audio_codes: torch.Tensor + context_audio_codes_lens: torch.Tensor + context_lens: torch.Tensor + full_context_embedding: torch.Tensor + full_context_lens: torch.Tensor + context_position: torch.Tensor + text_tokens_seen: torch.Tensor + phoneme_steps: torch.Tensor + audio_steps: torch.Tensor + phoneme_stream_ended: torch.Tensor + finished: torch.Tensor + device: torch.device + training_mode: TrainingMode + use_cfg: bool + cfg_scale: float + use_local_transformer: bool + temperature: float + topk: int + dummy_context_embedding_unconditional: Optional[torch.Tensor] + last_hidden: torch.Tensor + text_finished: torch.Tensor + phoneme_input_type: str + phoneme_sampling_method: str + last_phoneme_tokens: Optional[torch.Tensor] + last_audio_codes: Optional[torch.Tensor] + audio_prediction_start_idx: torch.Tensor + audio_prediction_end_idx: torch.Tensor + 
phoneme_prediction_start_idx: torch.Tensor + phoneme_prediction_end_idx: torch.Tensor + gt_phoneme_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT embeddings + gt_phoneme_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking + + +@dataclass +class StreamingFinalizeOutput: + """Output from streaming_finalize containing audio and phoneme predictions.""" + + audio: torch.Tensor # (B, max_audio_len) generated audio waveform + audio_len: torch.Tensor # (B,) length of audio per batch item + audio_codes: torch.Tensor # (B, num_codebooks, T) generated audio codes + audio_codes_len: torch.Tensor # (B,) length of codes per batch item + phoneme_tokens: List[List[int]] # List of phoneme token sequences per batch item + phoneme_text: List[str] # Decoded phoneme strings per batch item + + +@dataclass +class InferBatchOutput: + """Output dataclass for EasyMagpieTTS infer_batch method.""" + + predicted_audio: torch.Tensor # (B, T_audio) + predicted_audio_lens: torch.Tensor # (B,) + predicted_codes: torch.Tensor # (B, num_codebooks, T_frames) + predicted_codes_lens: torch.Tensor # (B,) + rtf_metrics: Dict[str, Any] + + def worker_init_fn(worker_id): # For mp.set_start_method("spawn", force=True) # The dataset class should be picklable, so we initialize non-picklable objects here @@ -885,13 +967,13 @@ def log_val_audio_example( wandb_audio_log[f"Audio/Example_{idx}"] = list() if context_audio_np is not None: wandb_audio_log[f"Audio/Example_{idx}"].append( - wandb.Audio(context_audio_np, sample_rate=self.sample_rate, caption="context") + wandb.Audio(context_audio_np, sample_rate=self.output_sample_rate, caption="context") ) wandb_audio_log[f"Audio/Example_{idx}"].append( - wandb.Audio(pred_audio_np, sample_rate=self.sample_rate, caption="prediction") + wandb.Audio(pred_audio_np, sample_rate=self.output_sample_rate, caption="prediction") ) wandb_audio_log[f"Audio/Example_{idx}"].append( - wandb.Audio(target_audio_np, sample_rate=self.sample_rate, 
caption="target") + wandb.Audio(target_audio_np, sample_rate=self.output_sample_rate, caption="target") ) if is_tb: @@ -900,19 +982,19 @@ def log_val_audio_example( f'Example_{idx}/context', context_audio_np, global_step=self.global_step, - sample_rate=self.sample_rate, + sample_rate=self.output_sample_rate, ) logger.experiment.add_audio( f'Example_{idx}/prediction', pred_audio_np, global_step=self.global_step, - sample_rate=self.sample_rate, + sample_rate=self.output_sample_rate, ) logger.experiment.add_audio( f'Example_{idx}/target', target_audio_np, global_step=self.global_step, - sample_rate=self.sample_rate, + sample_rate=self.output_sample_rate, ) return wandb_audio_log @@ -977,27 +1059,21 @@ def join_embeddings_temporally( def prepare_context_tensors( self, - text: torch.Tensor, - text_lens: torch.Tensor, context_text_tokens: torch.Tensor, context_text_tokens_lens: torch.Tensor, context_audio_codes: Optional[torch.Tensor] = None, context_audio_codes_lens: Optional[torch.Tensor] = None, context_audio: Optional[torch.Tensor] = None, context_audio_lens: Optional[torch.Tensor] = None, - dropout_text_input: bool = False, training_mode: Optional[TrainingMode] = None, - ) -> ContextTensors: + dropout_conditional_input: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ - Prepare context tensors for the EasyMagpieTTS model. - - This function processes the input text, context audio, and context text to create - the combined context embedding that will be fed to the transformer decoder. It handles - both 'full' and 'streaming' text input modes. + Prepare context tensors (without text) for the simplified process_batch. + This function processes context audio and context text to create the combined + context embedding. 
Args: - text: Input text token IDs (B, L) - text_lens: Length of text for each batch item (B,) context_text_tokens: Context text token IDs for speaker/style conditioning (B, L) context_text_tokens_lens: Length of context text for each batch item (B,) context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). @@ -1008,57 +1084,24 @@ def prepare_context_tensors( Used to compute context_audio_codes if not provided. context_audio_lens: Length of context audio (B,). Required if context_audio is provided. - dropout_text_input: If True, zero out the text embedding for classifier-free guidance. training_mode: Optional TrainingMode object specifying the mode to use. If None, uses the first mode from training_modes as default. + dropout_conditional_input: If True, replace context with CFG unconditional token. Returns: - ContextTensors: A dataclass containing all prepared context tensors including: - - context_embedding: Combined context embedding (B, T_total, E) + Tuple of: + - context_embedding: Combined context embedding (B, T_context, E) - context_lens: Total context length per batch item (B,) - context_audio_codes: Processed audio codes with special tokens (B, C, T') - - context_audio_embedded: Embedded context audio (B, T', E) - context_audio_codes_lens: Length of processed context audio codes (B,) - - text_embedded: Embedded text tokens (B, L, E) - - text_lens: Text length per batch item (B,) - - context_text_tokens: Context text token IDs (B, L) - - context_text_lens: Context text length per batch item (B,) - - remaining_text_embedded: For streaming mode, embedded remaining text (B, T, E) - - remaining_text_lens: For streaming mode, remaining text length (B,) - - Raises: - ValueError: If neither context_audio_codes nor context_audio is provided. - ValueError: If text_input_mode is not 'full' or 'streaming'. 
""" # Determine the mode parameters to use - # If no mode is specified, use the first (default) mode if training_mode is None: training_mode = self.training_modes[0] - current_text_input_mode = training_mode.text_input_mode - current_streaming_speech_delay = training_mode.streaming_speech_delay - current_streaming_phonemes_delay = training_mode.streaming_phonemes_delay current_mode_idx = training_mode.mode_idx - - text_embedded = self.decoder.get_input_embeddings()(text) - if self.use_bpe_char_tokenizer: - text_mask = get_mask_from_lengths(text_lens) - cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) - text_embedded = text_embedded + cas_embedding - - if text_embedded.shape[1] < current_streaming_speech_delay + 1: - # If text is too short, pad it with zeros - padding_tensor = torch.zeros( - text_embedded.shape[0], - current_streaming_speech_delay + 1 - text_embedded.shape[1], - text_embedded.shape[2], - device=text_embedded.device, - ) - text_embedded = torch.cat([text_embedded, padding_tensor], dim=1) - - if dropout_text_input: - # Make text embedding all zeros - text_embedded = text_embedded * 0.0 + batch_size = context_text_tokens.size(0) + device = context_text_tokens.device # Context Audio if context_audio_codes is None: @@ -1093,63 +1136,237 @@ def prepare_context_tensors( context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) # Prepare task embedding for multi-mode training - # Only use task embedding if there are multiple modes (task_embedding is not None) task_embedding = None task_embedding_lens = None if self.task_embedding is not None and current_mode_idx is not None: - batch_size = text.size(0) - mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=text.device) + mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=device) task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) - task_embedding_lens 
= torch.ones(batch_size, dtype=torch.long, device=text.device) # (B,) + task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=device) # (B,) - remaining_text_embedded = None - remaining_text_lens = None - if current_text_input_mode == 'full': - if task_embedding is not None: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[task_embedding, context_audio_embedded, context_text_embedded, text_embedded], - lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens, text_lens], - ) - else: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded, text_embedded], - lengths=[context_audio_codes_lens, context_text_lens, text_lens], - ) - elif current_text_input_mode == 'streaming': - prompt_text_embedded = text_embedded[:, :current_streaming_speech_delay, :] - prompt_text_lens = torch.ones_like(text_lens) * current_streaming_speech_delay - if task_embedding is not None: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[task_embedding, context_audio_embedded, context_text_embedded, prompt_text_embedded], - lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens, prompt_text_lens], - ) - else: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded, prompt_text_embedded], - lengths=[context_audio_codes_lens, context_text_lens, prompt_text_lens], - ) - remaining_text_embedded = text_embedded[:, current_streaming_speech_delay:, :] - remaining_text_lens = text_lens - current_streaming_speech_delay - remaining_text_lens = remaining_text_lens.clamp(min=0) - remaining_text_mask = get_mask_from_lengths(remaining_text_lens) - remaining_text_embedded = remaining_text_embedded * remaining_text_mask.unsqueeze(2) # (B, T, E) + # Combine context embeddings: [task_embedding | context_audio | context_text] + if task_embedding is not 
None: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[task_embedding, context_audio_embedded, context_text_embedded], + lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens], + ) else: - raise ValueError(f"Invalid text input mode: {current_text_input_mode}") + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded], + lengths=[context_audio_codes_lens, context_text_lens], + ) - return ContextTensors( - context_embedding=context_embedding, - context_lens=context_lens, - context_audio_codes=context_audio_codes, - context_audio_embedded=context_audio_embedded, - context_audio_codes_lens=context_audio_codes_lens, - text_embedded=text_embedded, - text_lens=text_lens, - context_text_tokens=context_text_tokens, - context_text_lens=context_text_lens, - remaining_text_embedded=remaining_text_embedded, - remaining_text_lens=remaining_text_lens, + # Handle CFG unconditional dropout + if dropout_conditional_input: + cfg_token_id = self.cfg_unk_token_id + cfg_token_embedding = self.decoder.get_input_embeddings()( + torch.full((batch_size, 1), cfg_token_id, device=device) + ) # (B, 1, E) + # Expand CFG token to match context embedding size + context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_context, E) + + return context_embedding, context_lens, context_audio_codes, context_audio_codes_lens + + def prepare_text_channel_embeddings( + self, + text: torch.Tensor, + text_lens: torch.Tensor, + delay: torch.Tensor, + dropout_text_input: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Prepare text embeddings as a channel input with delay handling. + + This function embeds text tokens and prepends zero-padding based on the delay + parameter. The delay represents the number of zero positions to prepend before + the text embeddings, aligning the text channel with other channels. 
+ + Args: + text: Input text token IDs (B, L) + text_lens: Length of text for each batch item (B,) + delay: Number of zero positions to prepend for each batch item (B,). + For text channel, this is typically just context_lens. + dropout_text_input: If True, return all zeros (for text dropout regularization). + + Returns: + Tuple of: + - text_channel_embedding: Text embeddings with zero-padded delay (B, T_delay + T_text, E) + - text_channel_lens: Total length of text channel for each batch item (B,) + """ + batch_size = text.size(0) + device = text.device + + # Embed text tokens + text_embedded = self.decoder.get_input_embeddings()(text) # (B, L, E) + + # Apply CAS encoding if using BPE char tokenizer + if self.use_bpe_char_tokenizer: + text_mask = get_mask_from_lengths(text_lens) + cas_embedding = self.cas_encoder(text, subword_mask=text_mask) # (B, L, E) + text_embedded = text_embedded + cas_embedding + + # Handle text dropout - zero out the embeddings + if dropout_text_input: + text_embedded = text_embedded * 0.0 + + # Create zero tensor for delay padding + max_delay = delay.max().item() + zero_delay_tensor = torch.zeros( + batch_size, max_delay, self.cfg.embedding_dim, device=device + ) + + # Join delay zeros with text embeddings + text_channel_embedding, text_channel_lens = self.join_embeddings_temporally( + embeddings=[zero_delay_tensor, text_embedded], + lengths=[delay, text_lens], + ) + + return text_channel_embedding, text_channel_lens + + def prepare_phoneme_channel_embeddings( + self, + phoneme_tokens: torch.Tensor, + phoneme_tokens_lens: torch.Tensor, + delay: torch.Tensor, + dropout_phoneme_input: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Prepare phoneme embeddings as a channel input with delay handling. + + This function stacks phoneme tokens (if configured), embeds them, and prepends + zero-padding based on the delay parameter. 
The delay represents the number of + zero positions to prepend before the phoneme embeddings. + + Args: + phoneme_tokens: Phoneme token IDs (B, L) + phoneme_tokens_lens: Length of phoneme tokens for each batch item (B,) + delay: Number of zero positions to prepend for each batch item (B,). + This is typically context_lens + phoneme_delay. + dropout_phoneme_input: If True, return all zeros (for phoneme dropout regularization). + + Returns: + Tuple of: + - phoneme_channel_embedding: Phoneme embeddings with zero-padded delay (B, T_delay + T_phoneme, E) + - phoneme_channel_lens: Total length of phoneme channel for each batch item (B,) + - phoneme_tokens_stacked: Stacked phoneme tokens (B, S, T') + - phoneme_tokens_lens_stacked: Length of stacked phoneme tokens (B,) + """ + batch_size = phoneme_tokens.size(0) + device = phoneme_tokens.device + + # Stack phoneme tokens + phoneme_tokens_expanded = phoneme_tokens.unsqueeze(1) # (B, 1, L) + phoneme_tokens_stacked, phoneme_tokens_lens_stacked = self.stack_codes( + phoneme_tokens_expanded, + phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1, ) + # Embed phoneme tokens + phoneme_embedded = self.embed_phoneme_tokens(phoneme_tokens_stacked) # (B, T', E) + + # Apply mask to zero out padding + phoneme_mask = get_mask_from_lengths(phoneme_tokens_lens_stacked) + phoneme_embedded = phoneme_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) + + # Handle phoneme dropout - zero out the embeddings + if dropout_phoneme_input: + phoneme_embedded = phoneme_embedded * 0.0 + + # Create zero tensor for delay padding + max_delay = delay.max().item() + zero_delay_tensor = torch.zeros( + batch_size, max_delay, self.cfg.embedding_dim, device=device + ) + + # Join delay zeros with phoneme embeddings + phoneme_channel_embedding, phoneme_channel_lens = self.join_embeddings_temporally( + embeddings=[zero_delay_tensor, phoneme_embedded], + lengths=[delay, 
phoneme_tokens_lens_stacked], + ) + + return phoneme_channel_embedding, phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked + + def prepare_audio_channel_embeddings( + self, + audio_codes: torch.Tensor, + audio_codes_lens: torch.Tensor, + delay: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Prepare audio embeddings as a channel input with delay handling. + + This function processes audio codes by adding special tokens, stacking them, + and embedding them. It prepends zero-padding based on the delay parameter. + Also prepares input/target split for autoregressive training. + + Args: + audio_codes: Audio codes (B, C, T) - raw codes without special tokens + audio_codes_lens: Length of audio codes for each batch item (B,) + delay: Number of zero positions to prepend for each batch item (B,). + In full mode: context_lens + text_lens + speech_delay + In streaming mode: context_lens + speech_delay + + Returns: + Tuple of: + - audio_channel_embedding: Audio embeddings with zero-padded delay (B, T_delay + T_audio, E) + - audio_channel_lens: Total length of audio channel for each batch item (B,) + - audio_codes_target: Target audio codes for loss computation (B, C, T'-1) + - audio_codes_lens_target: Length of target audio codes (B,) + """ + batch_size = audio_codes.size(0) + device = audio_codes.device + + # Apply codec conversion if configured + if self._codec_converter is not None: + audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=audio_codes, audio_lens=audio_codes_lens + ).long() + + # Add BOS and EOS tokens + audio_codes, audio_codes_lens = self.add_special_tokens( + codes=audio_codes, + codes_len=audio_codes_lens, + bos_id=self.audio_bos_id, + eos_id=self.audio_eos_id, + ) + + # Stack audio codes across codebooks + audio_codes, audio_codes_lens = self.stack_codes( + audio_codes, + audio_codes_lens, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + 
self.num_audio_codebooks, + ) + + # Prepare input and target for autoregressive training + # Input: all tokens except the last (teacher forcing) + # Target: all tokens except the first (shifted by one) + audio_codes_lens_target = audio_codes_lens - 1 + audio_codes_target = audio_codes[:, :, 1:] # (B, C, T'-1) + audio_codes_input = audio_codes[:, :, :-1] # (B, C, T'-1) + + # Embed audio tokens + audio_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T'-1, E) + + # Create zero tensor for delay padding + max_delay = delay.max().item() + zero_delay_tensor = torch.zeros( + batch_size, max_delay, self.cfg.embedding_dim, device=device + ) + + # Join delay zeros with audio embeddings + audio_channel_embedding, audio_channel_lens = self.join_embeddings_temporally( + embeddings=[zero_delay_tensor, audio_embedded], + lengths=[delay, audio_codes_lens_target], + ) + + return audio_channel_embedding, audio_channel_lens, audio_codes_target, audio_codes_lens_target + def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): """ Slices the transformer output to get the predicted embeddings for the target sequence. @@ -1270,346 +1487,259 @@ def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): return x, orig_lens - def prepare_phoneme_channel_input(self, phoneme_tokens, phoneme_tokens_lens, context_lens): - """ - Prepare phoneme tokens as an auxiliary input channel for the decoder. - - This function processes phoneme tokens by stacking them (if configured), embedding them, - and prepending a zero-padded context region. The resulting tensor can be used as an - additional input channel to provide phoneme conditioning to the audio decoder. - - Args: - phoneme_tokens: Phoneme token IDs, shape (B, L) where B is batch size and - L is the phoneme sequence length. - phoneme_tokens_lens: Length of valid phoneme tokens for each batch item, shape (B,). - context_lens: Length of the context region for each batch item, shape (B,). 
- Used to prepend zero-padding to align with audio context. - - Returns: - Tuple of: - - phoneme_channel_input: Embedded phoneme tokens with zero-padded context, - shape (B, T_context + T_phoneme, E) where E is the embedding dimension. - - phoneme_channel_input_lens: Total length of phoneme channel input for each - batch item (context_lens + phoneme_tokens_lens after stacking), shape (B,). - - phoneme_tokens: Stacked phoneme tokens, shape (B, phoneme_stacking_factor, T_stacked). - - phoneme_tokens_lens: Length of stacked phoneme tokens, shape (B,). - """ - phoneme_tokens = phoneme_tokens.unsqueeze(1) # (B, 1, L) - phoneme_tokens, phoneme_tokens_lens = self.stack_codes( - phoneme_tokens, - phoneme_tokens_lens, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.eos_token_id, - self.phoneme_stacking_factor, - 1, - ) - # import ipdb; ipdb.set_trace() - phoneme_tokens_embedded = self.embed_phoneme_tokens(phoneme_tokens) # (B, T', E) - - phoneme_mask = get_mask_from_lengths(phoneme_tokens_lens) - phoneme_tokens_embedded = phoneme_tokens_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) - - zero_context_tensor = torch.zeros( - context_lens.size(0), context_lens.max().item(), self.cfg.embedding_dim, device=phoneme_tokens.device - ) - phoneme_channel_input, phoneme_channel_input_lens = self.join_embeddings_temporally( - embeddings=[zero_context_tensor, phoneme_tokens_embedded], - lengths=[context_lens, phoneme_tokens_lens], - ) - return phoneme_channel_input, phoneme_channel_input_lens, phoneme_tokens, phoneme_tokens_lens - def process_batch( self, text: torch.Tensor, text_lens: torch.Tensor, context_text_tokens: torch.Tensor, context_text_tokens_lens: torch.Tensor, - audio: Optional[torch.Tensor] = None, - audio_lens: Optional[torch.Tensor] = None, - audio_codes: Optional[torch.Tensor] = None, - audio_codes_lens: Optional[torch.Tensor] = None, - context_audio: Optional[torch.Tensor] = None, - context_audio_lens: Optional[torch.Tensor] = None, - 
context_audio_codes: Optional[torch.Tensor] = None, - context_audio_codes_lens: Optional[torch.Tensor] = None, + audio_codes: torch.Tensor, + audio_codes_lens: torch.Tensor, + context_audio_codes: torch.Tensor, + context_audio_codes_lens: torch.Tensor, phoneme_tokens: Optional[torch.Tensor] = None, phoneme_tokens_lens: Optional[torch.Tensor] = None, mode: str = "train", training_mode: Optional[TrainingMode] = None, ) -> ProcessBatchOutput: """ - Process a batch of inputs to compute model outputs and losses. + Simplified batch processing using channel-based embedding architecture. + + This function provides a cleaner implementation of process_batch where: + 1. Context is prepared separately (without text) + 2. Text, phoneme, and audio are each treated as channels with delay-based alignment + 3. Channels are summed element-wise and joined temporally with context - This function performs the following steps: - 1. Prepares context tensors from text and audio inputs - 2. Optionally applies dropout to text/phoneme inputs for regularization - 3. Optionally applies classifier-free guidance (CFG) unconditional training - 4. Converts audio to codes if not already provided - 5. Embeds audio codes and combines with context embeddings - 6. Runs the transformer forward pass - 7. 
Computes codebook loss, phoneme loss (if applicable), and local transformer loss (if applicable) + The delay handling ensures proper temporal alignment: + - Text channel delay: context_lens (no additional delay) + - Phoneme channel delay: context_lens + phoneme_delay + - Audio channel delay: context_lens + text_lens + speech_delay (full mode) + or context_lens + speech_delay (streaming mode) Args: - text: Input text token IDs, shape (B, L) - text_lens: Length of text for each batch item, shape (B,) - context_text_tokens: Context text token IDs for conditioning, shape (B, L_ctx) - context_text_tokens_lens: Length of context text for each batch item, shape (B,) - audio: Raw audio waveform (used if audio_codes not provided), shape (B, T_audio) - audio_lens: Length of audio for each batch item, shape (B,) - audio_codes: Pre-computed audio codes (optional, computed from audio if not provided), shape (B, C, T) - audio_codes_lens: Length of audio codes for each batch item, shape (B,) - context_audio: Raw context audio waveform (optional), shape (B, T_ctx_audio) - context_audio_lens: Length of context audio for each batch item, shape (B,) - context_audio_codes: Pre-computed context audio codes (optional), shape (B, C, T_ctx) - context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) - phoneme_tokens: Phoneme token IDs (required if phoneme_tokenizer is enabled), shape (B, L_phoneme) - phoneme_tokens_lens: Length of phoneme tokens for each batch item, shape (B,) - mode: Training mode, either "train" or "val". Affects dropout behavior. - training_mode: Optional TrainingMode object specifying which mode to use. - If None and multi_mode_training is enabled, a random mode is selected during training. 
+ text: Input text token IDs (B, L) + text_lens: Length of text for each batch item (B,) + context_text_tokens: Context text token IDs for conditioning (B, L_ctx) + context_text_tokens_lens: Length of context text (B,) + audio_codes: Audio codes (B, C, T) - raw codes without special tokens + audio_codes_lens: Length of audio codes (B,) + context_audio_codes: Pre-computed context audio codes (B, C, T') + context_audio_codes_lens: Length of context audio codes (B,) + phoneme_tokens: Phoneme token IDs (optional) (B, L_phoneme) + phoneme_tokens_lens: Length of phoneme tokens (B,) + mode: Training mode, either "train" or "val" + training_mode: Optional TrainingMode object Returns: - ProcessBatchOutput: Dataclass containing: - - loss: Total combined loss - - codebook_loss: Loss for audio codebook prediction - - phoneme_loss: Loss for phoneme prediction (None if not using phonemes) - - local_transformer_loss: Loss from local transformer (None if not used) - - local_transformer_logits: Logits from local transformer - - logits: Predicted logits from the main decoder - - audio_codes_target: Target audio codes - - audio_codes_lens_target: Length of target audio codes - - context_audio_codes: Audio codes from context - - context_audio_codes_lens: Length of context audio codes + ProcessBatchOutput: Contains loss values and model predictions """ - # Select training mode for multi-mode training - # During training, randomly select a mode if not specified - # During validation, use the first mode (default) if not specified + # Select training mode selected_training_mode = training_mode if selected_training_mode is None: if mode == 'train': - # Randomly select a mode during training selected_training_mode = random.choice(self.training_modes) else: - # Use the first mode during validation selected_training_mode = self.training_modes[0] - # Get the current mode's parameters current_text_input_mode = selected_training_mode.text_input_mode current_streaming_speech_delay = 
selected_training_mode.streaming_speech_delay current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay - # Determine whether to apply text/phoneme dropout for regularization during training - # Text dropout: randomly drop text input to encourage the model to rely on other signals + # Determine dropout flags dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False dropout_phoneme_input = (random.random() < self.dropout_phoneme_input_prob) if mode == 'train' else False if dropout_phoneme_input and dropout_text_input: - # Only one of the two can be True, so choose randomly dropout_phoneme_input = random.random() < 0.5 dropout_text_input = not dropout_phoneme_input - # Prepare context tensors by combining text and audio context information - context_tensors = self.prepare_context_tensors( - text=text, - text_lens=text_lens, - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_audio=context_audio, - context_audio_lens=context_audio_lens, - dropout_text_input=dropout_text_input, - training_mode=selected_training_mode, - ) - - # Extract context tensors for use in the forward pass - remaining_text_embedded = context_tensors.remaining_text_embedded - context_embedding = context_tensors.context_embedding - context_lens = context_tensors.context_lens - - # Classifier-Free Guidance (CFG) unconditional training: - # With some probability, replace the context with a special unconditional token - # This allows the model to generate without conditioning during inference + # Determine CFG unconditional dropout dropout_conditional_input = False if mode == 'train' and self.cfg_unconditional_prob > 0.0: if torch.rand(1).item() < self.cfg_unconditional_prob: dropout_conditional_input = True - # Get embedding of a special UNCONDITIONAL_TOKEN - cfg_token_id = 
self.cfg_unk_token_id # int - cfg_token_embedding = self.decoder.get_input_embeddings()( - torch.full((context_embedding.size(0), 1), cfg_token_id, device=context_embedding.device) - ) # (B, 1, E) - # Keeping the dummy context same size as the context embedding makes - # inference easier especially with KV caching and using a duplicated batch. - context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_total, E) - # Make unconditional remaining text embedding all zeros. Simplifies the inference implementation. - if current_text_input_mode == 'streaming': - remaining_text_embedded = torch.zeros_like(remaining_text_embedded) - - # Convert raw audio to discrete codes if codes are not already provided - if audio_codes is None: - audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) - # Apply codec conversion if a converter is configured (e.g., for different codec formats) - if self._codec_converter is not None: - audio_codes = self._codec_converter.convert_original_to_new( - audio_tokens=audio_codes, audio_lens=audio_codes_lens - ).long() + # 1. Prepare context tensors (without text) + context_embedding, context_lens, context_audio_codes_processed, context_audio_codes_lens_processed = ( + self.prepare_context_tensors( + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + training_mode=selected_training_mode, + dropout_conditional_input=dropout_conditional_input, + ) + ) - # Add BOS (beginning of sequence) and EOS (end of sequence) tokens to audio codes - audio_codes, audio_codes_lens = self.add_special_tokens( - codes=audio_codes, - codes_len=audio_codes_lens, - bos_id=self.audio_bos_id, - eos_id=self.audio_eos_id, + # 2. 
Compute delays for each channel based on mode + # Text channel delay: always context_lens + text_delay = context_lens.clone() + + # Phoneme channel delay: context_lens + phoneme_delay (both modes) + phoneme_delay = context_lens + current_streaming_phonemes_delay + + # Audio channel delay depends on mode + if current_text_input_mode == 'full': + # Full mode: context_lens + text_lens + speech_delay + audio_delay = context_lens + text_lens + current_streaming_speech_delay + else: + # Streaming mode: context_lens + speech_delay + audio_delay = context_lens + current_streaming_speech_delay + + # 3. Prepare text channel embeddings + text_channel_embedding, text_channel_lens = self.prepare_text_channel_embeddings( + text=text, + text_lens=text_lens, + delay=text_delay, + dropout_text_input=dropout_text_input or dropout_conditional_input, ) - # Stack audio codes across codebooks for multi-codebook processing - # This reshapes codes for parallel prediction of multiple codebooks - audio_codes, audio_codes_lens = self.stack_codes( - audio_codes, - audio_codes_lens, - self.audio_bos_id, - self.audio_eos_id, - self.frame_stacking_factor, - self.num_audio_codebooks, + # 4. Prepare phoneme channel embeddings (if phoneme tokenizer is configured) + phoneme_channel_embedding = None + phoneme_tokens_stacked = None + phoneme_tokens_lens_stacked = None + if self.phoneme_tokenizer is not None and phoneme_tokens is not None: + ( + phoneme_channel_embedding, + phoneme_channel_lens, + phoneme_tokens_stacked, + phoneme_tokens_lens_stacked, + ) = self.prepare_phoneme_channel_embeddings( + phoneme_tokens=phoneme_tokens, + phoneme_tokens_lens=phoneme_tokens_lens, + delay=phoneme_delay, + dropout_phoneme_input=dropout_phoneme_input or dropout_conditional_input, + ) + + # 5. 
Prepare audio channel embeddings + ( + audio_channel_embedding, + audio_channel_lens, + audio_codes_target, + audio_codes_lens_target, + ) = self.prepare_audio_channel_embeddings( + audio_codes=audio_codes, + audio_codes_lens=audio_codes_lens, + delay=audio_delay, ) - # Prepare input and target sequences for autoregressive training - # Input: all tokens except the last (teacher forcing) - # Target: all tokens except the first (shifted by one position) - audio_codes_lens_input = audio_codes_lens_target = audio_codes_lens - 1 - audio_codes_target = audio_codes[:, :, 1:] # (B, C, T') Target for the decoder - audio_codes_input = audio_codes[:, :, :-1] # (B, C, T') Input to the decoder - - # Embed audio tokens to get continuous representations - audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_input) # (B, T, E) - - # In streaming mode, add remaining text embeddings to audio embeddings - # This provides text information at each audio timestep - if remaining_text_embedded is not None: - # Pad remaining text to match audio sequence length by adding zeros on the right - padding_len = audio_codes_input_embedded.size(1) - remaining_text_embedded.size(1) - if padding_len > 0: - padding_tensor = torch.zeros( - remaining_text_embedded.size(0), - padding_len, - remaining_text_embedded.size(2), - device=remaining_text_embedded.device, - ) - remaining_text_embedded = torch.cat([remaining_text_embedded, padding_tensor], dim=1) - else: - # Log Warning - print( - f"Warning: Remaining text length {remaining_text_embedded.size(1)} is greater than audio codes input length {audio_codes_input_embedded.size(1)}" - ) - remaining_text_embedded = remaining_text_embedded[:, : audio_codes_input_embedded.size(1), :] - # Add text information to audio embeddings (element-wise addition) - audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded - - # Concatenate context embeddings with audio embeddings along the time dimension - # Result: [context_embedding 
| audio_codes_input_embedded] - context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( - embeddings=[context_embedding, audio_codes_input_embedded], - lengths=[context_lens, audio_codes_lens_input], + # 6. Sum the channel embeddings element-wise + # First, align all channels to the same length (max of all channel lengths) + max_channel_len = max( + text_channel_embedding.size(1), + audio_channel_embedding.size(1), + phoneme_channel_embedding.size(1) if phoneme_channel_embedding is not None else 0, ) - # Process phoneme input if phoneme tokenizer is configured - if self.phoneme_tokenizer is not None: - # Compute context length offset for phoneme alignment - # This accounts for different delays in speech vs phoneme streams - # Use the selected mode's streaming delays - context_lens_for_phonemes = ( - context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay + # Pad text channel if needed + if text_channel_embedding.size(1) < max_channel_len: + padding = torch.zeros( + text_channel_embedding.size(0), + max_channel_len - text_channel_embedding.size(1), + text_channel_embedding.size(2), + device=text_channel_embedding.device, ) - - # Prepare phoneme channel input with proper alignment - ( - phoneme_channel_input, - phoneme_channel_input_lens, - phoneme_tokens_processed, - phoneme_tokens_lens_processed, - ) = self.prepare_phoneme_channel_input(phoneme_tokens, phoneme_tokens_lens, context_lens_for_phonemes) - - # Align phoneme channel input to match the combined context+audio sequence length - if phoneme_channel_input.shape[1] < context_plus_audio_embedded.shape[1]: - # Pad phoneme channel with zeros if shorter than context+audio - padding_tensor = torch.zeros( - phoneme_channel_input.shape[0], - context_plus_audio_embedded.shape[1] - phoneme_channel_input.shape[1], - phoneme_channel_input.shape[2], - device=phoneme_channel_input.device, + text_channel_embedding = torch.cat([text_channel_embedding, padding], dim=1) + 
+ # Pad audio channel if needed + if audio_channel_embedding.size(1) < max_channel_len: + padding = torch.zeros( + audio_channel_embedding.size(0), + max_channel_len - audio_channel_embedding.size(1), + audio_channel_embedding.size(2), + device=audio_channel_embedding.device, + ) + audio_channel_embedding = torch.cat([audio_channel_embedding, padding], dim=1) + + # Sum channels + combined_channel_embedding = text_channel_embedding + audio_channel_embedding + + # Add phoneme channel if available + if phoneme_channel_embedding is not None: + if phoneme_channel_embedding.size(1) < max_channel_len: + padding = torch.zeros( + phoneme_channel_embedding.size(0), + max_channel_len - phoneme_channel_embedding.size(1), + phoneme_channel_embedding.size(2), + device=phoneme_channel_embedding.device, ) - phoneme_channel_input = torch.cat([phoneme_channel_input, padding_tensor], dim=1) - else: - # Truncate phoneme channel if longer than context+audio - phoneme_channel_input = phoneme_channel_input[:, : context_plus_audio_embedded.shape[1], :] + phoneme_channel_embedding = torch.cat([phoneme_channel_embedding, padding], dim=1) + combined_channel_embedding = combined_channel_embedding + phoneme_channel_embedding + + # 7. 
Join context with combined channel embeddings + # The combined_channel_lens is the max of all channel lens for each batch item + combined_channel_lens = torch.stack([ + text_channel_lens, + audio_channel_lens, + phoneme_channel_lens if phoneme_channel_embedding is not None else audio_channel_lens, + ], dim=0).max(dim=0).values + + + + # Right pad context embedding + context_padding = torch.zeros( + context_embedding.size(0), + combined_channel_embedding.size(1) - context_embedding.size(1), + context_embedding.size(2), + device=context_embedding.device, + ) + context_embedding_padded = torch.cat([context_embedding, context_padding], dim=1) - # Add phoneme information unless doing unconditional or phoneme dropout training - if (not dropout_conditional_input) and (not dropout_phoneme_input): - context_plus_audio_embedded = context_plus_audio_embedded + phoneme_channel_input + full_embedding = context_embedding_padded + combined_channel_embedding - # Run the transformer forward pass + # 8. Forward pass through transformer transformer_out = self.forward( - inputs_embeds=context_plus_audio_embedded, - attention_mask=get_mask_from_lengths(context_plus_audio_lens), + inputs_embeds=full_embedding, + attention_mask=get_mask_from_lengths(combined_channel_lens), ) transformer_hidden_states = transformer_out.last_hidden_state # (B, T_total, E) - # Extract prediction embeddings by slicing out the audio portion (excluding context) + # 9. 
Extract prediction embeddings and compute losses + # Audio predictions start at audio_delay pred_embeddings = self.slice_pred_embeddings( transformer_hidden_states, - context_lens=context_lens, + context_lens=audio_delay, target_lens=audio_codes_lens_target, ) - # Project embeddings to logits for each codebook - # First project from hidden_dim to audio_embedding_dim, then to logits + # Project to audio logits pred_embeddings_audio = self.audio_out_projection(pred_embeddings) - logits = self.final_proj(pred_embeddings_audio) # (B, T', num_codebooks * num_tokens_per_codebook) + logits = self.final_proj(pred_embeddings_audio) - # Compute the main codebook prediction loss + # Compute codebook loss codebook_loss, _ = self.compute_loss(logits, audio_codes_target, audio_codes_lens_target) loss = codebook_loss - # Compute local transformer loss if using local transformer architecture + # Compute local transformer loss if applicable local_transformer_loss = None local_transformer_logits = None if self.local_transformer_type != LocalTransformerType.NO_LT: assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" - # Compute logits using the local (autoregressive) transformer local_transformer_logits = self.compute_local_transformer_logits( pred_embeddings, audio_codes_target, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( local_transformer_logits, audio_codes_target, audio_codes_lens_target ) - # Scale and add local transformer loss to total loss local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) loss = loss + local_transformer_loss_scale * local_transformer_loss - # Compute phoneme prediction loss if using phoneme tokenizer + # Compute phoneme loss if applicable phoneme_loss = None - if self.phoneme_tokenizer is not None: - # Extract phoneme prediction embeddings with proper alignment + if self.phoneme_tokenizer is not None and phoneme_tokens_stacked is not None: + # Phoneme 
predictions start at phoneme_delay pred_embeddings_phoneme = self.slice_pred_embeddings( transformer_hidden_states, - context_lens=context_lens_for_phonemes, - target_lens=phoneme_tokens_lens_processed - 1, + context_lens=phoneme_delay, + target_lens=phoneme_tokens_lens_stacked - 1, ) - # Project to phoneme logits - phoneme_logits = self.phoneme_final_proj( - pred_embeddings_phoneme - ) # (B, T', phoneme_stacking_factor * phoneme_vocab_size) + phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) - # Only compute phoneme loss if not doing any dropout - # (unconditional, text dropout, or phoneme dropout) if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): phoneme_loss, _ = self.compute_phoneme_loss( - phoneme_logits, phoneme_tokens_processed[:, :, 1:].long(), phoneme_tokens_lens_processed - 1 + phoneme_logits, phoneme_tokens_stacked[:, :, 1:].long(), phoneme_tokens_lens_stacked - 1 ) print("No Dropout - phoneme loss:", phoneme_loss.item()) else: - # Skip phoneme loss computation during dropout training phoneme_loss = torch.tensor(0.0, device=logits.device) print("Dropout - phoneme loss skipped", phoneme_loss.item()) @@ -1624,27 +1754,37 @@ def process_batch( logits=logits, audio_codes_target=audio_codes_target, audio_codes_lens_target=audio_codes_lens_target, - context_audio_codes=context_tensors.context_audio_codes, - context_audio_codes_lens=context_tensors.context_audio_codes_lens, + context_audio_codes=context_audio_codes_processed, + context_audio_codes_lens=context_audio_codes_lens_processed, selected_training_mode=selected_training_mode.name if selected_training_mode is not None else None, ) def training_step(self, batch, batch_idx): - # Extract inputs from batch and pass explicitly to process_batch - # import ipdb; ipdb.set_trace() + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + else: + context_audio = 
batch['context_audio'] + context_audio_lens = batch['context_audio_lens'] + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + + if 'audio_codes' in batch: + audio_codes = batch['audio_codes'] + audio_codes_lens = batch['audio_codes_lens'] + else: + audio = batch['audio'] + audio_lens = batch['audio_lens'] + audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + batch_output = self.process_batch( text=batch['text'], text_lens=batch['text_lens'], context_text_tokens=batch['context_text_tokens'], context_text_tokens_lens=batch['context_text_tokens_lens'], - audio=batch.get('audio'), - audio_lens=batch.get('audio_lens'), - audio_codes=batch.get('audio_codes'), - audio_codes_lens=batch.get('audio_codes_lens'), - context_audio=batch.get('context_audio'), - context_audio_lens=batch.get('context_audio_lens'), - context_audio_codes=batch.get('context_audio_codes'), - context_audio_codes_lens=batch.get('context_audio_codes_lens'), + audio_codes=audio_codes, + audio_codes_lens=audio_codes_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, phoneme_tokens=batch.get('phoneme_tokens'), phoneme_tokens_lens=batch.get('phoneme_tokens_lens'), mode="train", @@ -1709,19 +1849,31 @@ def training_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx): # Extract inputs from batch and pass explicitly to process_batch + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + else: + context_audio = batch['context_audio'] + context_audio_lens = batch['context_audio_lens'] + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + + if 'audio_codes' in batch: + audio_codes = batch['audio_codes'] + audio_codes_lens = batch['audio_codes_lens'] + else: + audio = batch['audio'] + audio_lens = batch['audio_lens'] + 
audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + batch_output = self.process_batch( text=batch['text'], text_lens=batch['text_lens'], context_text_tokens=batch['context_text_tokens'], context_text_tokens_lens=batch['context_text_tokens_lens'], - audio=batch.get('audio'), - audio_lens=batch.get('audio_lens'), - audio_codes=batch.get('audio_codes'), - audio_codes_lens=batch.get('audio_codes_lens'), - context_audio=batch.get('context_audio'), - context_audio_lens=batch.get('context_audio_lens'), - context_audio_codes=batch.get('context_audio_codes'), - context_audio_codes_lens=batch.get('context_audio_codes_lens'), + audio_codes=audio_codes, + audio_codes_lens=audio_codes_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, phoneme_tokens=batch.get('phoneme_tokens'), phoneme_tokens_lens=batch.get('phoneme_tokens_lens'), mode="val", @@ -1897,47 +2049,7 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) - def _log_phoneme_predictions( - self, - pred_phoneme_token_lists: List[List[int]], - gt_phoneme_token_lists: List[List[int]], - batch_size: int, - ) -> None: - """Log predicted vs ground truth phoneme tokens for debugging.""" - for item_idx in range(batch_size): - logging.info(f"Predicted phoneme tokens for item {item_idx}: {pred_phoneme_token_lists[item_idx]}") - logging.info(f"GT phoneme tokens for item {item_idx}: {gt_phoneme_token_lists[item_idx]}") - predicted_phoneme_text = self.phoneme_tokenizer.decode(pred_phoneme_token_lists[item_idx]) - gt_phoneme_text = self.phoneme_tokenizer.decode(gt_phoneme_token_lists[item_idx]) - logging.info(f"Predicted phoneme text for item {item_idx}: {predicted_phoneme_text}") - logging.info(f"GT phoneme text for item {item_idx}: {gt_phoneme_text}") - - def _collect_phoneme_tokens_for_logging( - self, - pred_phoneme_tokens: torch.Tensor, - gt_phoneme_tokens_current: torch.Tensor, - 
use_phoneme_input: torch.Tensor, - pred_phoneme_token_lists: List[List[int]], - gt_phoneme_token_lists: List[List[int]], - batch_size: int, - ) -> None: - """Collect phoneme tokens into lists for later logging (does not print).""" - special_tokens = { - self.phoneme_tokenizer.eos_token_id, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.pad, - } - for item_idx in range(batch_size): - if use_phoneme_input[item_idx, 0, 0] > 0: - for phoneme_channel_idx in range(self.phoneme_stacking_factor): - pred_token = pred_phoneme_tokens[item_idx, phoneme_channel_idx].item() - if pred_token not in special_tokens: - pred_phoneme_token_lists[item_idx].append(pred_token) - - gt_token = gt_phoneme_tokens_current[item_idx, phoneme_channel_idx].item() - if gt_token not in special_tokens: - gt_phoneme_token_lists[item_idx].append(gt_token) - + def _sample_audio_codes( self, last_hidden: torch.Tensor, @@ -1978,235 +2090,67 @@ def _sample_audio_codes( return audio_codes_next, all_codes_next_argmax - def _process_phoneme_predictions( - self, - last_hidden: torch.Tensor, - actual_batch_size: int, - current_phoneme_positions: torch.Tensor, - gt_phoneme_tokens: torch.Tensor, - phoneme_input_type: str, - phoneme_sampling_method: str, - temperature: float, - topk: int, - timestep_idx: int, - device: torch.device, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Process phoneme predictions for the current timestep. 
- - Returns: - pred_phoneme_tokens: Predicted phoneme tokens (B, phoneme_stacking_factor) - gt_phoneme_tokens_current: GT phoneme tokens for current timestep (B, phoneme_stacking_factor) - input_phoneme_tokens_current: Tokens to use as input (GT or predicted) - input_phoneme_embedding: Embedded phoneme tokens (B, phoneme_stacking_factor, E) - """ - # Get phoneme logits and sample - all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) - all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] - - all_codes_next_phoneme = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=temperature, topk=topk - ) - all_codes_next_phoneme_argmax = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=0.01 - ) - - # Select predicted tokens based on sampling method - pred_phoneme_tokens = ( - all_codes_next_phoneme_argmax if phoneme_sampling_method == 'argmax' else all_codes_next_phoneme - ) - - # Handle BOS token at position 0 - phoneme_bos_tensor = torch.full( - (actual_batch_size, self.phoneme_stacking_factor), - self.phoneme_tokenizer.bos_token_id, - device=device, - ).long() - use_bos_phoneme = (current_phoneme_positions == 0).unsqueeze(1).long() - pred_phoneme_tokens = ( - use_bos_phoneme * phoneme_bos_tensor + (1 - use_bos_phoneme) * pred_phoneme_tokens - ).long() - - # Get ground truth phoneme tokens for current timestep - gt_phoneme_idx = min(timestep_idx, gt_phoneme_tokens.size(2) - 1) - gt_phoneme_tokens_current = gt_phoneme_tokens[:, :, gt_phoneme_idx] - - # Select input tokens (GT or predicted) and embed - input_phoneme_tokens_current = gt_phoneme_tokens_current if phoneme_input_type == 'gt' else pred_phoneme_tokens - input_phoneme_embedding = self.embed_phoneme_tokens(input_phoneme_tokens_current.unsqueeze(2)) - - return pred_phoneme_tokens, gt_phoneme_tokens_current, input_phoneme_tokens_current, input_phoneme_embedding - - def _compute_phoneme_channel_input( - self, - 
input_phoneme_embedding: torch.Tensor, - current_phoneme_positions: torch.Tensor, - phoneme_stream_ended: torch.Tensor, - actual_batch_size: int, - device: torch.device, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Compute the phoneme channel input embedding with masking. - - Returns: - phoneme_channel_input_t: Masked phoneme embedding (B, 1, E) - use_phoneme_input: Mask indicating which items should use phoneme input (B, 1, 1) - """ - # Determine which items should use phoneme input - use_phoneme_input = (current_phoneme_positions >= 0) & (~phoneme_stream_ended) - use_phoneme_input = use_phoneme_input.unsqueeze(1).unsqueeze(2).float() - - # Create zero embedding for items not using phoneme input - zero_phoneme_embedding = torch.zeros(actual_batch_size, 1, self.cfg.embedding_dim, device=device) - - # Combine: use phoneme embedding where active, zero otherwise - phoneme_channel_input_t = ( - use_phoneme_input * input_phoneme_embedding + (1 - use_phoneme_input) * zero_phoneme_embedding - ) - - return phoneme_channel_input_t, use_phoneme_input - - def _prepare_next_decoder_input( + def streaming_init( self, - audio_codes_next: torch.Tensor, - context_plus_audio_embedded: torch.Tensor, - context_plus_audio_lens: torch.Tensor, - min_context_len: int, - idx: int, - current_text_input_mode: str, - remaining_text_embedded: Optional[torch.Tensor], - current_text_positions: torch.Tensor, - phoneme_channel_input_t: Optional[torch.Tensor], - use_cfg: bool, - dummy_context_embedding_unconditional: Optional[torch.Tensor], - ) -> torch.Tensor: - """ - Prepare the input embedding for the next decoder step. 
- - Handles: - - Mixing context embeddings with generated audio embeddings based on context completeness - - Adding streaming text embeddings if in streaming mode - - Adding phoneme channel input if available - - Duplicating for CFG if enabled + context_audio_codes: torch.Tensor, + context_audio_codes_lens: torch.Tensor, + context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + inference_mode: Optional[str] = None, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_local_transformer: bool = False, + temperature: float = 0.7, + topk: int = 80, + phoneme_input_type: str = 'predicted', + phoneme_sampling_method: str = 'argmax', + gt_phoneme_tokens: Optional[torch.Tensor] = None, + gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, + ) -> StreamingState: """ - batch_size = audio_codes_next.size(0) - device = audio_codes_next.device - - # Embed the newly generated audio codes - new_emb = self.embed_audio_tokens(audio_codes_next.unsqueeze(2)) # (B, 1, E) - new_emb_unconditional = new_emb.clone() - - # Add streaming text embeddings if in streaming mode - if current_text_input_mode == 'streaming': - remaining_text_idx = current_text_positions.clamp(min=0) - remaining_text_embedded_current = remaining_text_embedded[ - torch.arange(batch_size, device=device), remaining_text_idx, : - ].unsqueeze(1) - new_emb = new_emb + remaining_text_embedded_current - - # Check which items still have context to process - context_incomplete_mask = context_plus_audio_lens > idx + min_context_len - - if context_incomplete_mask.any(): - # Some items still processing context - blend context with generated embeddings - context_incomplete_mask = context_incomplete_mask.unsqueeze(1).unsqueeze(2).float() - context_embedding_slice = context_plus_audio_embedded[ - :, min_context_len + idx : min_context_len + idx + 1, : - ] - next_input = context_incomplete_mask * context_embedding_slice + (1 - context_incomplete_mask) * new_emb - - if phoneme_channel_input_t is not 
None: - next_input = next_input + phoneme_channel_input_t + Initialize streaming TTS inference state. - if use_cfg: - next_input_unconditional = ( - context_incomplete_mask * dummy_context_embedding_unconditional - + (1 - context_incomplete_mask) * new_emb_unconditional - ) - next_input = torch.cat([next_input, next_input_unconditional], dim=0) - else: - # All items finished context - use generated embeddings - next_input = new_emb - if phoneme_channel_input_t is not None: - next_input = next_input + phoneme_channel_input_t + This prepares the model for streaming inference by processing the context + (audio + context text) and returning a StreamingState that can be used + with streaming_step() to incrementally generate audio. - if use_cfg: - next_input = torch.cat([next_input, new_emb_unconditional], dim=0) - - return next_input + Note: This function does NOT take the main text input. Text tokens are + provided incrementally via streaming_step(). - def _check_eos_and_update_end_indices( - self, - all_codes_next_argmax: torch.Tensor, - audio_codes_next: torch.Tensor, - end_indices: Dict[int, int], - context_plus_audio_lens: torch.Tensor, - min_context_len: int, - idx: int, - verbose: bool = False, - ) -> None: - """Check for EOS tokens and update end indices for completed items.""" - for item_idx in range(all_codes_next_argmax.size(0)): - # Only check items that haven't ended and have passed their context - if item_idx not in end_indices and idx + min_context_len > context_plus_audio_lens[item_idx]: - pred_tokens = all_codes_next_argmax[item_idx] - pred_tokens_multinomial = audio_codes_next[item_idx] - - if torch.any(pred_tokens == self.audio_eos_id) or torch.any( - pred_tokens_multinomial == self.audio_eos_id - ): - if verbose: - logging.info(f"EOS detected for item {item_idx} at timestep {idx}") - end_indices[item_idx] = idx + For batched inference, each batch item can have a different context length. 
+ This function processes only up to the minimum context length across the batch, + storing the remaining context to be processed in streaming_step's context phase. - def infer_batch( - self, - batch, - max_decoder_steps=500, - temperature=0.7, - topk=80, - use_local_transformer_for_inference=False, - maskgit_n_steps=3, - use_cfg=False, - cfg_scale=1.0, - phoneme_input_type='gt', - phoneme_sampling_method='argmax', - dropout_text_input=False, - inference_mode: Optional[str] = None, - verbose: bool = False, - ): - """ - Run inference on a batch of inputs to generate audio from text. + The streaming inference follows phases (per batch item): + 1. Context phase: Processing remaining context (if any) for items with longer context. + 2. Prompt phase: First `streaming_speech_delay` text tokens are processed + without generating audio (building up context). + 3. Generation phase: Audio BOS is added and audio codes are generated + autoregressively, with remaining text tokens added to audio embeddings. Args: - batch: Input batch containing: - - text, text_lens: Input text tokens and lengths - - context_text_tokens, context_text_tokens_lens: Context text for speaker/style - - context_audio_codes/context_audio (optional): Audio context for speaker cloning - max_decoder_steps: Maximum number of decoding steps. - temperature: Sampling temperature for audio codes. - topk: Top-k sampling parameter. - use_local_transformer_for_inference: Whether to use local transformer for AR sampling. - maskgit_n_steps: Number of MaskGit steps (unused in AR mode). + context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). + context_audio_codes_lens: Length of context audio codes (B,). + context_text_tokens: Context text token IDs for speaker/style conditioning (B, L). + context_text_tokens_lens: Length of context text (B,). + inference_mode: Name of the inference mode to use (e.g., "streaming_4_8"). + If None, uses the default inference mode. 
use_cfg: Whether to use classifier-free guidance. cfg_scale: CFG scale factor (higher = stronger conditioning). - phoneme_input_type: 'gt' for ground truth or 'pred' for predicted phonemes. + use_local_transformer: Whether to use local transformer for AR sampling. + temperature: Sampling temperature for audio codes. + topk: Top-k sampling parameter. + phoneme_input_type: 'gt' or 'predicted' for phoneme tokens (use 'predicted' for streaming). phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. - dropout_text_input: Whether to dropout text input for CFG training. - inference_mode: Name of the inference mode to use (e.g., "full", "streaming_4_8"). - If None, uses the default inference mode. - verbose: If True, enables detailed logging of decoding progress, EOS detection, - and phoneme predictions. Default False for cleaner output. + gt_phoneme_tokens: Optional GT phoneme tokens (B, L) with BOS/EOS for teacher forcing. + gt_phoneme_tokens_lens: Lengths of GT phoneme tokens (B,). Returns: - predicted_audio: Generated audio waveforms (B, max_audio_len) - predicted_audio_lens: Lengths of generated audio (B,) - predicted_codes: Generated audio codes (B, num_codebooks, T) - predicted_codes_lens: Lengths of generated codes (B,) - rtf_metrics: Dictionary with timing metrics (rtf, time_to_first_prediction, etc.) + StreamingState: Initial state for streaming inference. """ with torch.inference_mode(): - start_time = time.time() + batch_size = context_audio_codes.size(0) + device = context_audio_codes.device # Resolve inference mode mode_name = inference_mode if inference_mode is not None else self.default_inference_mode @@ -2215,306 +2159,775 @@ def infer_batch( raise ValueError(f"Unknown inference mode '{mode_name}'. 
Available modes: {available_modes}") selected_training_mode = self.mode_name_to_mode[mode_name] - if verbose: - logging.info(f"Using inference mode: {selected_training_mode.name}") - - current_text_input_mode = selected_training_mode.text_input_mode - current_streaming_speech_delay = selected_training_mode.streaming_speech_delay - current_streaming_phonemes_delay = selected_training_mode.streaming_phonemes_delay - - # Prepare context embeddings (text + audio context) - context_tensors = self.prepare_context_tensors( - text=batch['text'], - text_lens=batch['text_lens'], - context_text_tokens=batch['context_text_tokens'], - context_text_tokens_lens=batch['context_text_tokens_lens'], - context_audio_codes=batch.get('context_audio_codes'), - context_audio_codes_lens=batch.get('context_audio_codes_lens'), - context_audio=batch.get('context_audio'), - context_audio_lens=batch.get('context_audio_lens'), - dropout_text_input=dropout_text_input, + + # Prepare context embedding using shared helper + context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = self.prepare_context_tensors( + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, training_mode=selected_training_mode, + dropout_conditional_input=False, ) - context_embedding = context_tensors.context_embedding # (B, T_total, E) - context_lens = context_tensors.context_lens # (B,) - remaining_text_embedded = context_tensors.remaining_text_embedded - remaining_text_lens = context_tensors.remaining_text_lens - - actual_batch_size = context_embedding.size(0) - device = context_embedding.device - - # Prepare phoneme channel input if phoneme tokenizer is available - gt_phoneme_tokens = None - if self.phoneme_tokenizer is not None: - context_lens_for_phonemes = ( - context_lens - current_streaming_speech_delay + current_streaming_phonemes_delay - ) - _, _, 
gt_phoneme_tokens, _ = self.prepare_phoneme_channel_input( - batch['phoneme_tokens'], batch['phoneme_tokens_lens'], context_lens_for_phonemes - ) - - # Initialize audio codes with BOS token - audio_codes_bos = torch.full( - (actual_batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), - self.audio_bos_id, - device=device, - ).long() - audio_codes_lens = torch.ones(actual_batch_size, device=device).long() - - audio_codes_input_embedded = self.embed_audio_tokens(audio_codes_bos) # (B, 1, E) - # For streaming mode, add text embeddings to audio BOS - if current_text_input_mode == 'streaming': - remaining_text_pad_length = max_decoder_steps - remaining_text_lens.max().item() + 1 - remaining_text_pad_tensor = torch.zeros( - actual_batch_size, remaining_text_pad_length, remaining_text_embedded.size(2), device=device - ) - remaining_text_embedded = torch.cat([remaining_text_embedded, remaining_text_pad_tensor], dim=1) - audio_codes_input_embedded = audio_codes_input_embedded + remaining_text_embedded[:, :1, :] + # Store full context embedding and lens before any CFG manipulation + full_context_embedding = context_embedding.clone() # (B, T_max, E) + full_context_lens = context_lens.clone() # (B,) - # Combine context and audio embeddings - context_plus_audio_embedded, context_plus_audio_lens = self.join_embeddings_temporally( - embeddings=[context_embedding, audio_codes_input_embedded], - lengths=[context_lens, audio_codes_lens], - ) - min_context_len = context_plus_audio_lens.min().item() - - # Adjust min_context_len for phoneme delay if using phoneme tokenizer - if self.phoneme_tokenizer is not None: - min_context_len = ( - min_context_len - current_streaming_speech_delay + current_streaming_phonemes_delay - 1 - ) + # Compute min context length - we only process up to this in init + min_context_len = context_lens.min().item() # Setup classifier-free guidance if enabled dummy_context_embedding_unconditional = None if use_cfg: - # Create unconditional 
context embedding (all UNK tokens) dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( - torch.full((actual_batch_size, 1), self.cfg_unk_token_id, device=device) + torch.full((1, 1), self.cfg_unk_token_id, device=device) ) - dummy_context_embedding_unconditional_expanded = dummy_context_embedding_unconditional.expand( - -1, context_embedding.size(1), -1 + # Create unconditional context (same length as conditional) + dummy_context_expanded = dummy_context_embedding_unconditional.expand( + batch_size, context_embedding.size(1), -1 ) + # Concatenate conditional and unconditional: (2*B, T, E) + context_embedding = torch.cat([context_embedding, dummy_context_expanded], dim=0) - dummy_context_plus_audio_embedded, _ = self.join_embeddings_temporally( - embeddings=[dummy_context_embedding_unconditional_expanded, audio_codes_input_embedded], - lengths=[context_lens, audio_codes_lens], - ) - # Concatenate conditional and unconditional inputs: (2B, T_min, E) - first_inference_input = torch.cat( - [context_plus_audio_embedded, dummy_context_plus_audio_embedded], dim=0 - )[:, :min_context_len, :] - else: - first_inference_input = context_plus_audio_embedded[:, :min_context_len, :] - - # First forward pass to process all context at once + # First forward pass to process context - only up to min_context_len cache_position = torch.arange(min_context_len, device=device) transformer_out = self.forward( - inputs_embeds=first_inference_input, + inputs_embeds=context_embedding[:, :min_context_len, :], attention_mask=None, use_cache=True, past_key_values=None, cache_position=cache_position, ) - time_to_first_prediction = time.time() - start_time last_hidden = transformer_out.last_hidden_state past_kv = transformer_out.past_key_values current_cache_seq_len = min_context_len - # Initialize decoding state - all_predictions = [] - end_indices = {} # Maps item_idx -> timestep when EOS was detected - - # Track text position for each item in batch - # Negative values 
indicate we haven't started reading remaining text yet - current_text_positions = torch.tensor( - [min_context_len - context_plus_audio_lens[i] for i in range(actual_batch_size)], + # Process GT phoneme tokens if provided (for teacher forcing) + gt_phoneme_embeddings = None + gt_phoneme_lens = None + if gt_phoneme_tokens is not None and gt_phoneme_tokens_lens is not None: + gt_phoneme_expanded = gt_phoneme_tokens.unsqueeze(1) # (B, 1, L) + gt_phoneme_stacked, gt_phoneme_lens = self.stack_codes( + gt_phoneme_expanded, + gt_phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1, + ) + gt_phoneme_embeddings = self.embed_phoneme_tokens(gt_phoneme_stacked) # (B, T', E) + + # Initialize streaming state with batch support + state = StreamingState( + batch_size=batch_size, + past_key_values=past_kv, + cache_seq_len=current_cache_seq_len, + all_predictions=[], + all_phoneme_predictions=[], + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_lens=context_lens, + full_context_embedding=full_context_embedding, + full_context_lens=full_context_lens, + context_position=torch.full((batch_size,), min_context_len, dtype=torch.long, device=device), + text_tokens_seen=torch.zeros(batch_size, dtype=torch.long, device=device), + phoneme_steps=torch.zeros(batch_size, dtype=torch.long, device=device), + audio_steps=torch.zeros(batch_size, dtype=torch.long, device=device), + phoneme_stream_ended=torch.zeros(batch_size, dtype=torch.bool, device=device), + finished=torch.zeros(batch_size, dtype=torch.bool, device=device), device=device, - ).long() + training_mode=selected_training_mode, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer, + temperature=temperature, + topk=topk, + dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, + last_hidden=last_hidden, + text_finished=torch.zeros(batch_size, 
dtype=torch.bool, device=device), + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + last_phoneme_tokens=None, + last_audio_codes=None, + audio_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + audio_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + phoneme_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + phoneme_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + gt_phoneme_embeddings=gt_phoneme_embeddings, + gt_phoneme_lens=gt_phoneme_lens, + ) - # Initialize phoneme tracking state - current_phoneme_positions = None - pred_phoneme_token_lists = [[] for _ in range(actual_batch_size)] - gt_phoneme_token_lists = [[] for _ in range(actual_batch_size)] - phoneme_stream_ended = torch.zeros(actual_batch_size, device=device).bool() + return state - if self.phoneme_tokenizer is not None: - current_phoneme_positions = current_text_positions - current_text_positions.max() - 1 - - # Main autoregressive decoding loop - for idx in range(max_decoder_steps): - # Update position trackers - current_text_positions += 1 - if self.phoneme_tokenizer is not None: - current_phoneme_positions += 1 - - if verbose and idx % 20 == 0: - logging.info(f"Decoding timestep {idx}") - - # Compute audio logits from last hidden state - last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) - all_code_logits_t = self.final_proj(last_hidden_audio) - - # Apply CFG to logits if enabled - if use_cfg: - conditional_logits = all_code_logits_t[:actual_batch_size] - unconditional_logits = all_code_logits_t[actual_batch_size:] - all_code_logits_t = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - - # Sample audio codes - audio_codes_next, all_codes_next_argmax = self._sample_audio_codes( - last_hidden=last_hidden, - all_code_logits_t=all_code_logits_t, - temperature=temperature, - 
topk=topk, - use_local_transformer_for_inference=use_local_transformer_for_inference, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - ) + def streaming_step( + self, + state: StreamingState, + text_tokens: Optional[torch.Tensor] = None, + force_dropout_text: bool = False, + ) -> Tuple[StreamingState, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform one streaming inference step with batch support. + + This function processes one text token per batch item (or signals end of text with None) + and generates predictions according to the streaming delays. Each batch item can be + in a different phase. + + The streaming operates in four phases per batch item: + 1. Context phase (context_position < full_context_lens): + - Still processing remaining context from streaming_init + - Uses context embedding, ignores text_tokens for this item + 2. Prompt phase (text_tokens_seen < phoneme_delay): + - Only text tokens are processed, KV cache is extended + - No phoneme or audio predictions + 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): + - Starts with phoneme BOS on first step + - Only phoneme predictions (no audio) + - Input: text embedding + phoneme embedding + 4. Audio phase (text_tokens_seen >= speech_delay): + - Starts with audio BOS on first step + - Both phoneme and audio predictions + - Input: text embedding + phoneme embedding + audio embedding + + IMPORTANT: Only ONE forward call to the decoder per streaming_step. 
- # Process phoneme predictions if phoneme tokenizer exists - phoneme_channel_input_t = None - if self.phoneme_tokenizer is not None: - ( - pred_phoneme_tokens, - gt_phoneme_tokens_current, - input_phoneme_tokens_current, - input_phoneme_embedding, - ) = self._process_phoneme_predictions( - last_hidden=last_hidden, - actual_batch_size=actual_batch_size, - current_phoneme_positions=current_phoneme_positions, - gt_phoneme_tokens=gt_phoneme_tokens, - phoneme_input_type=phoneme_input_type, - phoneme_sampling_method=phoneme_sampling_method, - temperature=temperature, - topk=topk, - timestep_idx=idx, + Args: + state: Current StreamingState from streaming_init or previous streaming_step. + text_tokens: Next text token for each batch item, shape (B,), or None if text has finished. + For items still in context phase, the text_token value is ignored (can be 0). + When None is passed, the model continues generating until EOS. + + Returns: + Tuple of: + - Updated StreamingState + - Predicted audio codes for this step (B, C, S) unstacked, or None if no items in audio phase + where C = num_audio_codebooks and S = frame_stacking_factor + - Predicted phoneme tokens for this step (B, phoneme_stacking_factor) or None if no items in phoneme phase + """ + if state.finished.all(): + return state, None, None + + with torch.inference_mode(): + device = state.device + batch_size = state.batch_size + streaming_speech_delay = state.training_mode.streaming_speech_delay + streaming_phonemes_delay = state.training_mode.streaming_phonemes_delay + + # ==================== DETERMINE PHASES PER BATCH ITEM ==================== + needs_context = state.context_position < state.full_context_lens # (B,) bool + needs_text = (~needs_context) & (~state.text_finished) + needs_phoneme = (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + needs_audio = (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) + + next_input = torch.zeros(batch_size, 1, 
self.cfg.embedding_dim, device=device) + # --- Context phase items: use next context embedding --- + if needs_context.any(): + # Gather context embeddings at current position for each item + # context_position: (B,) - position indices + # full_context_embedding: (B, T_max, E) + ctx_positions = state.context_position.clone() # (B,) + # Clamp positions to valid range for gathering + ctx_positions = ctx_positions.clamp(max=state.full_context_embedding.size(1) - 1) + # Gather: need (B, 1, E) from (B, T, E) at positions (B,) + ctx_emb = state.full_context_embedding[ + torch.arange(batch_size, device=device), + ctx_positions, + : + ].unsqueeze(1) # (B, 1, E) + # Only apply to items in context phase + context_mask = needs_context.view(batch_size, 1, 1).float() + next_input = next_input + ctx_emb * context_mask + + # --- Non-context phase items: handle text embedding --- + text_embedded = None + if text_tokens is not None and needs_text.any(): + # Embed text tokens for all items (will be masked later) + text_tokens_2d = text_tokens.unsqueeze(1) # (B, 1) + text_embedded = self.decoder.get_input_embeddings()(text_tokens_2d) # (B, 1, E) + + # Handle BPE char tokenizer + if self.use_bpe_char_tokenizer: + text_mask = torch.ones_like(text_tokens_2d, dtype=torch.bool) + cas_embedding = self.cas_encoder(text_tokens_2d, subword_mask=text_mask) # (B, 1, E) + text_embedded = text_embedded + cas_embedding + + if force_dropout_text: + text_embedded = text_embedded * 0 + + text_add_mask = needs_text.view(batch_size, 1, 1).float() + next_input = next_input + text_embedded * text_add_mask + # Check for EOS tokens - mark those items as text_finished + # Items that receive EOS should not have their text embedded added after this step + is_eos_token = (text_tokens == self.eos_id) # (B,) bool + state.text_finished = state.text_finished | is_eos_token + + elif text_tokens is None: + # Text finished signal for items not in context phase + state.text_finished = state.text_finished | 
~needs_context + + # --- Phoneme embedding for phoneme and audio phase items --- + if self.phoneme_tokenizer is not None: + if needs_phoneme.any(): + phoneme_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + + if state.phoneme_input_type == 'gt' and state.gt_phoneme_embeddings is not None: + # Teacher forcing: use pre-computed GT phoneme embeddings + # Only use GT embedding if within valid length, otherwise zero + within_gt_len = state.phoneme_steps < state.gt_phoneme_lens # (B,) + positions = state.phoneme_steps.clamp(max=state.gt_phoneme_embeddings.size(1) - 1) + gt_emb = state.gt_phoneme_embeddings[ + torch.arange(batch_size, device=device), positions, : + ].unsqueeze(1) # (B, 1, E) + phoneme_mask = (needs_phoneme & within_gt_len).view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + gt_emb * phoneme_mask + else: + # Prediction mode: use BOS or last predicted phoneme + first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) + has_last_phoneme = needs_phoneme & ~first_phoneme_step & (state.last_phoneme_tokens is not None) + + if first_phoneme_step.any(): + phoneme_bos = torch.full( + (batch_size, self.phoneme_stacking_factor, 1), + self.phoneme_tokenizer.bos_token_id, + device=device, + ).long() + phoneme_bos_emb = self.embed_phoneme_tokens(phoneme_bos) # (B, 1, E) + first_mask = first_phoneme_step.view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + phoneme_bos_emb * first_mask + + if has_last_phoneme.any() and state.last_phoneme_tokens is not None: + last_phoneme_emb = self.embed_phoneme_tokens(state.last_phoneme_tokens.unsqueeze(2)) # (B, 1, E) + last_mask = has_last_phoneme.view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask + + next_input = next_input + phoneme_emb + + # --- Audio embedding for audio phase items --- + if needs_audio.any(): + # Determine which items are at first audio step + first_audio_step = needs_audio & (state.audio_steps == 0) + has_last_audio = 
needs_audio & ~first_audio_step & (state.last_audio_codes is not None) + + audio_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + + if first_audio_step.any(): + # Create BOS for items at first audio step + audio_bos = torch.full( + (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), + self.audio_bos_id, device=device, + ).long() + audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) + first_mask = first_audio_step.view(batch_size, 1, 1).float() + audio_emb = audio_emb + audio_bos_emb * first_mask + + if has_last_audio.any() and state.last_audio_codes is not None: + # Use last predicted audio + last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) + last_mask = has_last_audio.view(batch_size, 1, 1).float() + audio_emb = audio_emb + last_audio_emb * last_mask + + next_input = next_input + audio_emb + + # ==================== HANDLE CFG ==================== + if state.use_cfg: + # For unconditional branch, use dummy embedding for non-audio items + # and audio-only embedding for audio items + next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand(batch_size, 1, -1) + # After the context is finished, we use zero embedding for the unconditional branch until audio phase starts + next_input_unconditional_zeros = torch.zeros_like(next_input_unconditional_context) + context_mask = needs_context.view(batch_size, 1, 1).float() + next_input_unconditional = context_mask * next_input_unconditional_context + (1 - context_mask) * next_input_unconditional_zeros + + # For audio phase items, we use audio embedding for the unconditional branch + if needs_audio.any(): + audio_mask = needs_audio.view(batch_size, 1, 1).float() + next_input_unconditional = next_input_unconditional * (1 - audio_mask) + audio_emb * audio_mask + + # Concatenate conditional and unconditional: (2*B, 1, E) + next_input = torch.cat([next_input, next_input_unconditional], dim=0) + + # 
==================== FORWARD PASS ==================== + cache_position = torch.tensor([state.cache_seq_len], device=device) + transformer_out = self.forward( + inputs_embeds=next_input, + attention_mask=None, + use_cache=True, + past_key_values=state.past_key_values, + cache_position=cache_position, + ) + + state.last_hidden = transformer_out.last_hidden_state + state.past_key_values = transformer_out.past_key_values + state.cache_seq_len += 1 + + # ==================== UPDATE STATE ==================== + # Update context_position for items in context phase + state.context_position = state.context_position + needs_context.long() + # Keep updating text_tokens_seen for items once the context is finished + # This is because this counter is used to determine when to start predicting phonemes and audio + state.text_tokens_seen = state.text_tokens_seen + (~needs_context).long() + + # Update phoneme_steps for items in phoneme or audio phase + state.phoneme_steps = state.phoneme_steps + needs_phoneme.long() + + # Update audio_steps for items in audio phase + state.audio_steps = state.audio_steps + needs_audio.long() + + # ==================== PREDICTIONS ==================== + pred_phoneme_tokens = None + audio_codes_next = None + + # Phoneme predictions for items in phoneme or audio phase + if needs_phoneme.any() and self.phoneme_tokenizer is not None: + # Track phoneme prediction start index for items just entering phoneme phase + first_phoneme_step = needs_phoneme & (state.phoneme_prediction_start_idx == -1) + if first_phoneme_step.any(): + current_phoneme_step_idx = len(state.all_phoneme_predictions) # before append + state.phoneme_prediction_start_idx = torch.where( + first_phoneme_step, + torch.full_like(state.phoneme_prediction_start_idx, current_phoneme_step_idx), + state.phoneme_prediction_start_idx ) - # Compute masked phoneme channel input - phoneme_channel_input_t, use_phoneme_input = self._compute_phoneme_channel_input( - 
input_phoneme_embedding=input_phoneme_embedding, - current_phoneme_positions=current_phoneme_positions, - phoneme_stream_ended=phoneme_stream_ended, - actual_batch_size=actual_batch_size, - device=device, + # Check which items should predict phonemes (not ended) + pred_phoneme_tokens = self._predict_phoneme_tokens(state) # (B, phoneme_stacking_factor) + state.last_phoneme_tokens = pred_phoneme_tokens + state.all_phoneme_predictions.append(pred_phoneme_tokens) + + # Check for phoneme EOS per item + phoneme_eos_detected = needs_phoneme & (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1) # (B,) + state.phoneme_stream_ended = state.phoneme_stream_ended | phoneme_eos_detected + + # Track phoneme prediction end index for items that just ended + newly_ended_phoneme = phoneme_eos_detected & (state.phoneme_prediction_end_idx == -1) + if newly_ended_phoneme.any(): + current_phoneme_step_idx = len(state.all_phoneme_predictions) # after append + state.phoneme_prediction_end_idx = torch.where( + newly_ended_phoneme, + torch.full_like(state.phoneme_prediction_end_idx, current_phoneme_step_idx), + state.phoneme_prediction_end_idx ) - # Collect phoneme tokens for logging (no printing here) - self._collect_phoneme_tokens_for_logging( - pred_phoneme_tokens=pred_phoneme_tokens, - gt_phoneme_tokens_current=gt_phoneme_tokens_current, - use_phoneme_input=use_phoneme_input, - pred_phoneme_token_lists=pred_phoneme_token_lists, - gt_phoneme_token_lists=gt_phoneme_token_lists, - batch_size=actual_batch_size, + # Audio predictions for items in audio phase + if needs_audio.any(): + # Track audio prediction start index for items just entering audio phase + first_audio_step = needs_audio & (state.audio_prediction_start_idx == -1) + if first_audio_step.any(): + # Track start in terms of frames (not steps) + current_frame_idx = sum(p.size(-1) for p in state.all_predictions) # total frames so far + state.audio_prediction_start_idx = torch.where( + first_audio_step, + 
torch.full_like(state.audio_prediction_start_idx, current_frame_idx), + state.audio_prediction_start_idx ) - # Check for phoneme EOS - for item_idx in range(actual_batch_size): - if torch.any(input_phoneme_tokens_current[item_idx] == self.phoneme_tokenizer.eos_token_id): - if verbose and not phoneme_stream_ended[item_idx]: - logging.info(f"Phoneme EOS detected for item {item_idx} at timestep {idx}") - phoneme_stream_ended[item_idx] = True - - # Check for audio EOS - self._check_eos_and_update_end_indices( - all_codes_next_argmax=all_codes_next_argmax, - audio_codes_next=audio_codes_next, - end_indices=end_indices, - context_plus_audio_lens=context_plus_audio_lens, - min_context_len=min_context_len, - idx=idx, - verbose=verbose, - ) + audio_codes_next_stacked, all_codes_next_argmax = self._predict_audio_codes(state) # (B, C*S) + + # Unstack immediately: (B, C*S) -> (B, C, S) where S = frame_stacking_factor + S = self.frame_stacking_factor + C = self.num_audio_codebooks + audio_codes_unstacked = audio_codes_next_stacked.view(batch_size, C, S) # (B, C, S) + + # Update last_audio_codes with stacked format (needed for next step's embedding) + if state.last_audio_codes is None: + state.last_audio_codes = audio_codes_next_stacked + else: + update_mask = needs_audio.view(batch_size, 1).expand_as(audio_codes_next_stacked) + state.last_audio_codes = torch.where(update_mask, audio_codes_next_stacked, state.last_audio_codes) + + # Check for EOS in each frame and track exact end position + # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S) + all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) + + # For each batch item, find if/where EOS occurs in this step's frames + eos_in_sampled = (audio_codes_unstacked == self.audio_eos_id) # (B, C, S) + eos_in_argmax = (all_codes_argmax_unstacked == self.audio_eos_id) # (B, C, S) + eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) + + # Find first frame with EOS per batch 
item (or S if none) + eos_frame_idx = torch.where( + eos_any_codebook.any(dim=1), + eos_any_codebook.int().argmax(dim=1), # first frame with EOS + torch.full((batch_size,), S, device=device) # no EOS in this step + ) # (B,) + + audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) + state.finished = state.finished | audio_eos_detected + + # Track audio prediction end index (in frames) for items that just ended + newly_ended_audio = audio_eos_detected & (state.audio_prediction_end_idx == -1) + if newly_ended_audio.any(): + # End index = current frame count + frame offset where EOS was found + current_frame_count = len(state.all_predictions) * self.frame_stacking_factor + end_frame_idx = current_frame_count + eos_frame_idx + state.audio_prediction_end_idx = torch.where( + newly_ended_audio, + end_frame_idx, + state.audio_prediction_end_idx + ) - all_predictions.append(audio_codes_next) - - # Prepare input for next decoder step - next_input = self._prepare_next_decoder_input( - audio_codes_next=audio_codes_next, - context_plus_audio_embedded=context_plus_audio_embedded, - context_plus_audio_lens=context_plus_audio_lens, - min_context_len=min_context_len, - idx=idx, - current_text_input_mode=current_text_input_mode, - remaining_text_embedded=remaining_text_embedded, - current_text_positions=current_text_positions, - phoneme_channel_input_t=phoneme_channel_input_t, - use_cfg=use_cfg, - dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, + # Store unstacked codes + state.all_predictions.append(audio_codes_unstacked) + audio_codes_next = audio_codes_unstacked + + return state, audio_codes_next, pred_phoneme_tokens + + def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: + """Predict phoneme tokens from the last hidden state.""" + actual_batch_size = state.batch_size + last_hidden = state.last_hidden + + # Get phoneme logits + all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) + 
all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + + # Sample phonemes + if state.phoneme_sampling_method == 'argmax': + pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=0.01 + ) + else: + pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( + all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk + ) + # (B, phoneme_stacking_factor) + return pred_phoneme_tokens + + def _predict_audio_codes( + self, state: StreamingState + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predict audio codes from the last hidden state.""" + actual_batch_size = state.batch_size + last_hidden = state.last_hidden + + # Compute audio logits + last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) + all_code_logits_t = self.final_proj(last_hidden_audio) + + # Apply CFG if enabled + if state.use_cfg: + conditional_logits = all_code_logits_t[:actual_batch_size] + unconditional_logits = all_code_logits_t[actual_batch_size:] + all_code_logits_t = state.cfg_scale * conditional_logits + (1.0 - state.cfg_scale) * unconditional_logits + + # Sample audio codes + audio_codes_next, all_codes_next_argmax = self._sample_audio_codes( + last_hidden=last_hidden, + all_code_logits_t=all_code_logits_t, + temperature=state.temperature, + topk=state.topk, + use_local_transformer_for_inference=state.use_local_transformer, + use_cfg=state.use_cfg, + cfg_scale=state.cfg_scale, + ) + + return audio_codes_next, all_codes_next_argmax + + def streaming_decode( + self, + state: StreamingState, + previous_decode_length: int = 0, + ) -> Tuple[torch.Tensor, torch.Tensor, int]: + """ + Decode accumulated audio codes to waveform, returning only the new chunk. + + WARNING: This function does not yet support batch_size > 1. + Do not use with batched streaming inference. Use streaming_finalize instead. 
+ + This function takes all predicted codes so far and decodes them, but only + returns the newly generated audio portion (after previous_decode_length). + + Args: + state: Current StreamingState containing all_predictions. + previous_decode_length: Number of audio samples already decoded and returned + in previous calls. Use 0 on first call. + + Returns: + Tuple of: + - new_audio: Newly generated audio waveform (1, new_samples) + - new_audio_len: Length of new audio (1,) + - total_decode_length: Total decoded length so far (use as previous_decode_length + for next call) + """ + if len(state.all_predictions) == 0: + return ( + torch.zeros(1, 0, device=state.device), + torch.zeros(1, dtype=torch.long, device=state.device), + previous_decode_length, + ) + + with torch.inference_mode(): + # Concatenate all predictions - each is (1, C, S), concat gives (1, C, T_total_frames) + predicted_codes = torch.cat(state.all_predictions, dim=-1) # (1, C, T_total_frames) + predicted_codes_lens = torch.tensor([predicted_codes.size(-1)], device=state.device) + + # Decode to audio (codes are already unstacked, no EOS removal needed) + audio, audio_len, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) + + # Extract only new audio + total_decode_length = audio_len[0].item() + if total_decode_length <= previous_decode_length: + return ( + torch.zeros(1, 0, device=state.device), + torch.zeros(1, dtype=torch.long, device=state.device), + previous_decode_length, ) - # Forward pass for next token - cache_position = torch.tensor([current_cache_seq_len], device=device) - transformer_out = self.forward( - inputs_embeds=next_input, - attention_mask=None, - use_cache=True, - past_key_values=past_kv, - cache_position=cache_position, + new_audio = audio[:, previous_decode_length:total_decode_length] + new_audio_len = torch.tensor([total_decode_length - previous_decode_length], device=state.device) + + return new_audio, new_audio_len, total_decode_length + + def streaming_finalize( + 
self, + state: StreamingState, + ) -> StreamingFinalizeOutput: + """ + Finalize streaming and return the complete generated audio and phoneme predictions. + + This function should be called after all streaming_step() calls are complete + (i.e., when state.finished.all() is True or max steps reached). + + Args: + state: Final StreamingState after streaming is complete. + + Returns: + StreamingFinalizeOutput containing audio, codes, and phoneme predictions. + """ + batch_size = state.batch_size + + # Extract and decode phoneme predictions + phoneme_tokens_list: List[List[int]] = [] + phoneme_text_list: List[str] = [] + if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: + # Stack phoneme predictions: each is (B, phoneme_stacking_factor) + all_phonemes = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) + for i in range(batch_size): + start = max(0, state.phoneme_prediction_start_idx[i].item()) + end = state.phoneme_prediction_end_idx[i].item() + if end < 0: + end = all_phonemes.size(-1) + # Flatten stacked phonemes back to sequence + tokens = all_phonemes[i, :, start:end].T.reshape(-1).tolist() + # Remove special tokens (BOS, EOS, PAD) + special = {self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.eos_token_id} + if hasattr(self.phoneme_tokenizer, 'pad_token_id'): + special.add(self.phoneme_tokenizer.pad_token_id) + tokens = [t for t in tokens if t not in special] + phoneme_tokens_list.append(tokens) + phoneme_text_list.append(self.phoneme_tokenizer.decode(tokens)) + else: + phoneme_tokens_list = [[] for _ in range(batch_size)] + phoneme_text_list = ["" for _ in range(batch_size)] + + if len(state.all_predictions) == 0: + return StreamingFinalizeOutput( + audio=torch.zeros(batch_size, 0, device=state.device), + audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), + audio_codes=torch.zeros(batch_size, self.num_audio_codebooks, 0, device=state.device), + audio_codes_len=torch.zeros(batch_size, 
dtype=torch.long, device=state.device), + phoneme_tokens=phoneme_tokens_list, + phoneme_text=phoneme_text_list, + ) + + with torch.inference_mode(): + # Concatenate all predictions - each is (B, C, S), concat gives (B, C, T_total_frames) + all_codes = torch.cat(state.all_predictions, dim=-1) # (B, C, T_total_frames) + total_frames = all_codes.size(-1) + num_codebooks = all_codes.size(1) + + # Start and end indices are in frames (not steps) + # If start_idx is -1, item never started audio predictions - use 0 + # If end_idx is -1, item never ended - use total_frames + start_indices = torch.clamp(state.audio_prediction_start_idx, min=0) + end_indices = torch.where( + state.audio_prediction_end_idx >= 0, + state.audio_prediction_end_idx, + torch.full_like(state.audio_prediction_end_idx, total_frames) + ) + + # Calculate per-item lengths (in frames) + predicted_codes_lens = end_indices - start_indices + max_len = predicted_codes_lens.max().item() + + # Handle case where all items have zero-length predictions + if max_len == 0: + return StreamingFinalizeOutput( + audio=torch.zeros(batch_size, 0, device=state.device), + audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), + audio_codes=torch.zeros(batch_size, num_codebooks, 0, device=state.device, dtype=all_codes.dtype), + audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), + phoneme_tokens=phoneme_tokens_list, + phoneme_text=phoneme_text_list, ) - last_hidden = transformer_out.last_hidden_state - past_kv = transformer_out.past_key_values - current_cache_seq_len += 1 - - # Check if all items have finished - if len(end_indices) == actual_batch_size: - if verbose: - logging.info(f"All items finished at timestep {idx}") - break - - # Log phoneme predictions if verbose - if verbose and self.phoneme_tokenizer is not None: - self._log_phoneme_predictions( - pred_phoneme_token_lists=pred_phoneme_token_lists, - gt_phoneme_token_lists=gt_phoneme_token_lists, - 
batch_size=actual_batch_size, + + # Create padded output tensor and slice each item's valid predictions + predicted_codes = torch.zeros( + batch_size, num_codebooks, max_len, + dtype=all_codes.dtype, device=state.device + ) + for i in range(batch_size): + start = start_indices[i].item() + end = end_indices[i].item() + length = end - start + if length > 0: + predicted_codes[i, :, :length] = all_codes[i, :, start:end] + + # No need to remove EOS - end_indices already point to the frame before EOS + # Decode to audio (codes are already unstacked: B, C, T) + audio, audio_len, decoded_codes = self.codes_to_audio(predicted_codes, predicted_codes_lens) + + return StreamingFinalizeOutput( + audio=audio, + audio_len=audio_len, + audio_codes=predicted_codes, + audio_codes_len=predicted_codes_lens, + phoneme_tokens=phoneme_tokens_list, + phoneme_text=phoneme_text_list, + ) + + def infer_batch( + self, + batch: Dict[str, torch.Tensor], + max_decoder_steps: int = 500, + temperature: float = 0.7, + topk: int = 80, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_local_transformer_for_inference: bool = False, + phoneme_input_type: str = 'pred', + phoneme_sampling_method: str = 'argmax', + force_dropout_text: bool = False, + ) -> InferBatchOutput: + """ + Batch inference using streaming infrastructure. + + This is a simple wrapper around streaming_init, streaming_step, and streaming_finalize + that processes a batch dictionary similar to training_step/validation_step. + + Args: + batch: Dictionary containing: + - text: Text token IDs (B, L) + - text_lens: Lengths (B,) + - context_text_tokens: Context text tokens (B, L') + - context_text_tokens_lens: Lengths (B,) + - context_audio_codes: Context audio codes (B, C, T) OR + - context_audio / context_audio_lens: Raw context audio to encode + - phoneme_tokens (optional): GT phoneme tokens (B, L'') + - phoneme_tokens_lens (optional): Lengths (B,) + max_decoder_steps: Maximum number of decoder steps. 
+ temperature: Sampling temperature for audio codes. + topk: Top-k sampling parameter. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + use_local_transformer_for_inference: Whether to use local transformer. + phoneme_input_type: 'gt' or 'pred' for phoneme tokens. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + force_dropout_text: Whether to dropout text embeddings. + + Returns: + InferBatchOutput containing predicted audio, codes, and RTF metrics. + """ + with torch.inference_mode(): + start_time = time.time() + + # Extract tensors from batch + text = batch['text'] + text_lens = batch['text_lens'] + context_text_tokens = batch['context_text_tokens'] + context_text_tokens_lens = batch['context_text_tokens_lens'] + + # Handle context audio - either use codes directly or encode from audio + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + else: + context_audio = batch['context_audio'] + context_audio_lens = batch['context_audio_lens'] + context_audio_codes, context_audio_codes_lens = self.audio_to_codes( + context_audio, context_audio_lens ) - # Post-process predictions - tts_generation_time = time.time() - start_time - tts_generation_time_per_frame = tts_generation_time / len(all_predictions) - - # Calculate predicted lengths, accounting for context offset - pred_codes_start_indices = context_plus_audio_lens - min_context_len - predicted_lens = [end_indices.get(i, max_decoder_steps) for i in range(actual_batch_size)] - predicted_codes_lens = torch.tensor(predicted_lens, device=device).long() - predicted_codes_lens = predicted_codes_lens - pred_codes_start_indices - - # Stack and slice predictions to remove context portion - predicted_codes = torch.stack(all_predictions, dim=-1) # (B, num_codebooks, T) - predicted_codes = self.slice_pred_embeddings( - predicted_codes.permute(0, 2, 1), - 
context_lens=pred_codes_start_indices, - target_lens=predicted_codes_lens, + # Optional GT phoneme tokens for teacher forcing + gt_phoneme_tokens = batch.get('phoneme_tokens') + gt_phoneme_tokens_lens = batch.get('phoneme_tokens_lens') + + batch_size = text.size(0) + + # Initialize streaming state + state = self.streaming_init( + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer_for_inference, + temperature=temperature, + topk=topk, + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + gt_phoneme_tokens=gt_phoneme_tokens, + gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, ) - predicted_codes = predicted_codes.permute(0, 2, 1) - # Remove EOS tokens and convert codes to audio - predicted_codes, predicted_codes_lens = self.remove_eos_token(predicted_codes, predicted_codes_lens) - predicted_audio, predicted_audio_lens, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) + time_to_first_prediction = None + generation_start_time = time.time() + device = text.device + + # Generate until all items are finished or max steps reached + while not state.finished.all() and len(state.all_predictions) < max_decoder_steps: + # Gather the correct text token for each batch item based on text_tokens_seen + # Items in context phase will have their token ignored by streaming_step + positions = state.text_tokens_seen.clamp(max=text.size(1) - 1) + current_tokens = text[torch.arange(batch_size, device=device), positions] + + # For items that have exhausted their text, provide EOS token + text_exhausted = state.text_tokens_seen >= text_lens + current_tokens = torch.where(text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens) + + state, audio_codes, phoneme_tokens = self.streaming_step( + 
state=state, + text_tokens=current_tokens, + force_dropout_text=force_dropout_text, + ) + + # Record time to first audio prediction + if time_to_first_prediction is None and audio_codes is not None: + time_to_first_prediction = time.time() - start_time + + tts_generation_time = time.time() - generation_start_time + + # Finalize and decode audio + finalize_output = self.streaming_finalize(state) - # Compute RTF metrics end_time = time.time() - total_audio_duration_generated = ( - predicted_audio_lens.max().item() * predicted_audio_lens.shape[0] - ) / self.sample_rate - rtf = total_audio_duration_generated / (end_time - start_time) + total_time = end_time - start_time + + # Compute RTF metrics + total_audio_samples = finalize_output.audio_len.sum().item() + total_audio_duration = total_audio_samples / self.output_sample_rate + num_frames = len(state.all_predictions) + tts_generation_time_per_frame = tts_generation_time / num_frames if num_frames > 0 else 0.0 rtf_metrics = { - 'rtf': rtf, + 'rtf': total_audio_duration / total_time if total_time > 0 else 0.0, 'time_to_first_prediction': time_to_first_prediction, 'tts_generation_time': tts_generation_time, - 'max_frames_generated': len(all_predictions), + 'max_frames_generated': num_frames, 'tts_generation_time_per_frame': tts_generation_time_per_frame, - 'batch_size': actual_batch_size, + 'batch_size': batch_size, } - return predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics + return InferBatchOutput( + predicted_audio=finalize_output.audio, + predicted_audio_lens=finalize_output.audio_len, + predicted_codes=finalize_output.audio_codes, + predicted_codes_lens=finalize_output.audio_codes_len, + rtf_metrics=rtf_metrics, + ) @classmethod def list_available_models(cls) -> List[PretrainedModelInfo]: diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 34ba8d62c730..9b0db0f7f75e 100644 --- 
a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -319,21 +319,24 @@ def _run_decoder_only_inference( for batch_idx, batch in enumerate(dataloader): logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") - batch_cuda = self._batch_to_cuda(batch) - - predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, rtf_metrics = self.model.infer_batch( - batch_cuda, + batch = self._batch_to_cuda(batch) + output = self.model.infer_batch( + batch, max_decoder_steps=self.config.model_inference_parameters.max_decoder_steps, temperature=self.config.model_inference_parameters.temperature, topk=self.config.model_inference_parameters.topk, - use_local_transformer_for_inference=self.config.use_local_transformer, - maskgit_n_steps=self.config.maskgit_n_steps, use_cfg=self.config.use_cfg, cfg_scale=self.config.model_inference_parameters.cfg_scale, + use_local_transformer_for_inference=self.config.use_local_transformer, phoneme_input_type=self.config.phoneme_input_type, phoneme_sampling_method=phoneme_sampling_method, - dropout_text_input=self.config.dropout_text_input, + force_dropout_text=self.config.dropout_text_input, ) + predicted_audio = output.predicted_audio + predicted_audio_lens = output.predicted_audio_lens + predicted_codes = output.predicted_codes + predicted_codes_lens = output.predicted_codes_lens + rtf_metrics = output.rtf_metrics all_rtf_metrics.append(rtf_metrics) logging.info(f"Output shape: {predicted_audio.size()}") @@ -342,7 +345,8 @@ def _run_decoder_only_inference( audio_len = predicted_audio_lens[idx].item() audio_np = predicted_audio[idx].float().detach().cpu().numpy()[:audio_len] audio_path = os.path.join(output_dir, f"predicted_audio_{item_idx}.wav") - sf.write(audio_path, audio_np, self.model.sample_rate) + sample_rate = getattr(self.model, "output_sample_rate", self.model.sample_rate) + sf.write(audio_path, audio_np, sample_rate) 
generated_audio_paths.append(audio_path) if save_context_audio and item_idx < len(manifest_records): From cd59639f34ff643d8ef4ab756d732852c790eba1 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Thu, 5 Feb 2026 22:34:49 +0000 Subject: [PATCH 36/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- examples/tts/magpietts_streaming_inference.py | 60 ++++---- nemo/collections/tts/models/easy_magpietts.py | 130 ++++++++++-------- 2 files changed, 105 insertions(+), 85 deletions(-) diff --git a/examples/tts/magpietts_streaming_inference.py b/examples/tts/magpietts_streaming_inference.py index 6e72ea77b8e6..d25172d4e1f6 100644 --- a/examples/tts/magpietts_streaming_inference.py +++ b/examples/tts/magpietts_streaming_inference.py @@ -141,6 +141,7 @@ def load_audio(audio_path: str, target_sample_rate: int) -> torch.Tensor: # Resample if needed if sr != target_sample_rate: import librosa + audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) return torch.from_numpy(audio).unsqueeze(0) # (1, num_samples) @@ -241,9 +242,7 @@ def run_streaming_inference( context_audio_lens = context_audio_lens.to(device) with torch.inference_mode(): - context_audio_codes, context_audio_codes_lens = model.audio_to_codes( - context_audio, context_audio_lens - ) + context_audio_codes, context_audio_codes_lens = model.audio_to_codes(context_audio, context_audio_lens) # Tokenize context text # Use the text conditioning tokenizer @@ -357,8 +356,10 @@ def run_streaming_inference( num_phoneme_frames += 1 if verbose and (i + 1) % 10 == 0: - phase = "prompt" if audio_codes is None and phoneme_tokens is None else ( - "phoneme-only" if audio_codes is None else "audio" + phase = ( + "prompt" + if audio_codes is None and phoneme_tokens is None + else ("phoneme-only" if audio_codes is None else "audio") ) logging.info( f"Processed {i + 1}/{len(text_tokens)} text tokens (phase: {phase}), " @@ -373,7 +374,9 @@ def run_streaming_inference( # Continue generating until 
finished (text has ended) continuation_steps = 0 while not state.finished and continuation_steps < max_steps: - state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=None, force_dropout_text=force_dropout_text) + state, audio_codes, phoneme_tokens = model.streaming_step( + state, text_tokens=None, force_dropout_text=force_dropout_text + ) if audio_codes is not None: num_audio_frames += 1 @@ -494,7 +497,7 @@ def run_batched_streaming_inference( for i in range(batch_size): codes = context_audio_codes_list[i] codes_len = context_audio_codes_lens_list[i] - context_audio_codes[i, :, :codes.size(-1)] = codes[0] + context_audio_codes[i, :, : codes.size(-1)] = codes[0] context_audio_codes_lens[i] = codes_len[0] # Tokenize context texts @@ -510,7 +513,7 @@ def run_batched_streaming_inference( context_text_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) for i, tokens in enumerate(context_text_tokens_list): - context_text_tokens[i, :len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) + context_text_tokens[i, : len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) context_text_tokens_lens[i] = len(tokens) # Tokenize main texts @@ -543,7 +546,7 @@ def run_batched_streaming_inference( gt_phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) gt_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) for i, tokens in enumerate(phoneme_tokens_lists): - gt_phoneme_tokens[i, :len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) + gt_phoneme_tokens[i, : len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) gt_phoneme_tokens_lens[i] = len(tokens) phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' @@ -625,9 +628,13 @@ def run_batched_streaming_inference( all_text_done = text_finished_mask.all() and not in_context_phase.any() if all_text_done: - state, audio_codes, phoneme_tokens = model.streaming_step(state, 
text_tokens=None, force_dropout_text=force_dropout_text) + state, audio_codes, phoneme_tokens = model.streaming_step( + state, text_tokens=None, force_dropout_text=force_dropout_text + ) else: - state, audio_codes, phoneme_tokens = model.streaming_step(state, text_tokens=text_tokens_batch, force_dropout_text=force_dropout_text) + state, audio_codes, phoneme_tokens = model.streaming_step( + state, text_tokens=text_tokens_batch, force_dropout_text=force_dropout_text + ) if audio_codes is not None: num_audio_frames += 1 @@ -712,8 +719,7 @@ def main(): type=str, nargs='+', required=True, - help='Path(s) to context audio file(s) for speaker cloning. ' - 'Multiple files enable batched inference.', + help='Path(s) to context audio file(s) for speaker cloning. ' 'Multiple files enable batched inference.', ) input_group.add_argument( '--context_text', @@ -721,7 +727,7 @@ def main(): nargs='+', default=["[NO TEXT CONTEXT]"], help='Context text(s) for speaker conditioning. Provide one per context audio, ' - 'or a single value to use for all. (default: "[NO TEXT CONTEXT]")', + 'or a single value to use for all. (default: "[NO TEXT CONTEXT]")', ) input_group.add_argument( '--context_duration', @@ -729,8 +735,8 @@ def main(): nargs='+', default=[5.0], help='Target duration(s) for context audio in seconds. Provide one per context audio, ' - 'or a single value to use for all. If audio is longer, ' - 'first N seconds are used. If shorter, audio is repeated. (default: 5.0)', + 'or a single value to use for all. If audio is longer, ' + 'first N seconds are used. If shorter, audio is repeated. (default: 5.0)', ) input_group.add_argument( '--text', @@ -745,13 +751,13 @@ def main(): nargs='+', default=None, help='Phoneme text(s) for GT phoneme conditioning. If not provided, uses --text. 
' - 'Provide one per context audio for batched inference.', + 'Provide one per context audio for batched inference.', ) input_group.add_argument( '--use_gt_phonemes', action='store_true', help='Use ground-truth phonemes as decoder input (teacher forcing). ' - 'If not set, uses model-predicted phonemes.', + 'If not set, uses model-predicted phonemes.', ) # Output arguments @@ -851,13 +857,17 @@ def main(): if len(context_texts) == 1 and batch_size > 1: context_texts = context_texts * batch_size elif len(context_texts) != batch_size: - parser.error(f"Number of context_texts ({len(context_texts)}) must match number of context_audios ({batch_size}) or be 1") + parser.error( + f"Number of context_texts ({len(context_texts)}) must match number of context_audios ({batch_size}) or be 1" + ) context_durations = args.context_duration if len(context_durations) == 1 and batch_size > 1: context_durations = context_durations * batch_size elif len(context_durations) != batch_size: - parser.error(f"Number of context_durations ({len(context_durations)}) must match number of context_audios ({batch_size}) or be 1") + parser.error( + f"Number of context_durations ({len(context_durations)}) must match number of context_audios ({batch_size}) or be 1" + ) texts = args.text if len(texts) == 1 and batch_size > 1: @@ -872,7 +882,9 @@ def main(): elif len(phoneme_texts) == 1 and batch_size > 1: phoneme_texts = phoneme_texts * batch_size elif len(phoneme_texts) != batch_size: - parser.error(f"Number of phoneme_texts ({len(phoneme_texts)}) must match number of context_audios ({batch_size}) or be 1") + parser.error( + f"Number of phoneme_texts ({len(phoneme_texts)}) must match number of context_audios ({batch_size}) or be 1" + ) # Load and process context audios context_audios = [] @@ -925,14 +937,14 @@ def main(): if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir) - audio_np = output.audio[0, :output.audio_len[0].item()].cpu().numpy() + audio_np = output.audio[0, : 
output.audio_len[0].item()].cpu().numpy() sf.write(args.output_path, audio_np, model.output_sample_rate) logging.info(f"Output saved to: {args.output_path}") # Save decoded context audio for sanity check output_base, output_ext = os.path.splitext(args.output_path) context_output_path = f"{output_base}_context_decoded{output_ext}" - context_audio_np = context_audio_decoded[0, :context_audio_decoded_lens[0].item()].cpu().numpy() + context_audio_np = context_audio_decoded[0, : context_audio_decoded_lens[0].item()].cpu().numpy() sf.write(context_output_path, context_audio_np, model.output_sample_rate) logging.info(f"Context audio (decoded from codes) saved to: {context_output_path}") @@ -989,7 +1001,7 @@ def main(): for i in range(batch_size): output_path_i = f"{output_base}_{i}{output_ext}" - audio_np = output.audio[i, :output.audio_len[i].item()].cpu().numpy() + audio_np = output.audio[i, : output.audio_len[i].item()].cpu().numpy() sf.write(output_path_i, audio_np, model.output_sample_rate) audio_duration_i = output.audio_len[i].item() / model.output_sample_rate logging.info(f"Output {i+1}/{batch_size} saved to: {output_path_i} (duration: {audio_duration_i:.2f}s)") diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 1351b8409417..4c9b26ded4d7 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1210,9 +1210,7 @@ def prepare_text_channel_embeddings( # Create zero tensor for delay padding max_delay = delay.max().item() - zero_delay_tensor = torch.zeros( - batch_size, max_delay, self.cfg.embedding_dim, device=device - ) + zero_delay_tensor = torch.zeros(batch_size, max_delay, self.cfg.embedding_dim, device=device) # Join delay zeros with text embeddings text_channel_embedding, text_channel_lens = self.join_embeddings_temporally( @@ -1277,9 +1275,7 @@ def prepare_phoneme_channel_embeddings( # Create zero tensor for delay padding max_delay = 
delay.max().item() - zero_delay_tensor = torch.zeros( - batch_size, max_delay, self.cfg.embedding_dim, device=device - ) + zero_delay_tensor = torch.zeros(batch_size, max_delay, self.cfg.embedding_dim, device=device) # Join delay zeros with phoneme embeddings phoneme_channel_embedding, phoneme_channel_lens = self.join_embeddings_temporally( @@ -1355,9 +1351,7 @@ def prepare_audio_channel_embeddings( # Create zero tensor for delay padding max_delay = delay.max().item() - zero_delay_tensor = torch.zeros( - batch_size, max_delay, self.cfg.embedding_dim, device=device - ) + zero_delay_tensor = torch.zeros(batch_size, max_delay, self.cfg.embedding_dim, device=device) # Join delay zeros with audio embeddings audio_channel_embedding, audio_channel_lens = self.join_embeddings_temporally( @@ -1667,13 +1661,18 @@ def process_batch( # 7. Join context with combined channel embeddings # The combined_channel_lens is the max of all channel lens for each batch item - combined_channel_lens = torch.stack([ - text_channel_lens, - audio_channel_lens, - phoneme_channel_lens if phoneme_channel_embedding is not None else audio_channel_lens, - ], dim=0).max(dim=0).values - - + combined_channel_lens = ( + torch.stack( + [ + text_channel_lens, + audio_channel_lens, + phoneme_channel_lens if phoneme_channel_embedding is not None else audio_channel_lens, + ], + dim=0, + ) + .max(dim=0) + .values + ) # Right pad context embedding context_padding = torch.zeros( @@ -1767,7 +1766,7 @@ def training_step(self, batch, batch_idx): context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - + if 'audio_codes' in batch: audio_codes = batch['audio_codes'] audio_codes_lens = batch['audio_codes_lens'] @@ -1856,7 +1855,7 @@ def validation_step(self, batch, batch_idx): context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] context_audio_codes, 
context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - + if 'audio_codes' in batch: audio_codes = batch['audio_codes'] audio_codes_lens = batch['audio_codes_lens'] @@ -1864,7 +1863,7 @@ def validation_step(self, batch, batch_idx): audio = batch['audio'] audio_lens = batch['audio_lens'] audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) - + batch_output = self.process_batch( text=batch['text'], text_lens=batch['text_lens'], @@ -2049,7 +2048,6 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) - def _sample_audio_codes( self, last_hidden: torch.Tensor, @@ -2161,13 +2159,15 @@ def streaming_init( selected_training_mode = self.mode_name_to_mode[mode_name] # Prepare context embedding using shared helper - context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = self.prepare_context_tensors( - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - training_mode=selected_training_mode, - dropout_conditional_input=False, + context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = ( + self.prepare_context_tensors( + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + training_mode=selected_training_mode, + dropout_conditional_input=False, + ) ) # Store full context embedding and lens before any CFG manipulation @@ -2331,10 +2331,10 @@ def streaming_step( ctx_positions = ctx_positions.clamp(max=state.full_context_embedding.size(1) - 1) # Gather: need (B, 1, E) from (B, T, E) at positions (B,) ctx_emb = state.full_context_embedding[ - torch.arange(batch_size, device=device), - ctx_positions, - : - ].unsqueeze(1) # (B, 1, E) + 
torch.arange(batch_size, device=device), ctx_positions, : + ].unsqueeze( + 1 + ) # (B, 1, E) # Only apply to items in context phase context_mask = needs_context.view(batch_size, 1, 1).float() next_input = next_input + ctx_emb * context_mask @@ -2359,7 +2359,7 @@ def streaming_step( next_input = next_input + text_embedded * text_add_mask # Check for EOS tokens - mark those items as text_finished # Items that receive EOS should not have their text embedded added after this step - is_eos_token = (text_tokens == self.eos_id) # (B,) bool + is_eos_token = text_tokens == self.eos_id # (B,) bool state.text_finished = state.text_finished | is_eos_token elif text_tokens is None: @@ -2378,13 +2378,17 @@ def streaming_step( positions = state.phoneme_steps.clamp(max=state.gt_phoneme_embeddings.size(1) - 1) gt_emb = state.gt_phoneme_embeddings[ torch.arange(batch_size, device=device), positions, : - ].unsqueeze(1) # (B, 1, E) + ].unsqueeze( + 1 + ) # (B, 1, E) phoneme_mask = (needs_phoneme & within_gt_len).view(batch_size, 1, 1).float() phoneme_emb = phoneme_emb + gt_emb * phoneme_mask else: # Prediction mode: use BOS or last predicted phoneme first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) - has_last_phoneme = needs_phoneme & ~first_phoneme_step & (state.last_phoneme_tokens is not None) + has_last_phoneme = ( + needs_phoneme & ~first_phoneme_step & (state.last_phoneme_tokens is not None) + ) if first_phoneme_step.any(): phoneme_bos = torch.full( @@ -2397,7 +2401,9 @@ def streaming_step( phoneme_emb = phoneme_emb + phoneme_bos_emb * first_mask if has_last_phoneme.any() and state.last_phoneme_tokens is not None: - last_phoneme_emb = self.embed_phoneme_tokens(state.last_phoneme_tokens.unsqueeze(2)) # (B, 1, E) + last_phoneme_emb = self.embed_phoneme_tokens( + state.last_phoneme_tokens.unsqueeze(2) + ) # (B, 1, E) last_mask = has_last_phoneme.view(batch_size, 1, 1).float() phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask @@ -2434,12 +2440,17 @@ def 
streaming_step( if state.use_cfg: # For unconditional branch, use dummy embedding for non-audio items # and audio-only embedding for audio items - next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand(batch_size, 1, -1) + next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand( + batch_size, 1, -1 + ) # After the context is finished, we use zero embedding for the unconditional branch until audio phase starts next_input_unconditional_zeros = torch.zeros_like(next_input_unconditional_context) context_mask = needs_context.view(batch_size, 1, 1).float() - next_input_unconditional = context_mask * next_input_unconditional_context + (1 - context_mask) * next_input_unconditional_zeros - + next_input_unconditional = ( + context_mask * next_input_unconditional_context + + (1 - context_mask) * next_input_unconditional_zeros + ) + # For audio phase items, we use audio embedding for the unconditional branch if needs_audio.any(): audio_mask = needs_audio.view(batch_size, 1, 1).float() @@ -2488,7 +2499,7 @@ def streaming_step( state.phoneme_prediction_start_idx = torch.where( first_phoneme_step, torch.full_like(state.phoneme_prediction_start_idx, current_phoneme_step_idx), - state.phoneme_prediction_start_idx + state.phoneme_prediction_start_idx, ) # Check which items should predict phonemes (not ended) @@ -2497,7 +2508,11 @@ def streaming_step( state.all_phoneme_predictions.append(pred_phoneme_tokens) # Check for phoneme EOS per item - phoneme_eos_detected = needs_phoneme & (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1) # (B,) + phoneme_eos_detected = needs_phoneme & ( + pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id + ).any( + dim=1 + ) # (B,) state.phoneme_stream_ended = state.phoneme_stream_ended | phoneme_eos_detected # Track phoneme prediction end index for items that just ended @@ -2507,7 +2522,7 @@ def streaming_step( state.phoneme_prediction_end_idx = torch.where( 
newly_ended_phoneme, torch.full_like(state.phoneme_prediction_end_idx, current_phoneme_step_idx), - state.phoneme_prediction_end_idx + state.phoneme_prediction_end_idx, ) # Audio predictions for items in audio phase @@ -2520,7 +2535,7 @@ def streaming_step( state.audio_prediction_start_idx = torch.where( first_audio_step, torch.full_like(state.audio_prediction_start_idx, current_frame_idx), - state.audio_prediction_start_idx + state.audio_prediction_start_idx, ) audio_codes_next_stacked, all_codes_next_argmax = self._predict_audio_codes(state) # (B, C*S) @@ -2542,15 +2557,15 @@ def streaming_step( all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) # For each batch item, find if/where EOS occurs in this step's frames - eos_in_sampled = (audio_codes_unstacked == self.audio_eos_id) # (B, C, S) - eos_in_argmax = (all_codes_argmax_unstacked == self.audio_eos_id) # (B, C, S) + eos_in_sampled = audio_codes_unstacked == self.audio_eos_id # (B, C, S) + eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id # (B, C, S) eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) # Find first frame with EOS per batch item (or S if none) eos_frame_idx = torch.where( eos_any_codebook.any(dim=1), eos_any_codebook.int().argmax(dim=1), # first frame with EOS - torch.full((batch_size,), S, device=device) # no EOS in this step + torch.full((batch_size,), S, device=device), # no EOS in this step ) # (B,) audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) @@ -2563,9 +2578,7 @@ def streaming_step( current_frame_count = len(state.all_predictions) * self.frame_stacking_factor end_frame_idx = current_frame_count + eos_frame_idx state.audio_prediction_end_idx = torch.where( - newly_ended_audio, - end_frame_idx, - state.audio_prediction_end_idx + newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx ) # Store unstacked codes @@ -2585,9 +2598,7 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: # 
Sample phonemes if state.phoneme_sampling_method == 'argmax': - pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=0.01 - ) + pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.01) else: pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk @@ -2595,9 +2606,7 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: # (B, phoneme_stacking_factor) return pred_phoneme_tokens - def _predict_audio_codes( - self, state: StreamingState - ) -> Tuple[torch.Tensor, torch.Tensor]: + def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, torch.Tensor]: """Predict audio codes from the last hidden state.""" actual_batch_size = state.batch_size last_hidden = state.last_hidden @@ -2745,7 +2754,7 @@ def streaming_finalize( end_indices = torch.where( state.audio_prediction_end_idx >= 0, state.audio_prediction_end_idx, - torch.full_like(state.audio_prediction_end_idx, total_frames) + torch.full_like(state.audio_prediction_end_idx, total_frames), ) # Calculate per-item lengths (in frames) @@ -2765,8 +2774,7 @@ def streaming_finalize( # Create padded output tensor and slice each item's valid predictions predicted_codes = torch.zeros( - batch_size, num_codebooks, max_len, - dtype=all_codes.dtype, device=state.device + batch_size, num_codebooks, max_len, dtype=all_codes.dtype, device=state.device ) for i in range(batch_size): start = start_indices[i].item() @@ -2846,9 +2854,7 @@ def infer_batch( else: context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes( - context_audio, context_audio_lens - ) + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) # Optional GT phoneme tokens for teacher forcing gt_phoneme_tokens = 
batch.get('phoneme_tokens') @@ -2886,7 +2892,9 @@ def infer_batch( # For items that have exhausted their text, provide EOS token text_exhausted = state.text_tokens_seen >= text_lens - current_tokens = torch.where(text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens) + current_tokens = torch.where( + text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens + ) state, audio_codes, phoneme_tokens = self.streaming_step( state=state, From e96f34445697ea08641814f64fbbf451f829f599 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 5 Feb 2026 18:02:48 -0500 Subject: [PATCH 37/94] include vocab file Signed-off-by: Paarth Neekhara --- ...okenizer_2048_en_de_es_fr_hi_it_vi_zh.json | 9954 +++++++++++++++++ 1 file changed, 9954 insertions(+) create mode 100644 scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json diff --git a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json new file mode 100644 index 000000000000..6d7e35116405 --- /dev/null +++ b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json @@ -0,0 +1,9954 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": null, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + 
"use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "(": 3, + ")": 4, + "-": 5, + ".": 6, + "1": 7, + "2": 8, + "4": 9, + "5": 10, + "6": 11, + "7": 12, + "F": 13, + "a": 14, + "b": 15, + "c": 16, + "d": 17, + "e": 18, + "f": 19, + "h": 20, + "i": 21, + "j": 22, + "k": 23, + "l": 24, + "m": 25, + "n": 26, + "o": 27, + "p": 28, + "q": 29, + "r": 30, + "s": 31, + "t": 32, + "u": 33, + "v": 34, + "w": 35, + "x": 36, + "y": 37, + "z": 38, + "¡": 39, + "£": 40, + "¦": 41, + "§": 42, + "©": 43, + "ª": 44, + "¬": 45, + "°": 46, + "²": 47, + "³": 48, + "¸": 49, + "¹": 50, + "¾": 51, + "Ã": 52, + "Å": 53, + "É": 54, + "Ê": 55, + "Ë": 56, + "Ì": 57, + "Î": 58, + "Ï": 59, + "Ċ": 60, + "Ġ": 61, + "Ģ": 62, + "ģ": 63, + "Ĥ": 64, + "ĥ": 65, + "ĩ": 66, + "Ī": 67, + "Ĭ": 68, + "ĭ": 69, + "Į": 70, + "į": 71, + "İ": 72, + "ı": 73, + "IJ": 74, + "ij": 75, + "Ĵ": 76, + "ĵ": 77, + "Ķ": 78, + "ķ": 79, + "ĸ": 80, + "Ĺ": 81, + "Ļ": 82, + "Ľ": 83, + "ľ": 84, + "Ŀ": 85, + "Ł": 86, + "ËĪ": 87, + "ËIJ": 88, + "ËĪÉ": 89, + "ËĮ": 90, + "ÉĻ": 91, + "ËĪa": 92, + "ËĪi": 93, + "Ġt": 94, + "ɪ": 95, + "ɾ": 96, + "ĠÉ": 97, + "Ġk": 98, + "Éľ": 99, + "Ġs": 100, + "ËĪe": 101, + "ÉĽ": 102, + "ËĪo": 103, + "Ġl": 104, + "ËĪÉĽ": 105, + "Ġd": 106, + "ÊĬ": 107, + "ËĪaËIJ": 108, + "Ġp": 109, + "Ìĥ": 110, + "Ġm": 111, + "ËĪu": 112, + "Åĭ": 113, + "ð": 114, + "ËĪÉĶ": 115, + "ÊĮ": 116, + "ËĮa": 117, + "Ġh": 118, + "ËĪÊĮ": 119, + "Ġn": 120, + "Êģ": 121, + "ËĪÉij": 122, + "Êĥ": 123, + "eËIJ": 124, + "Ġa": 125, + "Ġb": 126, + "ÉĶ": 127, + "ËĪÉĻ": 128, + "ÉĻn": 129, + "Ġf": 130, + "ËĪɪ": 131, + "É¡": 132, + "ËĪeËIJ": 133, + "Ġj": 134, + "nt": 135, + "Ġð": 136, + "ĠËĮ": 137, + "Ġts": 138, + "ĠÉ¡": 139, + "Éķ": 140, + "ËĪoËIJ": 141, + "ʰ": 142, + "aËIJ": 143, + "ËĪy": 144, + "ĠtÉķ": 145, + "ËĪiËIJ": 146, 
+ "ĠÊ": 147, + "Ġv": 148, + "Ġw": 149, + "st": 150, + "Éij": 151, + "nd": 152, + "ËĮi": 153, + "̪": 154, + "ËĮe": 155, + "Ġz": 156, + "ËĪaɪ": 157, + "ËĪiÉĽ": 158, + "β": 159, + "ɹ": 160, + "ĠËĮa": 161, + "θ": 162, + "ĠhÉĽ": 163, + "ÊĪ": 164, + "iËIJ": 165, + "ËĮo": 166, + "Ġɪ": 167, + "Éľn": 168, + "Ġx": 169, + "ĠtÉĻ": 170, + "ËĪuËIJ": 171, + "ËĮÉĻ": 172, + "ĠjËĪi": 173, + "ËĮÉĽ": 174, + "ĠÉĽ": 175, + "ĠËĪa": 176, + "ËĮaËIJ": 177, + "Ġla": 178, + "Ġðe": 179, + "ĠhÉĽËIJ": 180, + "Ġe": 181, + "ç": 182, + "ÉĻl": 183, + "oËIJ": 184, + "ËĪÉiju": 185, + "ÊĴ": 186, + "uËIJ": 187, + "ĠÉĹ": 188, + "ĠÉķ": 189, + "ËĮeËIJ": 190, + "ĠtÉķËĪi": 191, + "os": 192, + "ËĪÉĶËIJ": 193, + "as": 194, + "ËĪÊĬ": 195, + "Ġi": 196, + "ËĪai": 197, + "ɲ": 198, + "ɪn": 199, + "ts": 200, + "ÉľÅĭ": 201, + "ĠÉŁ": 202, + "ĠÊĥ": 203, + "ËĪeɪ": 204, + "ÉĽÉ¾": 205, + "ËĪÉĽËIJ": 206, + "ËĪÉĽÉ¾": 207, + "Ġr": 208, + "tÊĥ": 209, + "ËĮÉĶ": 210, + "ĠdÉĻ": 211, + "tÉĻ": 212, + "ou": 213, + "ËĪyÉĻ": 214, + "ĠËĮi": 215, + "ÉĻɾ": 216, + "ËĪÉĻÊĬ": 217, + "ËĪÊĮɾ": 218, + "ËĪÉĴ": 219, + "Ġth": 220, + "ËĪon": 221, + "Êĭ": 222, + "ËĪÉijËIJ": 223, + "ËĪÊĮh": 224, + "wËĪa": 225, + "ËĪei": 226, + "ll": 227, + "ĠÉIJ": 228, + "ÉijËIJ": 229, + "an": 230, + "ÉŁ": 231, + "ĠÊĭ": 232, + "Ġko": 233, + "kh": 234, + "ɪÅĭ": 235, + "ËĪaËIJɪ": 236, + "ĠtÊĥ": 237, + "ËĪaËIJt": 238, + "ĠËĮe": 239, + "ĠtÉķh": 240, + "ËĪuo": 241, + "ËĪonÉ¡": 242, + "Éĸ": 243, + "at": 244, + "Ġke": 245, + "ÉĴ": 246, + "ĠÉķËĪi": 247, + "ø": 248, + "ĠÉij": 249, + "ËĪeËIJk": 250, + "Åĵ": 251, + "re": 252, + "Ġɾ": 253, + "ĠkÉĶ": 254, + "ËĮÊĬ": 255, + "sk": 256, + "ĠÊĬ": 257, + "Ġand": 258, + "ɪç": 259, + "Ġme": 260, + "ËĪaɾ": 261, + "ĠËĪɪ": 262, + "na": 263, + "Ġβ": 264, + "ĠlËĪi": 265, + "jaËIJ": 266, + "li": 267, + "no": 268, + "Ġɪn": 269, + "ĠdËĮi": 270, + "Ġɲ": 271, + "tËIJ": 272, + "ÉĻm": 273, + "ĠlÉĻ": 274, + "ĠðÉĻ": 275, + "ɪk": 276, + "ËĪÉĽl": 277, + "Éľt": 278, + "Ġse": 279, + "es": 280, + "ËĪou": 281, + "ËĪaÊĬ": 282, + "ĠÉĶ": 283, + "ɪt": 284, + 
"ĠÅĭ": 285, + "ËĪÉĽn": 286, + "Êİ": 287, + "Ġkh": 288, + "ËĪÉĽnt": 289, + "ËĪaËIJɾ": 290, + "Ġki": 291, + "mp": 292, + "lt": 293, + "É£": 294, + "Ġpa": 295, + "ËĪÉĻËIJ": 296, + "ɪs": 297, + "ĠÉĴ": 298, + "Ġle": 299, + "ÉªÉľ": 300, + "ËĪÉĽt": 301, + "Ġde": 302, + "Ġɹ": 303, + "ĠtËĪoËIJ": 304, + "ĠÊģ": 305, + "ÊĥÉĻn": 306, + "ĠÊĬnt": 307, + "ËĪÉĶɾ": 308, + "ËĪað": 309, + "Ġaɪ": 310, + "ĠÊIJ": 311, + "ĠmËĪa": 312, + "ra": 313, + "ĠkËĪɪ": 314, + "kt": 315, + "ËIJp": 316, + "ĠÊĪ": 317, + "ËĪaËIJÊĬ": 318, + "ĠkËĪÊĮɾ": 319, + "ĠËĪÊĮ": 320, + "ĠÉĴv": 321, + "Ġel": 322, + "ks": 323, + "Ġkw": 324, + "ÉĻt": 325, + "ndo": 326, + "ei": 327, + "ĠËĮaËIJp": 328, + "se": 329, + "ÉĻɹ": 330, + "ËĪuei": 331, + "ÉĻs": 332, + "ĠkËĮo": 333, + "ĠÊĤ": 334, + "ĠËĮÊĬ": 335, + "Ġc": 336, + "ĠÉĽn": 337, + "ËĪant": 338, + "θj": 339, + "ËĮoËIJ": 340, + "ĠËĪaËIJ": 341, + "Ġpɾ": 342, + "si": 343, + "ĠËĪe": 344, + "ĠjuËIJ": 345, + "ĠkËĮe": 346, + "ËĮɪ": 347, + "ÉĶn": 348, + "ĠsËĪÊĮ": 349, + "ĠËĪu": 350, + "ni": 351, + "Ġst": 352, + "ĠdiËIJ": 353, + "ĠkeËIJ": 354, + "ĠjËĪiou": 355, + "ËĪaiÉľ": 356, + "ĠdÊĴ": 357, + "ĠËĪÉĶ": 358, + "va": 359, + "ËIJɾ": 360, + "ËĪø": 361, + "ËĮÉĻÊĬ": 362, + "ĠpËĪu": 363, + "Ġsu": 364, + "Ġma": 365, + "ĠÉĻ": 366, + "dÊĴ": 367, + "Ġpʰ": 368, + "le": 369, + "in": 370, + "ĠtÉķhËĪi": 371, + "ĠwËĪo": 372, + "ro": 373, + "ËĮy": 374, + "ɾa": 375, + "ĠsËĪi": 376, + "ðÉĻ": 377, + "ĠseËIJ": 378, + "la": 379, + "ĠÊĴ": 380, + "mb": 381, + "ĠhËĪoËIJ": 382, + "Ġbʰ": 383, + "ĠÉĽÉ¾": 384, + "Ġðat": 385, + "sp": 386, + "ÉĶɾ": 387, + "en": 388, + "ĠsÉĻ": 389, + "ËĪÉĶÉľ": 390, + "ĠlËĮa": 391, + "ĠËĮÉĽ": 392, + "ĠËĪy": 393, + "É¡aËIJ": 394, + "ĠdÉĽÉ¾": 395, + "ËĪÉĽÊģ": 396, + "Éľkh": 397, + "ËĪiÉĻ": 398, + "ËĪan": 399, + "ĠmËĪo": 400, + "ËĪaβ": 401, + "Ġal": 402, + "ĠËĪeËIJ": 403, + "Ġθ": 404, + "ĠnËĪi": 405, + "pʰ": 406, + "lla": 407, + "Ġpl": 408, + "ËĪÅĵ": 409, + "jËĪÉiju": 410, + "Ġav": 411, + "ĠmËĪi": 412, + "ĠfËĪa": 413, + "ËĪÉľ": 414, + "me": 415, + "ËĮÉĻh": 416, + "ËĪuÉĻ": 417, + 
"it": 418, + "jËĪe": 419, + "Ġo": 420, + "ËĪÉľËIJ": 421, + "ĠtÉķËĪiou": 422, + "ÉĶËIJ": 423, + "ĠnÉĻ": 424, + "ËĪÉĻÉľn": 425, + "ĠmÉĻ": 426, + "ĠdeËIJ": 427, + "mo": 428, + "sa": 429, + "jËĪÉĶ": 430, + "ËĪal": 431, + "ĠtÉķËĪiÉĽ": 432, + "ĠÉ¡ÉĻ": 433, + "ða": 434, + "Ġɪz": 435, + "Ġsa": 436, + "ri": 437, + "ĠËĮil": 438, + "ËĮu": 439, + "ĠkaËIJ": 440, + "ĠÉĻËIJ": 441, + "ĠÉĸ": 442, + "Ġka": 443, + "ËĪÊĮhi": 444, + "ĠjeËIJ": 445, + "Ġtʰ": 446, + "ne": 447, + "kËIJ": 448, + "ĠtsËĪai": 449, + "ĠËĪeËIJk": 450, + "nk": 451, + "ti": 452, + "ËĪaÉľn": 453, + "ĠkËIJ": 454, + "É¡ÉĻn": 455, + "ËĪia": 456, + "ĠÉĶËIJɾ": 457, + "Êı": 458, + "ĠËĮÊĮ": 459, + "ĠzËĪaËIJ": 460, + "Ġlos": 461, + "ÉĽs": 462, + "ËĪÉĶn": 463, + "ÉĽnt": 464, + "ÉĽn": 465, + "ĠÉŁËĪoËIJ": 466, + "çt": 467, + "Ġdas": 468, + "ĠxËĮo": 469, + "ËĪuÉľ": 470, + "ËĪas": 471, + "ĠbËĪÊĮ": 472, + "ËĪiÉĽÉľn": 473, + "ÉIJ": 474, + "ĠtsuËIJ": 475, + "ĠpËĮÉĽ": 476, + "ĠnËĪÉĶ": 477, + "ÊĬt": 478, + "ma": 479, + "ĠnËĪo": 480, + "ĠlËĪɪ": 481, + "ËĪÉĽs": 482, + "ɪl": 483, + "ĠÉķËĪiÉĽ": 484, + "ĠËĪÊĬ": 485, + "ÉĴt": 486, + "to": 487, + "ĠËĪo": 488, + "ËĮon": 489, + "ĠkwËĪa": 490, + "Ġɪt": 491, + "ĠhoËIJ": 492, + "ËĪiËIJk": 493, + "ĠËĮaËIJpk": 494, + "ËĪaɪn": 495, + "æ": 496, + "ÉĻnt": 497, + "ta": 498, + "lo": 499, + "ĠnËĪÉij": 500, + "ĠlËĪa": 501, + "ËĪiÉľ": 502, + "ĠwËĪei": 503, + "ÉĽÊģ": 504, + "ĠtËĪa": 505, + "ĠɾËĮÉĻh": 506, + "ĠÉķËĪiÉij": 507, + "ËĮiËIJ": 508, + "ËĮÉĽl": 509, + "ĠtÉĻÉľ": 510, + "ĠkËĪuo": 511, + "ĠtËĪu": 512, + "jËĪÉĽ": 513, + "ĠËĮin": 514, + "ɾe": 515, + "ĠkoËIJ": 516, + "ĠkËĪa": 517, + "ɾi": 518, + "ĠtÉķËĪiÉij": 519, + "lÉĻ": 520, + "ĠkÉĻ": 521, + "ĠtËĪi": 522, + "ĠÅĭËĪyÉĻ": 523, + "Ġtsh": 524, + "er": 525, + "av": 526, + "ĠkÉĶn": 527, + "ËĪÉĻÉľÅĭ": 528, + "ðo": 529, + "ËĪaËIJn": 530, + "ĠbʰËĪi": 531, + "ĠkËIJjaËIJ": 532, + "ÉĻz": 533, + "ĠpÊģ": 534, + "ĠdËĪɪ": 535, + "ĠziËIJ": 536, + "É¡eËIJ": 537, + "ĠtËĪÉĻ": 538, + "ɪz": 539, + "ĠnËĮon": 540, + "taËIJ": 541, + "bl": 542, + "te": 543, + "nËĮeËIJ": 544, + 
"ËĪɪl": 545, + "so": 546, + "ko": 547, + "uÊģ": 548, + "ĠÉ£": 549, + "ĠpaÊģ": 550, + "ĠËĪÉĽ": 551, + "jËĪuËIJ": 552, + "ËĮÊĮ": 553, + "yn": 554, + "ËĪiËIJn": 555, + "ĠlËĪaɪ": 556, + "ËĪɪÅĭ": 557, + "ĠtÉķhËĪy": 558, + "ĠnËĪÊĮhi": 559, + "ĠdËĮe": 560, + "ĠjËĪÉiju": 561, + "ĠtËĪÉiju": 562, + "ĠhËĪo": 563, + "ɪd": 564, + "ĠthËĪÉij": 565, + "mËĪe": 566, + "ĠËĪÉĻ": 567, + "ja": 568, + "Ġph": 569, + "ÉĽt": 570, + "ĠkËĪÊĮ": 571, + "tÉĻn": 572, + "mËĪÉij": 573, + "wËĪe": 574, + "ĠËĮaɪn": 575, + "Ġðɪs": 576, + "É¡ÉĻ": 577, + "ĠnËĪaËIJ": 578, + "ĠbËĪaËIJ": 579, + "Ġaθ": 580, + "ĠmËĮa": 581, + "ËĪÊĮha": 582, + "ĠdËĮa": 583, + "ËĪÊı": 584, + "ĠɲËĮy": 585, + "ĠpËĪa": 586, + "ËĪaðo": 587, + "di": 588, + "bÉľ": 589, + "ɳ": 590, + "ĠwiËIJ": 591, + "ĠnËĪɪ": 592, + "ĠÉ¡ËĪÉĶÉľ": 593, + "tËIJo": 594, + "ËĮÉĻm": 595, + "ËĪaËIJr": 596, + "ĠmÉĽ": 597, + "ËĪeËIJÉ¡aËIJ": 598, + "ĠsËĮi": 599, + "ĠlËĮaËIJ": 600, + "nËĮaËIJ": 601, + "Ġsp": 602, + "tÊģ": 603, + "ĠÊİ": 604, + "ËĮÉijËIJ": 605, + "Ġkl": 606, + "kʰ": 607, + "il": 608, + "ĠÊĥt": 609, + "ĠËĮÊĬn": 610, + "al": 611, + "ĠsËĪÉĽ": 612, + "ĠmËĪaËIJ": 613, + "ĠÅĵ": 614, + "ĠÉ¡ËĪÊĮ": 615, + "ĠpËĮÉĽr": 616, + "ɾËĪa": 617, + "ËIJÊĪ": 618, + "ËĪaβa": 619, + "ĠwËĪÉĴ": 620, + "ĠxËĪuei": 621, + "ĠkhËĪo": 622, + "Ġlas": 623, + "ĠÉĹËĪo": 624, + "ĠfÉĽÉ¾": 625, + "ĠjËĪiÉĽ": 626, + "ĠtËĪe": 627, + "ĠkËĮÉĶ": 628, + "ĠdeËIJn": 629, + "Ġmo": 630, + "ĠpËĪi": 631, + "ĠtËĪÉij": 632, + "ËĪÉĽst": 633, + "wËĪÉij": 634, + "ËĪaɪt": 635, + "ÉĻÊĬ": 636, + "ĠËĪi": 637, + "ɪj": 638, + "aɪ": 639, + "ËĪaËIJÉľ": 640, + "ĠËĪɪs": 641, + "ĠpÉĶɾ": 642, + "Ã¦Éľn": 643, + "ka": 644, + "ÅĭÉ¡": 645, + "bÉĻn": 646, + "ÊĬf": 647, + "Ġpɹ": 648, + "ĠlËĮe": 649, + "ËĪiËIJd": 650, + "ËĪaËIJre": 651, + "ĠmËĪÊĮ": 652, + "ÉĻr": 653, + "ĠdÉij": 654, + "ËĪaËIJto": 655, + "ĠpËĪeËIJ": 656, + "ĠdËĪoËIJ": 657, + "ĠsËĮÊĬ": 658, + "ĠhËĪi": 659, + "ĠsËĪa": 660, + "ËĪeËIJn": 661, + "dÉĻ": 662, + "Ġpj": 663, + "ËĪÅĵÊģ": 664, + "lɪç": 665, + "ÉĴn": 666, + "ĠËĪÉĻr": 667, + "tËĪe": 668, + "Ġil": 669, 
+ "ËĪaËIJl": 670, + "ĠsËĮÉĻÊĬ": 671, + "sÊĪ": 672, + "ĠdËĪuËIJ": 673, + "hËĪÉij": 674, + "ĠxËĪou": 675, + "ĠlËĪaiÉľ": 676, + "wËĪo": 677, + "ËĪÉĽnte": 678, + "Ġsy": 679, + "Ġzɪç": 680, + "ĠÉ¡ËĪu": 681, + "ĠÉķËĪy": 682, + "ËĪÉĶËIJl": 683, + "ÉĶl": 684, + "ĠtËĪo": 685, + "ĠÊĭoËIJ": 686, + "ĠiËIJ": 687, + "wËĪaða": 688, + "ËĪando": 689, + "Ġaθɼnt": 690, + "ĠaθɼntwËĪaða": 691, + "ĠtËĪiÉĽ": 692, + "ËĪeiÉľ": 693, + "ĠpËĮa": 694, + "ĠnËĪaɪ": 695, + "wa": 696, + "Ġfr": 697, + "ĠÊIJËĪÉĻÉľn": 698, + "ËĪua": 699, + "mi": 700, + "ĠmËĪÉĽ": 701, + "ËĪeËIJkʰ": 702, + "cʰ": 703, + "ĠwËĪÉij": 704, + "sta": 705, + "Ġtu": 706, + "Ġsk": 707, + "ËĪÉĶl": 708, + "ËĪeËIJÊĪ": 709, + "ĠlËĪaËIJɪ": 710, + "ĠlËĪaËIJ": 711, + "ËĪÉĽËIJs": 712, + "ËĪÉĽÉ¾a": 713, + "ËĪÉĻÉľt": 714, + "Ġyn": 715, + "dÉĻn": 716, + "Ġdi": 717, + "ËĪiËIJs": 718, + "Ġðel": 719, + "ËĪÊĮr": 720, + "ĠhËĪaËIJ": 721, + "ĠbÉĻ": 722, + "ĠjËĪuËIJ": 723, + "lle": 724, + "sto": 725, + "ËĪɪt": 726, + "ËĪoËIJɾ": 727, + "bʰ": 728, + "mÉĻn": 729, + "ËĮuÉĻ": 730, + "ËĮÉĻɾ": 731, + "ËĪÊĮn": 732, + "ĠlËĪaɪk": 733, + "ĠbËĪa": 734, + "ɪð": 735, + "Ġlo": 736, + "zi": 737, + "ËĪÊĮst": 738, + "mËĪi": 739, + "ÉĶÊģ": 740, + "ĠnËĪɪçt": 741, + "Ġtɾ": 742, + "ĠdËĪeËIJkʰ": 743, + "ĠsËĮe": 744, + "ĠnËĪÉĻÊĬ": 745, + "Ġu": 746, + "Ġsi": 747, + "Ġɪç": 748, + "Ġpr": 749, + "ĠtÉķËĪy": 750, + "ĠmËĪu": 751, + "za": 752, + "ĠtÊģ": 753, + "Ġwɪð": 754, + "tËĪÉĽ": 755, + "ĠpËĪÊĮɾ": 756, + "ĠkËĪÉĶ": 757, + "ËĪoËIJr": 758, + "ĠhËĮa": 759, + "ĠkËĪonÉ¡": 760, + "ĠpuÊģ": 761, + "Ġdy": 762, + "ËĪɪn": 763, + "nte": 764, + "ĠkËĮa": 765, + "ËĪÉĻɪ": 766, + "Ġmi": 767, + "ĠÉ¡ËĮuÉĻ": 768, + "Ġʲ": 769, + "ĠfËĪÉij": 770, + "ĠvÉijËIJ": 771, + "ĠËĮaÊĬ": 772, + "ËĮuËIJ": 773, + "ĠËĪun": 774, + "ĠjËĪÊĮha": 775, + "juËIJ": 776, + "Ġmɪt": 777, + "ĠlËĪÉĽ": 778, + "ËĪeËIJÊĥ": 779, + "ĠfÉĶËIJ": 780, + "mÉĻ": 781, + "ɾt": 782, + "ĠkËĮon": 783, + "ĠlËĪÉĶ": 784, + "ĠxËĪÉiju": 785, + "pl": 786, + "ĠdËĪi": 787, + "ĠlËĪoËIJ": 788, + "sÉĻ": 789, + "ËĪaËIJva": 790, + "ĠlËĪu": 791, + 
"ĠÉ¡ËĮÉĻÊĬ": 792, + "Ġhav": 793, + "ĠËĮaËIJpkËĮoËIJ": 794, + "ɾËĪi": 795, + "ĠfËĪÉĻ": 796, + "ĠhËĮÉĻm": 797, + "ËĪonÉ¡Éľ": 798, + "jo": 799, + "ĠsÉĶ": 800, + "ËĪaËIJd": 801, + "wËĪiÉĻ": 802, + "ËĪand": 803, + "ËĮaɪn": 804, + "tɾ": 805, + "ĠËĮɪ": 806, + "ĠËĪuna": 807, + "ĠxwËĪÉij": 808, + "ĠjÉĶËIJ": 809, + "ÊģËĪi": 810, + "ĠkËĪuoÉľ": 811, + "Ġaβ": 812, + "ĠÉ¡ËĪaËIJ": 813, + "ano": 814, + "tÉĻl": 815, + "ĠrËĮe": 816, + "ËĮÊĮt": 817, + "ĠjËĪiÉij": 818, + "ĠɾËĮÉĻhaËIJ": 819, + "ĠmËĪe": 820, + "ĠËĪyÃ¦Éľn": 821, + "ĠfËĪu": 822, + "Ġbl": 823, + "nËĪi": 824, + "sÉĻn": 825, + "Ġaɪn": 826, + "ËĪiÊĬ": 827, + "Ġðeɪ": 828, + "Ġɪts": 829, + "Ġ(": 830, + "ËĪyËIJ": 831, + "ÉĻd": 832, + "ĠËĮo": 833, + "ĠÉĽs": 834, + "ĠviËIJ": 835, + "ËIJÉ¡eËIJ": 836, + "kËĪe": 837, + "ĠËĪal": 838, + "ÉĽl": 839, + "ĠÊĮ": 840, + "ËIJo": 841, + "ĠkËĪo": 842, + "ĠÊĪËĪuËIJ": 843, + "ĠsËĪɪ": 844, + "ËĪeËIJɾ": 845, + "Éľm": 846, + "ËĮÉĻn": 847, + "ËĪaËIJi": 848, + "ËĪoËIJl": 849, + "ɪËĮeËIJ": 850, + "ĠʲËĪy": 851, + "ĠkËĪÉĶËIJ": 852, + "sËĪi": 853, + "ĠlËĪe": 854, + "ËĮÉĴt": 855, + "ËĪiËIJp": 856, + "aÊģ": 857, + "ĠθËĪɪÅĭ": 858, + "ËĪÉĻËIJɪ": 859, + "ËĪÊĮl": 860, + "ĠhËĪoËIJtaËIJ": 861, + "ËĪoɪ": 862, + "nto": 863, + "zh": 864, + "ĠdeËIJm": 865, + "ĠkÉĶm": 866, + "ʰËĪiËIJk": 867, + "ĠdÊĴËĪÊĮst": 868, + "pɾ": 869, + "Ġly": 870, + "hËĪu": 871, + "ËĪÉĶø": 872, + "ËĪaËIJs": 873, + "ĠËĪan": 874, + "ĠËĪÉĴ": 875, + "Ġkan": 876, + "ĠtsËĪuo": 877, + "ËĪeËIJva": 878, + "Ġɡɾ": 879, + "Ġpo": 880, + "ĠtÊĥËĪÉĶ": 881, + "Êİa": 882, + "ĠmËĮi": 883, + "Êĥt": 884, + "tËĪi": 885, + "ĠhËĪÊĮ": 886, + "tÊĥe": 887, + "ĠfÉĶn": 888, + "ve": 889, + "ĠnËĮe": 890, + "ËĪÉĶÊģ": 891, + "iz": 892, + "ĠsËĪuo": 893, + "ËĪÉĽËIJr": 894, + "wËĪaÊģ": 895, + "ËĪaða": 896, + "Åĭk": 897, + "po": 898, + "ĠkËĪi": 899, + "ËĪad": 900, + "ĠvËĪi": 901, + "tÉķ": 902, + "ĠkËĪÉĻ": 903, + "ĠwËĪu": 904, + "ÉĴz": 905, + "ĠvÉijËIJɾ": 906, + "ÊģËĪÉĽ": 907, + "ĠkËĪaËIJ": 908, + "ke": 909, + "nÉĻ": 910, + "ËĪÊĮb": 911, + "ËĪuËIJɾ": 912, + "ËĮÉĻËIJ": 913, + 
"ĠÊĪʰËĪiËIJk": 914, + "ĠkËĪu": 915, + "ĠbËĮÊĮt": 916, + "Ġat": 917, + "Ġfɹ": 918, + "ËĪax": 919, + "ĠzoËIJ": 920, + "ĠtËĪaËIJ": 921, + "ĠðËĮe": 922, + "neËIJ": 923, + "ĠÉijËIJ": 924, + "ĠaÊĬf": 925, + "am": 926, + "ÊĬÅĭ": 927, + "ĠÉĶËIJ": 928, + "ĠÉķËĪiÉľÅĭ": 929, + "ĠËĪÉĶËIJl": 930, + "ɪm": 931, + "jËĪo": 932, + "ËĪiËIJÉŁ": 933, + "ĠkwËĮÉĽ": 934, + "ĠmËĪas": 935, + "ÉĻh": 936, + "ĠËĪaÊĬ": 937, + "ËĪÉĶɪ": 938, + "É¡ÉĻɾ": 939, + "rÉĻn": 940, + "ËĪɪk": 941, + "sse": 942, + "ĠpËĪÉij": 943, + "ĠÉĹËĮe": 944, + "ĠÉĹËĪi": 945, + "Ġaz": 946, + "ĠÉ¡ËĪÊĮjaËIJ": 947, + "ze": 948, + "ĠÉĹËĮaËIJ": 949, + "ĠfËĪi": 950, + "ĠËĮÉĴn": 951, + "ĠxËĪo": 952, + "ĠËĮÊĬna": 953, + "ĠtʰaËIJ": 954, + "ĠsÉij": 955, + "ËĪeɪÊĥÉĻn": 956, + "ĠtÉķËĪiÉľ": 957, + "ĠÉŁaËIJ": 958, + "pËIJ": 959, + "Ġply": 960, + "θËĪi": 961, + "ËIJÉĸ": 962, + "ĠtËĪuei": 963, + "ĠlËĪÉĻ": 964, + "ĠdÉijËIJ": 965, + "ft": 966, + "ËĪam": 967, + "ĠsËĪÊĮkt": 968, + "ĠtËĪou": 969, + "ĠpËĪiÉĽ": 970, + "ĠËĪai": 971, + "ĠwËĪÉĴn": 972, + "ĠzËĮaɪn": 973, + "Ġest": 974, + "ĠmÉĶ": 975, + "ĠtÉķjËĪÉiju": 976, + "Éľp": 977, + "ËĪÊĮz": 978, + "bi": 979, + "ËĪÉĽËIJseËIJ": 980, + "ĠlËĪy": 981, + "ĠmËĮe": 982, + "ĠdËĮÉĽl": 983, + "ËĪiËIJl": 984, + "ĠkËĮomo": 985, + "ĠhËĪaÉľn": 986, + "ËĪoËIJne": 987, + "ĠkËĪÊĮɾt": 988, + "ĠsyÊģ": 989, + "ËĮÉĶɾ": 990, + "Ġɪf": 991, + "uv": 992, + "zÉĻn": 993, + "ol": 994, + "Ïĩ": 995, + "im": 996, + "ĠmËĪiÉĽ": 997, + "Ġðɪ": 998, + "ĠvËĪÉĽ": 999, + "ÊĬd": 1000, + "Ġtr": 1001, + "ËĪeËIJs": 1002, + "ðe": 1003, + "de": 1004, + "ʰÏĩ": 1005, + "ÉŁÊ°": 1006, + "ËĮÉĻËIJÉªÉľ": 1007, + "bËIJ": 1008, + "ËĪÊĬk": 1009, + "ĠnËĪÉĶÉªÉľ": 1010, + "ĠËĮiËIJ": 1011, + "ËĪÉijËIJt": 1012, + "ËĪiËIJɾ": 1013, + "Ġtɹ": 1014, + "ɾÉĶ": 1015, + "ĠwÉĴz": 1016, + "Ġvu": 1017, + "bÉĻl": 1018, + "bÉĻ": 1019, + "ɹi": 1020, + "nts": 1021, + "ĠsËĪaËIJ": 1022, + "dʰ": 1023, + "ĠtÊĬ": 1024, + "ĠÊİËĮi": 1025, + "βa": 1026, + "hËĪÉĻÉľÅĭ": 1027, + "ĠsËĪiËIJ": 1028, + "ĠpËĮaɾa": 1029, + "ËĪÉĽÉ¾ÉĶ": 1030, + "ËĪɪs": 1031, + "É£o": 1032, + "ĠËĮal": 
1033, + "or": 1034, + "ĠbËĪÊĮh": 1035, + "ĠkËĪoËIJ": 1036, + "ĠtËĪÉĽ": 1037, + "ĠpËĪo": 1038, + "ĠÊĴÉĻ": 1039, + "pÊģ": 1040, + "ĠËĪaɪ": 1041, + "hËĪÉijÉľÅĭ": 1042, + "ÉĻli": 1043, + "ËĪeɪt": 1044, + "ĠjËĪiouÉľ": 1045, + "ĠdËĪÉĻ": 1046, + "ĠmËĪÉĶËIJ": 1047, + "lËĪi": 1048, + "ËĮyÉĻ": 1049, + "ĠlËĪoËIJÉ¡": 1050, + "ĠnËĪÊĮ": 1051, + "ĠhËĪÊĬ": 1052, + "ĠnËĪÉĻÉľÅĭ": 1053, + "ĠÊģÉĻ": 1054, + "zËĪi": 1055, + "ĠtËĪuËIJ": 1056, + "ĠkËĮome": 1057, + "ĠlËĪeËIJ": 1058, + "ËĪaËIJtaËIJ": 1059, + "Ġan": 1060, + "ĠËĪyu": 1061, + "ĠËĮÊĮÉ¡ÉĻɾ": 1062, + "ĠËĪɪn": 1063, + "ĠhËĪoÉĻ": 1064, + "vÉĻ": 1065, + "ËĪøËIJ": 1066, + "θja": 1067, + "ËĪuÉĻÉľn": 1068, + "ĠkÉĻɾ": 1069, + "ËĪat": 1070, + "jËĪø": 1071, + "ËĪÉĽtÊģ": 1072, + "ĠpËĪÉiju": 1073, + "stÉĻ": 1074, + "ĠwÉĴt": 1075, + "ËĪeËIJl": 1076, + "ÊĪi": 1077, + "ĠxËĪaiÉľ": 1078, + "ËĪyÊģ": 1079, + "ĠhËĪoËIJÉ¡aËIJ": 1080, + "ĠtsËĪi": 1081, + "ĠËĪÊĮp": 1082, + "ĠnËĮÉĴt": 1083, + "ĠlËĪɪeËIJ": 1084, + "ĠhËĪa": 1085, + "Ġfl": 1086, + "ĠnËĪeËIJ": 1087, + "ËĮaËIJɪ": 1088, + "ĠtËĪuo": 1089, + "tÊĥËIJ": 1090, + "sËĪe": 1091, + "bʰi": 1092, + "ĠbËĪÊĮhÊĬt": 1093, + "ËĪÉĽnd": 1094, + "ĠsËĪÉĶ": 1095, + "ÉĻns": 1096, + "ËĮÉĻl": 1097, + "ÉĽÉľ": 1098, + "ĠÉ¡l": 1099, + "ËĪɪɾ": 1100, + "ËĪaËIJta": 1101, + "ÉľËIJ": 1102, + "ËĪÉĽnto": 1103, + "skËĮoËIJ": 1104, + "ËĪÉĽk": 1105, + "tsi": 1106, + "ĠtËĪonÉ¡": 1107, + "ĠbiËIJ": 1108, + "ĠhËĪaËIJɪ": 1109, + "ĠbËĪi": 1110, + "jj": 1111, + "Êİi": 1112, + "Ġkʰ": 1113, + "ĠsËĪo": 1114, + "llo": 1115, + "Ġbaɪ": 1116, + "ĠÉĽnt": 1117, + "ĠËĪiËIJ": 1118, + "ĠÉ¡ËĪo": 1119, + "ɾeËIJ": 1120, + "ĠkÊĭ": 1121, + "ĠmËĪeiÉľ": 1122, + "ÊĬËĪÉĶËIJ": 1123, + "ĠtËĪaɪ": 1124, + "Ġsus": 1125, + "Ġri": 1126, + "ĠvËĮÉĽ": 1127, + "ËĪiËIJno": 1128, + "vano": 1129, + "ĠdËĮiËIJ": 1130, + "ĠÊIJËĪaÉľn": 1131, + "ÊĤ": 1132, + "ĠÉIJb": 1133, + "ËĪaËIJh": 1134, + "ɪÊĥ": 1135, + "ĠdËĮella": 1136, + "tËIJi": 1137, + "ĠËĪÊĬn": 1138, + "ĠhiËIJ": 1139, + "ĠbËĪaËIJt": 1140, + "ĠthËĪi": 1141, + "Ġam": 1142, + "ĠËĪoËIJ": 1143, + "Ġhu": 1144, + 
"ĠkËĪÊĮh": 1145, + "ĠzËĪÉijËIJ": 1146, + "ĠÉ¡ËĮÉĶ": 1147, + "ĠËĪÉĻÊĬ": 1148, + "yËĪi": 1149, + "ĠlËĪÊĮ": 1150, + "ĠdËĪeËIJ": 1151, + "ĠsËĪÉĶËIJ": 1152, + "skËĮeËIJ": 1153, + "ɾo": 1154, + "ÊģËĪÉij": 1155, + "tËĪa": 1156, + "ĠkËĪÊĬ": 1157, + "ËĪante": 1158, + "ĠdÉĶ": 1159, + "ĠsËĪeɪ": 1160, + "ĠsÉĽt": 1161, + "ɹɪ": 1162, + "ĠÉ¡ËĮÉĻÊĬɪÅĭ": 1163, + "zo": 1164, + "ĠjËĪaËIJ": 1165, + "ĠÉĴvðÉĻ": 1166, + "ĠÊĿ": 1167, + "ĠÉĽl": 1168, + "ĠsËĪoËIJ": 1169, + "ĠthËĪiÉľ": 1170, + "ĠËĪÉĽl": 1171, + "ĠlyËĮi": 1172, + "ndÊĴ": 1173, + "ĠÉķjËĪÉiju": 1174, + "θa": 1175, + "ĠɾËĮÉĻheËIJ": 1176, + "Ġmaɪ": 1177, + "jÉĻ": 1178, + "ĠËĪÊĮb": 1179, + "asjËĪÉĶ": 1180, + "dÊģ": 1181, + "ĠkhËĪa": 1182, + "ĠËĪes": 1183, + "vi": 1184, + "fi": 1185, + "ËĮÉĻb": 1186, + "Ġre": 1187, + "ĠavËĮÉĽ": 1188, + "ĠtËĮi": 1189, + "Ġkɾ": 1190, + "Ġbɪk": 1191, + "ste": 1192, + "ËĪeËIJÊĥc": 1193, + "pt": 1194, + "zÉĻ": 1195, + "ĠwËĪaËIJ": 1196, + "kl": 1197, + "ĠsËĪÊĮm": 1198, + "ɪÊĪ": 1199, + "dz": 1200, + "vo": 1201, + "ËĮaÊĬt": 1202, + "nde": 1203, + "ĠdÉĽs": 1204, + "ĠÉŁËĪaËIJ": 1205, + "ĠrËĮi": 1206, + "sËĮeËIJ": 1207, + "É¡i": 1208, + "Ġals": 1209, + "ËĪiðo": 1210, + "ĠnËĪiÉľn": 1211, + "ÊĬl": 1212, + "tsËIJ": 1213, + "ËĪanto": 1214, + "ĠÉĹËĪÉĻÊĬ": 1215, + "kËIJi": 1216, + "ĠsËĪÊĮb": 1217, + "ĠnËĪa": 1218, + "ĠlËĮo": 1219, + "ĠphËĪi": 1220, + "mËĮe": 1221, + "Ġfa": 1222, + "kÉĻ": 1223, + "ĠzËĪu": 1224, + "ns": 1225, + "ĠÊģe": 1226, + "ĠbËĪo": 1227, + "ËĪaËIJti": 1228, + "Ġman": 1229, + "ĠlËĪiÉij": 1230, + "ĠÉĹËĮyÉĻ": 1231, + "ĠfËĪÉĶËIJ": 1232, + "ĠkÊĭËĪeËIJÊĥc": 1233, + "ĠxËĪÉij": 1234, + "ĠtÉķËĪu": 1235, + "jÉĻɾ": 1236, + "Ġɪst": 1237, + "wËĪi": 1238, + "ĠËĮaɪnÉĻ": 1239, + "ɪɡ": 1240, + "ĠsÊĪ": 1241, + "ËĪiÉĻl": 1242, + "ĠnËĪiÉĽÉľn": 1243, + "ĠËĮÉĽËIJ": 1244, + "ËĪaɪnd": 1245, + "ĠzËĪi": 1246, + "vÉĻn": 1247, + "mz": 1248, + "ðos": 1249, + "dÊĴËIJ": 1250, + "jËĪa": 1251, + "ɾËĪÉĶ": 1252, + "lËĪe": 1253, + "ʲ": 1254, + "ĠvËĪÉĶ": 1255, + "ĠlËĪiÉĽ": 1256, + "θe": 1257, + "mËĪente": 1258, + "ĠɪnðÉĻ": 1259, + 
"Ġaɪm": 1260, + "nÉĻn": 1261, + "ĠhÉĻm": 1262, + "ɾaËIJ": 1263, + "ĠsËĪuoÉľ": 1264, + "ĠɲËĪi": 1265, + "ĠɹËĪiÉĻl": 1266, + "lËĪa": 1267, + "ĠbËĪÉĶ": 1268, + "ĠkËĪai": 1269, + "ÊģËĪa": 1270, + "ĠwËĪÉľËIJ": 1271, + "ĠaËIJ": 1272, + "Ġpas": 1273, + "ËĪÊĮs": 1274, + "wËĪÉĽÉ¾": 1275, + "ĠÉĹËĪe": 1276, + "ĠhËĮatÉĻ": 1277, + "aɪn": 1278, + "ĠËĪÉĶpʰ": 1279, + "ÊģËĪe": 1280, + "ĠÉŁaËIJËĪeËIJÉ¡aËIJ": 1281, + "ĠËĪÊĬs": 1282, + "ĠtÉķhËĪiÉľ": 1283, + "ntÊĥ": 1284, + "ĠxËĪuo": 1285, + "ËĪuÊģ": 1286, + "Ġɪm": 1287, + "ɳÉĸ": 1288, + "ËĪyÉĻÉľkh": 1289, + "ĠËĪyÉĽ": 1290, + "ĠmËĮaËIJ": 1291, + "ÅĵÊģ": 1292, + "ĠËĪalt": 1293, + "ĠkÉĻm": 1294, + "Êİo": 1295, + "ĠÉIJn": 1296, + "Ġfy": 1297, + "ĠËĮÉĽra": 1298, + "ĠÉ¡ËĪÊĬ": 1299, + "ĠpËĪÊĮ": 1300, + "ls": 1301, + "ĠlËĪiËIJ": 1302, + "ĠÊĤËĪy": 1303, + "ĠbɪkËĪÊĮz": 1304, + "ĠÉ¡ÉĽt": 1305, + "Ġbɾ": 1306, + "tʰ": 1307, + "tÉĻlËĮÉĻb": 1308, + "xo": 1309, + "skËĮaËIJ": 1310, + "ɲʲ": 1311, + "ËĪeËIJkÊĪ": 1312, + "rÉĻ": 1313, + "tÊĥo": 1314, + "ĠpÊģÉĶ": 1315, + "ĠɹËĪaɪt": 1316, + "ĠpËĪei": 1317, + "ËĮɪç": 1318, + "jËĪÉĽÉ¾": 1319, + "tËIJa": 1320, + "ĠÉIJbËĮaÊĬt": 1321, + "ĠkÊĭËĪeËIJÊĥcÉĻn": 1322, + "ĠvËĪe": 1323, + "ÊĬÉľ": 1324, + "ĠakËĪe": 1325, + "ĠpËĪai": 1326, + "vËĪÉĽ": 1327, + "Ġθɹ": 1328, + "ɪf": 1329, + "ĠavËĪÉĽ": 1330, + "ĠkËĪe": 1331, + "dËĪi": 1332, + "ËĪeËIJÉĸ": 1333, + "ĠbÉĻt": 1334, + "ÊĪʰ": 1335, + "teËIJ": 1336, + "θjËĪÉĶn": 1337, + "dÉľ": 1338, + "ĠjËĪiÉľ": 1339, + "Ġve": 1340, + "É£ËĪu": 1341, + "ËĪÊĮhÉĻl": 1342, + "ĠpÉĶ": 1343, + "ĠÉ¡r": 1344, + "Ġða": 1345, + "ĠvËĪiËIJ": 1346, + "ĠËĮÉijËIJ": 1347, + "ËĪÉĻÊĬnt": 1348, + "ĠbËĪaËIJɾ": 1349, + "ĠmËĪÊĮtÉĻlËĮÉĻb": 1350, + "ld": 1351, + "ĠtÉķËĮÉĶ": 1352, + "pa": 1353, + "ðËĪad": 1354, + "ËĪiɾ": 1355, + "ĠxËĪu": 1356, + "ĠlËĪiÉľÅĭ": 1357, + "ËĪeɪs": 1358, + "ĠÉĹËĮeÉľn": 1359, + "ĠthËĪiÉĽ": 1360, + "tËIJe": 1361, + "ĠavËĮÉĽk": 1362, + "ĠËĮÉĶ": 1363, + "ĠkËĪÉiju": 1364, + "ɪv": 1365, + "iËIJz": 1366, + "ËĪos": 1367, + "Ġɡɹ": 1368, + "and": 1369, + "ĠlËĪiou": 1370, + "ĠËĪoÉľ": 1371, + 
"É¡l": 1372, + "ĠpËĪÉĶËIJ": 1373, + "ĠmËĮeËIJ": 1374, + "ĠkËĪÉĴ": 1375, + "nos": 1376, + "çÉĻn": 1377, + "fÉĻn": 1378, + "ĠsËĪÊĮktËĮeËIJ": 1379, + "ĠËĪaɪn": 1380, + "ËĪoËIJre": 1381, + "jËĪÉĽn": 1382, + "ĠðËĪÉĽn": 1383, + "ĠtÉķhËĪiÉĽÉľn": 1384, + "ĠhËĪaɪ": 1385, + "ɾËĪÉĽ": 1386, + "ĠsËĪu": 1387, + "ĠkËĪɪjaËIJ": 1388, + "ĠpjËĮÊĬ": 1389, + "ĠhÉĻmËĮaËIJ": 1390, + "ĠËĮÊĮp": 1391, + "ĠpËĪÊĮhÉĻl": 1392, + "ĠxËĪÉĻ": 1393, + "dËĪe": 1394, + "ĠmÉij": 1395, + "ĠÊĬm": 1396, + "ndÉĻ": 1397, + "ĠdËĪÉĻÊĬnt": 1398, + "ËĪeËIJÊĥÉĻn": 1399, + "Ġðats": 1400, + "is": 1401, + "ĠcËĪaËIJh": 1402, + "pe": 1403, + "ĠsËĮo": 1404, + "ĠðËĪe": 1405, + "ĠsËĪaËIJt": 1406, + "ËĪaÊģ": 1407, + "ĠsËĪe": 1408, + "ÉĻk": 1409, + "ɪÊĭ": 1410, + "ĠkËĪoËIJi": 1411, + "kÉĶ": 1412, + "ĠvËĪaËIJÊĬ": 1413, + "ĠfËĪei": 1414, + "ĠlËĪeËIJk": 1415, + "ĠhËĪiÉĻ": 1416, + "ĠaÊĬ": 1417, + "ËĪÉĽndo": 1418, + "ËĪes": 1419, + "ĠzËĪÉĶ": 1420, + "ĠËĪÉĽÉ¾a": 1421, + "nËĪiÉľn": 1422, + "ĠkËĪÊĮm": 1423, + "ĠlËĪÉĴ": 1424, + "ɪst": 1425, + "ĠpÉij": 1426, + "ĠfËĪÉĶ": 1427, + "ĠthËĪonÉ¡": 1428, + "nke": 1429, + "ËĮɪk": 1430, + "ĠɲËĪÉĻ": 1431, + "ËĮÊĮm": 1432, + "ËĪiËIJt": 1433, + "ĠwËĪÉĴnt": 1434, + "ËĪaβan": 1435, + "ĠbËĪÊĮr": 1436, + "ÉĽnd": 1437, + "ĠËĮÉijËIJbÉľ": 1438, + "ĠvËĪaɪ": 1439, + "ĠtÊĥËĮi": 1440, + "ĠθËĪɪÅĭk": 1441, + "sti": 1442, + "Ġkɹ": 1443, + "ĠËĪaÊĬt": 1444, + "stÉĻn": 1445, + "ĠÊĭËĪÊĮn": 1446, + "ĠÉ¡ËĮaËIJ": 1447, + "ËĪaËIJÉľÉ²": 1448, + "Êģi": 1449, + "ĠnËĪÉĶx": 1450, + "ĠɹËĪiÉĻlɪ": 1451, + "ĠvËĮi": 1452, + "ĠðeÉĻ": 1453, + "ËĮɪtÊĥ": 1454, + "ĠvËĪyÉĻ": 1455, + "ĠËĮaËIJpkËĮaËIJ": 1456, + "ĠfËĮaËIJɪ": 1457, + "ĠpËĪÉĶ": 1458, + "ĠnËĪÊĮmb": 1459, + "θes": 1460, + "jËĪÉĽÊģ": 1461, + "ĠkËĪÊĬcʰ": 1462, + "mËĪÉĽ": 1463, + "ĠvËĪu": 1464, + "ĠlÅĵÊģ": 1465, + "ĠiËIJm": 1466, + "ÊĪÉĻɾ": 1467, + "tÊĥi": 1468, + "ËIJs": 1469, + "ĠtËĪy": 1470, + "ĠmËĪiÉľÅĭ": 1471, + "ɾËĪe": 1472, + "mËĮa": 1473, + "ĠmËĮiËIJ": 1474, + "ĠÉĽks": 1475, + "ɪp": 1476, + "ĠkËĪÊĮɾnËĮaËIJ": 1477, + "ĠËĮaÊĬx": 1478, + "rËĪiËIJ": 1479, + "ĠcËĪÊĮl": 
1480, + "mos": 1481, + "ĠkËĪÊĮɾtËĮeËIJ": 1482, + "iËIJɾ": 1483, + "kÉĻn": 1484, + "ĠdËĪu": 1485, + "naËIJ": 1486, + "ĠpwËĪe": 1487, + "ËĮÉĶɪ": 1488, + "ĠtÉķhËĪiÉĽ": 1489, + "ĠβËĪi": 1490, + "ËĪiÉĽÉľt": 1491, + "Ġte": 1492, + "ËĪaðos": 1493, + "mËĪa": 1494, + "ĠvËĪo": 1495, + "ĠmËĪɪ": 1496, + "ĠbËĮi": 1497, + "ad": 1498, + "do": 1499, + "ĠnËĪaÊĬ": 1500, + "ĠʲËĪyÉľ": 1501, + "wËĪÉĽ": 1502, + "ËĪis": 1503, + "el": 1504, + "Ġpar": 1505, + "ĠtËĪai": 1506, + "ĠdËĪɪjaËIJ": 1507, + "hËĪi": 1508, + "ĠɾËĪÊĮ": 1509, + "ĠdËĪe": 1510, + "ËĪaɪd": 1511, + "Ġper": 1512, + "ĠsËĮÉĶ": 1513, + "we": 1514, + "ÊĬm": 1515, + "Ġin": 1516, + "ĠjËĪuËIJz": 1517, + "ËĪiËIJpÉĻl": 1518, + "ĠÊĭËĪaËIJl": 1519, + "ĠetËĪÉĽ": 1520, + "ËĮÉĽm": 1521, + "ĠnËĪu": 1522, + "ËĪÉĽkt": 1523, + "ĠiËIJɾ": 1524, + "Ġbɹ": 1525, + "ĠtshËĪi": 1526, + "ĠÉĹËĪÉĶÉľ": 1527, + "ĠkwËĮa": 1528, + "ĠfËĪuÉľ": 1529, + "wËĮa": 1530, + "ĠdËĪiËIJ": 1531, + "ĠÉ¡ËĪyÉĻ": 1532, + "ËĮÉĽËIJ": 1533, + "rËĪa": 1534, + "Ġne": 1535, + "ĠzËĪyÉĻ": 1536, + "ĠbËĪaɪ": 1537, + "ĠÉŁËĪÊĮb": 1538, + "ËĪuËIJto": 1539, + "ÊĬnt": 1540, + "Ġcʰ": 1541, + "ËĪÉĽnti": 1542, + "ËĪoÉĻ": 1543, + "ĠsËĮÊĮm": 1544, + "ĠlÉij": 1545, + "ËĮeva": 1546, + "É¾ÉĽ": 1547, + "ntÉľ": 1548, + "ĠmËĪÉĽn": 1549, + "ËĪÉijËIJk": 1550, + "Ġkil": 1551, + "ËĪones": 1552, + "ff": 1553, + "ĠmËĪÉĽËIJ": 1554, + "ĠvËĪÉĻɪ": 1555, + "ĠËĪÉĶËIJ": 1556, + "ĠËĮɪnt": 1557, + "ÊĬn": 1558, + "Ġwɪl": 1559, + "Ġsin": 1560, + "ĠËĮalla": 1561, + "ĠaβËĪia": 1562, + "pi": 1563, + "ËĪoÉľ": 1564, + "ɪjËĮaËIJ": 1565, + "ku": 1566, + "ĠvËĪɪ": 1567, + "Ġtut": 1568, + "ĠtËĪeÉľ": 1569, + "ĠhËĪÉĶ": 1570, + "βɾe": 1571, + "sÉĻɾ": 1572, + "ĠkhËĪai": 1573, + "ĠmËĪÉĶ": 1574, + "Ġta": 1575, + "ĠɲËĪaËIJ": 1576, + "Ġnu": 1577, + "ËĪuËIJn": 1578, + "ĠÉĻËIJÉľ": 1579, + "ĠËĪaÊĬf": 1580, + "ËĪiËIJdÉľ": 1581, + "nti": 1582, + "ĠpËĪiËIJpÉĻl": 1583, + "Ġkj": 1584, + "Ġpe": 1585, + "ĠmËĪÉij": 1586, + "ËĮaɪ": 1587, + "ËĪaËIJle": 1588, + "ĠvËĮÉĻËIJÉªÉľ": 1589, + "mpo": 1590, + "ĠkËĪɪt": 1591, + "ĠnËĮÉĽ": 1592, + 
"ĠÉŁËĪaËIJtaËIJ": 1593, + "ĠsËĪaËIJtʰ": 1594, + "ĠÉŁËĪi": 1595, + "Ġso": 1596, + "ĠbËĪÉĽ": 1597, + "kËĪi": 1598, + "ɪti": 1599, + "Ġtsi": 1600, + "ĠkÊģ": 1601, + "ËĮÉĴ": 1602, + "É¡ÉĻl": 1603, + "kst": 1604, + "ĠmËĪÉĻËIJ": 1605, + "ËĪÊĮk": 1606, + "ĠnËĪaËIJÊĬ": 1607, + "Ġap": 1608, + "ĠlËĪɪkʰ": 1609, + "lli": 1610, + "ĠkwËĪal": 1611, + "ĠËĪÉĻËIJ": 1612, + "ĠtsËĪuei": 1613, + "Ġdo": 1614, + "ĠkËIJjËĪo": 1615, + "ÊĬz": 1616, + "ĠpËĪaËIJ": 1617, + "ĠmËĪuËIJ": 1618, + "ĠÉ¡ÉĻv": 1619, + "rËĪi": 1620, + "Ġtw": 1621, + "ËĮɪn": 1622, + "dËĪÉij": 1623, + "ĠðËĪi": 1624, + "ĠËĪaËIJi": 1625, + "ĠhËĪiÉĽ": 1626, + "ĠðËĮÉĽm": 1627, + "ĠpʰËĪɪɾ": 1628, + "ÉĴm": 1629, + "ĠËĮeËIJ": 1630, + "ĠthËĪaiÉľ": 1631, + "ĠvËĪas": 1632, + "ĠnÉijËIJ": 1633, + "pÉĻn": 1634, + "ĠpËĮÉĻɾ": 1635, + "ĠÉĹËĪaËIJɪ": 1636, + "ËĪouÉľ": 1637, + "ĠÊIJËĪuÉľ": 1638, + "ĠmËĪan": 1639, + "ĠtËĪÉĻÉªÉľ": 1640, + "ĠlËĪaËIJÊĬ": 1641, + "mËĪÉĽnte": 1642, + "ĠfËĪam": 1643, + "sjËĪÉĶ": 1644, + "ĠpËĪÉĻ": 1645, + "ËĪeËIJm": 1646, + "ĠpËĪÊĮr": 1647, + "jËĪi": 1648, + "ĠlÉĽ": 1649, + "Ġten": 1650, + "ËĪoËIJra": 1651, + "ki": 1652, + "ĠÊĤËĪaËIJÊĬ": 1653, + "kɪ": 1654, + "bËIJe": 1655, + "ËĪalt": 1656, + "ðɪ": 1657, + "pËĪi": 1658, + "ĠËĮÉĽnt": 1659, + "ĠmËĪei": 1660, + "ĠhËĪÉĻÊĬ": 1661, + "ĠhËĪÉĽÉ¾": 1662, + "jËĪÉij": 1663, + "ĠhËĪÊĬaËIJ": 1664, + "mÉľ": 1665, + "Ġdʰ": 1666, + "ĠtÊĥËĪe": 1667, + "lËĪÉĽ": 1668, + "ËĪaËIJte": 1669, + "ĠpËĪuËIJ": 1670, + "ĠmËĪÊĬ": 1671, + "ËĪaËIJɪÊĪ": 1672, + "diËIJ": 1673, + "ĠfɹÉĴm": 1674, + "ĠhËĪÉijËIJ": 1675, + "βo": 1676, + "ĠmËĪiÉľn": 1677, + "ĠðiËIJz": 1678, + "ĠkËĪou": 1679, + "ËĪiËIJna": 1680, + "ĠavËĮeva": 1681, + "ĠËĪaËIJɾ": 1682, + "ĠnËĪuËIJɾ": 1683, + "ĠβËĪe": 1684, + "Ġzaɪn": 1685, + "ËĪÉĽd": 1686, + "ÉĹ": 1687, + "ËĪeɪk": 1688, + "sËĮÉĻÊĬ": 1689, + "ËĪeËIJÉŁ": 1690, + "ĠÊĤËĪÉĻËIJ": 1691, + "je": 1692, + "cʰËIJ": 1693, + "ËĪÉĶr": 1694, + "ÉĽËIJ": 1695, + "ĠtÉķhËĪyÃ¦Éľn": 1696, + "ĠËĮaɪnÉĻn": 1697, + "ĠiËIJn": 1698, + "ĠbËĪÊĮc": 1699, + "ËĪiËIJm": 1700, + "ɾas": 1701, + "ËĮÉĻs": 
1702, + "ĠvËĪeËIJ": 1703, + "ĠËĪÉĻrÉľ": 1704, + "ĠduËIJ": 1705, + "ntÉĻ": 1706, + "ĠpɹËĪÉĴ": 1707, + "ĠbËĪɪ": 1708, + "ĠwËĪoÉľ": 1709, + "nËĮi": 1710, + "ĠhÉIJ": 1711, + "ĠkËĪÉĽ": 1712, + "Ġet": 1713, + "jËĪÉĽndo": 1714, + "ĠËĪaiÉľ": 1715, + "Ġli": 1716, + "ĠËĪaÊĬs": 1717, + "kËIJo": 1718, + "ĠÉĹËĪyÉĻ": 1719, + "keËIJ": 1720, + "ĠfËĪiËIJl": 1721, + "ĠbʰËĪaËIJi": 1722, + "ĠÉ¡ÉĻÊĥ": 1723, + "ÊĴËĪe": 1724, + "ĠnjËĪuËIJ": 1725, + "ĠËĪak": 1726, + "ĠÉĹËĪaËIJ": 1727, + "zËĪa": 1728, + "vËĪe": 1729, + "ĠhËĮaÊĬ": 1730, + "ÉIJç": 1731, + "ĠɾËĪÊĮkʰ": 1732, + "pËĪe": 1733, + "ĠtÉĻbi": 1734, + "ĠpËĪÊĮhÉĻlËĮeËIJ": 1735, + "ĠfËĪÉĽ": 1736, + "ĠwËĮɪtÊĥ": 1737, + "ĠtÉķËĪyÉĽÉľ": 1738, + "wËĮe": 1739, + "ËĮaɪt": 1740, + "ĠnÉijËIJx": 1741, + "ĠkËĪÉĶËIJn": 1742, + "ÊĬk": 1743, + "ĠbËĪaËIJd": 1744, + "ÅĭÉĻn": 1745, + "Ġni": 1746, + "ĠbËĪe": 1747, + "ĠmËĮÊĬ": 1748, + "ËĪar": 1749, + "ĠmËĮeɪk": 1750, + "ĠsËĪaËIJɾ": 1751, + "βe": 1752, + "ĠtÉķhËĪiÉľÅĭ": 1753, + "itËĪe": 1754, + "kËĮe": 1755, + "ËĪÉĽËIJl": 1756, + "ËĮÉĴn": 1757, + "ËĮÉij": 1758, + "ĠbËĪɪl": 1759, + "ĠwÊĬd": 1760, + "ĠbËĪoËIJl": 1761, + "rd": 1762, + "iÉĻ": 1763, + "Ġda": 1764, + "ĠbËĪaËIJÊĬ": 1765, + "ĠnËĪÊĮmbÉĻɾ": 1766, + "ËĪaËIJÉªÉľ": 1767, + "ĠÉĽm": 1768, + "ĠmiËIJɾ": 1769, + "ËĪeɪm": 1770, + "los": 1771, + "ËĮÉĽt": 1772, + "ĠËĮaÊĬs": 1773, + "ĠmËĪaÉľt": 1774, + "ĠwËĪuÉĻ": 1775, + "ĠwËĪeɪ": 1776, + "Ġseɲ": 1777, + "ĠbjËĪÉĽ": 1778, + "ĠwÉĽn": 1779, + "fl": 1780, + "ĠkhwËĪa": 1781, + "dËĪÉĽ": 1782, + "vɹɪ": 1783, + "ĠËĪaɾ": 1784, + "jËĪÉijuÉľ": 1785, + "ĠËĮaËIJpkËĮeËIJ": 1786, + "bÊģ": 1787, + "ĠtËĪaɪm": 1788, + "ĠËĪÉij": 1789, + "ĠsËĮa": 1790, + "ĠzËĪoɪ": 1791, + "ËĪÉĶɾa": 1792, + "ĠdËĪø": 1793, + "ËĪÉĶɾt": 1794, + "ĠÅĭËĪÉĶ": 1795, + "min": 1796, + "ĠlËĪÊĬk": 1797, + "ËĪÉĶËIJt": 1798, + "ĠËĪÉĶtɾ": 1799, + "ĠfËĪaɪ": 1800, + "ĠÉ¡ÉĴt": 1801, + "ËĪeËIJÉĻn": 1802, + "kËĪÉĶ": 1803, + "ĠvËĪÉĽÉ¹i": 1804, + "mÉĽ": 1805, + "ËĪaɪz": 1806, + "Ġesp": 1807, + "ɲa": 1808, + "ĠlËĪo": 1809, + "ËĪÉĽËIJra": 1810, + "βËĪi": 1811, + "ouÉľ": 
1812, + "ËĮÉĻk": 1813, + "tÊĥuËIJ": 1814, + "ĠnËĪyÉĻ": 1815, + "ÊĪɾ": 1816, + "ĠÉ¡ËĪy": 1817, + "ĠtËĪoðo": 1818, + "ËĪɪçt": 1819, + "Ġmɪç": 1820, + "ĠËĪand": 1821, + "ĠkwËĮÉĽl": 1822, + "ĠÊĤËĪaËIJ": 1823, + "ĠnËĪiÉľ": 1824, + "ËĪÉĶp": 1825, + "ËĪiËIJz": 1826, + "ĠÊĤËĪaÊĬ": 1827, + "ĠɾËĮÉĻhi": 1828, + "ĠsËĮÊĬo": 1829, + "ĠÉĽÉ¡": 1830, + "ĠdÅĵ": 1831, + "ĠÉ¡ËĮaËIJÉªÉľ": 1832, + "dɪ": 1833, + "lËĮa": 1834, + "stËĪi": 1835, + "ĠdËĮiËIJz": 1836, + "ĠtËĮÊĬ": 1837, + "θi": 1838, + "ĠËĪɪskËĮoËIJ": 1839, + "ndÉĻn": 1840, + "Ġtsv": 1841, + "ĠhËĪÉĻËIJ": 1842, + "ĠÊĥËĪÊĬ": 1843, + "ÉĻtËĮeËIJ": 1844, + "pËĮÉĽ": 1845, + "ËĪaɾÉĶn": 1846, + "ĠpÉĽÊģ": 1847, + "Ġy": 1848, + "mnËĮeËIJ": 1849, + "ËĪÉĽllo": 1850, + "ĠÉ¡ËĪÉĻ": 1851, + "ĠËĮad": 1852, + "ĠÊĥv": 1853, + "ËĪÊıɾ": 1854, + "rËĪe": 1855, + "yËIJ": 1856, + "ĠpËĪaËIJs": 1857, + "ĠËĪÉĽn": 1858, + "ɪdÊĴ": 1859, + "ËĪuai": 1860, + "Ġfi": 1861, + "ĠtËĪyÉĻ": 1862, + "ËĪaËIJÉŁ": 1863, + "ĠtjËĪe": 1864, + "ËĪaËIJnaËIJ": 1865, + "stɾ": 1866, + "Êİe": 1867, + "ËĮeɪt": 1868, + "ba": 1869, + "ðas": 1870, + "vÊģ": 1871, + "ĠzËĪÉĻËIJ": 1872, + "ËĪaËIJli": 1873, + "ÉŁÊ°eËIJ": 1874, + "ËĪaËIJteËIJ": 1875, + "ĠvËĪa": 1876, + "Ġsal": 1877, + "ËĪaËIJno": 1878, + "ĠÉ¡ÉĻz": 1879, + "ĠhËĪoËIJti": 1880, + "ĠɲËĪiÉĽ": 1881, + "tÉľ": 1882, + "ĠËĪaËIJp": 1883, + "ĠwËĪÉĽl": 1884, + "ĠmËĪɪl": 1885, + "ĠfyËIJɾ": 1886, + "ËĪÉĽËIJsaËIJ": 1887, + "ĠbËĮiËIJ": 1888, + "ËĪaËIJjaËIJ": 1889, + "ËĪɪp": 1890, + "ĠfÊģ": 1891, + "tsiËĪoËIJne": 1892, + "ĠwËĪuÉľ": 1893, + "Ġvi": 1894, + "ĠwËĪÉijÉľn": 1895, + "ËĪoËIJn": 1896, + "ĠÉĹËĪÉĻɪ": 1897, + "ĠÊĿËĪo": 1898, + "Ġra": 1899, + "mÉĻnt": 1900, + "ËĪaÊĬnd": 1901, + "ĠpÉĽÉ¾": 1902, + "ĠÉĹËĪaËIJÊĬ": 1903, + "oËIJɾ": 1904, + "hËĪo": 1905, + "ĠÉĴn": 1906, + "ĠÊİe": 1907, + "ĠsËĪɪks": 1908, + "É¡n": 1909, + "ĠÉ¡ËĪa": 1910, + "Ġθj": 1911, + "ĠpËĪe": 1912, + "spe": 1913, + "ĠvËĪÉĻ": 1914, + "ĠfËĪɪ": 1915, + "ĠËĮɪntÊĬ": 1916, + "lÉĻn": 1917, + "ĠnËĪiËIJd": 1918, + "ĠsËĮÊĬa": 1919, + "ĠËĪum": 1920, + "ĠdËĪeɪ": 1921, + "ĠËĪÊĮbʰi": 
1922, + "ËĪÉijËIJɾ": 1923, + "ĠbËĪiÉĽÉľt": 1924, + "Êİos": 1925, + "ĠtshËĪaiÉľ": 1926, + "ĠËĮɪskËĮaËIJ": 1927, + "ĠaÊĬÉĻ": 1928, + "ĠËĪyæ": 1929, + "Ġdyn": 1930, + "ĠmËĪiËIJn": 1931, + "ĠËĪÊĮcʰËIJ": 1932, + "ĠsÉĽ": 1933, + "ĠnËĪy": 1934, + "ĠnËĮÉĽl": 1935, + "ɡɾ": 1936, + "ÊĥËĪe": 1937, + "ĠÊĤËĮÉĽ": 1938, + "ĠËĪÉĽvɹɪ": 1939, + "ËĪÉĽlp": 1940, + "ĠbËĪak": 1941, + "ĠeËIJ": 1942, + "ĠfËĪaËIJ": 1943, + "ĠkÉĽl": 1944, + "ĠËĪeËIJs": 1945, + "jËĪaËIJd": 1946, + "ĠlËĮi": 1947, + "mbɾe": 1948, + "ktÉĻ": 1949, + "nta": 1950, + "tËĪu": 1951, + "ĠðËĪat": 1952, + "ĠËĪaβ": 1953, + "ÉĻɹi": 1954, + "ĠkwËĮÉĽlla": 1955, + "ĠbÉĻn": 1956, + "rËĮÉĽ": 1957, + "ĠnÉĶ": 1958, + "ĠÉ¡ËĪɪ": 1959, + "ĠËĪap": 1960, + "ɹÉĻ": 1961, + "ËĪaÉľkh": 1962, + "ĠÊIJËĪi": 1963, + "ĠËĪÉijËIJ": 1964, + "ɪɡÉĻn": 1965, + "ĠwËĪai": 1966, + "ĠpÉĻt": 1967, + "kËIJa": 1968, + "ĠbËĪÉĽËIJ": 1969, + "ËĪeËIJÊĭ": 1970, + "lsÉĻÊĬ": 1971, + "ĠcËĪaËIJhɪËĮeËIJ": 1972, + "ĠkÉĻn": 1973, + "ĠËĮaɪnÉĻm": 1974, + "ËĪuËIJt": 1975, + "ĠhËĪaÊĬ": 1976, + "ĠtËĪanto": 1977, + "ĠhÉIJz": 1978, + "ĠsËĪÊĮɾ": 1979, + "Ġno": 1980, + "ĠtËĪÉĶËIJ": 1981, + "ĠzËĪaɪ": 1982, + "ĠtÉķËĪiÉĽÉľ": 1983, + "ĠkozËĪi": 1984, + "ĠkËĪei": 1985, + "ðËĪÉĶɾ": 1986, + "ËĮÉĶÊģ": 1987, + "ĠtËĪÊĮɾ": 1988, + "ĠÊIJËĪÉĻ": 1989, + "ĠÉķËĪyÉĽÉľ": 1990, + "ĠmËĮÊĬÉŁÊ°eËIJ": 1991, + "mf": 1992, + "ĠvËĪiËIJdÉľ": 1993, + "kËĪa": 1994, + "ĠÉIJÉ¡": 1995, + "kw": 1996, + "ĠÊģÉĽ": 1997, + "xÉĻn": 1998, + "ĠdÊĬ": 1999, + "ĠkËĪÊĮɾnËĮeËIJ": 2000, + "jËĪaËIJdaËIJ": 2001, + "ĠfÉĻ": 2002, + "ĠËĮimp": 2003, + "Ġhɪz": 2004, + "ĠʰÏĩ": 2005, + "ËĪoËIJni": 2006, + "ĠxËĪiÉľ": 2007, + "ËĪeËIJsÊĪ": 2008, + "ÊıbÉľ": 2009, + "ËĮÉĶɾke": 2010, + "ĠÉ¡ËĪÉĻÊĬ": 2011, + "ËĪɪÊĥÉĻn": 2012, + "les": 2013, + "ĠfËĪiËIJ": 2014, + "É¡tÉĻ": 2015, + "ËĪeËIJre": 2016, + "ĠvËĮaËIJ": 2017, + "ĠËĪeɪ": 2018, + "ĠmËĪuÉĻÉľn": 2019, + "ĠÉ¡ËĪÊĬd": 2020, + "ĠmËĮaɪn": 2021, + "zËĪe": 2022, + "ĠlËĪiÉľ": 2023, + "Ġmu": 2024, + "ĠkËĮÉĽl": 2025, + "ĠjËĮÉĻh": 2026, + "ĠfËĮÉĶɾ": 2027, + "fɹ": 2028, + "ĠkËĪaɪn": 2029, + 
"ĠËĪÉĴlsÉĻÊĬ": 2030, + "θɪÅĭ": 2031, + "ĠthËĪonÉ¡Éľ": 2032, + "tËĪÉij": 2033, + "θjo": 2034, + "mËĪÉĶ": 2035, + "Ġos": 2036, + "ĠsÊĬ": 2037, + "ĠsËĪÊĮmÉĻ": 2038, + "ĠvËĮÉĽn": 2039, + "nËĪo": 2040, + "ĠËĪaktÊĥuËIJ": 2041, + "É£a": 2042, + "Ġtʰi": 2043, + "ĠfËĮi": 2044, + "ĠvËĪÉĽl": 2045, + "ĠtËĪutËIJi": 2046, + "xos": 2047 + }, + "merges": [ + [ + "Ë", + "Ī" + ], + [ + "Ë", + "IJ" + ], + [ + "ËĪ", + "É" + ], + [ + "Ë", + "Į" + ], + [ + "É", + "Ļ" + ], + [ + "ËĪ", + "a" + ], + [ + "ËĪ", + "i" + ], + [ + "Ġ", + "t" + ], + [ + "É", + "ª" + ], + [ + "É", + "¾" + ], + [ + "Ġ", + "É" + ], + [ + "Ġ", + "k" + ], + [ + "É", + "ľ" + ], + [ + "Ġ", + "s" + ], + [ + "ËĪ", + "e" + ], + [ + "É", + "Ľ" + ], + [ + "ËĪ", + "o" + ], + [ + "Ġ", + "l" + ], + [ + "ËĪÉ", + "Ľ" + ], + [ + "Ġ", + "d" + ], + [ + "Ê", + "Ĭ" + ], + [ + "ËĪa", + "ËIJ" + ], + [ + "Ġ", + "p" + ], + [ + "Ì", + "ĥ" + ], + [ + "Ġ", + "m" + ], + [ + "ËĪ", + "u" + ], + [ + "Å", + "ĭ" + ], + [ + "Ã", + "°" + ], + [ + "ËĪÉ", + "Ķ" + ], + [ + "Ê", + "Į" + ], + [ + "ËĮ", + "a" + ], + [ + "Ġ", + "h" + ], + [ + "ËĪ", + "ÊĮ" + ], + [ + "Ġ", + "n" + ], + [ + "Ê", + "ģ" + ], + [ + "ËĪÉ", + "ij" + ], + [ + "Ê", + "ĥ" + ], + [ + "e", + "ËIJ" + ], + [ + "Ġ", + "a" + ], + [ + "Ġ", + "b" + ], + [ + "É", + "Ķ" + ], + [ + "ËĪÉ", + "Ļ" + ], + [ + "ÉĻ", + "n" + ], + [ + "Ġ", + "f" + ], + [ + "ËĪÉ", + "ª" + ], + [ + "É", + "¡" + ], + [ + "ËĪe", + "ËIJ" + ], + [ + "Ġ", + "j" + ], + [ + "n", + "t" + ], + [ + "Ġ", + "ð" + ], + [ + "Ġ", + "ËĮ" + ], + [ + "Ġt", + "s" + ], + [ + "ĠÉ", + "¡" + ], + [ + "É", + "ķ" + ], + [ + "ËĪo", + "ËIJ" + ], + [ + "Ê", + "°" + ], + [ + "a", + "ËIJ" + ], + [ + "ËĪ", + "y" + ], + [ + "Ġt", + "Éķ" + ], + [ + "ËĪi", + "ËIJ" + ], + [ + "Ġ", + "Ê" + ], + [ + "Ġ", + "v" + ], + [ + "Ġ", + "w" + ], + [ + "s", + "t" + ], + [ + "É", + "ij" + ], + [ + "n", + "d" + ], + [ + "ËĮ", + "i" + ], + [ + "Ì", + "ª" + ], + [ + "ËĮ", + "e" + ], + [ + "Ġ", + "z" + ], + [ + "ËĪa", + "ɪ" + ], + [ + "ËĪi", + "ÉĽ" + ], + [ + "Î", + "²" 
+ ], + [ + "É", + "¹" + ], + [ + "Ġ", + "ËĮa" + ], + [ + "Î", + "¸" + ], + [ + "Ġh", + "ÉĽ" + ], + [ + "Ê", + "Ī" + ], + [ + "i", + "ËIJ" + ], + [ + "ËĮ", + "o" + ], + [ + "Ġ", + "ɪ" + ], + [ + "Éľ", + "n" + ], + [ + "Ġ", + "x" + ], + [ + "Ġt", + "ÉĻ" + ], + [ + "ËĪu", + "ËIJ" + ], + [ + "ËĮ", + "ÉĻ" + ], + [ + "Ġj", + "ËĪi" + ], + [ + "ËĮ", + "ÉĽ" + ], + [ + "ĠÉ", + "Ľ" + ], + [ + "Ġ", + "ËĪa" + ], + [ + "ËĮa", + "ËIJ" + ], + [ + "Ġl", + "a" + ], + [ + "Ġð", + "e" + ], + [ + "ĠhÉĽ", + "ËIJ" + ], + [ + "Ġ", + "e" + ], + [ + "Ã", + "§" + ], + [ + "ÉĻ", + "l" + ], + [ + "o", + "ËIJ" + ], + [ + "ËĪÉij", + "u" + ], + [ + "Ê", + "Ĵ" + ], + [ + "u", + "ËIJ" + ], + [ + "ĠÉ", + "Ĺ" + ], + [ + "ĠÉ", + "ķ" + ], + [ + "ËĮ", + "eËIJ" + ], + [ + "ĠtÉķ", + "ËĪi" + ], + [ + "o", + "s" + ], + [ + "ËĪÉĶ", + "ËIJ" + ], + [ + "a", + "s" + ], + [ + "ËĪ", + "ÊĬ" + ], + [ + "Ġ", + "i" + ], + [ + "ËĪa", + "i" + ], + [ + "É", + "²" + ], + [ + "ɪ", + "n" + ], + [ + "t", + "s" + ], + [ + "Éľ", + "Åĭ" + ], + [ + "ĠÉ", + "Ł" + ], + [ + "Ġ", + "Êĥ" + ], + [ + "ËĪe", + "ɪ" + ], + [ + "ÉĽ", + "ɾ" + ], + [ + "ËĪÉĽ", + "ËIJ" + ], + [ + "ËĪÉĽ", + "ɾ" + ], + [ + "Ġ", + "r" + ], + [ + "t", + "Êĥ" + ], + [ + "ËĮ", + "ÉĶ" + ], + [ + "Ġd", + "ÉĻ" + ], + [ + "t", + "ÉĻ" + ], + [ + "o", + "u" + ], + [ + "ËĪy", + "ÉĻ" + ], + [ + "ĠËĮ", + "i" + ], + [ + "ÉĻ", + "ɾ" + ], + [ + "ËĪÉĻ", + "ÊĬ" + ], + [ + "ËĪÊĮ", + "ɾ" + ], + [ + "ËĪÉ", + "Ĵ" + ], + [ + "Ġt", + "h" + ], + [ + "ËĪo", + "n" + ], + [ + "Ê", + "ĭ" + ], + [ + "ËĪÉij", + "ËIJ" + ], + [ + "ËĪÊĮ", + "h" + ], + [ + "w", + "ËĪa" + ], + [ + "ËĪe", + "i" + ], + [ + "l", + "l" + ], + [ + "ĠÉ", + "IJ" + ], + [ + "Éij", + "ËIJ" + ], + [ + "a", + "n" + ], + [ + "É", + "Ł" + ], + [ + "ĠÊ", + "ĭ" + ], + [ + "Ġk", + "o" + ], + [ + "k", + "h" + ], + [ + "ɪ", + "Åĭ" + ], + [ + "ËĪaËIJ", + "ɪ" + ], + [ + "Ġt", + "Êĥ" + ], + [ + "ËĪaËIJ", + "t" + ], + [ + "ĠËĮ", + "e" + ], + [ + "ĠtÉķ", + "h" + ], + [ + "ËĪu", + "o" + ], + [ + "ËĪon", + "É¡" + ], + [ + "É", + "ĸ" + 
], + [ + "a", + "t" + ], + [ + "Ġk", + "e" + ], + [ + "É", + "Ĵ" + ], + [ + "ĠÉķ", + "ËĪi" + ], + [ + "Ã", + "¸" + ], + [ + "ĠÉ", + "ij" + ], + [ + "ËĪeËIJ", + "k" + ], + [ + "Å", + "ĵ" + ], + [ + "r", + "e" + ], + [ + "Ġ", + "ɾ" + ], + [ + "Ġk", + "ÉĶ" + ], + [ + "ËĮ", + "ÊĬ" + ], + [ + "s", + "k" + ], + [ + "Ġ", + "ÊĬ" + ], + [ + "Ġa", + "nd" + ], + [ + "ɪ", + "ç" + ], + [ + "Ġm", + "e" + ], + [ + "ËĪa", + "ɾ" + ], + [ + "Ġ", + "ËĪɪ" + ], + [ + "n", + "a" + ], + [ + "Ġ", + "β" + ], + [ + "Ġl", + "ËĪi" + ], + [ + "j", + "aËIJ" + ], + [ + "l", + "i" + ], + [ + "n", + "o" + ], + [ + "Ġɪ", + "n" + ], + [ + "Ġd", + "ËĮi" + ], + [ + "ĠÉ", + "²" + ], + [ + "t", + "ËIJ" + ], + [ + "ÉĻ", + "m" + ], + [ + "Ġl", + "ÉĻ" + ], + [ + "Ġð", + "ÉĻ" + ], + [ + "ɪ", + "k" + ], + [ + "ËĪÉĽ", + "l" + ], + [ + "Éľ", + "t" + ], + [ + "Ġs", + "e" + ], + [ + "e", + "s" + ], + [ + "ËĪo", + "u" + ], + [ + "ËĪa", + "ÊĬ" + ], + [ + "ĠÉ", + "Ķ" + ], + [ + "ɪ", + "t" + ], + [ + "Ġ", + "Åĭ" + ], + [ + "ËĪÉĽ", + "n" + ], + [ + "Ê", + "İ" + ], + [ + "Ġk", + "h" + ], + [ + "ËĪÉĽ", + "nt" + ], + [ + "ËĪaËIJ", + "ɾ" + ], + [ + "Ġk", + "i" + ], + [ + "m", + "p" + ], + [ + "l", + "t" + ], + [ + "É", + "£" + ], + [ + "Ġp", + "a" + ], + [ + "ËĪÉĻ", + "ËIJ" + ], + [ + "ɪ", + "s" + ], + [ + "ĠÉ", + "Ĵ" + ], + [ + "Ġl", + "e" + ], + [ + "ɪ", + "Éľ" + ], + [ + "ËĪÉĽ", + "t" + ], + [ + "Ġd", + "e" + ], + [ + "ĠÉ", + "¹" + ], + [ + "Ġt", + "ËĪoËIJ" + ], + [ + "Ġ", + "Êģ" + ], + [ + "Êĥ", + "ÉĻn" + ], + [ + "ĠÊĬ", + "nt" + ], + [ + "ËĪÉĶ", + "ɾ" + ], + [ + "ËĪa", + "ð" + ], + [ + "Ġa", + "ɪ" + ], + [ + "ĠÊ", + "IJ" + ], + [ + "Ġm", + "ËĪa" + ], + [ + "r", + "a" + ], + [ + "Ġk", + "ËĪɪ" + ], + [ + "k", + "t" + ], + [ + "ËIJ", + "p" + ], + [ + "ĠÊ", + "Ī" + ], + [ + "ËĪaËIJ", + "ÊĬ" + ], + [ + "Ġk", + "ËĪÊĮɾ" + ], + [ + "Ġ", + "ËĪÊĮ" + ], + [ + "ĠÉĴ", + "v" + ], + [ + "Ġe", + "l" + ], + [ + "k", + "s" + ], + [ + "Ġk", + "w" + ], + [ + "ÉĻ", + "t" + ], + [ + "nd", + "o" + ], + [ + "e", + "i" + ], + [ + "ĠËĮa", + 
"ËIJp" + ], + [ + "s", + "e" + ], + [ + "ÉĻ", + "ɹ" + ], + [ + "ËĪu", + "ei" + ], + [ + "ÉĻ", + "s" + ], + [ + "Ġk", + "ËĮo" + ], + [ + "ĠÊ", + "Ĥ" + ], + [ + "ĠËĮ", + "ÊĬ" + ], + [ + "Ġ", + "c" + ], + [ + "ĠÉĽ", + "n" + ], + [ + "ËĪa", + "nt" + ], + [ + "θ", + "j" + ], + [ + "ËĮo", + "ËIJ" + ], + [ + "Ġ", + "ËĪaËIJ" + ], + [ + "Ġp", + "ɾ" + ], + [ + "s", + "i" + ], + [ + "Ġ", + "ËĪe" + ], + [ + "Ġj", + "uËIJ" + ], + [ + "Ġk", + "ËĮe" + ], + [ + "ËĮ", + "ɪ" + ], + [ + "ÉĶ", + "n" + ], + [ + "Ġs", + "ËĪÊĮ" + ], + [ + "Ġ", + "ËĪu" + ], + [ + "n", + "i" + ], + [ + "Ġs", + "t" + ], + [ + "Ġd", + "iËIJ" + ], + [ + "Ġk", + "eËIJ" + ], + [ + "ĠjËĪi", + "ou" + ], + [ + "ËĪai", + "Éľ" + ], + [ + "Ġd", + "ÊĴ" + ], + [ + "Ġ", + "ËĪÉĶ" + ], + [ + "v", + "a" + ], + [ + "ËIJ", + "ɾ" + ], + [ + "ËĪ", + "ø" + ], + [ + "ËĮÉĻ", + "ÊĬ" + ], + [ + "Ġp", + "ËĪu" + ], + [ + "Ġs", + "u" + ], + [ + "Ġm", + "a" + ], + [ + "Ġ", + "ÉĻ" + ], + [ + "d", + "ÊĴ" + ], + [ + "Ġp", + "ʰ" + ], + [ + "l", + "e" + ], + [ + "i", + "n" + ], + [ + "ĠtÉķh", + "ËĪi" + ], + [ + "Ġw", + "ËĪo" + ], + [ + "r", + "o" + ], + [ + "ËĮ", + "y" + ], + [ + "ɾ", + "a" + ], + [ + "Ġs", + "ËĪi" + ], + [ + "ð", + "ÉĻ" + ], + [ + "Ġs", + "eËIJ" + ], + [ + "l", + "a" + ], + [ + "ĠÊ", + "Ĵ" + ], + [ + "m", + "b" + ], + [ + "Ġh", + "ËĪoËIJ" + ], + [ + "Ġb", + "ʰ" + ], + [ + "ĠÉĽ", + "ɾ" + ], + [ + "Ġð", + "at" + ], + [ + "s", + "p" + ], + [ + "ÉĶ", + "ɾ" + ], + [ + "e", + "n" + ], + [ + "Ġs", + "ÉĻ" + ], + [ + "ËĪÉĶ", + "Éľ" + ], + [ + "Ġl", + "ËĮa" + ], + [ + "ĠËĮ", + "ÉĽ" + ], + [ + "Ġ", + "ËĪy" + ], + [ + "É¡", + "aËIJ" + ], + [ + "Ġd", + "ÉĽÉ¾" + ], + [ + "ËĪÉĽ", + "Êģ" + ], + [ + "Éľ", + "kh" + ], + [ + "ËĪi", + "ÉĻ" + ], + [ + "ËĪa", + "n" + ], + [ + "Ġm", + "ËĪo" + ], + [ + "ËĪa", + "β" + ], + [ + "Ġa", + "l" + ], + [ + "Ġ", + "ËĪeËIJ" + ], + [ + "Ġ", + "θ" + ], + [ + "Ġn", + "ËĪi" + ], + [ + "p", + "ʰ" + ], + [ + "ll", + "a" + ], + [ + "Ġp", + "l" + ], + [ + "ËĪ", + "Åĵ" + ], + [ + "j", + "ËĪÉiju" + ], + [ + "Ġa", + 
"v" + ], + [ + "Ġm", + "ËĪi" + ], + [ + "Ġf", + "ËĪa" + ], + [ + "ËĪÉ", + "ľ" + ], + [ + "m", + "e" + ], + [ + "ËĮÉĻ", + "h" + ], + [ + "ËĪu", + "ÉĻ" + ], + [ + "i", + "t" + ], + [ + "j", + "ËĪe" + ], + [ + "Ġ", + "o" + ], + [ + "ËĪÉľ", + "ËIJ" + ], + [ + "ĠtÉķËĪi", + "ou" + ], + [ + "ÉĶ", + "ËIJ" + ], + [ + "Ġn", + "ÉĻ" + ], + [ + "ËĪÉĻ", + "Éľn" + ], + [ + "Ġm", + "ÉĻ" + ], + [ + "Ġd", + "eËIJ" + ], + [ + "m", + "o" + ], + [ + "s", + "a" + ], + [ + "j", + "ËĪÉĶ" + ], + [ + "ËĪa", + "l" + ], + [ + "ĠtÉķ", + "ËĪiÉĽ" + ], + [ + "ĠÉ¡", + "ÉĻ" + ], + [ + "ð", + "a" + ], + [ + "Ġɪ", + "z" + ], + [ + "Ġs", + "a" + ], + [ + "r", + "i" + ], + [ + "ĠËĮi", + "l" + ], + [ + "ËĮ", + "u" + ], + [ + "Ġk", + "aËIJ" + ], + [ + "ĠÉĻ", + "ËIJ" + ], + [ + "ĠÉ", + "ĸ" + ], + [ + "Ġk", + "a" + ], + [ + "ËĪÊĮh", + "i" + ], + [ + "Ġj", + "eËIJ" + ], + [ + "Ġt", + "ʰ" + ], + [ + "n", + "e" + ], + [ + "k", + "ËIJ" + ], + [ + "Ġts", + "ËĪai" + ], + [ + "Ġ", + "ËĪeËIJk" + ], + [ + "n", + "k" + ], + [ + "t", + "i" + ], + [ + "ËĪa", + "Éľn" + ], + [ + "Ġk", + "ËIJ" + ], + [ + "É¡", + "ÉĻn" + ], + [ + "ËĪi", + "a" + ], + [ + "ĠÉĶ", + "ËIJɾ" + ], + [ + "Ê", + "ı" + ], + [ + "ĠËĮ", + "ÊĮ" + ], + [ + "Ġz", + "ËĪaËIJ" + ], + [ + "Ġl", + "os" + ], + [ + "ÉĽ", + "s" + ], + [ + "ËĪÉĶ", + "n" + ], + [ + "ÉĽ", + "nt" + ], + [ + "ÉĽ", + "n" + ], + [ + "ĠÉŁ", + "ËĪoËIJ" + ], + [ + "ç", + "t" + ], + [ + "Ġd", + "as" + ], + [ + "Ġx", + "ËĮo" + ], + [ + "ËĪu", + "Éľ" + ], + [ + "ËĪa", + "s" + ], + [ + "Ġb", + "ËĪÊĮ" + ], + [ + "ËĪiÉĽ", + "Éľn" + ], + [ + "É", + "IJ" + ], + [ + "Ġts", + "uËIJ" + ], + [ + "Ġp", + "ËĮÉĽ" + ], + [ + "Ġn", + "ËĪÉĶ" + ], + [ + "ÊĬ", + "t" + ], + [ + "m", + "a" + ], + [ + "Ġn", + "ËĪo" + ], + [ + "Ġl", + "ËĪɪ" + ], + [ + "ËĪÉĽ", + "s" + ], + [ + "ɪ", + "l" + ], + [ + "ĠÉķ", + "ËĪiÉĽ" + ], + [ + "Ġ", + "ËĪÊĬ" + ], + [ + "ÉĴ", + "t" + ], + [ + "t", + "o" + ], + [ + "Ġ", + "ËĪo" + ], + [ + "ËĮo", + "n" + ], + [ + "Ġk", + "wËĪa" + ], + [ + "Ġɪ", + "t" + ], + [ + "Ġh", + "oËIJ" + ], + 
[ + "ËĪiËIJ", + "k" + ], + [ + "ĠËĮaËIJp", + "k" + ], + [ + "ËĪaɪ", + "n" + ], + [ + "Ã", + "¦" + ], + [ + "ÉĻn", + "t" + ], + [ + "t", + "a" + ], + [ + "l", + "o" + ], + [ + "Ġn", + "ËĪÉij" + ], + [ + "Ġl", + "ËĪa" + ], + [ + "ËĪi", + "Éľ" + ], + [ + "Ġw", + "ËĪei" + ], + [ + "ÉĽ", + "Êģ" + ], + [ + "Ġt", + "ËĪa" + ], + [ + "Ġɾ", + "ËĮÉĻh" + ], + [ + "ĠÉķËĪi", + "Éij" + ], + [ + "ËĮi", + "ËIJ" + ], + [ + "ËĮÉĽ", + "l" + ], + [ + "ĠtÉĻ", + "Éľ" + ], + [ + "Ġk", + "ËĪuo" + ], + [ + "Ġt", + "ËĪu" + ], + [ + "j", + "ËĪÉĽ" + ], + [ + "ĠËĮi", + "n" + ], + [ + "ɾ", + "e" + ], + [ + "Ġk", + "oËIJ" + ], + [ + "Ġk", + "ËĪa" + ], + [ + "ɾ", + "i" + ], + [ + "ĠtÉķËĪi", + "Éij" + ], + [ + "l", + "ÉĻ" + ], + [ + "Ġk", + "ÉĻ" + ], + [ + "Ġt", + "ËĪi" + ], + [ + "ĠÅĭ", + "ËĪyÉĻ" + ], + [ + "Ġts", + "h" + ], + [ + "e", + "r" + ], + [ + "a", + "v" + ], + [ + "ĠkÉĶ", + "n" + ], + [ + "ËĪÉĻ", + "ÉľÅĭ" + ], + [ + "ð", + "o" + ], + [ + "ËĪaËIJ", + "n" + ], + [ + "Ġbʰ", + "ËĪi" + ], + [ + "ĠkËIJ", + "jaËIJ" + ], + [ + "ÉĻ", + "z" + ], + [ + "Ġp", + "Êģ" + ], + [ + "Ġd", + "ËĪɪ" + ], + [ + "Ġz", + "iËIJ" + ], + [ + "É¡", + "eËIJ" + ], + [ + "Ġt", + "ËĪÉĻ" + ], + [ + "ɪ", + "z" + ], + [ + "Ġn", + "ËĮon" + ], + [ + "t", + "aËIJ" + ], + [ + "b", + "l" + ], + [ + "t", + "e" + ], + [ + "n", + "ËĮeËIJ" + ], + [ + "ËĪɪ", + "l" + ], + [ + "s", + "o" + ], + [ + "k", + "o" + ], + [ + "u", + "Êģ" + ], + [ + "ĠÉ", + "£" + ], + [ + "Ġpa", + "Êģ" + ], + [ + "Ġ", + "ËĪÉĽ" + ], + [ + "j", + "ËĪuËIJ" + ], + [ + "ËĮ", + "ÊĮ" + ], + [ + "y", + "n" + ], + [ + "ËĪiËIJ", + "n" + ], + [ + "Ġl", + "ËĪaɪ" + ], + [ + "ËĪɪ", + "Åĭ" + ], + [ + "ĠtÉķh", + "ËĪy" + ], + [ + "Ġn", + "ËĪÊĮhi" + ], + [ + "Ġd", + "ËĮe" + ], + [ + "Ġj", + "ËĪÉiju" + ], + [ + "Ġt", + "ËĪÉiju" + ], + [ + "Ġh", + "ËĪo" + ], + [ + "ɪ", + "d" + ], + [ + "Ġth", + "ËĪÉij" + ], + [ + "m", + "ËĪe" + ], + [ + "Ġ", + "ËĪÉĻ" + ], + [ + "j", + "a" + ], + [ + "Ġp", + "h" + ], + [ + "ÉĽ", + "t" + ], + [ + "Ġk", + "ËĪÊĮ" + ], + [ + "t", + "ÉĻn" + ], + [ + 
"m", + "ËĪÉij" + ], + [ + "w", + "ËĪe" + ], + [ + "ĠËĮa", + "ɪn" + ], + [ + "Ġð", + "ɪs" + ], + [ + "É¡", + "ÉĻ" + ], + [ + "Ġn", + "ËĪaËIJ" + ], + [ + "Ġb", + "ËĪaËIJ" + ], + [ + "Ġa", + "θ" + ], + [ + "Ġm", + "ËĮa" + ], + [ + "ËĪÊĮh", + "a" + ], + [ + "Ġd", + "ËĮa" + ], + [ + "ËĪ", + "Êı" + ], + [ + "Ġɲ", + "ËĮy" + ], + [ + "Ġp", + "ËĪa" + ], + [ + "ËĪað", + "o" + ], + [ + "d", + "i" + ], + [ + "b", + "Éľ" + ], + [ + "É", + "³" + ], + [ + "Ġw", + "iËIJ" + ], + [ + "Ġn", + "ËĪɪ" + ], + [ + "ĠÉ¡", + "ËĪÉĶÉľ" + ], + [ + "tËIJ", + "o" + ], + [ + "ËĮÉĻ", + "m" + ], + [ + "ËĪaËIJ", + "r" + ], + [ + "Ġm", + "ÉĽ" + ], + [ + "ËĪeËIJ", + "É¡aËIJ" + ], + [ + "Ġs", + "ËĮi" + ], + [ + "Ġl", + "ËĮaËIJ" + ], + [ + "n", + "ËĮaËIJ" + ], + [ + "Ġs", + "p" + ], + [ + "t", + "Êģ" + ], + [ + "ĠÊ", + "İ" + ], + [ + "ËĮ", + "ÉijËIJ" + ], + [ + "Ġk", + "l" + ], + [ + "k", + "ʰ" + ], + [ + "i", + "l" + ], + [ + "ĠÊĥ", + "t" + ], + [ + "ĠËĮÊĬ", + "n" + ], + [ + "a", + "l" + ], + [ + "Ġs", + "ËĪÉĽ" + ], + [ + "Ġm", + "ËĪaËIJ" + ], + [ + "Ġ", + "Åĵ" + ], + [ + "ĠÉ¡", + "ËĪÊĮ" + ], + [ + "ĠpËĮÉĽ", + "r" + ], + [ + "ɾ", + "ËĪa" + ], + [ + "ËIJ", + "ÊĪ" + ], + [ + "ËĪaβ", + "a" + ], + [ + "Ġw", + "ËĪÉĴ" + ], + [ + "Ġx", + "ËĪuei" + ], + [ + "Ġkh", + "ËĪo" + ], + [ + "Ġla", + "s" + ], + [ + "ĠÉĹ", + "ËĪo" + ], + [ + "Ġf", + "ÉĽÉ¾" + ], + [ + "Ġj", + "ËĪiÉĽ" + ], + [ + "Ġt", + "ËĪe" + ], + [ + "Ġk", + "ËĮÉĶ" + ], + [ + "ĠdeËIJ", + "n" + ], + [ + "Ġm", + "o" + ], + [ + "Ġp", + "ËĪi" + ], + [ + "Ġt", + "ËĪÉij" + ], + [ + "ËĪÉĽ", + "st" + ], + [ + "w", + "ËĪÉij" + ], + [ + "ËĪaɪ", + "t" + ], + [ + "ÉĻ", + "ÊĬ" + ], + [ + "Ġ", + "ËĪi" + ], + [ + "ɪ", + "j" + ], + [ + "a", + "ɪ" + ], + [ + "ËĪaËIJ", + "Éľ" + ], + [ + "ĠËĪɪ", + "s" + ], + [ + "Ġp", + "ÉĶɾ" + ], + [ + "æ", + "Éľn" + ], + [ + "k", + "a" + ], + [ + "Åĭ", + "É¡" + ], + [ + "b", + "ÉĻn" + ], + [ + "ÊĬ", + "f" + ], + [ + "Ġp", + "ɹ" + ], + [ + "Ġl", + "ËĮe" + ], + [ + "ËĪiËIJ", + "d" + ], + [ + "ËĪaËIJ", + "re" + ], + [ + "Ġm", + "ËĪÊĮ" + 
], + [ + "ÉĻ", + "r" + ], + [ + "Ġd", + "Éij" + ], + [ + "ËĪaËIJt", + "o" + ], + [ + "Ġp", + "ËĪeËIJ" + ], + [ + "Ġd", + "ËĪoËIJ" + ], + [ + "Ġs", + "ËĮÊĬ" + ], + [ + "Ġh", + "ËĪi" + ], + [ + "Ġs", + "ËĪa" + ], + [ + "ËĪeËIJ", + "n" + ], + [ + "d", + "ÉĻ" + ], + [ + "Ġp", + "j" + ], + [ + "ËĪÅĵ", + "Êģ" + ], + [ + "l", + "ɪç" + ], + [ + "ÉĴ", + "n" + ], + [ + "ĠËĪÉĻ", + "r" + ], + [ + "t", + "ËĪe" + ], + [ + "Ġi", + "l" + ], + [ + "ËĪaËIJ", + "l" + ], + [ + "Ġs", + "ËĮÉĻÊĬ" + ], + [ + "s", + "ÊĪ" + ], + [ + "Ġd", + "ËĪuËIJ" + ], + [ + "h", + "ËĪÉij" + ], + [ + "Ġx", + "ËĪou" + ], + [ + "Ġl", + "ËĪaiÉľ" + ], + [ + "w", + "ËĪo" + ], + [ + "ËĪÉĽnt", + "e" + ], + [ + "Ġs", + "y" + ], + [ + "Ġz", + "ɪç" + ], + [ + "ĠÉ¡", + "ËĪu" + ], + [ + "ĠÉķ", + "ËĪy" + ], + [ + "ËĪÉĶËIJ", + "l" + ], + [ + "ÉĶ", + "l" + ], + [ + "Ġt", + "ËĪo" + ], + [ + "ĠÊĭ", + "oËIJ" + ], + [ + "Ġ", + "iËIJ" + ], + [ + "wËĪa", + "ða" + ], + [ + "ËĪa", + "ndo" + ], + [ + "Ġaθ", + "ÉĽnt" + ], + [ + "Ġaθɼnt", + "wËĪaða" + ], + [ + "Ġt", + "ËĪiÉĽ" + ], + [ + "ËĪei", + "Éľ" + ], + [ + "Ġp", + "ËĮa" + ], + [ + "Ġn", + "ËĪaɪ" + ], + [ + "w", + "a" + ], + [ + "Ġf", + "r" + ], + [ + "ĠÊIJ", + "ËĪÉĻÉľn" + ], + [ + "ËĪu", + "a" + ], + [ + "m", + "i" + ], + [ + "Ġm", + "ËĪÉĽ" + ], + [ + "ËĪeËIJk", + "ʰ" + ], + [ + "c", + "ʰ" + ], + [ + "Ġw", + "ËĪÉij" + ], + [ + "st", + "a" + ], + [ + "Ġt", + "u" + ], + [ + "Ġs", + "k" + ], + [ + "ËĪÉĶ", + "l" + ], + [ + "ËĪeËIJ", + "ÊĪ" + ], + [ + "Ġl", + "ËĪaËIJɪ" + ], + [ + "Ġl", + "ËĪaËIJ" + ], + [ + "ËĪÉĽËIJ", + "s" + ], + [ + "ËĪÉĽÉ¾", + "a" + ], + [ + "ËĪÉĻ", + "Éľt" + ], + [ + "Ġ", + "yn" + ], + [ + "d", + "ÉĻn" + ], + [ + "Ġd", + "i" + ], + [ + "ËĪiËIJ", + "s" + ], + [ + "Ġðe", + "l" + ], + [ + "ËĪÊĮ", + "r" + ], + [ + "Ġh", + "ËĪaËIJ" + ], + [ + "Ġb", + "ÉĻ" + ], + [ + "Ġj", + "ËĪuËIJ" + ], + [ + "ll", + "e" + ], + [ + "st", + "o" + ], + [ + "ËĪɪ", + "t" + ], + [ + "ËĪoËIJ", + "ɾ" + ], + [ + "b", + "ʰ" + ], + [ + "m", + "ÉĻn" + ], + [ + "ËĮu", + "ÉĻ" + ], + [ + 
"ËĮÉĻ", + "ɾ" + ], + [ + "ËĪÊĮ", + "n" + ], + [ + "ĠlËĪaɪ", + "k" + ], + [ + "Ġb", + "ËĪa" + ], + [ + "ɪ", + "ð" + ], + [ + "Ġl", + "o" + ], + [ + "z", + "i" + ], + [ + "ËĪÊĮ", + "st" + ], + [ + "m", + "ËĪi" + ], + [ + "ÉĶ", + "Êģ" + ], + [ + "ĠnËĪɪ", + "çt" + ], + [ + "Ġt", + "ɾ" + ], + [ + "Ġd", + "ËĪeËIJkʰ" + ], + [ + "Ġs", + "ËĮe" + ], + [ + "Ġn", + "ËĪÉĻÊĬ" + ], + [ + "Ġ", + "u" + ], + [ + "Ġs", + "i" + ], + [ + "Ġɪ", + "ç" + ], + [ + "Ġp", + "r" + ], + [ + "ĠtÉķ", + "ËĪy" + ], + [ + "Ġm", + "ËĪu" + ], + [ + "z", + "a" + ], + [ + "Ġt", + "Êģ" + ], + [ + "Ġw", + "ɪð" + ], + [ + "t", + "ËĪÉĽ" + ], + [ + "Ġp", + "ËĪÊĮɾ" + ], + [ + "Ġk", + "ËĪÉĶ" + ], + [ + "ËĪoËIJ", + "r" + ], + [ + "Ġh", + "ËĮa" + ], + [ + "Ġk", + "ËĪonÉ¡" + ], + [ + "Ġp", + "uÊģ" + ], + [ + "Ġd", + "y" + ], + [ + "ËĪɪ", + "n" + ], + [ + "nt", + "e" + ], + [ + "Ġk", + "ËĮa" + ], + [ + "ËĪÉĻ", + "ɪ" + ], + [ + "Ġm", + "i" + ], + [ + "ĠÉ¡", + "ËĮuÉĻ" + ], + [ + "ĠÊ", + "²" + ], + [ + "Ġf", + "ËĪÉij" + ], + [ + "Ġv", + "ÉijËIJ" + ], + [ + "ĠËĮa", + "ÊĬ" + ], + [ + "ËĮ", + "uËIJ" + ], + [ + "ĠËĪu", + "n" + ], + [ + "Ġj", + "ËĪÊĮha" + ], + [ + "j", + "uËIJ" + ], + [ + "Ġm", + "ɪt" + ], + [ + "Ġl", + "ËĪÉĽ" + ], + [ + "ËĪeËIJ", + "Êĥ" + ], + [ + "Ġf", + "ÉĶËIJ" + ], + [ + "m", + "ÉĻ" + ], + [ + "ɾ", + "t" + ], + [ + "ĠkËĮo", + "n" + ], + [ + "Ġl", + "ËĪÉĶ" + ], + [ + "Ġx", + "ËĪÉiju" + ], + [ + "p", + "l" + ], + [ + "Ġd", + "ËĪi" + ], + [ + "Ġl", + "ËĪoËIJ" + ], + [ + "s", + "ÉĻ" + ], + [ + "ËĪaËIJ", + "va" + ], + [ + "Ġl", + "ËĪu" + ], + [ + "ĠÉ¡", + "ËĮÉĻÊĬ" + ], + [ + "Ġh", + "av" + ], + [ + "ĠËĮaËIJpk", + "ËĮoËIJ" + ], + [ + "ɾ", + "ËĪi" + ], + [ + "Ġf", + "ËĪÉĻ" + ], + [ + "Ġh", + "ËĮÉĻm" + ], + [ + "ËĪonÉ¡", + "Éľ" + ], + [ + "j", + "o" + ], + [ + "Ġs", + "ÉĶ" + ], + [ + "ËĪaËIJ", + "d" + ], + [ + "w", + "ËĪiÉĻ" + ], + [ + "ËĪa", + "nd" + ], + [ + "ËĮa", + "ɪn" + ], + [ + "t", + "ɾ" + ], + [ + "ĠËĮ", + "ɪ" + ], + [ + "ĠËĪu", + "na" + ], + [ + "Ġx", + "wËĪÉij" + ], + [ + "Ġj", + "ÉĶËIJ" + ], + [ 
+ "Êģ", + "ËĪi" + ], + [ + "ĠkËĪuo", + "Éľ" + ], + [ + "Ġa", + "β" + ], + [ + "ĠÉ¡", + "ËĪaËIJ" + ], + [ + "an", + "o" + ], + [ + "t", + "ÉĻl" + ], + [ + "Ġr", + "ËĮe" + ], + [ + "ËĮÊĮ", + "t" + ], + [ + "ĠjËĪi", + "Éij" + ], + [ + "ĠɾËĮÉĻh", + "aËIJ" + ], + [ + "Ġm", + "ËĪe" + ], + [ + "ĠËĪy", + "Ã¦Éľn" + ], + [ + "Ġf", + "ËĪu" + ], + [ + "Ġb", + "l" + ], + [ + "n", + "ËĪi" + ], + [ + "s", + "ÉĻn" + ], + [ + "Ġa", + "ɪn" + ], + [ + "ËĪi", + "ÊĬ" + ], + [ + "Ġðe", + "ɪ" + ], + [ + "Ġɪ", + "ts" + ], + [ + "Ġ", + "(" + ], + [ + "ËĪy", + "ËIJ" + ], + [ + "ÉĻ", + "d" + ], + [ + "ĠËĮ", + "o" + ], + [ + "ĠÉĽ", + "s" + ], + [ + "Ġv", + "iËIJ" + ], + [ + "ËIJ", + "É¡eËIJ" + ], + [ + "k", + "ËĪe" + ], + [ + "ĠËĪa", + "l" + ], + [ + "ÉĽ", + "l" + ], + [ + "Ġ", + "ÊĮ" + ], + [ + "ËIJ", + "o" + ], + [ + "Ġk", + "ËĪo" + ], + [ + "ĠÊĪ", + "ËĪuËIJ" + ], + [ + "Ġs", + "ËĪɪ" + ], + [ + "ËĪeËIJ", + "ɾ" + ], + [ + "Éľ", + "m" + ], + [ + "ËĮ", + "ÉĻn" + ], + [ + "ËĪaËIJ", + "i" + ], + [ + "ËĪoËIJ", + "l" + ], + [ + "ɪ", + "ËĮeËIJ" + ], + [ + "Ġʲ", + "ËĪy" + ], + [ + "Ġk", + "ËĪÉĶËIJ" + ], + [ + "s", + "ËĪi" + ], + [ + "Ġl", + "ËĪe" + ], + [ + "ËĮ", + "ÉĴt" + ], + [ + "ËĪiËIJ", + "p" + ], + [ + "a", + "Êģ" + ], + [ + "Ġθ", + "ËĪɪÅĭ" + ], + [ + "ËĪÉĻËIJ", + "ɪ" + ], + [ + "ËĪÊĮ", + "l" + ], + [ + "ĠhËĪoËIJ", + "taËIJ" + ], + [ + "ËĪo", + "ɪ" + ], + [ + "nt", + "o" + ], + [ + "z", + "h" + ], + [ + "ĠdeËIJ", + "m" + ], + [ + "ĠkÉĶ", + "m" + ], + [ + "ʰ", + "ËĪiËIJk" + ], + [ + "ĠdÊĴ", + "ËĪÊĮst" + ], + [ + "p", + "ɾ" + ], + [ + "Ġl", + "y" + ], + [ + "h", + "ËĪu" + ], + [ + "ËĪÉĶ", + "ø" + ], + [ + "ËĪaËIJ", + "s" + ], + [ + "ĠËĪa", + "n" + ], + [ + "Ġ", + "ËĪÉĴ" + ], + [ + "Ġk", + "an" + ], + [ + "Ġts", + "ËĪuo" + ], + [ + "ËĪeËIJ", + "va" + ], + [ + "ĠÉ¡", + "ɾ" + ], + [ + "Ġp", + "o" + ], + [ + "ĠtÊĥ", + "ËĪÉĶ" + ], + [ + "Êİ", + "a" + ], + [ + "Ġm", + "ËĮi" + ], + [ + "Êĥ", + "t" + ], + [ + "t", + "ËĪi" + ], + [ + "Ġh", + "ËĪÊĮ" + ], + [ + "tÊĥ", + "e" + ], + [ + "Ġf", + "ÉĶn" + ], + 
[ + "v", + "e" + ], + [ + "Ġn", + "ËĮe" + ], + [ + "ËĪÉĶ", + "Êģ" + ], + [ + "i", + "z" + ], + [ + "Ġs", + "ËĪuo" + ], + [ + "ËĪÉĽËIJ", + "r" + ], + [ + "wËĪa", + "Êģ" + ], + [ + "ËĪað", + "a" + ], + [ + "Åĭ", + "k" + ], + [ + "p", + "o" + ], + [ + "Ġk", + "ËĪi" + ], + [ + "ËĪa", + "d" + ], + [ + "Ġv", + "ËĪi" + ], + [ + "t", + "Éķ" + ], + [ + "Ġk", + "ËĪÉĻ" + ], + [ + "Ġw", + "ËĪu" + ], + [ + "ÉĴ", + "z" + ], + [ + "ĠvÉijËIJ", + "ɾ" + ], + [ + "Êģ", + "ËĪÉĽ" + ], + [ + "Ġk", + "ËĪaËIJ" + ], + [ + "k", + "e" + ], + [ + "n", + "ÉĻ" + ], + [ + "ËĪÊĮ", + "b" + ], + [ + "ËĪuËIJ", + "ɾ" + ], + [ + "ËĮÉĻ", + "ËIJ" + ], + [ + "ĠÊĪ", + "ʰËĪiËIJk" + ], + [ + "Ġk", + "ËĪu" + ], + [ + "Ġb", + "ËĮÊĮt" + ], + [ + "Ġa", + "t" + ], + [ + "Ġf", + "ɹ" + ], + [ + "ËĪa", + "x" + ], + [ + "Ġz", + "oËIJ" + ], + [ + "Ġt", + "ËĪaËIJ" + ], + [ + "Ġð", + "ËĮe" + ], + [ + "n", + "eËIJ" + ], + [ + "ĠÉij", + "ËIJ" + ], + [ + "Ġa", + "ÊĬf" + ], + [ + "a", + "m" + ], + [ + "ÊĬ", + "Åĭ" + ], + [ + "ĠÉĶ", + "ËIJ" + ], + [ + "ĠÉķËĪi", + "ÉľÅĭ" + ], + [ + "Ġ", + "ËĪÉĶËIJl" + ], + [ + "ɪ", + "m" + ], + [ + "j", + "ËĪo" + ], + [ + "ËĪiËIJ", + "ÉŁ" + ], + [ + "Ġkw", + "ËĮÉĽ" + ], + [ + "ĠmËĪa", + "s" + ], + [ + "ÉĻ", + "h" + ], + [ + "ĠËĪa", + "ÊĬ" + ], + [ + "ËĪÉĶ", + "ɪ" + ], + [ + "É¡", + "ÉĻɾ" + ], + [ + "r", + "ÉĻn" + ], + [ + "ËĪɪ", + "k" + ], + [ + "s", + "se" + ], + [ + "Ġp", + "ËĪÉij" + ], + [ + "ĠÉĹ", + "ËĮe" + ], + [ + "ĠÉĹ", + "ËĪi" + ], + [ + "Ġa", + "z" + ], + [ + "ĠÉ¡ËĪÊĮ", + "jaËIJ" + ], + [ + "z", + "e" + ], + [ + "ĠÉĹ", + "ËĮaËIJ" + ], + [ + "Ġf", + "ËĪi" + ], + [ + "ĠËĮ", + "ÉĴn" + ], + [ + "Ġx", + "ËĪo" + ], + [ + "ĠËĮÊĬ", + "na" + ], + [ + "Ġtʰ", + "aËIJ" + ], + [ + "Ġs", + "Éij" + ], + [ + "ËĪeɪ", + "ÊĥÉĻn" + ], + [ + "ĠtÉķËĪi", + "Éľ" + ], + [ + "ĠÉŁ", + "aËIJ" + ], + [ + "p", + "ËIJ" + ], + [ + "Ġpl", + "y" + ], + [ + "θ", + "ËĪi" + ], + [ + "ËIJ", + "Éĸ" + ], + [ + "Ġt", + "ËĪuei" + ], + [ + "Ġl", + "ËĪÉĻ" + ], + [ + "Ġd", + "ÉijËIJ" + ], + [ + "f", + "t" + ], + [ + "ËĪa", + 
"m" + ], + [ + "ĠsËĪÊĮ", + "kt" + ], + [ + "Ġt", + "ËĪou" + ], + [ + "Ġp", + "ËĪiÉĽ" + ], + [ + "ĠËĪa", + "i" + ], + [ + "ĠwËĪÉĴ", + "n" + ], + [ + "Ġz", + "ËĮaɪn" + ], + [ + "Ġe", + "st" + ], + [ + "Ġm", + "ÉĶ" + ], + [ + "ĠtÉķ", + "jËĪÉiju" + ], + [ + "Éľ", + "p" + ], + [ + "ËĪÊĮ", + "z" + ], + [ + "b", + "i" + ], + [ + "ËĪÉĽËIJs", + "eËIJ" + ], + [ + "Ġl", + "ËĪy" + ], + [ + "Ġm", + "ËĮe" + ], + [ + "Ġd", + "ËĮÉĽl" + ], + [ + "ËĪiËIJ", + "l" + ], + [ + "ĠkËĮo", + "mo" + ], + [ + "Ġh", + "ËĪaÉľn" + ], + [ + "ËĪoËIJ", + "ne" + ], + [ + "ĠkËĪÊĮɾ", + "t" + ], + [ + "Ġsy", + "Êģ" + ], + [ + "ËĮÉĶ", + "ɾ" + ], + [ + "Ġɪ", + "f" + ], + [ + "u", + "v" + ], + [ + "z", + "ÉĻn" + ], + [ + "o", + "l" + ], + [ + "Ï", + "ĩ" + ], + [ + "i", + "m" + ], + [ + "Ġm", + "ËĪiÉĽ" + ], + [ + "Ġð", + "ɪ" + ], + [ + "Ġv", + "ËĪÉĽ" + ], + [ + "ÊĬ", + "d" + ], + [ + "Ġt", + "r" + ], + [ + "ËĪeËIJ", + "s" + ], + [ + "ð", + "e" + ], + [ + "d", + "e" + ], + [ + "ʰ", + "Ïĩ" + ], + [ + "ÉŁ", + "ʰ" + ], + [ + "ËĮÉĻËIJ", + "ÉªÉľ" + ], + [ + "b", + "ËIJ" + ], + [ + "ËĪÊĬ", + "k" + ], + [ + "ĠnËĪÉĶ", + "ÉªÉľ" + ], + [ + "ĠËĮ", + "iËIJ" + ], + [ + "ËĪÉijËIJ", + "t" + ], + [ + "ËĪiËIJ", + "ɾ" + ], + [ + "Ġt", + "ɹ" + ], + [ + "ɾ", + "ÉĶ" + ], + [ + "Ġw", + "ÉĴz" + ], + [ + "Ġv", + "u" + ], + [ + "b", + "ÉĻl" + ], + [ + "b", + "ÉĻ" + ], + [ + "ɹ", + "i" + ], + [ + "nt", + "s" + ], + [ + "Ġs", + "ËĪaËIJ" + ], + [ + "d", + "ʰ" + ], + [ + "Ġt", + "ÊĬ" + ], + [ + "ĠÊİ", + "ËĮi" + ], + [ + "β", + "a" + ], + [ + "h", + "ËĪÉĻÉľÅĭ" + ], + [ + "Ġs", + "ËĪiËIJ" + ], + [ + "ĠpËĮa", + "ɾa" + ], + [ + "ËĪÉĽÉ¾", + "ÉĶ" + ], + [ + "ËĪɪ", + "s" + ], + [ + "É£", + "o" + ], + [ + "ĠËĮa", + "l" + ], + [ + "o", + "r" + ], + [ + "Ġb", + "ËĪÊĮh" + ], + [ + "Ġk", + "ËĪoËIJ" + ], + [ + "Ġt", + "ËĪÉĽ" + ], + [ + "Ġp", + "ËĪo" + ], + [ + "ĠÊĴ", + "ÉĻ" + ], + [ + "p", + "Êģ" + ], + [ + "Ġ", + "ËĪaɪ" + ], + [ + "hËĪÉij", + "ÉľÅĭ" + ], + [ + "ÉĻl", + "i" + ], + [ + "ËĪeɪ", + "t" + ], + [ + "ĠjËĪiou", + "Éľ" + ], + [ + "Ġd", + 
"ËĪÉĻ" + ], + [ + "Ġm", + "ËĪÉĶËIJ" + ], + [ + "l", + "ËĪi" + ], + [ + "ËĮy", + "ÉĻ" + ], + [ + "ĠlËĪoËIJ", + "É¡" + ], + [ + "Ġn", + "ËĪÊĮ" + ], + [ + "Ġh", + "ËĪÊĬ" + ], + [ + "Ġn", + "ËĪÉĻÉľÅĭ" + ], + [ + "ĠÊģ", + "ÉĻ" + ], + [ + "z", + "ËĪi" + ], + [ + "Ġt", + "ËĪuËIJ" + ], + [ + "ĠkËĮo", + "me" + ], + [ + "Ġl", + "ËĪeËIJ" + ], + [ + "ËĪaËIJt", + "aËIJ" + ], + [ + "Ġa", + "n" + ], + [ + "ĠËĪy", + "u" + ], + [ + "ĠËĮÊĮ", + "É¡ÉĻɾ" + ], + [ + "ĠËĪɪ", + "n" + ], + [ + "ĠhËĪo", + "ÉĻ" + ], + [ + "v", + "ÉĻ" + ], + [ + "ËĪø", + "ËIJ" + ], + [ + "θj", + "a" + ], + [ + "ËĪuÉĻ", + "Éľn" + ], + [ + "Ġk", + "ÉĻɾ" + ], + [ + "ËĪa", + "t" + ], + [ + "j", + "ËĪø" + ], + [ + "ËĪÉĽt", + "Êģ" + ], + [ + "Ġp", + "ËĪÉiju" + ], + [ + "st", + "ÉĻ" + ], + [ + "Ġw", + "ÉĴt" + ], + [ + "ËĪeËIJ", + "l" + ], + [ + "ÊĪ", + "i" + ], + [ + "Ġx", + "ËĪaiÉľ" + ], + [ + "ËĪy", + "Êģ" + ], + [ + "ĠhËĪoËIJ", + "É¡aËIJ" + ], + [ + "Ġts", + "ËĪi" + ], + [ + "ĠËĪÊĮ", + "p" + ], + [ + "Ġn", + "ËĮÉĴt" + ], + [ + "ĠlËĪɪ", + "eËIJ" + ], + [ + "Ġh", + "ËĪa" + ], + [ + "Ġf", + "l" + ], + [ + "Ġn", + "ËĪeËIJ" + ], + [ + "ËĮaËIJ", + "ɪ" + ], + [ + "Ġt", + "ËĪuo" + ], + [ + "tÊĥ", + "ËIJ" + ], + [ + "s", + "ËĪe" + ], + [ + "bʰ", + "i" + ], + [ + "ĠbËĪÊĮh", + "ÊĬt" + ], + [ + "ËĪÉĽ", + "nd" + ], + [ + "Ġs", + "ËĪÉĶ" + ], + [ + "ÉĻn", + "s" + ], + [ + "ËĮÉĻ", + "l" + ], + [ + "ÉĽ", + "Éľ" + ], + [ + "ĠÉ¡", + "l" + ], + [ + "ËĪɪ", + "ɾ" + ], + [ + "ËĪaËIJt", + "a" + ], + [ + "Éľ", + "ËIJ" + ], + [ + "ËĪÉĽnt", + "o" + ], + [ + "sk", + "ËĮoËIJ" + ], + [ + "ËĪÉĽ", + "k" + ], + [ + "ts", + "i" + ], + [ + "Ġt", + "ËĪonÉ¡" + ], + [ + "Ġb", + "iËIJ" + ], + [ + "Ġh", + "ËĪaËIJɪ" + ], + [ + "Ġb", + "ËĪi" + ], + [ + "j", + "j" + ], + [ + "Êİ", + "i" + ], + [ + "Ġk", + "ʰ" + ], + [ + "Ġs", + "ËĪo" + ], + [ + "ll", + "o" + ], + [ + "Ġb", + "aɪ" + ], + [ + "ĠÉĽ", + "nt" + ], + [ + "Ġ", + "ËĪiËIJ" + ], + [ + "ĠÉ¡", + "ËĪo" + ], + [ + "ɾ", + "eËIJ" + ], + [ + "Ġk", + "Êĭ" + ], + [ + "Ġm", + "ËĪeiÉľ" + ], + [ + "ÊĬ", + 
"ËĪÉĶËIJ" + ], + [ + "Ġt", + "ËĪaɪ" + ], + [ + "Ġsu", + "s" + ], + [ + "Ġr", + "i" + ], + [ + "Ġv", + "ËĮÉĽ" + ], + [ + "ËĪiËIJ", + "no" + ], + [ + "v", + "ano" + ], + [ + "ĠdËĮi", + "ËIJ" + ], + [ + "ĠÊIJ", + "ËĪaÉľn" + ], + [ + "Ê", + "Ĥ" + ], + [ + "ĠÉIJ", + "b" + ], + [ + "ËĪaËIJ", + "h" + ], + [ + "ɪ", + "Êĥ" + ], + [ + "ĠdËĮe", + "lla" + ], + [ + "tËIJ", + "i" + ], + [ + "ĠËĪÊĬ", + "n" + ], + [ + "Ġh", + "iËIJ" + ], + [ + "Ġb", + "ËĪaËIJt" + ], + [ + "Ġth", + "ËĪi" + ], + [ + "Ġa", + "m" + ], + [ + "Ġ", + "ËĪoËIJ" + ], + [ + "Ġh", + "u" + ], + [ + "Ġk", + "ËĪÊĮh" + ], + [ + "Ġz", + "ËĪÉijËIJ" + ], + [ + "ĠÉ¡", + "ËĮÉĶ" + ], + [ + "Ġ", + "ËĪÉĻÊĬ" + ], + [ + "y", + "ËĪi" + ], + [ + "Ġl", + "ËĪÊĮ" + ], + [ + "Ġd", + "ËĪeËIJ" + ], + [ + "Ġs", + "ËĪÉĶËIJ" + ], + [ + "sk", + "ËĮeËIJ" + ], + [ + "ɾ", + "o" + ], + [ + "Êģ", + "ËĪÉij" + ], + [ + "t", + "ËĪa" + ], + [ + "Ġk", + "ËĪÊĬ" + ], + [ + "ËĪant", + "e" + ], + [ + "Ġd", + "ÉĶ" + ], + [ + "Ġs", + "ËĪeɪ" + ], + [ + "Ġs", + "ÉĽt" + ], + [ + "ɹ", + "ɪ" + ], + [ + "ĠÉ¡ËĮÉĻÊĬ", + "ɪÅĭ" + ], + [ + "z", + "o" + ], + [ + "Ġj", + "ËĪaËIJ" + ], + [ + "ĠÉĴv", + "ðÉĻ" + ], + [ + "ĠÊ", + "Ŀ" + ], + [ + "ĠÉĽ", + "l" + ], + [ + "Ġs", + "ËĪoËIJ" + ], + [ + "Ġth", + "ËĪiÉľ" + ], + [ + "Ġ", + "ËĪÉĽl" + ], + [ + "Ġly", + "ËĮi" + ], + [ + "nd", + "ÊĴ" + ], + [ + "ĠÉķ", + "jËĪÉiju" + ], + [ + "θ", + "a" + ], + [ + "ĠɾËĮÉĻh", + "eËIJ" + ], + [ + "Ġma", + "ɪ" + ], + [ + "j", + "ÉĻ" + ], + [ + "ĠËĪÊĮ", + "b" + ], + [ + "as", + "jËĪÉĶ" + ], + [ + "d", + "Êģ" + ], + [ + "Ġkh", + "ËĪa" + ], + [ + "ĠËĪe", + "s" + ], + [ + "v", + "i" + ], + [ + "f", + "i" + ], + [ + "ËĮÉĻ", + "b" + ], + [ + "Ġr", + "e" + ], + [ + "Ġav", + "ËĮÉĽ" + ], + [ + "Ġt", + "ËĮi" + ], + [ + "Ġk", + "ɾ" + ], + [ + "Ġb", + "ɪk" + ], + [ + "st", + "e" + ], + [ + "ËĪeËIJÊĥ", + "c" + ], + [ + "p", + "t" + ], + [ + "z", + "ÉĻ" + ], + [ + "Ġw", + "ËĪaËIJ" + ], + [ + "k", + "l" + ], + [ + "ĠsËĪÊĮ", + "m" + ], + [ + "ɪ", + "ÊĪ" + ], + [ + "d", + "z" + ], + [ + "v", + "o" + ], 
+ [ + "ËĮa", + "ÊĬt" + ], + [ + "nd", + "e" + ], + [ + "Ġd", + "ÉĽs" + ], + [ + "ĠÉŁ", + "ËĪaËIJ" + ], + [ + "Ġr", + "ËĮi" + ], + [ + "s", + "ËĮeËIJ" + ], + [ + "É¡", + "i" + ], + [ + "Ġal", + "s" + ], + [ + "ËĪi", + "ðo" + ], + [ + "ĠnËĪi", + "Éľn" + ], + [ + "ÊĬ", + "l" + ], + [ + "ts", + "ËIJ" + ], + [ + "ËĪant", + "o" + ], + [ + "ĠÉĹ", + "ËĪÉĻÊĬ" + ], + [ + "kËIJ", + "i" + ], + [ + "ĠsËĪÊĮ", + "b" + ], + [ + "Ġn", + "ËĪa" + ], + [ + "Ġl", + "ËĮo" + ], + [ + "Ġph", + "ËĪi" + ], + [ + "m", + "ËĮe" + ], + [ + "Ġf", + "a" + ], + [ + "k", + "ÉĻ" + ], + [ + "Ġz", + "ËĪu" + ], + [ + "n", + "s" + ], + [ + "ĠÊģ", + "e" + ], + [ + "Ġb", + "ËĪo" + ], + [ + "ËĪaËIJt", + "i" + ], + [ + "Ġm", + "an" + ], + [ + "ĠlËĪi", + "Éij" + ], + [ + "ĠÉĹ", + "ËĮyÉĻ" + ], + [ + "Ġf", + "ËĪÉĶËIJ" + ], + [ + "ĠkÊĭ", + "ËĪeËIJÊĥc" + ], + [ + "Ġx", + "ËĪÉij" + ], + [ + "ĠtÉķ", + "ËĪu" + ], + [ + "j", + "ÉĻɾ" + ], + [ + "Ġɪ", + "st" + ], + [ + "w", + "ËĪi" + ], + [ + "ĠËĮaɪn", + "ÉĻ" + ], + [ + "ɪ", + "É¡" + ], + [ + "Ġs", + "ÊĪ" + ], + [ + "ËĪi", + "ÉĻl" + ], + [ + "Ġn", + "ËĪiÉĽÉľn" + ], + [ + "ĠËĮÉĽ", + "ËIJ" + ], + [ + "ËĪaɪ", + "nd" + ], + [ + "Ġz", + "ËĪi" + ], + [ + "v", + "ÉĻn" + ], + [ + "m", + "z" + ], + [ + "ð", + "os" + ], + [ + "dÊĴ", + "ËIJ" + ], + [ + "j", + "ËĪa" + ], + [ + "ɾ", + "ËĪÉĶ" + ], + [ + "l", + "ËĪe" + ], + [ + "Ê", + "²" + ], + [ + "Ġv", + "ËĪÉĶ" + ], + [ + "Ġl", + "ËĪiÉĽ" + ], + [ + "θ", + "e" + ], + [ + "mËĪe", + "nte" + ], + [ + "Ġɪn", + "ðÉĻ" + ], + [ + "Ġaɪ", + "m" + ], + [ + "n", + "ÉĻn" + ], + [ + "Ġh", + "ÉĻm" + ], + [ + "ɾ", + "aËIJ" + ], + [ + "ĠsËĪuo", + "Éľ" + ], + [ + "Ġɲ", + "ËĪi" + ], + [ + "Ġɹ", + "ËĪiÉĻl" + ], + [ + "l", + "ËĪa" + ], + [ + "Ġb", + "ËĪÉĶ" + ], + [ + "Ġk", + "ËĪai" + ], + [ + "Êģ", + "ËĪa" + ], + [ + "Ġw", + "ËĪÉľËIJ" + ], + [ + "Ġa", + "ËIJ" + ], + [ + "Ġp", + "as" + ], + [ + "ËĪÊĮ", + "s" + ], + [ + "w", + "ËĪÉĽÉ¾" + ], + [ + "ĠÉĹ", + "ËĪe" + ], + [ + "ĠhËĮa", + "tÉĻ" + ], + [ + "a", + "ɪn" + ], + [ + "ĠËĪÉĶ", + "pʰ" + ], + [ + 
"Êģ", + "ËĪe" + ], + [ + "ĠÉŁaËIJ", + "ËĪeËIJÉ¡aËIJ" + ], + [ + "ĠËĪÊĬ", + "s" + ], + [ + "ĠtÉķhËĪi", + "Éľ" + ], + [ + "nt", + "Êĥ" + ], + [ + "Ġx", + "ËĪuo" + ], + [ + "ËĪu", + "Êģ" + ], + [ + "Ġɪ", + "m" + ], + [ + "ɳ", + "Éĸ" + ], + [ + "ËĪyÉĻ", + "Éľkh" + ], + [ + "ĠËĪy", + "ÉĽ" + ], + [ + "Ġm", + "ËĮaËIJ" + ], + [ + "Åĵ", + "Êģ" + ], + [ + "ĠËĪa", + "lt" + ], + [ + "Ġk", + "ÉĻm" + ], + [ + "Êİ", + "o" + ], + [ + "ĠÉIJ", + "n" + ], + [ + "Ġf", + "y" + ], + [ + "ĠËĮÉĽ", + "ra" + ], + [ + "ĠÉ¡", + "ËĪÊĬ" + ], + [ + "Ġp", + "ËĪÊĮ" + ], + [ + "l", + "s" + ], + [ + "Ġl", + "ËĪiËIJ" + ], + [ + "ĠÊĤ", + "ËĪy" + ], + [ + "Ġbɪk", + "ËĪÊĮz" + ], + [ + "ĠÉ¡", + "ÉĽt" + ], + [ + "Ġb", + "ɾ" + ], + [ + "t", + "ʰ" + ], + [ + "tÉĻl", + "ËĮÉĻb" + ], + [ + "x", + "o" + ], + [ + "sk", + "ËĮaËIJ" + ], + [ + "ɲ", + "ʲ" + ], + [ + "ËĪeËIJk", + "ÊĪ" + ], + [ + "r", + "ÉĻ" + ], + [ + "tÊĥ", + "o" + ], + [ + "ĠpÊģ", + "ÉĶ" + ], + [ + "Ġɹ", + "ËĪaɪt" + ], + [ + "Ġp", + "ËĪei" + ], + [ + "ËĮ", + "ɪç" + ], + [ + "j", + "ËĪÉĽÉ¾" + ], + [ + "tËIJ", + "a" + ], + [ + "ĠÉIJb", + "ËĮaÊĬt" + ], + [ + "ĠkÊĭËĪeËIJÊĥc", + "ÉĻn" + ], + [ + "Ġv", + "ËĪe" + ], + [ + "ÊĬ", + "Éľ" + ], + [ + "Ġa", + "kËĪe" + ], + [ + "Ġp", + "ËĪai" + ], + [ + "v", + "ËĪÉĽ" + ], + [ + "Ġθ", + "ɹ" + ], + [ + "ɪ", + "f" + ], + [ + "Ġav", + "ËĪÉĽ" + ], + [ + "Ġk", + "ËĪe" + ], + [ + "d", + "ËĪi" + ], + [ + "ËĪeËIJ", + "Éĸ" + ], + [ + "Ġb", + "ÉĻt" + ], + [ + "ÊĪ", + "ʰ" + ], + [ + "t", + "eËIJ" + ], + [ + "θj", + "ËĪÉĶn" + ], + [ + "d", + "Éľ" + ], + [ + "ĠjËĪi", + "Éľ" + ], + [ + "Ġv", + "e" + ], + [ + "É£", + "ËĪu" + ], + [ + "ËĪÊĮh", + "ÉĻl" + ], + [ + "Ġp", + "ÉĶ" + ], + [ + "ĠÉ¡", + "r" + ], + [ + "Ġð", + "a" + ], + [ + "Ġv", + "ËĪiËIJ" + ], + [ + "ĠËĮ", + "ÉijËIJ" + ], + [ + "ËĪÉĻÊĬ", + "nt" + ], + [ + "Ġb", + "ËĪaËIJɾ" + ], + [ + "ĠmËĪÊĮ", + "tÉĻlËĮÉĻb" + ], + [ + "l", + "d" + ], + [ + "ĠtÉķ", + "ËĮÉĶ" + ], + [ + "p", + "a" + ], + [ + "ð", + "ËĪad" + ], + [ + "ËĪi", + "ɾ" + ], + [ + "Ġx", + "ËĪu" + ], + [ + 
"ĠlËĪi", + "ÉľÅĭ" + ], + [ + "ËĪeɪ", + "s" + ], + [ + "ĠÉĹËĮe", + "Éľn" + ], + [ + "Ġth", + "ËĪiÉĽ" + ], + [ + "tËIJ", + "e" + ], + [ + "ĠavËĮÉĽ", + "k" + ], + [ + "ĠËĮ", + "ÉĶ" + ], + [ + "Ġk", + "ËĪÉiju" + ], + [ + "ɪ", + "v" + ], + [ + "iËIJ", + "z" + ], + [ + "ËĪo", + "s" + ], + [ + "ĠÉ¡", + "ɹ" + ], + [ + "a", + "nd" + ], + [ + "ĠlËĪi", + "ou" + ], + [ + "ĠËĪo", + "Éľ" + ], + [ + "É¡", + "l" + ], + [ + "Ġp", + "ËĪÉĶËIJ" + ], + [ + "Ġm", + "ËĮeËIJ" + ], + [ + "Ġk", + "ËĪÉĴ" + ], + [ + "n", + "os" + ], + [ + "ç", + "ÉĻn" + ], + [ + "f", + "ÉĻn" + ], + [ + "ĠsËĪÊĮkt", + "ËĮeËIJ" + ], + [ + "Ġ", + "ËĪaɪn" + ], + [ + "ËĪoËIJ", + "re" + ], + [ + "j", + "ËĪÉĽn" + ], + [ + "Ġð", + "ËĪÉĽn" + ], + [ + "ĠtÉķh", + "ËĪiÉĽÉľn" + ], + [ + "Ġh", + "ËĪaɪ" + ], + [ + "ɾ", + "ËĪÉĽ" + ], + [ + "Ġs", + "ËĪu" + ], + [ + "ĠkËĪɪ", + "jaËIJ" + ], + [ + "Ġpj", + "ËĮÊĬ" + ], + [ + "ĠhÉĻm", + "ËĮaËIJ" + ], + [ + "ĠËĮÊĮ", + "p" + ], + [ + "Ġp", + "ËĪÊĮhÉĻl" + ], + [ + "Ġx", + "ËĪÉĻ" + ], + [ + "d", + "ËĪe" + ], + [ + "Ġm", + "Éij" + ], + [ + "ĠÊĬ", + "m" + ], + [ + "nd", + "ÉĻ" + ], + [ + "Ġd", + "ËĪÉĻÊĬnt" + ], + [ + "ËĪeËIJ", + "ÊĥÉĻn" + ], + [ + "Ġða", + "ts" + ], + [ + "i", + "s" + ], + [ + "Ġc", + "ËĪaËIJh" + ], + [ + "p", + "e" + ], + [ + "Ġs", + "ËĮo" + ], + [ + "Ġð", + "ËĪe" + ], + [ + "Ġs", + "ËĪaËIJt" + ], + [ + "ËĪa", + "Êģ" + ], + [ + "Ġs", + "ËĪe" + ], + [ + "ÉĻ", + "k" + ], + [ + "ɪ", + "Êĭ" + ], + [ + "ĠkËĪoËIJ", + "i" + ], + [ + "k", + "ÉĶ" + ], + [ + "Ġv", + "ËĪaËIJÊĬ" + ], + [ + "Ġf", + "ËĪei" + ], + [ + "Ġl", + "ËĪeËIJk" + ], + [ + "Ġh", + "ËĪiÉĻ" + ], + [ + "Ġa", + "ÊĬ" + ], + [ + "ËĪÉĽ", + "ndo" + ], + [ + "ËĪe", + "s" + ], + [ + "Ġz", + "ËĪÉĶ" + ], + [ + "Ġ", + "ËĪÉĽÉ¾a" + ], + [ + "nËĪi", + "Éľn" + ], + [ + "ĠkËĪÊĮ", + "m" + ], + [ + "Ġl", + "ËĪÉĴ" + ], + [ + "ɪ", + "st" + ], + [ + "Ġp", + "Éij" + ], + [ + "Ġf", + "ËĪÉĶ" + ], + [ + "Ġth", + "ËĪonÉ¡" + ], + [ + "nk", + "e" + ], + [ + "ËĮ", + "ɪk" + ], + [ + "Ġɲ", + "ËĪÉĻ" + ], + [ + "ËĮÊĮ", + "m" + ], + [ + "ËĪiËIJ", 
+ "t" + ], + [ + "ĠwËĪÉĴ", + "nt" + ], + [ + "ËĪaβ", + "an" + ], + [ + "ĠbËĪÊĮ", + "r" + ], + [ + "ÉĽ", + "nd" + ], + [ + "ĠËĮÉijËIJ", + "bÉľ" + ], + [ + "Ġv", + "ËĪaɪ" + ], + [ + "ĠtÊĥ", + "ËĮi" + ], + [ + "ĠθËĪɪÅĭ", + "k" + ], + [ + "st", + "i" + ], + [ + "Ġk", + "ɹ" + ], + [ + "ĠËĪa", + "ÊĬt" + ], + [ + "st", + "ÉĻn" + ], + [ + "ĠÊĭ", + "ËĪÊĮn" + ], + [ + "ĠÉ¡", + "ËĮaËIJ" + ], + [ + "ËĪaËIJÉľ", + "ɲ" + ], + [ + "Êģ", + "i" + ], + [ + "ĠnËĪÉĶ", + "x" + ], + [ + "ĠɹËĪiÉĻl", + "ɪ" + ], + [ + "Ġv", + "ËĮi" + ], + [ + "Ġðe", + "ÉĻ" + ], + [ + "ËĮɪ", + "tÊĥ" + ], + [ + "Ġv", + "ËĪyÉĻ" + ], + [ + "ĠËĮaËIJpk", + "ËĮaËIJ" + ], + [ + "Ġf", + "ËĮaËIJɪ" + ], + [ + "Ġp", + "ËĪÉĶ" + ], + [ + "ĠnËĪÊĮ", + "mb" + ], + [ + "θ", + "es" + ], + [ + "j", + "ËĪÉĽÊģ" + ], + [ + "ĠkËĪÊĬ", + "cʰ" + ], + [ + "m", + "ËĪÉĽ" + ], + [ + "Ġv", + "ËĪu" + ], + [ + "Ġl", + "ÅĵÊģ" + ], + [ + "ĠiËIJ", + "m" + ], + [ + "ÊĪ", + "ÉĻɾ" + ], + [ + "tÊĥ", + "i" + ], + [ + "ËIJ", + "s" + ], + [ + "Ġt", + "ËĪy" + ], + [ + "ĠmËĪi", + "ÉľÅĭ" + ], + [ + "ɾ", + "ËĪe" + ], + [ + "m", + "ËĮa" + ], + [ + "Ġm", + "ËĮiËIJ" + ], + [ + "ĠÉĽ", + "ks" + ], + [ + "ɪ", + "p" + ], + [ + "ĠkËĪÊĮɾ", + "nËĮaËIJ" + ], + [ + "ĠËĮaÊĬ", + "x" + ], + [ + "r", + "ËĪiËIJ" + ], + [ + "Ġc", + "ËĪÊĮl" + ], + [ + "m", + "os" + ], + [ + "ĠkËĪÊĮɾt", + "ËĮeËIJ" + ], + [ + "iËIJ", + "ɾ" + ], + [ + "k", + "ÉĻn" + ], + [ + "Ġd", + "ËĪu" + ], + [ + "n", + "aËIJ" + ], + [ + "Ġp", + "wËĪe" + ], + [ + "ËĮÉĶ", + "ɪ" + ], + [ + "ĠtÉķh", + "ËĪiÉĽ" + ], + [ + "Ġβ", + "ËĪi" + ], + [ + "ËĪiÉĽ", + "Éľt" + ], + [ + "Ġt", + "e" + ], + [ + "ËĪað", + "os" + ], + [ + "m", + "ËĪa" + ], + [ + "Ġv", + "ËĪo" + ], + [ + "Ġm", + "ËĪɪ" + ], + [ + "Ġb", + "ËĮi" + ], + [ + "a", + "d" + ], + [ + "d", + "o" + ], + [ + "Ġn", + "ËĪaÊĬ" + ], + [ + "ĠʲËĪy", + "Éľ" + ], + [ + "w", + "ËĪÉĽ" + ], + [ + "ËĪi", + "s" + ], + [ + "e", + "l" + ], + [ + "Ġpa", + "r" + ], + [ + "Ġt", + "ËĪai" + ], + [ + "ĠdËĪɪ", + "jaËIJ" + ], + [ + "h", + "ËĪi" + ], + [ + "Ġɾ", + "ËĪÊĮ" + ], + [ 
+ "Ġd", + "ËĪe" + ], + [ + "ËĪaɪ", + "d" + ], + [ + "Ġp", + "er" + ], + [ + "Ġs", + "ËĮÉĶ" + ], + [ + "w", + "e" + ], + [ + "ÊĬ", + "m" + ], + [ + "Ġi", + "n" + ], + [ + "ĠjËĪuËIJ", + "z" + ], + [ + "ËĪiËIJp", + "ÉĻl" + ], + [ + "ĠÊĭ", + "ËĪaËIJl" + ], + [ + "Ġe", + "tËĪÉĽ" + ], + [ + "ËĮÉĽ", + "m" + ], + [ + "Ġn", + "ËĪu" + ], + [ + "ËĪÉĽ", + "kt" + ], + [ + "ĠiËIJ", + "ɾ" + ], + [ + "Ġb", + "ɹ" + ], + [ + "Ġtsh", + "ËĪi" + ], + [ + "ĠÉĹ", + "ËĪÉĶÉľ" + ], + [ + "Ġkw", + "ËĮa" + ], + [ + "Ġf", + "ËĪuÉľ" + ], + [ + "w", + "ËĮa" + ], + [ + "Ġd", + "ËĪiËIJ" + ], + [ + "ĠÉ¡", + "ËĪyÉĻ" + ], + [ + "ËĮÉĽ", + "ËIJ" + ], + [ + "r", + "ËĪa" + ], + [ + "Ġn", + "e" + ], + [ + "Ġz", + "ËĪyÉĻ" + ], + [ + "Ġb", + "ËĪaɪ" + ], + [ + "ĠÉŁ", + "ËĪÊĮb" + ], + [ + "ËĪuËIJ", + "to" + ], + [ + "ÊĬ", + "nt" + ], + [ + "Ġc", + "ʰ" + ], + [ + "ËĪÉĽnt", + "i" + ], + [ + "ËĪo", + "ÉĻ" + ], + [ + "Ġs", + "ËĮÊĮm" + ], + [ + "Ġl", + "Éij" + ], + [ + "ËĮe", + "va" + ], + [ + "ɾ", + "ÉĽ" + ], + [ + "nt", + "Éľ" + ], + [ + "Ġm", + "ËĪÉĽn" + ], + [ + "ËĪÉijËIJ", + "k" + ], + [ + "Ġki", + "l" + ], + [ + "ËĪon", + "es" + ], + [ + "f", + "f" + ], + [ + "Ġm", + "ËĪÉĽËIJ" + ], + [ + "Ġv", + "ËĪÉĻɪ" + ], + [ + "Ġ", + "ËĪÉĶËIJ" + ], + [ + "ĠËĮɪ", + "nt" + ], + [ + "ÊĬ", + "n" + ], + [ + "Ġw", + "ɪl" + ], + [ + "Ġs", + "in" + ], + [ + "ĠËĮa", + "lla" + ], + [ + "Ġaβ", + "ËĪia" + ], + [ + "p", + "i" + ], + [ + "ËĪo", + "Éľ" + ], + [ + "ɪj", + "ËĮaËIJ" + ], + [ + "k", + "u" + ], + [ + "Ġv", + "ËĪɪ" + ], + [ + "Ġtu", + "t" + ], + [ + "ĠtËĪe", + "Éľ" + ], + [ + "Ġh", + "ËĪÉĶ" + ], + [ + "β", + "ɾe" + ], + [ + "s", + "ÉĻɾ" + ], + [ + "Ġkh", + "ËĪai" + ], + [ + "Ġm", + "ËĪÉĶ" + ], + [ + "Ġt", + "a" + ], + [ + "Ġɲ", + "ËĪaËIJ" + ], + [ + "Ġn", + "u" + ], + [ + "ËĪuËIJ", + "n" + ], + [ + "ĠÉĻËIJ", + "Éľ" + ], + [ + "ĠËĪa", + "ÊĬf" + ], + [ + "ËĪiËIJd", + "Éľ" + ], + [ + "nt", + "i" + ], + [ + "Ġp", + "ËĪiËIJpÉĻl" + ], + [ + "Ġk", + "j" + ], + [ + "Ġp", + "e" + ], + [ + "Ġm", + "ËĪÉij" + ], + [ + "ËĮa", + "ɪ" + ], 
+ [ + "ËĪaËIJ", + "le" + ], + [ + "Ġv", + "ËĮÉĻËIJÉªÉľ" + ], + [ + "mp", + "o" + ], + [ + "ĠkËĪɪ", + "t" + ], + [ + "Ġn", + "ËĮÉĽ" + ], + [ + "ĠÉŁ", + "ËĪaËIJtaËIJ" + ], + [ + "ĠsËĪaËIJt", + "ʰ" + ], + [ + "ĠÉŁ", + "ËĪi" + ], + [ + "Ġs", + "o" + ], + [ + "Ġb", + "ËĪÉĽ" + ], + [ + "k", + "ËĪi" + ], + [ + "ɪt", + "i" + ], + [ + "Ġts", + "i" + ], + [ + "Ġk", + "Êģ" + ], + [ + "ËĮ", + "ÉĴ" + ], + [ + "É¡", + "ÉĻl" + ], + [ + "k", + "st" + ], + [ + "Ġm", + "ËĪÉĻËIJ" + ], + [ + "ËĪÊĮ", + "k" + ], + [ + "Ġn", + "ËĪaËIJÊĬ" + ], + [ + "Ġa", + "p" + ], + [ + "ĠlËĪɪ", + "kʰ" + ], + [ + "ll", + "i" + ], + [ + "ĠkwËĪa", + "l" + ], + [ + "Ġ", + "ËĪÉĻËIJ" + ], + [ + "Ġts", + "ËĪuei" + ], + [ + "Ġd", + "o" + ], + [ + "ĠkËIJ", + "jËĪo" + ], + [ + "ÊĬ", + "z" + ], + [ + "Ġp", + "ËĪaËIJ" + ], + [ + "Ġm", + "ËĪuËIJ" + ], + [ + "ĠÉ¡ÉĻ", + "v" + ], + [ + "r", + "ËĪi" + ], + [ + "Ġt", + "w" + ], + [ + "ËĮ", + "ɪn" + ], + [ + "d", + "ËĪÉij" + ], + [ + "Ġð", + "ËĪi" + ], + [ + "ĠËĪaËIJ", + "i" + ], + [ + "Ġh", + "ËĪiÉĽ" + ], + [ + "Ġð", + "ËĮÉĽm" + ], + [ + "Ġpʰ", + "ËĪɪɾ" + ], + [ + "ÉĴ", + "m" + ], + [ + "ĠËĮ", + "eËIJ" + ], + [ + "Ġth", + "ËĪaiÉľ" + ], + [ + "Ġv", + "ËĪas" + ], + [ + "Ġn", + "ÉijËIJ" + ], + [ + "p", + "ÉĻn" + ], + [ + "Ġp", + "ËĮÉĻɾ" + ], + [ + "ĠÉĹ", + "ËĪaËIJɪ" + ], + [ + "ËĪou", + "Éľ" + ], + [ + "ĠÊIJ", + "ËĪuÉľ" + ], + [ + "ĠmËĪa", + "n" + ], + [ + "ĠtËĪÉĻ", + "ÉªÉľ" + ], + [ + "Ġl", + "ËĪaËIJÊĬ" + ], + [ + "m", + "ËĪÉĽnte" + ], + [ + "ĠfËĪa", + "m" + ], + [ + "s", + "jËĪÉĶ" + ], + [ + "Ġp", + "ËĪÉĻ" + ], + [ + "ËĪeËIJ", + "m" + ], + [ + "Ġp", + "ËĪÊĮr" + ], + [ + "j", + "ËĪi" + ], + [ + "Ġl", + "ÉĽ" + ], + [ + "Ġt", + "en" + ], + [ + "ËĪoËIJ", + "ra" + ], + [ + "k", + "i" + ], + [ + "ĠÊĤ", + "ËĪaËIJÊĬ" + ], + [ + "k", + "ɪ" + ], + [ + "bËIJ", + "e" + ], + [ + "ËĪa", + "lt" + ], + [ + "ð", + "ɪ" + ], + [ + "p", + "ËĪi" + ], + [ + "ĠËĮÉĽ", + "nt" + ], + [ + "Ġm", + "ËĪei" + ], + [ + "Ġh", + "ËĪÉĻÊĬ" + ], + [ + "Ġh", + "ËĪÉĽÉ¾" + ], + [ + "j", + "ËĪÉij" + ], + [ + 
"ĠhËĪÊĬ", + "aËIJ" + ], + [ + "m", + "Éľ" + ], + [ + "Ġd", + "ʰ" + ], + [ + "ĠtÊĥ", + "ËĪe" + ], + [ + "l", + "ËĪÉĽ" + ], + [ + "ËĪaËIJt", + "e" + ], + [ + "Ġp", + "ËĪuËIJ" + ], + [ + "Ġm", + "ËĪÊĬ" + ], + [ + "ËĪaËIJɪ", + "ÊĪ" + ], + [ + "d", + "iËIJ" + ], + [ + "Ġfɹ", + "ÉĴm" + ], + [ + "Ġh", + "ËĪÉijËIJ" + ], + [ + "β", + "o" + ], + [ + "ĠmËĪi", + "Éľn" + ], + [ + "Ġð", + "iËIJz" + ], + [ + "Ġk", + "ËĪou" + ], + [ + "ËĪiËIJ", + "na" + ], + [ + "Ġav", + "ËĮeva" + ], + [ + "Ġ", + "ËĪaËIJɾ" + ], + [ + "Ġn", + "ËĪuËIJɾ" + ], + [ + "Ġβ", + "ËĪe" + ], + [ + "Ġz", + "aɪn" + ], + [ + "ËĪÉĽ", + "d" + ], + [ + "É", + "Ĺ" + ], + [ + "ËĪeɪ", + "k" + ], + [ + "s", + "ËĮÉĻÊĬ" + ], + [ + "ËĪeËIJ", + "ÉŁ" + ], + [ + "ĠÊĤ", + "ËĪÉĻËIJ" + ], + [ + "j", + "e" + ], + [ + "cʰ", + "ËIJ" + ], + [ + "ËĪÉĶ", + "r" + ], + [ + "ÉĽ", + "ËIJ" + ], + [ + "ĠtÉķhËĪy", + "Ã¦Éľn" + ], + [ + "ĠËĮaɪn", + "ÉĻn" + ], + [ + "ĠiËIJ", + "n" + ], + [ + "ĠbËĪÊĮ", + "c" + ], + [ + "ËĪiËIJ", + "m" + ], + [ + "ɾ", + "as" + ], + [ + "ËĮÉĻ", + "s" + ], + [ + "Ġv", + "ËĪeËIJ" + ], + [ + "ĠËĪÉĻr", + "Éľ" + ], + [ + "Ġd", + "uËIJ" + ], + [ + "nt", + "ÉĻ" + ], + [ + "Ġpɹ", + "ËĪÉĴ" + ], + [ + "Ġb", + "ËĪɪ" + ], + [ + "ĠwËĪo", + "Éľ" + ], + [ + "n", + "ËĮi" + ], + [ + "Ġh", + "ÉIJ" + ], + [ + "Ġk", + "ËĪÉĽ" + ], + [ + "Ġe", + "t" + ], + [ + "jËĪÉĽ", + "ndo" + ], + [ + "ĠËĪai", + "Éľ" + ], + [ + "Ġl", + "i" + ], + [ + "ĠËĪaÊĬ", + "s" + ], + [ + "kËIJ", + "o" + ], + [ + "ĠÉĹ", + "ËĪyÉĻ" + ], + [ + "k", + "eËIJ" + ], + [ + "Ġf", + "ËĪiËIJl" + ], + [ + "Ġbʰ", + "ËĪaËIJi" + ], + [ + "ĠÉ¡ÉĻ", + "Êĥ" + ], + [ + "ÊĴ", + "ËĪe" + ], + [ + "Ġn", + "jËĪuËIJ" + ], + [ + "ĠËĪa", + "k" + ], + [ + "ĠÉĹ", + "ËĪaËIJ" + ], + [ + "z", + "ËĪa" + ], + [ + "v", + "ËĪe" + ], + [ + "ĠhËĮa", + "ÊĬ" + ], + [ + "ÉIJ", + "ç" + ], + [ + "ĠɾËĪÊĮ", + "kʰ" + ], + [ + "p", + "ËĪe" + ], + [ + "ĠtÉĻ", + "bi" + ], + [ + "ĠpËĪÊĮhÉĻl", + "ËĮeËIJ" + ], + [ + "Ġf", + "ËĪÉĽ" + ], + [ + "Ġw", + "ËĮɪtÊĥ" + ], + [ + "ĠtÉķËĪy", + "ÉĽÉľ" + ], + [ + "w", + 
"ËĮe" + ], + [ + "ËĮa", + "ɪt" + ], + [ + "ĠnÉijËIJ", + "x" + ], + [ + "ĠkËĪÉĶËIJ", + "n" + ], + [ + "ÊĬ", + "k" + ], + [ + "ĠbËĪaËIJ", + "d" + ], + [ + "Åĭ", + "ÉĻn" + ], + [ + "Ġn", + "i" + ], + [ + "Ġb", + "ËĪe" + ], + [ + "Ġm", + "ËĮÊĬ" + ], + [ + "ËĪa", + "r" + ], + [ + "ĠmËĮe", + "ɪk" + ], + [ + "Ġs", + "ËĪaËIJɾ" + ], + [ + "β", + "e" + ], + [ + "ĠtÉķhËĪi", + "ÉľÅĭ" + ], + [ + "it", + "ËĪe" + ], + [ + "k", + "ËĮe" + ], + [ + "ËĪÉĽËIJ", + "l" + ], + [ + "ËĮ", + "ÉĴn" + ], + [ + "ËĮ", + "Éij" + ], + [ + "Ġb", + "ËĪɪl" + ], + [ + "Ġw", + "ÊĬd" + ], + [ + "Ġb", + "ËĪoËIJl" + ], + [ + "r", + "d" + ], + [ + "i", + "ÉĻ" + ], + [ + "Ġd", + "a" + ], + [ + "Ġb", + "ËĪaËIJÊĬ" + ], + [ + "ĠnËĪÊĮmb", + "ÉĻɾ" + ], + [ + "ËĪaËIJɪ", + "Éľ" + ], + [ + "ĠÉĽ", + "m" + ], + [ + "Ġm", + "iËIJɾ" + ], + [ + "ËĪeɪ", + "m" + ], + [ + "l", + "os" + ], + [ + "ËĮÉĽ", + "t" + ], + [ + "ĠËĮaÊĬ", + "s" + ], + [ + "ĠmËĪa", + "Éľt" + ], + [ + "Ġw", + "ËĪuÉĻ" + ], + [ + "Ġw", + "ËĪeɪ" + ], + [ + "Ġse", + "ɲ" + ], + [ + "Ġb", + "jËĪÉĽ" + ], + [ + "Ġw", + "ÉĽn" + ], + [ + "f", + "l" + ], + [ + "Ġkh", + "wËĪa" + ], + [ + "d", + "ËĪÉĽ" + ], + [ + "v", + "ɹɪ" + ], + [ + "ĠËĪa", + "ɾ" + ], + [ + "jËĪÉiju", + "Éľ" + ], + [ + "ĠËĮaËIJpk", + "ËĮeËIJ" + ], + [ + "b", + "Êģ" + ], + [ + "ĠtËĪaɪ", + "m" + ], + [ + "Ġ", + "ËĪÉij" + ], + [ + "Ġs", + "ËĮa" + ], + [ + "Ġz", + "ËĪoɪ" + ], + [ + "ËĪÉĶɾ", + "a" + ], + [ + "Ġd", + "ËĪø" + ], + [ + "ËĪÉĶɾ", + "t" + ], + [ + "ĠÅĭ", + "ËĪÉĶ" + ], + [ + "m", + "in" + ], + [ + "Ġl", + "ËĪÊĬk" + ], + [ + "ËĪÉĶËIJ", + "t" + ], + [ + "ĠËĪÉĶ", + "tɾ" + ], + [ + "Ġf", + "ËĪaɪ" + ], + [ + "ĠÉ¡", + "ÉĴt" + ], + [ + "ËĪeËIJ", + "ÉĻn" + ], + [ + "k", + "ËĪÉĶ" + ], + [ + "ĠvËĪÉĽ", + "ɹi" + ], + [ + "m", + "ÉĽ" + ], + [ + "ËĪaɪ", + "z" + ], + [ + "Ġe", + "sp" + ], + [ + "ɲ", + "a" + ], + [ + "Ġl", + "ËĪo" + ], + [ + "ËĪÉĽËIJ", + "ra" + ], + [ + "β", + "ËĪi" + ], + [ + "ou", + "Éľ" + ], + [ + "ËĮÉĻ", + "k" + ], + [ + "tÊĥ", + "uËIJ" + ], + [ + "Ġn", + "ËĪyÉĻ" + ], + [ + "ÊĪ", + 
"ɾ" + ], + [ + "ĠÉ¡", + "ËĪy" + ], + [ + "ĠtËĪo", + "ðo" + ], + [ + "ËĪɪ", + "çt" + ], + [ + "Ġm", + "ɪç" + ], + [ + "ĠËĪa", + "nd" + ], + [ + "Ġkw", + "ËĮÉĽl" + ], + [ + "ĠÊĤ", + "ËĪaËIJ" + ], + [ + "ĠnËĪi", + "Éľ" + ], + [ + "ËĪÉĶ", + "p" + ], + [ + "ËĪiËIJ", + "z" + ], + [ + "ĠÊĤ", + "ËĪaÊĬ" + ], + [ + "ĠɾËĮÉĻh", + "i" + ], + [ + "ĠsËĮÊĬ", + "o" + ], + [ + "ĠÉĽ", + "É¡" + ], + [ + "Ġd", + "Åĵ" + ], + [ + "ĠÉ¡ËĮaËIJ", + "ÉªÉľ" + ], + [ + "d", + "ɪ" + ], + [ + "l", + "ËĮa" + ], + [ + "st", + "ËĪi" + ], + [ + "ĠdËĮiËIJ", + "z" + ], + [ + "Ġt", + "ËĮÊĬ" + ], + [ + "θ", + "i" + ], + [ + "ĠËĪɪ", + "skËĮoËIJ" + ], + [ + "nd", + "ÉĻn" + ], + [ + "Ġts", + "v" + ], + [ + "Ġh", + "ËĪÉĻËIJ" + ], + [ + "ĠÊĥ", + "ËĪÊĬ" + ], + [ + "ÉĻt", + "ËĮeËIJ" + ], + [ + "p", + "ËĮÉĽ" + ], + [ + "ËĪaɾ", + "ÉĶn" + ], + [ + "Ġp", + "ÉĽÊģ" + ], + [ + "Ġ", + "y" + ], + [ + "m", + "nËĮeËIJ" + ], + [ + "ËĪÉĽ", + "llo" + ], + [ + "ĠÉ¡", + "ËĪÉĻ" + ], + [ + "ĠËĮa", + "d" + ], + [ + "ĠÊĥ", + "v" + ], + [ + "ËĪÊı", + "ɾ" + ], + [ + "r", + "ËĪe" + ], + [ + "y", + "ËIJ" + ], + [ + "Ġp", + "ËĪaËIJs" + ], + [ + "Ġ", + "ËĪÉĽn" + ], + [ + "ɪ", + "dÊĴ" + ], + [ + "ËĪua", + "i" + ], + [ + "Ġf", + "i" + ], + [ + "Ġt", + "ËĪyÉĻ" + ], + [ + "ËĪaËIJ", + "ÉŁ" + ], + [ + "Ġt", + "jËĪe" + ], + [ + "ËĪaËIJn", + "aËIJ" + ], + [ + "st", + "ɾ" + ], + [ + "Êİ", + "e" + ], + [ + "ËĮe", + "ɪt" + ], + [ + "b", + "a" + ], + [ + "ð", + "as" + ], + [ + "v", + "Êģ" + ], + [ + "Ġz", + "ËĪÉĻËIJ" + ], + [ + "ËĪaËIJ", + "li" + ], + [ + "ÉŁÊ°", + "eËIJ" + ], + [ + "ËĪaËIJt", + "eËIJ" + ], + [ + "Ġv", + "ËĪa" + ], + [ + "Ġsa", + "l" + ], + [ + "ËĪaËIJ", + "no" + ], + [ + "ĠÉ¡ÉĻ", + "z" + ], + [ + "ĠhËĪoËIJ", + "ti" + ], + [ + "Ġɲ", + "ËĪiÉĽ" + ], + [ + "t", + "Éľ" + ], + [ + "ĠËĪaËIJ", + "p" + ], + [ + "Ġw", + "ËĪÉĽl" + ], + [ + "Ġm", + "ËĪɪl" + ], + [ + "Ġfy", + "ËIJɾ" + ], + [ + "ËĪÉĽËIJs", + "aËIJ" + ], + [ + "Ġb", + "ËĮiËIJ" + ], + [ + "ËĪaËIJ", + "jaËIJ" + ], + [ + "ËĪɪ", + "p" + ], + [ + "Ġf", + "Êģ" + ], + [ + "tsi", + 
"ËĪoËIJne" + ], + [ + "Ġw", + "ËĪuÉľ" + ], + [ + "Ġv", + "i" + ], + [ + "ĠwËĪÉij", + "Éľn" + ], + [ + "ËĪoËIJ", + "n" + ], + [ + "ĠÉĹ", + "ËĪÉĻɪ" + ], + [ + "ĠÊĿ", + "ËĪo" + ], + [ + "Ġr", + "a" + ], + [ + "m", + "ÉĻnt" + ], + [ + "ËĪaÊĬ", + "nd" + ], + [ + "Ġp", + "ÉĽÉ¾" + ], + [ + "ĠÉĹ", + "ËĪaËIJÊĬ" + ], + [ + "oËIJ", + "ɾ" + ], + [ + "h", + "ËĪo" + ], + [ + "ĠÉĴ", + "n" + ], + [ + "ĠÊİ", + "e" + ], + [ + "ĠsËĪɪ", + "ks" + ], + [ + "É¡", + "n" + ], + [ + "ĠÉ¡", + "ËĪa" + ], + [ + "Ġ", + "θj" + ], + [ + "Ġp", + "ËĪe" + ], + [ + "sp", + "e" + ], + [ + "Ġv", + "ËĪÉĻ" + ], + [ + "Ġf", + "ËĪɪ" + ], + [ + "ĠËĮɪnt", + "ÊĬ" + ], + [ + "l", + "ÉĻn" + ], + [ + "Ġn", + "ËĪiËIJd" + ], + [ + "ĠsËĮÊĬ", + "a" + ], + [ + "ĠËĪu", + "m" + ], + [ + "Ġd", + "ËĪeɪ" + ], + [ + "ĠËĪÊĮ", + "bʰi" + ], + [ + "ËĪÉijËIJ", + "ɾ" + ], + [ + "Ġb", + "ËĪiÉĽÉľt" + ], + [ + "Êİ", + "os" + ], + [ + "Ġtsh", + "ËĪaiÉľ" + ], + [ + "ĠËĮɪ", + "skËĮaËIJ" + ], + [ + "ĠaÊĬ", + "ÉĻ" + ], + [ + "ĠËĪy", + "æ" + ], + [ + "Ġd", + "yn" + ], + [ + "Ġm", + "ËĪiËIJn" + ], + [ + "ĠËĪÊĮ", + "cʰËIJ" + ], + [ + "Ġs", + "ÉĽ" + ], + [ + "Ġn", + "ËĪy" + ], + [ + "Ġn", + "ËĮÉĽl" + ], + [ + "É¡", + "ɾ" + ], + [ + "Êĥ", + "ËĪe" + ], + [ + "ĠÊĤ", + "ËĮÉĽ" + ], + [ + "ĠËĪÉĽ", + "vɹɪ" + ], + [ + "ËĪÉĽl", + "p" + ], + [ + "ĠbËĪa", + "k" + ], + [ + "Ġ", + "eËIJ" + ], + [ + "Ġf", + "ËĪaËIJ" + ], + [ + "Ġk", + "ÉĽl" + ], + [ + "ĠËĪeËIJ", + "s" + ], + [ + "j", + "ËĪaËIJd" + ], + [ + "Ġl", + "ËĮi" + ], + [ + "mb", + "ɾe" + ], + [ + "k", + "tÉĻ" + ], + [ + "nt", + "a" + ], + [ + "t", + "ËĪu" + ], + [ + "Ġð", + "ËĪat" + ], + [ + "ĠËĪa", + "β" + ], + [ + "ÉĻɹ", + "i" + ], + [ + "ĠkwËĮÉĽ", + "lla" + ], + [ + "Ġb", + "ÉĻn" + ], + [ + "r", + "ËĮÉĽ" + ], + [ + "Ġn", + "ÉĶ" + ], + [ + "ĠÉ¡", + "ËĪɪ" + ], + [ + "ĠËĪa", + "p" + ], + [ + "ɹ", + "ÉĻ" + ], + [ + "ËĪa", + "Éľkh" + ], + [ + "ĠÊIJ", + "ËĪi" + ], + [ + "Ġ", + "ËĪÉijËIJ" + ], + [ + "ɪ", + "É¡ÉĻn" + ], + [ + "Ġw", + "ËĪai" + ], + [ + "Ġp", + "ÉĻt" + ], + [ + "kËIJ", + "a" + ], + [ + 
"Ġb", + "ËĪÉĽËIJ" + ], + [ + "ËĪeËIJ", + "Êĭ" + ], + [ + "ls", + "ÉĻÊĬ" + ], + [ + "ĠcËĪaËIJh", + "ɪËĮeËIJ" + ], + [ + "Ġk", + "ÉĻn" + ], + [ + "ĠËĮaɪn", + "ÉĻm" + ], + [ + "ËĪuËIJ", + "t" + ], + [ + "Ġh", + "ËĪaÊĬ" + ], + [ + "Ġt", + "ËĪanto" + ], + [ + "ĠhÉIJ", + "z" + ], + [ + "Ġs", + "ËĪÊĮɾ" + ], + [ + "Ġn", + "o" + ], + [ + "Ġt", + "ËĪÉĶËIJ" + ], + [ + "Ġz", + "ËĪaɪ" + ], + [ + "ĠtÉķËĪiÉĽ", + "Éľ" + ], + [ + "Ġko", + "zËĪi" + ], + [ + "Ġk", + "ËĪei" + ], + [ + "ð", + "ËĪÉĶɾ" + ], + [ + "ËĮÉĶ", + "Êģ" + ], + [ + "Ġt", + "ËĪÊĮɾ" + ], + [ + "ĠÊIJ", + "ËĪÉĻ" + ], + [ + "ĠÉķËĪy", + "ÉĽÉľ" + ], + [ + "ĠmËĮÊĬ", + "ÉŁÊ°eËIJ" + ], + [ + "m", + "f" + ], + [ + "Ġv", + "ËĪiËIJdÉľ" + ], + [ + "k", + "ËĪa" + ], + [ + "ĠÉIJ", + "É¡" + ], + [ + "k", + "w" + ], + [ + "ĠÊģ", + "ÉĽ" + ], + [ + "x", + "ÉĻn" + ], + [ + "Ġd", + "ÊĬ" + ], + [ + "ĠkËĪÊĮɾ", + "nËĮeËIJ" + ], + [ + "jËĪaËIJd", + "aËIJ" + ], + [ + "Ġf", + "ÉĻ" + ], + [ + "ĠËĮi", + "mp" + ], + [ + "Ġh", + "ɪz" + ], + [ + "Ġ", + "ʰÏĩ" + ], + [ + "ËĪoËIJ", + "ni" + ], + [ + "Ġx", + "ËĪiÉľ" + ], + [ + "ËĪeËIJ", + "sÊĪ" + ], + [ + "Êı", + "bÉľ" + ], + [ + "ËĮÉĶɾ", + "ke" + ], + [ + "ĠÉ¡", + "ËĪÉĻÊĬ" + ], + [ + "ËĪɪ", + "ÊĥÉĻn" + ], + [ + "l", + "es" + ], + [ + "Ġf", + "ËĪiËIJ" + ], + [ + "É¡", + "tÉĻ" + ], + [ + "ËĪeËIJ", + "re" + ], + [ + "Ġv", + "ËĮaËIJ" + ], + [ + "Ġ", + "ËĪeɪ" + ], + [ + "Ġm", + "ËĪuÉĻÉľn" + ], + [ + "ĠÉ¡ËĪÊĬ", + "d" + ], + [ + "ĠmËĮa", + "ɪn" + ], + [ + "z", + "ËĪe" + ], + [ + "ĠlËĪi", + "Éľ" + ], + [ + "Ġm", + "u" + ], + [ + "Ġk", + "ËĮÉĽl" + ], + [ + "Ġj", + "ËĮÉĻh" + ], + [ + "Ġf", + "ËĮÉĶɾ" + ], + [ + "f", + "ɹ" + ], + [ + "Ġk", + "ËĪaɪn" + ], + [ + "ĠËĪÉĴ", + "lsÉĻÊĬ" + ], + [ + "θ", + "ɪÅĭ" + ], + [ + "Ġth", + "ËĪonÉ¡Éľ" + ], + [ + "t", + "ËĪÉij" + ], + [ + "θj", + "o" + ], + [ + "m", + "ËĪÉĶ" + ], + [ + "Ġ", + "os" + ], + [ + "Ġs", + "ÊĬ" + ], + [ + "ĠsËĪÊĮ", + "mÉĻ" + ], + [ + "ĠvËĮÉĽ", + "n" + ], + [ + "n", + "ËĪo" + ], + [ + "ĠËĪak", + "tÊĥuËIJ" + ], + [ + "É£", + "a" + ], + [ + "Ġtʰ", + "i" + 
], + [ + "Ġf", + "ËĮi" + ], + [ + "Ġv", + "ËĪÉĽl" + ], + [ + "ĠtËĪu", + "tËIJi" + ], + [ + "x", + "os" + ] + ] + } +} \ No newline at end of file From 4e09d1c69fe7e42d1387ba0d11aa8e82cccd4ff3 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Fri, 6 Feb 2026 22:17:45 -0800 Subject: [PATCH 38/94] Magpietts decoderonly 2601 valinfer (#61) * add inference loggin in val step Signed-off-by: Paarth Neekhara * infer during validation Signed-off-by: Paarth Neekhara * use local transformer for val Signed-off-by: Paarth Neekhara * ignore eval models when loading weights Signed-off-by: Paarth Neekhara * logging statements Signed-off-by: Paarth Neekhara * asr issue Signed-off-by: Paarth Neekhara * bug fix for multinode Signed-off-by: Paarth Neekhara * add whisper asr as well for val infer Signed-off-by: Paarth Neekhara * add missing changes Signed-off-by: Paarth Neekhara * handle errors Signed-off-by: Paarth Neekhara * allow non lhotse validation loader also Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- .../tts/conf/magpietts/easy_magpietts.yaml | 2 +- .../conf/magpietts/easy_magpietts_lhotse.yaml | 3 +- .../tts/data/text_to_speech_dataset_lhotse.py | 8 + nemo/collections/tts/models/easy_magpietts.py | 291 +++++++++++++++++- .../magpietts_preference_optimization.py | 76 +---- nemo/collections/tts/parts/utils/helpers.py | 83 ++++- 6 files changed, 384 insertions(+), 79 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 6166fd68968f..8c44fef3f173 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -60,7 +60,7 @@ model: embedding_dim: 1536 hidden_dim: 1536 - audio_embedding_dim: 256 # Smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. + audio_embedding_dim: 1536 # Can set a smaller dimension for audio embeddings to reduce parameters. 
Set equal to hidden_dim for no projection. codecmodel_path: ??? max_epochs: ${max_epochs} steps_per_epoch: ${weighted_sampling_steps_per_epoch} diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 5461af8d6ee5..af943ee25dbb 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -59,7 +59,7 @@ model: embedding_dim: 1536 hidden_dim: 1536 - audio_embedding_dim: 256 # Smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. + audio_embedding_dim: 1536 # Can set a smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection. codecmodel_path: ??? # Local transformer parameters for autoregressive codebook prediction within a frame @@ -141,6 +141,7 @@ model: shuffle: false num_workers: 2 pin_memory: true + force_map_dataset: true input_cfg: - type: lhotse_shar diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 480119202e28..5e088708573f 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -225,6 +225,7 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: context_text_tokens_len_list = [] context_has_text_context_list = [] reward_list = [] + language_list = [] raw_text_list = ( [] ) # raw text here is the string of normalized text or text stored in the supervision segment. Used to distinguish from text tokens. 
@@ -236,6 +237,12 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: raise ValueError(f"Invalid format in cut.supervisions[0].speaker: {speaker}") dataset_name = speaker.strip().split()[2].split(":")[-1] dataset_name_list.append(dataset_name) + language = ( + cut.supervisions[0].language + if cut.supervisions[0].has_custom("language") + else "en" + ) + language_list.append(language) # target audio or target codes if self.load_cached_codes_if_available and cut.has_custom("target_codes"): @@ -444,6 +451,7 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: batch_dict = { "dataset_names": dataset_name_list, "raw_texts": raw_text_list, + "languages": language_list, "text": collate_vectors(token_list, padding_value=self.pad_id), # (B, max_len) "text_lens": torch.IntTensor(token_len_list), } diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 4c9b26ded4d7..224100e07ff6 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -11,8 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os import random import time + +import numpy as np +import soundfile as sf from dataclasses import dataclass from functools import partial from typing import Any, Dict, List, Optional, Sequence, Tuple @@ -25,8 +29,12 @@ from omegaconf import DictConfig from torch import nn from torch.utils.data import get_worker_info +from torch.utils.data.distributed import DistributedSampler from transformers import AutoConfig, AutoModel, AutoModelForCausalLM +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.metrics.wer import word_error_rate +from nemo.collections.asr.parts.mixins.transcription import TranscribeConfig from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( MagpieTTSLhotseDataset, @@ -42,7 +50,12 @@ SpecialAudioToken, cosine_schedule, ) -from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.collections.tts.parts.utils.helpers import ( + get_mask_from_lengths, + get_speaker_embeddings_from_filepaths, + process_text_for_cer, + transcribe_with_whisper, +) from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -496,6 +509,31 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + # Validation inference with metrics (optional) + self.run_val_inference = cfg.get('run_val_inference', False) + self.use_multilingual_asr = cfg.get('use_multilingual_asr', False) + if self.run_val_inference: + logging.info("Loading eval models for validation inference (ASR and speaker verification)...") + if self.use_multilingual_asr: + from transformers import WhisperForConditionalGeneration, WhisperProcessor + + self.whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") + self.whisper_model = 
WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") + self.whisper_model.eval() + self._eval_asr_model = None + else: + self._eval_asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained( + model_name="nvidia/parakeet-ctc-0.6b" + ) + self._eval_asr_model.freeze() + self.whisper_processor = None + self.whisper_model = None + self._eval_speaker_verification_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained( + model_name='titanet_large' + ) + self._eval_speaker_verification_model.freeze() + logging.info("Eval models loaded successfully.") + def state_dict(self, destination=None, prefix='', keep_vars=False): """ Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model @@ -505,7 +543,14 @@ def state_dict(self, destination=None, prefix='', keep_vars=False): return {} # Don't save the speaker verification and codec model in the state dict state_dict = super().state_dict(destination, prefix, keep_vars) - keys_substrings_to_exclude = ['_speaker_verification_model', '_codec_model'] + keys_substrings_to_exclude = [ + '_speaker_verification_model', + '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + 'whisper_model', + 'whisper_processor', + ] for key in list(state_dict.keys()): if any([substring in key for substring in keys_substrings_to_exclude]): del state_dict[key] @@ -521,7 +566,14 @@ def load_state_dict(self, state_dict, strict=True): if strict == False: super().load_state_dict(state_dict, strict=False) for name, child in self.named_children(): - if name in ['_speaker_verification_model', '_codec_model']: + if name in [ + '_speaker_verification_model', + '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + 'whisper_model', + 'whisper_processor', + ]: continue if any(param.numel() > 0 for param in child.parameters()): # If the module has parameters, we want to change the default mapping so that the state_dict gets @@ -1124,8 +1176,8 @@ def 
prepare_context_tensors( context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, - self.audio_bos_id, - self.audio_eos_id, + self.context_audio_bos_id, + self.context_audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks, ) @@ -1848,6 +1900,10 @@ def training_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx): # Extract inputs from batch and pass explicitly to process_batch + print(f"[Validation] global_rank: {self.global_rank}, " + f"local_rank: {self.local_rank}, " + f"world_size: {self.trainer.world_size}, " + f"batch_idx: {batch_idx}") if 'context_audio_codes' in batch: context_audio_codes = batch['context_audio_codes'] context_audio_codes_lens = batch['context_audio_codes_lens'] @@ -1915,6 +1971,140 @@ def validation_step(self, batch, batch_idx): phoneme_loss = batch_output.phoneme_loss val_output['val_phoneme_loss'] = phoneme_loss + # Run inference and compute metrics if enabled + if self.run_val_inference: + infer_output = self.infer_batch( + batch, + max_decoder_steps=220, + temperature=0.7, + topk=80, + use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR + ) + + # Get audio output directory + audio_dir = self.trainer.log_dir + audio_dir = os.path.join(audio_dir, 'val_audios', f'epoch_{self.trainer.current_epoch}') + os.makedirs(audio_dir, exist_ok=True) + + # Save predicted and context audio, collect paths for metrics + predicted_audio_paths = [] + context_audio_paths = [] + + context_audio_codes_cleaned, context_audio_codes_lens_cleaned = self.remove_special_tokens( + codes=context_audio_codes, + codes_len=context_audio_codes_lens, + ) + context_audio_cleaned, context_audio_lens_cleaned, _ = self.codes_to_audio(context_audio_codes_cleaned, context_audio_codes_lens_cleaned) + + for idx in range(infer_output.predicted_audio.size(0)): + audio_np = infer_output.predicted_audio[idx].float().detach().cpu().numpy() + audio_np = 
audio_np[: infer_output.predicted_audio_lens[idx]] + + # Log first batch on first device to wandb/tensorboard (first 3 samples) + if batch_idx == 0 and self.global_rank == 0 and idx < 3: + for logger in self.loggers: + if isinstance(logger, WandbLogger): + logger.experiment.log( + { + f"Audio_Generated/Example_{idx}": wandb.Audio( + audio_np, sample_rate=self.output_sample_rate, caption="generated" + ) + } + ) + elif isinstance(logger, TensorBoardLogger): + logger.experiment.add_audio( + f'Example_{idx}/generated', + audio_np, + global_step=self.global_step, + sample_rate=self.output_sample_rate, + ) + + # Save predicted audio to disk + if audio_dir: + audio_path = os.path.join(audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}.wav') + sf.write(audio_path, audio_np, self.output_sample_rate) + predicted_audio_paths.append(audio_path) + + # Save context audio for SSIM computation + ctx_audio_np = context_audio_codes_cleaned[idx].float().detach().cpu().numpy()[: context_audio_lens_cleaned[idx]] + ctx_path = os.path.join(audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}_context.wav') + sf.write(ctx_path, ctx_audio_np, self.output_sample_rate) + context_audio_paths.append(ctx_path) + + # Compute metrics if we have audio paths + if predicted_audio_paths and context_audio_paths: + with torch.no_grad(): + # ASR transcription for CER/WER + if self.use_multilingual_asr: + self.whisper_model.to(self.device) + languages = batch.get('languages', None) + if languages is None: + languages = ['en'] * len(predicted_audio_paths) + pred_transcripts = [] + for audio_path, lang in zip(predicted_audio_paths, languages): + try: + transcript = transcribe_with_whisper( + audio_path, lang, self.whisper_processor, self.whisper_model, self.device, normalizer=None + ) + pred_transcripts.append(process_text_for_cer(transcript)) + except Exception as e: + logging.warning(f"Val ASR transcription failed for {audio_path}: {e}") + pred_transcripts.append(None) + else: + 
pred_transcripts = self._eval_asr_model.transcribe( + predicted_audio_paths, + batch_size=len(predicted_audio_paths), + override_config=TranscribeConfig( + use_lhotse=False, + batch_size=len(predicted_audio_paths), + num_workers=0 + ) + ) + pred_transcripts = [process_text_for_cer(t.text) for t in pred_transcripts] + + # Speaker embeddings for SSIM + try: + pred_embeddings = get_speaker_embeddings_from_filepaths( + predicted_audio_paths, self._eval_speaker_verification_model, self.device + ) + ctx_embeddings = get_speaker_embeddings_from_filepaths( + context_audio_paths, self._eval_speaker_verification_model, self.device + ) + except Exception as e: + logging.warning(f"Val speaker embeddings failed: {e}") + pred_embeddings = ctx_embeddings = None + + # Compute per-sample metrics for successful cases only + batch_cer, batch_wer, batch_ssim = [], [], [] + for idx in range(len(predicted_audio_paths)): + if pred_transcripts[idx] is None: + continue + gt_transcript = process_text_for_cer(batch['raw_texts'][idx]) + cer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=True) + wer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=False) + batch_cer.append(cer) + batch_wer.append(wer) + if pred_embeddings is not None and ctx_embeddings is not None: + pred_emb = pred_embeddings[idx].cpu().float().numpy() + ctx_emb = ctx_embeddings[idx].cpu().float().numpy() + ssim = np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb)) + batch_ssim.append(ssim) + logging.info( + f"[Val] rank{self.global_rank}_batch{batch_idx}_idx{idx}: " + f"CER={cer:.4f}, WER={wer:.4f} | GT: '{gt_transcript[:50]}...' 
| Pred: '{pred_transcripts[idx][:50]}...'" + ) + + if batch_cer: + val_output['val_cer'] = torch.tensor(np.mean(batch_cer), device=self.device) + val_output['val_wer'] = torch.tensor(np.mean(batch_wer), device=self.device) + if self.use_multilingual_asr: + langs = batch.get('languages', ['en'] * len(predicted_audio_paths)) + val_output['val_languages'] = [langs[i] for i in range(len(pred_transcripts)) if pred_transcripts[i] is not None] + val_output['val_cer_list'] = batch_cer + val_output['val_wer_list'] = batch_wer + if batch_ssim: + val_output['val_ssim'] = torch.tensor(np.mean(batch_ssim), device=self.device) + self.validation_step_outputs.append(val_output) return val_output @@ -1935,6 +2125,39 @@ def on_validation_epoch_end(self): val_phoneme_loss = collect("val_phoneme_loss") self.log("val/phoneme_loss", val_phoneme_loss, prog_bar=True, sync_dist=True) + if self.run_val_inference: + # Collect metrics only from outputs that have them + def collect_if_exists(key): + values = [x[key] for x in self.validation_step_outputs if key in x] + if values: + return torch.stack(values).mean() + return None + + val_cer = collect_if_exists("val_cer") + val_wer = collect_if_exists("val_wer") + val_ssim = collect_if_exists("val_ssim") + + if val_cer is not None: + self.log("val/cer", val_cer, prog_bar=True, sync_dist=True) + if val_wer is not None: + self.log("val/wer", val_wer, prog_bar=True, sync_dist=True) + if val_ssim is not None: + self.log("val/ssim", val_ssim, prog_bar=True, sync_dist=True) + + if self.use_multilingual_asr: + lang_cer = {} + lang_wer = {} + for x in self.validation_step_outputs: + if 'val_languages' not in x or 'val_cer_list' not in x or 'val_wer_list' not in x: + continue + for lang, cer, wer in zip(x['val_languages'], x['val_cer_list'], x['val_wer_list']): + lang_cer.setdefault(lang, []).append(cer) + lang_wer.setdefault(lang, []).append(wer) + for lang in lang_cer: + self.log(f"val/cer_lang_{lang}", torch.tensor(np.mean(lang_cer[lang]), 
device=self.device), prog_bar=True, sync_dist=True) + for lang in lang_wer: + self.log(f"val/wer_lang_{lang}", torch.tensor(np.mean(lang_wer[lang]), device=self.device), prog_bar=True, sync_dist=True) + self.validation_step_outputs.clear() # free memory def get_dataset(self, dataset_cfg, dataset_type): @@ -2043,11 +2266,69 @@ def _setup_test_dataloader(self, dataset_cfg) -> torch.utils.data.DataLoader: return data_loader def setup_validation_data(self, cfg): + self._validation_uses_lhotse = cfg.get("use_lhotse", False) self._validation_dl = self._setup_test_dataloader(cfg) def setup_test_data(self, cfg): self._test_dl = self._setup_test_dataloader(cfg) + def val_dataloader(self): + """ + Override val_dataloader to lazily wrap with DistributedSampler for non-lhotse + validation. This is needed because use_distributed_sampler=False is set for lhotse + training, which also prevents Lightning from auto-wrapping the non-lhotse validation + dataloader. We do this lazily (here instead of in setup_validation_data) because + distributed is not yet initialized when setup_validation_data is called during __init__. 
+ """ + if self._validation_dl is None: + self._validation_dl = [] + + if getattr(self, '_validation_uses_lhotse', False): + print(f"[val_dataloader] rank={self.global_rank}: Using lhotse, skipping DistributedSampler wrap") + return self._validation_dl + + if not torch.distributed.is_initialized(): + print(f"[val_dataloader] rank={self.global_rank}: Distributed not initialized, skipping DistributedSampler wrap") + return self._validation_dl + + if getattr(self, '_val_dl_wrapped_with_dist_sampler', False): + return self._validation_dl + + # Wrap the validation dataloader(s) with DistributedSampler + dataloaders = self._validation_dl if isinstance(self._validation_dl, list) else [self._validation_dl] + wrapped = [] + for i, dl in enumerate(dataloaders): + if dl is not None and not isinstance(dl.sampler, DistributedSampler): + print(f"[val_dataloader] rank={self.global_rank}: Wrapping val dataloader {i} with DistributedSampler " + f"(dataset_len={len(dl.dataset)}, world_size={torch.distributed.get_world_size()}, " + f"batch_size={dl.batch_size}, num_workers={dl.num_workers})") + sampler = DistributedSampler(dl.dataset, shuffle=False) + new_dl = torch.utils.data.DataLoader( + dl.dataset, + sampler=sampler, + batch_size=dl.batch_size, + num_workers=dl.num_workers, + collate_fn=dl.collate_fn, + pin_memory=dl.pin_memory, + drop_last=dl.drop_last, + worker_init_fn=dl.worker_init_fn, + persistent_workers=dl.persistent_workers, + ) + wrapped.append(new_dl) + else: + sampler_type = type(dl.sampler).__name__ if dl is not None else "N/A" + print(f"[val_dataloader] rank={self.global_rank}: Val dataloader {i} already has " + f"sampler={sampler_type}, skipping wrap") + wrapped.append(dl) + + if isinstance(self._validation_dl, list): + self._validation_dl = wrapped + else: + self._validation_dl = wrapped[0] + + self._val_dl_wrapped_with_dist_sampler = True + return self._validation_dl + def _sample_audio_codes( self, last_hidden: torch.Tensor, diff --git 
a/nemo/collections/tts/models/magpietts_preference_optimization.py b/nemo/collections/tts/models/magpietts_preference_optimization.py index d583cacadd74..a6d11f6ac1ae 100644 --- a/nemo/collections/tts/models/magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/magpietts_preference_optimization.py @@ -15,7 +15,6 @@ import json import os import random -import string from typing import Optional import librosa @@ -27,7 +26,11 @@ import nemo.collections.asr as nemo_asr from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors +from nemo.collections.tts.parts.utils.helpers import ( + get_speaker_embeddings_from_filepaths, + process_text_for_cer, + transcribe_with_whisper, +) from nemo.utils import logging try: @@ -1030,72 +1033,3 @@ def collect(key): for val_outputs in self.validation_step_outputs: val_outputs.clear() - - -# Utility functions -def process_text_for_cer(input_text): - """ - Normalizes text for CER/WER calculation. 
- Taken from hallucination_eval.py - """ - # Convert text to lowercase - lower_case_text = input_text.lower() - - # Remove commas from text - no_comma_text = lower_case_text.replace(",", "") - # Replace "-" with spaces - no_dash_text = no_comma_text.replace("-", " ") - no_dash_text = no_dash_text.replace("'", "") - no_dash_text = no_dash_text.replace(";", "") - no_dash_text = no_dash_text.replace(".", "") - - # Replace double spaces with single space - single_space_text = " ".join(no_dash_text.split()) - - single_space_text = single_space_text.translate(str.maketrans('', '', string.punctuation)) - - # @shehzeen: Added this to handle some common errors in ASR transcripts - single_space_text = single_space_text.replace("h t t p", "http") - single_space_text = single_space_text.replace("w w w", "www") - - return single_space_text - - -def get_speaker_embeddings_from_filepaths(filepaths, speaker_verification_model, device): - audio_batch = [] - audio_lengths = [] - for filepath in filepaths: - audio, sr = sf.read(filepath) - if sr != 16000: - audio = librosa.core.resample(audio, orig_sr=sr, target_sr=16000) - audio_tensor = torch.tensor(audio, dtype=torch.float32, device=device) - audio_batch.append(audio_tensor) - audio_lengths.append(audio_tensor.size(0)) - - batch_audio_lens = torch.tensor(audio_lengths, device=device).long() - max_audio_len = int(batch_audio_lens.max().item()) - audio_batch = stack_tensors(audio_batch, max_lens=[max_audio_len]) - - _, speaker_embeddings = speaker_verification_model.forward( - input_signal=audio_batch, input_signal_length=batch_audio_lens - ) - - return speaker_embeddings - - -def transcribe_with_whisper( - audio_filepath, language, whisper_processor, whisper_model, device, normalizer: Optional[Normalizer] = None -): - speech_array, sampling_rate = librosa.load(audio_filepath, sr=16000) - forced_decoder_ids = ( - whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe") if language else None - ) - inputs = 
whisper_processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt").input_features - inputs = inputs.to(device) - with torch.no_grad(): - predicted_ids = whisper_model.generate(inputs, forced_decoder_ids=forced_decoder_ids) - transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True) - result = transcription[0] - if normalizer is not None: - result = normalizer.normalize(result) - return result diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index 1b1855cf356d..a8ee48ce57ef 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -42,15 +42,18 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import string from enum import Enum -from typing import Optional, Tuple +from typing import Any, Optional, Tuple import librosa import matplotlib.pylab as plt import numpy as np +import soundfile as sf import torch from numba import jit, prange +from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.collections.tts.torch.tts_data_types import DATA_STR2DATA_CLASS, MAIN_DATA_TYPES, WithLens from nemo.utils import logging from nemo.utils.decorators import deprecated @@ -802,3 +805,81 @@ def g2p_backward_compatible_support(g2p_target: str) -> str: # for backward compatibility g2p_target_new = g2p_target.replace("nemo_text_processing.g2p", "nemo.collections.tts.g2p") return g2p_target_new + + +def process_text_for_cer(input_text): + """ + Normalizes text for CER/WER calculation. 
+ """ + # Convert text to lowercase + lower_case_text = input_text.lower() + + # Remove commas from text + no_comma_text = lower_case_text.replace(",", "") + # Replace "-" with spaces + no_dash_text = no_comma_text.replace("-", " ") + no_dash_text = no_dash_text.replace("'", "") + no_dash_text = no_dash_text.replace(";", "") + no_dash_text = no_dash_text.replace(".", "") + + # Replace double spaces with single space + single_space_text = " ".join(no_dash_text.split()) + + single_space_text = single_space_text.translate(str.maketrans('', '', string.punctuation)) + + # Handle some common errors in ASR transcripts + single_space_text = single_space_text.replace("h t t p", "http") + single_space_text = single_space_text.replace("w w w", "www") + + return single_space_text + + +def transcribe_with_whisper( + audio_filepath: str, + language: Optional[str], + whisper_processor: Any, + whisper_model: Any, + device: torch.device, + normalizer: Optional[Any] = None, +) -> str: + """ + Transcribe audio with Whisper. Optionally normalize the transcript if a normalizer is provided. + """ + speech_array, sampling_rate = librosa.load(audio_filepath, sr=16000) + forced_decoder_ids = ( + whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe") if language else None + ) + inputs = whisper_processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt").input_features + inputs = inputs.to(device) + with torch.no_grad(): + predicted_ids = whisper_model.generate(inputs, forced_decoder_ids=forced_decoder_ids) + transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True) + result = transcription[0] + if normalizer is not None: + result = normalizer.normalize(result) + return result + + +def get_speaker_embeddings_from_filepaths(filepaths, speaker_verification_model, device): + """ + Get speaker embeddings from audio filepaths using a speaker verification model. 
+ """ + audio_batch = [] + audio_lengths = [] + for filepath in filepaths: + audio, sr = sf.read(filepath) + if sr != 16000: + audio = librosa.core.resample(audio, orig_sr=sr, target_sr=16000) + audio_tensor = torch.tensor(audio, dtype=torch.float32, device=device) + audio_batch.append(audio_tensor) + audio_lengths.append(audio_tensor.size(0)) + + batch_audio_lens = torch.tensor(audio_lengths, device=device).long() + max_audio_len = int(batch_audio_lens.max().item()) + audio_batch = stack_tensors(audio_batch, max_lens=[max_audio_len]) + + _, speaker_embeddings = speaker_verification_model.forward( + input_signal=audio_batch, input_signal_length=batch_audio_lens + ) + + return speaker_embeddings From 97d98daa7273ba854506770cb6e794347d5ce5e6 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Sat, 7 Feb 2026 06:18:32 +0000 Subject: [PATCH 39/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- .../tts/data/text_to_speech_dataset_lhotse.py | 6 +- nemo/collections/tts/models/easy_magpietts.py | 78 +++++++++++++------ 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 5e088708573f..ba111838efa3 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -237,11 +237,7 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: raise ValueError(f"Invalid format in cut.supervisions[0].speaker: {speaker}") dataset_name = speaker.strip().split()[2].split(":")[-1] dataset_name_list.append(dataset_name) - language = ( - cut.supervisions[0].language - if cut.supervisions[0].has_custom("language") - else "en" - ) + language = cut.supervisions[0].language if cut.supervisions[0].has_custom("language") else "en" language_list.append(language) # target audio or target codes diff --git 
a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 224100e07ff6..c51315140d31 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -14,13 +14,12 @@ import os import random import time - -import numpy as np -import soundfile as sf from dataclasses import dataclass from functools import partial from typing import Any, Dict, List, Optional, Sequence, Tuple +import numpy as np +import soundfile as sf import torch import wandb from hydra.utils import instantiate @@ -1900,10 +1899,12 @@ def training_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx): # Extract inputs from batch and pass explicitly to process_batch - print(f"[Validation] global_rank: {self.global_rank}, " - f"local_rank: {self.local_rank}, " - f"world_size: {self.trainer.world_size}, " - f"batch_idx: {batch_idx}") + print( + f"[Validation] global_rank: {self.global_rank}, " + f"local_rank: {self.local_rank}, " + f"world_size: {self.trainer.world_size}, " + f"batch_idx: {batch_idx}" + ) if 'context_audio_codes' in batch: context_audio_codes = batch['context_audio_codes'] context_audio_codes_lens = batch['context_audio_codes_lens'] @@ -1978,7 +1979,7 @@ def validation_step(self, batch, batch_idx): max_decoder_steps=220, temperature=0.7, topk=80, - use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR + use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, ) # Get audio output directory @@ -1994,7 +1995,9 @@ def validation_step(self, batch, batch_idx): codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio_cleaned, context_audio_lens_cleaned, _ = self.codes_to_audio(context_audio_codes_cleaned, context_audio_codes_lens_cleaned) + context_audio_cleaned, context_audio_lens_cleaned, _ = self.codes_to_audio( + context_audio_codes_cleaned, context_audio_codes_lens_cleaned + ) for 
idx in range(infer_output.predicted_audio.size(0)): audio_np = infer_output.predicted_audio[idx].float().detach().cpu().numpy() @@ -2026,7 +2029,13 @@ def validation_step(self, batch, batch_idx): predicted_audio_paths.append(audio_path) # Save context audio for SSIM computation - ctx_audio_np = context_audio_codes_cleaned[idx].float().detach().cpu().numpy()[: context_audio_lens_cleaned[idx]] + ctx_audio_np = ( + context_audio_codes_cleaned[idx] + .float() + .detach() + .cpu() + .numpy()[: context_audio_lens_cleaned[idx]] + ) ctx_path = os.path.join(audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}_context.wav') sf.write(ctx_path, ctx_audio_np, self.output_sample_rate) context_audio_paths.append(ctx_path) @@ -2044,7 +2053,12 @@ def validation_step(self, batch, batch_idx): for audio_path, lang in zip(predicted_audio_paths, languages): try: transcript = transcribe_with_whisper( - audio_path, lang, self.whisper_processor, self.whisper_model, self.device, normalizer=None + audio_path, + lang, + self.whisper_processor, + self.whisper_model, + self.device, + normalizer=None, ) pred_transcripts.append(process_text_for_cer(transcript)) except Exception as e: @@ -2055,10 +2069,8 @@ def validation_step(self, batch, batch_idx): predicted_audio_paths, batch_size=len(predicted_audio_paths), override_config=TranscribeConfig( - use_lhotse=False, - batch_size=len(predicted_audio_paths), - num_workers=0 - ) + use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0 + ), ) pred_transcripts = [process_text_for_cer(t.text) for t in pred_transcripts] @@ -2099,7 +2111,9 @@ def validation_step(self, batch, batch_idx): val_output['val_wer'] = torch.tensor(np.mean(batch_wer), device=self.device) if self.use_multilingual_asr: langs = batch.get('languages', ['en'] * len(predicted_audio_paths)) - val_output['val_languages'] = [langs[i] for i in range(len(pred_transcripts)) if pred_transcripts[i] is not None] + val_output['val_languages'] = [ + langs[i] for i in 
range(len(pred_transcripts)) if pred_transcripts[i] is not None + ] val_output['val_cer_list'] = batch_cer val_output['val_wer_list'] = batch_wer if batch_ssim: @@ -2154,9 +2168,19 @@ def collect_if_exists(key): lang_cer.setdefault(lang, []).append(cer) lang_wer.setdefault(lang, []).append(wer) for lang in lang_cer: - self.log(f"val/cer_lang_{lang}", torch.tensor(np.mean(lang_cer[lang]), device=self.device), prog_bar=True, sync_dist=True) + self.log( + f"val/cer_lang_{lang}", + torch.tensor(np.mean(lang_cer[lang]), device=self.device), + prog_bar=True, + sync_dist=True, + ) for lang in lang_wer: - self.log(f"val/wer_lang_{lang}", torch.tensor(np.mean(lang_wer[lang]), device=self.device), prog_bar=True, sync_dist=True) + self.log( + f"val/wer_lang_{lang}", + torch.tensor(np.mean(lang_wer[lang]), device=self.device), + prog_bar=True, + sync_dist=True, + ) self.validation_step_outputs.clear() # free memory @@ -2288,7 +2312,9 @@ def val_dataloader(self): return self._validation_dl if not torch.distributed.is_initialized(): - print(f"[val_dataloader] rank={self.global_rank}: Distributed not initialized, skipping DistributedSampler wrap") + print( + f"[val_dataloader] rank={self.global_rank}: Distributed not initialized, skipping DistributedSampler wrap" + ) return self._validation_dl if getattr(self, '_val_dl_wrapped_with_dist_sampler', False): @@ -2299,9 +2325,11 @@ def val_dataloader(self): wrapped = [] for i, dl in enumerate(dataloaders): if dl is not None and not isinstance(dl.sampler, DistributedSampler): - print(f"[val_dataloader] rank={self.global_rank}: Wrapping val dataloader {i} with DistributedSampler " - f"(dataset_len={len(dl.dataset)}, world_size={torch.distributed.get_world_size()}, " - f"batch_size={dl.batch_size}, num_workers={dl.num_workers})") + print( + f"[val_dataloader] rank={self.global_rank}: Wrapping val dataloader {i} with DistributedSampler " + f"(dataset_len={len(dl.dataset)}, world_size={torch.distributed.get_world_size()}, " + 
f"batch_size={dl.batch_size}, num_workers={dl.num_workers})" + ) sampler = DistributedSampler(dl.dataset, shuffle=False) new_dl = torch.utils.data.DataLoader( dl.dataset, @@ -2317,8 +2345,10 @@ def val_dataloader(self): wrapped.append(new_dl) else: sampler_type = type(dl.sampler).__name__ if dl is not None else "N/A" - print(f"[val_dataloader] rank={self.global_rank}: Val dataloader {i} already has " - f"sampler={sampler_type}, skipping wrap") + print( + f"[val_dataloader] rank={self.global_rank}: Val dataloader {i} already has " + f"sampler={sampler_type}, skipping wrap" + ) wrapped.append(dl) if isinstance(self._validation_dl, list): From 003c4395cb30c919b94e3702e66169d9e4b715b6 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 10:08:41 -0800 Subject: [PATCH 40/94] bug fixes in inference and logging Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 60 ++++++++++++++++++- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index c51315140d31..0e3f6ce55797 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import json import os import random import time @@ -519,6 +520,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") self.whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") self.whisper_model.eval() + for param in self.whisper_model.parameters(): + param.requires_grad = False self._eval_asr_model = None else: self._eval_asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained( @@ -533,6 +536,38 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._eval_speaker_verification_model.freeze() logging.info("Eval models loaded successfully.") + def setup_optimizer_param_groups(self): + """ + Override to exclude frozen eval/inference-only models from the optimizer. + This prevents optimizer state mismatch errors when resuming from checkpoints + that were saved before these eval models were added. + """ + modules_to_exclude = { + '_speaker_verification_model', + # '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + 'whisper_model', + 'whisper_processor', + } + + # Collect parameter ids to exclude + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude: + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + # Build param group with only trainable (non-excluded) parameters + trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] + + logging.info( + f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " + f"{len(excluded_param_ids)} params excluded (eval models)" + ) + + self._optimizer_param_groups = [{"params": trainable_params}] + def state_dict(self, destination=None, prefix='', keep_vars=False): """ Only used for saving checkpoints. 
On save, we remove _speaker_verification_model and _codec_model @@ -1980,6 +2015,8 @@ def validation_step(self, batch, batch_idx): temperature=0.7, topk=80, use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, + use_cfg=True, + cfg_scale=2.5 ) # Get audio output directory @@ -2030,7 +2067,7 @@ def validation_step(self, batch, batch_idx): # Save context audio for SSIM computation ctx_audio_np = ( - context_audio_codes_cleaned[idx] + context_audio_cleaned[idx] .float() .detach() .cpu() @@ -2096,16 +2133,35 @@ def validation_step(self, batch, batch_idx): wer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=False) batch_cer.append(cer) batch_wer.append(wer) + ssim = None if pred_embeddings is not None and ctx_embeddings is not None: pred_emb = pred_embeddings[idx].cpu().float().numpy() ctx_emb = ctx_embeddings[idx].cpu().float().numpy() - ssim = np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb)) + ssim = float(np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb))) batch_ssim.append(ssim) logging.info( f"[Val] rank{self.global_rank}_batch{batch_idx}_idx{idx}: " f"CER={cer:.4f}, WER={wer:.4f} | GT: '{gt_transcript[:50]}...' 
| Pred: '{pred_transcripts[idx][:50]}...'" ) + # Save per-audio metrics JSON file alongside the audio file + if audio_dir: + metrics_dict = { + 'cer': float(cer), + 'wer': float(wer), + 'ssim': ssim, + 'gt_transcript': gt_transcript, + 'pred_transcript': pred_transcripts[idx], + 'audio_path': predicted_audio_paths[idx], + 'epoch': self.trainer.current_epoch, + 'global_step': self.global_step, + } + metrics_path = os.path.join( + audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}_metrics.json' + ) + with open(metrics_path, 'w') as f: + json.dump(metrics_dict, f, indent=2) + if batch_cer: val_output['val_cer'] = torch.tensor(np.mean(batch_cer), device=self.device) val_output['val_wer'] = torch.tensor(np.mean(batch_wer), device=self.device) From af7e76b253a20bca9ca86ffcaa0143c1d44a6d10 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 13:38:06 -0800 Subject: [PATCH 41/94] more tests Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 208 ++++++-- .../tts/test_infer_vs_process_batch.py | 487 ++++++++++++++++++ 2 files changed, 660 insertions(+), 35 deletions(-) create mode 100644 tests/collections/tts/test_infer_vs_process_batch.py diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 0e3f6ce55797..0bb3aaff8441 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -93,6 +93,9 @@ class ProcessBatchOutput: local_transformer_loss: Loss from local transformer (None if not using local transformer) local_transformer_logits: Logits from local transformer, shape (B, T', num_codebooks * num_tokens_per_codebook) logits: Predicted logits from the main decoder, shape (B, T', num_codebooks * num_tokens_per_codebook) + phoneme_logits: Predicted phoneme logits, shape (B, T', phoneme_stacking_factor * phoneme_vocab_size). None if no phoneme tokenizer. 
+ phoneme_tokens_target: Target phoneme tokens (shifted), shape (B, S, T'). None if no phoneme tokenizer. + phoneme_tokens_lens_target: Length of target phoneme tokens (B,). None if no phoneme tokenizer. audio_codes_target: Target audio codes for the decoder, shape (B, C, T') audio_codes_lens_target: Length of target audio codes for each batch item, shape (B,) context_audio_codes: Audio codes extracted from context audio, shape (B, C, T') @@ -106,6 +109,9 @@ class ProcessBatchOutput: local_transformer_loss: Optional[torch.Tensor] local_transformer_logits: Optional[torch.Tensor] logits: torch.Tensor + phoneme_logits: Optional[torch.Tensor] + phoneme_tokens_target: Optional[torch.Tensor] + phoneme_tokens_lens_target: Optional[torch.Tensor] audio_codes_target: torch.Tensor audio_codes_lens_target: torch.Tensor context_audio_codes: torch.Tensor @@ -202,6 +208,8 @@ class StreamingState: phoneme_prediction_end_idx: torch.Tensor gt_phoneme_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT embeddings gt_phoneme_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking + gt_audio_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT audio embeddings + gt_audio_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking @dataclass @@ -225,6 +233,9 @@ class InferBatchOutput: predicted_codes: torch.Tensor # (B, num_codebooks, T_frames) predicted_codes_lens: torch.Tensor # (B,) rtf_metrics: Dict[str, Any] + predicted_phoneme_tokens: Optional[torch.Tensor] = None # (B, phoneme_stacking_factor, T_phoneme_steps) + predicted_phoneme_tokens_lens: Optional[torch.Tensor] = None # (B,) number of valid phoneme steps per item + phoneme_prediction_start_idx: Optional[torch.Tensor] = None # (B,) start index into predicted_phoneme_tokens def worker_init_fn(worker_id): @@ -970,10 +981,14 @@ def sample_codes_from_logits( codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - 
codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) all_preds.append(codebook_preds) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) return all_preds @@ -992,10 +1007,14 @@ def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, t codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) all_preds.append(codebook_preds) all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) return all_preds @@ -1810,6 +1829,9 @@ def process_batch( # Compute phoneme loss if applicable phoneme_loss = None + pb_phoneme_logits = None + pb_phoneme_tokens_target = None + pb_phoneme_tokens_lens_target = None if self.phoneme_tokenizer is not None and phoneme_tokens_stacked is not None: # Phoneme predictions start at phoneme_delay pred_embeddings_phoneme = self.slice_pred_embeddings( @@ -1817,11 +1839,13 @@ def process_batch( context_lens=phoneme_delay, target_lens=phoneme_tokens_lens_stacked - 1, ) - 
phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) + pb_phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) + pb_phoneme_tokens_target = phoneme_tokens_stacked[:, :, 1:].long() + pb_phoneme_tokens_lens_target = phoneme_tokens_lens_stacked - 1 if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): phoneme_loss, _ = self.compute_phoneme_loss( - phoneme_logits, phoneme_tokens_stacked[:, :, 1:].long(), phoneme_tokens_lens_stacked - 1 + pb_phoneme_logits, pb_phoneme_tokens_target, pb_phoneme_tokens_lens_target ) print("No Dropout - phoneme loss:", phoneme_loss.item()) else: @@ -1837,6 +1861,9 @@ def process_batch( local_transformer_loss=local_transformer_loss, local_transformer_logits=local_transformer_logits, logits=logits, + phoneme_logits=pb_phoneme_logits, + phoneme_tokens_target=pb_phoneme_tokens_target, + phoneme_tokens_lens_target=pb_phoneme_tokens_lens_target, audio_codes_target=audio_codes_target, audio_codes_lens_target=audio_codes_lens_target, context_audio_codes=context_audio_codes_processed, @@ -2451,7 +2478,10 @@ def _sample_audio_codes( # Parallel sampling from all codebook logits audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) # Argmax sampling for reliable EOS detection - all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) + if temperature <= 0.0: + all_codes_next_argmax = audio_codes_next # already argmax + else: + all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) return audio_codes_next, all_codes_next_argmax @@ -2471,6 +2501,8 @@ def streaming_init( phoneme_sampling_method: str = 'argmax', gt_phoneme_tokens: Optional[torch.Tensor] = None, gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, + gt_audio_codes: Optional[torch.Tensor] = None, + gt_audio_codes_lens: Optional[torch.Tensor] = None, ) -> StreamingState: """ Initialize streaming TTS inference state. 
@@ -2509,6 +2541,9 @@ def streaming_init( phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. gt_phoneme_tokens: Optional GT phoneme tokens (B, L) with BOS/EOS for teacher forcing. gt_phoneme_tokens_lens: Lengths of GT phoneme tokens (B,). + gt_audio_codes: Optional GT audio codes (B, C*S, T) already stacked with BOS/EOS, + input portion ([:, :, :-1]) for teacher forcing. Pre-processed by caller. + gt_audio_codes_lens: Lengths of GT audio codes (B,) after stacking. Returns: StreamingState: Initial state for streaming inference. @@ -2586,6 +2621,13 @@ def streaming_init( ) gt_phoneme_embeddings = self.embed_phoneme_tokens(gt_phoneme_stacked) # (B, T', E) + # Process GT audio codes if provided (for teacher forcing) + gt_audio_embeddings = None + gt_audio_lens_state = None + if gt_audio_codes is not None and gt_audio_codes_lens is not None: + gt_audio_embeddings = self.embed_audio_tokens(gt_audio_codes) # (B, T', E) + gt_audio_lens_state = gt_audio_codes_lens + # Initialize streaming state with batch support state = StreamingState( batch_size=batch_size, @@ -2624,6 +2666,8 @@ def streaming_init( phoneme_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), gt_phoneme_embeddings=gt_phoneme_embeddings, gt_phoneme_lens=gt_phoneme_lens, + gt_audio_embeddings=gt_audio_embeddings, + gt_audio_lens=gt_audio_lens_state, ) return state @@ -2722,11 +2766,13 @@ def streaming_step( if force_dropout_text: text_embedded = text_embedded * 0 - text_add_mask = needs_text.view(batch_size, 1, 1).float() - next_input = next_input + text_embedded * text_add_mask # Check for EOS tokens - mark those items as text_finished - # Items that receive EOS should not have their text embedded added after this step + # The EOS token itself IS embedded normally (matching process_batch behavior + # where EOS is part of the text sequence). After this step, text_finished is set + # so subsequent steps won't add any text embedding. 
is_eos_token = text_tokens == self.eos_id # (B,) bool + text_add_mask = needs_text.view(batch_size, 1, 1).float() + next_input = next_input + text_embedded * text_add_mask state.text_finished = state.text_finished | is_eos_token elif text_tokens is None: @@ -2778,28 +2824,39 @@ def streaming_step( # --- Audio embedding for audio phase items --- if needs_audio.any(): - # Determine which items are at first audio step - first_audio_step = needs_audio & (state.audio_steps == 0) - has_last_audio = needs_audio & ~first_audio_step & (state.last_audio_codes is not None) - audio_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) - if first_audio_step.any(): - # Create BOS for items at first audio step - audio_bos = torch.full( - (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), - self.audio_bos_id, - device=device, - ).long() - audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) - first_mask = first_audio_step.view(batch_size, 1, 1).float() - audio_emb = audio_emb + audio_bos_emb * first_mask - - if has_last_audio.any() and state.last_audio_codes is not None: - # Use last predicted audio - last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) - last_mask = has_last_audio.view(batch_size, 1, 1).float() - audio_emb = audio_emb + last_audio_emb * last_mask + if state.gt_audio_embeddings is not None: + # Teacher forcing: use pre-computed GT audio embeddings + # Only use GT embedding if within valid length, otherwise zero + within_gt_len = state.audio_steps < state.gt_audio_lens # (B,) + positions = state.audio_steps.clamp(max=state.gt_audio_embeddings.size(1) - 1) + gt_emb = state.gt_audio_embeddings[ + torch.arange(batch_size, device=device), positions, : + ].unsqueeze(1) # (B, 1, E) + audio_mask = (needs_audio & within_gt_len).view(batch_size, 1, 1).float() + audio_emb = audio_emb + gt_emb * audio_mask + else: + # Prediction mode: use BOS or last predicted audio + first_audio_step = 
needs_audio & (state.audio_steps == 0) + has_last_audio = needs_audio & ~first_audio_step & (state.last_audio_codes is not None) + + if first_audio_step.any(): + # Create BOS for items at first audio step + audio_bos = torch.full( + (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), + self.audio_bos_id, + device=device, + ).long() + audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) + first_mask = first_audio_step.view(batch_size, 1, 1).float() + audio_emb = audio_emb + audio_bos_emb * first_mask + + if has_last_audio.any() and state.last_audio_codes is not None: + # Use last predicted audio + last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) + last_mask = has_last_audio.view(batch_size, 1, 1).float() + audio_emb = audio_emb + last_audio_emb * last_mask next_input = next_input + audio_emb @@ -2952,6 +3009,14 @@ def streaming_step( state.all_predictions.append(audio_codes_unstacked) audio_codes_next = audio_codes_unstacked + # Force-finish items when GT audio is exhausted (teacher forcing). + # This is checked AFTER predictions so the last valid prediction is still made. + # audio_steps was already incremented above. When audio_steps >= gt_audio_lens, + # we've consumed all GT input positions and made all corresponding predictions. + if state.gt_audio_embeddings is not None and state.gt_audio_lens is not None: + gt_exhausted = needs_audio & (state.audio_steps >= state.gt_audio_lens) + state.finished = state.finished | gt_exhausted + return state, audio_codes_next, pred_phoneme_tokens def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: @@ -3175,6 +3240,7 @@ def infer_batch( phoneme_input_type: str = 'pred', phoneme_sampling_method: str = 'argmax', force_dropout_text: bool = False, + use_teacher_forced: bool = False, ) -> InferBatchOutput: """ Batch inference using streaming infrastructure. 
@@ -3192,8 +3258,11 @@ def infer_batch( - context_audio / context_audio_lens: Raw context audio to encode - phoneme_tokens (optional): GT phoneme tokens (B, L'') - phoneme_tokens_lens (optional): Lengths (B,) + For teacher forcing (use_teacher_forced=True), also requires: + - audio_codes / audio_codes_lens: GT audio codes (B, C, T) OR + - audio / audio_lens: Raw audio waveforms to encode max_decoder_steps: Maximum number of decoder steps. - temperature: Sampling temperature for audio codes. + temperature: Sampling temperature for audio codes. Use 0.0 for argmax. topk: Top-k sampling parameter. use_cfg: Whether to use classifier-free guidance. cfg_scale: CFG scale factor. @@ -3201,6 +3270,8 @@ def infer_batch( phoneme_input_type: 'gt' or 'pred' for phoneme tokens. phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. force_dropout_text: Whether to dropout text embeddings. + use_teacher_forced: If True, feed GT audio codes (and force GT phonemes, argmax sampling) + instead of predicted codes at each streaming step. Returns: InferBatchOutput containing predicted audio, codes, and RTF metrics. @@ -3227,6 +3298,53 @@ def infer_batch( gt_phoneme_tokens = batch.get('phoneme_tokens') gt_phoneme_tokens_lens = batch.get('phoneme_tokens_lens') + # Prepare GT audio codes for teacher forcing if requested + gt_audio_codes_for_init = None + gt_audio_codes_lens_for_init = None + if use_teacher_forced: + # Force GT phoneme input and argmax sampling + phoneme_input_type = 'gt' + temperature = 0.0 + + # Get GT audio codes - support both codes and raw audio + if 'audio_codes' in batch: + gt_audio_codes_raw = batch['audio_codes'] + gt_audio_codes_lens_raw = batch['audio_codes_lens'] + elif 'audio' in batch: + gt_audio_codes_raw, gt_audio_codes_lens_raw = self.audio_to_codes( + batch['audio'], batch['audio_lens'] + ) + else: + raise ValueError( + "Teacher forcing requires 'audio_codes'/'audio_codes_lens' or 'audio'/'audio_lens' in batch." 
+ ) + + # Pre-process GT audio codes same as prepare_audio_channel_embeddings: + # codec convert, add BOS/EOS, stack, then take input portion ([:, :, :-1]) + if self._codec_converter is not None: + gt_audio_codes_raw = self._codec_converter.convert_original_to_new( + audio_tokens=gt_audio_codes_raw, audio_lens=gt_audio_codes_lens_raw + ).long() + + gt_audio_codes_processed, gt_audio_codes_lens_processed = self.add_special_tokens( + codes=gt_audio_codes_raw, + codes_len=gt_audio_codes_lens_raw, + bos_id=self.audio_bos_id, + eos_id=self.audio_eos_id, + ) + gt_audio_codes_processed, gt_audio_codes_lens_processed = self.stack_codes( + gt_audio_codes_processed, + gt_audio_codes_lens_processed, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) + + # Input portion: all tokens except the last (teacher forcing shift) + gt_audio_codes_for_init = gt_audio_codes_processed[:, :, :-1] + gt_audio_codes_lens_for_init = gt_audio_codes_lens_processed - 1 + batch_size = text.size(0) # Initialize streaming state @@ -3244,6 +3362,8 @@ def infer_batch( phoneme_sampling_method=phoneme_sampling_method, gt_phoneme_tokens=gt_phoneme_tokens, gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, + gt_audio_codes=gt_audio_codes_for_init, + gt_audio_codes_lens=gt_audio_codes_lens_for_init, ) time_to_first_prediction = None @@ -3296,12 +3416,30 @@ def infer_batch( 'batch_size': batch_size, } + # Extract raw phoneme predictions from state + ib_phoneme_tokens = None + ib_phoneme_tokens_lens = None + if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: + # Stack: each element is (B, phoneme_stacking_factor), stack along time -> (B, S, T) + ib_phoneme_tokens = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) + # Compute per-item lengths using start/end indices + ib_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) + for i in range(batch_size): + start = max(0, 
state.phoneme_prediction_start_idx[i].item()) + end = state.phoneme_prediction_end_idx[i].item() + if end < 0: + end = ib_phoneme_tokens.size(-1) + ib_phoneme_tokens_lens[i] = end - start + return InferBatchOutput( predicted_audio=finalize_output.audio, predicted_audio_lens=finalize_output.audio_len, predicted_codes=finalize_output.audio_codes, predicted_codes_lens=finalize_output.audio_codes_len, rtf_metrics=rtf_metrics, + predicted_phoneme_tokens=ib_phoneme_tokens, + predicted_phoneme_tokens_lens=ib_phoneme_tokens_lens, + phoneme_prediction_start_idx=state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None, ) @classmethod diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py new file mode 100644 index 000000000000..b9838602586a --- /dev/null +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -0,0 +1,487 @@ +""" +Test script to verify that infer_batch (teacher-forced) produces the same audio code +and phoneme predictions as process_batch (single forward pass). + +Usage: + python tests/collections/tts/test_infer_vs_process_batch.py --codecmodel_path /path/to/codec.nemo + +The script: +1. Builds a tiny NemotronH-backed EasyMagpieTTSModel with a real codec model. +2. Creates synthetic random inputs (with variable lengths per batch item). +3. Runs process_batch (full-sequence forward) and infer_batch (streaming, teacher-forced). +4. Compares the argmax audio code predictions and phoneme predictions from both paths. +5. Repeats for multiple configurations. 
+""" + +import argparse +import sys +import torch +from omegaconf import OmegaConf + +from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel + + +def build_minimal_config(codecmodel_path: str) -> OmegaConf: + """Build a minimal OmegaConf config for a tiny NemotronH model.""" + hidden_size = 256 + + cfg_dict = { + # Decoder backend + 'decoder_type': 'nemotron_h', + 'nemotron_h_config': { + 'hidden_size': hidden_size, + 'num_hidden_layers': 2, + 'vocab_size': 131072, + 'num_attention_heads': 4, + 'num_key_value_heads': 2, + 'attention_dropout': 0.0, + 'attention_bias': False, + 'max_position_embeddings': 4096, + 'mamba_num_heads': 16, + 'mamba_head_dim': 16, + 'ssm_state_size': 128, + 'conv_kernel': 4, + 'n_groups': 8, + 'chunk_size': 256, + 'mamba_hidden_act': 'silu', + 'use_conv_bias': True, + 'use_bias': False, + 'intermediate_size': 512, + 'mlp_hidden_act': 'silu', + 'mlp_bias': False, + 'hybrid_override_pattern': 'M*', + 'layer_norm_epsilon': 1e-5, + 'residual_in_fp32': True, + }, + 'embedding_dim': hidden_size, + 'hidden_dim': hidden_size, + 'audio_embedding_dim': hidden_size, + 'codecmodel_path': codecmodel_path, + # Text tokenizer - use a simple AutoTokenizer + 'text_tokenizers': { + 'test_tokenizer': { + '_target_': 'AutoTokenizer', + 'pretrained_model': 'gpt2', + }, + }, + # Phoneme tokenizer + 'phoneme_tokenizer': { + '_target_': 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer', + 'tokenizer_path': 'scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json', + }, + 'phoneme_stacking_factor': 1, + # Training modes (single streaming mode) + 'training_modes': [ + { + 'name': 'streaming_4_8', + 'text_input_mode': 'streaming', + 'streaming_phonemes_delay': 4, + 'streaming_speech_delay': 8, + }, + ], + 'frame_stacking_factor': 2, + 'cfg_unconditional_prob': 0.0, + 'dropout_text_input_prob': 0.0, + 'dropout_phoneme_input_prob': 0.0, + 'local_transformer_type': 'none', + 'run_val_inference': 
False, + # Optim placeholder (required by ModelPT but not used) + 'optim': { + '_target_': 'torch.optim.AdamW', + 'lr': 1e-4, + }, + # No dataloaders + } + return OmegaConf.create(cfg_dict) + + +def create_synthetic_batch( + model, + batch_size=2, + text_lens_list=None, + audio_frames_list=None, + context_text_lens_list=None, + context_audio_frames_list=None, + phoneme_lens_list=None, + device='cpu', +): + """Create a synthetic batch with random valid token IDs and variable lengths per item. + + If *_list args are None, defaults to uniform lengths for all items. + """ + num_codebooks = model.num_audio_codebooks + codebook_size = model.codebook_size + text_vocab_size = model.bos_id # valid text tokens are [0, bos_id) + phoneme_vocab_size = model.phoneme_tokenizer.vocab_size - 2 # exclude BOS/EOS + + # Defaults + if text_lens_list is None: + text_lens_list = [20] * batch_size + if audio_frames_list is None: + audio_frames_list = [30] * batch_size + if context_text_lens_list is None: + context_text_lens_list = [10] * batch_size + if context_audio_frames_list is None: + context_audio_frames_list = [15] * batch_size + if phoneme_lens_list is None: + phoneme_lens_list = [25] * batch_size + + assert len(text_lens_list) == batch_size + assert len(audio_frames_list) == batch_size + assert len(context_text_lens_list) == batch_size + assert len(context_audio_frames_list) == batch_size + assert len(phoneme_lens_list) == batch_size + + # Max lengths for padding + max_text_len = max(text_lens_list) + max_audio_frames = max(audio_frames_list) + max_context_text_len = max(context_text_lens_list) + max_context_audio_frames = max(context_audio_frames_list) + max_phoneme_len = max(phoneme_lens_list) + + # Text tokens: random tokens + EOS at the end (matching dataset behavior) + text = torch.zeros(batch_size, max_text_len, dtype=torch.long, device=device) + for b in range(batch_size): + tl = text_lens_list[b] + text[b, :tl - 1] = torch.randint(0, text_vocab_size, (tl - 1,), 
device=device) + text[b, tl - 1] = model.eos_id # EOS as last valid token + text_lens = torch.tensor(text_lens_list, dtype=torch.long, device=device) + + # Context text tokens + context_text_tokens = torch.zeros(batch_size, max_context_text_len, dtype=torch.long, device=device) + for b in range(batch_size): + cl = context_text_lens_list[b] + context_text_tokens[b, :cl] = torch.randint(0, text_vocab_size, (cl,), device=device) + context_text_tokens_lens = torch.tensor(context_text_lens_list, dtype=torch.long, device=device) + + # Audio codes (raw, without BOS/EOS) + audio_codes = torch.zeros(batch_size, num_codebooks, max_audio_frames, dtype=torch.long, device=device) + for b in range(batch_size): + af = audio_frames_list[b] + audio_codes[b, :, :af] = torch.randint(0, codebook_size, (num_codebooks, af), device=device) + audio_codes_lens = torch.tensor(audio_frames_list, dtype=torch.long, device=device) + + # Context audio codes (raw, without BOS/EOS) + context_audio_codes = torch.zeros(batch_size, num_codebooks, max_context_audio_frames, dtype=torch.long, device=device) + for b in range(batch_size): + caf = context_audio_frames_list[b] + context_audio_codes[b, :, :caf] = torch.randint(0, codebook_size, (num_codebooks, caf), device=device) + context_audio_codes_lens = torch.tensor(context_audio_frames_list, dtype=torch.long, device=device) + + # Phoneme tokens (raw IDs, BOS/EOS will be added by the model) + phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) + for b in range(batch_size): + pl = phoneme_lens_list[b] + phoneme_tokens[b, :pl] = torch.randint(0, phoneme_vocab_size, (pl,), device=device) + phoneme_tokens_lens = torch.tensor(phoneme_lens_list, dtype=torch.long, device=device) + + batch = { + 'text': text, + 'text_lens': text_lens, + 'context_text_tokens': context_text_tokens, + 'context_text_tokens_lens': context_text_tokens_lens, + 'audio_codes': audio_codes, + 'audio_codes_lens': audio_codes_lens, + 
'context_audio_codes': context_audio_codes, + 'context_audio_codes_lens': context_audio_codes_lens, + 'phoneme_tokens': phoneme_tokens, + 'phoneme_tokens_lens': phoneme_tokens_lens, + } + return batch + + +def compare_audio_codes(model, pb_output, ib_output, batch): + """Compare audio codes from process_batch and infer_batch. Returns True if all match.""" + C = model.num_audio_codebooks + S = model.frame_stacking_factor + C_stacked = C * S + V = model.num_all_tokens_per_codebook + pb_logits = pb_output.logits # (B, T_stacked, C_stacked * V) + T_stacked = pb_logits.size(1) + batch_size = batch['text'].size(0) + + # Extract per-codebook argmax at stacked resolution + pb_stacked_codes_list = [] + for cb_idx in range(C_stacked): + si = cb_idx * V + ei = si + V + cb_logits = pb_logits[:, :, si:ei] # (B, T_stacked, V) + cb_preds = cb_logits.argmax(dim=-1) # (B, T_stacked) + pb_stacked_codes_list.append(cb_preds) + pb_stacked_codes = torch.stack(pb_stacked_codes_list, dim=1) # (B, C_stacked, T_stacked) + + # Unstack: (B, C*S, T_stacked) -> (B, C, S, T_stacked) -> (B, C, T_stacked, S) -> (B, C, T_stacked*S) + pb_unstacked = pb_stacked_codes.view(batch_size, C, S, T_stacked) + pb_unstacked = pb_unstacked.permute(0, 1, 3, 2).contiguous() + pb_unstacked = pb_unstacked.reshape(batch_size, C, T_stacked * S) + pb_unstacked_lens = pb_output.audio_codes_lens_target * S + + ib_codes = ib_output.predicted_codes + ib_codes_lens = ib_output.predicted_codes_lens + + print(f" process_batch argmax codes (unstacked): {pb_unstacked.shape}, lens: {pb_unstacked_lens.tolist()}") + print(f" infer_batch predicted codes: {ib_codes.shape}, lens: {ib_codes_lens.tolist()}") + + all_match = True + for b in range(batch_size): + pb_len = pb_unstacked_lens[b].item() + ib_len = ib_codes_lens[b].item() + compare_len = min(pb_len, ib_len) + + if compare_len == 0: + print(f" Batch item {b}: No codes to compare (pb_len={pb_len}, ib_len={ib_len})") + continue + + pb_codes_b = pb_unstacked[b, :, :compare_len] 
+ ib_codes_b = ib_codes[b, :, :compare_len] + + matches = (pb_codes_b == ib_codes_b).all() + num_matching = (pb_codes_b == ib_codes_b).sum().item() + total = pb_codes_b.numel() + match_pct = 100.0 * num_matching / total if total > 0 else 0.0 + + print(f" Batch item {b}: pb_len={pb_len}, ib_len={ib_len}, compare_len={compare_len}") + print(f" Audio match: {matches.item()}, {num_matching}/{total} ({match_pct:.1f}%)") + + if not matches: + all_match = False + mismatch_mask = pb_codes_b != ib_codes_b + mismatch_positions = mismatch_mask.nonzero(as_tuple=False) + num_show = min(10, mismatch_positions.size(0)) + for i in range(num_show): + cb, t = mismatch_positions[i].tolist() + print(f" Mismatch at codebook={cb}, time={t}: " + f"pb={pb_codes_b[cb, t].item()}, ib={ib_codes_b[cb, t].item()}") + + return all_match + + +def compare_phoneme_predictions(model, pb_output, ib_output, batch): + """Compare phoneme predictions from process_batch and infer_batch. Returns True if all match.""" + if pb_output.phoneme_logits is None: + print(" No phoneme logits from process_batch (no phoneme tokenizer?). Skipping.") + return True + if ib_output.predicted_phoneme_tokens is None: + print(" No phoneme predictions from infer_batch. 
Skipping.") + return True + + batch_size = batch['text'].size(0) + phoneme_stacking_factor = model.phoneme_stacking_factor + phoneme_vocab_size = model.phoneme_vocab_size + + # Extract argmax phoneme predictions from process_batch logits + # phoneme_logits: (B, T_phoneme, phoneme_stacking_factor * phoneme_vocab_size) + pb_phoneme_logits = pb_output.phoneme_logits + T_phoneme = pb_phoneme_logits.size(1) + + pb_phoneme_preds_list = [] + for sf_idx in range(phoneme_stacking_factor): + si = sf_idx * phoneme_vocab_size + ei = si + phoneme_vocab_size + sf_logits = pb_phoneme_logits[:, :, si:ei] # (B, T_phoneme, V_phoneme) + sf_preds = sf_logits.argmax(dim=-1) # (B, T_phoneme) + pb_phoneme_preds_list.append(sf_preds) + pb_phoneme_preds = torch.stack(pb_phoneme_preds_list, dim=1) # (B, phoneme_stacking_factor, T_phoneme) + pb_phoneme_lens = pb_output.phoneme_tokens_lens_target # (B,) number of phoneme prediction steps + + # infer_batch phoneme predictions: (B, phoneme_stacking_factor, T_all_steps) + ib_phoneme_preds = ib_output.predicted_phoneme_tokens + ib_phoneme_lens = ib_output.predicted_phoneme_tokens_lens + + print(f" process_batch phoneme preds: {pb_phoneme_preds.shape}, lens: {pb_phoneme_lens.tolist()}") + print(f" infer_batch phoneme preds: {ib_phoneme_preds.shape}, lens: {ib_phoneme_lens.tolist()}") + + # Get start indices for infer_batch phoneme predictions + ib_start_idx = ib_output.phoneme_prediction_start_idx # (B,) + + all_match = True + for b in range(batch_size): + pb_len = pb_phoneme_lens[b].item() + ib_len = ib_phoneme_lens[b].item() + compare_len = min(pb_len, ib_len) + + if compare_len == 0: + print(f" Batch item {b}: No phonemes to compare (pb_len={pb_len}, ib_len={ib_len})") + continue + + # process_batch phoneme preds start from 0 (already sliced to prediction region) + pb_ph_b = pb_phoneme_preds[b, :, :compare_len] + + # infer_batch phoneme preds: slice from start_idx for this batch item + start = max(0, ib_start_idx[b].item()) + ib_ph_b = 
ib_phoneme_preds[b, :, start:start + compare_len] + + matches = (pb_ph_b == ib_ph_b).all() + num_matching = (pb_ph_b == ib_ph_b).sum().item() + total = pb_ph_b.numel() + match_pct = 100.0 * num_matching / total if total > 0 else 0.0 + + print(f" Batch item {b}: pb_len={pb_len}, ib_len={ib_len}, compare_len={compare_len}") + print(f" Phoneme match: {matches.item()}, {num_matching}/{total} ({match_pct:.1f}%)") + + if not matches: + all_match = False + mismatch_mask = pb_ph_b != ib_ph_b + mismatch_positions = mismatch_mask.nonzero(as_tuple=False) + num_show = min(10, mismatch_positions.size(0)) + for i in range(num_show): + sf, t = mismatch_positions[i].tolist() + print(f" Mismatch at stacking_factor={sf}, time={t}: " + f"pb={pb_ph_b[sf, t].item()}, ib={ib_ph_b[sf, t].item()}") + + return all_match + + +def run_single_test(model, batch, test_name, device): + """Run a single test comparing process_batch and infer_batch outputs.""" + print(f"\n{'='*60}") + print(f"TEST: {test_name}") + print(f"{'='*60}") + + for k, v in batch.items(): + if isinstance(v, torch.Tensor): + print(f" {k}: shape={v.shape}, dtype={v.dtype}") + + # Run process_batch + print("\n Running process_batch...") + training_mode = model.training_modes[0] + with torch.inference_mode(): + pb_output = model.process_batch( + text=batch['text'], + text_lens=batch['text_lens'], + context_text_tokens=batch['context_text_tokens'], + context_text_tokens_lens=batch['context_text_tokens_lens'], + audio_codes=batch['audio_codes'], + audio_codes_lens=batch['audio_codes_lens'], + context_audio_codes=batch['context_audio_codes'], + context_audio_codes_lens=batch['context_audio_codes_lens'], + phoneme_tokens=batch['phoneme_tokens'], + phoneme_tokens_lens=batch['phoneme_tokens_lens'], + mode='val', + training_mode=training_mode, + ) + + # Run infer_batch (teacher-forced) + print(" Running infer_batch (teacher-forced)...") + ib_output = model.infer_batch( + batch=batch, + max_decoder_steps=1000, + temperature=0.0, + 
topk=80, + use_cfg=False, + use_local_transformer_for_inference=False, + phoneme_input_type='gt', + phoneme_sampling_method='argmax', + use_teacher_forced=True, + ) + + # Compare audio codes + print("\n --- Audio Codes Comparison ---") + audio_match = compare_audio_codes(model, pb_output, ib_output, batch) + + # Compare phoneme predictions + print("\n --- Phoneme Predictions Comparison ---") + phoneme_match = compare_phoneme_predictions(model, pb_output, ib_output, batch) + + success = audio_match and phoneme_match + if success: + print(f"\n ✓ {test_name}: PASSED (audio + phoneme match)") + else: + parts = [] + if not audio_match: + parts.append("audio") + if not phoneme_match: + parts.append("phoneme") + print(f"\n ✗ {test_name}: FAILED ({' and '.join(parts)} mismatch)") + + return success + + +def main(): + parser = argparse.ArgumentParser(description='Test infer_batch vs process_batch') + parser.add_argument('--codecmodel_path', type=str, required=True, help='Path to codec model .nemo file') + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + + device = args.device + print(f"Using device: {device}") + + # 1. 
Build config and model + print("Building minimal config...") + cfg = build_minimal_config(args.codecmodel_path) + + print("Instantiating EasyMagpieTTSModel (tiny NemotronH + real codec)...") + model = EasyMagpieTTSModel(cfg=cfg, trainer=None) + model = model.to(device) + model.eval() + print(f" num_audio_codebooks={model.num_audio_codebooks}, codebook_size={model.codebook_size}") + print(f" frame_stacking_factor={model.frame_stacking_factor}") + print(f" phoneme_vocab_size={model.phoneme_tokenizer.vocab_size}") + + # Define test configurations: (test_name, kwargs_for_create_synthetic_batch) + test_configs = [ + ( + "Uniform lengths (B=2, text=20, audio=30, ctx_text=10, ctx_audio=15, phoneme=25)", + dict( + batch_size=2, + text_lens_list=[20, 20], + audio_frames_list=[30, 30], + context_text_lens_list=[10, 10], + context_audio_frames_list=[15, 15], + phoneme_lens_list=[25, 25], + ), + ), + ( + "Variable text & context lens (B=2, text=[15,25], ctx_text=[8,12], ctx_audio=[10,20])", + dict( + batch_size=2, + text_lens_list=[15, 25], + audio_frames_list=[30, 30], + context_text_lens_list=[8, 12], + context_audio_frames_list=[10, 20], + phoneme_lens_list=[20, 30], + ), + ), + ( + "Variable audio & phoneme lens (B=2, audio=[20,40], phoneme=[15,35])", + dict( + batch_size=2, + text_lens_list=[20, 20], + audio_frames_list=[20, 40], + context_text_lens_list=[10, 10], + context_audio_frames_list=[15, 15], + phoneme_lens_list=[15, 35], + ), + ), + ( + "All different (B=3)", + dict( + batch_size=3, + text_lens_list=[12, 20, 28], + audio_frames_list=[20, 30, 40], + context_text_lens_list=[6, 10, 14], + context_audio_frames_list=[8, 15, 22], + phoneme_lens_list=[15, 25, 35], + ), + ), + ] + + all_passed = True + for test_name, kwargs in test_configs: + batch = create_synthetic_batch(model, device=device, **kwargs) + passed = run_single_test(model, batch, test_name, device) + if not passed: + all_passed = False + + # Final summary + print(f"\n{'='*60}") + if all_passed: + print("✓ 
ALL TESTS PASSED") + else: + print("✗ SOME TESTS FAILED") + sys.exit(1) + print(f"{'='*60}") + + +if __name__ == '__main__': + main() From 4a0e36b8b80a670be464901fe766b90ad8f9c800 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 14:09:47 -0800 Subject: [PATCH 42/94] tested and verified that infer batch works correctly with teacher forcing and matches process batch output Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 58 ++++++++++--------- .../tts/test_infer_vs_process_batch.py | 2 +- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 0bb3aaff8441..580039e3db1b 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2977,33 +2977,35 @@ def streaming_step( state.last_audio_codes = torch.where(update_mask, audio_codes_next_stacked, state.last_audio_codes) # Check for EOS in each frame and track exact end position - # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S) - all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) - - # For each batch item, find if/where EOS occurs in this step's frames - eos_in_sampled = audio_codes_unstacked == self.audio_eos_id # (B, C, S) - eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id # (B, C, S) - eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) - - # Find first frame with EOS per batch item (or S if none) - eos_frame_idx = torch.where( - eos_any_codebook.any(dim=1), - eos_any_codebook.int().argmax(dim=1), # first frame with EOS - torch.full((batch_size,), S, device=device), # no EOS in this step - ) # (B,) - - audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) - state.finished = state.finished | audio_eos_detected - - # Track audio prediction end index (in frames) for items that just ended - newly_ended_audio = 
audio_eos_detected & (state.audio_prediction_end_idx == -1) - if newly_ended_audio.any(): - # End index = current frame count + frame offset where EOS was found - current_frame_count = len(state.all_predictions) * self.frame_stacking_factor - end_frame_idx = current_frame_count + eos_frame_idx - state.audio_prediction_end_idx = torch.where( - newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx - ) + # Skip EOS detection in teacher-forced mode - rely on GT exhaustion instead + if state.gt_audio_embeddings is None: + # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S) + all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) + + # For each batch item, find if/where EOS occurs in this step's frames + eos_in_sampled = audio_codes_unstacked == self.audio_eos_id # (B, C, S) + eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id # (B, C, S) + eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) + + # Find first frame with EOS per batch item (or S if none) + eos_frame_idx = torch.where( + eos_any_codebook.any(dim=1), + eos_any_codebook.int().argmax(dim=1), # first frame with EOS + torch.full((batch_size,), S, device=device), # no EOS in this step + ) # (B,) + + audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) + state.finished = state.finished | audio_eos_detected + + # Track audio prediction end index (in frames) for items that just ended + newly_ended_audio = audio_eos_detected & (state.audio_prediction_end_idx == -1) + if newly_ended_audio.any(): + # End index = current frame count + frame offset where EOS was found + current_frame_count = len(state.all_predictions) * self.frame_stacking_factor + end_frame_idx = current_frame_count + eos_frame_idx + state.audio_prediction_end_idx = torch.where( + newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx + ) # Store unstacked codes state.all_predictions.append(audio_codes_unstacked) @@ -3030,7 +3032,7 @@ def 
_predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: # Sample phonemes if state.phoneme_sampling_method == 'argmax': - pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.01) + pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.0) else: pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py index b9838602586a..006be87ebaa2 100644 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -49,7 +49,7 @@ def build_minimal_config(codecmodel_path: str) -> OmegaConf: 'intermediate_size': 512, 'mlp_hidden_act': 'silu', 'mlp_bias': False, - 'hybrid_override_pattern': 'M*', + 'hybrid_override_pattern': 'M*', # All Mamba layers 'layer_norm_epsilon': 1e-5, 'residual_in_fp32': True, }, From 0518c99606751ef74370a6598d87eaab57e7b6af Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 15:07:14 -0800 Subject: [PATCH 43/94] added legacy option to still work with 21fps F2F model Signed-off-by: Shehzeen Hussain --- examples/tts/magpietts_inference.py | 4 ++++ nemo/collections/tts/models/easy_magpietts.py | 21 +++++++++++++------ .../modules/magpietts_inference/inference.py | 8 +++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index 1e7753798db4..feead6519875 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -507,6 +507,7 @@ def create_argument_parser() -> argparse.ArgumentParser: target_group.add_argument('--cer_target', type=float, default=None) target_group.add_argument('--ssim_target', type=float, default=None) 
target_group.add_argument('--is_decoder_only_model', action='store_true') + target_group.add_argument('--legacy_context_stacking', action='store_true', help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking') target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) target_group.add_argument( '--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial'] @@ -575,6 +576,9 @@ def main(argv=None): phoneme_input_type=args.phoneme_input_type, phoneme_sampling_method=args.phoneme_sampling_method, dropout_text_input=args.dropout_text_input, + legacy_context_stacking=args.legacy_context_stacking, + longform_mode=args.longform_mode, + longform_word_threshold=args.longform_word_threshold, ) eval_config = EvaluationConfig( diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 580039e3db1b..8d7a956ac5d4 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -935,10 +935,15 @@ def local_transformer_sample_autoregressive( ) # (B, num_tokens_per_codebook) codebook_logits_rescored = codebook_logits.clone() codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) if use_cfg: codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] all_preds.append(codebook_preds) @@ -1226,11 +1231,15 @@ def 
prepare_context_tensors( eos_id=self.context_audio_eos_id, ) + # Use legacy audio_bos_id/audio_eos_id if flag is set + stack_bos_id = self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id + stack_eos_id = self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id + context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, - self.context_audio_bos_id, - self.context_audio_eos_id, + stack_bos_id, + stack_eos_id, self.frame_stacking_factor, self.num_audio_codebooks, ) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 9b0db0f7f75e..70ca811f58a2 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -77,6 +77,11 @@ class InferenceConfig: phoneme_input_type: str = "gt" # gt or predicted phoneme_sampling_method: str = "argmax" # argmax or multinomial dropout_text_input: bool = False + legacy_context_stacking: bool = False # Use audio_bos_id/audio_eos_id for context stacking + + # Longform inference mode + longform_mode: str = "auto" # "auto" | "always" | "never" + longform_word_threshold: int = 40 # Word threshold for auto-detection is_decoder_only_model: bool = False def build_identifier(self) -> str: @@ -146,6 +151,9 @@ def __init__( self.model = model self.config = config + # Set legacy context stacking flag on model + self.model.legacy_context_stacking = config.legacy_context_stacking + # Set phoneme probability to 1 for inference self._configure_tokenizer() From fa0fafb7faf3059d7c8f12dda21fa798153e17d4 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sat, 7 Feb 2026 15:31:07 -0800 Subject: [PATCH 44/94] remove streaming decode because it not being used Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 57 
+------------------ 1 file changed, 1 insertion(+), 56 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 8d7a956ac5d4..d48bf3125d10 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2809,7 +2809,7 @@ def streaming_step( # Prediction mode: use BOS or last predicted phoneme first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) has_last_phoneme = ( - needs_phoneme & ~first_phoneme_step & (state.last_phoneme_tokens is not None) + needs_phoneme & (~first_phoneme_step) & (state.last_phoneme_tokens is not None) ) if first_phoneme_step.any(): @@ -3077,61 +3077,6 @@ def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, tor return audio_codes_next, all_codes_next_argmax - def streaming_decode( - self, - state: StreamingState, - previous_decode_length: int = 0, - ) -> Tuple[torch.Tensor, torch.Tensor, int]: - """ - Decode accumulated audio codes to waveform, returning only the new chunk. - - WARNING: This function does not yet support batch_size > 1. - Do not use with batched streaming inference. Use streaming_finalize instead. - - This function takes all predicted codes so far and decodes them, but only - returns the newly generated audio portion (after previous_decode_length). - - Args: - state: Current StreamingState containing all_predictions. - previous_decode_length: Number of audio samples already decoded and returned - in previous calls. Use 0 on first call. 
- - Returns: - Tuple of: - - new_audio: Newly generated audio waveform (1, new_samples) - - new_audio_len: Length of new audio (1,) - - total_decode_length: Total decoded length so far (use as previous_decode_length - for next call) - """ - if len(state.all_predictions) == 0: - return ( - torch.zeros(1, 0, device=state.device), - torch.zeros(1, dtype=torch.long, device=state.device), - previous_decode_length, - ) - - with torch.inference_mode(): - # Concatenate all predictions - each is (1, C, S), concat gives (1, C, T_total_frames) - predicted_codes = torch.cat(state.all_predictions, dim=-1) # (1, C, T_total_frames) - predicted_codes_lens = torch.tensor([predicted_codes.size(-1)], device=state.device) - - # Decode to audio (codes are already unstacked, no EOS removal needed) - audio, audio_len, _ = self.codes_to_audio(predicted_codes, predicted_codes_lens) - - # Extract only new audio - total_decode_length = audio_len[0].item() - if total_decode_length <= previous_decode_length: - return ( - torch.zeros(1, 0, device=state.device), - torch.zeros(1, dtype=torch.long, device=state.device), - previous_decode_length, - ) - - new_audio = audio[:, previous_decode_length:total_decode_length] - new_audio_len = torch.tensor([total_decode_length - previous_decode_length], device=state.device) - - return new_audio, new_audio_len, total_decode_length - def streaming_finalize( self, state: StreamingState, From 432605bc6a227449bec2b1fef370bfa3d6ee0f86 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sun, 8 Feb 2026 09:19:09 -0800 Subject: [PATCH 45/94] pass phoneme EOS to next step Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index d48bf3125d10..8708c6f6732e 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -151,6 +151,7 @@ 
class StreamingState: phoneme_steps: Number of phoneme prediction steps taken per batch item (B,). audio_steps: Number of audio prediction steps taken per batch item (B,). phoneme_stream_ended: Whether the phoneme stream has ended per batch item (B,) bool tensor. + phoneme_eos_detected: Whether the phoneme EOS has been predicted per batch item (B,) bool tensor. finished: Whether generation is complete per batch item (B,) bool tensor. device: Device tensors are on. training_mode: The training mode being used for inference. @@ -187,6 +188,7 @@ class StreamingState: phoneme_steps: torch.Tensor audio_steps: torch.Tensor phoneme_stream_ended: torch.Tensor + phoneme_eos_detected: torch.Tensor finished: torch.Tensor device: torch.device training_mode: TrainingMode @@ -2654,6 +2656,7 @@ def streaming_init( phoneme_steps=torch.zeros(batch_size, dtype=torch.long, device=device), audio_steps=torch.zeros(batch_size, dtype=torch.long, device=device), phoneme_stream_ended=torch.zeros(batch_size, dtype=torch.bool, device=device), + phoneme_eos_detected=torch.zeros(batch_size, dtype=torch.bool, device=device), finished=torch.zeros(batch_size, dtype=torch.bool, device=device), device=device, training_mode=selected_training_mode, @@ -2828,8 +2831,12 @@ def streaming_step( ) # (B, 1, E) last_mask = has_last_phoneme.view(batch_size, 1, 1).float() phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask + + # Only end phoneme stream in prediction mode when the phoneme EOS is detected + state.phoneme_stream_ended = state.phoneme_stream_ended | state.phoneme_eos_detected next_input = next_input + phoneme_emb + # --- Audio embedding for audio phase items --- if needs_audio.any(): @@ -2947,6 +2954,7 @@ def streaming_step( dim=1 ) # (B,) state.phoneme_stream_ended = state.phoneme_stream_ended | phoneme_eos_detected + state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected # Track phoneme prediction end index for items that just ended newly_ended_phoneme = 
phoneme_eos_detected & (state.phoneme_prediction_end_idx == -1) From b239c2f8ad15576e1cd2c666abdd3452c05c5a23 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Sun, 8 Feb 2026 18:50:08 -0800 Subject: [PATCH 46/94] exlcude codec model from optimizer params Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 8708c6f6732e..9f4e74a7141a 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -557,7 +557,7 @@ def setup_optimizer_param_groups(self): """ modules_to_exclude = { '_speaker_verification_model', - # '_codec_model', + '_codec_model', '_eval_asr_model', '_eval_speaker_verification_model', 'whisper_model', From ff68871058dfdf93a591f50d1d373105aad5f588 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 00:59:20 -0800 Subject: [PATCH 47/94] reduce dropout prob, change default delays to 0,1 Signed-off-by: Shehzeen Hussain --- examples/tts/conf/magpietts/easy_magpietts.yaml | 12 ++++++------ .../tts/conf/magpietts/easy_magpietts_lhotse.yaml | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 8c44fef3f173..11ab71ab3a9b 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -73,7 +73,7 @@ model: local_transformer_n_heads: 12 local_transformer_hidden_dim: 1536 - cfg_unconditional_prob: 0.1 + cfg_unconditional_prob: 0.05 # To get special_tokens of the tokenzer, you can do: # model.tokenizer.first_tokenizer.additional_special_tokens @@ -82,15 +82,15 @@ model: # Each mode has its own task embedding that is prepended to the context. 
# During inference, you can specify which mode to use via the 'inference_mode' parameter. training_modes: - - name: "streaming_4_8" + - name: "streaming_0_1" text_input_mode: "streaming" # Options: "full", "streaming" - streaming_phonemes_delay: 4 - streaming_speech_delay: 8 + streaming_phonemes_delay: 0 + streaming_speech_delay: 1 frame_stacking_factor: 2 phoneme_stacking_factor: 1 - dropout_text_input_prob: 0.3 - dropout_phoneme_input_prob: 0.3 + dropout_text_input_prob: 0.1 + dropout_phoneme_input_prob: 0.1 phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index af943ee25dbb..dd6cf50d7c25 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -70,22 +70,22 @@ model: local_transformer_n_heads: 12 local_transformer_hidden_dim: 1536 - cfg_unconditional_prob: 0.1 + cfg_unconditional_prob: 0.05 # Multi-mode training configuration # The model will randomly select one of the modes for each batch during training. # Each mode has its own task embedding that is prepended to the context. # During inference, you can specify which mode to use via the 'inference_mode' parameter. 
training_modes: - - name: "streaming_4_8" + - name: "streaming_0_1" text_input_mode: "streaming" # Options: "full", "streaming" - streaming_phonemes_delay: 4 - streaming_speech_delay: 8 + streaming_phonemes_delay: 0 + streaming_speech_delay: 1 frame_stacking_factor: 2 phoneme_stacking_factor: 1 - dropout_text_input_prob: 0.3 - dropout_phoneme_input_prob: 0.3 + dropout_text_input_prob: 0.1 + dropout_phoneme_input_prob: 0.1 phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer From 021bd9e7d9e01ff855eb3a0a5e5b1f4046b1db01 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 09:12:09 -0800 Subject: [PATCH 48/94] bug fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 9f4e74a7141a..a42b7e6da5f1 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -3011,7 +3011,7 @@ def streaming_step( torch.full((batch_size,), S, device=device), # no EOS in this step ) # (B,) - audio_eos_detected = eos_any_codebook.any(dim=1) # (B,) + audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio state.finished = state.finished | audio_eos_detected # Track audio prediction end index (in frames) for items that just ended From cb2cff1f26138073537205743dc5bac15fd5301c Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 15:38:18 -0800 Subject: [PATCH 49/94] phoneme EOS handling bug fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index a42b7e6da5f1..36b7f0f6b451 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ 
b/nemo/collections/tts/models/easy_magpietts.py @@ -2953,7 +2953,7 @@ def streaming_step( ).any( dim=1 ) # (B,) - state.phoneme_stream_ended = state.phoneme_stream_ended | phoneme_eos_detected + state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected # Track phoneme prediction end index for items that just ended From 386814d56b20b7f7443301947ee606206acdd37d Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 18:17:56 -0800 Subject: [PATCH 50/94] phoneme corruption methodology implemented Signed-off-by: Shehzeen Hussain --- .../tts/conf/magpietts/easy_magpietts.yaml | 5 +- .../conf/magpietts/easy_magpietts_lhotse.yaml | 5 +- .../tts/data/text_to_speech_dataset_lhotse.py | 3 +- nemo/collections/tts/models/easy_magpietts.py | 132 ++++++++++++++++-- .../tts/test_infer_vs_process_batch.py | 1 - 5 files changed, 128 insertions(+), 18 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 11ab71ab3a9b..9545897ceda3 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -89,8 +89,11 @@ model: frame_stacking_factor: 2 phoneme_stacking_factor: 1 + phoneme_confidence_unk_threshold: 0.35 dropout_text_input_prob: 0.1 - dropout_phoneme_input_prob: 0.1 + phoneme_corruption_batch_prob: 0.1 + phoneme_corruption_timestep_ratio: 0.15 + phoneme_corruption_unk_mode_prob: 0.5 phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index dd6cf50d7c25..19d39f4cf320 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -84,8 +84,11 @@ model: frame_stacking_factor: 2 phoneme_stacking_factor: 1 + phoneme_confidence_unk_threshold: 0.35 dropout_text_input_prob: 0.1 
- dropout_phoneme_input_prob: 0.1 + phoneme_corruption_batch_prob: 0.1 + phoneme_corruption_timestep_ratio: 0.15 + phoneme_corruption_unk_mode_prob: 0.5 phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index ba111838efa3..464b988b9415 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -65,7 +65,8 @@ def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): phoneme_vocab_size = len(phoneme_tokenizer.tokens) phoneme_tokenizer.bos_token_id = phoneme_vocab_size phoneme_tokenizer.eos_token_id = phoneme_vocab_size + 1 - phoneme_tokenizer.vocab_size = phoneme_vocab_size + 2 + phoneme_tokenizer.unk_token_id = phoneme_vocab_size + 2 + phoneme_tokenizer.vocab_size = phoneme_vocab_size + 3 return phoneme_tokenizer diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 36b7f0f6b451..2fc5da261fe6 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -373,11 +373,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.cfg_unk_token_id = num_tokens - 1 self.phoneme_tokenizer = None self.dropout_text_input_prob = cfg.get('dropout_text_input_prob', 0.0) - self.dropout_phoneme_input_prob = cfg.get('dropout_phoneme_input_prob', 0.0) + self.phoneme_corruption_batch_prob = cfg.get('phoneme_corruption_batch_prob', 0.0) + self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) + self.phoneme_corruption_unk_mode_prob = cfg.get('phoneme_corruption_unk_mode_prob', 0.5) if cfg.get('phoneme_tokenizer', None) is not None: self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = 
cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + # If max phoneme probability is below this threshold at inference-time, + # replace the predicted timestep with UNK to reduce error propagation. + self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.35) self.pad_context_text_to_max_duration = False @@ -1341,8 +1346,9 @@ def prepare_phoneme_channel_embeddings( phoneme_tokens: torch.Tensor, phoneme_tokens_lens: torch.Tensor, delay: torch.Tensor, - dropout_phoneme_input: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + apply_corruption: bool = False, + dropout_complete_phoneme_channel: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[str]]: """ Prepare phoneme embeddings as a channel input with delay handling. @@ -1355,7 +1361,8 @@ def prepare_phoneme_channel_embeddings( phoneme_tokens_lens: Length of phoneme tokens for each batch item (B,) delay: Number of zero positions to prepend for each batch item (B,). This is typically context_lens + phoneme_delay. - dropout_phoneme_input: If True, return all zeros (for phoneme dropout regularization). + apply_corruption: If True, apply phoneme-token corruption before embedding. + dropout_complete_phoneme_channel: If True, zero-out the whole phoneme channel embedding. 
Returns: Tuple of: @@ -1363,6 +1370,7 @@ def prepare_phoneme_channel_embeddings( - phoneme_channel_lens: Total length of phoneme channel for each batch item (B,) - phoneme_tokens_stacked: Stacked phoneme tokens (B, S, T') - phoneme_tokens_lens_stacked: Length of stacked phoneme tokens (B,) + - corruption_mode: None, "unk", or "repeat_skip" """ batch_size = phoneme_tokens.size(0) device = phoneme_tokens.device @@ -1378,6 +1386,13 @@ def prepare_phoneme_channel_embeddings( 1, ) + phoneme_corruption_mode = None + if apply_corruption: + phoneme_tokens_stacked, phoneme_corruption_mode = self.corrupt_stacked_phoneme_tokens( + phoneme_tokens_stacked=phoneme_tokens_stacked, + phoneme_tokens_lens_stacked=phoneme_tokens_lens_stacked, + ) + # Embed phoneme tokens phoneme_embedded = self.embed_phoneme_tokens(phoneme_tokens_stacked) # (B, T', E) @@ -1386,7 +1401,7 @@ def prepare_phoneme_channel_embeddings( phoneme_embedded = phoneme_embedded * phoneme_mask.unsqueeze(2) # (B, T', E) # Handle phoneme dropout - zero out the embeddings - if dropout_phoneme_input: + if dropout_complete_phoneme_channel: phoneme_embedded = phoneme_embedded * 0.0 # Create zero tensor for delay padding @@ -1399,7 +1414,78 @@ def prepare_phoneme_channel_embeddings( lengths=[delay, phoneme_tokens_lens_stacked], ) - return phoneme_channel_embedding, phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked + return ( + phoneme_channel_embedding, + phoneme_channel_lens, + phoneme_tokens_stacked, + phoneme_tokens_lens_stacked, + phoneme_corruption_mode, + ) + + def corrupt_stacked_phoneme_tokens( + self, + phoneme_tokens_stacked: torch.Tensor, + phoneme_tokens_lens_stacked: torch.Tensor, + ) -> Tuple[torch.Tensor, Optional[str]]: + """ + Corrupt stacked phoneme tokens for robustness to phoneme prediction errors. + + Two corruption modes are supported: + 1. UNK replacement at selected timesteps (all stacked channels replaced). + 2. 
Repeat/skip corruption via a shared index remapping over the valid prefix. + """ + if self.phoneme_tokenizer is None: + return phoneme_tokens_stacked, None + if self.phoneme_corruption_batch_prob <= 0.0: + return phoneme_tokens_stacked, None + if self.phoneme_corruption_timestep_ratio <= 0.0: + return phoneme_tokens_stacked, None + if torch.rand(1).item() >= self.phoneme_corruption_batch_prob: + return phoneme_tokens_stacked, None + + min_len = int(phoneme_tokens_lens_stacked.min().item()) + # Need room for BOS and EOS plus at least one interior timestep. + if min_len <= 2: + return phoneme_tokens_stacked, None + + # Corrupt only interior steps, keeping BOS/EOS untouched. + valid_start = 1 + valid_end = min_len - 1 # exclusive + num_valid_steps = max(0, valid_end - valid_start) + if num_valid_steps == 0: + return phoneme_tokens_stacked, None + + num_corrupt_steps = int(round(num_valid_steps * self.phoneme_corruption_timestep_ratio)) + num_corrupt_steps = max(1, min(num_valid_steps, num_corrupt_steps)) + + corrupted = phoneme_tokens_stacked.clone() + mode = 'unk' if torch.rand(1).item() < self.phoneme_corruption_unk_mode_prob else 'repeat_skip' + + candidate_steps = torch.arange(valid_start, valid_end, device=phoneme_tokens_stacked.device) + corrupt_steps = candidate_steps[torch.randperm(num_valid_steps, device=phoneme_tokens_stacked.device)][ + :num_corrupt_steps + ] + + if mode == 'unk': + if not hasattr(self.phoneme_tokenizer, 'unk_token_id'): + raise ValueError("Phoneme tokenizer is missing `unk_token_id` required for UNK corruption.") + corrupted[:, :, corrupt_steps] = self.phoneme_tokenizer.unk_token_id + return corrupted, mode + + # Repeat/skip corruption with a shared remap over [0, min_len). + # This keeps batched execution efficient and applies the same corrupted timeline across the batch. 
+ source_index = torch.arange(min_len, device=phoneme_tokens_stacked.device, dtype=torch.long) + step_delta = torch.ones(min_len, device=phoneme_tokens_stacked.device, dtype=torch.long) + op_is_repeat = torch.rand(corrupt_steps.numel(), device=phoneme_tokens_stacked.device) < 0.5 + step_delta[corrupt_steps] = torch.where(op_is_repeat, torch.zeros_like(corrupt_steps), torch.full_like(corrupt_steps, 2)) + source_index = torch.cumsum(step_delta, dim=0) - step_delta[0] + source_index = torch.clamp(source_index, min=0, max=min_len - 1) + source_index[0] = 0 + source_index[-1] = min_len - 1 + + corrupted_prefix = phoneme_tokens_stacked[:, :, :min_len].index_select(dim=2, index=source_index) + corrupted[:, :, :min_len] = corrupted_prefix + return corrupted, mode def prepare_audio_channel_embeddings( self, @@ -1657,10 +1743,7 @@ def process_batch( # Determine dropout flags dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False - dropout_phoneme_input = (random.random() < self.dropout_phoneme_input_prob) if mode == 'train' else False - if dropout_phoneme_input and dropout_text_input: - dropout_phoneme_input = random.random() < 0.5 - dropout_text_input = not dropout_phoneme_input + dropout_phoneme_input = False # Determine CFG unconditional dropout dropout_conditional_input = False @@ -1707,17 +1790,24 @@ def process_batch( phoneme_channel_embedding = None phoneme_tokens_stacked = None phoneme_tokens_lens_stacked = None + phoneme_corruption_mode = None + dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: + # Corrupt phonemes only when text input is not dropped. 
+ apply_phoneme_corruption = mode == 'train' and not dropout_text_input and not dropout_conditional_input + dropout_complete_phoneme_channel = dropout_conditional_input ( phoneme_channel_embedding, phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked, + phoneme_corruption_mode, ) = self.prepare_phoneme_channel_embeddings( phoneme_tokens=phoneme_tokens, phoneme_tokens_lens=phoneme_tokens_lens, delay=phoneme_delay, - dropout_phoneme_input=dropout_phoneme_input or dropout_conditional_input, + apply_corruption=apply_phoneme_corruption, + dropout_complete_phoneme_channel=dropout_complete_phoneme_channel, ) # 5. Prepare audio channel embeddings @@ -1854,14 +1944,12 @@ def process_batch( pb_phoneme_tokens_target = phoneme_tokens_stacked[:, :, 1:].long() pb_phoneme_tokens_lens_target = phoneme_tokens_lens_stacked - 1 - if not (dropout_conditional_input or dropout_text_input or dropout_phoneme_input): + if phoneme_corruption_mode != 'repeat_skip' and not dropout_complete_phoneme_channel: phoneme_loss, _ = self.compute_phoneme_loss( pb_phoneme_logits, pb_phoneme_tokens_target, pb_phoneme_tokens_lens_target ) - print("No Dropout - phoneme loss:", phoneme_loss.item()) else: phoneme_loss = torch.tensor(0.0, device=logits.device) - print("Dropout - phoneme loss skipped", phoneme_loss.item()) loss = loss + phoneme_loss @@ -3046,6 +3134,8 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: # Get phoneme logits all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] + phoneme_logits = all_code_logits_t_phoneme.view(actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size) + max_probs = torch.softmax(phoneme_logits, dim=-1).max(dim=-1).values # (B, phoneme_stacking_factor) # Sample phonemes if state.phoneme_sampling_method == 'argmax': @@ -3054,6 +3144,20 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: 
pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk ) + + # In prediction mode, low-confidence phoneme steps are replaced with UNK across + # all stacked channels (except steps where EOS is predicted). + if ( + state.phoneme_input_type != 'gt' + and hasattr(self.phoneme_tokenizer, 'unk_token_id') + and self.phoneme_confidence_unk_threshold > 0.0 + ): + underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any(dim=1, keepdim=True) # (B, 1) + eos_predicted_step = (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1, keepdim=True) + replace_with_unk = underconfident_step & (~eos_predicted_step) + if replace_with_unk.any(): + unk_tokens = torch.full_like(pred_phoneme_tokens, self.phoneme_tokenizer.unk_token_id) + pred_phoneme_tokens = torch.where(replace_with_unk, unk_tokens, pred_phoneme_tokens) # (B, phoneme_stacking_factor) return pred_phoneme_tokens diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py index 006be87ebaa2..d225136989f1 100644 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -82,7 +82,6 @@ def build_minimal_config(codecmodel_path: str) -> OmegaConf: 'frame_stacking_factor': 2, 'cfg_unconditional_prob': 0.0, 'dropout_text_input_prob': 0.0, - 'dropout_phoneme_input_prob': 0.0, 'local_transformer_type': 'none', 'run_val_inference': False, # Optim placeholder (required by ModelPT but not used) From 2bd08ed66180ccddbb3046bfa593fc541d045159 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 18:28:19 -0800 Subject: [PATCH 51/94] revisit defaults and update Signed-off-by: Shehzeen Hussain --- examples/tts/conf/magpietts/easy_magpietts.yaml | 2 +- examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml | 2 +- nemo/collections/tts/models/easy_magpietts.py | 5 ++--- 3 files changed, 4 
insertions(+), 5 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 9545897ceda3..ef2ad794c2d0 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -89,7 +89,7 @@ model: frame_stacking_factor: 2 phoneme_stacking_factor: 1 - phoneme_confidence_unk_threshold: 0.35 + phoneme_confidence_unk_threshold: 0.0 # If max phoneme probability is below this threshold at inference-time, replace the predicted timestep with UNK to reduce error propagation. dropout_text_input_prob: 0.1 phoneme_corruption_batch_prob: 0.1 phoneme_corruption_timestep_ratio: 0.15 diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 19d39f4cf320..a6330272a1da 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -84,7 +84,7 @@ model: frame_stacking_factor: 2 phoneme_stacking_factor: 1 - phoneme_confidence_unk_threshold: 0.35 + phoneme_confidence_unk_threshold: 0.0 # If max phoneme probability is below this threshold at inference-time, replace the predicted timestep with UNK to reduce error propagation. dropout_text_input_prob: 0.1 phoneme_corruption_batch_prob: 0.1 phoneme_corruption_timestep_ratio: 0.15 diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 2fc5da261fe6..eec9e58a4161 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -382,7 +382,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size # If max phoneme probability is below this threshold at inference-time, # replace the predicted timestep with UNK to reduce error propagation. 
- self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.35) + self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) self.pad_context_text_to_max_duration = False @@ -1743,7 +1743,6 @@ def process_batch( # Determine dropout flags dropout_text_input = (random.random() < self.dropout_text_input_prob) if mode == 'train' else False - dropout_phoneme_input = False # Determine CFG unconditional dropout dropout_conditional_input = False @@ -1794,7 +1793,7 @@ def process_batch( dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: # Corrupt phonemes only when text input is not dropped. - apply_phoneme_corruption = mode == 'train' and not dropout_text_input and not dropout_conditional_input + apply_phoneme_corruption = mode == 'train' and (not dropout_text_input) and (not dropout_conditional_input) dropout_complete_phoneme_channel = dropout_conditional_input ( phoneme_channel_embedding, From 18e39b04d91093c881bf70846c033029e6944bce Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Feb 2026 22:23:00 -0800 Subject: [PATCH 52/94] bug fix phoneme loss Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index eec9e58a4161..6b49977013e8 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -1348,7 +1348,7 @@ def prepare_phoneme_channel_embeddings( delay: torch.Tensor, apply_corruption: bool = False, dropout_complete_phoneme_channel: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[str]]: + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[str]]: """ Prepare phoneme embeddings as a channel input with delay 
handling. @@ -1370,6 +1370,7 @@ def prepare_phoneme_channel_embeddings( - phoneme_channel_lens: Total length of phoneme channel for each batch item (B,) - phoneme_tokens_stacked: Stacked phoneme tokens (B, S, T') - phoneme_tokens_lens_stacked: Length of stacked phoneme tokens (B,) + - phoneme_tokens_stacked_clean: Clean stacked phoneme tokens before corruption (B, S, T') - corruption_mode: None, "unk", or "repeat_skip" """ batch_size = phoneme_tokens.size(0) @@ -1385,6 +1386,7 @@ def prepare_phoneme_channel_embeddings( self.phoneme_stacking_factor, 1, ) + phoneme_tokens_stacked_clean = phoneme_tokens_stacked.clone() phoneme_corruption_mode = None if apply_corruption: @@ -1419,6 +1421,7 @@ def prepare_phoneme_channel_embeddings( phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked, + phoneme_tokens_stacked_clean, phoneme_corruption_mode, ) @@ -1789,6 +1792,7 @@ def process_batch( phoneme_channel_embedding = None phoneme_tokens_stacked = None phoneme_tokens_lens_stacked = None + phoneme_tokens_stacked_clean = None phoneme_corruption_mode = None dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: @@ -1800,6 +1804,7 @@ def process_batch( phoneme_channel_lens, phoneme_tokens_stacked, phoneme_tokens_lens_stacked, + phoneme_tokens_stacked_clean, phoneme_corruption_mode, ) = self.prepare_phoneme_channel_embeddings( phoneme_tokens=phoneme_tokens, @@ -1940,10 +1945,10 @@ def process_batch( target_lens=phoneme_tokens_lens_stacked - 1, ) pb_phoneme_logits = self.phoneme_final_proj(pred_embeddings_phoneme) - pb_phoneme_tokens_target = phoneme_tokens_stacked[:, :, 1:].long() + pb_phoneme_tokens_target = phoneme_tokens_stacked_clean[:, :, 1:].long() pb_phoneme_tokens_lens_target = phoneme_tokens_lens_stacked - 1 - if phoneme_corruption_mode != 'repeat_skip' and not dropout_complete_phoneme_channel: + if (phoneme_corruption_mode != 'repeat_skip') and not (dropout_complete_phoneme_channel or 
dropout_conditional_input or dropout_text_input): phoneme_loss, _ = self.compute_phoneme_loss( pb_phoneme_logits, pb_phoneme_tokens_target, pb_phoneme_tokens_lens_target ) From e5d141b93b0328c52a66ecdd3ffac8abea47d1e7 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Feb 2026 09:47:31 -0800 Subject: [PATCH 53/94] another inference bug fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 6b49977013e8..b92ebf03d8ac 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -380,6 +380,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + self.phoneme_vocab_size -= 1 # If max phoneme probability is below this threshold at inference-time, # replace the predicted timestep with UNK to reduce error propagation. 
self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) @@ -412,6 +413,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): for _ in range(self.phoneme_stacking_factor): phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) + print("phoneme_vocab_size for final proj.", self.phoneme_vocab_size) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) # Decoder backend selection - supports HuggingFace models or NemotronH @@ -2832,8 +2834,8 @@ def streaming_step( # ==================== DETERMINE PHASES PER BATCH ITEM ==================== needs_context = state.context_position < state.full_context_lens # (B,) bool needs_text = (~needs_context) & (~state.text_finished) - needs_phoneme = (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) - needs_audio = (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) + needs_phoneme = (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + needs_audio = (~needs_context) & (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) next_input = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) # --- Context phase items: use next context embedding --- @@ -2874,7 +2876,7 @@ def streaming_step( # The EOS token itself IS embedded normally (matching process_batch behavior # where EOS is part of the text sequence). After this step, text_finished is set # so subsequent steps won't add any text embedding. 
- is_eos_token = text_tokens == self.eos_id # (B,) bool + is_eos_token = text_tokens == self.eos_id & needs_text # (B,) bool text_add_mask = needs_text.view(batch_size, 1, 1).float() next_input = next_input + text_embedded * text_add_mask state.text_finished = state.text_finished | is_eos_token From b1b86f0c68d5fd81c893f8521da938a0b12e64ad Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Feb 2026 09:51:12 -0800 Subject: [PATCH 54/94] phoneme vocab size fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index b92ebf03d8ac..33e0c6cf0aef 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -380,7 +380,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size - self.phoneme_vocab_size -= 1 # If max phoneme probability is below this threshold at inference-time, # replace the predicted timestep with UNK to reduce error propagation. 
self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) From 4a872aaa9c9328f03d26e99482cc508c0e93585b Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Feb 2026 10:04:16 -0800 Subject: [PATCH 55/94] bug fix Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 33e0c6cf0aef..9a12b2480adf 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2875,7 +2875,7 @@ def streaming_step( # The EOS token itself IS embedded normally (matching process_batch behavior # where EOS is part of the text sequence). After this step, text_finished is set # so subsequent steps won't add any text embedding. - is_eos_token = text_tokens == self.eos_id & needs_text # (B,) bool + is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool text_add_mask = needs_text.view(batch_size, 1, 1).float() next_input = next_input + text_embedded * text_add_mask state.text_finished = state.text_finished | is_eos_token From beaee7b944e8a2f0812c3c67e156a36cf2850422 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Feb 2026 11:46:59 -0800 Subject: [PATCH 56/94] handle legacy model phoneme vocab size Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 9a12b2480adf..ca3b1eb56c2e 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -380,6 +380,10 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = 
cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + if cfg.get('phoneme_corruption_batch_prob', None) is None: + # Legacy mode: remove the UNK token from the phoneme vocabulary + # TODO: Remove this. + self.phoneme_vocab_size -= 1 # If max phoneme probability is below this threshold at inference-time, # replace the predicted timestep with UNK to reduce error propagation. self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) @@ -412,7 +416,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): for _ in range(self.phoneme_stacking_factor): phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) - print("phoneme_vocab_size for final proj.", self.phoneme_vocab_size) self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) # Decoder backend selection - supports HuggingFace models or NemotronH From 0879a1251ba737408b35c00848ac37659e53789f Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 11 Feb 2026 12:06:15 -0800 Subject: [PATCH 57/94] context duration handling - stop repeating excessively Signed-off-by: Shehzeen Hussain --- .../tts/data/text_to_speech_dataset.py | 18 ++++++++++++++---- .../tts/data/text_to_speech_dataset_lhotse.py | 18 ++++++++++++++---- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index e25e703f52ee..f680a8d9eb34 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -420,6 +420,12 @@ def get_num_audio_samples_to_slice(self, duration, sample_rate): def __getitem__(self, index): data = self.data_samples[index] + + def _sample_context_duration_with_available_limit(available_duration_sec: float) -> float: + 
effective_duration_max = min(self.context_duration_max, available_duration_sec) + effective_duration_max = max(self.context_duration_min, effective_duration_max) + return random.uniform(self.context_duration_min, effective_duration_max) + tokenizer_name = "english_phoneme" # Default to english phoneme tokenizer if data.tokenizer_names is not None: # Pick a random tokenizer from the list of tokenizers @@ -489,8 +495,10 @@ def __getitem__(self, index): if self.load_cached_codes_if_available and 'context_audio_codes_path' in data.manifest_entry: context_audio_codes_path = data.manifest_entry['context_audio_codes_path'] context_audio_codes = torch.load(context_audio_codes_path) # (8, T) - # Sample random duration between self.context_duration_min and self.context_duration_max - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = ( + context_audio_codes.shape[1] * self.codec_model_samples_per_frame / self.sample_rate + ) + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_frames_to_slice = int( _context_duration_to_slice * self.sample_rate / self.codec_model_samples_per_frame ) @@ -517,7 +525,8 @@ def __getitem__(self, index): duration=context_duration, ) context_audio_array = context_audio_array.samples - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = len(context_audio_array) / self.sample_rate + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_samples_to_slice = self.get_num_audio_samples_to_slice(_context_duration_to_slice, self.sample_rate) if _num_samples_to_slice < len(context_audio_array): start_idx = random.randint(0, len(context_audio_array) - _num_samples_to_slice) @@ -566,7 +575,8 @@ def __getitem__(self, index): sample_rate=16000, volume_norm=self.volume_norm, ) - 
_context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = len(audio_array_16khz) / 16000 + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_samples_to_slice = int(_context_duration_to_slice * 16000) if _num_samples_to_slice < len(audio_array_16khz): start_idx = random.randint(0, len(audio_array_16khz) - _num_samples_to_slice) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 464b988b9415..56a80d6af63c 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -232,6 +232,12 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: ) # raw text here is the string of normalized text or text stored in the supervision segment. Used to distinguish from text tokens. phoneme_token_list = [] phoneme_token_len_list = [] + + def _sample_context_duration_with_available_limit(available_duration_sec: float) -> float: + effective_duration_max = min(self.context_duration_max, available_duration_sec) + effective_duration_max = max(self.context_duration_min, effective_duration_max) + return random.uniform(self.context_duration_min, effective_duration_max) + for cut in cuts: speaker = cut.supervisions[0].speaker if not check_speaker_format(speaker): @@ -276,8 +282,10 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: # and duration are None to the load function. 
context_audio_codes_array = cut.context_codes.load().astype(np.int32) context_audio_codes = torch.from_numpy(context_audio_codes_array) # (C, T) - # Sample random duration between self.context_duration_min and self.context_duration_max - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = ( + context_audio_codes.shape[1] * self.codec_model_samples_per_frame / self.sample_rate + ) + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_frames_to_slice = int( _context_duration_to_slice * self.sample_rate / self.codec_model_samples_per_frame ) @@ -301,7 +309,8 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: context_audio_array = cut.context_audio.resample(self.sample_rate).load_audio().squeeze(0) if self.volume_norm: context_audio_array = normalize_volume(context_audio_array) - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = len(context_audio_array) / self.sample_rate + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_samples_to_slice = self.get_num_audio_samples_to_slice( _context_duration_to_slice, self.sample_rate ) @@ -351,7 +360,8 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: audio_array_16khz = cut.target_audio.resample(16_000).load_audio().squeeze(0) if self.volume_norm: audio_array_16khz = normalize_volume(audio_array_16khz) - _context_duration_to_slice = random.uniform(self.context_duration_min, self.context_duration_max) + _available_context_duration = len(audio_array_16khz) / 16_000 + _context_duration_to_slice = _sample_context_duration_with_available_limit(_available_context_duration) _num_samples_to_slice = int(_context_duration_to_slice * 16_000) if _num_samples_to_slice < len(audio_array_16khz): start_idx = 
random.randint(0, len(audio_array_16khz) - _num_samples_to_slice) From ae557ac2696516216f759855b248e951a18459b7 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 11 Feb 2026 12:24:32 -0800 Subject: [PATCH 58/94] clamp cer and wer to 1 Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index ca3b1eb56c2e..47138ac18094 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2263,8 +2263,8 @@ def validation_step(self, batch, batch_idx): if pred_transcripts[idx] is None: continue gt_transcript = process_text_for_cer(batch['raw_texts'][idx]) - cer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=True) - wer = word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=False) + cer = min(word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=True), 1.0) + wer = min(word_error_rate([pred_transcripts[idx]], [gt_transcript], use_cer=False), 1.0) batch_cer.append(cer) batch_wer.append(wer) ssim = None From 3d69a1253fa27d7b787ad5e3ef5717cb96c14020 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 17 Feb 2026 00:13:57 -0800 Subject: [PATCH 59/94] Preference Optimization for EasyMagpieTTS (#64) * PO for EM-TTS Signed-off-by: Shehzeen Hussain * add PO mode in training Signed-off-by: Shehzeen Hussain * PO code update Signed-off-by: Shehzeen Hussain * wip Signed-off-by: Shehzeen Hussain * wip Signed-off-by: Shehzeen Hussain * wip Signed-off-by: Shehzeen Hussain * wip Signed-off-by: Shehzeen Hussain * bug fixes Signed-off-by: Shehzeen Hussain * logging for gradient tracking Signed-off-by: Shehzeen Hussain * GRPO working Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen Hussain Signed-off-by: Shehzeen Hussain --- examples/tts/easy_magpietts.py | 24 +- 
.../tts/data/text_to_speech_dataset_lhotse.py | 5 +- nemo/collections/tts/models/__init__.py | 2 + nemo/collections/tts/models/easy_magpietts.py | 50 +- .../easy_magpietts_preference_optimization.py | 1141 +++++++++++++++++ nemo/core/classes/modelPT.py | 2 +- 6 files changed, 1208 insertions(+), 16 deletions(-) create mode 100644 nemo/collections/tts/models/easy_magpietts_preference_optimization.py diff --git a/examples/tts/easy_magpietts.py b/examples/tts/easy_magpietts.py index 4195060b87ef..5e9be71a7805 100644 --- a/examples/tts/easy_magpietts.py +++ b/examples/tts/easy_magpietts.py @@ -14,9 +14,9 @@ import lightning.pytorch as pl import torch.multiprocessing as mp -from omegaconf import OmegaConf +from omegaconf import OmegaConf, open_dict -from nemo.collections.tts.models import EasyMagpieTTSModel +from nemo.collections.tts.models import EasyMagpieTTSModel, EasyMagpieTTSModelOnlinePO from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -42,15 +42,25 @@ def main(cfg): trainer.callbacks.append(pl.callbacks.LearningRateMonitor(logging_interval='step', log_weight_decay=True)) exp_manager(trainer, cfg.get("exp_manager", None)) - model = EasyMagpieTTSModel(cfg=cfg.model, trainer=trainer) + mode = cfg.get('mode', 'train') + if mode == 'train': + model = EasyMagpieTTSModel(cfg=cfg.model, trainer=trainer) + elif mode == 'onlinepo_train': + model_cfg = cfg.model + with open_dict(model_cfg): + model_cfg.reference_model_ckpt_path = cfg.init_from_ptl_ckpt + model = EasyMagpieTTSModelOnlinePO(cfg=model_cfg, trainer=trainer) + elif mode == 'test': + model = EasyMagpieTTSModel(cfg=cfg.model, trainer=trainer) + else: + raise NotImplementedError(f"Only train, onlinepo_train and test modes are supported. 
Got {mode}") + model.maybe_init_from_pretrained_checkpoint(cfg=cfg) - if cfg.get('mode', 'train') == 'train': + if mode in ['train', 'onlinepo_train']: trainer.fit(model) - elif cfg.get('mode', 'train') == 'test': + elif mode == 'test': trainer.test(model) - else: - raise NotImplementedError(f"Only train and test modes are supported. Got {cfg.mode}") if __name__ == '__main__': diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 56a80d6af63c..ffd6b5629cc4 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -244,7 +244,10 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) raise ValueError(f"Invalid format in cut.supervisions[0].speaker: {speaker}") dataset_name = speaker.strip().split()[2].split(":")[-1] dataset_name_list.append(dataset_name) - language = cut.supervisions[0].language if cut.supervisions[0].has_custom("language") else "en" + if cut.has_custom("lang"): + language = cut.lang + else: + language = cut.supervisions[0].language if cut.supervisions[0].has_custom("language") else "en" language_list.append(language) # target audio or target codes diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 20984cfccc6a..0783c79bacab 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -15,6 +15,7 @@ from nemo.collections.tts.models.aligner import AlignerModel from nemo.collections.tts.models.audio_codec import AudioCodecModel from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel +from nemo.collections.tts.models.easy_magpietts_preference_optimization import EasyMagpieTTSModelOnlinePO from nemo.collections.tts.models.fastpitch import FastPitchModel from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL from 
nemo.collections.tts.models.hifigan import HifiGanModel @@ -36,6 +37,7 @@ "InferBatchOutput", "MagpieTTSModel", "EasyMagpieTTSModel", + "EasyMagpieTTSModelOnlinePO", "MagpieTTSModelOfflinePODataGen", "MagpieTTSModelOfflinePO", "MagpieTTSModelOnlinePO", diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 47138ac18094..5dd61563788d 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -454,6 +454,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) self.decoder.set_input_embeddings(self.text_embedding) + # self.decoder.float() # Task embedding for multi-mode training # Each mode has a unique task embedding that is prepended to the context @@ -718,6 +719,14 @@ def codes_to_audio(self, codes, codes_len): # Pass the modified integer token IDs if self._codec_converter is not None: codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) + if codes_len.min() < 4: + # Pad the codes with 0s to make the minimum length 4 + # codes is (B, C, T) + codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) + # Updates all lens less than 4 to 4 + codes_len = torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) + codes = codes[:,:,:codes_len.max()] + audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) # audio: (B, T) # audio_len: (B,) @@ -934,6 +943,12 @@ def local_transformer_sample_autoregressive( cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits codebook_logits[:actual_batch_size] = cfg_logits + # Replace NaN/inf then clamp to prevent extreme values (e.g. 
from CFG) causing NaN in softmax + # print("codebook_logits stats before nan_to_num") + # print(f"min: {codebook_logits.min()}, max: {codebook_logits.max()}, mean: {codebook_logits.mean()}, std: {codebook_logits.std()}") + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + for item_idx in unfinished_items: codebook_logits[item_idx, self.audio_eos_id] = float('-inf') for item_idx in finished_items: @@ -985,6 +1000,9 @@ def sample_codes_from_logits( si = idx * self.num_all_tokens_per_codebook ei = si + self.num_all_tokens_per_codebook codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) for item_idx in unfinished_items: codebook_logits[item_idx, self.audio_eos_id] = float('-inf') for item_idx in finished_items: @@ -1016,6 +1034,9 @@ def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, t si = idx * self.phoneme_vocab_size ei = si + self.phoneme_vocab_size codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( -1 @@ -2145,11 +2166,11 @@ def validation_step(self, batch, batch_idx): if self.run_val_inference: infer_output = self.infer_batch( batch, - max_decoder_steps=220, + max_decoder_steps=300, temperature=0.7, topk=80, 
use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, - use_cfg=True, + use_cfg=self.cfg.get('inference_use_cfg_in_val', True), cfg_scale=2.5 ) @@ -2610,6 +2631,7 @@ def streaming_init( gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, gt_audio_codes: Optional[torch.Tensor] = None, gt_audio_codes_lens: Optional[torch.Tensor] = None, + use_inference_mode: bool = True, ) -> StreamingState: """ Initialize streaming TTS inference state. @@ -2655,7 +2677,8 @@ def streaming_init( Returns: StreamingState: Initial state for streaming inference. """ - with torch.inference_mode(): + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): batch_size = context_audio_codes.size(0) device = context_audio_codes.device @@ -2785,6 +2808,7 @@ def streaming_step( state: StreamingState, text_tokens: Optional[torch.Tensor] = None, force_dropout_text: bool = False, + use_inference_mode: bool = True, ) -> Tuple[StreamingState, Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform one streaming inference step with batch support. @@ -2827,7 +2851,8 @@ def streaming_step( if state.finished.all(): return state, None, None - with torch.inference_mode(): + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): device = state.device batch_size = state.batch_size streaming_speech_delay = state.training_mode.streaming_speech_delay @@ -3200,6 +3225,7 @@ def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, tor def streaming_finalize( self, state: StreamingState, + use_inference_mode: bool = True, ) -> StreamingFinalizeOutput: """ Finalize streaming and return the complete generated audio and phoneme predictions. 
@@ -3249,7 +3275,8 @@ def streaming_finalize( phoneme_text=phoneme_text_list, ) - with torch.inference_mode(): + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): # Concatenate all predictions - each is (B, C, S), concat gives (B, C, T_total_frames) all_codes = torch.cat(state.all_predictions, dim=-1) # (B, C, T_total_frames) total_frames = all_codes.size(-1) @@ -3317,6 +3344,7 @@ def infer_batch( phoneme_sampling_method: str = 'argmax', force_dropout_text: bool = False, use_teacher_forced: bool = False, + use_inference_mode: bool = True, ) -> InferBatchOutput: """ Batch inference using streaming infrastructure. @@ -3352,7 +3380,8 @@ def infer_batch( Returns: InferBatchOutput containing predicted audio, codes, and RTF metrics. """ - with torch.inference_mode(): + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): start_time = time.time() # Extract tensors from batch @@ -3440,6 +3469,7 @@ def infer_batch( gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, gt_audio_codes=gt_audio_codes_for_init, gt_audio_codes_lens=gt_audio_codes_lens_for_init, + use_inference_mode=use_inference_mode, ) time_to_first_prediction = None @@ -3447,7 +3477,12 @@ def infer_batch( device = text.device # Generate until all items are finished or max steps reached + print("Generation started") + gen_step = 0 while not state.finished.all() and len(state.all_predictions) < max_decoder_steps: + gen_step += 1 + if gen_step % 10 == 0: + print(f"Generation step {gen_step} ") # Gather the correct text token for each batch item based on text_tokens_seen # Items in context phase will have their token ignored by streaming_step positions = state.text_tokens_seen.clamp(max=text.size(1) - 1) @@ -3463,6 +3498,7 @@ def infer_batch( state=state, text_tokens=current_tokens, force_dropout_text=force_dropout_text, + use_inference_mode=use_inference_mode, ) # Record time to first audio prediction @@ -3472,7 +3508,7 @@ def infer_batch( 
tts_generation_time = time.time() - generation_start_time # Finalize and decode audio - finalize_output = self.streaming_finalize(state) + finalize_output = self.streaming_finalize(state, use_inference_mode=use_inference_mode) end_time = time.time() total_time = end_time - start_time diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py new file mode 100644 index 000000000000..1bc94c14206f --- /dev/null +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -0,0 +1,1141 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import copy
import os
import random
import time
from typing import Dict, List, Optional

import numpy as np
import soundfile as sf
import torch
from lightning.pytorch import Trainer
from omegaconf import DictConfig, open_dict

import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.collections.asr.parts.mixins.transcription import TranscribeConfig
from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel
from nemo.collections.tts.parts.utils.helpers import (
    get_mask_from_lengths,
    get_speaker_embeddings_from_filepaths,
    process_text_for_cer,
    transcribe_with_whisper,
)
from nemo.utils import logging

try:
    import torchaudio
    from torchaudio.pipelines import SQUIM_OBJECTIVE

    HAVE_TORCHAUDIO = True
except ImportError:
    HAVE_TORCHAUDIO = False

try:
    from nemo_text_processing.text_normalization.normalize import Normalizer

    PYNINI_AVAILABLE = True
except (ImportError, ModuleNotFoundError):
    Normalizer = None
    PYNINI_AVAILABLE = False


class EasyMagpieTTSModelOnlinePO(EasyMagpieTTSModel):
    """
    EasyMagpie-TTS online preference optimization model (GRPO / DR-GRPO).

    Training flow:
    1. Sample multiple generations per prompt.
    2. Compute rewards (CER/SSIM/PESQ).
    3. Compute group-normalized advantages.
    4. Run teacher-forced policy forward on generated codes and optimize GRPO objective.
    5. Add auxiliary phoneme loss from the same forward pass with GT phoneme tokens.
    """

    def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
        super().__init__(cfg, trainer)

        self.run_val_inference = True  # Always run validation inference in PO.
        # Manual optimization: training_step drives zero_grad/backward/step itself
        # (required because one step spans rollout + chunked teacher-forced passes).
        self.automatic_optimization = False

        # Build a dataset-free copy of the config for the frozen reference policy.
        ref_model_cfg = copy.deepcopy(cfg)
        with open_dict(ref_model_cfg):
            ref_model_cfg.train_ds = None
            ref_model_cfg.validation_ds = None

        self.reference_free = self.cfg.get('reference_free', False)
        if not self.reference_free:
            self._reference_model = EasyMagpieTTSModel(cfg=ref_model_cfg)
            logging.info("Loading EasyMagpie reference model from checkpoint")
            # NOTE(review): weights_only=False deserializes arbitrary pickled objects;
            # only load trusted checkpoints here.
            self._reference_model.load_state_dict(
                torch.load(cfg.reference_model_ckpt_path, map_location="cpu", weights_only=False)['state_dict']
            )
            self._reference_model.freeze()
            # Marker flag; presumably consumed elsewhere to skip serialization — TODO confirm.
            self._reference_model._no_state_dict = True
            logging.info("Reference model loaded and frozen")

        # Reward ASR backend: either a frozen NeMo model or frozen Whisper large-v3.
        reward_asr_model = cfg.get('reward_asr_model', 'nemo')
        if reward_asr_model == 'nemo':
            self._eval_asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
                model_name=cfg.get('reward_asr_model_name', "nvidia/parakeet-ctc-0.6b")
            )
            self._eval_asr_model.freeze()
            self.whisper_processor = None
            self.whisper_model = None
        elif reward_asr_model == 'whisper':
            from transformers import WhisperForConditionalGeneration, WhisperProcessor

            self._eval_asr_model = None
            self.whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
            self.whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
            self.whisper_model.eval()
            for param in self.whisper_model.parameters():
                param.requires_grad = False
            # NOTE(review): use_multilingual_asr is only set on the whisper path;
            # any reader of this attribute on the nemo path would raise AttributeError — confirm.
            self.use_multilingual_asr = True
        else:
            raise ValueError(f"Unknown reward_asr_model: {reward_asr_model}")

        # Frozen speaker-verification model used for the speaker-similarity reward.
        self._eval_speaker_verification_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
            model_name=cfg.get('speaker_verification_model_name', 'titanet_large')
        )
        self._eval_speaker_verification_model.freeze()

        use_pesq = self.cfg.get('use_pesq', False)
        if use_pesq:
            assert HAVE_TORCHAUDIO, "torchaudio is required for PESQ reward."
            self.squim_objective_model = SQUIM_OBJECTIVE.get_model()

        self.loss_type = self.cfg.get('loss_type', 'grpo')
        if self.loss_type not in ['grpo', 'dr_grpo']:
            raise ValueError(
                f"Received loss_type={self.loss_type}. Supported values: ['grpo', 'dr_grpo']."
            )
        self.scale_rewards = self.cfg.get('scale_rewards', True)
        self.max_decoder_steps = self.cfg.get('max_decoder_steps', 220)
        self.aux_phoneme_loss_weight = self.cfg.get('aux_phoneme_loss_weight', 1.0)
        # How many prompt groups to run per teacher-forced sub-batch (memory knob).
        self.po_groups_per_subbatch = max(int(self.cfg.get('po_groups_per_subbatch', 1)), 1)

        self._normalize_whisper_transcript = self.cfg.get('normalize_whisper_transcript', True)
        # NOTE(review): _normalizer_cache only exists on the whisper+normalize path;
        # _get_cached_normalizer assumes it exists — confirm it is never called otherwise.
        if reward_asr_model == 'whisper' and self._normalize_whisper_transcript:
            self._normalizer_cache = {}

        # Filter out poor groups for stable optimization.
        self.best_cer_threshold = self.cfg.get('best_cer_threshold', 1.0)
        self.worst_cer_threshold = self.cfg.get('worst_cer_threshold', 1.0)

        # Under true fp32 training, force the decoder out of any lower-precision state.
        if self.trainer is not None and str(self.trainer.precision) in ("32", "32-true"):
            self.decoder.float()

    def _get_trainable_module_groups(self) -> Dict[str, List[torch.nn.Parameter]]:
        """Return a dict mapping module-group name → list of trainable parameters."""
        # Frozen eval/reference models are excluded from all grad/weight diagnostics.
        modules_to_exclude = {
            '_speaker_verification_model', '_codec_model', '_eval_asr_model',
            '_eval_speaker_verification_model', '_reference_model',
            'whisper_model', 'whisper_processor', 'squim_objective_model',
        }
        groups: Dict[str, List[torch.nn.Parameter]] = {}
        for name, module in self.named_children():
            if name in modules_to_exclude:
                continue
            params = [p for p in module.parameters() if p.requires_grad]
            if params:
                groups[name] = params
        return groups

    @torch.no_grad()
    def _compute_grad_and_weight_metrics(self) -> Dict[str, float]:
        """Compute per-module grad_norm, weight_norm, and global aggregates."""
        module_groups = self._get_trainable_module_groups()
        metrics: Dict[str, float] = {}
        all_grad_norms, all_weight_norms = [], []

        for group_name, params in module_groups.items():
            grad_norms, weight_norms = [], []
            for p in params:
                weight_norms.append(p.data.norm(2).item())
                if p.grad is not None:
                    grad_norms.append(p.grad.data.norm(2).item())

            # Module norm = L2 norm over the concatenation of its parameter tensors.
            module_weight_norm = float(np.sqrt(sum(w ** 2 for w in weight_norms)))
            metrics[f'weight_norm/{group_name}'] = module_weight_norm
            all_weight_norms.extend(weight_norms)

            if grad_norms:
                module_grad_norm = float(np.sqrt(sum(g ** 2 for g in grad_norms)))
                metrics[f'grad_norm/{group_name}'] = module_grad_norm
                all_grad_norms.extend(grad_norms)
            else:
                metrics[f'grad_norm/{group_name}'] = 0.0

        if all_grad_norms:
            metrics['grad_norm/global'] = float(np.sqrt(sum(g ** 2 for g in all_grad_norms)))
        if all_weight_norms:
            metrics['weight_norm/global'] = float(np.sqrt(sum(w ** 2 for w in all_weight_norms)))
        return metrics

    @torch.no_grad()
    def _compute_weight_update_metrics(self, prev_weights: Dict[int, torch.Tensor]) -> Dict[str, float]:
        """Compute per-module weight delta norms (how much weights changed after optimizer step)."""
        metrics: Dict[str, float] = {}
        module_groups = self._get_trainable_module_groups()
        all_deltas = []
        for group_name, params in module_groups.items():
            deltas = []
            for p in params:
                pid = id(p)  # Keyed by id(); see _snapshot_trainable_weights.
                if pid in prev_weights:
                    deltas.append((p.data - prev_weights[pid]).norm(2).item())
            if deltas:
                metrics[f'weight_delta/{group_name}'] = float(np.sqrt(sum(d ** 2 for d in deltas)))
                all_deltas.extend(deltas)
        if all_deltas:
            metrics['weight_delta/global'] = float(np.sqrt(sum(d ** 2 for d in all_deltas)))
        return metrics

    @torch.no_grad()
    def _snapshot_trainable_weights(self) -> Dict[int, torch.Tensor]:
        """Take a snapshot of all trainable parameter values (by param id)."""
        snapshot = {}
        for params in self._get_trainable_module_groups().values():
            for p in params:
                snapshot[id(p)] = p.data.clone()
        return snapshot
int) -> None: + """Print a compact per-module summary of grad_norm / weight_norm / weight_delta.""" + if not getattr(self.trainer, "is_global_zero", True): + return + + lines = [f"\n[grad/weight] step={step} " + f"grad={metrics.get('grad_norm/global', 0.0):.6f} " + f"w={metrics.get('weight_norm/global', 0.0):.4f} " + f"Δw={metrics.get('weight_delta/global', 0.0):.8f}"] + + module_names = sorted( + k.split('/')[1] for k in metrics + if k.startswith('weight_norm/') and k != 'weight_norm/global' + ) + for name in module_names: + gn = metrics.get(f'grad_norm/{name}', 0.0) + wn = metrics.get(f'weight_norm/{name}', 0.0) + wd = metrics.get(f'weight_delta/{name}', 0.0) + lines.append(f" {name:40s} grad={gn:.6f} w={wn:.4f} Δw={wd:.8f}") + + summary = "\n".join(lines) + print(summary) + logging.info(summary) + + def setup_optimizer_param_groups(self): + """ + Exclude frozen eval/reference modules AND modules that receive no gradients + from the PO loss (final_proj, lm_text_head, phoneme_final_proj) from the + optimizer. Including them would subject their weights to weight decay without + any learning signal, slowly degrading them. + """ + modules_to_exclude = { + '_speaker_verification_model', + '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + '_reference_model', + 'whisper_model', + 'whisper_processor', + # These modules are not used by the PO loss and receive no gradients. + # Including them would only apply weight decay, degrading their weights. 
+ 'final_proj', + 'lm_text_head', + 'phoneme_final_proj', + } + + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude and hasattr(module, "parameters"): + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] + self._optimizer_param_groups = [{"params": trainable_params}] + + def state_dict(self, destination=None, prefix='', keep_vars=False): + state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) + keys_substrings_to_exclude = ['_reference_model'] + for key in list(state_dict.keys()): + if any(substring in key for substring in keys_substrings_to_exclude): + del state_dict[key] + return state_dict + + def _get_cached_normalizer(self, lang_key: Optional[str]): + if not PYNINI_AVAILABLE: + return None + lang_key = lang_key if lang_key else "en" + if lang_key not in self._normalizer_cache: + logging.info(f"Creating normalizer for language: {lang_key}") + try: + self._normalizer_cache[lang_key] = Normalizer(input_case="cased", lang=lang_key) + except Exception as e: + logging.warning(f"Failed to create normalizer for language: {lang_key}. Error: {e}") + self._normalizer_cache[lang_key] = None + return self._normalizer_cache[lang_key] + + def _get_per_token_logps(self, logits: torch.Tensor, labels: torch.Tensor, loss_mask: torch.Tensor) -> torch.Tensor: + # Force fp32 for log_softmax to avoid bf16 precision issues that sever the + # gradient path through the GRPO "exp(logps - logps.detach())" trick. + # Under bf16 autocast, the tiny gradient signal through this identity-like + # expression gets rounded to zero, disconnecting local_transformer_out_projections. 
+ with torch.cuda.amp.autocast(enabled=False): + logits_fp32 = logits.float() + per_token_logps = torch.gather(logits_fp32.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2) + per_token_logps = per_token_logps * loss_mask.float() + return per_token_logps + + + def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): + """ + Override parent to force fp32 computation for the entire local transformer logits path. + + Under bf16-mixed autocast, the nn.Linear out_projections execute in bf16 and insert + ToCopyBackward0 nodes in the autograd graph. The GRPO loss formula + ``exp(logps - logps.detach())`` produces an identity in the forward pass, but the + gradient signal through this expression is extremely small. The bf16 ToCopyBackward0 + nodes round these tiny gradients to zero, completely severing the gradient path to + local_transformer_out_projections. Running the full computation in fp32 preserves + the gradient fidelity. + """ + with torch.cuda.amp.autocast(enabled=False): + # Cast dec_out to fp32 if it's in a lower precision (e.g. 
bf16 from autocast) + dec_out_fp32 = dec_out.float() + return super().compute_local_transformer_logits( + dec_out_fp32, audio_codes_target, targets_offset_by_one=targets_offset_by_one + ) + + def repeat_items_in_batch(self, batch: Dict, num_repeats: int) -> Dict: + repeated_batch = {} + for key, value in batch.items(): + if isinstance(value, torch.Tensor): + repeated_batch[key] = value.repeat_interleave(num_repeats, dim=0) + elif isinstance(value, list): + repeated_value = [] + for item in value: + repeated_value.extend([item] * num_repeats) + repeated_batch[key] = repeated_value + else: + repeated_batch[key] = value + return repeated_batch + + def _get_audio_dir(self) -> str: + if self.logger is not None and hasattr(self.logger, "log_dir") and self.logger.log_dir is not None: + log_dir = self.logger.log_dir + elif self.trainer is not None and self.trainer.log_dir is not None: + log_dir = self.trainer.log_dir + else: + log_dir = "." + audio_dir = os.path.join(log_dir, 'online_po_audios') + os.makedirs(audio_dir, exist_ok=True) + return audio_dir + + def _save_waveforms_to_paths( + self, + waveforms: torch.Tensor, + waveform_lens: torch.Tensor, + prefix: str, + sample_rate: int, + ) -> List[str]: + audio_dir = self._get_audio_dir() + time_id = time.time_ns() + paths = [] + for idx in range(waveforms.size(0)): + wav = waveforms[idx].float().detach().cpu().numpy() + wav = wav[: int(waveform_lens[idx].item())] + # path = os.path.join(audio_dir, f'{prefix}_rank{self.global_rank}_{time_id}_{idx}.wav') + path = os.path.join(audio_dir, f'{prefix}_rank{self.global_rank}_{idx}.wav') + sf.write(path, wav, sample_rate) + paths.append(path) + return paths + + def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]: + """ + Build per-item reference audio paths for speaker similarity reward. + Priority: audio_filepaths -> context_audio -> context_audio_codes. 
    def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]:
        """
        Build per-item reference audio paths for speaker similarity reward.
        Priority: context_audio -> context_audio_codes (decoded through the codec).
        NOTE(review): an earlier docstring also listed audio_filepaths as the first
        priority, but that branch is not implemented here — confirm intent.
        Raises ValueError if neither context source is present in the batch.
        """
        if 'context_audio' in batch_repeated and 'context_audio_lens' in batch_repeated:
            # TODO: Add support for text-context items here.
            return self._save_waveforms_to_paths(
                waveforms=batch_repeated['context_audio'],
                waveform_lens=batch_repeated['context_audio_lens'],
                prefix='reference_context_audio',
                sample_rate=self.sample_rate,
            )

        if 'context_audio_codes' in batch_repeated and 'context_audio_codes_lens' in batch_repeated:
            context_codes = batch_repeated['context_audio_codes'].clone()
            context_lens = batch_repeated['context_audio_codes_lens'].clone()

            target_codes = batch_repeated['audio_codes'].clone()
            target_lens = batch_repeated['audio_codes_lens'].clone()

            # For items where context_lens < 3, fall back to target_codes/target_lens
            # This is for items with text context
            short_context_mask = context_lens < 3
            if short_context_mask.any():
                # Pad the shorter tensor along the time dimension if needed
                max_len = max(context_codes.shape[-1], target_codes.shape[-1])
                if context_codes.shape[-1] < max_len:
                    pad_size = max_len - context_codes.shape[-1]
                    context_codes = torch.nn.functional.pad(context_codes, (0, pad_size), value=0)
                if target_codes.shape[-1] < max_len:
                    pad_size = max_len - target_codes.shape[-1]
                    target_codes = torch.nn.functional.pad(target_codes, (0, pad_size), value=0)
                context_codes[short_context_mask] = target_codes[short_context_mask]
                context_lens[short_context_mask] = target_lens[short_context_mask]
            # Slice to the actual max length needed
            context_codes = context_codes[..., :context_lens.max()]

            if self._codec_converter is not None:
                context_codes = self._codec_converter.convert_original_to_new(
                    audio_tokens=context_codes, audio_lens=context_lens
                ).long()
            context_audio, context_audio_lens, _ = self.codes_to_audio(context_codes, context_lens)
            return self._save_waveforms_to_paths(
                waveforms=context_audio,
                waveform_lens=context_audio_lens,
                prefix='reference_context_codes_decoded',
                sample_rate=self.output_sample_rate,
            )

        raise ValueError(
            "Could not construct reference audio for speaker similarity. Need one of: "
            "context_audio/context_audio_lens, or context_audio_codes/context_audio_codes_lens."
        )

    def _run_easy_process_batch(
        self,
        model: EasyMagpieTTSModel,
        batch: Dict,
        audio_codes: torch.Tensor,
        audio_codes_lens: torch.Tensor,
        mode: str,
    ):
        """Run ``model.process_batch`` (teacher-forced forward) on ``batch`` with the
        given target audio codes, encoding context audio to codes on the fly if the
        batch does not already carry cached context codes."""
        if 'context_audio_codes' in batch:
            context_audio_codes = batch['context_audio_codes']
            context_audio_codes_lens = batch['context_audio_codes_lens']
        else:
            context_audio_codes, context_audio_codes_lens = model.audio_to_codes(
                batch['context_audio'], batch['context_audio_lens']
            )

        return model.process_batch(
            text=batch['text'],
            text_lens=batch['text_lens'],
            context_text_tokens=batch['context_text_tokens'],
            context_text_tokens_lens=batch['context_text_tokens_lens'],
            audio_codes=audio_codes,
            audio_codes_lens=audio_codes_lens,
            context_audio_codes=context_audio_codes,
            context_audio_codes_lens=context_audio_codes_lens,
            phoneme_tokens=batch.get('phoneme_tokens'),
            phoneme_tokens_lens=batch.get('phoneme_tokens_lens'),
            mode=mode,
        )

    def _format_text_table(self, headers: List[str], rows: List[List[str]]) -> str:
        """Render ``headers``/``rows`` (all strings) as a fixed-width ASCII table.

        Columns are left-justified to the widest cell; header and body are separated
        by a dashed rule. Assumes every row has len(headers) cells.
        """
        col_widths = [len(h) for h in headers]
        for row in rows:
            for col_idx, value in enumerate(row):
                col_widths[col_idx] = max(col_widths[col_idx], len(value))

        header_line = " | ".join(headers[col_idx].ljust(col_widths[col_idx]) for col_idx in range(len(headers)))
        separator = "-+-".join("-" * col_widths[col_idx] for col_idx in range(len(headers)))
        row_lines = [
            " | ".join(row[col_idx].ljust(col_widths[col_idx]) for col_idx in range(len(headers))) for row in rows
        ]
        return "\n".join([header_line, separator] + row_lines)
    def _print_group_cer_wer_table(
        self,
        batch: Dict,
        batch_metrics: List[Dict],
        group_idx: int,
        group_start_idx: int,
        group_end_idx: int,
        is_group_valid: bool,
        mean_reward: float,
        std_reward: float,
    ) -> None:
        """Pretty-print per-generation CER/WER/SSIM/reward/advantage for one prompt group (rank 0 only)."""
        if not getattr(self.trainer, "is_global_zero", True):
            return

        prompt_text = str(batch['raw_texts'][group_idx]).replace("\n", " ")
        if len(prompt_text) > 120:
            prompt_text = f"{prompt_text[:117]}..."

        rows = []
        for local_idx, metric_idx in enumerate(range(group_start_idx, group_end_idx)):
            item_metrics = batch_metrics[metric_idx]
            rows.append(
                [
                    str(local_idx),
                    f"{item_metrics['cer_gt']:.4f}",
                    f"{item_metrics['wer_gt']:.4f}",
                    f"{item_metrics['spk_similarity']:.4f}",
                    f"{item_metrics['reward']:.4f}",
                    f"{item_metrics.get('advantage', 0.0):.4f}",
                ]
            )

        table = self._format_text_table(headers=["item", "cer", "wer", "ssim", "reward", "advantage"], rows=rows)
        print(
            f"[generate_and_reward] group={group_idx} valid={is_group_valid} "
            f"mean_reward={mean_reward:.4f} std_reward={std_reward:.4f}\n"
            f"prompt: {prompt_text}\n{table}\n"
        )

    def generate_and_reward(
        self,
        batch: Dict,
        num_generations_per_item: int,
        mode: str = 'train',
        use_local_transformer_for_inference: bool = False,
    ):
        """Roll out ``num_generations_per_item`` samples per prompt, score them, and
        compute group-normalized advantages.

        Pipeline: repeat batch → autoregressive inference → save wavs → ASR transcribe
        (NeMo or Whisper) → speaker-similarity and optional PESQ → per-item reward →
        group min-reward substitution for invalid code lengths → per-group advantage
        normalization and validity filtering.

        Returns a dict with rewards, advantages, group validities, the repeated batch,
        predicted codes/lens, the rollout phoneme input type, and timing stats.
        """
        batch_repeated = self.repeat_items_in_batch(batch, num_generations_per_item)
        reward_asr_model = self.cfg.get('reward_asr_model', 'nemo')
        use_pesq = self.cfg.get('use_pesq', False)

        # Classifier-free guidance is sampled per call (train) or forced when prob == 1.0.
        use_cfg = False
        cfg_scale = 1.0
        inference_cfg_prob = self.cfg.get('inference_cfg_prob', 0.0)
        if (inference_cfg_prob == 1.0) or (inference_cfg_prob > 0.0 and mode == 'train'):
            use_cfg = random.random() < inference_cfg_prob
            cfg_scale = self.cfg.get('inference_cfg_scale', 1.0)

        # Optionally condition rollouts on ground-truth phonemes (train only).
        phoneme_input_type = 'pred'
        gt_phoneme_input_prob = self.cfg.get('gt_phoneme_input_prob', 0.0)
        can_use_gt_phonemes = ('phoneme_tokens' in batch_repeated) and ('phoneme_tokens_lens' in batch_repeated)
        if can_use_gt_phonemes and gt_phoneme_input_prob > 0.0 and mode == 'train':
            phoneme_input_type = 'gt' if random.random() < gt_phoneme_input_prob else 'pred'

        generation_start_time = time.perf_counter()
        print("Inference started")
        output = self.infer_batch(
            batch=batch_repeated,
            max_decoder_steps=self.max_decoder_steps,
            temperature=self.cfg.get('inference_temperature', 0.7),
            topk=self.cfg.get('inference_topk', 80),
            use_cfg=use_cfg,
            cfg_scale=cfg_scale,
            use_local_transformer_for_inference=use_local_transformer_for_inference,
            phoneme_input_type=phoneme_input_type,
            phoneme_sampling_method=self.cfg.get('inference_phoneme_sampling_method', 'argmax'),
            force_dropout_text=False,
            use_teacher_forced=False,
            use_inference_mode=False,
        )
        print("Inference ended")
        audio_generation_time_sec = time.perf_counter() - generation_start_time

        predicted_audio = output.predicted_audio
        predicted_audio_lens = output.predicted_audio_lens
        predicted_codes = output.predicted_codes
        predicted_codes_lens = output.predicted_codes_lens
        save_start_time = time.perf_counter()
        predicted_audio_paths = self._save_waveforms_to_paths(
            waveforms=predicted_audio,
            waveform_lens=predicted_audio_lens,
            prefix='generated',
            sample_rate=self.output_sample_rate,
        )
        audio_save_time_sec = time.perf_counter() - save_start_time
        audio_durations = [int(predicted_audio_lens[idx].item()) / self.output_sample_rate for idx in range(predicted_audio.size(0))]

        rewarding_start_time = time.perf_counter()
        if reward_asr_model == 'nemo':
            pred_transcripts = self._eval_asr_model.transcribe(
                predicted_audio_paths,
                batch_size=len(predicted_audio_paths),
                override_config=TranscribeConfig(use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0),
            )
            pred_transcripts = [process_text_for_cer(transcript.text) for transcript in pred_transcripts]
        else:
            # Whisper path: one (slow) per-file transcription with optional per-language normalization.
            self.whisper_model.to(self.device)
            pred_transcripts = []
            langs = batch_repeated.get('languages', ['en'] * len(predicted_audio_paths))
            for item_idx, audio_path in enumerate(predicted_audio_paths):
                language = langs[item_idx] if item_idx < len(langs) else 'en'
                normalizer = self._get_cached_normalizer(language) if self._normalize_whisper_transcript else None
                print(f"Transcribing audio {audio_path} with language {language}")
                transcript = transcribe_with_whisper(
                    audio_filepath=audio_path,
                    language=language,
                    whisper_processor=self.whisper_processor,
                    whisper_model=self.whisper_model,
                    device=self.device,
                    normalizer=normalizer,
                )
                print(f"Pred Transcript: {transcript}")
                print(f"Normalized Pred Text: {process_text_for_cer(transcript)}")
                print(f"Raw Text: {batch_repeated['raw_texts'][item_idx]}")
                print("--------------------------------")
                pred_transcripts.append(process_text_for_cer(transcript))

        reference_audio_paths = self._get_reference_audio_paths(batch_repeated)
        try:
            pred_speaker_embeddings = get_speaker_embeddings_from_filepaths(
                predicted_audio_paths, self._eval_speaker_verification_model, self.device
            )
            gt_speaker_embeddings = get_speaker_embeddings_from_filepaths(
                reference_audio_paths, self._eval_speaker_verification_model, self.device
            )
        except Exception as e:
            # Deliberate best-effort: a failed SV pass zeroes SSIM instead of killing the step.
            logging.warning(f"Speaker-embedding reward failed. Falling back to zero SSIM reward. Error: {e}")
            pred_speaker_embeddings = None
            gt_speaker_embeddings = None

        batch_metrics = []
        cer_reward_weight = self.cfg.get('cer_reward_weight', 0.5)
        ssim_reward_weight = self.cfg.get('ssim_reward_weight', 0.5)
        pesq_reward_weight = self.cfg.get('pesq_reward_weight', 0.0)
        min_valid_codes_len = self.cfg.get('min_valid_codes_len', 4)
        max_valid_codes_len = self.cfg.get(
            'max_valid_codes_len', self.max_decoder_steps * self.frame_stacking_factor - 1
        )

        for idx in range(predicted_audio.size(0)):
            pred_transcript = pred_transcripts[idx]
            gt_transcript = process_text_for_cer(batch_repeated['raw_texts'][idx])
            # CER/WER clamped into [0, 1] so rewards stay bounded.
            cer_gt = min(max(word_error_rate([pred_transcript], [gt_transcript], use_cer=True), 0.0), 1.0)
            wer_gt = min(max(word_error_rate([pred_transcript], [gt_transcript], use_cer=False), 0.0), 1.0)

            if pred_speaker_embeddings is not None and gt_speaker_embeddings is not None:
                # Cosine similarity between predicted and reference speaker embeddings.
                spk_embedding_pred = pred_speaker_embeddings[idx].cpu().float().numpy()
                spk_embedding_gt = gt_speaker_embeddings[idx].cpu().float().numpy()
                denom = max(np.linalg.norm(spk_embedding_pred) * np.linalg.norm(spk_embedding_gt), 1e-8)
                spk_similarity = float(np.dot(spk_embedding_pred, spk_embedding_gt) / denom)
            else:
                spk_similarity = 0.0

            if use_pesq:
                # SQUIM expects 16 kHz input.
                sample_audio, sr = torchaudio.load(predicted_audio_paths[idx])
                sample_audio = sample_audio.to(self.device)
                if sr != 16000:
                    sample_audio = torchaudio.functional.resample(sample_audio, sr, 16000)
                _, pesq_hyp, _ = self.squim_objective_model(sample_audio)
                pesq_hyp = float(pesq_hyp.item())
            else:
                pesq_hyp = 0.0

            item_metrics = {
                'cer_gt': float(cer_gt),
                'wer_gt': float(wer_gt),
                'duration': float(audio_durations[idx]),
                'spk_similarity': float(spk_similarity),
                'pred_transcript': pred_transcript,
                'gt_transcript': gt_transcript,
                'codes_len': int(predicted_codes_lens[idx].item()),
                'pesq': float(pesq_hyp),
            }

            best_ssim_achievable = self.cfg.get('best_ssim_achievable', 0.9)
            mean_cer_dataset = self.cfg.get('mean_cer_dataset', 0.1)
            mean_ssim_dataset = self.cfg.get('mean_ssim_dataset', 0.6)

            # Piecewise-linear rewards centered at the dataset means: 0.5 at the mean,
            # rising to 1.0 at the best value and falling to 0.0 at the worst.
            item_cer = item_metrics['cer_gt']
            item_ssim = max(min(item_metrics['spk_similarity'], best_ssim_achievable), 0.0)
            if item_cer <= mean_cer_dataset:
                cer_reward = 0.5 + 0.5 * (mean_cer_dataset - item_cer) / max(mean_cer_dataset, 1e-8)
            else:
                cer_reward = 0.5 - 0.5 * (item_cer - mean_cer_dataset) / max(1.0 - mean_cer_dataset, 1e-8)

            if item_ssim >= mean_ssim_dataset:
                spk_similarity_reward = 0.5 + 0.5 * (item_ssim - mean_ssim_dataset) / max(
                    best_ssim_achievable - mean_ssim_dataset, 1e-8
                )
            else:
                spk_similarity_reward = 0.5 - 0.5 * (mean_ssim_dataset - item_ssim) / max(mean_ssim_dataset, 1e-8)

            # 4.5 is the PESQ scale maximum, mapping PESQ into [0, 1].
            pesq_reward = item_metrics['pesq'] / 4.5 if use_pesq else 0.0
            reward = (
                cer_reward * cer_reward_weight
                + spk_similarity_reward * ssim_reward_weight
                + pesq_reward * pesq_reward_weight
            )
            # Too-short or truncated (max-step) generations get the group's min reward later.
            if (item_metrics['codes_len'] >= max_valid_codes_len) or (item_metrics['codes_len'] <= min_valid_codes_len):
                item_metrics['_needs_group_min_reward'] = True
            else:
                item_metrics['_needs_group_min_reward'] = False

            item_metrics['cer_reward'] = float(cer_reward)
            item_metrics['spk_similarity_reward'] = float(spk_similarity_reward)
            item_metrics['pesq_reward'] = float(pesq_reward)
            item_metrics['reward'] = float(reward)
            batch_metrics.append(item_metrics)

        # Second pass: replace rewards for items with invalid code lengths with the group minimum reward
        num_groups = len(batch['raw_texts'])
        for group_idx in range(num_groups):
            group_start_idx = group_idx * num_generations_per_item
            group_end_idx = group_start_idx + num_generations_per_item
            group_rewards = [batch_metrics[idx]['reward'] for idx in range(group_start_idx, group_end_idx)]
            group_min_reward = min(group_rewards)
            for idx in range(group_start_idx, group_end_idx):
                if batch_metrics[idx]['_needs_group_min_reward']:
                    batch_metrics[idx]['reward'] = float(group_min_reward)

        # Third pass: group-normalized advantages and CER-based group validity filtering.
        all_groups_mean_reward = 0.0
        all_groups_std_reward = 0.0
        group_validities = []
        for group_idx in range(num_groups):
            group_start_idx = group_idx * num_generations_per_item
            group_end_idx = group_start_idx + num_generations_per_item
            group_rewards = [batch_metrics[idx]['reward'] for idx in range(group_start_idx, group_end_idx)]
            group_cers = [batch_metrics[idx]['cer_gt'] for idx in range(group_start_idx, group_end_idx)]
            mean_reward = float(np.mean(group_rewards))
            std_reward = float(np.std(group_rewards))
            # A group is dropped from the loss if even its best generation is too bad,
            # or if its worst generation exceeds the hard CER ceiling.
            is_group_valid = True
            if min(group_cers) > self.best_cer_threshold:
                is_group_valid = False
            if max(group_cers) > self.worst_cer_threshold:
                is_group_valid = False

            for idx in range(group_start_idx, group_end_idx):
                advantage = batch_metrics[idx]['reward'] - mean_reward
                if self.scale_rewards:
                    advantage = advantage / (std_reward + 1e-4)
                batch_metrics[idx]['advantage'] = float(advantage)
            group_validities.append(is_group_valid)

            self._print_group_cer_wer_table(
                batch=batch,
                batch_metrics=batch_metrics,
                group_idx=group_idx,
                group_start_idx=group_start_idx,
                group_end_idx=group_end_idx,
                is_group_valid=is_group_valid,
                mean_reward=mean_reward,
                std_reward=std_reward,
            )

            all_groups_mean_reward += mean_reward
            all_groups_std_reward += std_reward

        all_groups_mean_reward = all_groups_mean_reward / max(num_groups, 1)
        all_groups_std_reward = all_groups_std_reward / max(num_groups, 1)
        advantages = torch.tensor([x['advantage'] for x in batch_metrics], device=self.device, dtype=torch.float32)
        group_validities = torch.tensor(group_validities, device=self.device, dtype=torch.float32)
        rewarding_time_sec = time.perf_counter() - rewarding_start_time

        return {
            'mean_reward': torch.tensor(all_groups_mean_reward, device=self.device, dtype=torch.float32),
            'std_reward': torch.tensor(all_groups_std_reward, device=self.device, dtype=torch.float32),
            'batch_repeated': batch_repeated,
            'metrics': batch_metrics,
            'predicted_codes': predicted_codes,
            'predicted_codes_lens': predicted_codes_lens,
            'advantages': advantages,
            'group_validities': group_validities,
            'rollout_phoneme_input_type': phoneme_input_type,
            'timings': {
                'audio_generation_time_sec': float(audio_generation_time_sec),
                'audio_save_time_sec': float(audio_save_time_sec),
                'rewarding_time_sec': float(rewarding_time_sec),
            },
        }

    def process_batch_online_po(self, batch: Dict, n_generations_per_item: int, mode: str = 'train'):
        """Full PO step without backward: rollout + reward, then teacher-forced loss computation.

        Used on paths (e.g. validation) where losses are reported but not optimized
        (do_backward=False in the chunked teacher-forced pass).
        """
        generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = self._prepare_online_po_inputs(
            batch=batch,
            n_generations_per_item=n_generations_per_item,
            mode=mode,
        )
        chunked_outputs = self._run_teacher_forced_chunked_po(
            generated_codes_and_metrics=generated_codes_and_metrics,
            batch_repeated=batch_repeated,
            predicted_codes=predicted_codes,
            predicted_codes_lens=predicted_codes_lens,
            n_generations_per_item=n_generations_per_item,
            do_backward=False,
        )
        return {
            'mean_reward': generated_codes_and_metrics['mean_reward'],
            'std_reward': generated_codes_and_metrics['std_reward'],
            'loss': chunked_outputs['loss'],
            'po_loss': chunked_outputs['po_loss'],
            'phoneme_aux_loss': chunked_outputs['phoneme_aux_loss'],
            'kl_loss': chunked_outputs['kl_loss'],
            'used_gt_phoneme_input': chunked_outputs['used_gt_phoneme_input'],
            'batch_metrics': generated_codes_and_metrics['metrics'],
        }
    def _slice_batch_range(self, batch: Dict, start_idx: int, end_idx: int) -> Dict:
        """Slice every tensor/list in ``batch`` to items [start_idx, end_idx) along dim 0,
        then trim the time dimension of known (tensor, lens) pairs down to the
        sub-batch's own max length so the sub-batch carries no excess padding.
        """
        sliced_batch = {}
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                sliced_batch[key] = value[start_idx:end_idx]
            elif isinstance(value, list):
                sliced_batch[key] = value[start_idx:end_idx]
            else:
                sliced_batch[key] = value

        # Trim only an explicit whitelist of temporal (tensor, lens) pairs so we never
        # accidentally truncate the last dim of a non-temporal tensor.
        temporal_key_pairs = [
            ('text', 'text_lens'),
            ('context_text_tokens', 'context_text_tokens_lens'),
            ('audio_codes', 'audio_codes_lens'),
            ('context_audio_codes', 'context_audio_codes_lens'),
            ('phoneme_tokens', 'phoneme_tokens_lens'),
            ('context_audio', 'context_audio_lens'),
            ('audio', 'audio_lens'),
        ]
        for tensor_key, lens_key in temporal_key_pairs:
            tensor_value = sliced_batch.get(tensor_key)
            lens = sliced_batch.get(lens_key)
            if not isinstance(tensor_value, torch.Tensor) or not isinstance(lens, torch.Tensor):
                continue
            # Skip pairs whose shapes don't line up (defensive; e.g. scalar metadata).
            if tensor_value.dim() < 2 or tensor_value.size(0) != lens.size(0):
                continue

            local_max_len = int(lens.max().item()) if lens.numel() > 0 else 0
            local_max_len = min(local_max_len, tensor_value.size(-1))
            sliced_batch[tensor_key] = tensor_value[..., :local_max_len]

        return sliced_batch

    def _iter_group_ranges(self, num_groups: int, groups_per_subbatch: int):
        """Yield (start, end) group-index ranges covering [0, num_groups) in chunks
        of at most ``groups_per_subbatch`` groups."""
        for group_start in range(0, num_groups, groups_per_subbatch):
            yield group_start, min(group_start + groups_per_subbatch, num_groups)

    def _prepare_online_po_inputs(self, batch: Dict, n_generations_per_item: int, mode: str):
        """Run rollouts + rewards under no_grad, then rewrite the repeated batch so
        the generated codes become the teacher-forcing targets.

        Returns (generated_codes_and_metrics, batch_repeated, predicted_codes,
        predicted_codes_lens); 'audio'/'audio_lens' are dropped from the repeated
        batch since the generated codes replace the ground-truth audio.
        """
        use_local_transformer_for_inference = False
        use_local_transformer_prob = self.cfg.get('use_local_transformer_prob', 0.0)
        if use_local_transformer_prob > 0.0 and mode == 'train':
            use_local_transformer_for_inference = random.random() < use_local_transformer_prob

        # Rollouts are sampled in eval mode (no dropout) and without gradients.
        with torch.no_grad():
            self.eval()
            generated_codes_and_metrics = self.generate_and_reward(
                batch=batch,
                num_generations_per_item=n_generations_per_item,
                mode=mode,
                use_local_transformer_for_inference=use_local_transformer_for_inference,
            )
            self.train()

        batch_repeated = generated_codes_and_metrics['batch_repeated']
        predicted_codes = generated_codes_and_metrics['predicted_codes']
        predicted_codes_lens = generated_codes_and_metrics['predicted_codes_lens']
        predicted_codes = predicted_codes[:, :, : predicted_codes_lens.max()]
        # Convert generated codes back to the original codec token space expected
        # by the teacher-forced forward pass.
        predicted_codes = self._codec_converter.convert_new_to_original(
            audio_tokens=predicted_codes, audio_lens=predicted_codes_lens
        )
        batch_repeated['audio_codes'] = predicted_codes
        batch_repeated['audio_codes_lens'] = predicted_codes_lens
        if 'audio' in batch_repeated:
            del batch_repeated['audio']
        if 'audio_lens' in batch_repeated:
            del batch_repeated['audio_lens']

        return generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens
    def _compute_po_losses_from_outputs(
        self,
        policy_output,
        reference_output,
        advantages: torch.Tensor,
        group_validities: torch.Tensor,
        rollout_phoneme_input_type: str,
    ):
        """Compute the GRPO / DR-GRPO objective (plus optional KL-to-reference and
        auxiliary phoneme loss) from teacher-forced policy/reference outputs.

        Per codebook: gathers per-token log-probs, applies the policy-gradient
        identity ``-exp(logps - logps.detach()) * advantage`` (value equals
        -advantage in the forward pass; gradient equals the REINFORCE gradient),
        masks invalid groups, optionally adds a k3-style KL penalty against the
        frozen reference, and averages across codebooks.
        """
        # Prefer local-transformer logits when present; fall back to global decoder logits.
        logits = policy_output.local_transformer_logits
        if logits is None:
            logits = policy_output.logits
        ref_logits = None
        if reference_output is not None:
            ref_logits = reference_output.local_transformer_logits
            if ref_logits is None:
                ref_logits = reference_output.logits

        audio_codes_target = policy_output.audio_codes_target.long()
        audio_codes_lens_target = policy_output.audio_codes_lens_target
        audio_loss_mask = get_mask_from_lengths(audio_codes_lens_target).float()

        n_codebooks = audio_codes_target.size(1)
        total_loss = None
        total_kl = None
        for codebook_idx in range(n_codebooks):
            # Codebook vocabularies are concatenated along the logit dim; slice this one out.
            si = codebook_idx * self.num_all_tokens_per_codebook
            ei = si + self.num_all_tokens_per_codebook
            codebook_logits = logits[:, :, si:ei]
            codebook_labels = audio_codes_target[:, codebook_idx, :]
            per_token_logps = self._get_per_token_logps(codebook_logits, codebook_labels, audio_loss_mask)
            # Ensure the GRPO policy gradient trick stays in fp32 to preserve gradient signal
            with torch.cuda.amp.autocast(enabled=False):
                per_token_loss = -(torch.exp(per_token_logps.float() - per_token_logps.float().detach()) * advantages.float().unsqueeze(1))
                per_token_loss = per_token_loss * group_validities.float().unsqueeze(1)

            if not self.reference_free and ref_logits is not None:
                with torch.no_grad():
                    ref_codebook_logits = ref_logits[:, :, si:ei]
                    per_token_ref_logps = self._get_per_token_logps(
                        ref_codebook_logits, codebook_labels, audio_loss_mask
                    )
                with torch.cuda.amp.autocast(enabled=False):
                    # k3 KL estimator: exp(r) - r - 1 with r = ref_logps - logps (>= 0).
                    per_token_kl = (
                        torch.exp(per_token_ref_logps.float() - per_token_logps.float()) - (per_token_ref_logps.float() - per_token_logps.float()) - 1
                    )
                per_token_loss = per_token_loss + self.cfg.get('grpo_beta', 0.0) * per_token_kl
                codebook_kl_loss_mean = (
                    (per_token_kl * audio_loss_mask).sum(dim=1) / audio_loss_mask.sum(dim=1).clamp_min(1e-8)
                ).mean()
            else:
                codebook_kl_loss_mean = torch.tensor(0.0, device=self.device)

            if self.loss_type == "grpo":
                # GRPO: per-sequence mean over valid tokens, then batch mean.
                codebook_loss = (
                    (per_token_loss * audio_loss_mask).sum(dim=1) / audio_loss_mask.sum(dim=1).clamp_min(1e-8)
                ).mean()
            elif self.loss_type == "dr_grpo":
                # DR-GRPO: normalize by a fixed token budget instead of per-sequence length.
                total_tokens = per_token_loss.shape[0] * self.max_decoder_steps
                codebook_loss = (per_token_loss * audio_loss_mask).sum() / max(total_tokens, 1)
            else:
                raise ValueError(f"Unknown loss function: {self.loss_type}")

            if total_loss is None:
                total_loss = codebook_loss
                total_kl = codebook_kl_loss_mean
            else:
                total_loss += codebook_loss
                total_kl += codebook_kl_loss_mean

        total_po_loss = total_loss / n_codebooks
        total_kl = total_kl / n_codebooks

        # Auxiliary phoneme loss only counts when the rollout was conditioned on GT phonemes.
        phoneme_aux_loss = policy_output.phoneme_loss if rollout_phoneme_input_type == 'gt' else None
        if phoneme_aux_loss is None:
            phoneme_aux_loss = torch.tensor(0.0, device=self.device)
        total_loss = total_po_loss + self.aux_phoneme_loss_weight * phoneme_aux_loss

        return {
            'loss': total_loss,
            'po_loss': total_po_loss,
            'phoneme_aux_loss': phoneme_aux_loss,
            'kl_loss': total_kl,
            'used_gt_phoneme_input': float(rollout_phoneme_input_type == 'gt'),
        }

    def _run_teacher_forced_chunked_po(
        self,
        generated_codes_and_metrics: Dict,
        batch_repeated: Dict,
        predicted_codes: torch.Tensor,
        predicted_codes_lens: torch.Tensor,
        n_generations_per_item: int,
        do_backward: bool,
    ):
        """Run the teacher-forced PO forward (and optional manual backward) over the
        repeated batch in sub-batches of whole groups to bound peak memory.

        Each chunk's loss is weighted by its share of groups so the accumulated
        totals equal the full-batch averages; when ``do_backward`` is True,
        ``manual_backward`` is called per chunk with the same weighting.
        """
        num_groups = len(batch_repeated['raw_texts']) // n_generations_per_item
        groups_per_subbatch = max(self.po_groups_per_subbatch, 1)

        accumulated_loss = torch.tensor(0.0, device=self.device)
        accumulated_po_loss = torch.tensor(0.0, device=self.device)
        accumulated_phoneme_aux_loss = torch.tensor(0.0, device=self.device)
        accumulated_kl_loss = torch.tensor(0.0, device=self.device)
        used_gt_phoneme_input = 0.0

        for group_start_idx, group_end_idx in self._iter_group_ranges(num_groups, groups_per_subbatch):
            item_start_idx = group_start_idx * n_generations_per_item
            item_end_idx = group_end_idx * n_generations_per_item
            group_weight = float(group_end_idx - group_start_idx) / max(float(num_groups), 1.0)

            batch_sub = self._slice_batch_range(batch_repeated, item_start_idx, item_end_idx)
            predicted_codes_sub = predicted_codes[item_start_idx:item_end_idx]
            predicted_codes_lens_sub = predicted_codes_lens[item_start_idx:item_end_idx]
            predicted_codes_sub = predicted_codes_sub[:, :, : predicted_codes_lens_sub.max()]
            advantages_sub = generated_codes_and_metrics['advantages'][item_start_idx:item_end_idx]
            group_validities_sub = generated_codes_and_metrics['group_validities'][item_start_idx:item_end_idx]
            rollout_phoneme_input_type = generated_codes_and_metrics.get('rollout_phoneme_input_type', 'pred')

            # Use mode='val' intentionally for stable PO optimization:
            # no random input dropout, no CFG unconditional dropout, no random phoneme corruption.
            policy_output = self._run_easy_process_batch(
                model=self,
                batch=batch_sub,
                audio_codes=predicted_codes_sub,
                audio_codes_lens=predicted_codes_lens_sub,
                mode='val',
            )

            reference_output = None
            if not self.reference_free:
                with torch.no_grad():
                    reference_output = self._run_easy_process_batch(
                        model=self._reference_model,
                        batch=batch_sub,
                        audio_codes=predicted_codes_sub,
                        audio_codes_lens=predicted_codes_lens_sub,
                        mode='val',
                    )

            chunk_outputs = self._compute_po_losses_from_outputs(
                policy_output=policy_output,
                reference_output=reference_output,
                advantages=advantages_sub,
                group_validities=group_validities_sub,
                rollout_phoneme_input_type=rollout_phoneme_input_type,
            )

            if do_backward:
                # Manual optimization: accumulate gradients chunk by chunk.
                self.manual_backward(chunk_outputs['loss'] * group_weight)

            accumulated_loss = accumulated_loss + chunk_outputs['loss'].detach() * group_weight
            accumulated_po_loss = accumulated_po_loss + chunk_outputs['po_loss'].detach() * group_weight
            accumulated_phoneme_aux_loss = (
                accumulated_phoneme_aux_loss + chunk_outputs['phoneme_aux_loss'].detach() * group_weight
            )
            accumulated_kl_loss = accumulated_kl_loss + chunk_outputs['kl_loss'].detach() * group_weight
            used_gt_phoneme_input = max(used_gt_phoneme_input, chunk_outputs['used_gt_phoneme_input'])

        return {
            'loss': accumulated_loss,
            'po_loss': accumulated_po_loss,
            'phoneme_aux_loss': accumulated_phoneme_aux_loss,
            'kl_loss': accumulated_kl_loss,
            'used_gt_phoneme_input': used_gt_phoneme_input,
        }
+ prev_weights = self._snapshot_trainable_weights() + + generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = self._prepare_online_po_inputs( + batch=batch, + n_generations_per_item=n_generations_per_item, + mode='train', + ) + teacher_forced_start_time = time.perf_counter() + po_outputs = self._run_teacher_forced_chunked_po( + generated_codes_and_metrics=generated_codes_and_metrics, + batch_repeated=batch_repeated, + predicted_codes=predicted_codes, + predicted_codes_lens=predicted_codes_lens, + n_generations_per_item=n_generations_per_item, + do_backward=True, + ) + teacher_forced_time_sec = time.perf_counter() - teacher_forced_start_time + + # Compute gradient/weight metrics BEFORE optimizer.step() clears gradients. + grad_weight_metrics = self._compute_grad_and_weight_metrics() + + optimizer.step() + + # Step the LR scheduler (required in manual optimization mode). + lr_schedulers = self.lr_schedulers() + if lr_schedulers is not None: + if isinstance(lr_schedulers, (list, tuple)): + for sched in lr_schedulers: + sched.step() + else: + lr_schedulers.step() + + # Compute weight delta metrics AFTER optimizer.step(). + grad_weight_metrics.update(self._compute_weight_update_metrics(prev_weights)) + + # Log learning rate. + self.log('learning_rate', optimizer.param_groups[0]['lr'], prog_bar=False, sync_dist=True) + + # Core training metrics. 
+ self.log('train_loss', po_outputs['loss'], prog_bar=True, sync_dist=True) + self.log('train_po_loss', po_outputs['po_loss'], prog_bar=True, sync_dist=True) + self.log('train_phoneme_aux_loss', po_outputs['phoneme_aux_loss'], prog_bar=True, sync_dist=True) + self.log('train_kl_loss', po_outputs['kl_loss'], prog_bar=True, sync_dist=True) + self.log('train_used_gt_phoneme_input', po_outputs['used_gt_phoneme_input'], prog_bar=True, sync_dist=True) + self.log('train_mean_reward', generated_codes_and_metrics['mean_reward'], prog_bar=True, sync_dist=True) + self.log('train_std_reward', generated_codes_and_metrics['std_reward'], prog_bar=True, sync_dist=True) + + # Gradient / weight diagnostics to wandb. + for metric_name, metric_value in grad_weight_metrics.items(): + self.log(f'train_{metric_name}', metric_value, prog_bar=False, sync_dist=True) + + # Compact summary to stdout / log file. + self._print_grad_weight_summary(grad_weight_metrics, step=self.global_step) + + # Timing metrics. + timings = generated_codes_and_metrics.get('timings', {}) + for tkey in ('audio_generation_time_sec', 'audio_save_time_sec', 'rewarding_time_sec'): + self.log(f'train_{tkey}', float(timings.get(tkey, 0.0)), prog_bar=False, sync_dist=True) + self.log('train_teacher_forced_time_sec', teacher_forced_time_sec, prog_bar=False, sync_dist=True) + + # def validation_step(self, batch, batch_idx): + # val_n_generations_per_item = self.cfg.get('val_n_generations_per_item', 1) + # po_outputs = self.process_batch_online_po( + # batch=batch, + # n_generations_per_item=val_n_generations_per_item, + # mode='val', + # ) + # self.validation_step_outputs.append( + # { + # 'mean_reward': po_outputs['mean_reward'], + # 'std_reward': po_outputs['std_reward'], + # 'val_loss': po_outputs['loss'], + # 'val_po_loss': po_outputs['po_loss'], + # 'val_phoneme_aux_loss': po_outputs['phoneme_aux_loss'], + # 'val_kl_loss': po_outputs['kl_loss'], + # 'val_used_gt_phoneme_input': torch.tensor( + # 
po_outputs['used_gt_phoneme_input'], device=self.device, dtype=torch.float32 + # ), + # 'batch_metrics': po_outputs['batch_metrics'], + # } + # ) + + # def on_validation_epoch_end(self): + # def collect(key: str): + # values = [] + # for x in self.validation_step_outputs: + # if x[key] is not None: + # values.append(x[key]) + # else: + # values.append(torch.tensor(0.0, device=self.device)) + # return torch.stack(values).mean() if len(values) > 0 else torch.tensor(0.0, device=self.device) + + # val_loss = collect("val_loss") + # val_po_loss = collect("val_po_loss") + # val_phoneme_aux_loss = collect("val_phoneme_aux_loss") + # val_kl_loss = collect("val_kl_loss") + # val_used_gt_phoneme_input = collect("val_used_gt_phoneme_input") + # mean_reward = collect("mean_reward") + # std_reward = collect("std_reward") + + # self.log("val_loss", val_loss, prog_bar=True, sync_dist=True) + # self.log("val_po_loss", val_po_loss, prog_bar=True, sync_dist=True) + # self.log("val_phoneme_aux_loss", val_phoneme_aux_loss, prog_bar=True, sync_dist=True) + # self.log("val_kl_loss", val_kl_loss, prog_bar=True, sync_dist=True) + # self.log("val_used_gt_phoneme_input", val_used_gt_phoneme_input, prog_bar=True, sync_dist=True) + # self.log("val_mean_reward", mean_reward, prog_bar=True, sync_dist=True) + # self.log("val_std_reward", std_reward, prog_bar=True, sync_dist=True) + + # mean_metrics = {} + # for val_output in self.validation_step_outputs: + # for item_metrics in val_output['batch_metrics']: + # for key, value in item_metrics.items(): + # if "transcript" not in key: + # mean_metrics.setdefault(key, []).append(value) + # for key, values in mean_metrics.items(): + # self.log(f"val_{key}", float(np.mean(values)), prog_bar=True, sync_dist=True) + + # self.validation_step_outputs.clear() diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 027ca47a4e82..6d91ad25f976 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -1411,7 +1411,7 
@@ def maybe_init_from_pretrained_checkpoint(self, cfg: OmegaConf, map_location: st if isinstance(cfg.init_from_ptl_ckpt, str): # Restore checkpoint ckpt_path = cfg.pop('init_from_ptl_ckpt') - ckpt = torch.load(ckpt_path, map_location=map_location) + ckpt = torch.load(ckpt_path, map_location=map_location, weights_only=False) # Restore checkpoint into current model self.load_state_dict(ckpt['state_dict'], strict=False) From 2ca71812552b91d7be7752d902254f2ba6efdd4d Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 18 Feb 2026 11:47:27 -0800 Subject: [PATCH 60/94] po stabilize Signed-off-by: Shehzeen Hussain --- .../easy_magpietts_preference_optimization.py | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 1bc94c14206f..d5c94fec59b1 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -132,10 +132,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): if reward_asr_model == 'whisper' and self._normalize_whisper_transcript: self._normalizer_cache = {} + # Entropy bonus coefficient – encourages exploration and prevents mode collapse. + # Set to 0.0 to disable. Typical range: 0.001–0.01. + self.entropy_coeff = self.cfg.get('entropy_coeff', 0.0) + # Filter out poor groups for stable optimization. 
self.best_cer_threshold = self.cfg.get('best_cer_threshold', 1.0) self.worst_cer_threshold = self.cfg.get('worst_cer_threshold', 1.0) + + if self.trainer is not None and str(self.trainer.precision) in ("32", "32-true"): self.decoder.float() @@ -871,6 +877,7 @@ def _compute_po_losses_from_outputs( n_codebooks = audio_codes_target.size(1) total_loss = None total_kl = None + total_entropy = None for codebook_idx in range(n_codebooks): si = codebook_idx * self.num_all_tokens_per_codebook ei = si + self.num_all_tokens_per_codebook @@ -882,6 +889,16 @@ def _compute_po_losses_from_outputs( per_token_loss = -(torch.exp(per_token_logps.float() - per_token_logps.float().detach()) * advantages.float().unsqueeze(1)) per_token_loss = per_token_loss * group_validities.float().unsqueeze(1) + # Per-token entropy of the policy distribution (always computed for logging). + with torch.cuda.amp.autocast(enabled=False): + logits_fp32 = codebook_logits.float() + log_probs = logits_fp32.log_softmax(-1) # [B, T, V] + probs = log_probs.exp() # [B, T, V] + per_token_entropy = -(probs * log_probs).sum(-1) # [B, T] + codebook_entropy = ( + (per_token_entropy * audio_loss_mask).sum(dim=1) / audio_loss_mask.sum(dim=1).clamp_min(1e-8) + ).mean() + if not self.reference_free and ref_logits is not None: with torch.no_grad(): ref_codebook_logits = ref_logits[:, :, si:ei] @@ -912,23 +929,31 @@ def _compute_po_losses_from_outputs( if total_loss is None: total_loss = codebook_loss total_kl = codebook_kl_loss_mean + total_entropy = codebook_entropy else: total_loss += codebook_loss total_kl += codebook_kl_loss_mean + total_entropy += codebook_entropy total_po_loss = total_loss / n_codebooks total_kl = total_kl / n_codebooks + total_entropy = total_entropy / n_codebooks phoneme_aux_loss = policy_output.phoneme_loss if rollout_phoneme_input_type == 'gt' else None if phoneme_aux_loss is None: phoneme_aux_loss = torch.tensor(0.0, device=self.device) + + # Subtracting entropy encourages higher entropy (more 
exploration / prevents mode collapse). total_loss = total_po_loss + self.aux_phoneme_loss_weight * phoneme_aux_loss + if self.entropy_coeff > 0: + total_loss = total_loss - self.entropy_coeff * total_entropy return { 'loss': total_loss, 'po_loss': total_po_loss, 'phoneme_aux_loss': phoneme_aux_loss, 'kl_loss': total_kl, + 'entropy': total_entropy, 'used_gt_phoneme_input': float(rollout_phoneme_input_type == 'gt'), } @@ -948,6 +973,7 @@ def _run_teacher_forced_chunked_po( accumulated_po_loss = torch.tensor(0.0, device=self.device) accumulated_phoneme_aux_loss = torch.tensor(0.0, device=self.device) accumulated_kl_loss = torch.tensor(0.0, device=self.device) + accumulated_entropy = torch.tensor(0.0, device=self.device) used_gt_phoneme_input = 0.0 for group_start_idx, group_end_idx in self._iter_group_ranges(num_groups, groups_per_subbatch): @@ -1001,6 +1027,7 @@ def _run_teacher_forced_chunked_po( accumulated_phoneme_aux_loss + chunk_outputs['phoneme_aux_loss'].detach() * group_weight ) accumulated_kl_loss = accumulated_kl_loss + chunk_outputs['kl_loss'].detach() * group_weight + accumulated_entropy = accumulated_entropy + chunk_outputs['entropy'].detach() * group_weight used_gt_phoneme_input = max(used_gt_phoneme_input, chunk_outputs['used_gt_phoneme_input']) return { @@ -1008,6 +1035,7 @@ def _run_teacher_forced_chunked_po( 'po_loss': accumulated_po_loss, 'phoneme_aux_loss': accumulated_phoneme_aux_loss, 'kl_loss': accumulated_kl_loss, + 'entropy': accumulated_entropy, 'used_gt_phoneme_input': used_gt_phoneme_input, } @@ -1039,7 +1067,15 @@ def training_step(self, batch, batch_idx): ) teacher_forced_time_sec = time.perf_counter() - teacher_forced_start_time - # Compute gradient/weight metrics BEFORE optimizer.step() clears gradients. + # Clip gradients to prevent catastrophic updates from outlier batches. 
+ max_grad_norm = self.cfg.get('max_grad_norm', 1.0) + if max_grad_norm > 0: + torch.nn.utils.clip_grad_norm_( + [p for p in self.parameters() if p.requires_grad and p.grad is not None], + max_norm=max_grad_norm, + ) + + # Compute gradient/weight metrics AFTER clipping but BEFORE optimizer.step() clears them. grad_weight_metrics = self._compute_grad_and_weight_metrics() optimizer.step() @@ -1064,6 +1100,7 @@ def training_step(self, batch, batch_idx): self.log('train_po_loss', po_outputs['po_loss'], prog_bar=True, sync_dist=True) self.log('train_phoneme_aux_loss', po_outputs['phoneme_aux_loss'], prog_bar=True, sync_dist=True) self.log('train_kl_loss', po_outputs['kl_loss'], prog_bar=True, sync_dist=True) + self.log('train_entropy', po_outputs['entropy'], prog_bar=True, sync_dist=True) self.log('train_used_gt_phoneme_input', po_outputs['used_gt_phoneme_input'], prog_bar=True, sync_dist=True) self.log('train_mean_reward', generated_codes_and_metrics['mean_reward'], prog_bar=True, sync_dist=True) self.log('train_std_reward', generated_codes_and_metrics['std_reward'], prog_bar=True, sync_dist=True) From 1af65a9411a81f7502d8cbeed177111d63ff022c Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 18 Feb 2026 20:51:33 -0500 Subject: [PATCH 61/94] mamba config update Signed-off-by: Paarth Neekhara --- .../tts/conf/magpietts/easy_magpietts.yaml | 18 ++++++++++++++---- .../conf/magpietts/easy_magpietts_lhotse.yaml | 18 ++++++++++++++---- nemo/collections/tts/models/easy_magpietts.py | 3 ++- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index ef2ad794c2d0..3a9a274b624c 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -21,11 +21,11 @@ model: transformer_hf_backend: "Qwen/Qwen2.5-1.5B" # NemotronH config (used when decoder_type: "nemotron_h") - # This is a hybrid Mamba2/Attention model. 
Layer types are specified via hybrid_override_pattern: + # Hybrid Mamba2/MoE/Attention model (~3B total, ~600-800M active). Layer types via hybrid_override_pattern: # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer nemotron_h_config: hidden_size: 1536 # Should match embedding_dim - num_hidden_layers: 24 + num_hidden_layers: 48 vocab_size: 131072 # Attention config num_attention_heads: 12 @@ -47,8 +47,17 @@ model: intermediate_size: 4096 mlp_hidden_act: "silu" mlp_bias: false - # Layer pattern: alternating Mamba and Attention - hybrid_override_pattern: "M*M*M*M*M*M*M*M*M*M*M*M*" + # MoE config (scaled from Nemotron-3-Nano-30B-A3B) + n_routed_experts: 48 + num_experts_per_tok: 6 + moe_intermediate_size: 1024 + moe_shared_expert_intermediate_size: 2048 + n_group: 1 + topk_group: 1 + routed_scaling_factor: 2.5 + norm_topk_prob: true + # Layer pattern: (M E M E M *) x 8 => 16 Mamba, 16 MoE, 8 Attention + hybrid_override_pattern: "MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*" # Normalization layer_norm_epsilon: 1e-5 residual_in_fp32: true @@ -69,6 +78,7 @@ model: local_transformer_type: "autoregressive" # "none", "autoregressive" # Below args are only relevant if use_local_transformer is autoregressive local_transformer_loss_scale: 1.0 + phoneme_loss_weight: 1.0 local_transformer_n_layers: 3 local_transformer_n_heads: 12 local_transformer_hidden_dim: 1536 diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index a6330272a1da..459c7cd071df 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -20,11 +20,11 @@ model: transformer_hf_backend: "Qwen/Qwen2.5-1.5B" # NemotronH config (used when decoder_type: "nemotron_h") - # This is a hybrid Mamba2/Attention model. Layer types are specified via hybrid_override_pattern: + # Hybrid Mamba2/MoE/Attention model (~3B total, ~600-800M active). 
Layer types via hybrid_override_pattern: # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer nemotron_h_config: hidden_size: 1536 # Should match embedding_dim - num_hidden_layers: 24 + num_hidden_layers: 48 vocab_size: 131072 # Attention config num_attention_heads: 12 @@ -46,8 +46,17 @@ model: intermediate_size: 4096 mlp_hidden_act: "silu" mlp_bias: false - # Layer pattern: alternating Mamba and Attention - hybrid_override_pattern: "M*M*M*M*M*M*M*M*M*M*M*M*" + # MoE config (scaled from Nemotron-3-Nano-30B-A3B) + n_routed_experts: 48 + num_experts_per_tok: 6 + moe_intermediate_size: 1024 + moe_shared_expert_intermediate_size: 2048 + n_group: 1 + topk_group: 1 + routed_scaling_factor: 2.5 + norm_topk_prob: true + # Layer pattern: (M E M E M *) x 8 => 16 Mamba, 16 MoE, 8 Attention + hybrid_override_pattern: "MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*" # Normalization layer_norm_epsilon: 1e-5 residual_in_fp32: true @@ -66,6 +75,7 @@ model: local_transformer_type: "autoregressive" # "none", "autoregressive" # Below args are only relevant if use_local_transformer is autoregressive local_transformer_loss_scale: 1.0 + phoneme_loss_weight: 1.0 local_transformer_n_layers: 3 local_transformer_n_heads: 12 local_transformer_hidden_dim: 1536 diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 5dd61563788d..c2249ce43092 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -376,6 +376,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_corruption_batch_prob = cfg.get('phoneme_corruption_batch_prob', 0.0) self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) self.phoneme_corruption_unk_mode_prob = cfg.get('phoneme_corruption_unk_mode_prob', 0.5) + self.phoneme_loss_weight = cfg.get('phoneme_loss_weight', 1.0) if cfg.get('phoneme_tokenizer', None) is not None: 
self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) @@ -1980,7 +1981,7 @@ def process_batch( else: phoneme_loss = torch.tensor(0.0, device=logits.device) - loss = loss + phoneme_loss + loss = loss + self.phoneme_loss_weight * phoneme_loss return ProcessBatchOutput( loss=loss, From 89cee8f0a65d947dd714a7e398126acb96876e3b Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Wed, 18 Feb 2026 23:44:20 -0800 Subject: [PATCH 62/94] fix weight initialization bugs in mamba Signed-off-by: Paarth Neekhara --- nemo/collections/tts/modules/nemotron_h_decoder.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py index f89e0a8fd326..ba5aa25a77c0 100644 --- a/nemo/collections/tts/modules/nemotron_h_decoder.py +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -898,6 +898,7 @@ def __init__(self, config: NemotronHConfig): self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size), dtype=torch.float32)) self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts, dtype=torch.float32)) + nn.init.normal_(self.weight, mean=0.0, std=config.initializer_range) @torch.no_grad() def get_topk_indices(self, scores: torch.Tensor) -> torch.Tensor: @@ -1176,13 +1177,11 @@ def _init_weights(self): elif isinstance(module, nn.Embedding): nn.init.normal_(module.weight, std=self.config.initializer_range) - # Rescale prenorm residual weights for better training stability - # Following GPT-2 paper: scale by 1/sqrt(2 * n_layer) + # Rescale residual-branch output projections for better training stability. + # Apply 1/sqrt(num_hidden_layers) to Mamba, attention, and MLP/MoE branches. 
if self.config.rescale_prenorm_residual: for name, p in self.named_parameters(): - if "out_proj.weight" in name: - # Special Scaled Initialization for residual projections - # Scale by 1/sqrt(num_hidden_layers) + if any(k in name for k in ("out_proj.weight", "o_proj.weight", "down_proj.weight")): with torch.no_grad(): p /= math.sqrt(self.config.num_hidden_layers) From fb3343f27f793427e2803af7a87d7812625e0d79 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Fri, 20 Feb 2026 14:20:32 -0800 Subject: [PATCH 63/94] Magpietts decoderonly 2601 flash (#65) * config options Signed-off-by: Paarth Neekhara * flash attention and timing stats Signed-off-by: Paarth Neekhara * clean up timing code Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts.py | 9 +- .../tts/modules/nemotron_h_decoder.py | 131 +++++++++++++++++- 2 files changed, 131 insertions(+), 9 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index c2249ce43092..a69bb9b80801 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -377,6 +377,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) self.phoneme_corruption_unk_mode_prob = cfg.get('phoneme_corruption_unk_mode_prob', 0.5) self.phoneme_loss_weight = cfg.get('phoneme_loss_weight', 1.0) + self.parallel_codebook_loss_scale = cfg.get('parallel_codebook_loss_scale', 1.0) + self.local_transformer_loss_scale = cfg.get('local_transformer_loss_scale', 1.0) if cfg.get('phoneme_tokenizer', None) is not None: self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) @@ -1942,7 +1944,7 @@ def process_batch( # Compute codebook loss codebook_loss, _ = self.compute_loss(logits, 
audio_codes_target, audio_codes_lens_target) - loss = codebook_loss + loss = self.parallel_codebook_loss_scale * codebook_loss # Compute local transformer loss if applicable local_transformer_loss = None @@ -1955,8 +1957,7 @@ def process_batch( local_transformer_loss, _ = self.compute_loss( local_transformer_logits, audio_codes_target, audio_codes_lens_target ) - local_transformer_loss_scale = self.cfg.get('local_transformer_loss_scale', 1.0) - loss = loss + local_transformer_loss_scale * local_transformer_loss + loss = loss + self.local_transformer_loss_scale * local_transformer_loss # Compute phoneme loss if applicable phoneme_loss = None @@ -2167,7 +2168,7 @@ def validation_step(self, batch, batch_idx): if self.run_val_inference: infer_output = self.infer_batch( batch, - max_decoder_steps=300, + max_decoder_steps=330, temperature=0.7, topk=80, use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py index ba5aa25a77c0..ec30a1e7a699 100644 --- a/nemo/collections/tts/modules/nemotron_h_decoder.py +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -63,11 +63,19 @@ CAUSAL_CONV1D_AVAILABLE = False try: - from flash_attn import flash_attn_func + from transformers.utils.import_utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10 - FLASH_ATTN_AVAILABLE = True + if is_flash_attn_2_available(): + from transformers.modeling_flash_attention_utils import _flash_attention_forward + + FLASH_ATTN_AVAILABLE = True + else: + _flash_attention_forward = None + FLASH_ATTN_AVAILABLE = False except ImportError: - flash_attn_func = None + is_flash_attn_2_available = None + is_flash_attn_greater_or_equal_2_10 = None + _flash_attention_forward = None FLASH_ATTN_AVAILABLE = False @@ -858,6 +866,101 @@ def forward( return attn_output, None, past_key_value +class NemotronHFlashAttention2(NemotronHAttention): + 
""" + FlashAttention2 path for NemotronH attention. + + Falls back to eager/SDPA attention if flash-attn is not installed. + """ + + def __init__(self, config: NemotronHConfig, layer_idx: int): + super().__init__(config=config, layer_idx=layer_idx) + self._flash_attn_uses_top_left_mask = ( + not is_flash_attn_greater_or_equal_2_10() if is_flash_attn_greater_or_equal_2_10 is not None else True + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if not FLASH_ATTN_AVAILABLE or _flash_attention_forward is None: + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Query is [B, T, H, D] for flash-attn helper. + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + # Keep key/value as [B, H_kv, T, D] while updating cache. 
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if past_key_value is not None: + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Convert key/value to [B, T, H, D] for flash-attn helper. 
+ key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + sliding_window=getattr(self.config, "sliding_window", None), + is_causal=self.is_causal, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +NEMOTRONH_ATTENTION_CLASSES = { + "eager": NemotronHAttention, + "sdpa": NemotronHAttention, + "flash_attention_2": NemotronHFlashAttention2, +} + + class NemotronHMLP(nn.Module): """MLP layer for NemotronH.""" @@ -1082,7 +1185,15 @@ def __init__(self, config: NemotronHConfig, layer_idx: int): if self.block_type == "mamba": self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx) elif self.block_type == "attention": - self.mixer = NemotronHAttention(config, layer_idx=layer_idx) + attn_impl = config._attn_implementation + if attn_impl == "flash_attention_2" and not FLASH_ATTN_AVAILABLE: + logging.warning( + "NemotronH requested _attn_implementation='flash_attention_2' but flash-attn is unavailable. " + "Falling back to sdpa." 
+ ) + attn_impl = "sdpa" + attn_cls = NEMOTRONH_ATTENTION_CLASSES.get(attn_impl, NemotronHAttention) + self.mixer = attn_cls(config, layer_idx=layer_idx) elif self.block_type == "mlp": self.mixer = NemotronHMLP(config, layer_idx=layer_idx) elif self.block_type == "moe": @@ -1119,7 +1230,12 @@ def _forward_impl( if self.block_type == "mamba": hidden_states = self.mixer(hidden_states, cache_params=cache_params, cache_position=cache_position) elif self.block_type == "attention": - hidden_states = self.mixer(hidden_states, cache_position=cache_position, past_key_value=cache_params) + hidden_states = self.mixer( + hidden_states, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_value=cache_params, + ) hidden_states = hidden_states[0] elif self.block_type in ("mlp", "moe"): hidden_states = self.mixer(hidden_states) @@ -1284,6 +1400,11 @@ def forward( def _create_causal_mask(self, attention_mask, input_tensor, cache_position): """Create causal attention mask.""" + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and torch.any(attention_mask == 0): + return attention_mask + return None + dtype, device = input_tensor.dtype, input_tensor.device min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] From dab64378c8833ba57763253f05b2dbafe7b41006 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Fri, 20 Feb 2026 17:55:05 -0800 Subject: [PATCH 64/94] add do tts method Signed-off-by: Shehzeen Hussain --- examples/tts/magpietts_inference.py | 2 +- nemo/collections/tts/models/easy_magpietts.py | 125 ++++++++++++++++++ 2 files changed, 126 insertions(+), 1 deletion(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index feead6519875..e97fc0ea7e9e 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -558,7 +558,7 @@ def main(argv=None): if args.longform_mode in {'always', 'auto'}: 
model_inference_parameters["max_decoder_steps"] = args.longform_max_decoder_steps elif args.is_decoder_only_model: - model_inference_parameters["max_decoder_steps"] = 220 + model_inference_parameters["max_decoder_steps"] = 300 else: model_inference_parameters["max_decoder_steps"] = 440 diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index a69bb9b80801..91a790a8c2a5 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -3556,6 +3556,131 @@ def infer_batch( phoneme_prediction_start_idx=state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None, ) + @staticmethod + def _load_audio_for_inference(audio_path: str, target_sample_rate: int) -> torch.Tensor: + """ + Load context audio and resample if needed. + Returns tensor of shape (1, num_samples). + """ + audio, sr = sf.read(audio_path, dtype='float32') + if len(audio.shape) > 1: + audio = audio.mean(axis=1) + if sr != target_sample_rate: + import librosa + + audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) + return torch.from_numpy(audio).unsqueeze(0) + + @staticmethod + def _adjust_audio_to_duration_for_inference( + audio: torch.Tensor, + sample_rate: int, + target_duration: float, + codec_model_samples_per_frame: int, + ) -> torch.Tensor: + """ + Match the same duration-alignment logic used in magpietts_streaming_inference.py. 
+ """ + num_codec_frames = int(target_duration * sample_rate / codec_model_samples_per_frame) + target_num_samples = num_codec_frames * codec_model_samples_per_frame + current_num_samples = audio.size(1) + + if current_num_samples >= target_num_samples: + audio = audio[:, :target_num_samples] + else: + num_repeats = int(np.ceil(target_num_samples / current_num_samples)) + audio_repeated = audio.repeat(1, num_repeats) + audio = audio_repeated[:, :target_num_samples] + return audio + + def do_tts( + self, + transcript: str, + context_audio_file_path: Optional[str] = None, + context_text: str = "[NO TEXT CONTEXT]", + main_tokenizer_name: Optional[str] = None, + context_audio_duration: float = 5.0, + use_cfg: bool = True, + cfg_scale: float = 2.5, + use_local_transformer: bool = True, + temperature: float = 0.7, + topk: int = 80, + max_steps: int = 330, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Generate speech from transcript using EasyMagpie inference with optional context text/audio. + """ + if transcript is None or transcript.strip() == "": + raise ValueError("`transcript` must be a non-empty string.") + + device = next(self.parameters()).device + transcript = transcript.strip() + context_text = (context_text or "[NO TEXT CONTEXT]").strip() + + if main_tokenizer_name is None: + # Match model init behavior: default to first configured tokenizer. + main_tokenizer_name = list(self.cfg.text_tokenizers.keys())[0] + if main_tokenizer_name not in self.tokenizer.tokenizers: + raise ValueError( + f"Unknown main_tokenizer_name='{main_tokenizer_name}'. 
" + f"Available tokenizers: {list(self.tokenizer.tokenizers.keys())}" + ) + + text_tokens = self.tokenizer.encode(transcript, tokenizer_name=main_tokenizer_name) + [self.eos_id] + text = torch.tensor([text_tokens], dtype=torch.long, device=device) + text_lens = torch.tensor([len(text_tokens)], dtype=torch.long, device=device) + + context_text_tokens = self.tokenizer.encode(context_text, tokenizer_name=self.text_conditioning_tokenizer_name) + context_text_tensor = torch.tensor([context_text_tokens], dtype=torch.long, device=device) + context_text_lens = torch.tensor([len(context_text_tokens)], dtype=torch.long, device=device) + + if context_audio_file_path is not None and context_audio_file_path.strip() != "": + context_audio = self._load_audio_for_inference(context_audio_file_path, self.sample_rate) + context_audio = self._adjust_audio_to_duration_for_inference( + context_audio, + self.sample_rate, + context_audio_duration, + self.codec_model_samples_per_frame, + ) + context_audio = context_audio.to(device) + context_audio_lens = torch.tensor([context_audio.size(1)], dtype=torch.long, device=device) + with torch.inference_mode(): + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + else: + context_audio_codes = torch.zeros( + 1, + self.data_num_audio_codebooks, + 0, + dtype=torch.long, + device=device, + ) + context_audio_codes_lens = torch.zeros(1, dtype=torch.long, device=device) + + batch = { + 'text': text, + 'text_lens': text_lens, + 'context_text_tokens': context_text_tensor, + 'context_text_tokens_lens': context_text_lens, + 'context_audio_codes': context_audio_codes, + 'context_audio_codes_lens': context_audio_codes_lens, + } + + with torch.inference_mode(): + output = self.infer_batch( + batch=batch, + max_decoder_steps=max_steps, + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer_for_inference=use_local_transformer, + phoneme_input_type='pred', + 
phoneme_sampling_method='argmax', + use_teacher_forced=False, + use_inference_mode=True, + ) + return output.predicted_audio, output.predicted_audio_lens + @classmethod def list_available_models(cls) -> List[PretrainedModelInfo]: return [] From d58581b22c0b6b7f9e2a68654eda38df1ca2e482 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sun, 22 Feb 2026 22:23:11 -0500 Subject: [PATCH 65/94] bug fix Signed-off-by: Paarth Neekhara --- examples/tts/magpietts_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index e97fc0ea7e9e..19085f78eb96 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -510,7 +510,7 @@ def create_argument_parser() -> argparse.ArgumentParser: target_group.add_argument('--legacy_context_stacking', action='store_true', help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking') target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) target_group.add_argument( - '--phoneme_sampling_method', type=str, default='greedy', choices=['greedy', 'multinomial'] + '--phoneme_sampling_method', type=str, default='argmax', choices=['argmax', 'multinomial'] ) target_group.add_argument('--dropout_text_input', action='store_true') From 9ec6767637f31b5d37c634654dc9e0cc9e6f17ac Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 23 Feb 2026 22:37:40 -0800 Subject: [PATCH 66/94] Magpietts decoderonly 2601 utmos po (#67) * add utmos to PO Signed-off-by: Shehzeen Hussain * utmos in PO Signed-off-by: Shehzeen Hussain * whisper update Signed-off-by: Shehzeen Hussain * batched utmos Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 97 +++++++++-- .../easy_magpietts_preference_optimization.py | 161 +++++++++++++----- 
.../magpietts_preference_optimization.py | 20 ++- nemo/collections/tts/modules/utmosv2.py | 13 +- nemo/collections/tts/parts/utils/helpers.py | 136 +++++++++++++-- 5 files changed, 351 insertions(+), 76 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 91a790a8c2a5..680a313618e6 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -51,15 +51,24 @@ cosine_schedule, ) from nemo.collections.tts.parts.utils.helpers import ( + compute_utmos_scores_from_filepaths, get_mask_from_lengths, get_speaker_embeddings_from_filepaths, process_text_for_cer, transcribe_with_whisper, + transcribe_with_whisper_from_filepaths, ) from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +try: + from nemo.collections.tts.modules.utmosv2 import UTMOSv2Calculator + + HAVE_UTMOSV2 = True +except (ImportError, ModuleNotFoundError): + HAVE_UTMOSV2 = False + @dataclass class TrainingMode: @@ -562,6 +571,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._eval_speaker_verification_model.freeze() logging.info("Eval models loaded successfully.") + # UTMOSv2 naturalness scoring for validation (optional) + self.use_utmos = cfg.get('use_utmos', False) + if self.use_utmos: + assert HAVE_UTMOSV2, ( + "UTMOSv2 is required for UTMOS scoring but is not installed. " + "Install it with: pip install git+https://github.com/sarulab-speech/UTMOSv2.git@v1.2.1" + ) + self._utmos_calculator = UTMOSv2Calculator(device='cpu') + logging.info("UTMOSv2 calculator initialized for validation naturalness scoring") + def setup_optimizer_param_groups(self): """ Override to exclude frozen eval/inference-only models from the optimizer. 
@@ -575,6 +594,7 @@ def setup_optimizer_param_groups(self): '_eval_speaker_verification_model', 'whisper_model', 'whisper_processor', + '_utmos_calculator', } # Collect parameter ids to exclude @@ -610,6 +630,7 @@ def state_dict(self, destination=None, prefix='', keep_vars=False): '_eval_speaker_verification_model', 'whisper_model', 'whisper_processor', + '_utmos_calculator', ] for key in list(state_dict.keys()): if any([substring in key for substring in keys_substrings_to_exclude]): @@ -633,6 +654,7 @@ def load_state_dict(self, state_dict, strict=True): '_eval_speaker_verification_model', 'whisper_model', 'whisper_processor', + '_utmos_calculator', ]: continue if any(param.numel() > 0 for param in child.parameters()): @@ -2243,21 +2265,34 @@ def validation_step(self, batch, batch_idx): languages = batch.get('languages', None) if languages is None: languages = ['en'] * len(predicted_audio_paths) - pred_transcripts = [] - for audio_path, lang in zip(predicted_audio_paths, languages): - try: - transcript = transcribe_with_whisper( - audio_path, - lang, - self.whisper_processor, - self.whisper_model, - self.device, - normalizer=None, - ) - pred_transcripts.append(process_text_for_cer(transcript)) - except Exception as e: - logging.warning(f"Val ASR transcription failed for {audio_path}: {e}") - pred_transcripts.append(None) + try: + transcripts = transcribe_with_whisper_from_filepaths( + audio_filepaths=predicted_audio_paths, + language=languages, + whisper_processor=self.whisper_processor, + whisper_model=self.whisper_model, + device=self.device, + normalizer=None, + ) + pred_transcripts = [process_text_for_cer(transcript) for transcript in transcripts] + except Exception as e: + logging.warning(f"Val batched ASR transcription failed, falling back to per-file mode: {e}") + pred_transcripts = [] + for item_idx, audio_path in enumerate(predicted_audio_paths): + lang = languages[item_idx] if item_idx < len(languages) else 'en' + try: + transcript = 
transcribe_with_whisper( + audio_path, + lang, + self.whisper_processor, + self.whisper_model, + self.device, + normalizer=None, + ) + pred_transcripts.append(process_text_for_cer(transcript)) + except Exception as inner_e: + logging.warning(f"Val ASR transcription failed for {audio_path}: {inner_e}") + pred_transcripts.append(None) else: pred_transcripts = self._eval_asr_model.transcribe( predicted_audio_paths, @@ -2280,8 +2315,23 @@ def validation_step(self, batch, batch_idx): logging.warning(f"Val speaker embeddings failed: {e}") pred_embeddings = ctx_embeddings = None + utmos_scores = None + if getattr(self, 'use_utmos', False) and hasattr(self, '_utmos_calculator'): + utmos_batch_size = max(int(self.cfg.get('utmos_batch_size', len(predicted_audio_paths))), 1) + utmos_num_workers = max(int(self.cfg.get('utmos_num_workers', 0)), 0) + try: + utmos_scores = compute_utmos_scores_from_filepaths( + audio_filepaths=predicted_audio_paths, + utmos_calculator=self._utmos_calculator, + batch_size=utmos_batch_size, + num_workers=utmos_num_workers, + rank_tag=str(self.global_rank), + ) + except Exception as e: + raise RuntimeError(f"Val UTMOSv2 batched scoring failed: {e}") from e + # Compute per-sample metrics for successful cases only - batch_cer, batch_wer, batch_ssim = [], [], [] + batch_cer, batch_wer, batch_ssim, batch_utmos = [], [], [], [] for idx in range(len(predicted_audio_paths)): if pred_transcripts[idx] is None: continue @@ -2296,9 +2346,16 @@ def validation_step(self, batch, batch_idx): ctx_emb = ctx_embeddings[idx].cpu().float().numpy() ssim = float(np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb))) batch_ssim.append(ssim) + + # UTMOSv2 naturalness score (MOS on 1-5 scale) + utmos_score = None if utmos_scores is None else float(utmos_scores[idx]) + if utmos_score is not None: + batch_utmos.append(utmos_score) + + utmos_str = f", UTMOS={utmos_score:.4f}" if utmos_score is not None else "" logging.info( f"[Val] 
rank{self.global_rank}_batch{batch_idx}_idx{idx}: " - f"CER={cer:.4f}, WER={wer:.4f} | GT: '{gt_transcript[:50]}...' | Pred: '{pred_transcripts[idx][:50]}...'" + f"CER={cer:.4f}, WER={wer:.4f}{utmos_str} | GT: '{gt_transcript[:50]}...' | Pred: '{pred_transcripts[idx][:50]}...'" ) # Save per-audio metrics JSON file alongside the audio file @@ -2307,6 +2364,7 @@ def validation_step(self, batch, batch_idx): 'cer': float(cer), 'wer': float(wer), 'ssim': ssim, + 'utmos': utmos_score, 'gt_transcript': gt_transcript, 'pred_transcript': pred_transcripts[idx], 'audio_path': predicted_audio_paths[idx], @@ -2331,6 +2389,8 @@ def validation_step(self, batch, batch_idx): val_output['val_wer_list'] = batch_wer if batch_ssim: val_output['val_ssim'] = torch.tensor(np.mean(batch_ssim), device=self.device) + if batch_utmos: + val_output['val_utmos'] = torch.tensor(np.mean(batch_utmos), device=self.device) self.validation_step_outputs.append(val_output) @@ -2363,6 +2423,7 @@ def collect_if_exists(key): val_cer = collect_if_exists("val_cer") val_wer = collect_if_exists("val_wer") val_ssim = collect_if_exists("val_ssim") + val_utmos = collect_if_exists("val_utmos") if val_cer is not None: self.log("val/cer", val_cer, prog_bar=True, sync_dist=True) @@ -2370,6 +2431,8 @@ def collect_if_exists(key): self.log("val/wer", val_wer, prog_bar=True, sync_dist=True) if val_ssim is not None: self.log("val/ssim", val_ssim, prog_bar=True, sync_dist=True) + if val_utmos is not None: + self.log("val/utmos", val_utmos, prog_bar=True, sync_dist=True) if self.use_multilingual_asr: lang_cer = {} diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index d5c94fec59b1..45d9bd542b59 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -29,10 +29,11 @@ from 
nemo.collections.asr.parts.mixins.transcription import TranscribeConfig from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.parts.utils.helpers import ( + compute_utmos_scores_from_filepaths, get_mask_from_lengths, get_speaker_embeddings_from_filepaths, process_text_for_cer, - transcribe_with_whisper, + transcribe_with_whisper_from_filepaths, ) from nemo.utils import logging @@ -52,6 +53,13 @@ Normalizer = None PYNINI_AVAILABLE = False +try: + from nemo.collections.tts.modules.utmosv2 import UTMOSv2Calculator + + HAVE_UTMOSV2 = True +except (ImportError, ModuleNotFoundError): + HAVE_UTMOSV2 = False + class EasyMagpieTTSModelOnlinePO(EasyMagpieTTSModel): """ @@ -59,7 +67,7 @@ class EasyMagpieTTSModelOnlinePO(EasyMagpieTTSModel): Training flow: 1. Sample multiple generations per prompt. - 2. Compute rewards (CER/SSIM/PESQ). + 2. Compute rewards (CER/SSIM/PESQ/UTMOSv2). 3. Compute group-normalized advantages. 4. Run teacher-forced policy forward on generated codes and optimize GRPO objective. 5. Add auxiliary phoneme loss from the same forward pass with GT phoneme tokens. @@ -118,6 +126,16 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): assert HAVE_TORCHAUDIO, "torchaudio is required for PESQ reward." self.squim_objective_model = SQUIM_OBJECTIVE.get_model() + self.use_utmos = self.cfg.get('use_utmos', False) + if self.use_utmos: + assert HAVE_UTMOSV2, ( + "UTMOSv2 is required for the UTMOS reward but is not installed. " + "Install it with: pip install git+https://github.com/sarulab-speech/UTMOSv2.git@v1.2.1" + ) + # Initialize on CPU; we score from saved wav files so no GPU needed. 
+ self._utmos_calculator = UTMOSv2Calculator(device='cpu') + logging.info("UTMOSv2 calculator initialized for naturalness reward") + self.loss_type = self.cfg.get('loss_type', 'grpo') if self.loss_type not in ['grpo', 'dr_grpo']: raise ValueError( @@ -151,6 +169,7 @@ def _get_trainable_module_groups(self) -> Dict[str, List[torch.nn.Parameter]]: '_speaker_verification_model', '_codec_model', '_eval_asr_model', '_eval_speaker_verification_model', '_reference_model', 'whisper_model', 'whisper_processor', 'squim_objective_model', + '_utmos_calculator', } groups: Dict[str, List[torch.nn.Parameter]] = {} for name, module in self.named_children(): @@ -259,6 +278,7 @@ def setup_optimizer_param_groups(self): '_reference_model', 'whisper_model', 'whisper_processor', + '_utmos_calculator', # These modules are not used by the PO loss and receive no gradients. # Including them would only apply weight decay, degrading their weights. 'final_proj', @@ -277,7 +297,7 @@ def setup_optimizer_param_groups(self): def state_dict(self, destination=None, prefix='', keep_vars=False): state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) - keys_substrings_to_exclude = ['_reference_model'] + keys_substrings_to_exclude = ['_reference_model', '_utmos_calculator'] for key in list(state_dict.keys()): if any(substring in key for substring in keys_substrings_to_exclude): del state_dict[key] @@ -496,18 +516,86 @@ def _print_group_cer_wer_table( f"{item_metrics['cer_gt']:.4f}", f"{item_metrics['wer_gt']:.4f}", f"{item_metrics['spk_similarity']:.4f}", + f"{item_metrics.get('utmos', 0.0):.4f}", f"{item_metrics['reward']:.4f}", f"{item_metrics.get('advantage', 0.0):.4f}", ] ) - table = self._format_text_table(headers=["item", "cer", "wer", "ssim", "reward", "advantage"], rows=rows) + table = self._format_text_table(headers=["item", "cer", "wer", "ssim", "utmos", "reward", "advantage"], rows=rows) print( f"[generate_and_reward] group={group_idx} 
valid={is_group_valid} " f"mean_reward={mean_reward:.4f} std_reward={std_reward:.4f}\n" f"prompt: {prompt_text}\n{table}\n" ) + def _compute_pred_transcripts(self, predicted_audio_paths: List[str], batch_repeated: Dict, reward_asr_model: str) -> List[str]: + if reward_asr_model == 'nemo': + pred_transcripts = self._eval_asr_model.transcribe( + predicted_audio_paths, + batch_size=len(predicted_audio_paths), + override_config=TranscribeConfig(use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0), + ) + return [process_text_for_cer(transcript.text) for transcript in pred_transcripts] + + self.whisper_model.to(self.device) + pred_transcripts = [""] * len(predicted_audio_paths) + langs = batch_repeated.get('languages', ['en'] * len(predicted_audio_paths)) + language_groups = {} + for item_idx, audio_path in enumerate(predicted_audio_paths): + language = langs[item_idx] if item_idx < len(langs) else 'en' + language_groups.setdefault(language, []).append((item_idx, audio_path)) + + for language, grouped_items in language_groups.items(): + normalizer = self._get_cached_normalizer(language) if self._normalize_whisper_transcript else None + grouped_paths = [audio_path for _, audio_path in grouped_items] + group_transcripts = transcribe_with_whisper_from_filepaths( + audio_filepaths=grouped_paths, + language=language, + whisper_processor=self.whisper_processor, + whisper_model=self.whisper_model, + device=self.device, + normalizer=normalizer, + ) + for (item_idx, _), transcript in zip(grouped_items, group_transcripts): + pred_transcripts[item_idx] = process_text_for_cer(transcript) + return pred_transcripts + + def _compute_speaker_embeddings_parallel( + self, predicted_audio_paths: List[str], batch: Dict, num_generations_per_item: int + ): + reference_audio_paths = self._get_reference_audio_paths(batch) + pred_speaker_embeddings = get_speaker_embeddings_from_filepaths( + predicted_audio_paths, self._eval_speaker_verification_model, self.device + ) + 
gt_speaker_embeddings = get_speaker_embeddings_from_filepaths( + reference_audio_paths, self._eval_speaker_verification_model, self.device + ) + if num_generations_per_item > 1: + gt_speaker_embeddings = gt_speaker_embeddings.repeat_interleave(num_generations_per_item, dim=0) + + if gt_speaker_embeddings.size(0) != pred_speaker_embeddings.size(0): + raise RuntimeError( + f"Speaker embedding size mismatch. GT={gt_speaker_embeddings.size(0)}, " + f"Pred={pred_speaker_embeddings.size(0)}." + ) + return pred_speaker_embeddings, gt_speaker_embeddings + + def _compute_utmos_scores_batched(self, predicted_audio_paths: List[str]) -> List[float]: + if not self.use_utmos: + return [0.0] * len(predicted_audio_paths) + if len(predicted_audio_paths) == 0: + return [] + utmos_batch_size = max(int(self.cfg.get('utmos_batch_size', len(predicted_audio_paths))), 1) + utmos_num_workers = max(int(self.cfg.get('utmos_num_workers', 0)), 0) + return compute_utmos_scores_from_filepaths( + audio_filepaths=predicted_audio_paths, + utmos_calculator=self._utmos_calculator, + batch_size=utmos_batch_size, + num_workers=utmos_num_workers, + rank_tag=str(self.global_rank), + ) + def generate_and_reward( self, batch: Dict, @@ -566,57 +654,31 @@ def generate_and_reward( audio_durations = [int(predicted_audio_lens[idx].item()) / self.output_sample_rate for idx in range(predicted_audio.size(0))] rewarding_start_time = time.perf_counter() - if reward_asr_model == 'nemo': - pred_transcripts = self._eval_asr_model.transcribe( - predicted_audio_paths, - batch_size=len(predicted_audio_paths), - override_config=TranscribeConfig(use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0), - ) - pred_transcripts = [process_text_for_cer(transcript.text) for transcript in pred_transcripts] - else: - self.whisper_model.to(self.device) - pred_transcripts = [] - langs = batch_repeated.get('languages', ['en'] * len(predicted_audio_paths)) - for item_idx, audio_path in enumerate(predicted_audio_paths): 
- language = langs[item_idx] if item_idx < len(langs) else 'en' - normalizer = self._get_cached_normalizer(language) if self._normalize_whisper_transcript else None - print(f"Transcribing audio {audio_path} with language {language}") - transcript = transcribe_with_whisper( - audio_filepath=audio_path, - language=language, - whisper_processor=self.whisper_processor, - whisper_model=self.whisper_model, - device=self.device, - normalizer=normalizer, - ) - print(f"Pred Transcript: {transcript}") - print(f"Normalized Pred Text: {process_text_for_cer(transcript)}") - print(f"Raw Text: {batch_repeated['raw_texts'][item_idx]}") - print("--------------------------------") - pred_transcripts.append(process_text_for_cer(transcript)) - - reference_audio_paths = self._get_reference_audio_paths(batch_repeated) + pred_transcripts = self._compute_pred_transcripts(predicted_audio_paths, batch_repeated, reward_asr_model) try: - pred_speaker_embeddings = get_speaker_embeddings_from_filepaths( - predicted_audio_paths, self._eval_speaker_verification_model, self.device - ) - gt_speaker_embeddings = get_speaker_embeddings_from_filepaths( - reference_audio_paths, self._eval_speaker_verification_model, self.device + pred_speaker_embeddings, gt_speaker_embeddings = self._compute_speaker_embeddings_parallel( + predicted_audio_paths, batch, num_generations_per_item ) except Exception as e: logging.warning(f"Speaker-embedding reward failed. Falling back to zero SSIM reward. 
Error: {e}") pred_speaker_embeddings = None gt_speaker_embeddings = None + utmos_scores = self._compute_utmos_scores_batched(predicted_audio_paths) batch_metrics = [] cer_reward_weight = self.cfg.get('cer_reward_weight', 0.5) ssim_reward_weight = self.cfg.get('ssim_reward_weight', 0.5) pesq_reward_weight = self.cfg.get('pesq_reward_weight', 0.0) + utmos_reward_weight = self.cfg.get('utmos_reward_weight', 0.0) min_valid_codes_len = self.cfg.get('min_valid_codes_len', 4) max_valid_codes_len = self.cfg.get( 'max_valid_codes_len', self.max_decoder_steps * self.frame_stacking_factor - 1 ) + # UTMOSv2 reward shaping parameters (MOS scale is 1–5). + mean_utmos_dataset = self.cfg.get('mean_utmos_dataset', 3.5) + best_utmos_achievable = self.cfg.get('best_utmos_achievable', 4.5) + for idx in range(predicted_audio.size(0)): pred_transcript = pred_transcripts[idx] gt_transcript = process_text_for_cer(batch_repeated['raw_texts'][idx]) @@ -641,6 +703,8 @@ def generate_and_reward( else: pesq_hyp = 0.0 + utmos_score = utmos_scores[idx] + item_metrics = { 'cer_gt': float(cer_gt), 'wer_gt': float(wer_gt), @@ -650,6 +714,7 @@ def generate_and_reward( 'gt_transcript': gt_transcript, 'codes_len': int(predicted_codes_lens[idx].item()), 'pesq': float(pesq_hyp), + 'utmos': float(utmos_score), } best_ssim_achievable = self.cfg.get('best_ssim_achievable', 0.9) @@ -671,10 +736,27 @@ def generate_and_reward( spk_similarity_reward = 0.5 - 0.5 * (mean_ssim_dataset - item_ssim) / max(mean_ssim_dataset, 1e-8) pesq_reward = item_metrics['pesq'] / 4.5 if use_pesq else 0.0 + + # UTMOSv2 reward: piecewise linear shaping centered on mean_utmos_dataset, + # analogous to the CER and SSIM reward shaping. 
+ if self.use_utmos: + item_utmos = max(min(utmos_score, best_utmos_achievable), 1.0) + if item_utmos >= mean_utmos_dataset: + utmos_reward = 0.5 + 0.5 * (item_utmos - mean_utmos_dataset) / max( + best_utmos_achievable - mean_utmos_dataset, 1e-8 + ) + else: + utmos_reward = 0.5 - 0.5 * (mean_utmos_dataset - item_utmos) / max( + mean_utmos_dataset - 1.0, 1e-8 + ) + else: + utmos_reward = 0.0 + reward = ( cer_reward * cer_reward_weight + spk_similarity_reward * ssim_reward_weight + pesq_reward * pesq_reward_weight + + utmos_reward * utmos_reward_weight ) if (item_metrics['codes_len'] >= max_valid_codes_len) or (item_metrics['codes_len'] <= min_valid_codes_len): item_metrics['_needs_group_min_reward'] = True @@ -684,6 +766,7 @@ def generate_and_reward( item_metrics['cer_reward'] = float(cer_reward) item_metrics['spk_similarity_reward'] = float(spk_similarity_reward) item_metrics['pesq_reward'] = float(pesq_reward) + item_metrics['utmos_reward'] = float(utmos_reward) item_metrics['reward'] = float(reward) batch_metrics.append(item_metrics) diff --git a/nemo/collections/tts/models/magpietts_preference_optimization.py b/nemo/collections/tts/models/magpietts_preference_optimization.py index a6d11f6ac1ae..d754f5718130 100644 --- a/nemo/collections/tts/models/magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/magpietts_preference_optimization.py @@ -30,6 +30,7 @@ get_speaker_embeddings_from_filepaths, process_text_for_cer, transcribe_with_whisper, + transcribe_with_whisper_from_filepaths, ) from nemo.utils import logging @@ -661,14 +662,25 @@ def generate_and_reward( ) pred_transcripts = [process_text_for_cer(transcript.text) for transcript in pred_transcripts] elif self.cfg.get("reward_asr_model", "nemo") == "whisper": - pred_transcripts = [] + pred_transcripts = [""] * len(predicted_audio_paths) + language_groups = {} for item_idx, audio_path in enumerate(predicted_audio_paths): language = batch_repeated['languages'][item_idx] + 
language_groups.setdefault(language, []).append((item_idx, audio_path)) + + for language, grouped_items in language_groups.items(): normalizer = self._get_cached_normalizer(language) if self._normalize_whisper_transcript else None - transcript = transcribe_with_whisper( - audio_path, language, self.whisper_processor, self.whisper_model, self.device, normalizer + grouped_paths = [audio_path for _, audio_path in grouped_items] + grouped_transcripts = transcribe_with_whisper_from_filepaths( + audio_filepaths=grouped_paths, + language=language, + whisper_processor=self.whisper_processor, + whisper_model=self.whisper_model, + device=self.device, + normalizer=normalizer, ) - pred_transcripts.append(transcript) + for (item_idx, _), transcript in zip(grouped_items, grouped_transcripts): + pred_transcripts[item_idx] = transcript pred_transcripts = [process_text_for_cer(transcript) for transcript in pred_transcripts] else: # Address CodeQL issue where pred_transcripts might be undefined for future code diff --git a/nemo/collections/tts/modules/utmosv2.py b/nemo/collections/tts/modules/utmosv2.py index fb1dc76d17bd..46b17316d0ea 100644 --- a/nemo/collections/tts/modules/utmosv2.py +++ b/nemo/collections/tts/modules/utmosv2.py @@ -62,21 +62,28 @@ def __call__(self, file_path): mos_score = self.model.predict(input_path=file_path, num_repetitions=1, num_workers=0) return mos_score - def process_directory(self, input_dir: str, batch_size: int = 16) -> list[dict[str, str | float]]: + def process_directory( + self, input_dir: str, batch_size: int = 16, num_workers: int = None + ) -> list[dict[str, str | float]]: """ Computes UTMOSv2 scores for all `*.wav` files in the given directory. Args: input_dir: The directory containing the audio files. - batch_size: The number of audio files to process in parallel. + batch_size: The number of audio files per scoring batch. + num_workers: Number of worker processes used by UTMOS internals. + Set to 0 to avoid multiprocessing pickling issues. 
Returns: A list of dictionaries, each containing the file path and the UTMOSv2 score. """ + if num_workers is None: + num_workers = batch_size + with torch.inference_mode(): # UTMOSV2 tends to launch many of OpenMP threads which overloads the machine's CPUs # while actually slowing down the prediction. Limit the number of threads here. with threadpool_limits(limits=1): results = self.model.predict( - input_dir=input_dir, num_repetitions=1, num_workers=batch_size, batch_size=batch_size + input_dir=input_dir, num_repetitions=1, num_workers=num_workers, batch_size=batch_size ) return results diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index a8ee48ce57ef..cf6dbbdcd494 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -43,8 +43,12 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import string +import os +import shutil +import tempfile from enum import Enum -from typing import Any, Optional, Tuple +from collections import defaultdict +from typing import Any, List, Optional, Sequence, Tuple, Union import librosa import matplotlib.pylab as plt @@ -845,19 +849,69 @@ def transcribe_with_whisper( """ Transcribe audio with Whisper. Optionally normalize the transcript if a normalizer is provided. 
""" - speech_array, sampling_rate = librosa.load(audio_filepath, sr=16000) - forced_decoder_ids = ( - whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe") if language else None + transcripts = transcribe_with_whisper_from_filepaths( + audio_filepaths=[audio_filepath], + language=language, + whisper_processor=whisper_processor, + whisper_model=whisper_model, + device=device, + normalizer=normalizer, ) - inputs = whisper_processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt").input_features - inputs = inputs.to(device) - with torch.no_grad(): - predicted_ids = whisper_model.generate(inputs, forced_decoder_ids=forced_decoder_ids) - transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True) - result = transcription[0] - if normalizer is not None: - result = normalizer.normalize(result) - return result + return transcripts[0] + + +def transcribe_with_whisper_from_filepaths( + audio_filepaths: Sequence[str], + language: Optional[Union[str, Sequence[Optional[str]]]], + whisper_processor: Any, + whisper_model: Any, + device: torch.device, + normalizer: Optional[Any] = None, + batch_size: Optional[int] = None, +) -> List[str]: + """ + Transcribe a list of audios with Whisper using batched inference. + Supports a single language for all files or per-file language values. + """ + if len(audio_filepaths) == 0: + return [] + + if batch_size is None: + batch_size = len(audio_filepaths) + if batch_size <= 0: + raise ValueError(f"batch_size must be > 0, but received: {batch_size}") + + if isinstance(language, str) or language is None: + languages = [language] * len(audio_filepaths) + else: + if len(language) != len(audio_filepaths): + raise ValueError( + f"Expected len(language) == len(audio_filepaths), but got {len(language)} and {len(audio_filepaths)}." 
+ ) + languages = list(language) + + grouped_indices = defaultdict(list) + for idx, lang in enumerate(languages): + grouped_indices[lang].append(idx) + + transcripts = [""] * len(audio_filepaths) + for lang, indices in grouped_indices.items(): + forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=lang, task="transcribe") if lang else None + for start_idx in range(0, len(indices), batch_size): + batch_indices = indices[start_idx : start_idx + batch_size] + speech_arrays = [librosa.load(audio_filepaths[idx], sr=16000)[0] for idx in batch_indices] + inputs = whisper_processor( + speech_arrays, sampling_rate=16000, return_tensors="pt", padding=True + ).input_features.to(device) + with torch.no_grad(): + predicted_ids = whisper_model.generate(inputs, forced_decoder_ids=forced_decoder_ids) + batch_transcripts = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True) + if normalizer is not None: + batch_transcripts = [normalizer.normalize(text) for text in batch_transcripts] + for idx, text in zip(batch_indices, batch_transcripts): + transcripts[idx] = text + + return transcripts def get_speaker_embeddings_from_filepaths(filepaths, speaker_verification_model, device): @@ -883,3 +937,59 @@ def get_speaker_embeddings_from_filepaths(filepaths, speaker_verification_model, ) return speaker_embeddings + + +def compute_utmos_scores_from_filepaths( + audio_filepaths: Sequence[str], + utmos_calculator: Any, + batch_size: int = 8, + num_workers: int = 0, + rank_tag: str = "0", +) -> List[float]: + """ + Compute UTMOS scores in strict batched mode for a list of wav filepaths. 
+ + Expected UTMOS batch output schema (per item): + {'file_path': <str>, 'predicted_mos': <float>} + """ + if len(audio_filepaths) == 0: + return [] + + batch_size = max(int(batch_size), 1) + num_workers = max(int(num_workers), 0) + scores = [0.0] * len(audio_filepaths) + + with tempfile.TemporaryDirectory(prefix=f"utmos_rank{rank_tag}_") as tmp_dir: + file_to_idx = {} + for idx, src_path in enumerate(audio_filepaths): + tmp_name = f"{idx:06d}.wav" + tmp_path = os.path.join(tmp_dir, tmp_name) + try: + os.symlink(src_path, tmp_path) + except OSError: + try: + os.link(src_path, tmp_path) + except OSError: + shutil.copy2(src_path, tmp_path) + file_to_idx[tmp_name] = idx + + batch_results = utmos_calculator.process_directory(tmp_dir, batch_size=batch_size, num_workers=num_workers) + if not isinstance(batch_results, list): + raise RuntimeError(f"Unexpected UTMOSv2 output type: {type(batch_results)}") + + for item in batch_results: + if not isinstance(item, dict): + raise RuntimeError(f"Unexpected UTMOSv2 batch item type: {type(item)}") + if 'file_path' not in item or 'predicted_mos' not in item: + raise RuntimeError( + "Unexpected UTMOSv2 batch item schema. Expected keys: 'file_path' and 'predicted_mos'. " + f"Got keys: {list(item.keys())}" + ) + idx = file_to_idx.get(os.path.basename(str(item['file_path']))) + if idx is None: + raise RuntimeError( + f"UTMOSv2 returned unknown file path '{item['file_path']}' that does not map to this batch." 
+ ) + scores[idx] = float(item['predicted_mos']) + + return scores From acc05a10f89c82f0870b232a0ca3455c1133af71 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 24 Feb 2026 14:43:22 -0500 Subject: [PATCH 67/94] full phoneme channel dropout option Signed-off-by: Paarth Neekhara --- examples/tts/conf/magpietts/easy_magpietts.yaml | 1 + examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml | 1 + nemo/collections/tts/models/easy_magpietts.py | 5 +++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 3a9a274b624c..a668686dc28c 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -104,6 +104,7 @@ model: phoneme_corruption_batch_prob: 0.1 phoneme_corruption_timestep_ratio: 0.15 phoneme_corruption_unk_mode_prob: 0.5 + phoneme_corruption_type: "repeat_skip_unk" # "repeat_skip_unk" or "complete_channel" phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 459c7cd071df..6eb4d03a98d2 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -99,6 +99,7 @@ model: phoneme_corruption_batch_prob: 0.1 phoneme_corruption_timestep_ratio: 0.15 phoneme_corruption_unk_mode_prob: 0.5 + phoneme_corruption_type: "repeat_skip_unk" # "repeat_skip_unk" or "complete_channel" phoneme_tokenizer: _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 680a313618e6..bb6d8c208f1f 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -385,6 +385,7 
@@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_corruption_batch_prob = cfg.get('phoneme_corruption_batch_prob', 0.0) self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) self.phoneme_corruption_unk_mode_prob = cfg.get('phoneme_corruption_unk_mode_prob', 0.5) + self.phoneme_corruption_type = cfg.get('phoneme_corruption_type', 'repeat_skip_unk') self.phoneme_loss_weight = cfg.get('phoneme_loss_weight', 1.0) self.parallel_codebook_loss_scale = cfg.get('parallel_codebook_loss_scale', 1.0) self.local_transformer_loss_scale = cfg.get('local_transformer_loss_scale', 1.0) @@ -1847,8 +1848,8 @@ def process_batch( dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: # Corrupt phonemes only when text input is not dropped. - apply_phoneme_corruption = mode == 'train' and (not dropout_text_input) and (not dropout_conditional_input) - dropout_complete_phoneme_channel = dropout_conditional_input + apply_phoneme_corruption = mode == 'train' and (not dropout_text_input) and (not dropout_conditional_input) and self.phoneme_corruption_type == 'repeat_skip_unk' + dropout_complete_phoneme_channel = mode == 'train' and ( dropout_conditional_input or (self.phoneme_corruption_type == 'complete_channel' and torch.rand(1).item() < self.phoneme_corruption_batch_prob)) ( phoneme_channel_embedding, phoneme_channel_lens, From 2c4520b41850e2f725a3d9ceaa78532e22f05d12 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Fri, 27 Feb 2026 01:12:54 -0800 Subject: [PATCH 68/94] gt phoneme option in do_tts Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/models/easy_magpietts.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index bb6d8c208f1f..f7b254ec7977 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ 
b/nemo/collections/tts/models/easy_magpietts.py @@ -3670,9 +3670,11 @@ def do_tts( temperature: float = 0.7, topk: int = 80, max_steps: int = 330, + gt_phoneme_text: Optional[str] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """ Generate speech from transcript using EasyMagpie inference with optional context text/audio. + Optionally accepts ground-truth phoneme text (IPA string) for decoder-only inference. """ if transcript is None or transcript.strip() == "": raise ValueError("`transcript` must be a non-empty string.") @@ -3728,6 +3730,19 @@ def do_tts( 'context_audio_codes': context_audio_codes, 'context_audio_codes_lens': context_audio_codes_lens, } + phoneme_input_type = 'pred' + if gt_phoneme_text is not None: + if self.phoneme_tokenizer is None: + raise ValueError("Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided.") + gt_phoneme_text = gt_phoneme_text.strip() + if gt_phoneme_text == "": + raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") + gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) + if len(gt_phoneme_tokens) == 0: + raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") + batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) + batch['phoneme_tokens_lens'] = torch.tensor([len(gt_phoneme_tokens)], dtype=torch.long, device=device) + phoneme_input_type = 'gt' with torch.inference_mode(): output = self.infer_batch( @@ -3738,7 +3753,7 @@ def do_tts( use_cfg=use_cfg, cfg_scale=cfg_scale, use_local_transformer_for_inference=use_local_transformer, - phoneme_input_type='pred', + phoneme_input_type=phoneme_input_type, phoneme_sampling_method='argmax', use_teacher_forced=False, use_inference_mode=True, From 6501ada7c5d2f7eeb59b050f92578ae551dbed00 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Fri, 27 Feb 2026 11:53:08 -0800 Subject: [PATCH 69/94] bug fix Signed-off-by: Shehzeen Hussain --- 
nemo/collections/tts/models/easy_magpietts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index f7b254ec7977..54305e088663 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -3738,6 +3738,7 @@ def do_tts( if gt_phoneme_text == "": raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) + gt_phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] if len(gt_phoneme_tokens) == 0: raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) From 6d635aa5f2eecb1932874ae1ad287e7714282840 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sun, 1 Mar 2026 19:52:49 -0800 Subject: [PATCH 70/94] ignore phoneme channel for some languages Signed-off-by: Paarth Neekhara --- nemo/collections/tts/data/text_to_speech_dataset.py | 8 +++++++- .../collections/tts/data/text_to_speech_dataset_lhotse.py | 5 +++++ nemo/collections/tts/models/easy_magpietts.py | 2 ++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index f680a8d9eb34..65671b8606ed 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -379,6 +379,7 @@ def __init__( context_duration_max: float = 10.0, text_context_remapping: Dict[str, str] = None, text_context_remapping_prob: float = 0.0, + ignore_phoneme_languages: List[str] = None, ): super().__init__( dataset_meta=dataset_meta, @@ -412,6 +413,7 @@ def __init__( self.context_duration_max = context_duration_max self.text_context_remapping = text_context_remapping 
self.text_context_remapping_prob = text_context_remapping_prob + self.ignore_phoneme_languages = ignore_phoneme_languages or [] def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -430,6 +432,7 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) if data.tokenizer_names is not None: # Pick a random tokenizer from the list of tokenizers tokenizer_name = random.choice(data.tokenizer_names) + language = data.manifest_entry.get('language', 'en') tokens = self.text_tokenizer.encode(text=data.text, tokenizer_name=tokenizer_name) tokens = tokens + [self.eos_id] # Not adding BOS id tokens = torch.tensor(tokens, dtype=torch.int32) @@ -450,6 +453,9 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) f"Text: {data.text}" ) phoneme_text = data.manifest_entry['ipa'] + if language in self.ignore_phoneme_languages: + # Ignore phoneme tokenization for this language. 
+ phoneme_text = "" else: phoneme_text = data.text phoneme_tokens = self.phoneme_tokenizer.encode(phoneme_text) @@ -628,7 +634,7 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) else: example['raw_text'] = data.text - example['language'] = data.manifest_entry.get('language', 'en') + example['language'] = language if "reward" in data.manifest_entry: example["reward"] = data.manifest_entry["reward"] diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index ffd6b5629cc4..cb478c87fe7f 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -151,6 +151,7 @@ def __init__( text_context_remapping: Dict[str, str] = None, text_context_remapping_prob: float = 0.0, phoneme_tokenizer_config: DictConfig = None, + ignore_phoneme_languages: List[str] = None, ): super().__init__() self.sample_rate = sample_rate @@ -175,6 +176,7 @@ def __init__( self.text_context_remapping = text_context_remapping self.text_context_remapping_prob = text_context_remapping_prob self.phoneme_tokenizer_config = phoneme_tokenizer_config + self.ignore_phoneme_languages = ignore_phoneme_languages or [] def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -436,6 +438,9 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) f"Cut ID: {cut.id}, Text: {text_str}" ) phoneme_text = cut.supervisions[0].ipa + if language in self.ignore_phoneme_languages: + # Ignore phoneme tokenization for this language + phoneme_text = "" else: phoneme_text = text_str phoneme_tokens = self.phoneme_tokenizer.encode(phoneme_text) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 54305e088663..e0f2a87da55a 100644 --- 
a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -2477,6 +2477,7 @@ def get_dataset(self, dataset_cfg, dataset_type): pad_context_text_to_max_duration=self.pad_context_text_to_max_duration, context_duration_min=self.cfg.context_duration_min, context_duration_max=self.cfg.context_duration_max, + ignore_phoneme_languages=self.cfg.get("ignore_phoneme_languages", []), ) dataset.load_16khz_audio = False dataset.tokenizer_config = ( @@ -2506,6 +2507,7 @@ def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.D text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, tokenizer_config=self.cfg.text_tokenizers, phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None), + ignore_phoneme_languages=self.cfg.get("ignore_phoneme_languages", []), ) data_loader = get_lhotse_dataloader_from_config( From df4027704b6c6e0505024de3e33ba417885b1aa8 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 2 Mar 2026 18:31:49 -0800 Subject: [PATCH 71/94] PO updates, cross lingual dataset creation Signed-off-by: Shehzeen Hussain --- .../easy_magpietts_preference_optimization.py | 32 +- .../create_crosslingual_context_dataset.py | 939 ++++++++++++++++++ .../magpietts/inspect_crosslingual_dataset.py | 151 +++ 3 files changed, 1109 insertions(+), 13 deletions(-) create mode 100644 scripts/magpietts/create_crosslingual_context_dataset.py create mode 100644 scripts/magpietts/inspect_crosslingual_dataset.py diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 45d9bd542b59..1643474dc5ce 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -145,6 +145,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.max_decoder_steps = self.cfg.get('max_decoder_steps', 220) 
self.aux_phoneme_loss_weight = self.cfg.get('aux_phoneme_loss_weight', 1.0) self.po_groups_per_subbatch = max(int(self.cfg.get('po_groups_per_subbatch', 1)), 1) + self.batch_size_for_chunked_tf = self.cfg.get('batch_size_for_chunked_tf', 4) self._normalize_whisper_transcript = self.cfg.get('normalize_whisper_transcript', True) if reward_asr_model == 'whisper' and self._normalize_whisper_transcript: @@ -1049,8 +1050,14 @@ def _run_teacher_forced_chunked_po( n_generations_per_item: int, do_backward: bool, ): - num_groups = len(batch_repeated['raw_texts']) // n_generations_per_item - groups_per_subbatch = max(self.po_groups_per_subbatch, 1) + total_items = len(batch_repeated['raw_texts']) + if self.batch_size_for_chunked_tf is not None: + chunk_size = self.batch_size_for_chunked_tf + else: + # Backward compatibility: preserve previous effective item-chunk size + # when the new explicit batch-size chunking config is not set. + chunk_size = max(self.po_groups_per_subbatch, 1) * max(n_generations_per_item, 1) + chunk_size = max(int(chunk_size), 1) accumulated_loss = torch.tensor(0.0, device=self.device) accumulated_po_loss = torch.tensor(0.0, device=self.device) @@ -1059,10 +1066,9 @@ def _run_teacher_forced_chunked_po( accumulated_entropy = torch.tensor(0.0, device=self.device) used_gt_phoneme_input = 0.0 - for group_start_idx, group_end_idx in self._iter_group_ranges(num_groups, groups_per_subbatch): - item_start_idx = group_start_idx * n_generations_per_item - item_end_idx = group_end_idx * n_generations_per_item - group_weight = float(group_end_idx - group_start_idx) / max(float(num_groups), 1.0) + for item_start_idx in range(0, total_items, chunk_size): + item_end_idx = min(item_start_idx + chunk_size, total_items) + chunk_weight = float(item_end_idx - item_start_idx) / max(float(total_items), 1.0) batch_sub = self._slice_batch_range(batch_repeated, item_start_idx, item_end_idx) predicted_codes_sub = predicted_codes[item_start_idx:item_end_idx] @@ -1102,15 +1108,15 
@@ def _run_teacher_forced_chunked_po( ) if do_backward: - self.manual_backward(chunk_outputs['loss'] * group_weight) + self.manual_backward(chunk_outputs['loss'] * chunk_weight) - accumulated_loss = accumulated_loss + chunk_outputs['loss'].detach() * group_weight - accumulated_po_loss = accumulated_po_loss + chunk_outputs['po_loss'].detach() * group_weight + accumulated_loss = accumulated_loss + chunk_outputs['loss'].detach() * chunk_weight + accumulated_po_loss = accumulated_po_loss + chunk_outputs['po_loss'].detach() * chunk_weight accumulated_phoneme_aux_loss = ( - accumulated_phoneme_aux_loss + chunk_outputs['phoneme_aux_loss'].detach() * group_weight + accumulated_phoneme_aux_loss + chunk_outputs['phoneme_aux_loss'].detach() * chunk_weight ) - accumulated_kl_loss = accumulated_kl_loss + chunk_outputs['kl_loss'].detach() * group_weight - accumulated_entropy = accumulated_entropy + chunk_outputs['entropy'].detach() * group_weight + accumulated_kl_loss = accumulated_kl_loss + chunk_outputs['kl_loss'].detach() * chunk_weight + accumulated_entropy = accumulated_entropy + chunk_outputs['entropy'].detach() * chunk_weight used_gt_phoneme_input = max(used_gt_phoneme_input, chunk_outputs['used_gt_phoneme_input']) return { @@ -1151,7 +1157,7 @@ def training_step(self, batch, batch_idx): teacher_forced_time_sec = time.perf_counter() - teacher_forced_start_time # Clip gradients to prevent catastrophic updates from outlier batches. 
- max_grad_norm = self.cfg.get('max_grad_norm', 1.0) + max_grad_norm = self.cfg.get('max_grad_norm', 0.0) if max_grad_norm > 0: torch.nn.utils.clip_grad_norm_( [p for p in self.parameters() if p.requires_grad and p.grad is not None], diff --git a/scripts/magpietts/create_crosslingual_context_dataset.py b/scripts/magpietts/create_crosslingual_context_dataset.py new file mode 100644 index 000000000000..2b488eb4097c --- /dev/null +++ b/scripts/magpietts/create_crosslingual_context_dataset.py @@ -0,0 +1,939 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Creates a cross-lingual context dataset for TTS training. + +For each target utterance in language A, finds the closest speaker voice from a +different language B (using TitaNet speaker embeddings) and pairs the target with +context audio from that cross-lingual speaker. + +The script operates in three stages: + Stage 1: Build a per-speaker TitaNet embedding index across all languages. + Stage 2: Compute cross-lingual speaker matches and sample a language-balanced subset. + Stage 3: Extract audio to disk and write a NeMo-format JSONL manifest. + +After running this script, use create_lhotse_shar_from_nemo_manifest.py to convert +the output manifest into lhotse shar format, then optionally run +extend_lhotse_shards_with_audio_codes.py to add codec codes. 
+ +Example usage: + python scripts/magpietts/create_crosslingual_context_dataset.py \ + --master-yaml /data/magpie_pretraining_data/manifests/ipa_manifests/train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml \ + --output-dir /data/crosslingual_context_dataset \ + --target-hours 50.0 \ + --samples-per-speaker 5 \ + --seed 42 \ + --log-level INFO +""" + +import argparse +import glob as glob_module +import gzip +import json +import logging +import os +import pickle +import random +import re +from collections import defaultdict +from typing import Any, Dict, List, Tuple + +import numpy as np +import soundfile as sf +import torch +import yaml +from lhotse import CutSet +from tqdm import tqdm + +TITANET_MODEL_NAME = "nvidia/speakerverification_en_titanet_large" +TITANET_SAMPLE_RATE = 16000 + + +# --------------------------------------------------------------------------- +# YAML / shar helpers +# --------------------------------------------------------------------------- + +def parse_master_yaml(yaml_path: str) -> Dict[str, List[Dict]]: + """ + Parse the master multilingual YAML and each per-language YAML it references. + Returns {language: [list of shar_entry dicts with context_audio]}. 
+ """ + yaml_base_dir = os.path.dirname(yaml_path) + with open(yaml_path, 'r') as f: + master_entries = yaml.safe_load(f) + + lang_to_shar_entries: Dict[str, List[Dict]] = defaultdict(list) + for entry in master_entries: + lang = entry.get("tags", {}).get("lang") + child_yaml_path = entry.get("input_cfg") + if not lang or not child_yaml_path: + continue + if not os.path.isabs(child_yaml_path): + child_yaml_path = os.path.join(yaml_base_dir, child_yaml_path) + if not os.path.isfile(child_yaml_path): + logging.warning(f"Per-language YAML not found: {child_yaml_path}") + continue + with open(child_yaml_path, 'r') as f: + child_entries = yaml.safe_load(f) + for ce in child_entries: + shar_path = ce.get("shar_path", {}) + if "context_audio" not in shar_path: + logging.debug(f"Skipping text-context-only entry (no context_audio): {shar_path.get('cuts', 'unknown')}") + continue + lang_to_shar_entries[lang].append(ce) + + return dict(lang_to_shar_entries) + + +def expand_shar_range(pattern: str) -> List[str]: + """ + Expand a shar path pattern like '.../cuts.{000000..001231}.jsonl.gz' + into a list of concrete file paths. + """ + match = re.search(r'\{(\d+)\.\.(\d+)\}', pattern) + if not match: + return [pattern] + start_idx = int(match.group(1)) + end_idx = int(match.group(2)) + width = len(match.group(1)) + prefix = pattern[:match.start()] + suffix = pattern[match.end():] + return [f"{prefix}{i:0{width}d}{suffix}" for i in range(start_idx, end_idx + 1)] + + +def parse_speaker_field(speaker_str: str) -> Tuple[str, str, str]: + """Extract (language, dataset, speaker_id) from '| Language:XX Dataset:YYY Speaker:ZZZ |'.""" + lang_m = re.search(r"Language:(\w+)", speaker_str) + dataset_m = re.search(r"Dataset:([\w\d\W]+?) Speaker:", speaker_str) + spk_m = re.search(r"Speaker:([\w\d\W]+?) 
\|", speaker_str) + lang = lang_m.group(1) if lang_m else "unknown" + dataset = dataset_m.group(1).strip() if dataset_m else "unknown" + speaker_id = spk_m.group(1).strip() if spk_m else "unknown" + return lang, dataset, speaker_id + + +# --------------------------------------------------------------------------- +# Stage 1: Build speaker embedding index +# --------------------------------------------------------------------------- + +def discover_speakers_from_cuts( + lang_to_shar_entries: Dict[str, List[Dict]], + max_cuts_per_speaker: int, + max_shards_per_dataset: int = 0, +) -> Dict[str, Dict]: + """ + Pass 1 (metadata only): Read cut JSONL files to discover unique speakers + and collect up to max_cuts_per_speaker cut metadata entries per speaker. + + Args: + max_shards_per_dataset: If > 0, only scan this many .jsonl.gz shard + files per shar group (dataset) instead of all shards. This + dramatically speeds up discovery for large datasets while still + finding most speakers. + + Returns: {speaker_str: {"language": str, "cut_metas": [list of (shar_entry, shard_idx, cut_json_dict)]}} + """ + speaker_info: Dict[str, Dict] = {} + + for lang, shar_entries in lang_to_shar_entries.items(): + logging.info(f"[Stage 1] Discovering speakers for language: {lang} ({len(shar_entries)} shar groups)") + for se in shar_entries: + cuts_pattern = se["shar_path"]["cuts"] + cuts_files = expand_shar_range(cuts_pattern) + if max_shards_per_dataset > 0 and len(cuts_files) > max_shards_per_dataset: + logging.info( + f" Limiting scan to {max_shards_per_dataset}/{len(cuts_files)} " + f"shards for dataset: {cuts_pattern}" + ) + cuts_files = cuts_files[:max_shards_per_dataset] + for cuts_file in cuts_files: + if not os.path.isfile(cuts_file): + continue + shard_idx_match = re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_file) + shard_idx = int(shard_idx_match.group(1)) if shard_idx_match else 0 + try: + with gzip.open(cuts_file, 'rt', encoding='utf-8') as f: + for line in f: + cut_json = 
json.loads(line) + supervisions = cut_json.get("supervisions", []) + if not supervisions: + continue + speaker_str = supervisions[0].get("speaker", "") + if not speaker_str: + continue + if speaker_str not in speaker_info: + speaker_info[speaker_str] = { + "language": lang, + "cut_metas": [], + } + if len(speaker_info[speaker_str]["cut_metas"]) < max_cuts_per_speaker: + speaker_info[speaker_str]["cut_metas"].append((se, shard_idx, cut_json)) + except Exception as e: + logging.warning(f"Error reading {cuts_file}: {e}") + + logging.info(f"[Stage 1] Discovered {len(speaker_info)} unique speakers across {len(lang_to_shar_entries)} languages") + for lang in sorted(lang_to_shar_entries.keys()): + n = sum(1 for v in speaker_info.values() if v["language"] == lang) + logging.info(f" {lang}: {n} speakers") + return speaker_info + + +def compute_speaker_embeddings( + speaker_info: Dict[str, Dict], + sv_model: torch.nn.Module, + device: torch.device, + batch_size: int = 16, +) -> Dict[str, Dict]: + """ + Pass 2: For each speaker, load audio from shar tars for the sampled cuts, + compute TitaNet embeddings, and average them into a single representative vector. 
+ + Returns: {speaker_str: {"language": str, "embedding": np.ndarray}} + """ + speaker_embeddings: Dict[str, Dict] = {} + + speakers_needing_audio = {} + for spk, info in speaker_info.items(): + cut_metas = info["cut_metas"] + if not cut_metas: + continue + grouped_by_shar_and_shard: Dict[str, Dict[int, List]] = defaultdict(lambda: defaultdict(list)) + for (se, shard_idx, cut_json) in cut_metas: + shar_key = json.dumps(se["shar_path"], sort_keys=True) + grouped_by_shar_and_shard[shar_key][shard_idx].append((se, cut_json)) + speakers_needing_audio[spk] = { + "language": info["language"], + "grouped": grouped_by_shar_and_shard, + } + + # Collect audio in batches: load from shar, accumulate waveforms per speaker + speaker_audio_tensors: Dict[str, List[torch.Tensor]] = defaultdict(list) + + logging.info(f"[Stage 1] Loading audio for {len(speakers_needing_audio)} speakers to compute embeddings...") + + # Group all (shar_entry, shard_idx) that we need to load + shar_shard_to_speakers: Dict[Tuple[str, int], List[Tuple[str, str]]] = defaultdict(list) + for spk, data in speakers_needing_audio.items(): + for shar_key, shard_map in data["grouped"].items(): + for shard_idx, items in shard_map.items(): + for (se, cut_json) in items: + cut_id = cut_json.get("id", "") + shar_shard_to_speakers[(shar_key, shard_idx)].append((spk, cut_id)) + + # Process shard by shard to minimize tar file openings + total_shards = len(shar_shard_to_speakers) + for (shar_key, shard_idx), spk_cut_pairs in tqdm( + shar_shard_to_speakers.items(), desc="[Stage 1] Loading audio shards", total=total_shards + ): + se_shar_path = json.loads(shar_key) + cuts_files = expand_shar_range(se_shar_path["cuts"]) + target_audio_files = expand_shar_range(se_shar_path.get("target_audio", "")) + + if shard_idx >= len(cuts_files) or shard_idx >= len(target_audio_files): + logging.warning(f"Shard index {shard_idx} out of range, skipping") + continue + + cut_file = cuts_files[shard_idx] + target_tar = 
target_audio_files[shard_idx] + + if not os.path.isfile(cut_file) or not os.path.isfile(target_tar): + logging.warning(f"Missing shard files: cuts={cut_file}, target={target_tar}") + continue + + needed_cut_ids = {cut_id for (_, cut_id) in spk_cut_pairs} + cut_id_to_spk = {cut_id: spk for (spk, cut_id) in spk_cut_pairs} + + try: + fields = { + "cuts": [cut_file], + "recording": [target_tar], + } + # Also include context_recording if available, to avoid errors + context_audio_files = expand_shar_range(se_shar_path.get("context_audio", "")) + if shard_idx < len(context_audio_files) and os.path.isfile(context_audio_files[shard_idx]): + fields["context_recording"] = [context_audio_files[shard_idx]] + + shard_cutset = CutSet.from_shar(fields=fields) + for cut in shard_cutset: + if cut.id in needed_cut_ids: + spk = cut_id_to_spk[cut.id] + audio_np = cut.recording.resample(TITANET_SAMPLE_RATE).load_audio().squeeze(0) + audio_tensor = torch.from_numpy(audio_np).float() + speaker_audio_tensors[spk].append(audio_tensor) + needed_cut_ids.discard(cut.id) + if not needed_cut_ids: + break + except Exception as e: + logging.warning(f"Error loading shard {cut_file}: {e}") + + # Now compute embeddings in batches + logging.info(f"[Stage 1] Computing TitaNet embeddings for {len(speaker_audio_tensors)} speakers...") + all_speakers = list(speaker_audio_tensors.keys()) + + for batch_start in tqdm(range(0, len(all_speakers), batch_size), desc="[Stage 1] TitaNet batches"): + batch_speakers = all_speakers[batch_start : batch_start + batch_size] + audio_list = [] + audio_lens = [] + spk_indices = [] # maps each audio in batch back to speaker + + for spk in batch_speakers: + for audio_t in speaker_audio_tensors[spk]: + audio_list.append(audio_t.to(device)) + audio_lens.append(audio_t.size(0)) + spk_indices.append(spk) + + if not audio_list: + continue + + batch_lens = torch.tensor(audio_lens, device=device).long() + max_len = int(batch_lens.max().item()) + padded = 
torch.zeros(len(audio_list), max_len, device=device, dtype=torch.float32) + for i, t in enumerate(audio_list): + padded[i, : t.size(0)] = t + + with torch.inference_mode(): + _, embeddings = sv_model.forward(input_signal=padded, input_signal_length=batch_lens) + + embeddings_np = embeddings.cpu().float().numpy() + + # Average embeddings per speaker + spk_emb_accum: Dict[str, List[np.ndarray]] = defaultdict(list) + for i, spk in enumerate(spk_indices): + spk_emb_accum[spk].append(embeddings_np[i]) + + for spk in batch_speakers: + if spk in spk_emb_accum and spk_emb_accum[spk]: + avg_emb = np.mean(spk_emb_accum[spk], axis=0) + avg_emb = avg_emb / (np.linalg.norm(avg_emb) + 1e-8) + speaker_embeddings[spk] = { + "language": speakers_needing_audio[spk]["language"], + "embedding": avg_emb, + } + + logging.info(f"[Stage 1] Computed embeddings for {len(speaker_embeddings)} speakers") + return speaker_embeddings + + +def run_stage1( + lang_to_shar_entries: Dict[str, List[Dict]], + samples_per_speaker: int, + device: torch.device, + index_path: str, + batch_size: int = 16, + max_shards_per_dataset: int = 0, +) -> Dict[str, Dict]: + """Run full Stage 1: discover speakers, load audio, compute embeddings, save index.""" + if os.path.isfile(index_path): + logging.info(f"[Stage 1] Loading cached speaker index from {index_path}") + with open(index_path, 'rb') as f: + return pickle.load(f) + + from nemo.collections.asr.models import EncDecSpeakerLabelModel + + logging.info(f"[Stage 1] Loading TitaNet model: {TITANET_MODEL_NAME}") + sv_model = EncDecSpeakerLabelModel.from_pretrained(TITANET_MODEL_NAME) + sv_model = sv_model.to(device) + sv_model.eval() + + speaker_info = discover_speakers_from_cuts( + lang_to_shar_entries, + max_cuts_per_speaker=samples_per_speaker, + max_shards_per_dataset=max_shards_per_dataset, + ) + speaker_embeddings = compute_speaker_embeddings(speaker_info, sv_model, device, batch_size=batch_size) + + os.makedirs(os.path.dirname(index_path), exist_ok=True) + 
with open(index_path, 'wb') as f: + pickle.dump(speaker_embeddings, f) + logging.info(f"[Stage 1] Saved speaker index to {index_path}") + + del sv_model + torch.cuda.empty_cache() + return speaker_embeddings + + +# --------------------------------------------------------------------------- +# Stage 2: Cross-lingual speaker matching + language-balanced sampling +# --------------------------------------------------------------------------- + +def build_crosslingual_map(speaker_embeddings: Dict[str, Dict]) -> Dict[str, Tuple[str, float]]: + """ + For each speaker S in language L, find the closest speaker S' from a different + language by cosine similarity of their TitaNet embeddings. + + Returns: {speaker_str: (best_match_speaker_str, cosine_similarity)} + """ + speakers = list(speaker_embeddings.keys()) + n = len(speakers) + logging.info(f"[Stage 2] Building cross-lingual map for {n} speakers...") + + # Build embedding matrix + emb_matrix = np.stack([speaker_embeddings[s]["embedding"] for s in speakers]) + langs = [speaker_embeddings[s]["language"] for s in speakers] + + # Cosine similarity matrix (embeddings are already L2-normalized) + sim_matrix = emb_matrix @ emb_matrix.T + + cross_lingual_map: Dict[str, Tuple[str, float]] = {} + for i in range(n): + best_j = -1 + best_sim = -2.0 + for j in range(n): + if langs[j] == langs[i]: + continue + if sim_matrix[i, j] > best_sim: + best_sim = sim_matrix[i, j] + best_j = j + if best_j >= 0: + cross_lingual_map[speakers[i]] = (speakers[best_j], float(best_sim)) + else: + logging.warning(f"No cross-lingual match found for speaker: {speakers[i]}") + + logging.info(f"[Stage 2] Built cross-lingual map with {len(cross_lingual_map)} entries") + avg_sim = np.mean([v[1] for v in cross_lingual_map.values()]) if cross_lingual_map else 0 + logging.info(f"[Stage 2] Average cross-lingual similarity: {avg_sim:.4f}") + return cross_lingual_map + + +def sample_balanced_cuts( + lang_to_shar_entries: Dict[str, List[Dict]], + 
def sample_balanced_cuts(
    lang_to_shar_entries: Dict[str, List[Dict]],
    cross_lingual_map: Dict[str, Tuple[str, float]],
    target_hours: float,
    seed: int,
    max_shards_per_dataset: int = 0,
) -> Tuple[Dict[str, List[Dict]], Dict[str, List[Dict]]]:
    """
    Sample cuts across languages so each language contributes approximately
    target_hours / num_languages hours of target audio.

    Args:
        max_shards_per_dataset: If > 0, only read this many shard files per
            dataset. Since only a few hours per language are needed, reading a
            small fraction of shards is sufficient and avoids scanning tens of
            thousands of files for large datasets.

    Returns:
        target_cuts_by_lang: {lang: [list of cut_json dicts with extra metadata]}
        context_pool_by_speaker: {speaker_str: [list of (shar_entry, shard_idx, cut_json)]}
    """
    rng = random.Random(seed)
    num_langs = len(lang_to_shar_entries)
    hours_per_lang = target_hours / num_langs
    secs_per_lang = hours_per_lang * 3600
    # Over-collect by 3x so the shuffle below has enough material to pick from.
    collect_secs_per_lang = secs_per_lang * 3

    logging.info(f"[Stage 2] Sampling ~{hours_per_lang:.2f}h per language ({num_langs} languages, {target_hours}h total)")

    # Speakers that appear on the *match* side of any cross-lingual pair;
    # their cuts feed the context pool.
    matched_speakers = set(v[0] for v in cross_lingual_map.values())

    target_cuts_by_lang: Dict[str, List[Dict]] = {}
    context_pool_by_speaker: Dict[str, List] = defaultdict(list)

    for lang, shar_entries in lang_to_shar_entries.items():
        logging.info(f"[Stage 2] Reading cuts for language: {lang}")
        candidates: List[Dict] = []
        collected_secs = 0.0
        budget_reached = False

        for shar_entry in shar_entries:
            if budget_reached:
                break
            shard_files = expand_shar_range(shar_entry["shar_path"]["cuts"])
            if max_shards_per_dataset > 0 and len(shard_files) > max_shards_per_dataset:
                shard_files = shard_files[:max_shards_per_dataset]
                logging.info(
                    f"  Limiting to {max_shards_per_dataset} shards for dataset: "
                    f"{shar_entry['shar_path']['cuts']}"
                )
            for shard_file in shard_files:
                if budget_reached:
                    break
                if not os.path.isfile(shard_file):
                    continue
                idx_match = re.search(r"cuts\.(\d+)\.jsonl\.gz$", shard_file)
                shard_idx = int(idx_match.group(1)) if idx_match else 0
                try:
                    with gzip.open(shard_file, 'rt', encoding='utf-8') as fh:
                        for raw_line in fh:
                            cut_json = json.loads(raw_line)
                            speaker_str = cut_json.get("supervisions", [{}])[0].get("speaker", "")
                            if not speaker_str:
                                continue
                            if speaker_str in matched_speakers:
                                context_pool_by_speaker[speaker_str].append((shar_entry, shard_idx, cut_json))
                            if speaker_str in cross_lingual_map:
                                # Stash provenance on the cut so Stage 3 can
                                # find its shard again later.
                                cut_json["_shar_entry"] = shar_entry
                                cut_json["_shard_idx"] = shard_idx
                                cut_json["_speaker_str"] = speaker_str
                                candidates.append(cut_json)
                                collected_secs += cut_json.get("duration", 0)
                                if collected_secs >= collect_secs_per_lang:
                                    budget_reached = True
                                    break
                except Exception as e:
                    logging.warning(f"Error reading {shard_file}: {e}")

        logging.info(f"  {lang}: {len(candidates)} candidate target cuts ({collected_secs / 3600:.2f}h collected)")

        # Shuffle, then keep taking cuts until the per-language duration
        # budget is met.
        rng.shuffle(candidates)
        picked: List[Dict] = []
        picked_secs = 0.0
        for cut_json in candidates:
            dur = cut_json.get("duration", 0)
            if dur <= 0:
                continue
            picked.append(cut_json)
            picked_secs += dur
            if picked_secs >= secs_per_lang:
                break

        target_cuts_by_lang[lang] = picked
        logging.info(f"  {lang}: sampled {len(picked)} cuts, {picked_secs / 3600:.2f}h")

    total_sampled = sum(len(v) for v in target_cuts_by_lang.values())
    total_hours = sum(sum(c.get("duration", 0) for c in v) for v in target_cuts_by_lang.values()) / 3600
    logging.info(f"[Stage 2] Total sampled: {total_sampled} cuts, {total_hours:.2f}h")
    return target_cuts_by_lang, dict(context_pool_by_speaker)
def run_stage3(
    target_cuts_by_lang: Dict[str, List[Dict]],
    context_pool_by_speaker: Dict[str, List],
    cross_lingual_map: Dict[str, Tuple[str, float]],
    speaker_embeddings: Dict[str, Dict],
    output_dir: str,
    sample_rate: int,
    seed: int,
):
    """
    For each sampled target cut, pick a context utterance from the matched
    cross-lingual speaker, extract both audios to disk, and write the manifest.

    Args:
        target_cuts_by_lang: Output of sample_balanced_cuts (cuts carry
            ``_shar_entry`` / ``_shard_idx`` / ``_speaker_str`` metadata).
        context_pool_by_speaker: Candidate context cuts per matched speaker.
        cross_lingual_map: {speaker: (best cross-lingual speaker, similarity)}.
        speaker_embeddings: Speaker index from Stage 1 (unused here; kept for
            interface stability).
        output_dir: Base directory for extracted audio and the manifest.
        sample_rate: Sample rate for saved wav files.
        seed: Seed for the context-cut choice RNG.

    Returns:
        Path to the written NeMo-style JSONL manifest.
    """
    rng = random.Random(seed)
    audio_dir = os.path.join(output_dir, "extracted_audio")
    target_audio_dir = os.path.join(audio_dir, "target")
    context_audio_dir = os.path.join(audio_dir, "context")
    os.makedirs(target_audio_dir, exist_ok=True)
    os.makedirs(context_audio_dir, exist_ok=True)

    manifest_path = os.path.join(output_dir, "manifest.json")

    # Assign a context cut (from the matched cross-lingual speaker) to each
    # target cut.
    assignments: List[Dict] = []
    for lang, cuts in target_cuts_by_lang.items():
        for cut_json in cuts:
            spk = cut_json["_speaker_str"]
            matched_spk, ssim = cross_lingual_map[spk]
            ctx_pool = context_pool_by_speaker.get(matched_spk, [])
            if not ctx_pool:
                logging.warning(f"No context pool for matched speaker {matched_spk}, skipping cut {cut_json.get('id', '')}")
                continue
            ctx_se, ctx_shard_idx, ctx_cut_json = rng.choice(ctx_pool)
            assignments.append({
                "target_cut_json": cut_json,
                "target_shar_entry": cut_json["_shar_entry"],
                "target_shard_idx": cut_json["_shard_idx"],
                "target_speaker": spk,
                "context_cut_json": ctx_cut_json,
                "context_shar_entry": ctx_se,
                "context_shard_idx": ctx_shard_idx,
                "context_speaker": matched_spk,
                "ssim": ssim,
                "lang": lang,
            })

    logging.info(f"[Stage 3] Total assignments: {len(assignments)}")

    # Group assignment indices by (shar_key, shard_idx) so each shard tar is
    # opened only once per audio kind.
    target_loads: Dict[Tuple[str, int], List[int]] = defaultdict(list)
    context_loads: Dict[Tuple[str, int], List[int]] = defaultdict(list)

    for idx, a in enumerate(assignments):
        t_shar_key = json.dumps(a["target_shar_entry"]["shar_path"], sort_keys=True)
        target_loads[(t_shar_key, a["target_shard_idx"])].append(idx)
        c_shar_key = json.dumps(a["context_shar_entry"]["shar_path"], sort_keys=True)
        context_loads[(c_shar_key, a["context_shard_idx"])].append(idx)

    # Arrays to hold extracted audio file paths (None => extraction failed)
    target_audio_paths = [None] * len(assignments)
    context_audio_paths = [None] * len(assignments)

    def _save_audio_from_shard(
        shard_loads: Dict[Tuple[str, int], List[int]],
        assignments_list: List[Dict],
        cut_json_key: str,
        out_subdir: str,
        out_paths_array: List,
        audio_field: str,
    ):
        """Load cuts from shar tars and save individual audio files to disk."""
        total_shards = len(shard_loads)
        for (shar_key_str, shard_idx), indices in tqdm(
            shard_loads.items(), desc=f"[Stage 3] Extracting {audio_field}", total=total_shards
        ):
            se_shar_path = json.loads(shar_key_str)
            cuts_files = expand_shar_range(se_shar_path["cuts"])
            target_audio_files = expand_shar_range(se_shar_path.get("target_audio", ""))

            if shard_idx >= len(cuts_files) or shard_idx >= len(target_audio_files):
                logging.warning(f"Shard {shard_idx} out of range, skipping")
                continue

            cut_file = cuts_files[shard_idx]
            tar_file = target_audio_files[shard_idx]

            if not os.path.isfile(cut_file) or not os.path.isfile(tar_file):
                logging.warning(f"Missing files: {cut_file} or {tar_file}")
                continue

            # BUGFIX: multiple assignments can reference the SAME cut id
            # (rng.choice may hand one context cut to several targets). The
            # previous cid -> single-index mapping kept only the last
            # assignment, leaving the others without an audio path so they
            # were silently dropped from the manifest. Map each cut id to
            # ALL assignment indices that need it.
            needed_cut_ids: Dict[str, List[int]] = defaultdict(list)
            for i in indices:
                cj = assignments_list[i][cut_json_key]
                needed_cut_ids[cj.get("id", "")].append(i)

            try:
                fields = {"cuts": [cut_file], "recording": [tar_file]}
                ctx_audio_files = expand_shar_range(se_shar_path.get("context_audio", ""))
                if ctx_audio_files and shard_idx < len(ctx_audio_files) and os.path.isfile(ctx_audio_files[shard_idx]):
                    fields["context_recording"] = [ctx_audio_files[shard_idx]]

                shard_cutset = CutSet.from_shar(fields=fields)
                for cut in shard_cutset:
                    if cut.id in needed_cut_ids:
                        audio_np = cut.recording.resample(sample_rate).load_audio().squeeze(0)
                        safe_id = cut.id.replace("/", "_")
                        out_file = os.path.join(out_subdir, f"{safe_id}.wav")
                        sf.write(out_file, audio_np, sample_rate)
                        # Manifest paths are relative to extracted_audio/
                        rel_path = os.path.relpath(out_file, audio_dir)
                        for assign_idx in needed_cut_ids.pop(cut.id):
                            out_paths_array[assign_idx] = rel_path
                        if not needed_cut_ids:
                            break
            except Exception as e:
                logging.warning(f"Error processing shard {cut_file}: {e}")

    # Extract target audio
    logging.info(f"[Stage 3] Extracting target audio from {len(target_loads)} shards...")
    _save_audio_from_shard(
        target_loads, assignments, "target_cut_json",
        target_audio_dir, target_audio_paths, "target_audio",
    )

    # Extract context audio
    logging.info(f"[Stage 3] Extracting context audio from {len(context_loads)} shards...")
    _save_audio_from_shard(
        context_loads, assignments, "context_cut_json",
        context_audio_dir, context_audio_paths, "context_audio",
    )

    # Write manifest
    logging.info(f"[Stage 3] Writing manifest to {manifest_path}")
    written = 0
    skipped = 0
    with open(manifest_path, 'w', encoding='utf-8') as f:
        for idx, a in enumerate(assignments):
            # Skip pairs where either extraction failed.
            if target_audio_paths[idx] is None or context_audio_paths[idx] is None:
                skipped += 1
                continue

            t_cut = a["target_cut_json"]
            c_cut = a["context_cut_json"]
            t_sup = t_cut.get("supervisions", [{}])[0]

            text = t_sup.get("text", "")
            normalized_text = t_sup.get("custom", {}).get("normalized_text", text)
            ipa = t_sup.get("custom", {}).get("ipa", "")
            speaker = t_sup.get("speaker", "")
            duration = t_cut.get("duration", 0)
            context_duration = c_cut.get("duration", 0)
            ctx_lang_parsed, _, _ = parse_speaker_field(a["context_speaker"])
            target_lang_parsed, _, _ = parse_speaker_field(speaker)

            entry = {
                "audio_filepath": target_audio_paths[idx],
                "text": text,
                "normalized_text": normalized_text,
                "speaker": speaker,
                "language": target_lang_parsed,
                "duration": duration,
                "context_audio_filepath": context_audio_paths[idx],
                "context_audio_duration": context_duration,
                "context_speaker_similarity": round(a["ssim"], 6),
                "context_language": ctx_lang_parsed,
                "context_speaker": a["context_speaker"],
            }
            if ipa:
                entry["ipa"] = ipa

            # Carry over any additional custom fields from the target
            # supervision, except bookkeeping keys that would be stale here.
            _exclude_custom_keys = {
                "target_audio_codes_path", "context_audio_codes_path",
                "context_audio_text", "context_audio_normalized_text",
                "context_audio_offset"
            }
            for k, v in t_sup.get("custom", {}).items():
                if k not in entry and k not in _exclude_custom_keys:
                    entry[k] = v

            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
            written += 1

    logging.info(f"[Stage 3] Manifest written: {written} entries, {skipped} skipped")
    return manifest_path
+ """ + cuts_dir = os.path.join(lhotse_shar_dir, "cuts") + target_audio_dir = os.path.join(lhotse_shar_dir, "target_audio") + context_audio_dir = os.path.join(lhotse_shar_dir, "context_audio") + + cuts_files = sorted(glob_module.glob(os.path.join(cuts_dir, "cuts.*.jsonl.gz"))) + context_files = sorted(glob_module.glob(os.path.join(context_audio_dir, "recording.*.tar"))) + + if not cuts_files: + logging.error(f"No cut files found in {cuts_dir}") + return + + # Determine shard range + first_idx = int(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[0]).group(1)) + last_idx = int(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[-1]).group(1)) + width = len(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[0]).group(1)) + + def _make_range_pattern(directory: str, prefix: str, ext: str) -> str: + path = os.path.join(directory, f"{prefix}.{{{first_idx:0{width}d}..{last_idx:0{width}d}}}.{ext}") + return path + + shar_path = { + "cuts": _make_range_pattern(cuts_dir, "cuts", "jsonl.gz"), + "target_audio": _make_range_pattern(target_audio_dir, "recording", "tar"), + } + if context_files: + shar_path["context_audio"] = _make_range_pattern(context_audio_dir, "recording", "tar") + + # Check for codec codes + for codec_dir_name in os.listdir(lhotse_shar_dir): + codec_subdir = os.path.join(lhotse_shar_dir, codec_dir_name) + if not os.path.isdir(codec_subdir): + continue + target_codes_dir = os.path.join(codec_subdir, "target_codes") + context_codes_dir = os.path.join(codec_subdir, "context_codes") + if os.path.isdir(target_codes_dir): + tc_files = sorted(glob_module.glob(os.path.join(target_codes_dir, "codes.*.tar"))) + if tc_files: + tc_first = int(re.search(r"codes\.(\d+)\.tar$", tc_files[0]).group(1)) + tc_last = int(re.search(r"codes\.(\d+)\.tar$", tc_files[-1]).group(1)) + tc_width = len(re.search(r"codes\.(\d+)\.tar$", tc_files[0]).group(1)) + shar_path["target_codes"] = os.path.join( + target_codes_dir, f"codes.{{{tc_first:0{tc_width}d}..{tc_last:0{tc_width}d}}}.tar" + ) + 
if os.path.isdir(context_codes_dir): + cc_files = sorted(glob_module.glob(os.path.join(context_codes_dir, "codes.*.tar"))) + if cc_files: + cc_first = int(re.search(r"codes\.(\d+)\.tar$", cc_files[0]).group(1)) + cc_last = int(re.search(r"codes\.(\d+)\.tar$", cc_files[-1]).group(1)) + cc_width = len(re.search(r"codes\.(\d+)\.tar$", cc_files[0]).group(1)) + shar_path["context_codes"] = os.path.join( + context_codes_dir, f"codes.{{{cc_first:0{cc_width}d}..{cc_last:0{cc_width}d}}}.tar" + ) + + yaml_entry = [{ + "type": "lhotse_shar", + "shar_path": shar_path, + "weight": 1.0, + "tags": { + "task": "tts", + "lang": "crosslingual", + "tokenizer_names": ["nemotron_nano_30b"], + }, + }] + + os.makedirs(os.path.dirname(output_yaml_path) or ".", exist_ok=True) + with open(output_yaml_path, 'w') as f: + yaml.dump(yaml_entry, f, default_flow_style=False, sort_keys=False) + logging.info(f"YAML config written to {output_yaml_path}") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Create a cross-lingual context TTS dataset from multilingual lhotse shar data.", + ) + parser.add_argument( + "--master-yaml", required=True, type=str, + help="Path to the master multilingual YAML (e.g. 
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for this script."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a cross-lingual context TTS dataset from multilingual lhotse shar data.",
    )
    parser.add_argument(
        "--master-yaml", required=True, type=str,
        help="Path to the master multilingual YAML (e.g. train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml).",
    )
    parser.add_argument(
        "--output-dir", required=True, type=str,
        help="Base directory for all outputs (extracted audio, manifest, speaker index).",
    )
    parser.add_argument(
        "--target-hours", type=float, default=50.0,
        help="Total hours of target audio to sample (split equally across languages).",
    )
    parser.add_argument(
        "--samples-per-speaker", type=int, default=5,
        help="Number of utterances per speaker to use for computing the average TitaNet embedding.",
    )
    parser.add_argument(
        "--sample-rate", type=int, default=24000,
        help="Sample rate for saving extracted audio files.",
    )
    parser.add_argument(
        "--embedding-batch-size", type=int, default=16,
        help="Batch size for TitaNet embedding computation.",
    )
    parser.add_argument(
        "--max-shards-per-dataset", type=int, default=0,
        help="Max number of .jsonl.gz shard files to scan per dataset during "
        "speaker discovery (Stage 1). 0 means scan all shards. "
        "Setting this to e.g. 10 dramatically speeds up discovery while "
        "still finding most speakers.",
    )
    parser.add_argument(
        "--seed", type=int, default=42,
        help="Random seed for reproducibility.",
    )
    parser.add_argument(
        "--log-level", type=str, default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Logging level.",
    )
    parser.add_argument(
        "--generate-yaml", type=str, default=None,
        help="If provided, skip stages 1-3 and instead generate a YAML config "
        "pointing to the lhotse shar in OUTPUT_DIR/lhotse_shar. "
        "Value is the output YAML file path.",
    )
    return parser


def main():
    """Entry point: run stages 1-3 (or YAML generation) end to end."""
    args = _build_arg_parser().parse_args()

    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
    )

    # Seed every RNG we might touch for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    os.makedirs(args.output_dir, exist_ok=True)

    # --- Generate YAML config mode (post Stage 4) ---
    if args.generate_yaml:
        lhotse_shar_dir = os.path.join(args.output_dir, "lhotse_shar")
        generate_yaml_config(lhotse_shar_dir, args.generate_yaml)
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info(f"Using device: {device}")

    # --- Parse master YAML ---
    logging.info(f"Parsing master YAML: {args.master_yaml}")
    lang_to_shar_entries = parse_master_yaml(args.master_yaml)
    if not lang_to_shar_entries:
        logging.error("No shar entries found. Check the master YAML path and contents.")
        return

    for lang, entries in sorted(lang_to_shar_entries.items()):
        logging.info(f"  Language '{lang}': {len(entries)} shar groups (with context_audio)")

    # --- Stage 1: Build speaker embedding index ---
    index_path = os.path.join(args.output_dir, "speaker_embedding_index.pkl")
    speaker_embeddings = run_stage1(
        lang_to_shar_entries,
        samples_per_speaker=args.samples_per_speaker,
        device=device,
        index_path=index_path,
        batch_size=args.embedding_batch_size,
        max_shards_per_dataset=args.max_shards_per_dataset,
    )

    # --- Stage 2: Cross-lingual matching + balanced sampling ---
    cross_lingual_map = build_crosslingual_map(speaker_embeddings)
    target_cuts_by_lang, context_pool_by_speaker = sample_balanced_cuts(
        lang_to_shar_entries, cross_lingual_map,
        target_hours=args.target_hours, seed=args.seed,
        max_shards_per_dataset=args.max_shards_per_dataset,
    )

    # --- Stage 3: Extract audio + write manifest ---
    manifest_path = run_stage3(
        target_cuts_by_lang, context_pool_by_speaker, cross_lingual_map,
        speaker_embeddings, args.output_dir, args.sample_rate, args.seed,
    )

    # --- Summary ---
    logging.info("=" * 60)
    logging.info("Cross-lingual context dataset creation complete!")
    logging.info(f"  Manifest: {manifest_path}")
    logging.info(f"  Audio dir: {os.path.join(args.output_dir, 'extracted_audio')}")
    logging.info("")
    logging.info("Next steps:")
    logging.info("  1. Convert to lhotse shar format:")
    logging.info(f"     python scripts/magpietts/create_lhotse_shar_from_nemo_manifest.py \\")
    logging.info(f"       --manifest-path {manifest_path} \\")
    logging.info(f"       --audio-base-dir {os.path.join(args.output_dir, 'extracted_audio')} \\")
    logging.info(f"       --output-dir {os.path.join(args.output_dir, 'lhotse_shar')} \\")
    logging.info(f"       --num-jobs 16 --processing-chunk-size 256 --audio-format flac --shuffle --shuffle-seed 42")
    logging.info("")
    logging.info("  2. (Optional) Add codec codes:")
    logging.info(f"     python scripts/magpietts/extend_lhotse_shards_with_audio_codes.py \\")
    logging.info(f"       --cuts-dir {os.path.join(args.output_dir, 'lhotse_shar', 'cuts')} \\")
    logging.info(f"       --target-audio-dir {os.path.join(args.output_dir, 'lhotse_shar', 'target_audio')} \\")
    logging.info(f"       --context-audio-dir {os.path.join(args.output_dir, 'lhotse_shar', 'context_audio')} \\")
    logging.info(f"       --output-dir {os.path.join(args.output_dir, 'lhotse_shar')} \\")
    logging.info(f"       --codec-model-path ")
    logging.info("")
    yaml_out = os.path.join(args.output_dir, "crosslingual_context.yaml")
    logging.info("  3. Generate YAML config for training:")
    logging.info(f"     python scripts/magpietts/create_crosslingual_context_dataset.py \\")
    logging.info(f"       --master-yaml {args.master_yaml} \\")
    logging.info(f"       --output-dir {args.output_dir} \\")
    logging.info(f"       --generate-yaml {yaml_out}")
    logging.info("=" * 60)
def main():
    """Inspect a cross-lingual shar dataset: for each sample, save the original
    target/context recordings plus the waveforms re-decoded from the stored
    codec codes, along with a small metadata file, for manual A/B listening.
    """
    parser = argparse.ArgumentParser(description="Inspect cross-lingual dataset: decode codes and save audio.")
    parser.add_argument("--shar-dir", required=True, help="Path to lhotse_shar directory.")
    parser.add_argument("--codec-model-path", required=True, help="Path to .nemo codec model.")
    parser.add_argument("--codec-name", default="25fpsSpectralCodecBWE", help="Codec subdirectory name.")
    parser.add_argument("--output-dir", required=True, help="Directory to save inspection outputs.")
    parser.add_argument("--num-samples", type=int, default=10, help="Number of samples to inspect.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    os.makedirs(args.output_dir, exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load codec model
    logging.info(f"Loading codec model from {args.codec_model_path}")
    codec_model = AudioCodecModel.restore_from(args.codec_model_path, map_location="cpu", strict=False)
    codec_model = codec_model.to(device)
    codec_model.eval()
    codec_sr = codec_model.sample_rate
    logging.info(f"Codec output sample rate: {codec_sr}")

    # Build shar fields for first shard
    cuts_dir = os.path.join(args.shar_dir, "cuts")
    target_audio_dir = os.path.join(args.shar_dir, "target_audio")
    context_audio_dir = os.path.join(args.shar_dir, "context_audio")
    target_codes_dir = os.path.join(args.shar_dir, args.codec_name, "target_codes")
    context_codes_dir = os.path.join(args.shar_dir, args.codec_name, "context_codes")

    # Use first shard only
    fields = {
        "cuts": [os.path.join(cuts_dir, "cuts.000000.jsonl.gz")],
        "recording": [os.path.join(target_audio_dir, "recording.000000.tar")],
        "context_recording": [os.path.join(context_audio_dir, "recording.000000.tar")],
        "target_codes": [os.path.join(target_codes_dir, "codes.000000.tar")],
        "context_codes": [os.path.join(context_codes_dir, "codes.000000.tar")],
    }

    for k, v in fields.items():
        if not os.path.isfile(v[0]):
            logging.error(f"Missing file for '{k}': {v[0]}")
            return

    logging.info("Loading CutSet from shar...")
    cutset = CutSet.from_shar(fields=fields)

    count = 0
    for cut in cutset:
        if count >= args.num_samples:
            break

        sup = cut.supervisions[0] if cut.supervisions else None
        lang = sup.language if sup else "unk"
        speaker = sup.speaker if sup else "unk"
        # BUGFIX: SupervisionSegment.custom defaults to None, so
        # hasattr(sup, "custom") is True while sup.custom.get(...) raised
        # AttributeError. Guard the None value, not just the attribute.
        sup_custom = (sup.custom or {}) if sup and hasattr(sup, "custom") else {}
        ctx_lang = sup_custom.get("context_language", "unk")
        ssim = sup_custom.get("context_speaker_similarity", "N/A")

        sample_dir = os.path.join(args.output_dir, f"sample_{count:03d}_{lang}")
        os.makedirs(sample_dir, exist_ok=True)

        logging.info(f"--- Sample {count} ---")
        logging.info(f"  Cut ID: {cut.id}")
        logging.info(f"  Target lang: {lang}, Context lang: {ctx_lang}, SSIM: {ssim}")
        logging.info(f"  Speaker: {speaker}")
        if sup:
            logging.info(f"  Text: {sup.text[:80]}...")

        # 1. Save original target recording audio
        target_audio_np = cut.recording.resample(codec_sr).load_audio().squeeze(0)
        sf.write(os.path.join(sample_dir, "target_recording.wav"), target_audio_np, codec_sr)
        logging.info(f"  Saved target_recording.wav ({len(target_audio_np)/codec_sr:.2f}s)")

        # 2. Save original context recording audio
        if cut.has_custom("context_recording"):
            ctx_audio_np = cut.context_recording.resample(codec_sr).load_audio().squeeze(0)
            sf.write(os.path.join(sample_dir, "context_recording.wav"), ctx_audio_np, codec_sr)
            logging.info(f"  Saved context_recording.wav ({len(ctx_audio_np)/codec_sr:.2f}s)")

        # 3. Decode target codes -> audio
        if cut.has_custom("target_codes"):
            target_codes_np = cut.target_codes.load().astype(np.int32)  # (C, T)
            target_codes_t = torch.from_numpy(target_codes_np).unsqueeze(0).to(device)  # (1, C, T)
            target_codes_len = torch.tensor([target_codes_t.shape[2]], device=device)
            with torch.inference_mode():
                decoded_target, decoded_target_len = codec_model.decode(
                    tokens=target_codes_t, tokens_len=target_codes_len
                )
            decoded_target_np = decoded_target[0, :decoded_target_len[0]].cpu().float().numpy()
            sf.write(os.path.join(sample_dir, "target_decoded_from_codes.wav"), decoded_target_np, codec_model.output_sample_rate)
            logging.info(f"  Saved target_decoded_from_codes.wav ({len(decoded_target_np)/codec_model.output_sample_rate:.2f}s), codes shape: {target_codes_np.shape}")
        else:
            logging.warning(f"  No target_codes found for cut {cut.id}")

        # 4. Decode context codes -> audio
        if cut.has_custom("context_codes"):
            ctx_codes_np = cut.context_codes.load().astype(np.int32)  # (C, T)
            ctx_codes_t = torch.from_numpy(ctx_codes_np).unsqueeze(0).to(device)  # (1, C, T)
            ctx_codes_len = torch.tensor([ctx_codes_t.shape[2]], device=device)
            with torch.inference_mode():
                decoded_ctx, decoded_ctx_len = codec_model.decode(
                    tokens=ctx_codes_t, tokens_len=ctx_codes_len
                )
            decoded_ctx_np = decoded_ctx[0, :decoded_ctx_len[0]].cpu().float().numpy()
            sf.write(os.path.join(sample_dir, "context_decoded_from_codes.wav"), decoded_ctx_np, codec_model.output_sample_rate)
            logging.info(f"  Saved context_decoded_from_codes.wav ({len(decoded_ctx_np)/codec_model.output_sample_rate:.2f}s), codes shape: {ctx_codes_np.shape}")
        else:
            logging.warning(f"  No context_codes found for cut {cut.id}")

        # 5. Write metadata
        with open(os.path.join(sample_dir, "info.txt"), "w") as f:
            f.write(f"cut_id: {cut.id}\n")
            f.write(f"target_language: {lang}\n")
            f.write(f"context_language: {ctx_lang}\n")
            f.write(f"speaker: {speaker}\n")
            f.write(f"context_speaker_similarity: {ssim}\n")
            f.write(f"text: {sup.text if sup else ''}\n")
            f.write(f"duration: {cut.duration}\n")

        count += 1

    logging.info(f"Done. Saved {count} samples to {args.output_dir}")


if __name__ == "__main__":
    main()
Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/data/text_to_speech_dataset.py | 8 +++++++- .../tts/data/text_to_speech_dataset_lhotse.py | 9 ++++++++- nemo/collections/tts/models/easy_magpietts.py | 3 +++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index 65671b8606ed..4d99c463d18b 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -380,6 +380,7 @@ def __init__( text_context_remapping: Dict[str, str] = None, text_context_remapping_prob: float = 0.0, ignore_phoneme_languages: List[str] = None, + add_language_to_context_text: bool = False, ): super().__init__( dataset_meta=dataset_meta, @@ -414,6 +415,7 @@ def __init__( self.text_context_remapping = text_context_remapping self.text_context_remapping_prob = text_context_remapping_prob self.ignore_phoneme_languages = ignore_phoneme_languages or [] + self.add_language_to_context_text = add_language_to_context_text def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -602,7 +604,11 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) context_tokens = self.text_tokenizer.encode(context_text, self.text_conditioning_tokenizer_name) example['has_text_context'] = True else: - context_tokens = self.text_tokenizer.encode("[NO TEXT CONTEXT]", self.text_conditioning_tokenizer_name) + if self.add_language_to_context_text: + context_text = f"[{language.upper()}]" + else: + context_text = "[NO TEXT CONTEXT]" + context_tokens = self.text_tokenizer.encode(context_text, self.text_conditioning_tokenizer_name) example['has_text_context'] = False if self.pad_context_text_to_max_duration: _required_len = ( diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py 
b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index cb478c87fe7f..356cc8ca4d15 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -152,6 +152,7 @@ def __init__( text_context_remapping_prob: float = 0.0, phoneme_tokenizer_config: DictConfig = None, ignore_phoneme_languages: List[str] = None, + add_language_to_context_text: bool = False, ): super().__init__() self.sample_rate = sample_rate @@ -177,6 +178,7 @@ def __init__( self.text_context_remapping_prob = text_context_remapping_prob self.phoneme_tokenizer_config = phoneme_tokenizer_config self.ignore_phoneme_languages = ignore_phoneme_languages or [] + self.add_language_to_context_text = add_language_to_context_text def get_num_audio_samples_to_slice(self, duration, sample_rate): num_codec_frames = int(duration * sample_rate / self.codec_model_samples_per_frame) @@ -388,8 +390,13 @@ def _sample_context_duration_with_available_limit(available_duration_sec: float) ) has_text_context = True else: + if self.add_language_to_context_text: + context_text = f"[{language.upper()}]" + else: + context_text = "[NO TEXT CONTEXT]" + context_text_tokens = self.text_tokenizer.encode( - "[NO TEXT CONTEXT]", tokenizer_name=self.text_conditioning_tokenizer_name + context_text, tokenizer_name=self.text_conditioning_tokenizer_name ) has_text_context = False if self.pad_context_text_to_max_duration: diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index e0f2a87da55a..e69e877ed94c 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -402,6 +402,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) self.pad_context_text_to_max_duration = False + self.add_language_to_context_text = 
cfg.get('add_language_to_context_text', False) super().__init__(cfg=cfg, trainer=trainer) @@ -2475,6 +2476,7 @@ def get_dataset(self, dataset_cfg, dataset_type): use_text_conditioning_tokenizer=True, text_conditioning_tokenizer_name=self.text_conditioning_tokenizer_name, pad_context_text_to_max_duration=self.pad_context_text_to_max_duration, + add_language_to_context_text=self.add_language_to_context_text, context_duration_min=self.cfg.context_duration_min, context_duration_max=self.cfg.context_duration_max, ignore_phoneme_languages=self.cfg.get("ignore_phoneme_languages", []), @@ -2508,6 +2510,7 @@ def get_lhotse_dataloader(self, dataset_cfg, mode='train') -> torch.utils.data.D tokenizer_config=self.cfg.text_tokenizers, phoneme_tokenizer_config=self.cfg.get("phoneme_tokenizer", None), ignore_phoneme_languages=self.cfg.get("ignore_phoneme_languages", []), + add_language_to_context_text=self.add_language_to_context_text, ) data_loader = get_lhotse_dataloader_from_config( From 19ff0eae460568bb7b840db141bbef3867270e95 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Mon, 9 Mar 2026 16:32:58 +0000 Subject: [PATCH 73/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- examples/tts/magpietts_inference.py | 6 +- nemo/collections/tts/models/easy_magpietts.py | 88 ++++++--- .../easy_magpietts_preference_optimization.py | 110 ++++++----- .../modules/magpietts_inference/inference.py | 2 + nemo/collections/tts/modules/utmosv2.py | 2 +- nemo/collections/tts/parts/utils/helpers.py | 9 +- .../create_crosslingual_context_dataset.py | 174 ++++++++++++------ .../magpietts/inspect_crosslingual_dataset.py | 28 ++- .../tts/test_infer_vs_process_batch.py | 20 +- 9 files changed, 285 insertions(+), 154 deletions(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index 19085f78eb96..d38c093eb1de 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -507,7 +507,11 @@ def 
create_argument_parser() -> argparse.ArgumentParser: target_group.add_argument('--cer_target', type=float, default=None) target_group.add_argument('--ssim_target', type=float, default=None) target_group.add_argument('--is_decoder_only_model', action='store_true') - target_group.add_argument('--legacy_context_stacking', action='store_true', help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking') + target_group.add_argument( + '--legacy_context_stacking', + action='store_true', + help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking', + ) target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) target_group.add_argument( '--phoneme_sampling_method', type=str, default='argmax', choices=['argmax', 'multinomial'] diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index e69e877ed94c..e8bb877dfb53 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -752,7 +752,7 @@ def codes_to_audio(self, codes, codes_len): codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) # Updates all lens less than 4 to 4 codes_len = torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) - codes = codes[:,:,:codes_len.max()] + codes = codes[:, :, : codes_len.max()] audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) # audio: (B, T) @@ -1291,9 +1291,13 @@ def prepare_context_tensors( ) # Use legacy audio_bos_id/audio_eos_id if flag is set - stack_bos_id = self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id - stack_eos_id = self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id - + stack_bos_id = ( + self.audio_bos_id if getattr(self, 'legacy_context_stacking', 
False) else self.context_audio_bos_id + ) + stack_eos_id = ( + self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id + ) + context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, @@ -1532,7 +1536,9 @@ def corrupt_stacked_phoneme_tokens( source_index = torch.arange(min_len, device=phoneme_tokens_stacked.device, dtype=torch.long) step_delta = torch.ones(min_len, device=phoneme_tokens_stacked.device, dtype=torch.long) op_is_repeat = torch.rand(corrupt_steps.numel(), device=phoneme_tokens_stacked.device) < 0.5 - step_delta[corrupt_steps] = torch.where(op_is_repeat, torch.zeros_like(corrupt_steps), torch.full_like(corrupt_steps, 2)) + step_delta[corrupt_steps] = torch.where( + op_is_repeat, torch.zeros_like(corrupt_steps), torch.full_like(corrupt_steps, 2) + ) source_index = torch.cumsum(step_delta, dim=0) - step_delta[0] source_index = torch.clamp(source_index, min=0, max=min_len - 1) source_index[0] = 0 @@ -1849,8 +1855,19 @@ def process_batch( dropout_complete_phoneme_channel = False if self.phoneme_tokenizer is not None and phoneme_tokens is not None: # Corrupt phonemes only when text input is not dropped. 
- apply_phoneme_corruption = mode == 'train' and (not dropout_text_input) and (not dropout_conditional_input) and self.phoneme_corruption_type == 'repeat_skip_unk' - dropout_complete_phoneme_channel = mode == 'train' and ( dropout_conditional_input or (self.phoneme_corruption_type == 'complete_channel' and torch.rand(1).item() < self.phoneme_corruption_batch_prob)) + apply_phoneme_corruption = ( + mode == 'train' + and (not dropout_text_input) + and (not dropout_conditional_input) + and self.phoneme_corruption_type == 'repeat_skip_unk' + ) + dropout_complete_phoneme_channel = mode == 'train' and ( + dropout_conditional_input + or ( + self.phoneme_corruption_type == 'complete_channel' + and torch.rand(1).item() < self.phoneme_corruption_batch_prob + ) + ) ( phoneme_channel_embedding, phoneme_channel_lens, @@ -1999,7 +2016,9 @@ def process_batch( pb_phoneme_tokens_target = phoneme_tokens_stacked_clean[:, :, 1:].long() pb_phoneme_tokens_lens_target = phoneme_tokens_lens_stacked - 1 - if (phoneme_corruption_mode != 'repeat_skip') and not (dropout_complete_phoneme_channel or dropout_conditional_input or dropout_text_input): + if (phoneme_corruption_mode != 'repeat_skip') and not ( + dropout_complete_phoneme_channel or dropout_conditional_input or dropout_text_input + ): phoneme_loss, _ = self.compute_phoneme_loss( pb_phoneme_logits, pb_phoneme_tokens_target, pb_phoneme_tokens_lens_target ) @@ -2197,7 +2216,7 @@ def validation_step(self, batch, batch_idx): topk=80, use_local_transformer_for_inference=self.local_transformer_type == LocalTransformerType.AR, use_cfg=self.cfg.get('inference_use_cfg_in_val', True), - cfg_scale=2.5 + cfg_scale=2.5, ) # Get audio output directory @@ -2248,11 +2267,7 @@ def validation_step(self, batch, batch_idx): # Save context audio for SSIM computation ctx_audio_np = ( - context_audio_cleaned[idx] - .float() - .detach() - .cpu() - .numpy()[: context_audio_lens_cleaned[idx]] + context_audio_cleaned[idx].float().detach().cpu().numpy()[: 
context_audio_lens_cleaned[idx]] ) ctx_path = os.path.join(audio_dir, f'rank{self.global_rank}_batch{batch_idx}_idx{idx}_context.wav') sf.write(ctx_path, ctx_audio_np, self.output_sample_rate) @@ -2278,7 +2293,9 @@ def validation_step(self, batch, batch_idx): ) pred_transcripts = [process_text_for_cer(transcript) for transcript in transcripts] except Exception as e: - logging.warning(f"Val batched ASR transcription failed, falling back to per-file mode: {e}") + logging.warning( + f"Val batched ASR transcription failed, falling back to per-file mode: {e}" + ) pred_transcripts = [] for item_idx, audio_path in enumerate(predicted_audio_paths): lang = languages[item_idx] if item_idx < len(languages) else 'en' @@ -2346,7 +2363,9 @@ def validation_step(self, batch, batch_idx): if pred_embeddings is not None and ctx_embeddings is not None: pred_emb = pred_embeddings[idx].cpu().float().numpy() ctx_emb = ctx_embeddings[idx].cpu().float().numpy() - ssim = float(np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb))) + ssim = float( + np.dot(pred_emb, ctx_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ctx_emb)) + ) batch_ssim.append(ssim) # UTMOSv2 naturalness score (MOS on 1-5 scale) @@ -2932,7 +2951,9 @@ def streaming_step( # ==================== DETERMINE PHASES PER BATCH ITEM ==================== needs_context = state.context_position < state.full_context_lens # (B,) bool needs_text = (~needs_context) & (~state.text_finished) - needs_phoneme = (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + needs_phoneme = ( + (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + ) needs_audio = (~needs_context) & (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) next_input = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) @@ -2974,7 +2995,7 @@ def streaming_step( # The EOS token itself IS embedded normally 
(matching process_batch behavior # where EOS is part of the text sequence). After this step, text_finished is set # so subsequent steps won't add any text embedding. - is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool + is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool text_add_mask = needs_text.view(batch_size, 1, 1).float() next_input = next_input + text_embedded * text_add_mask state.text_finished = state.text_finished | is_eos_token @@ -3023,12 +3044,11 @@ def streaming_step( ) # (B, 1, E) last_mask = has_last_phoneme.view(batch_size, 1, 1).float() phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask - + # Only end phoneme stream in prediction mode when the phoneme EOS is detected state.phoneme_stream_ended = state.phoneme_stream_ended | state.phoneme_eos_detected next_input = next_input + phoneme_emb - # --- Audio embedding for audio phase items --- if needs_audio.any(): @@ -3041,7 +3061,9 @@ def streaming_step( positions = state.audio_steps.clamp(max=state.gt_audio_embeddings.size(1) - 1) gt_emb = state.gt_audio_embeddings[ torch.arange(batch_size, device=device), positions, : - ].unsqueeze(1) # (B, 1, E) + ].unsqueeze( + 1 + ) # (B, 1, E) audio_mask = (needs_audio & within_gt_len).view(batch_size, 1, 1).float() audio_emb = audio_emb + gt_emb * audio_mask else: @@ -3145,7 +3167,7 @@ def streaming_step( ).any( dim=1 ) # (B,) - + state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected # Track phoneme prediction end index for items that just ended @@ -3203,7 +3225,7 @@ def streaming_step( torch.full((batch_size,), S, device=device), # no EOS in this step ) # (B,) - audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio + audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio state.finished = state.finished | audio_eos_detected # Track audio prediction end index (in frames) for items that just ended @@ -3238,7 +3260,9 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> 
torch.Tensor: # Get phoneme logits all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] - phoneme_logits = all_code_logits_t_phoneme.view(actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size) + phoneme_logits = all_code_logits_t_phoneme.view( + actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size + ) max_probs = torch.softmax(phoneme_logits, dim=-1).max(dim=-1).values # (B, phoneme_stacking_factor) # Sample phonemes @@ -3256,7 +3280,9 @@ def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: and hasattr(self.phoneme_tokenizer, 'unk_token_id') and self.phoneme_confidence_unk_threshold > 0.0 ): - underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any(dim=1, keepdim=True) # (B, 1) + underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any( + dim=1, keepdim=True + ) # (B, 1) eos_predicted_step = (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1, keepdim=True) replace_with_unk = underconfident_step & (~eos_predicted_step) if replace_with_unk.any(): @@ -3622,7 +3648,9 @@ def infer_batch( rtf_metrics=rtf_metrics, predicted_phoneme_tokens=ib_phoneme_tokens, predicted_phoneme_tokens_lens=ib_phoneme_tokens_lens, - phoneme_prediction_start_idx=state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None, + phoneme_prediction_start_idx=( + state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None + ), ) @staticmethod @@ -3738,12 +3766,16 @@ def do_tts( phoneme_input_type = 'pred' if gt_phoneme_text is not None: if self.phoneme_tokenizer is None: - raise ValueError("Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided.") + raise ValueError( + "Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided." 
+ ) gt_phoneme_text = gt_phoneme_text.strip() if gt_phoneme_text == "": raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) - gt_phoneme_tokens = [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + gt_phoneme_tokens = ( + [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + ) if len(gt_phoneme_tokens) == 0: raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 1643474dc5ce..020e7af77aa5 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -75,8 +75,8 @@ class EasyMagpieTTSModelOnlinePO(EasyMagpieTTSModel): def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): super().__init__(cfg, trainer) - - self.run_val_inference = True # Always run validation inference in PO. + + self.run_val_inference = True # Always run validation inference in PO. self.automatic_optimization = False ref_model_cfg = copy.deepcopy(cfg) @@ -138,9 +138,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.loss_type = self.cfg.get('loss_type', 'grpo') if self.loss_type not in ['grpo', 'dr_grpo']: - raise ValueError( - f"Received loss_type={self.loss_type}. Supported values: ['grpo', 'dr_grpo']." - ) + raise ValueError(f"Received loss_type={self.loss_type}. 
Supported values: ['grpo', 'dr_grpo'].") self.scale_rewards = self.cfg.get('scale_rewards', True) self.max_decoder_steps = self.cfg.get('max_decoder_steps', 220) self.aux_phoneme_loss_weight = self.cfg.get('aux_phoneme_loss_weight', 1.0) @@ -159,17 +157,20 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.best_cer_threshold = self.cfg.get('best_cer_threshold', 1.0) self.worst_cer_threshold = self.cfg.get('worst_cer_threshold', 1.0) - - if self.trainer is not None and str(self.trainer.precision) in ("32", "32-true"): self.decoder.float() def _get_trainable_module_groups(self) -> Dict[str, List[torch.nn.Parameter]]: """Return a dict mapping module-group name → list of trainable parameters.""" modules_to_exclude = { - '_speaker_verification_model', '_codec_model', '_eval_asr_model', - '_eval_speaker_verification_model', '_reference_model', - 'whisper_model', 'whisper_processor', 'squim_objective_model', + '_speaker_verification_model', + '_codec_model', + '_eval_asr_model', + '_eval_speaker_verification_model', + '_reference_model', + 'whisper_model', + 'whisper_processor', + 'squim_objective_model', '_utmos_calculator', } groups: Dict[str, List[torch.nn.Parameter]] = {} @@ -195,21 +196,21 @@ def _compute_grad_and_weight_metrics(self) -> Dict[str, float]: if p.grad is not None: grad_norms.append(p.grad.data.norm(2).item()) - module_weight_norm = float(np.sqrt(sum(w ** 2 for w in weight_norms))) + module_weight_norm = float(np.sqrt(sum(w**2 for w in weight_norms))) metrics[f'weight_norm/{group_name}'] = module_weight_norm all_weight_norms.extend(weight_norms) if grad_norms: - module_grad_norm = float(np.sqrt(sum(g ** 2 for g in grad_norms))) + module_grad_norm = float(np.sqrt(sum(g**2 for g in grad_norms))) metrics[f'grad_norm/{group_name}'] = module_grad_norm all_grad_norms.extend(grad_norms) else: metrics[f'grad_norm/{group_name}'] = 0.0 if all_grad_norms: - metrics['grad_norm/global'] = float(np.sqrt(sum(g ** 2 for g in all_grad_norms))) + 
metrics['grad_norm/global'] = float(np.sqrt(sum(g**2 for g in all_grad_norms))) if all_weight_norms: - metrics['weight_norm/global'] = float(np.sqrt(sum(w ** 2 for w in all_weight_norms))) + metrics['weight_norm/global'] = float(np.sqrt(sum(w**2 for w in all_weight_norms))) return metrics @torch.no_grad() @@ -225,10 +226,10 @@ def _compute_weight_update_metrics(self, prev_weights: Dict[int, torch.Tensor]) if pid in prev_weights: deltas.append((p.data - prev_weights[pid]).norm(2).item()) if deltas: - metrics[f'weight_delta/{group_name}'] = float(np.sqrt(sum(d ** 2 for d in deltas))) + metrics[f'weight_delta/{group_name}'] = float(np.sqrt(sum(d**2 for d in deltas))) all_deltas.extend(deltas) if all_deltas: - metrics['weight_delta/global'] = float(np.sqrt(sum(d ** 2 for d in all_deltas))) + metrics['weight_delta/global'] = float(np.sqrt(sum(d**2 for d in all_deltas))) return metrics @torch.no_grad() @@ -245,14 +246,15 @@ def _print_grad_weight_summary(self, metrics: Dict[str, float], step: int) -> No if not getattr(self.trainer, "is_global_zero", True): return - lines = [f"\n[grad/weight] step={step} " - f"grad={metrics.get('grad_norm/global', 0.0):.6f} " - f"w={metrics.get('weight_norm/global', 0.0):.4f} " - f"Δw={metrics.get('weight_delta/global', 0.0):.8f}"] + lines = [ + f"\n[grad/weight] step={step} " + f"grad={metrics.get('grad_norm/global', 0.0):.6f} " + f"w={metrics.get('weight_norm/global', 0.0):.4f} " + f"Δw={metrics.get('weight_delta/global', 0.0):.8f}" + ] module_names = sorted( - k.split('/')[1] for k in metrics - if k.startswith('weight_norm/') and k != 'weight_norm/global' + k.split('/')[1] for k in metrics if k.startswith('weight_norm/') and k != 'weight_norm/global' ) for name in module_names: gn = metrics.get(f'grad_norm/{name}', 0.0) @@ -317,7 +319,9 @@ def _get_cached_normalizer(self, lang_key: Optional[str]): self._normalizer_cache[lang_key] = None return self._normalizer_cache[lang_key] - def _get_per_token_logps(self, logits: torch.Tensor, 
labels: torch.Tensor, loss_mask: torch.Tensor) -> torch.Tensor: + def _get_per_token_logps( + self, logits: torch.Tensor, labels: torch.Tensor, loss_mask: torch.Tensor + ) -> torch.Tensor: # Force fp32 for log_softmax to avoid bf16 precision issues that sever the # gradient path through the GRPO "exp(logps - logps.detach())" trick. # Under bf16 autocast, the tiny gradient signal through this identity-like @@ -328,7 +332,6 @@ def _get_per_token_logps(self, logits: torch.Tensor, labels: torch.Tensor, loss_ per_token_logps = per_token_logps * loss_mask.float() return per_token_logps - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): """ Override parent to force fp32 computation for the entire local transformer logits path. @@ -428,7 +431,7 @@ def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]: context_codes[short_context_mask] = target_codes[short_context_mask] context_lens[short_context_mask] = target_lens[short_context_mask] # Slice to the actual max length needed - context_codes = context_codes[..., :context_lens.max()] + context_codes = context_codes[..., : context_lens.max()] if self._codec_converter is not None: context_codes = self._codec_converter.convert_original_to_new( @@ -523,19 +526,25 @@ def _print_group_cer_wer_table( ] ) - table = self._format_text_table(headers=["item", "cer", "wer", "ssim", "utmos", "reward", "advantage"], rows=rows) + table = self._format_text_table( + headers=["item", "cer", "wer", "ssim", "utmos", "reward", "advantage"], rows=rows + ) print( f"[generate_and_reward] group={group_idx} valid={is_group_valid} " f"mean_reward={mean_reward:.4f} std_reward={std_reward:.4f}\n" f"prompt: {prompt_text}\n{table}\n" ) - def _compute_pred_transcripts(self, predicted_audio_paths: List[str], batch_repeated: Dict, reward_asr_model: str) -> List[str]: + def _compute_pred_transcripts( + self, predicted_audio_paths: List[str], batch_repeated: Dict, reward_asr_model: str + ) -> 
List[str]: if reward_asr_model == 'nemo': pred_transcripts = self._eval_asr_model.transcribe( predicted_audio_paths, batch_size=len(predicted_audio_paths), - override_config=TranscribeConfig(use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0), + override_config=TranscribeConfig( + use_lhotse=False, batch_size=len(predicted_audio_paths), num_workers=0 + ), ) return [process_text_for_cer(transcript.text) for transcript in pred_transcripts] @@ -652,7 +661,9 @@ def generate_and_reward( sample_rate=self.output_sample_rate, ) audio_save_time_sec = time.perf_counter() - save_start_time - audio_durations = [int(predicted_audio_lens[idx].item()) / self.output_sample_rate for idx in range(predicted_audio.size(0))] + audio_durations = [ + int(predicted_audio_lens[idx].item()) / self.output_sample_rate for idx in range(predicted_audio.size(0)) + ] rewarding_start_time = time.perf_counter() pred_transcripts = self._compute_pred_transcripts(predicted_audio_paths, batch_repeated, reward_asr_model) @@ -747,9 +758,7 @@ def generate_and_reward( best_utmos_achievable - mean_utmos_dataset, 1e-8 ) else: - utmos_reward = 0.5 - 0.5 * (mean_utmos_dataset - item_utmos) / max( - mean_utmos_dataset - 1.0, 1e-8 - ) + utmos_reward = 0.5 - 0.5 * (mean_utmos_dataset - item_utmos) / max(mean_utmos_dataset - 1.0, 1e-8) else: utmos_reward = 0.0 @@ -759,7 +768,9 @@ def generate_and_reward( + pesq_reward * pesq_reward_weight + utmos_reward * utmos_reward_weight ) - if (item_metrics['codes_len'] >= max_valid_codes_len) or (item_metrics['codes_len'] <= min_valid_codes_len): + if (item_metrics['codes_len'] >= max_valid_codes_len) or ( + item_metrics['codes_len'] <= min_valid_codes_len + ): item_metrics['_needs_group_min_reward'] = True else: item_metrics['_needs_group_min_reward'] = False @@ -843,10 +854,12 @@ def generate_and_reward( } def process_batch_online_po(self, batch: Dict, n_generations_per_item: int, mode: str = 'train'): - generated_codes_and_metrics, batch_repeated, 
predicted_codes, predicted_codes_lens = self._prepare_online_po_inputs( - batch=batch, - n_generations_per_item=n_generations_per_item, - mode=mode, + generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = ( + self._prepare_online_po_inputs( + batch=batch, + n_generations_per_item=n_generations_per_item, + mode=mode, + ) ) chunked_outputs = self._run_teacher_forced_chunked_po( generated_codes_and_metrics=generated_codes_and_metrics, @@ -970,14 +983,17 @@ def _compute_po_losses_from_outputs( per_token_logps = self._get_per_token_logps(codebook_logits, codebook_labels, audio_loss_mask) # Ensure the GRPO policy gradient trick stays in fp32 to preserve gradient signal with torch.cuda.amp.autocast(enabled=False): - per_token_loss = -(torch.exp(per_token_logps.float() - per_token_logps.float().detach()) * advantages.float().unsqueeze(1)) + per_token_loss = -( + torch.exp(per_token_logps.float() - per_token_logps.float().detach()) + * advantages.float().unsqueeze(1) + ) per_token_loss = per_token_loss * group_validities.float().unsqueeze(1) # Per-token entropy of the policy distribution (always computed for logging). 
with torch.cuda.amp.autocast(enabled=False): logits_fp32 = codebook_logits.float() - log_probs = logits_fp32.log_softmax(-1) # [B, T, V] - probs = log_probs.exp() # [B, T, V] + log_probs = logits_fp32.log_softmax(-1) # [B, T, V] + probs = log_probs.exp() # [B, T, V] per_token_entropy = -(probs * log_probs).sum(-1) # [B, T] codebook_entropy = ( (per_token_entropy * audio_loss_mask).sum(dim=1) / audio_loss_mask.sum(dim=1).clamp_min(1e-8) @@ -991,7 +1007,9 @@ def _compute_po_losses_from_outputs( ) with torch.cuda.amp.autocast(enabled=False): per_token_kl = ( - torch.exp(per_token_ref_logps.float() - per_token_logps.float()) - (per_token_ref_logps.float() - per_token_logps.float()) - 1 + torch.exp(per_token_ref_logps.float() - per_token_logps.float()) + - (per_token_ref_logps.float() - per_token_logps.float()) + - 1 ) per_token_loss = per_token_loss + self.cfg.get('grpo_beta', 0.0) * per_token_kl codebook_kl_loss_mean = ( @@ -1140,10 +1158,12 @@ def training_step(self, batch, batch_idx): # Snapshot weights before optimizer step to measure weight deltas. 
prev_weights = self._snapshot_trainable_weights() - generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = self._prepare_online_po_inputs( - batch=batch, - n_generations_per_item=n_generations_per_item, - mode='train', + generated_codes_and_metrics, batch_repeated, predicted_codes, predicted_codes_lens = ( + self._prepare_online_po_inputs( + batch=batch, + n_generations_per_item=n_generations_per_item, + mode='train', + ) ) teacher_forced_start_time = time.perf_counter() po_outputs = self._run_teacher_forced_chunked_po( diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 70ca811f58a2..cf325b91d71c 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -84,6 +84,7 @@ class InferenceConfig: longform_word_threshold: int = 40 # Word threshold for auto-detection is_decoder_only_model: bool = False + def build_identifier(self) -> str: """Build a unique identifier string for this configuration. @@ -374,6 +375,7 @@ def _run_decoder_only_inference( item_idx += 1 return all_rtf_metrics, generated_audio_paths, codec_file_paths + @staticmethod def _batch_to_cuda(batch: dict) -> dict: """Move batch tensors to CUDA device.""" diff --git a/nemo/collections/tts/modules/utmosv2.py b/nemo/collections/tts/modules/utmosv2.py index 46b17316d0ea..e71d7e5f0316 100644 --- a/nemo/collections/tts/modules/utmosv2.py +++ b/nemo/collections/tts/modules/utmosv2.py @@ -77,7 +77,7 @@ def process_directory( """ if num_workers is None: num_workers = batch_size - + with torch.inference_mode(): # UTMOSV2 tends to launch many of OpenMP threads which overloads the machine's CPUs # while actually slowing down the prediction. Limit the number of threads here. 
diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index cf6dbbdcd494..1bbb88ef3434 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -42,12 +42,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import string import os import shutil +import string import tempfile -from enum import Enum from collections import defaultdict +from enum import Enum from typing import Any, List, Optional, Sequence, Tuple, Union import librosa @@ -62,7 +62,6 @@ from nemo.utils import logging from nemo.utils.decorators import deprecated - try: from lightning.pytorch.utilities import rank_zero_only except ModuleNotFoundError: @@ -896,7 +895,9 @@ def transcribe_with_whisper_from_filepaths( transcripts = [""] * len(audio_filepaths) for lang, indices in grouped_indices.items(): - forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=lang, task="transcribe") if lang else None + forced_decoder_ids = ( + whisper_processor.get_decoder_prompt_ids(language=lang, task="transcribe") if lang else None + ) for start_idx in range(0, len(indices), batch_size): batch_indices = indices[start_idx : start_idx + batch_size] speech_arrays = [librosa.load(audio_filepaths[idx], sr=16000)[0] for idx in batch_indices] diff --git a/scripts/magpietts/create_crosslingual_context_dataset.py b/scripts/magpietts/create_crosslingual_context_dataset.py index 2b488eb4097c..c3bb7008a509 100644 --- a/scripts/magpietts/create_crosslingual_context_dataset.py +++ b/scripts/magpietts/create_crosslingual_context_dataset.py @@ -65,6 +65,7 @@ # YAML / shar helpers # --------------------------------------------------------------------------- + def parse_master_yaml(yaml_path: str) -> Dict[str, List[Dict]]: """ Parse the master multilingual YAML and each per-language YAML it references. 
@@ -90,7 +91,9 @@ def parse_master_yaml(yaml_path: str) -> Dict[str, List[Dict]]: for ce in child_entries: shar_path = ce.get("shar_path", {}) if "context_audio" not in shar_path: - logging.debug(f"Skipping text-context-only entry (no context_audio): {shar_path.get('cuts', 'unknown')}") + logging.debug( + f"Skipping text-context-only entry (no context_audio): {shar_path.get('cuts', 'unknown')}" + ) continue lang_to_shar_entries[lang].append(ce) @@ -108,8 +111,8 @@ def expand_shar_range(pattern: str) -> List[str]: start_idx = int(match.group(1)) end_idx = int(match.group(2)) width = len(match.group(1)) - prefix = pattern[:match.start()] - suffix = pattern[match.end():] + prefix = pattern[: match.start()] + suffix = pattern[match.end() :] return [f"{prefix}{i:0{width}d}{suffix}" for i in range(start_idx, end_idx + 1)] @@ -128,6 +131,7 @@ def parse_speaker_field(speaker_str: str) -> Tuple[str, str, str]: # Stage 1: Build speaker embedding index # --------------------------------------------------------------------------- + def discover_speakers_from_cuts( lang_to_shar_entries: Dict[str, List[Dict]], max_cuts_per_speaker: int, @@ -183,7 +187,9 @@ def discover_speakers_from_cuts( except Exception as e: logging.warning(f"Error reading {cuts_file}: {e}") - logging.info(f"[Stage 1] Discovered {len(speaker_info)} unique speakers across {len(lang_to_shar_entries)} languages") + logging.info( + f"[Stage 1] Discovered {len(speaker_info)} unique speakers across {len(lang_to_shar_entries)} languages" + ) for lang in sorted(lang_to_shar_entries.keys()): n = sum(1 for v in speaker_info.values() if v["language"] == lang) logging.info(f" {lang}: {n} speakers") @@ -210,7 +216,7 @@ def compute_speaker_embeddings( if not cut_metas: continue grouped_by_shar_and_shard: Dict[str, Dict[int, List]] = defaultdict(lambda: defaultdict(list)) - for (se, shard_idx, cut_json) in cut_metas: + for se, shard_idx, cut_json in cut_metas: shar_key = json.dumps(se["shar_path"], sort_keys=True) 
grouped_by_shar_and_shard[shar_key][shard_idx].append((se, cut_json)) speakers_needing_audio[spk] = { @@ -228,7 +234,7 @@ def compute_speaker_embeddings( for spk, data in speakers_needing_audio.items(): for shar_key, shard_map in data["grouped"].items(): for shard_idx, items in shard_map.items(): - for (se, cut_json) in items: + for se, cut_json in items: cut_id = cut_json.get("id", "") shar_shard_to_speakers[(shar_key, shard_idx)].append((spk, cut_id)) @@ -368,6 +374,7 @@ def run_stage1( # Stage 2: Cross-lingual speaker matching + language-balanced sampling # --------------------------------------------------------------------------- + def build_crosslingual_map(speaker_embeddings: Dict[str, Dict]) -> Dict[str, Tuple[str, float]]: """ For each speaker S in language L, find the closest speaker S' from a different @@ -435,7 +442,9 @@ def sample_balanced_cuts( # Collect 3x the target to allow shuffling diversity collect_secs_per_lang = secs_per_lang * 3 - logging.info(f"[Stage 2] Sampling ~{hours_per_lang:.2f}h per language ({num_langs} languages, {target_hours}h total)") + logging.info( + f"[Stage 2] Sampling ~{hours_per_lang:.2f}h per language ({num_langs} languages, {target_hours}h total)" + ) all_matched_speakers = set(v[0] for v in cross_lingual_map.values()) @@ -456,8 +465,7 @@ def sample_balanced_cuts( if max_shards_per_dataset > 0 and len(cuts_files) > max_shards_per_dataset: cuts_files = cuts_files[:max_shards_per_dataset] logging.info( - f" Limiting to {max_shards_per_dataset} shards for dataset: " - f"{se['shar_path']['cuts']}" + f" Limiting to {max_shards_per_dataset} shards for dataset: " f"{se['shar_path']['cuts']}" ) for cuts_file in cuts_files: if lang_done: @@ -514,6 +522,7 @@ def sample_balanced_cuts( # Stage 3: Extract audio + write NeMo manifest # --------------------------------------------------------------------------- + def run_stage3( target_cuts_by_lang: Dict[str, List[Dict]], context_pool_by_speaker: Dict[str, List], @@ -546,21 +555,25 @@ 
def run_stage3( matched_spk, ssim = cross_lingual_map[spk] ctx_pool = context_pool_by_speaker.get(matched_spk, []) if not ctx_pool: - logging.warning(f"No context pool for matched speaker {matched_spk}, skipping cut {cut_json.get('id', '')}") + logging.warning( + f"No context pool for matched speaker {matched_spk}, skipping cut {cut_json.get('id', '')}" + ) continue ctx_se, ctx_shard_idx, ctx_cut_json = rng.choice(ctx_pool) - assignments.append({ - "target_cut_json": cut_json, - "target_shar_entry": cut_json["_shar_entry"], - "target_shard_idx": cut_json["_shard_idx"], - "target_speaker": spk, - "context_cut_json": ctx_cut_json, - "context_shar_entry": ctx_se, - "context_shard_idx": ctx_shard_idx, - "context_speaker": matched_spk, - "ssim": ssim, - "lang": lang, - }) + assignments.append( + { + "target_cut_json": cut_json, + "target_shar_entry": cut_json["_shar_entry"], + "target_shard_idx": cut_json["_shard_idx"], + "target_speaker": spk, + "context_cut_json": ctx_cut_json, + "context_shar_entry": ctx_se, + "context_shard_idx": ctx_shard_idx, + "context_speaker": matched_spk, + "ssim": ssim, + "lang": lang, + } + ) logging.info(f"[Stage 3] Total assignments: {len(assignments)}") @@ -628,7 +641,9 @@ def _save_audio_from_shard( safe_id = cut.id.replace("/", "_") out_file = os.path.join(out_subdir, f"{safe_id}.wav") sf.write(out_file, audio_np, sample_rate) - out_paths_array[assign_idx] = os.path.relpath(out_file, os.path.join(output_dir, "extracted_audio")) + out_paths_array[assign_idx] = os.path.relpath( + out_file, os.path.join(output_dir, "extracted_audio") + ) del needed_cut_ids[cut.id] if not needed_cut_ids: break @@ -638,15 +653,23 @@ def _save_audio_from_shard( # Extract target audio logging.info(f"[Stage 3] Extracting target audio from {len(target_loads)} shards...") _save_audio_from_shard( - target_loads, assignments, "target_cut_json", - target_audio_dir, target_audio_paths, "target_audio", + target_loads, + assignments, + "target_cut_json", + 
target_audio_dir, + target_audio_paths, + "target_audio", ) # Extract context audio logging.info(f"[Stage 3] Extracting context audio from {len(context_loads)} shards...") _save_audio_from_shard( - context_loads, assignments, "context_cut_json", - context_audio_dir, context_audio_paths, "context_audio", + context_loads, + assignments, + "context_cut_json", + context_audio_dir, + context_audio_paths, + "context_audio", ) # Write manifest @@ -691,9 +714,11 @@ def _save_audio_from_shard( # Carry over any additional custom fields from the target supervision _exclude_custom_keys = { - "target_audio_codes_path", "context_audio_codes_path", - "context_audio_text", "context_audio_normalized_text", - "context_audio_offset" + "target_audio_codes_path", + "context_audio_codes_path", + "context_audio_text", + "context_audio_normalized_text", + "context_audio_offset", } for k, v in t_sup.get("custom", {}).items(): if k not in entry and k not in _exclude_custom_keys: @@ -710,6 +735,7 @@ def _save_audio_from_shard( # YAML config generation (post Stage 4) # --------------------------------------------------------------------------- + def generate_yaml_config(lhotse_shar_dir: str, output_yaml_path: str, data_mount_prefix: str = "/data"): """ Generate a lhotse YAML config pointing to the cross-lingual shar dataset. 
@@ -775,16 +801,18 @@ def _make_range_pattern(directory: str, prefix: str, ext: str) -> str: context_codes_dir, f"codes.{{{cc_first:0{cc_width}d}..{cc_last:0{cc_width}d}}}.tar" ) - yaml_entry = [{ - "type": "lhotse_shar", - "shar_path": shar_path, - "weight": 1.0, - "tags": { - "task": "tts", - "lang": "crosslingual", - "tokenizer_names": ["nemotron_nano_30b"], - }, - }] + yaml_entry = [ + { + "type": "lhotse_shar", + "shar_path": shar_path, + "weight": 1.0, + "tags": { + "task": "tts", + "lang": "crosslingual", + "tokenizer_names": ["nemotron_nano_30b"], + }, + } + ] os.makedirs(os.path.dirname(output_yaml_path) or ".", exist_ok=True) with open(output_yaml_path, 'w') as f: @@ -796,56 +824,77 @@ def _make_range_pattern(directory: str, prefix: str, ext: str) -> str: # Main # --------------------------------------------------------------------------- + def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Create a cross-lingual context TTS dataset from multilingual lhotse shar data.", ) parser.add_argument( - "--master-yaml", required=True, type=str, + "--master-yaml", + required=True, + type=str, help="Path to the master multilingual YAML (e.g. 
train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml).", ) parser.add_argument( - "--output-dir", required=True, type=str, + "--output-dir", + required=True, + type=str, help="Base directory for all outputs (extracted audio, manifest, speaker index).", ) parser.add_argument( - "--target-hours", type=float, default=50.0, + "--target-hours", + type=float, + default=50.0, help="Total hours of target audio to sample (split equally across languages).", ) parser.add_argument( - "--samples-per-speaker", type=int, default=5, + "--samples-per-speaker", + type=int, + default=5, help="Number of utterances per speaker to use for computing the average TitaNet embedding.", ) parser.add_argument( - "--sample-rate", type=int, default=24000, + "--sample-rate", + type=int, + default=24000, help="Sample rate for saving extracted audio files.", ) parser.add_argument( - "--embedding-batch-size", type=int, default=16, + "--embedding-batch-size", + type=int, + default=16, help="Batch size for TitaNet embedding computation.", ) parser.add_argument( - "--max-shards-per-dataset", type=int, default=0, + "--max-shards-per-dataset", + type=int, + default=0, help="Max number of .jsonl.gz shard files to scan per dataset during " - "speaker discovery (Stage 1). 0 means scan all shards. " - "Setting this to e.g. 10 dramatically speeds up discovery while " - "still finding most speakers.", + "speaker discovery (Stage 1). 0 means scan all shards. " + "Setting this to e.g. 
10 dramatically speeds up discovery while " + "still finding most speakers.", ) parser.add_argument( - "--seed", type=int, default=42, + "--seed", + type=int, + default=42, help="Random seed for reproducibility.", ) parser.add_argument( - "--log-level", type=str, default="INFO", + "--log-level", + type=str, + default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Logging level.", ) parser.add_argument( - "--generate-yaml", type=str, default=None, + "--generate-yaml", + type=str, + default=None, help="If provided, skip stages 1-3 and instead generate a YAML config " - "pointing to the lhotse shar in OUTPUT_DIR/lhotse_shar. " - "Value is the output YAML file path.", + "pointing to the lhotse shar in OUTPUT_DIR/lhotse_shar. " + "Value is the output YAML file path.", ) args = parser.parse_args() @@ -893,15 +942,22 @@ def main(): # --- Stage 2: Cross-lingual matching + balanced sampling --- cross_lingual_map = build_crosslingual_map(speaker_embeddings) target_cuts_by_lang, context_pool_by_speaker = sample_balanced_cuts( - lang_to_shar_entries, cross_lingual_map, - target_hours=args.target_hours, seed=args.seed, + lang_to_shar_entries, + cross_lingual_map, + target_hours=args.target_hours, + seed=args.seed, max_shards_per_dataset=args.max_shards_per_dataset, ) # --- Stage 3: Extract audio + write manifest --- manifest_path = run_stage3( - target_cuts_by_lang, context_pool_by_speaker, cross_lingual_map, - speaker_embeddings, args.output_dir, args.sample_rate, args.seed, + target_cuts_by_lang, + context_pool_by_speaker, + cross_lingual_map, + speaker_embeddings, + args.output_dir, + args.sample_rate, + args.seed, ) # --- Summary --- diff --git a/scripts/magpietts/inspect_crosslingual_dataset.py b/scripts/magpietts/inspect_crosslingual_dataset.py index 9ce0c648fe39..6fed5c93adf5 100644 --- a/scripts/magpietts/inspect_crosslingual_dataset.py +++ b/scripts/magpietts/inspect_crosslingual_dataset.py @@ -111,9 +111,15 @@ def main(): decoded_target, 
decoded_target_len = codec_model.decode( tokens=target_codes_t, tokens_len=target_codes_len ) - decoded_target_np = decoded_target[0, :decoded_target_len[0]].cpu().float().numpy() - sf.write(os.path.join(sample_dir, "target_decoded_from_codes.wav"), decoded_target_np, codec_model.output_sample_rate) - logging.info(f" Saved target_decoded_from_codes.wav ({len(decoded_target_np)/codec_model.output_sample_rate:.2f}s), codes shape: {target_codes_np.shape}") + decoded_target_np = decoded_target[0, : decoded_target_len[0]].cpu().float().numpy() + sf.write( + os.path.join(sample_dir, "target_decoded_from_codes.wav"), + decoded_target_np, + codec_model.output_sample_rate, + ) + logging.info( + f" Saved target_decoded_from_codes.wav ({len(decoded_target_np)/codec_model.output_sample_rate:.2f}s), codes shape: {target_codes_np.shape}" + ) else: logging.warning(f" No target_codes found for cut {cut.id}") @@ -123,12 +129,16 @@ def main(): ctx_codes_t = torch.from_numpy(ctx_codes_np).unsqueeze(0).to(device) # (1, C, T) ctx_codes_len = torch.tensor([ctx_codes_t.shape[2]], device=device) with torch.inference_mode(): - decoded_ctx, decoded_ctx_len = codec_model.decode( - tokens=ctx_codes_t, tokens_len=ctx_codes_len - ) - decoded_ctx_np = decoded_ctx[0, :decoded_ctx_len[0]].cpu().float().numpy() - sf.write(os.path.join(sample_dir, "context_decoded_from_codes.wav"), decoded_ctx_np, codec_model.output_sample_rate) - logging.info(f" Saved context_decoded_from_codes.wav ({len(decoded_ctx_np)/codec_model.output_sample_rate:.2f}s), codes shape: {ctx_codes_np.shape}") + decoded_ctx, decoded_ctx_len = codec_model.decode(tokens=ctx_codes_t, tokens_len=ctx_codes_len) + decoded_ctx_np = decoded_ctx[0, : decoded_ctx_len[0]].cpu().float().numpy() + sf.write( + os.path.join(sample_dir, "context_decoded_from_codes.wav"), + decoded_ctx_np, + codec_model.output_sample_rate, + ) + logging.info( + f" Saved context_decoded_from_codes.wav ({len(decoded_ctx_np)/codec_model.output_sample_rate:.2f}s), 
codes shape: {ctx_codes_np.shape}" + ) else: logging.warning(f" No context_codes found for cut {cut.id}") diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py index d225136989f1..3741deddf430 100644 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -142,7 +142,7 @@ def create_synthetic_batch( text = torch.zeros(batch_size, max_text_len, dtype=torch.long, device=device) for b in range(batch_size): tl = text_lens_list[b] - text[b, :tl - 1] = torch.randint(0, text_vocab_size, (tl - 1,), device=device) + text[b, : tl - 1] = torch.randint(0, text_vocab_size, (tl - 1,), device=device) text[b, tl - 1] = model.eos_id # EOS as last valid token text_lens = torch.tensor(text_lens_list, dtype=torch.long, device=device) @@ -161,7 +161,9 @@ def create_synthetic_batch( audio_codes_lens = torch.tensor(audio_frames_list, dtype=torch.long, device=device) # Context audio codes (raw, without BOS/EOS) - context_audio_codes = torch.zeros(batch_size, num_codebooks, max_context_audio_frames, dtype=torch.long, device=device) + context_audio_codes = torch.zeros( + batch_size, num_codebooks, max_context_audio_frames, dtype=torch.long, device=device + ) for b in range(batch_size): caf = context_audio_frames_list[b] context_audio_codes[b, :, :caf] = torch.randint(0, codebook_size, (num_codebooks, caf), device=device) @@ -249,8 +251,10 @@ def compare_audio_codes(model, pb_output, ib_output, batch): num_show = min(10, mismatch_positions.size(0)) for i in range(num_show): cb, t = mismatch_positions[i].tolist() - print(f" Mismatch at codebook={cb}, time={t}: " - f"pb={pb_codes_b[cb, t].item()}, ib={ib_codes_b[cb, t].item()}") + print( + f" Mismatch at codebook={cb}, time={t}: " + f"pb={pb_codes_b[cb, t].item()}, ib={ib_codes_b[cb, t].item()}" + ) return all_match @@ -308,7 +312,7 @@ def compare_phoneme_predictions(model, pb_output, ib_output, batch): # 
infer_batch phoneme preds: slice from start_idx for this batch item start = max(0, ib_start_idx[b].item()) - ib_ph_b = ib_phoneme_preds[b, :, start:start + compare_len] + ib_ph_b = ib_phoneme_preds[b, :, start : start + compare_len] matches = (pb_ph_b == ib_ph_b).all() num_matching = (pb_ph_b == ib_ph_b).sum().item() @@ -325,8 +329,10 @@ def compare_phoneme_predictions(model, pb_output, ib_output, batch): num_show = min(10, mismatch_positions.size(0)) for i in range(num_show): sf, t = mismatch_positions[i].tolist() - print(f" Mismatch at stacking_factor={sf}, time={t}: " - f"pb={pb_ph_b[sf, t].item()}, ib={ib_ph_b[sf, t].item()}") + print( + f" Mismatch at stacking_factor={sf}, time={t}: " + f"pb={pb_ph_b[sf, t].item()}, ib={ib_ph_b[sf, t].item()}" + ) return all_match From 1f0f83f38b6d811d059073603560825d91e89175 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 13:04:03 -0400 Subject: [PATCH 74/94] tokenizer import change Signed-off-by: Paarth Neekhara --- .../common/tokenizers/text_to_speech/tts_tokenizers.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 81f875750d64..65b27bc6b62f 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -14,11 +14,13 @@ # limitations under the License. 
import itertools +import os import string from abc import ABC, abstractmethod from contextlib import contextmanager from typing import List, Optional, Union +from tokenizers import Tokenizer from transformers import PreTrainedTokenizerBase from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import ( @@ -1180,10 +1182,6 @@ class IPABPETokenizer: """ def __init__(self, tokenizer_path: str): - import os - - from tokenizers import Tokenizer - if os.path.isdir(tokenizer_path): tokenizer_file = os.path.join(tokenizer_path, "tokenizer.json") else: From a61b60a7277c128b5068e2bcf2922796c63709d4 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 13:12:50 -0400 Subject: [PATCH 75/94] remove unnecessary imports Signed-off-by: Paarth Neekhara --- nemo/collections/tts/modules/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index 0c9a8c182b71..866f418dbacd 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -15,12 +15,6 @@ import nemo.collections.tts.modules.adapters import nemo.collections.tts.modules.ffn_modules import nemo.collections.tts.modules.moe_modules -from nemo.collections.tts.modules.nemotron_h_decoder import ( - HybridMambaAttentionDynamicCache, - NemotronHConfig, - NemotronHForCausalLM, - NemotronHModel, -) from nemo.collections.tts.modules.tacotron2 import Decoder as Taco2Decoder from nemo.collections.tts.modules.tacotron2 import Encoder as Taco2Encoder from nemo.collections.tts.modules.tacotron2 import Postnet as Taco2Postnet From f090106e12de460303ebd7f0d30ad9bf72ae8700 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 13:28:30 -0400 Subject: [PATCH 76/94] cleanup Signed-off-by: Paarth Neekhara --- .../tts/modules/nemotron_h_decoder.py | 4 +- .../create_crosslingual_context_dataset.py | 995 -- .../magpietts/inspect_crosslingual_dataset.py | 161 - 
...okenizer_2048_en_de_es_fr_hi_it_vi_zh.json | 9954 ----------------- 4 files changed, 2 insertions(+), 11112 deletions(-) delete mode 100644 scripts/magpietts/create_crosslingual_context_dataset.py delete mode 100644 scripts/magpietts/inspect_crosslingual_dataset.py delete mode 100644 scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json diff --git a/nemo/collections/tts/modules/nemotron_h_decoder.py b/nemo/collections/tts/modules/nemotron_h_decoder.py index ec30a1e7a699..986359c0e2b3 100644 --- a/nemo/collections/tts/modules/nemotron_h_decoder.py +++ b/nemo/collections/tts/modules/nemotron_h_decoder.py @@ -21,8 +21,8 @@ """ import math -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple, Union +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple, Union import torch import torch.nn.functional as F diff --git a/scripts/magpietts/create_crosslingual_context_dataset.py b/scripts/magpietts/create_crosslingual_context_dataset.py deleted file mode 100644 index c3bb7008a509..000000000000 --- a/scripts/magpietts/create_crosslingual_context_dataset.py +++ /dev/null @@ -1,995 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Creates a cross-lingual context dataset for TTS training. 
- -For each target utterance in language A, finds the closest speaker voice from a -different language B (using TitaNet speaker embeddings) and pairs the target with -context audio from that cross-lingual speaker. - -The script operates in three stages: - Stage 1: Build a per-speaker TitaNet embedding index across all languages. - Stage 2: Compute cross-lingual speaker matches and sample a language-balanced subset. - Stage 3: Extract audio to disk and write a NeMo-format JSONL manifest. - -After running this script, use create_lhotse_shar_from_nemo_manifest.py to convert -the output manifest into lhotse shar format, then optionally run -extend_lhotse_shards_with_audio_codes.py to add codec codes. - -Example usage: - python scripts/magpietts/create_crosslingual_context_dataset.py \ - --master-yaml /data/magpie_pretraining_data/manifests/ipa_manifests/train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml \ - --output-dir /data/crosslingual_context_dataset \ - --target-hours 50.0 \ - --samples-per-speaker 5 \ - --seed 42 \ - --log-level INFO -""" - -import argparse -import glob as glob_module -import gzip -import json -import logging -import os -import pickle -import random -import re -from collections import defaultdict -from typing import Any, Dict, List, Tuple - -import numpy as np -import soundfile as sf -import torch -import yaml -from lhotse import CutSet -from tqdm import tqdm - -TITANET_MODEL_NAME = "nvidia/speakerverification_en_titanet_large" -TITANET_SAMPLE_RATE = 16000 - - -# --------------------------------------------------------------------------- -# YAML / shar helpers -# --------------------------------------------------------------------------- - - -def parse_master_yaml(yaml_path: str) -> Dict[str, List[Dict]]: - """ - Parse the master multilingual YAML and each per-language YAML it references. - Returns {language: [list of shar_entry dicts with context_audio]}. 
- """ - yaml_base_dir = os.path.dirname(yaml_path) - with open(yaml_path, 'r') as f: - master_entries = yaml.safe_load(f) - - lang_to_shar_entries: Dict[str, List[Dict]] = defaultdict(list) - for entry in master_entries: - lang = entry.get("tags", {}).get("lang") - child_yaml_path = entry.get("input_cfg") - if not lang or not child_yaml_path: - continue - if not os.path.isabs(child_yaml_path): - child_yaml_path = os.path.join(yaml_base_dir, child_yaml_path) - if not os.path.isfile(child_yaml_path): - logging.warning(f"Per-language YAML not found: {child_yaml_path}") - continue - with open(child_yaml_path, 'r') as f: - child_entries = yaml.safe_load(f) - for ce in child_entries: - shar_path = ce.get("shar_path", {}) - if "context_audio" not in shar_path: - logging.debug( - f"Skipping text-context-only entry (no context_audio): {shar_path.get('cuts', 'unknown')}" - ) - continue - lang_to_shar_entries[lang].append(ce) - - return dict(lang_to_shar_entries) - - -def expand_shar_range(pattern: str) -> List[str]: - """ - Expand a shar path pattern like '.../cuts.{000000..001231}.jsonl.gz' - into a list of concrete file paths. - """ - match = re.search(r'\{(\d+)\.\.(\d+)\}', pattern) - if not match: - return [pattern] - start_idx = int(match.group(1)) - end_idx = int(match.group(2)) - width = len(match.group(1)) - prefix = pattern[: match.start()] - suffix = pattern[match.end() :] - return [f"{prefix}{i:0{width}d}{suffix}" for i in range(start_idx, end_idx + 1)] - - -def parse_speaker_field(speaker_str: str) -> Tuple[str, str, str]: - """Extract (language, dataset, speaker_id) from '| Language:XX Dataset:YYY Speaker:ZZZ |'.""" - lang_m = re.search(r"Language:(\w+)", speaker_str) - dataset_m = re.search(r"Dataset:([\w\d\W]+?) Speaker:", speaker_str) - spk_m = re.search(r"Speaker:([\w\d\W]+?) 
\|", speaker_str) - lang = lang_m.group(1) if lang_m else "unknown" - dataset = dataset_m.group(1).strip() if dataset_m else "unknown" - speaker_id = spk_m.group(1).strip() if spk_m else "unknown" - return lang, dataset, speaker_id - - -# --------------------------------------------------------------------------- -# Stage 1: Build speaker embedding index -# --------------------------------------------------------------------------- - - -def discover_speakers_from_cuts( - lang_to_shar_entries: Dict[str, List[Dict]], - max_cuts_per_speaker: int, - max_shards_per_dataset: int = 0, -) -> Dict[str, Dict]: - """ - Pass 1 (metadata only): Read cut JSONL files to discover unique speakers - and collect up to max_cuts_per_speaker cut metadata entries per speaker. - - Args: - max_shards_per_dataset: If > 0, only scan this many .jsonl.gz shard - files per shar group (dataset) instead of all shards. This - dramatically speeds up discovery for large datasets while still - finding most speakers. - - Returns: {speaker_str: {"language": str, "cut_metas": [list of (shar_entry, shard_idx, cut_json_dict)]}} - """ - speaker_info: Dict[str, Dict] = {} - - for lang, shar_entries in lang_to_shar_entries.items(): - logging.info(f"[Stage 1] Discovering speakers for language: {lang} ({len(shar_entries)} shar groups)") - for se in shar_entries: - cuts_pattern = se["shar_path"]["cuts"] - cuts_files = expand_shar_range(cuts_pattern) - if max_shards_per_dataset > 0 and len(cuts_files) > max_shards_per_dataset: - logging.info( - f" Limiting scan to {max_shards_per_dataset}/{len(cuts_files)} " - f"shards for dataset: {cuts_pattern}" - ) - cuts_files = cuts_files[:max_shards_per_dataset] - for cuts_file in cuts_files: - if not os.path.isfile(cuts_file): - continue - shard_idx_match = re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_file) - shard_idx = int(shard_idx_match.group(1)) if shard_idx_match else 0 - try: - with gzip.open(cuts_file, 'rt', encoding='utf-8') as f: - for line in f: - cut_json = 
json.loads(line) - supervisions = cut_json.get("supervisions", []) - if not supervisions: - continue - speaker_str = supervisions[0].get("speaker", "") - if not speaker_str: - continue - if speaker_str not in speaker_info: - speaker_info[speaker_str] = { - "language": lang, - "cut_metas": [], - } - if len(speaker_info[speaker_str]["cut_metas"]) < max_cuts_per_speaker: - speaker_info[speaker_str]["cut_metas"].append((se, shard_idx, cut_json)) - except Exception as e: - logging.warning(f"Error reading {cuts_file}: {e}") - - logging.info( - f"[Stage 1] Discovered {len(speaker_info)} unique speakers across {len(lang_to_shar_entries)} languages" - ) - for lang in sorted(lang_to_shar_entries.keys()): - n = sum(1 for v in speaker_info.values() if v["language"] == lang) - logging.info(f" {lang}: {n} speakers") - return speaker_info - - -def compute_speaker_embeddings( - speaker_info: Dict[str, Dict], - sv_model: torch.nn.Module, - device: torch.device, - batch_size: int = 16, -) -> Dict[str, Dict]: - """ - Pass 2: For each speaker, load audio from shar tars for the sampled cuts, - compute TitaNet embeddings, and average them into a single representative vector. 
- - Returns: {speaker_str: {"language": str, "embedding": np.ndarray}} - """ - speaker_embeddings: Dict[str, Dict] = {} - - speakers_needing_audio = {} - for spk, info in speaker_info.items(): - cut_metas = info["cut_metas"] - if not cut_metas: - continue - grouped_by_shar_and_shard: Dict[str, Dict[int, List]] = defaultdict(lambda: defaultdict(list)) - for se, shard_idx, cut_json in cut_metas: - shar_key = json.dumps(se["shar_path"], sort_keys=True) - grouped_by_shar_and_shard[shar_key][shard_idx].append((se, cut_json)) - speakers_needing_audio[spk] = { - "language": info["language"], - "grouped": grouped_by_shar_and_shard, - } - - # Collect audio in batches: load from shar, accumulate waveforms per speaker - speaker_audio_tensors: Dict[str, List[torch.Tensor]] = defaultdict(list) - - logging.info(f"[Stage 1] Loading audio for {len(speakers_needing_audio)} speakers to compute embeddings...") - - # Group all (shar_entry, shard_idx) that we need to load - shar_shard_to_speakers: Dict[Tuple[str, int], List[Tuple[str, str]]] = defaultdict(list) - for spk, data in speakers_needing_audio.items(): - for shar_key, shard_map in data["grouped"].items(): - for shard_idx, items in shard_map.items(): - for se, cut_json in items: - cut_id = cut_json.get("id", "") - shar_shard_to_speakers[(shar_key, shard_idx)].append((spk, cut_id)) - - # Process shard by shard to minimize tar file openings - total_shards = len(shar_shard_to_speakers) - for (shar_key, shard_idx), spk_cut_pairs in tqdm( - shar_shard_to_speakers.items(), desc="[Stage 1] Loading audio shards", total=total_shards - ): - se_shar_path = json.loads(shar_key) - cuts_files = expand_shar_range(se_shar_path["cuts"]) - target_audio_files = expand_shar_range(se_shar_path.get("target_audio", "")) - - if shard_idx >= len(cuts_files) or shard_idx >= len(target_audio_files): - logging.warning(f"Shard index {shard_idx} out of range, skipping") - continue - - cut_file = cuts_files[shard_idx] - target_tar = 
target_audio_files[shard_idx] - - if not os.path.isfile(cut_file) or not os.path.isfile(target_tar): - logging.warning(f"Missing shard files: cuts={cut_file}, target={target_tar}") - continue - - needed_cut_ids = {cut_id for (_, cut_id) in spk_cut_pairs} - cut_id_to_spk = {cut_id: spk for (spk, cut_id) in spk_cut_pairs} - - try: - fields = { - "cuts": [cut_file], - "recording": [target_tar], - } - # Also include context_recording if available, to avoid errors - context_audio_files = expand_shar_range(se_shar_path.get("context_audio", "")) - if shard_idx < len(context_audio_files) and os.path.isfile(context_audio_files[shard_idx]): - fields["context_recording"] = [context_audio_files[shard_idx]] - - shard_cutset = CutSet.from_shar(fields=fields) - for cut in shard_cutset: - if cut.id in needed_cut_ids: - spk = cut_id_to_spk[cut.id] - audio_np = cut.recording.resample(TITANET_SAMPLE_RATE).load_audio().squeeze(0) - audio_tensor = torch.from_numpy(audio_np).float() - speaker_audio_tensors[spk].append(audio_tensor) - needed_cut_ids.discard(cut.id) - if not needed_cut_ids: - break - except Exception as e: - logging.warning(f"Error loading shard {cut_file}: {e}") - - # Now compute embeddings in batches - logging.info(f"[Stage 1] Computing TitaNet embeddings for {len(speaker_audio_tensors)} speakers...") - all_speakers = list(speaker_audio_tensors.keys()) - - for batch_start in tqdm(range(0, len(all_speakers), batch_size), desc="[Stage 1] TitaNet batches"): - batch_speakers = all_speakers[batch_start : batch_start + batch_size] - audio_list = [] - audio_lens = [] - spk_indices = [] # maps each audio in batch back to speaker - - for spk in batch_speakers: - for audio_t in speaker_audio_tensors[spk]: - audio_list.append(audio_t.to(device)) - audio_lens.append(audio_t.size(0)) - spk_indices.append(spk) - - if not audio_list: - continue - - batch_lens = torch.tensor(audio_lens, device=device).long() - max_len = int(batch_lens.max().item()) - padded = 
torch.zeros(len(audio_list), max_len, device=device, dtype=torch.float32) - for i, t in enumerate(audio_list): - padded[i, : t.size(0)] = t - - with torch.inference_mode(): - _, embeddings = sv_model.forward(input_signal=padded, input_signal_length=batch_lens) - - embeddings_np = embeddings.cpu().float().numpy() - - # Average embeddings per speaker - spk_emb_accum: Dict[str, List[np.ndarray]] = defaultdict(list) - for i, spk in enumerate(spk_indices): - spk_emb_accum[spk].append(embeddings_np[i]) - - for spk in batch_speakers: - if spk in spk_emb_accum and spk_emb_accum[spk]: - avg_emb = np.mean(spk_emb_accum[spk], axis=0) - avg_emb = avg_emb / (np.linalg.norm(avg_emb) + 1e-8) - speaker_embeddings[spk] = { - "language": speakers_needing_audio[spk]["language"], - "embedding": avg_emb, - } - - logging.info(f"[Stage 1] Computed embeddings for {len(speaker_embeddings)} speakers") - return speaker_embeddings - - -def run_stage1( - lang_to_shar_entries: Dict[str, List[Dict]], - samples_per_speaker: int, - device: torch.device, - index_path: str, - batch_size: int = 16, - max_shards_per_dataset: int = 0, -) -> Dict[str, Dict]: - """Run full Stage 1: discover speakers, load audio, compute embeddings, save index.""" - if os.path.isfile(index_path): - logging.info(f"[Stage 1] Loading cached speaker index from {index_path}") - with open(index_path, 'rb') as f: - return pickle.load(f) - - from nemo.collections.asr.models import EncDecSpeakerLabelModel - - logging.info(f"[Stage 1] Loading TitaNet model: {TITANET_MODEL_NAME}") - sv_model = EncDecSpeakerLabelModel.from_pretrained(TITANET_MODEL_NAME) - sv_model = sv_model.to(device) - sv_model.eval() - - speaker_info = discover_speakers_from_cuts( - lang_to_shar_entries, - max_cuts_per_speaker=samples_per_speaker, - max_shards_per_dataset=max_shards_per_dataset, - ) - speaker_embeddings = compute_speaker_embeddings(speaker_info, sv_model, device, batch_size=batch_size) - - os.makedirs(os.path.dirname(index_path), exist_ok=True) - 
with open(index_path, 'wb') as f: - pickle.dump(speaker_embeddings, f) - logging.info(f"[Stage 1] Saved speaker index to {index_path}") - - del sv_model - torch.cuda.empty_cache() - return speaker_embeddings - - -# --------------------------------------------------------------------------- -# Stage 2: Cross-lingual speaker matching + language-balanced sampling -# --------------------------------------------------------------------------- - - -def build_crosslingual_map(speaker_embeddings: Dict[str, Dict]) -> Dict[str, Tuple[str, float]]: - """ - For each speaker S in language L, find the closest speaker S' from a different - language by cosine similarity of their TitaNet embeddings. - - Returns: {speaker_str: (best_match_speaker_str, cosine_similarity)} - """ - speakers = list(speaker_embeddings.keys()) - n = len(speakers) - logging.info(f"[Stage 2] Building cross-lingual map for {n} speakers...") - - # Build embedding matrix - emb_matrix = np.stack([speaker_embeddings[s]["embedding"] for s in speakers]) - langs = [speaker_embeddings[s]["language"] for s in speakers] - - # Cosine similarity matrix (embeddings are already L2-normalized) - sim_matrix = emb_matrix @ emb_matrix.T - - cross_lingual_map: Dict[str, Tuple[str, float]] = {} - for i in range(n): - best_j = -1 - best_sim = -2.0 - for j in range(n): - if langs[j] == langs[i]: - continue - if sim_matrix[i, j] > best_sim: - best_sim = sim_matrix[i, j] - best_j = j - if best_j >= 0: - cross_lingual_map[speakers[i]] = (speakers[best_j], float(best_sim)) - else: - logging.warning(f"No cross-lingual match found for speaker: {speakers[i]}") - - logging.info(f"[Stage 2] Built cross-lingual map with {len(cross_lingual_map)} entries") - avg_sim = np.mean([v[1] for v in cross_lingual_map.values()]) if cross_lingual_map else 0 - logging.info(f"[Stage 2] Average cross-lingual similarity: {avg_sim:.4f}") - return cross_lingual_map - - -def sample_balanced_cuts( - lang_to_shar_entries: Dict[str, List[Dict]], - 
cross_lingual_map: Dict[str, Tuple[str, float]], - target_hours: float, - seed: int, - max_shards_per_dataset: int = 0, -) -> Tuple[Dict[str, List[Dict]], Dict[str, List[Dict]]]: - """ - Sample cuts across languages so each language contributes approximately - target_hours / num_languages hours of target audio. - - Args: - max_shards_per_dataset: If > 0, only read this many shard files per - dataset. Since we only need ~6.25h per language, reading a small - fraction of shards is sufficient and avoids scanning tens of - thousands of files for large datasets. - - Returns: - target_cuts_by_lang: {lang: [list of cut_json dicts with extra metadata]} - context_pool_by_speaker: {speaker_str: [list of (shar_entry, shard_idx, cut_json)]} - """ - rng = random.Random(seed) - num_langs = len(lang_to_shar_entries) - hours_per_lang = target_hours / num_langs - secs_per_lang = hours_per_lang * 3600 - # Collect 3x the target to allow shuffling diversity - collect_secs_per_lang = secs_per_lang * 3 - - logging.info( - f"[Stage 2] Sampling ~{hours_per_lang:.2f}h per language ({num_langs} languages, {target_hours}h total)" - ) - - all_matched_speakers = set(v[0] for v in cross_lingual_map.values()) - - target_cuts_by_lang: Dict[str, List[Dict]] = {} - context_pool_by_speaker: Dict[str, List] = defaultdict(list) - - for lang, shar_entries in lang_to_shar_entries.items(): - logging.info(f"[Stage 2] Reading cuts for language: {lang}") - lang_cuts = [] - lang_collected_secs = 0.0 - lang_done = False - - for se in shar_entries: - if lang_done: - break - cuts_pattern = se["shar_path"]["cuts"] - cuts_files = expand_shar_range(cuts_pattern) - if max_shards_per_dataset > 0 and len(cuts_files) > max_shards_per_dataset: - cuts_files = cuts_files[:max_shards_per_dataset] - logging.info( - f" Limiting to {max_shards_per_dataset} shards for dataset: " f"{se['shar_path']['cuts']}" - ) - for cuts_file in cuts_files: - if lang_done: - break - if not os.path.isfile(cuts_file): - continue - 
shard_idx_match = re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_file) - shard_idx = int(shard_idx_match.group(1)) if shard_idx_match else 0 - try: - with gzip.open(cuts_file, 'rt', encoding='utf-8') as f: - for line in f: - cut_json = json.loads(line) - speaker_str = cut_json.get("supervisions", [{}])[0].get("speaker", "") - if not speaker_str: - continue - if speaker_str in all_matched_speakers: - context_pool_by_speaker[speaker_str].append((se, shard_idx, cut_json)) - if speaker_str in cross_lingual_map: - cut_json["_shar_entry"] = se - cut_json["_shard_idx"] = shard_idx - cut_json["_speaker_str"] = speaker_str - lang_cuts.append(cut_json) - lang_collected_secs += cut_json.get("duration", 0) - if lang_collected_secs >= collect_secs_per_lang: - lang_done = True - break - except Exception as e: - logging.warning(f"Error reading {cuts_file}: {e}") - - logging.info(f" {lang}: {len(lang_cuts)} candidate target cuts ({lang_collected_secs / 3600:.2f}h collected)") - - rng.shuffle(lang_cuts) - sampled = [] - total_dur = 0.0 - for cut_json in lang_cuts: - dur = cut_json.get("duration", 0) - if dur <= 0: - continue - sampled.append(cut_json) - total_dur += dur - if total_dur >= secs_per_lang: - break - - target_cuts_by_lang[lang] = sampled - logging.info(f" {lang}: sampled {len(sampled)} cuts, {total_dur / 3600:.2f}h") - - total_sampled = sum(len(v) for v in target_cuts_by_lang.values()) - total_hours = sum(sum(c.get("duration", 0) for c in v) for v in target_cuts_by_lang.values()) / 3600 - logging.info(f"[Stage 2] Total sampled: {total_sampled} cuts, {total_hours:.2f}h") - return target_cuts_by_lang, dict(context_pool_by_speaker) - - -# --------------------------------------------------------------------------- -# Stage 3: Extract audio + write NeMo manifest -# --------------------------------------------------------------------------- - - -def run_stage3( - target_cuts_by_lang: Dict[str, List[Dict]], - context_pool_by_speaker: Dict[str, List], - cross_lingual_map: Dict[str, 
Tuple[str, float]], - speaker_embeddings: Dict[str, Dict], - output_dir: str, - sample_rate: int, - seed: int, -): - """ - For each sampled target cut, pick a context utterance from the matched - cross-lingual speaker, extract both audios to disk, and write the manifest. - """ - rng = random.Random(seed) - audio_dir = os.path.join(output_dir, "extracted_audio") - target_audio_dir = os.path.join(audio_dir, "target") - context_audio_dir = os.path.join(audio_dir, "context") - os.makedirs(target_audio_dir, exist_ok=True) - os.makedirs(context_audio_dir, exist_ok=True) - - manifest_path = os.path.join(output_dir, "manifest.json") - - # Build a quick lookup: for each context cut we might need to load, - # index by (shar_key, shard_idx, cut_id) - # First, assign a context cut to each target - assignments: List[Dict] = [] - for lang, cuts in target_cuts_by_lang.items(): - for cut_json in cuts: - spk = cut_json["_speaker_str"] - matched_spk, ssim = cross_lingual_map[spk] - ctx_pool = context_pool_by_speaker.get(matched_spk, []) - if not ctx_pool: - logging.warning( - f"No context pool for matched speaker {matched_spk}, skipping cut {cut_json.get('id', '')}" - ) - continue - ctx_se, ctx_shard_idx, ctx_cut_json = rng.choice(ctx_pool) - assignments.append( - { - "target_cut_json": cut_json, - "target_shar_entry": cut_json["_shar_entry"], - "target_shard_idx": cut_json["_shard_idx"], - "target_speaker": spk, - "context_cut_json": ctx_cut_json, - "context_shar_entry": ctx_se, - "context_shard_idx": ctx_shard_idx, - "context_speaker": matched_spk, - "ssim": ssim, - "lang": lang, - } - ) - - logging.info(f"[Stage 3] Total assignments: {len(assignments)}") - - # Group by (shar_key, shard_idx) for efficient loading - # We need to load target and context audio from potentially different shards - # Strategy: process all assignments, grouping audio loads by shard - target_loads: Dict[Tuple[str, int], List[int]] = defaultdict(list) - context_loads: Dict[Tuple[str, int], List[int]] = 
defaultdict(list) - - for idx, a in enumerate(assignments): - t_shar_key = json.dumps(a["target_shar_entry"]["shar_path"], sort_keys=True) - target_loads[(t_shar_key, a["target_shard_idx"])].append(idx) - c_shar_key = json.dumps(a["context_shar_entry"]["shar_path"], sort_keys=True) - context_loads[(c_shar_key, a["context_shard_idx"])].append(idx) - - # Arrays to hold extracted audio file paths - target_audio_paths = [None] * len(assignments) - context_audio_paths = [None] * len(assignments) - - def _save_audio_from_shard( - shard_loads: Dict[Tuple[str, int], List[int]], - assignments_list: List[Dict], - cut_json_key: str, - out_subdir: str, - out_paths_array: List, - audio_field: str, - ): - """Load cuts from shar tars and save individual audio files to disk.""" - total_shards = len(shard_loads) - for (shar_key_str, shard_idx), indices in tqdm( - shard_loads.items(), desc=f"[Stage 3] Extracting {audio_field}", total=total_shards - ): - se_shar_path = json.loads(shar_key_str) - cuts_files = expand_shar_range(se_shar_path["cuts"]) - target_audio_files = expand_shar_range(se_shar_path.get("target_audio", "")) - - if shard_idx >= len(cuts_files) or shard_idx >= len(target_audio_files): - logging.warning(f"Shard {shard_idx} out of range, skipping") - continue - - cut_file = cuts_files[shard_idx] - tar_file = target_audio_files[shard_idx] - - if not os.path.isfile(cut_file) or not os.path.isfile(tar_file): - logging.warning(f"Missing files: {cut_file} or {tar_file}") - continue - - needed_cut_ids = {} - for i in indices: - cj = assignments_list[i][cut_json_key] - cid = cj.get("id", "") - needed_cut_ids[cid] = i - - try: - fields = {"cuts": [cut_file], "recording": [tar_file]} - ctx_audio_files = expand_shar_range(se_shar_path.get("context_audio", "")) - if ctx_audio_files and shard_idx < len(ctx_audio_files) and os.path.isfile(ctx_audio_files[shard_idx]): - fields["context_recording"] = [ctx_audio_files[shard_idx]] - - shard_cutset = CutSet.from_shar(fields=fields) - for 
cut in shard_cutset: - if cut.id in needed_cut_ids: - assign_idx = needed_cut_ids[cut.id] - audio_np = cut.recording.resample(sample_rate).load_audio().squeeze(0) - safe_id = cut.id.replace("/", "_") - out_file = os.path.join(out_subdir, f"{safe_id}.wav") - sf.write(out_file, audio_np, sample_rate) - out_paths_array[assign_idx] = os.path.relpath( - out_file, os.path.join(output_dir, "extracted_audio") - ) - del needed_cut_ids[cut.id] - if not needed_cut_ids: - break - except Exception as e: - logging.warning(f"Error processing shard {cut_file}: {e}") - - # Extract target audio - logging.info(f"[Stage 3] Extracting target audio from {len(target_loads)} shards...") - _save_audio_from_shard( - target_loads, - assignments, - "target_cut_json", - target_audio_dir, - target_audio_paths, - "target_audio", - ) - - # Extract context audio - logging.info(f"[Stage 3] Extracting context audio from {len(context_loads)} shards...") - _save_audio_from_shard( - context_loads, - assignments, - "context_cut_json", - context_audio_dir, - context_audio_paths, - "context_audio", - ) - - # Write manifest - logging.info(f"[Stage 3] Writing manifest to {manifest_path}") - written = 0 - skipped = 0 - with open(manifest_path, 'w', encoding='utf-8') as f: - for idx, a in enumerate(assignments): - if target_audio_paths[idx] is None or context_audio_paths[idx] is None: - skipped += 1 - continue - - t_cut = a["target_cut_json"] - c_cut = a["context_cut_json"] - t_sup = t_cut.get("supervisions", [{}])[0] - - text = t_sup.get("text", "") - normalized_text = t_sup.get("custom", {}).get("normalized_text", text) - ipa = t_sup.get("custom", {}).get("ipa", "") - speaker = t_sup.get("speaker", "") - duration = t_cut.get("duration", 0) - context_duration = c_cut.get("duration", 0) - ctx_lang_parsed, _, _ = parse_speaker_field(a["context_speaker"]) - - target_lang_parsed, _, _ = parse_speaker_field(speaker) - - entry = { - "audio_filepath": target_audio_paths[idx], - "text": text, - "normalized_text": 
normalized_text, - "speaker": speaker, - "language": target_lang_parsed, - "duration": duration, - "context_audio_filepath": context_audio_paths[idx], - "context_audio_duration": context_duration, - "context_speaker_similarity": round(a["ssim"], 6), - "context_language": ctx_lang_parsed, - "context_speaker": a["context_speaker"], - } - if ipa: - entry["ipa"] = ipa - - # Carry over any additional custom fields from the target supervision - _exclude_custom_keys = { - "target_audio_codes_path", - "context_audio_codes_path", - "context_audio_text", - "context_audio_normalized_text", - "context_audio_offset", - } - for k, v in t_sup.get("custom", {}).items(): - if k not in entry and k not in _exclude_custom_keys: - entry[k] = v - - f.write(json.dumps(entry, ensure_ascii=False) + "\n") - written += 1 - - logging.info(f"[Stage 3] Manifest written: {written} entries, {skipped} skipped") - return manifest_path - - -# --------------------------------------------------------------------------- -# YAML config generation (post Stage 4) -# --------------------------------------------------------------------------- - - -def generate_yaml_config(lhotse_shar_dir: str, output_yaml_path: str, data_mount_prefix: str = "/data"): - """ - Generate a lhotse YAML config pointing to the cross-lingual shar dataset. - Call this after running create_lhotse_shar_from_nemo_manifest.py on the manifest. - - Args: - lhotse_shar_dir: Absolute path to the lhotse_shar output directory - (containing cuts/, target_audio/, context_audio/). - output_yaml_path: Path to write the YAML config file. - data_mount_prefix: If shar_dir is under a mount, replace the host prefix - with this docker-internal prefix. Pass empty string to skip. 
- """ - cuts_dir = os.path.join(lhotse_shar_dir, "cuts") - target_audio_dir = os.path.join(lhotse_shar_dir, "target_audio") - context_audio_dir = os.path.join(lhotse_shar_dir, "context_audio") - - cuts_files = sorted(glob_module.glob(os.path.join(cuts_dir, "cuts.*.jsonl.gz"))) - context_files = sorted(glob_module.glob(os.path.join(context_audio_dir, "recording.*.tar"))) - - if not cuts_files: - logging.error(f"No cut files found in {cuts_dir}") - return - - # Determine shard range - first_idx = int(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[0]).group(1)) - last_idx = int(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[-1]).group(1)) - width = len(re.search(r"cuts\.(\d+)\.jsonl\.gz$", cuts_files[0]).group(1)) - - def _make_range_pattern(directory: str, prefix: str, ext: str) -> str: - path = os.path.join(directory, f"{prefix}.{{{first_idx:0{width}d}..{last_idx:0{width}d}}}.{ext}") - return path - - shar_path = { - "cuts": _make_range_pattern(cuts_dir, "cuts", "jsonl.gz"), - "target_audio": _make_range_pattern(target_audio_dir, "recording", "tar"), - } - if context_files: - shar_path["context_audio"] = _make_range_pattern(context_audio_dir, "recording", "tar") - - # Check for codec codes - for codec_dir_name in os.listdir(lhotse_shar_dir): - codec_subdir = os.path.join(lhotse_shar_dir, codec_dir_name) - if not os.path.isdir(codec_subdir): - continue - target_codes_dir = os.path.join(codec_subdir, "target_codes") - context_codes_dir = os.path.join(codec_subdir, "context_codes") - if os.path.isdir(target_codes_dir): - tc_files = sorted(glob_module.glob(os.path.join(target_codes_dir, "codes.*.tar"))) - if tc_files: - tc_first = int(re.search(r"codes\.(\d+)\.tar$", tc_files[0]).group(1)) - tc_last = int(re.search(r"codes\.(\d+)\.tar$", tc_files[-1]).group(1)) - tc_width = len(re.search(r"codes\.(\d+)\.tar$", tc_files[0]).group(1)) - shar_path["target_codes"] = os.path.join( - target_codes_dir, f"codes.{{{tc_first:0{tc_width}d}..{tc_last:0{tc_width}d}}}.tar" - ) - 
if os.path.isdir(context_codes_dir): - cc_files = sorted(glob_module.glob(os.path.join(context_codes_dir, "codes.*.tar"))) - if cc_files: - cc_first = int(re.search(r"codes\.(\d+)\.tar$", cc_files[0]).group(1)) - cc_last = int(re.search(r"codes\.(\d+)\.tar$", cc_files[-1]).group(1)) - cc_width = len(re.search(r"codes\.(\d+)\.tar$", cc_files[0]).group(1)) - shar_path["context_codes"] = os.path.join( - context_codes_dir, f"codes.{{{cc_first:0{cc_width}d}..{cc_last:0{cc_width}d}}}.tar" - ) - - yaml_entry = [ - { - "type": "lhotse_shar", - "shar_path": shar_path, - "weight": 1.0, - "tags": { - "task": "tts", - "lang": "crosslingual", - "tokenizer_names": ["nemotron_nano_30b"], - }, - } - ] - - os.makedirs(os.path.dirname(output_yaml_path) or ".", exist_ok=True) - with open(output_yaml_path, 'w') as f: - yaml.dump(yaml_entry, f, default_flow_style=False, sort_keys=False) - logging.info(f"YAML config written to {output_yaml_path}") - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="Create a cross-lingual context TTS dataset from multilingual lhotse shar data.", - ) - parser.add_argument( - "--master-yaml", - required=True, - type=str, - help="Path to the master multilingual YAML (e.g. 
train_25fpsSpectralCodecBWE_en_de_es_fr_hi_it_vi_zh_with_ipa.yaml).", - ) - parser.add_argument( - "--output-dir", - required=True, - type=str, - help="Base directory for all outputs (extracted audio, manifest, speaker index).", - ) - parser.add_argument( - "--target-hours", - type=float, - default=50.0, - help="Total hours of target audio to sample (split equally across languages).", - ) - parser.add_argument( - "--samples-per-speaker", - type=int, - default=5, - help="Number of utterances per speaker to use for computing the average TitaNet embedding.", - ) - parser.add_argument( - "--sample-rate", - type=int, - default=24000, - help="Sample rate for saving extracted audio files.", - ) - parser.add_argument( - "--embedding-batch-size", - type=int, - default=16, - help="Batch size for TitaNet embedding computation.", - ) - parser.add_argument( - "--max-shards-per-dataset", - type=int, - default=0, - help="Max number of .jsonl.gz shard files to scan per dataset during " - "speaker discovery (Stage 1). 0 means scan all shards. " - "Setting this to e.g. 10 dramatically speeds up discovery while " - "still finding most speakers.", - ) - parser.add_argument( - "--seed", - type=int, - default=42, - help="Random seed for reproducibility.", - ) - parser.add_argument( - "--log-level", - type=str, - default="INFO", - choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], - help="Logging level.", - ) - parser.add_argument( - "--generate-yaml", - type=str, - default=None, - help="If provided, skip stages 1-3 and instead generate a YAML config " - "pointing to the lhotse shar in OUTPUT_DIR/lhotse_shar. 
" - "Value is the output YAML file path.", - ) - args = parser.parse_args() - - log_level = getattr(logging, args.log_level.upper(), logging.INFO) - logging.basicConfig( - level=log_level, - format='%(asctime)s - %(levelname)s - %(message)s', - ) - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - - os.makedirs(args.output_dir, exist_ok=True) - - # --- Generate YAML config mode (post Stage 4) --- - if args.generate_yaml: - lhotse_shar_dir = os.path.join(args.output_dir, "lhotse_shar") - generate_yaml_config(lhotse_shar_dir, args.generate_yaml) - return - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - logging.info(f"Using device: {device}") - - # --- Parse master YAML --- - logging.info(f"Parsing master YAML: {args.master_yaml}") - lang_to_shar_entries = parse_master_yaml(args.master_yaml) - if not lang_to_shar_entries: - logging.error("No shar entries found. Check the master YAML path and contents.") - return - - for lang, entries in sorted(lang_to_shar_entries.items()): - logging.info(f" Language '{lang}': {len(entries)} shar groups (with context_audio)") - - # --- Stage 1: Build speaker embedding index --- - index_path = os.path.join(args.output_dir, "speaker_embedding_index.pkl") - speaker_embeddings = run_stage1( - lang_to_shar_entries, - samples_per_speaker=args.samples_per_speaker, - device=device, - index_path=index_path, - batch_size=args.embedding_batch_size, - max_shards_per_dataset=args.max_shards_per_dataset, - ) - - # --- Stage 2: Cross-lingual matching + balanced sampling --- - cross_lingual_map = build_crosslingual_map(speaker_embeddings) - target_cuts_by_lang, context_pool_by_speaker = sample_balanced_cuts( - lang_to_shar_entries, - cross_lingual_map, - target_hours=args.target_hours, - seed=args.seed, - max_shards_per_dataset=args.max_shards_per_dataset, - ) - - # --- Stage 3: Extract audio + write manifest --- - manifest_path = run_stage3( - target_cuts_by_lang, - 
context_pool_by_speaker, - cross_lingual_map, - speaker_embeddings, - args.output_dir, - args.sample_rate, - args.seed, - ) - - # --- Summary --- - logging.info("=" * 60) - logging.info("Cross-lingual context dataset creation complete!") - logging.info(f" Manifest: {manifest_path}") - logging.info(f" Audio dir: {os.path.join(args.output_dir, 'extracted_audio')}") - logging.info("") - logging.info("Next steps:") - logging.info(" 1. Convert to lhotse shar format:") - logging.info(f" python scripts/magpietts/create_lhotse_shar_from_nemo_manifest.py \\") - logging.info(f" --manifest-path {manifest_path} \\") - logging.info(f" --audio-base-dir {os.path.join(args.output_dir, 'extracted_audio')} \\") - logging.info(f" --output-dir {os.path.join(args.output_dir, 'lhotse_shar')} \\") - logging.info(f" --num-jobs 16 --processing-chunk-size 256 --audio-format flac --shuffle --shuffle-seed 42") - logging.info("") - logging.info(" 2. (Optional) Add codec codes:") - logging.info(f" python scripts/magpietts/extend_lhotse_shards_with_audio_codes.py \\") - logging.info(f" --cuts-dir {os.path.join(args.output_dir, 'lhotse_shar', 'cuts')} \\") - logging.info(f" --target-audio-dir {os.path.join(args.output_dir, 'lhotse_shar', 'target_audio')} \\") - logging.info(f" --context-audio-dir {os.path.join(args.output_dir, 'lhotse_shar', 'context_audio')} \\") - logging.info(f" --output-dir {os.path.join(args.output_dir, 'lhotse_shar')} \\") - logging.info(f" --codec-model-path ") - logging.info("") - yaml_out = os.path.join(args.output_dir, "crosslingual_context.yaml") - logging.info(" 3. 
Generate YAML config for training:") - logging.info(f" python scripts/magpietts/create_crosslingual_context_dataset.py \\") - logging.info(f" --master-yaml {args.master_yaml} \\") - logging.info(f" --output-dir {args.output_dir} \\") - logging.info(f" --generate-yaml {yaml_out}") - logging.info("=" * 60) - - -if __name__ == "__main__": - main() diff --git a/scripts/magpietts/inspect_crosslingual_dataset.py b/scripts/magpietts/inspect_crosslingual_dataset.py deleted file mode 100644 index 6fed5c93adf5..000000000000 --- a/scripts/magpietts/inspect_crosslingual_dataset.py +++ /dev/null @@ -1,161 +0,0 @@ -""" -Inspect the cross-lingual context dataset by decoding target and context -audio codes back to waveforms and saving them alongside the original -recording audio for comparison. - -Usage (inside docker): - python scripts/magpietts/inspect_crosslingual_dataset.py \ - --shar-dir /data/crosslingual_context_dataset/lhotse_shar \ - --codec-model-path /model_artifacts/25fps_spectral_codec_with_bandwidth_extension.nemo \ - --codec-name 25fpsSpectralCodecBWE \ - --output-dir /data/crosslingual_context_dataset/inspect \ - --num-samples 10 -""" - -import argparse -import logging -import os - -import numpy as np -import soundfile as sf -import torch -from lhotse import CutSet - -from nemo.collections.tts.models import AudioCodecModel - - -def main(): - parser = argparse.ArgumentParser(description="Inspect cross-lingual dataset: decode codes and save audio.") - parser.add_argument("--shar-dir", required=True, help="Path to lhotse_shar directory.") - parser.add_argument("--codec-model-path", required=True, help="Path to .nemo codec model.") - parser.add_argument("--codec-name", default="25fpsSpectralCodecBWE", help="Codec subdirectory name.") - parser.add_argument("--output-dir", required=True, help="Directory to save inspection outputs.") - parser.add_argument("--num-samples", type=int, default=10, help="Number of samples to inspect.") - args = parser.parse_args() - - 
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") - - os.makedirs(args.output_dir, exist_ok=True) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - # Load codec model - logging.info(f"Loading codec model from {args.codec_model_path}") - codec_model = AudioCodecModel.restore_from(args.codec_model_path, map_location="cpu", strict=False) - codec_model = codec_model.to(device) - codec_model.eval() - codec_sr = codec_model.sample_rate - logging.info(f"Codec output sample rate: {codec_sr}") - - # Build shar fields for first shard - cuts_dir = os.path.join(args.shar_dir, "cuts") - target_audio_dir = os.path.join(args.shar_dir, "target_audio") - context_audio_dir = os.path.join(args.shar_dir, "context_audio") - target_codes_dir = os.path.join(args.shar_dir, args.codec_name, "target_codes") - context_codes_dir = os.path.join(args.shar_dir, args.codec_name, "context_codes") - - # Use first shard only - fields = { - "cuts": [os.path.join(cuts_dir, "cuts.000000.jsonl.gz")], - "recording": [os.path.join(target_audio_dir, "recording.000000.tar")], - "context_recording": [os.path.join(context_audio_dir, "recording.000000.tar")], - "target_codes": [os.path.join(target_codes_dir, "codes.000000.tar")], - "context_codes": [os.path.join(context_codes_dir, "codes.000000.tar")], - } - - for k, v in fields.items(): - if not os.path.isfile(v[0]): - logging.error(f"Missing file for '{k}': {v[0]}") - return - - logging.info("Loading CutSet from shar...") - cutset = CutSet.from_shar(fields=fields) - - count = 0 - for cut in cutset: - if count >= args.num_samples: - break - - sup = cut.supervisions[0] if cut.supervisions else None - lang = sup.language if sup else "unk" - speaker = sup.speaker if sup else "unk" - ctx_lang = sup.custom.get("context_language", "unk") if sup and hasattr(sup, "custom") else "unk" - ssim = sup.custom.get("context_speaker_similarity", "N/A") if sup and hasattr(sup, "custom") else "N/A" - - 
sample_dir = os.path.join(args.output_dir, f"sample_{count:03d}_{lang}") - os.makedirs(sample_dir, exist_ok=True) - - logging.info(f"--- Sample {count} ---") - logging.info(f" Cut ID: {cut.id}") - logging.info(f" Target lang: {lang}, Context lang: {ctx_lang}, SSIM: {ssim}") - logging.info(f" Speaker: {speaker}") - if sup: - logging.info(f" Text: {sup.text[:80]}...") - - # 1. Save original target recording audio - target_audio_np = cut.recording.resample(codec_sr).load_audio().squeeze(0) - sf.write(os.path.join(sample_dir, "target_recording.wav"), target_audio_np, codec_sr) - logging.info(f" Saved target_recording.wav ({len(target_audio_np)/codec_sr:.2f}s)") - - # 2. Save original context recording audio - if cut.has_custom("context_recording"): - ctx_audio_np = cut.context_recording.resample(codec_sr).load_audio().squeeze(0) - sf.write(os.path.join(sample_dir, "context_recording.wav"), ctx_audio_np, codec_sr) - logging.info(f" Saved context_recording.wav ({len(ctx_audio_np)/codec_sr:.2f}s)") - - # 3. Decode target codes -> audio - if cut.has_custom("target_codes"): - target_codes_np = cut.target_codes.load().astype(np.int32) # (C, T) - target_codes_t = torch.from_numpy(target_codes_np).unsqueeze(0).to(device) # (1, C, T) - target_codes_len = torch.tensor([target_codes_t.shape[2]], device=device) - with torch.inference_mode(): - decoded_target, decoded_target_len = codec_model.decode( - tokens=target_codes_t, tokens_len=target_codes_len - ) - decoded_target_np = decoded_target[0, : decoded_target_len[0]].cpu().float().numpy() - sf.write( - os.path.join(sample_dir, "target_decoded_from_codes.wav"), - decoded_target_np, - codec_model.output_sample_rate, - ) - logging.info( - f" Saved target_decoded_from_codes.wav ({len(decoded_target_np)/codec_model.output_sample_rate:.2f}s), codes shape: {target_codes_np.shape}" - ) - else: - logging.warning(f" No target_codes found for cut {cut.id}") - - # 4. 
Decode context codes -> audio - if cut.has_custom("context_codes"): - ctx_codes_np = cut.context_codes.load().astype(np.int32) # (C, T) - ctx_codes_t = torch.from_numpy(ctx_codes_np).unsqueeze(0).to(device) # (1, C, T) - ctx_codes_len = torch.tensor([ctx_codes_t.shape[2]], device=device) - with torch.inference_mode(): - decoded_ctx, decoded_ctx_len = codec_model.decode(tokens=ctx_codes_t, tokens_len=ctx_codes_len) - decoded_ctx_np = decoded_ctx[0, : decoded_ctx_len[0]].cpu().float().numpy() - sf.write( - os.path.join(sample_dir, "context_decoded_from_codes.wav"), - decoded_ctx_np, - codec_model.output_sample_rate, - ) - logging.info( - f" Saved context_decoded_from_codes.wav ({len(decoded_ctx_np)/codec_model.output_sample_rate:.2f}s), codes shape: {ctx_codes_np.shape}" - ) - else: - logging.warning(f" No context_codes found for cut {cut.id}") - - # 5. Write metadata - with open(os.path.join(sample_dir, "info.txt"), "w") as f: - f.write(f"cut_id: {cut.id}\n") - f.write(f"target_language: {lang}\n") - f.write(f"context_language: {ctx_lang}\n") - f.write(f"speaker: {speaker}\n") - f.write(f"context_speaker_similarity: {ssim}\n") - f.write(f"text: {sup.text if sup else ''}\n") - f.write(f"duration: {cut.duration}\n") - - count += 1 - - logging.info(f"Done. 
Saved {count} samples to {args.output_dir}") - - -if __name__ == "__main__": - main() diff --git a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json b/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json deleted file mode 100644 index 6d7e35116405..000000000000 --- a/scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json +++ /dev/null @@ -1,9954 +0,0 @@ -{ - "version": "1.0", - "truncation": null, - "padding": null, - "added_tokens": [ - { - "id": 0, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 1, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 2, - "content": "", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - } - ], - "normalizer": null, - "pre_tokenizer": { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": true, - "use_regex": true - }, - "post_processor": null, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": true, - "use_regex": true - }, - "model": { - "type": "BPE", - "dropout": null, - "unk_token": "", - "continuing_subword_prefix": null, - "end_of_word_suffix": null, - "fuse_unk": false, - "byte_fallback": false, - "ignore_merges": false, - "vocab": { - "": 0, - "": 1, - "": 2, - "(": 3, - ")": 4, - "-": 5, - ".": 6, - "1": 7, - "2": 8, - "4": 9, - "5": 10, - "6": 11, - "7": 12, - "F": 13, - "a": 14, - "b": 15, - "c": 16, - "d": 17, - "e": 18, - "f": 19, - "h": 20, - "i": 21, - "j": 22, - "k": 23, - "l": 24, - "m": 25, - "n": 26, - "o": 27, - "p": 28, - "q": 29, - "r": 30, - "s": 31, - "t": 32, - "u": 33, - "v": 34, - "w": 35, - "x": 36, - "y": 37, - "z": 38, - "¡": 39, - "£": 40, - "¦": 41, - "§": 42, - "©": 43, - "ª": 44, - "¬": 45, - "°": 46, - "²": 47, - "³": 48, - "¸": 49, - "¹": 50, - 
"¾": 51, - "Ã": 52, - "Å": 53, - "É": 54, - "Ê": 55, - "Ë": 56, - "Ì": 57, - "Î": 58, - "Ï": 59, - "Ċ": 60, - "Ġ": 61, - "Ģ": 62, - "ģ": 63, - "Ĥ": 64, - "ĥ": 65, - "ĩ": 66, - "Ī": 67, - "Ĭ": 68, - "ĭ": 69, - "Į": 70, - "į": 71, - "İ": 72, - "ı": 73, - "IJ": 74, - "ij": 75, - "Ĵ": 76, - "ĵ": 77, - "Ķ": 78, - "ķ": 79, - "ĸ": 80, - "Ĺ": 81, - "Ļ": 82, - "Ľ": 83, - "ľ": 84, - "Ŀ": 85, - "Ł": 86, - "ËĪ": 87, - "ËIJ": 88, - "ËĪÉ": 89, - "ËĮ": 90, - "ÉĻ": 91, - "ËĪa": 92, - "ËĪi": 93, - "Ġt": 94, - "ɪ": 95, - "ɾ": 96, - "ĠÉ": 97, - "Ġk": 98, - "Éľ": 99, - "Ġs": 100, - "ËĪe": 101, - "ÉĽ": 102, - "ËĪo": 103, - "Ġl": 104, - "ËĪÉĽ": 105, - "Ġd": 106, - "ÊĬ": 107, - "ËĪaËIJ": 108, - "Ġp": 109, - "Ìĥ": 110, - "Ġm": 111, - "ËĪu": 112, - "Åĭ": 113, - "ð": 114, - "ËĪÉĶ": 115, - "ÊĮ": 116, - "ËĮa": 117, - "Ġh": 118, - "ËĪÊĮ": 119, - "Ġn": 120, - "Êģ": 121, - "ËĪÉij": 122, - "Êĥ": 123, - "eËIJ": 124, - "Ġa": 125, - "Ġb": 126, - "ÉĶ": 127, - "ËĪÉĻ": 128, - "ÉĻn": 129, - "Ġf": 130, - "ËĪɪ": 131, - "É¡": 132, - "ËĪeËIJ": 133, - "Ġj": 134, - "nt": 135, - "Ġð": 136, - "ĠËĮ": 137, - "Ġts": 138, - "ĠÉ¡": 139, - "Éķ": 140, - "ËĪoËIJ": 141, - "ʰ": 142, - "aËIJ": 143, - "ËĪy": 144, - "ĠtÉķ": 145, - "ËĪiËIJ": 146, - "ĠÊ": 147, - "Ġv": 148, - "Ġw": 149, - "st": 150, - "Éij": 151, - "nd": 152, - "ËĮi": 153, - "̪": 154, - "ËĮe": 155, - "Ġz": 156, - "ËĪaɪ": 157, - "ËĪiÉĽ": 158, - "β": 159, - "ɹ": 160, - "ĠËĮa": 161, - "θ": 162, - "ĠhÉĽ": 163, - "ÊĪ": 164, - "iËIJ": 165, - "ËĮo": 166, - "Ġɪ": 167, - "Éľn": 168, - "Ġx": 169, - "ĠtÉĻ": 170, - "ËĪuËIJ": 171, - "ËĮÉĻ": 172, - "ĠjËĪi": 173, - "ËĮÉĽ": 174, - "ĠÉĽ": 175, - "ĠËĪa": 176, - "ËĮaËIJ": 177, - "Ġla": 178, - "Ġðe": 179, - "ĠhÉĽËIJ": 180, - "Ġe": 181, - "ç": 182, - "ÉĻl": 183, - "oËIJ": 184, - "ËĪÉiju": 185, - "ÊĴ": 186, - "uËIJ": 187, - "ĠÉĹ": 188, - "ĠÉķ": 189, - "ËĮeËIJ": 190, - "ĠtÉķËĪi": 191, - "os": 192, - "ËĪÉĶËIJ": 193, - "as": 194, - "ËĪÊĬ": 195, - "Ġi": 196, - "ËĪai": 197, - "ɲ": 198, - "ɪn": 199, - "ts": 200, - "ÉľÅĭ": 201, - "ĠÉŁ": 
202, - "ĠÊĥ": 203, - "ËĪeɪ": 204, - "ÉĽÉ¾": 205, - "ËĪÉĽËIJ": 206, - "ËĪÉĽÉ¾": 207, - "Ġr": 208, - "tÊĥ": 209, - "ËĮÉĶ": 210, - "ĠdÉĻ": 211, - "tÉĻ": 212, - "ou": 213, - "ËĪyÉĻ": 214, - "ĠËĮi": 215, - "ÉĻɾ": 216, - "ËĪÉĻÊĬ": 217, - "ËĪÊĮɾ": 218, - "ËĪÉĴ": 219, - "Ġth": 220, - "ËĪon": 221, - "Êĭ": 222, - "ËĪÉijËIJ": 223, - "ËĪÊĮh": 224, - "wËĪa": 225, - "ËĪei": 226, - "ll": 227, - "ĠÉIJ": 228, - "ÉijËIJ": 229, - "an": 230, - "ÉŁ": 231, - "ĠÊĭ": 232, - "Ġko": 233, - "kh": 234, - "ɪÅĭ": 235, - "ËĪaËIJɪ": 236, - "ĠtÊĥ": 237, - "ËĪaËIJt": 238, - "ĠËĮe": 239, - "ĠtÉķh": 240, - "ËĪuo": 241, - "ËĪonÉ¡": 242, - "Éĸ": 243, - "at": 244, - "Ġke": 245, - "ÉĴ": 246, - "ĠÉķËĪi": 247, - "ø": 248, - "ĠÉij": 249, - "ËĪeËIJk": 250, - "Åĵ": 251, - "re": 252, - "Ġɾ": 253, - "ĠkÉĶ": 254, - "ËĮÊĬ": 255, - "sk": 256, - "ĠÊĬ": 257, - "Ġand": 258, - "ɪç": 259, - "Ġme": 260, - "ËĪaɾ": 261, - "ĠËĪɪ": 262, - "na": 263, - "Ġβ": 264, - "ĠlËĪi": 265, - "jaËIJ": 266, - "li": 267, - "no": 268, - "Ġɪn": 269, - "ĠdËĮi": 270, - "Ġɲ": 271, - "tËIJ": 272, - "ÉĻm": 273, - "ĠlÉĻ": 274, - "ĠðÉĻ": 275, - "ɪk": 276, - "ËĪÉĽl": 277, - "Éľt": 278, - "Ġse": 279, - "es": 280, - "ËĪou": 281, - "ËĪaÊĬ": 282, - "ĠÉĶ": 283, - "ɪt": 284, - "ĠÅĭ": 285, - "ËĪÉĽn": 286, - "Êİ": 287, - "Ġkh": 288, - "ËĪÉĽnt": 289, - "ËĪaËIJɾ": 290, - "Ġki": 291, - "mp": 292, - "lt": 293, - "É£": 294, - "Ġpa": 295, - "ËĪÉĻËIJ": 296, - "ɪs": 297, - "ĠÉĴ": 298, - "Ġle": 299, - "ÉªÉľ": 300, - "ËĪÉĽt": 301, - "Ġde": 302, - "Ġɹ": 303, - "ĠtËĪoËIJ": 304, - "ĠÊģ": 305, - "ÊĥÉĻn": 306, - "ĠÊĬnt": 307, - "ËĪÉĶɾ": 308, - "ËĪað": 309, - "Ġaɪ": 310, - "ĠÊIJ": 311, - "ĠmËĪa": 312, - "ra": 313, - "ĠkËĪɪ": 314, - "kt": 315, - "ËIJp": 316, - "ĠÊĪ": 317, - "ËĪaËIJÊĬ": 318, - "ĠkËĪÊĮɾ": 319, - "ĠËĪÊĮ": 320, - "ĠÉĴv": 321, - "Ġel": 322, - "ks": 323, - "Ġkw": 324, - "ÉĻt": 325, - "ndo": 326, - "ei": 327, - "ĠËĮaËIJp": 328, - "se": 329, - "ÉĻɹ": 330, - "ËĪuei": 331, - "ÉĻs": 332, - "ĠkËĮo": 333, - "ĠÊĤ": 334, - "ĠËĮÊĬ": 335, - "Ġc": 336, - "ĠÉĽn": 337, - 
"ËĪant": 338, - "θj": 339, - "ËĮoËIJ": 340, - "ĠËĪaËIJ": 341, - "Ġpɾ": 342, - "si": 343, - "ĠËĪe": 344, - "ĠjuËIJ": 345, - "ĠkËĮe": 346, - "ËĮɪ": 347, - "ÉĶn": 348, - "ĠsËĪÊĮ": 349, - "ĠËĪu": 350, - "ni": 351, - "Ġst": 352, - "ĠdiËIJ": 353, - "ĠkeËIJ": 354, - "ĠjËĪiou": 355, - "ËĪaiÉľ": 356, - "ĠdÊĴ": 357, - "ĠËĪÉĶ": 358, - "va": 359, - "ËIJɾ": 360, - "ËĪø": 361, - "ËĮÉĻÊĬ": 362, - "ĠpËĪu": 363, - "Ġsu": 364, - "Ġma": 365, - "ĠÉĻ": 366, - "dÊĴ": 367, - "Ġpʰ": 368, - "le": 369, - "in": 370, - "ĠtÉķhËĪi": 371, - "ĠwËĪo": 372, - "ro": 373, - "ËĮy": 374, - "ɾa": 375, - "ĠsËĪi": 376, - "ðÉĻ": 377, - "ĠseËIJ": 378, - "la": 379, - "ĠÊĴ": 380, - "mb": 381, - "ĠhËĪoËIJ": 382, - "Ġbʰ": 383, - "ĠÉĽÉ¾": 384, - "Ġðat": 385, - "sp": 386, - "ÉĶɾ": 387, - "en": 388, - "ĠsÉĻ": 389, - "ËĪÉĶÉľ": 390, - "ĠlËĮa": 391, - "ĠËĮÉĽ": 392, - "ĠËĪy": 393, - "É¡aËIJ": 394, - "ĠdÉĽÉ¾": 395, - "ËĪÉĽÊģ": 396, - "Éľkh": 397, - "ËĪiÉĻ": 398, - "ËĪan": 399, - "ĠmËĪo": 400, - "ËĪaβ": 401, - "Ġal": 402, - "ĠËĪeËIJ": 403, - "Ġθ": 404, - "ĠnËĪi": 405, - "pʰ": 406, - "lla": 407, - "Ġpl": 408, - "ËĪÅĵ": 409, - "jËĪÉiju": 410, - "Ġav": 411, - "ĠmËĪi": 412, - "ĠfËĪa": 413, - "ËĪÉľ": 414, - "me": 415, - "ËĮÉĻh": 416, - "ËĪuÉĻ": 417, - "it": 418, - "jËĪe": 419, - "Ġo": 420, - "ËĪÉľËIJ": 421, - "ĠtÉķËĪiou": 422, - "ÉĶËIJ": 423, - "ĠnÉĻ": 424, - "ËĪÉĻÉľn": 425, - "ĠmÉĻ": 426, - "ĠdeËIJ": 427, - "mo": 428, - "sa": 429, - "jËĪÉĶ": 430, - "ËĪal": 431, - "ĠtÉķËĪiÉĽ": 432, - "ĠÉ¡ÉĻ": 433, - "ða": 434, - "Ġɪz": 435, - "Ġsa": 436, - "ri": 437, - "ĠËĮil": 438, - "ËĮu": 439, - "ĠkaËIJ": 440, - "ĠÉĻËIJ": 441, - "ĠÉĸ": 442, - "Ġka": 443, - "ËĪÊĮhi": 444, - "ĠjeËIJ": 445, - "Ġtʰ": 446, - "ne": 447, - "kËIJ": 448, - "ĠtsËĪai": 449, - "ĠËĪeËIJk": 450, - "nk": 451, - "ti": 452, - "ËĪaÉľn": 453, - "ĠkËIJ": 454, - "É¡ÉĻn": 455, - "ËĪia": 456, - "ĠÉĶËIJɾ": 457, - "Êı": 458, - "ĠËĮÊĮ": 459, - "ĠzËĪaËIJ": 460, - "Ġlos": 461, - "ÉĽs": 462, - "ËĪÉĶn": 463, - "ÉĽnt": 464, - "ÉĽn": 465, - "ĠÉŁËĪoËIJ": 466, - "çt": 467, - "Ġdas": 468, 
- "ĠxËĮo": 469, - "ËĪuÉľ": 470, - "ËĪas": 471, - "ĠbËĪÊĮ": 472, - "ËĪiÉĽÉľn": 473, - "ÉIJ": 474, - "ĠtsuËIJ": 475, - "ĠpËĮÉĽ": 476, - "ĠnËĪÉĶ": 477, - "ÊĬt": 478, - "ma": 479, - "ĠnËĪo": 480, - "ĠlËĪɪ": 481, - "ËĪÉĽs": 482, - "ɪl": 483, - "ĠÉķËĪiÉĽ": 484, - "ĠËĪÊĬ": 485, - "ÉĴt": 486, - "to": 487, - "ĠËĪo": 488, - "ËĮon": 489, - "ĠkwËĪa": 490, - "Ġɪt": 491, - "ĠhoËIJ": 492, - "ËĪiËIJk": 493, - "ĠËĮaËIJpk": 494, - "ËĪaɪn": 495, - "æ": 496, - "ÉĻnt": 497, - "ta": 498, - "lo": 499, - "ĠnËĪÉij": 500, - "ĠlËĪa": 501, - "ËĪiÉľ": 502, - "ĠwËĪei": 503, - "ÉĽÊģ": 504, - "ĠtËĪa": 505, - "ĠɾËĮÉĻh": 506, - "ĠÉķËĪiÉij": 507, - "ËĮiËIJ": 508, - "ËĮÉĽl": 509, - "ĠtÉĻÉľ": 510, - "ĠkËĪuo": 511, - "ĠtËĪu": 512, - "jËĪÉĽ": 513, - "ĠËĮin": 514, - "ɾe": 515, - "ĠkoËIJ": 516, - "ĠkËĪa": 517, - "ɾi": 518, - "ĠtÉķËĪiÉij": 519, - "lÉĻ": 520, - "ĠkÉĻ": 521, - "ĠtËĪi": 522, - "ĠÅĭËĪyÉĻ": 523, - "Ġtsh": 524, - "er": 525, - "av": 526, - "ĠkÉĶn": 527, - "ËĪÉĻÉľÅĭ": 528, - "ðo": 529, - "ËĪaËIJn": 530, - "ĠbʰËĪi": 531, - "ĠkËIJjaËIJ": 532, - "ÉĻz": 533, - "ĠpÊģ": 534, - "ĠdËĪɪ": 535, - "ĠziËIJ": 536, - "É¡eËIJ": 537, - "ĠtËĪÉĻ": 538, - "ɪz": 539, - "ĠnËĮon": 540, - "taËIJ": 541, - "bl": 542, - "te": 543, - "nËĮeËIJ": 544, - "ËĪɪl": 545, - "so": 546, - "ko": 547, - "uÊģ": 548, - "ĠÉ£": 549, - "ĠpaÊģ": 550, - "ĠËĪÉĽ": 551, - "jËĪuËIJ": 552, - "ËĮÊĮ": 553, - "yn": 554, - "ËĪiËIJn": 555, - "ĠlËĪaɪ": 556, - "ËĪɪÅĭ": 557, - "ĠtÉķhËĪy": 558, - "ĠnËĪÊĮhi": 559, - "ĠdËĮe": 560, - "ĠjËĪÉiju": 561, - "ĠtËĪÉiju": 562, - "ĠhËĪo": 563, - "ɪd": 564, - "ĠthËĪÉij": 565, - "mËĪe": 566, - "ĠËĪÉĻ": 567, - "ja": 568, - "Ġph": 569, - "ÉĽt": 570, - "ĠkËĪÊĮ": 571, - "tÉĻn": 572, - "mËĪÉij": 573, - "wËĪe": 574, - "ĠËĮaɪn": 575, - "Ġðɪs": 576, - "É¡ÉĻ": 577, - "ĠnËĪaËIJ": 578, - "ĠbËĪaËIJ": 579, - "Ġaθ": 580, - "ĠmËĮa": 581, - "ËĪÊĮha": 582, - "ĠdËĮa": 583, - "ËĪÊı": 584, - "ĠɲËĮy": 585, - "ĠpËĪa": 586, - "ËĪaðo": 587, - "di": 588, - "bÉľ": 589, - "ɳ": 590, - "ĠwiËIJ": 591, - "ĠnËĪɪ": 592, - "ĠÉ¡ËĪÉĶÉľ": 593, - "tËIJo": 
594, - "ËĮÉĻm": 595, - "ËĪaËIJr": 596, - "ĠmÉĽ": 597, - "ËĪeËIJÉ¡aËIJ": 598, - "ĠsËĮi": 599, - "ĠlËĮaËIJ": 600, - "nËĮaËIJ": 601, - "Ġsp": 602, - "tÊģ": 603, - "ĠÊİ": 604, - "ËĮÉijËIJ": 605, - "Ġkl": 606, - "kʰ": 607, - "il": 608, - "ĠÊĥt": 609, - "ĠËĮÊĬn": 610, - "al": 611, - "ĠsËĪÉĽ": 612, - "ĠmËĪaËIJ": 613, - "ĠÅĵ": 614, - "ĠÉ¡ËĪÊĮ": 615, - "ĠpËĮÉĽr": 616, - "ɾËĪa": 617, - "ËIJÊĪ": 618, - "ËĪaβa": 619, - "ĠwËĪÉĴ": 620, - "ĠxËĪuei": 621, - "ĠkhËĪo": 622, - "Ġlas": 623, - "ĠÉĹËĪo": 624, - "ĠfÉĽÉ¾": 625, - "ĠjËĪiÉĽ": 626, - "ĠtËĪe": 627, - "ĠkËĮÉĶ": 628, - "ĠdeËIJn": 629, - "Ġmo": 630, - "ĠpËĪi": 631, - "ĠtËĪÉij": 632, - "ËĪÉĽst": 633, - "wËĪÉij": 634, - "ËĪaɪt": 635, - "ÉĻÊĬ": 636, - "ĠËĪi": 637, - "ɪj": 638, - "aɪ": 639, - "ËĪaËIJÉľ": 640, - "ĠËĪɪs": 641, - "ĠpÉĶɾ": 642, - "Ã¦Éľn": 643, - "ka": 644, - "ÅĭÉ¡": 645, - "bÉĻn": 646, - "ÊĬf": 647, - "Ġpɹ": 648, - "ĠlËĮe": 649, - "ËĪiËIJd": 650, - "ËĪaËIJre": 651, - "ĠmËĪÊĮ": 652, - "ÉĻr": 653, - "ĠdÉij": 654, - "ËĪaËIJto": 655, - "ĠpËĪeËIJ": 656, - "ĠdËĪoËIJ": 657, - "ĠsËĮÊĬ": 658, - "ĠhËĪi": 659, - "ĠsËĪa": 660, - "ËĪeËIJn": 661, - "dÉĻ": 662, - "Ġpj": 663, - "ËĪÅĵÊģ": 664, - "lɪç": 665, - "ÉĴn": 666, - "ĠËĪÉĻr": 667, - "tËĪe": 668, - "Ġil": 669, - "ËĪaËIJl": 670, - "ĠsËĮÉĻÊĬ": 671, - "sÊĪ": 672, - "ĠdËĪuËIJ": 673, - "hËĪÉij": 674, - "ĠxËĪou": 675, - "ĠlËĪaiÉľ": 676, - "wËĪo": 677, - "ËĪÉĽnte": 678, - "Ġsy": 679, - "Ġzɪç": 680, - "ĠÉ¡ËĪu": 681, - "ĠÉķËĪy": 682, - "ËĪÉĶËIJl": 683, - "ÉĶl": 684, - "ĠtËĪo": 685, - "ĠÊĭoËIJ": 686, - "ĠiËIJ": 687, - "wËĪaða": 688, - "ËĪando": 689, - "Ġaθɼnt": 690, - "ĠaθɼntwËĪaða": 691, - "ĠtËĪiÉĽ": 692, - "ËĪeiÉľ": 693, - "ĠpËĮa": 694, - "ĠnËĪaɪ": 695, - "wa": 696, - "Ġfr": 697, - "ĠÊIJËĪÉĻÉľn": 698, - "ËĪua": 699, - "mi": 700, - "ĠmËĪÉĽ": 701, - "ËĪeËIJkʰ": 702, - "cʰ": 703, - "ĠwËĪÉij": 704, - "sta": 705, - "Ġtu": 706, - "Ġsk": 707, - "ËĪÉĶl": 708, - "ËĪeËIJÊĪ": 709, - "ĠlËĪaËIJɪ": 710, - "ĠlËĪaËIJ": 711, - "ËĪÉĽËIJs": 712, - "ËĪÉĽÉ¾a": 713, - "ËĪÉĻÉľt": 714, - "Ġyn": 715, - "dÉĻn": 
716, - "Ġdi": 717, - "ËĪiËIJs": 718, - "Ġðel": 719, - "ËĪÊĮr": 720, - "ĠhËĪaËIJ": 721, - "ĠbÉĻ": 722, - "ĠjËĪuËIJ": 723, - "lle": 724, - "sto": 725, - "ËĪɪt": 726, - "ËĪoËIJɾ": 727, - "bʰ": 728, - "mÉĻn": 729, - "ËĮuÉĻ": 730, - "ËĮÉĻɾ": 731, - "ËĪÊĮn": 732, - "ĠlËĪaɪk": 733, - "ĠbËĪa": 734, - "ɪð": 735, - "Ġlo": 736, - "zi": 737, - "ËĪÊĮst": 738, - "mËĪi": 739, - "ÉĶÊģ": 740, - "ĠnËĪɪçt": 741, - "Ġtɾ": 742, - "ĠdËĪeËIJkʰ": 743, - "ĠsËĮe": 744, - "ĠnËĪÉĻÊĬ": 745, - "Ġu": 746, - "Ġsi": 747, - "Ġɪç": 748, - "Ġpr": 749, - "ĠtÉķËĪy": 750, - "ĠmËĪu": 751, - "za": 752, - "ĠtÊģ": 753, - "Ġwɪð": 754, - "tËĪÉĽ": 755, - "ĠpËĪÊĮɾ": 756, - "ĠkËĪÉĶ": 757, - "ËĪoËIJr": 758, - "ĠhËĮa": 759, - "ĠkËĪonÉ¡": 760, - "ĠpuÊģ": 761, - "Ġdy": 762, - "ËĪɪn": 763, - "nte": 764, - "ĠkËĮa": 765, - "ËĪÉĻɪ": 766, - "Ġmi": 767, - "ĠÉ¡ËĮuÉĻ": 768, - "Ġʲ": 769, - "ĠfËĪÉij": 770, - "ĠvÉijËIJ": 771, - "ĠËĮaÊĬ": 772, - "ËĮuËIJ": 773, - "ĠËĪun": 774, - "ĠjËĪÊĮha": 775, - "juËIJ": 776, - "Ġmɪt": 777, - "ĠlËĪÉĽ": 778, - "ËĪeËIJÊĥ": 779, - "ĠfÉĶËIJ": 780, - "mÉĻ": 781, - "ɾt": 782, - "ĠkËĮon": 783, - "ĠlËĪÉĶ": 784, - "ĠxËĪÉiju": 785, - "pl": 786, - "ĠdËĪi": 787, - "ĠlËĪoËIJ": 788, - "sÉĻ": 789, - "ËĪaËIJva": 790, - "ĠlËĪu": 791, - "ĠÉ¡ËĮÉĻÊĬ": 792, - "Ġhav": 793, - "ĠËĮaËIJpkËĮoËIJ": 794, - "ɾËĪi": 795, - "ĠfËĪÉĻ": 796, - "ĠhËĮÉĻm": 797, - "ËĪonÉ¡Éľ": 798, - "jo": 799, - "ĠsÉĶ": 800, - "ËĪaËIJd": 801, - "wËĪiÉĻ": 802, - "ËĪand": 803, - "ËĮaɪn": 804, - "tɾ": 805, - "ĠËĮɪ": 806, - "ĠËĪuna": 807, - "ĠxwËĪÉij": 808, - "ĠjÉĶËIJ": 809, - "ÊģËĪi": 810, - "ĠkËĪuoÉľ": 811, - "Ġaβ": 812, - "ĠÉ¡ËĪaËIJ": 813, - "ano": 814, - "tÉĻl": 815, - "ĠrËĮe": 816, - "ËĮÊĮt": 817, - "ĠjËĪiÉij": 818, - "ĠɾËĮÉĻhaËIJ": 819, - "ĠmËĪe": 820, - "ĠËĪyÃ¦Éľn": 821, - "ĠfËĪu": 822, - "Ġbl": 823, - "nËĪi": 824, - "sÉĻn": 825, - "Ġaɪn": 826, - "ËĪiÊĬ": 827, - "Ġðeɪ": 828, - "Ġɪts": 829, - "Ġ(": 830, - "ËĪyËIJ": 831, - "ÉĻd": 832, - "ĠËĮo": 833, - "ĠÉĽs": 834, - "ĠviËIJ": 835, - "ËIJÉ¡eËIJ": 836, - "kËĪe": 837, - "ĠËĪal": 838, - "ÉĽl": 839, 
- "ĠÊĮ": 840, - "ËIJo": 841, - "ĠkËĪo": 842, - "ĠÊĪËĪuËIJ": 843, - "ĠsËĪɪ": 844, - "ËĪeËIJɾ": 845, - "Éľm": 846, - "ËĮÉĻn": 847, - "ËĪaËIJi": 848, - "ËĪoËIJl": 849, - "ɪËĮeËIJ": 850, - "ĠʲËĪy": 851, - "ĠkËĪÉĶËIJ": 852, - "sËĪi": 853, - "ĠlËĪe": 854, - "ËĮÉĴt": 855, - "ËĪiËIJp": 856, - "aÊģ": 857, - "ĠθËĪɪÅĭ": 858, - "ËĪÉĻËIJɪ": 859, - "ËĪÊĮl": 860, - "ĠhËĪoËIJtaËIJ": 861, - "ËĪoɪ": 862, - "nto": 863, - "zh": 864, - "ĠdeËIJm": 865, - "ĠkÉĶm": 866, - "ʰËĪiËIJk": 867, - "ĠdÊĴËĪÊĮst": 868, - "pɾ": 869, - "Ġly": 870, - "hËĪu": 871, - "ËĪÉĶø": 872, - "ËĪaËIJs": 873, - "ĠËĪan": 874, - "ĠËĪÉĴ": 875, - "Ġkan": 876, - "ĠtsËĪuo": 877, - "ËĪeËIJva": 878, - "Ġɡɾ": 879, - "Ġpo": 880, - "ĠtÊĥËĪÉĶ": 881, - "Êİa": 882, - "ĠmËĮi": 883, - "Êĥt": 884, - "tËĪi": 885, - "ĠhËĪÊĮ": 886, - "tÊĥe": 887, - "ĠfÉĶn": 888, - "ve": 889, - "ĠnËĮe": 890, - "ËĪÉĶÊģ": 891, - "iz": 892, - "ĠsËĪuo": 893, - "ËĪÉĽËIJr": 894, - "wËĪaÊģ": 895, - "ËĪaða": 896, - "Åĭk": 897, - "po": 898, - "ĠkËĪi": 899, - "ËĪad": 900, - "ĠvËĪi": 901, - "tÉķ": 902, - "ĠkËĪÉĻ": 903, - "ĠwËĪu": 904, - "ÉĴz": 905, - "ĠvÉijËIJɾ": 906, - "ÊģËĪÉĽ": 907, - "ĠkËĪaËIJ": 908, - "ke": 909, - "nÉĻ": 910, - "ËĪÊĮb": 911, - "ËĪuËIJɾ": 912, - "ËĮÉĻËIJ": 913, - "ĠÊĪʰËĪiËIJk": 914, - "ĠkËĪu": 915, - "ĠbËĮÊĮt": 916, - "Ġat": 917, - "Ġfɹ": 918, - "ËĪax": 919, - "ĠzoËIJ": 920, - "ĠtËĪaËIJ": 921, - "ĠðËĮe": 922, - "neËIJ": 923, - "ĠÉijËIJ": 924, - "ĠaÊĬf": 925, - "am": 926, - "ÊĬÅĭ": 927, - "ĠÉĶËIJ": 928, - "ĠÉķËĪiÉľÅĭ": 929, - "ĠËĪÉĶËIJl": 930, - "ɪm": 931, - "jËĪo": 932, - "ËĪiËIJÉŁ": 933, - "ĠkwËĮÉĽ": 934, - "ĠmËĪas": 935, - "ÉĻh": 936, - "ĠËĪaÊĬ": 937, - "ËĪÉĶɪ": 938, - "É¡ÉĻɾ": 939, - "rÉĻn": 940, - "ËĪɪk": 941, - "sse": 942, - "ĠpËĪÉij": 943, - "ĠÉĹËĮe": 944, - "ĠÉĹËĪi": 945, - "Ġaz": 946, - "ĠÉ¡ËĪÊĮjaËIJ": 947, - "ze": 948, - "ĠÉĹËĮaËIJ": 949, - "ĠfËĪi": 950, - "ĠËĮÉĴn": 951, - "ĠxËĪo": 952, - "ĠËĮÊĬna": 953, - "ĠtʰaËIJ": 954, - "ĠsÉij": 955, - "ËĪeɪÊĥÉĻn": 956, - "ĠtÉķËĪiÉľ": 957, - "ĠÉŁaËIJ": 958, - "pËIJ": 959, - "Ġply": 960, - "θËĪi": 
961, - "ËIJÉĸ": 962, - "ĠtËĪuei": 963, - "ĠlËĪÉĻ": 964, - "ĠdÉijËIJ": 965, - "ft": 966, - "ËĪam": 967, - "ĠsËĪÊĮkt": 968, - "ĠtËĪou": 969, - "ĠpËĪiÉĽ": 970, - "ĠËĪai": 971, - "ĠwËĪÉĴn": 972, - "ĠzËĮaɪn": 973, - "Ġest": 974, - "ĠmÉĶ": 975, - "ĠtÉķjËĪÉiju": 976, - "Éľp": 977, - "ËĪÊĮz": 978, - "bi": 979, - "ËĪÉĽËIJseËIJ": 980, - "ĠlËĪy": 981, - "ĠmËĮe": 982, - "ĠdËĮÉĽl": 983, - "ËĪiËIJl": 984, - "ĠkËĮomo": 985, - "ĠhËĪaÉľn": 986, - "ËĪoËIJne": 987, - "ĠkËĪÊĮɾt": 988, - "ĠsyÊģ": 989, - "ËĮÉĶɾ": 990, - "Ġɪf": 991, - "uv": 992, - "zÉĻn": 993, - "ol": 994, - "Ïĩ": 995, - "im": 996, - "ĠmËĪiÉĽ": 997, - "Ġðɪ": 998, - "ĠvËĪÉĽ": 999, - "ÊĬd": 1000, - "Ġtr": 1001, - "ËĪeËIJs": 1002, - "ðe": 1003, - "de": 1004, - "ʰÏĩ": 1005, - "ÉŁÊ°": 1006, - "ËĮÉĻËIJÉªÉľ": 1007, - "bËIJ": 1008, - "ËĪÊĬk": 1009, - "ĠnËĪÉĶÉªÉľ": 1010, - "ĠËĮiËIJ": 1011, - "ËĪÉijËIJt": 1012, - "ËĪiËIJɾ": 1013, - "Ġtɹ": 1014, - "ɾÉĶ": 1015, - "ĠwÉĴz": 1016, - "Ġvu": 1017, - "bÉĻl": 1018, - "bÉĻ": 1019, - "ɹi": 1020, - "nts": 1021, - "ĠsËĪaËIJ": 1022, - "dʰ": 1023, - "ĠtÊĬ": 1024, - "ĠÊİËĮi": 1025, - "βa": 1026, - "hËĪÉĻÉľÅĭ": 1027, - "ĠsËĪiËIJ": 1028, - "ĠpËĮaɾa": 1029, - "ËĪÉĽÉ¾ÉĶ": 1030, - "ËĪɪs": 1031, - "É£o": 1032, - "ĠËĮal": 1033, - "or": 1034, - "ĠbËĪÊĮh": 1035, - "ĠkËĪoËIJ": 1036, - "ĠtËĪÉĽ": 1037, - "ĠpËĪo": 1038, - "ĠÊĴÉĻ": 1039, - "pÊģ": 1040, - "ĠËĪaɪ": 1041, - "hËĪÉijÉľÅĭ": 1042, - "ÉĻli": 1043, - "ËĪeɪt": 1044, - "ĠjËĪiouÉľ": 1045, - "ĠdËĪÉĻ": 1046, - "ĠmËĪÉĶËIJ": 1047, - "lËĪi": 1048, - "ËĮyÉĻ": 1049, - "ĠlËĪoËIJÉ¡": 1050, - "ĠnËĪÊĮ": 1051, - "ĠhËĪÊĬ": 1052, - "ĠnËĪÉĻÉľÅĭ": 1053, - "ĠÊģÉĻ": 1054, - "zËĪi": 1055, - "ĠtËĪuËIJ": 1056, - "ĠkËĮome": 1057, - "ĠlËĪeËIJ": 1058, - "ËĪaËIJtaËIJ": 1059, - "Ġan": 1060, - "ĠËĪyu": 1061, - "ĠËĮÊĮÉ¡ÉĻɾ": 1062, - "ĠËĪɪn": 1063, - "ĠhËĪoÉĻ": 1064, - "vÉĻ": 1065, - "ËĪøËIJ": 1066, - "θja": 1067, - "ËĪuÉĻÉľn": 1068, - "ĠkÉĻɾ": 1069, - "ËĪat": 1070, - "jËĪø": 1071, - "ËĪÉĽtÊģ": 1072, - "ĠpËĪÉiju": 1073, - "stÉĻ": 1074, - "ĠwÉĴt": 1075, - "ËĪeËIJl": 1076, - "ÊĪi": 
1077, - "ĠxËĪaiÉľ": 1078, - "ËĪyÊģ": 1079, - "ĠhËĪoËIJÉ¡aËIJ": 1080, - "ĠtsËĪi": 1081, - "ĠËĪÊĮp": 1082, - "ĠnËĮÉĴt": 1083, - "ĠlËĪɪeËIJ": 1084, - "ĠhËĪa": 1085, - "Ġfl": 1086, - "ĠnËĪeËIJ": 1087, - "ËĮaËIJɪ": 1088, - "ĠtËĪuo": 1089, - "tÊĥËIJ": 1090, - "sËĪe": 1091, - "bʰi": 1092, - "ĠbËĪÊĮhÊĬt": 1093, - "ËĪÉĽnd": 1094, - "ĠsËĪÉĶ": 1095, - "ÉĻns": 1096, - "ËĮÉĻl": 1097, - "ÉĽÉľ": 1098, - "ĠÉ¡l": 1099, - "ËĪɪɾ": 1100, - "ËĪaËIJta": 1101, - "ÉľËIJ": 1102, - "ËĪÉĽnto": 1103, - "skËĮoËIJ": 1104, - "ËĪÉĽk": 1105, - "tsi": 1106, - "ĠtËĪonÉ¡": 1107, - "ĠbiËIJ": 1108, - "ĠhËĪaËIJɪ": 1109, - "ĠbËĪi": 1110, - "jj": 1111, - "Êİi": 1112, - "Ġkʰ": 1113, - "ĠsËĪo": 1114, - "llo": 1115, - "Ġbaɪ": 1116, - "ĠÉĽnt": 1117, - "ĠËĪiËIJ": 1118, - "ĠÉ¡ËĪo": 1119, - "ɾeËIJ": 1120, - "ĠkÊĭ": 1121, - "ĠmËĪeiÉľ": 1122, - "ÊĬËĪÉĶËIJ": 1123, - "ĠtËĪaɪ": 1124, - "Ġsus": 1125, - "Ġri": 1126, - "ĠvËĮÉĽ": 1127, - "ËĪiËIJno": 1128, - "vano": 1129, - "ĠdËĮiËIJ": 1130, - "ĠÊIJËĪaÉľn": 1131, - "ÊĤ": 1132, - "ĠÉIJb": 1133, - "ËĪaËIJh": 1134, - "ɪÊĥ": 1135, - "ĠdËĮella": 1136, - "tËIJi": 1137, - "ĠËĪÊĬn": 1138, - "ĠhiËIJ": 1139, - "ĠbËĪaËIJt": 1140, - "ĠthËĪi": 1141, - "Ġam": 1142, - "ĠËĪoËIJ": 1143, - "Ġhu": 1144, - "ĠkËĪÊĮh": 1145, - "ĠzËĪÉijËIJ": 1146, - "ĠÉ¡ËĮÉĶ": 1147, - "ĠËĪÉĻÊĬ": 1148, - "yËĪi": 1149, - "ĠlËĪÊĮ": 1150, - "ĠdËĪeËIJ": 1151, - "ĠsËĪÉĶËIJ": 1152, - "skËĮeËIJ": 1153, - "ɾo": 1154, - "ÊģËĪÉij": 1155, - "tËĪa": 1156, - "ĠkËĪÊĬ": 1157, - "ËĪante": 1158, - "ĠdÉĶ": 1159, - "ĠsËĪeɪ": 1160, - "ĠsÉĽt": 1161, - "ɹɪ": 1162, - "ĠÉ¡ËĮÉĻÊĬɪÅĭ": 1163, - "zo": 1164, - "ĠjËĪaËIJ": 1165, - "ĠÉĴvðÉĻ": 1166, - "ĠÊĿ": 1167, - "ĠÉĽl": 1168, - "ĠsËĪoËIJ": 1169, - "ĠthËĪiÉľ": 1170, - "ĠËĪÉĽl": 1171, - "ĠlyËĮi": 1172, - "ndÊĴ": 1173, - "ĠÉķjËĪÉiju": 1174, - "θa": 1175, - "ĠɾËĮÉĻheËIJ": 1176, - "Ġmaɪ": 1177, - "jÉĻ": 1178, - "ĠËĪÊĮb": 1179, - "asjËĪÉĶ": 1180, - "dÊģ": 1181, - "ĠkhËĪa": 1182, - "ĠËĪes": 1183, - "vi": 1184, - "fi": 1185, - "ËĮÉĻb": 1186, - "Ġre": 1187, - "ĠavËĮÉĽ": 1188, - "ĠtËĮi": 1189, - 
"Ġkɾ": 1190, - "Ġbɪk": 1191, - "ste": 1192, - "ËĪeËIJÊĥc": 1193, - "pt": 1194, - "zÉĻ": 1195, - "ĠwËĪaËIJ": 1196, - "kl": 1197, - "ĠsËĪÊĮm": 1198, - "ɪÊĪ": 1199, - "dz": 1200, - "vo": 1201, - "ËĮaÊĬt": 1202, - "nde": 1203, - "ĠdÉĽs": 1204, - "ĠÉŁËĪaËIJ": 1205, - "ĠrËĮi": 1206, - "sËĮeËIJ": 1207, - "É¡i": 1208, - "Ġals": 1209, - "ËĪiðo": 1210, - "ĠnËĪiÉľn": 1211, - "ÊĬl": 1212, - "tsËIJ": 1213, - "ËĪanto": 1214, - "ĠÉĹËĪÉĻÊĬ": 1215, - "kËIJi": 1216, - "ĠsËĪÊĮb": 1217, - "ĠnËĪa": 1218, - "ĠlËĮo": 1219, - "ĠphËĪi": 1220, - "mËĮe": 1221, - "Ġfa": 1222, - "kÉĻ": 1223, - "ĠzËĪu": 1224, - "ns": 1225, - "ĠÊģe": 1226, - "ĠbËĪo": 1227, - "ËĪaËIJti": 1228, - "Ġman": 1229, - "ĠlËĪiÉij": 1230, - "ĠÉĹËĮyÉĻ": 1231, - "ĠfËĪÉĶËIJ": 1232, - "ĠkÊĭËĪeËIJÊĥc": 1233, - "ĠxËĪÉij": 1234, - "ĠtÉķËĪu": 1235, - "jÉĻɾ": 1236, - "Ġɪst": 1237, - "wËĪi": 1238, - "ĠËĮaɪnÉĻ": 1239, - "ɪɡ": 1240, - "ĠsÊĪ": 1241, - "ËĪiÉĻl": 1242, - "ĠnËĪiÉĽÉľn": 1243, - "ĠËĮÉĽËIJ": 1244, - "ËĪaɪnd": 1245, - "ĠzËĪi": 1246, - "vÉĻn": 1247, - "mz": 1248, - "ðos": 1249, - "dÊĴËIJ": 1250, - "jËĪa": 1251, - "ɾËĪÉĶ": 1252, - "lËĪe": 1253, - "ʲ": 1254, - "ĠvËĪÉĶ": 1255, - "ĠlËĪiÉĽ": 1256, - "θe": 1257, - "mËĪente": 1258, - "ĠɪnðÉĻ": 1259, - "Ġaɪm": 1260, - "nÉĻn": 1261, - "ĠhÉĻm": 1262, - "ɾaËIJ": 1263, - "ĠsËĪuoÉľ": 1264, - "ĠɲËĪi": 1265, - "ĠɹËĪiÉĻl": 1266, - "lËĪa": 1267, - "ĠbËĪÉĶ": 1268, - "ĠkËĪai": 1269, - "ÊģËĪa": 1270, - "ĠwËĪÉľËIJ": 1271, - "ĠaËIJ": 1272, - "Ġpas": 1273, - "ËĪÊĮs": 1274, - "wËĪÉĽÉ¾": 1275, - "ĠÉĹËĪe": 1276, - "ĠhËĮatÉĻ": 1277, - "aɪn": 1278, - "ĠËĪÉĶpʰ": 1279, - "ÊģËĪe": 1280, - "ĠÉŁaËIJËĪeËIJÉ¡aËIJ": 1281, - "ĠËĪÊĬs": 1282, - "ĠtÉķhËĪiÉľ": 1283, - "ntÊĥ": 1284, - "ĠxËĪuo": 1285, - "ËĪuÊģ": 1286, - "Ġɪm": 1287, - "ɳÉĸ": 1288, - "ËĪyÉĻÉľkh": 1289, - "ĠËĪyÉĽ": 1290, - "ĠmËĮaËIJ": 1291, - "ÅĵÊģ": 1292, - "ĠËĪalt": 1293, - "ĠkÉĻm": 1294, - "Êİo": 1295, - "ĠÉIJn": 1296, - "Ġfy": 1297, - "ĠËĮÉĽra": 1298, - "ĠÉ¡ËĪÊĬ": 1299, - "ĠpËĪÊĮ": 1300, - "ls": 1301, - "ĠlËĪiËIJ": 1302, - "ĠÊĤËĪy": 1303, - 
"ĠbɪkËĪÊĮz": 1304, - "ĠÉ¡ÉĽt": 1305, - "Ġbɾ": 1306, - "tʰ": 1307, - "tÉĻlËĮÉĻb": 1308, - "xo": 1309, - "skËĮaËIJ": 1310, - "ɲʲ": 1311, - "ËĪeËIJkÊĪ": 1312, - "rÉĻ": 1313, - "tÊĥo": 1314, - "ĠpÊģÉĶ": 1315, - "ĠɹËĪaɪt": 1316, - "ĠpËĪei": 1317, - "ËĮɪç": 1318, - "jËĪÉĽÉ¾": 1319, - "tËIJa": 1320, - "ĠÉIJbËĮaÊĬt": 1321, - "ĠkÊĭËĪeËIJÊĥcÉĻn": 1322, - "ĠvËĪe": 1323, - "ÊĬÉľ": 1324, - "ĠakËĪe": 1325, - "ĠpËĪai": 1326, - "vËĪÉĽ": 1327, - "Ġθɹ": 1328, - "ɪf": 1329, - "ĠavËĪÉĽ": 1330, - "ĠkËĪe": 1331, - "dËĪi": 1332, - "ËĪeËIJÉĸ": 1333, - "ĠbÉĻt": 1334, - "ÊĪʰ": 1335, - "teËIJ": 1336, - "θjËĪÉĶn": 1337, - "dÉľ": 1338, - "ĠjËĪiÉľ": 1339, - "Ġve": 1340, - "É£ËĪu": 1341, - "ËĪÊĮhÉĻl": 1342, - "ĠpÉĶ": 1343, - "ĠÉ¡r": 1344, - "Ġða": 1345, - "ĠvËĪiËIJ": 1346, - "ĠËĮÉijËIJ": 1347, - "ËĪÉĻÊĬnt": 1348, - "ĠbËĪaËIJɾ": 1349, - "ĠmËĪÊĮtÉĻlËĮÉĻb": 1350, - "ld": 1351, - "ĠtÉķËĮÉĶ": 1352, - "pa": 1353, - "ðËĪad": 1354, - "ËĪiɾ": 1355, - "ĠxËĪu": 1356, - "ĠlËĪiÉľÅĭ": 1357, - "ËĪeɪs": 1358, - "ĠÉĹËĮeÉľn": 1359, - "ĠthËĪiÉĽ": 1360, - "tËIJe": 1361, - "ĠavËĮÉĽk": 1362, - "ĠËĮÉĶ": 1363, - "ĠkËĪÉiju": 1364, - "ɪv": 1365, - "iËIJz": 1366, - "ËĪos": 1367, - "Ġɡɹ": 1368, - "and": 1369, - "ĠlËĪiou": 1370, - "ĠËĪoÉľ": 1371, - "É¡l": 1372, - "ĠpËĪÉĶËIJ": 1373, - "ĠmËĮeËIJ": 1374, - "ĠkËĪÉĴ": 1375, - "nos": 1376, - "çÉĻn": 1377, - "fÉĻn": 1378, - "ĠsËĪÊĮktËĮeËIJ": 1379, - "ĠËĪaɪn": 1380, - "ËĪoËIJre": 1381, - "jËĪÉĽn": 1382, - "ĠðËĪÉĽn": 1383, - "ĠtÉķhËĪiÉĽÉľn": 1384, - "ĠhËĪaɪ": 1385, - "ɾËĪÉĽ": 1386, - "ĠsËĪu": 1387, - "ĠkËĪɪjaËIJ": 1388, - "ĠpjËĮÊĬ": 1389, - "ĠhÉĻmËĮaËIJ": 1390, - "ĠËĮÊĮp": 1391, - "ĠpËĪÊĮhÉĻl": 1392, - "ĠxËĪÉĻ": 1393, - "dËĪe": 1394, - "ĠmÉij": 1395, - "ĠÊĬm": 1396, - "ndÉĻ": 1397, - "ĠdËĪÉĻÊĬnt": 1398, - "ËĪeËIJÊĥÉĻn": 1399, - "Ġðats": 1400, - "is": 1401, - "ĠcËĪaËIJh": 1402, - "pe": 1403, - "ĠsËĮo": 1404, - "ĠðËĪe": 1405, - "ĠsËĪaËIJt": 1406, - "ËĪaÊģ": 1407, - "ĠsËĪe": 1408, - "ÉĻk": 1409, - "ɪÊĭ": 1410, - "ĠkËĪoËIJi": 1411, - "kÉĶ": 1412, - "ĠvËĪaËIJÊĬ": 1413, - "ĠfËĪei": 1414, 
- "ĠlËĪeËIJk": 1415, - "ĠhËĪiÉĻ": 1416, - "ĠaÊĬ": 1417, - "ËĪÉĽndo": 1418, - "ËĪes": 1419, - "ĠzËĪÉĶ": 1420, - "ĠËĪÉĽÉ¾a": 1421, - "nËĪiÉľn": 1422, - "ĠkËĪÊĮm": 1423, - "ĠlËĪÉĴ": 1424, - "ɪst": 1425, - "ĠpÉij": 1426, - "ĠfËĪÉĶ": 1427, - "ĠthËĪonÉ¡": 1428, - "nke": 1429, - "ËĮɪk": 1430, - "ĠɲËĪÉĻ": 1431, - "ËĮÊĮm": 1432, - "ËĪiËIJt": 1433, - "ĠwËĪÉĴnt": 1434, - "ËĪaβan": 1435, - "ĠbËĪÊĮr": 1436, - "ÉĽnd": 1437, - "ĠËĮÉijËIJbÉľ": 1438, - "ĠvËĪaɪ": 1439, - "ĠtÊĥËĮi": 1440, - "ĠθËĪɪÅĭk": 1441, - "sti": 1442, - "Ġkɹ": 1443, - "ĠËĪaÊĬt": 1444, - "stÉĻn": 1445, - "ĠÊĭËĪÊĮn": 1446, - "ĠÉ¡ËĮaËIJ": 1447, - "ËĪaËIJÉľÉ²": 1448, - "Êģi": 1449, - "ĠnËĪÉĶx": 1450, - "ĠɹËĪiÉĻlɪ": 1451, - "ĠvËĮi": 1452, - "ĠðeÉĻ": 1453, - "ËĮɪtÊĥ": 1454, - "ĠvËĪyÉĻ": 1455, - "ĠËĮaËIJpkËĮaËIJ": 1456, - "ĠfËĮaËIJɪ": 1457, - "ĠpËĪÉĶ": 1458, - "ĠnËĪÊĮmb": 1459, - "θes": 1460, - "jËĪÉĽÊģ": 1461, - "ĠkËĪÊĬcʰ": 1462, - "mËĪÉĽ": 1463, - "ĠvËĪu": 1464, - "ĠlÅĵÊģ": 1465, - "ĠiËIJm": 1466, - "ÊĪÉĻɾ": 1467, - "tÊĥi": 1468, - "ËIJs": 1469, - "ĠtËĪy": 1470, - "ĠmËĪiÉľÅĭ": 1471, - "ɾËĪe": 1472, - "mËĮa": 1473, - "ĠmËĮiËIJ": 1474, - "ĠÉĽks": 1475, - "ɪp": 1476, - "ĠkËĪÊĮɾnËĮaËIJ": 1477, - "ĠËĮaÊĬx": 1478, - "rËĪiËIJ": 1479, - "ĠcËĪÊĮl": 1480, - "mos": 1481, - "ĠkËĪÊĮɾtËĮeËIJ": 1482, - "iËIJɾ": 1483, - "kÉĻn": 1484, - "ĠdËĪu": 1485, - "naËIJ": 1486, - "ĠpwËĪe": 1487, - "ËĮÉĶɪ": 1488, - "ĠtÉķhËĪiÉĽ": 1489, - "ĠβËĪi": 1490, - "ËĪiÉĽÉľt": 1491, - "Ġte": 1492, - "ËĪaðos": 1493, - "mËĪa": 1494, - "ĠvËĪo": 1495, - "ĠmËĪɪ": 1496, - "ĠbËĮi": 1497, - "ad": 1498, - "do": 1499, - "ĠnËĪaÊĬ": 1500, - "ĠʲËĪyÉľ": 1501, - "wËĪÉĽ": 1502, - "ËĪis": 1503, - "el": 1504, - "Ġpar": 1505, - "ĠtËĪai": 1506, - "ĠdËĪɪjaËIJ": 1507, - "hËĪi": 1508, - "ĠɾËĪÊĮ": 1509, - "ĠdËĪe": 1510, - "ËĪaɪd": 1511, - "Ġper": 1512, - "ĠsËĮÉĶ": 1513, - "we": 1514, - "ÊĬm": 1515, - "Ġin": 1516, - "ĠjËĪuËIJz": 1517, - "ËĪiËIJpÉĻl": 1518, - "ĠÊĭËĪaËIJl": 1519, - "ĠetËĪÉĽ": 1520, - "ËĮÉĽm": 1521, - "ĠnËĪu": 1522, - "ËĪÉĽkt": 1523, - "ĠiËIJɾ": 1524, - "Ġbɹ": 1525, 
- "ĠtshËĪi": 1526, - "ĠÉĹËĪÉĶÉľ": 1527, - "ĠkwËĮa": 1528, - "ĠfËĪuÉľ": 1529, - "wËĮa": 1530, - "ĠdËĪiËIJ": 1531, - "ĠÉ¡ËĪyÉĻ": 1532, - "ËĮÉĽËIJ": 1533, - "rËĪa": 1534, - "Ġne": 1535, - "ĠzËĪyÉĻ": 1536, - "ĠbËĪaɪ": 1537, - "ĠÉŁËĪÊĮb": 1538, - "ËĪuËIJto": 1539, - "ÊĬnt": 1540, - "Ġcʰ": 1541, - "ËĪÉĽnti": 1542, - "ËĪoÉĻ": 1543, - "ĠsËĮÊĮm": 1544, - "ĠlÉij": 1545, - "ËĮeva": 1546, - "É¾ÉĽ": 1547, - "ntÉľ": 1548, - "ĠmËĪÉĽn": 1549, - "ËĪÉijËIJk": 1550, - "Ġkil": 1551, - "ËĪones": 1552, - "ff": 1553, - "ĠmËĪÉĽËIJ": 1554, - "ĠvËĪÉĻɪ": 1555, - "ĠËĪÉĶËIJ": 1556, - "ĠËĮɪnt": 1557, - "ÊĬn": 1558, - "Ġwɪl": 1559, - "Ġsin": 1560, - "ĠËĮalla": 1561, - "ĠaβËĪia": 1562, - "pi": 1563, - "ËĪoÉľ": 1564, - "ɪjËĮaËIJ": 1565, - "ku": 1566, - "ĠvËĪɪ": 1567, - "Ġtut": 1568, - "ĠtËĪeÉľ": 1569, - "ĠhËĪÉĶ": 1570, - "βɾe": 1571, - "sÉĻɾ": 1572, - "ĠkhËĪai": 1573, - "ĠmËĪÉĶ": 1574, - "Ġta": 1575, - "ĠɲËĪaËIJ": 1576, - "Ġnu": 1577, - "ËĪuËIJn": 1578, - "ĠÉĻËIJÉľ": 1579, - "ĠËĪaÊĬf": 1580, - "ËĪiËIJdÉľ": 1581, - "nti": 1582, - "ĠpËĪiËIJpÉĻl": 1583, - "Ġkj": 1584, - "Ġpe": 1585, - "ĠmËĪÉij": 1586, - "ËĮaɪ": 1587, - "ËĪaËIJle": 1588, - "ĠvËĮÉĻËIJÉªÉľ": 1589, - "mpo": 1590, - "ĠkËĪɪt": 1591, - "ĠnËĮÉĽ": 1592, - "ĠÉŁËĪaËIJtaËIJ": 1593, - "ĠsËĪaËIJtʰ": 1594, - "ĠÉŁËĪi": 1595, - "Ġso": 1596, - "ĠbËĪÉĽ": 1597, - "kËĪi": 1598, - "ɪti": 1599, - "Ġtsi": 1600, - "ĠkÊģ": 1601, - "ËĮÉĴ": 1602, - "É¡ÉĻl": 1603, - "kst": 1604, - "ĠmËĪÉĻËIJ": 1605, - "ËĪÊĮk": 1606, - "ĠnËĪaËIJÊĬ": 1607, - "Ġap": 1608, - "ĠlËĪɪkʰ": 1609, - "lli": 1610, - "ĠkwËĪal": 1611, - "ĠËĪÉĻËIJ": 1612, - "ĠtsËĪuei": 1613, - "Ġdo": 1614, - "ĠkËIJjËĪo": 1615, - "ÊĬz": 1616, - "ĠpËĪaËIJ": 1617, - "ĠmËĪuËIJ": 1618, - "ĠÉ¡ÉĻv": 1619, - "rËĪi": 1620, - "Ġtw": 1621, - "ËĮɪn": 1622, - "dËĪÉij": 1623, - "ĠðËĪi": 1624, - "ĠËĪaËIJi": 1625, - "ĠhËĪiÉĽ": 1626, - "ĠðËĮÉĽm": 1627, - "ĠpʰËĪɪɾ": 1628, - "ÉĴm": 1629, - "ĠËĮeËIJ": 1630, - "ĠthËĪaiÉľ": 1631, - "ĠvËĪas": 1632, - "ĠnÉijËIJ": 1633, - "pÉĻn": 1634, - "ĠpËĮÉĻɾ": 1635, - "ĠÉĹËĪaËIJɪ": 1636, - 
"ËĪouÉľ": 1637, - "ĠÊIJËĪuÉľ": 1638, - "ĠmËĪan": 1639, - "ĠtËĪÉĻÉªÉľ": 1640, - "ĠlËĪaËIJÊĬ": 1641, - "mËĪÉĽnte": 1642, - "ĠfËĪam": 1643, - "sjËĪÉĶ": 1644, - "ĠpËĪÉĻ": 1645, - "ËĪeËIJm": 1646, - "ĠpËĪÊĮr": 1647, - "jËĪi": 1648, - "ĠlÉĽ": 1649, - "Ġten": 1650, - "ËĪoËIJra": 1651, - "ki": 1652, - "ĠÊĤËĪaËIJÊĬ": 1653, - "kɪ": 1654, - "bËIJe": 1655, - "ËĪalt": 1656, - "ðɪ": 1657, - "pËĪi": 1658, - "ĠËĮÉĽnt": 1659, - "ĠmËĪei": 1660, - "ĠhËĪÉĻÊĬ": 1661, - "ĠhËĪÉĽÉ¾": 1662, - "jËĪÉij": 1663, - "ĠhËĪÊĬaËIJ": 1664, - "mÉľ": 1665, - "Ġdʰ": 1666, - "ĠtÊĥËĪe": 1667, - "lËĪÉĽ": 1668, - "ËĪaËIJte": 1669, - "ĠpËĪuËIJ": 1670, - "ĠmËĪÊĬ": 1671, - "ËĪaËIJɪÊĪ": 1672, - "diËIJ": 1673, - "ĠfɹÉĴm": 1674, - "ĠhËĪÉijËIJ": 1675, - "βo": 1676, - "ĠmËĪiÉľn": 1677, - "ĠðiËIJz": 1678, - "ĠkËĪou": 1679, - "ËĪiËIJna": 1680, - "ĠavËĮeva": 1681, - "ĠËĪaËIJɾ": 1682, - "ĠnËĪuËIJɾ": 1683, - "ĠβËĪe": 1684, - "Ġzaɪn": 1685, - "ËĪÉĽd": 1686, - "ÉĹ": 1687, - "ËĪeɪk": 1688, - "sËĮÉĻÊĬ": 1689, - "ËĪeËIJÉŁ": 1690, - "ĠÊĤËĪÉĻËIJ": 1691, - "je": 1692, - "cʰËIJ": 1693, - "ËĪÉĶr": 1694, - "ÉĽËIJ": 1695, - "ĠtÉķhËĪyÃ¦Éľn": 1696, - "ĠËĮaɪnÉĻn": 1697, - "ĠiËIJn": 1698, - "ĠbËĪÊĮc": 1699, - "ËĪiËIJm": 1700, - "ɾas": 1701, - "ËĮÉĻs": 1702, - "ĠvËĪeËIJ": 1703, - "ĠËĪÉĻrÉľ": 1704, - "ĠduËIJ": 1705, - "ntÉĻ": 1706, - "ĠpɹËĪÉĴ": 1707, - "ĠbËĪɪ": 1708, - "ĠwËĪoÉľ": 1709, - "nËĮi": 1710, - "ĠhÉIJ": 1711, - "ĠkËĪÉĽ": 1712, - "Ġet": 1713, - "jËĪÉĽndo": 1714, - "ĠËĪaiÉľ": 1715, - "Ġli": 1716, - "ĠËĪaÊĬs": 1717, - "kËIJo": 1718, - "ĠÉĹËĪyÉĻ": 1719, - "keËIJ": 1720, - "ĠfËĪiËIJl": 1721, - "ĠbʰËĪaËIJi": 1722, - "ĠÉ¡ÉĻÊĥ": 1723, - "ÊĴËĪe": 1724, - "ĠnjËĪuËIJ": 1725, - "ĠËĪak": 1726, - "ĠÉĹËĪaËIJ": 1727, - "zËĪa": 1728, - "vËĪe": 1729, - "ĠhËĮaÊĬ": 1730, - "ÉIJç": 1731, - "ĠɾËĪÊĮkʰ": 1732, - "pËĪe": 1733, - "ĠtÉĻbi": 1734, - "ĠpËĪÊĮhÉĻlËĮeËIJ": 1735, - "ĠfËĪÉĽ": 1736, - "ĠwËĮɪtÊĥ": 1737, - "ĠtÉķËĪyÉĽÉľ": 1738, - "wËĮe": 1739, - "ËĮaɪt": 1740, - "ĠnÉijËIJx": 1741, - "ĠkËĪÉĶËIJn": 1742, - "ÊĬk": 1743, - "ĠbËĪaËIJd": 1744, - "ÅĭÉĻn": 
1745, - "Ġni": 1746, - "ĠbËĪe": 1747, - "ĠmËĮÊĬ": 1748, - "ËĪar": 1749, - "ĠmËĮeɪk": 1750, - "ĠsËĪaËIJɾ": 1751, - "βe": 1752, - "ĠtÉķhËĪiÉľÅĭ": 1753, - "itËĪe": 1754, - "kËĮe": 1755, - "ËĪÉĽËIJl": 1756, - "ËĮÉĴn": 1757, - "ËĮÉij": 1758, - "ĠbËĪɪl": 1759, - "ĠwÊĬd": 1760, - "ĠbËĪoËIJl": 1761, - "rd": 1762, - "iÉĻ": 1763, - "Ġda": 1764, - "ĠbËĪaËIJÊĬ": 1765, - "ĠnËĪÊĮmbÉĻɾ": 1766, - "ËĪaËIJÉªÉľ": 1767, - "ĠÉĽm": 1768, - "ĠmiËIJɾ": 1769, - "ËĪeɪm": 1770, - "los": 1771, - "ËĮÉĽt": 1772, - "ĠËĮaÊĬs": 1773, - "ĠmËĪaÉľt": 1774, - "ĠwËĪuÉĻ": 1775, - "ĠwËĪeɪ": 1776, - "Ġseɲ": 1777, - "ĠbjËĪÉĽ": 1778, - "ĠwÉĽn": 1779, - "fl": 1780, - "ĠkhwËĪa": 1781, - "dËĪÉĽ": 1782, - "vɹɪ": 1783, - "ĠËĪaɾ": 1784, - "jËĪÉijuÉľ": 1785, - "ĠËĮaËIJpkËĮeËIJ": 1786, - "bÊģ": 1787, - "ĠtËĪaɪm": 1788, - "ĠËĪÉij": 1789, - "ĠsËĮa": 1790, - "ĠzËĪoɪ": 1791, - "ËĪÉĶɾa": 1792, - "ĠdËĪø": 1793, - "ËĪÉĶɾt": 1794, - "ĠÅĭËĪÉĶ": 1795, - "min": 1796, - "ĠlËĪÊĬk": 1797, - "ËĪÉĶËIJt": 1798, - "ĠËĪÉĶtɾ": 1799, - "ĠfËĪaɪ": 1800, - "ĠÉ¡ÉĴt": 1801, - "ËĪeËIJÉĻn": 1802, - "kËĪÉĶ": 1803, - "ĠvËĪÉĽÉ¹i": 1804, - "mÉĽ": 1805, - "ËĪaɪz": 1806, - "Ġesp": 1807, - "ɲa": 1808, - "ĠlËĪo": 1809, - "ËĪÉĽËIJra": 1810, - "βËĪi": 1811, - "ouÉľ": 1812, - "ËĮÉĻk": 1813, - "tÊĥuËIJ": 1814, - "ĠnËĪyÉĻ": 1815, - "ÊĪɾ": 1816, - "ĠÉ¡ËĪy": 1817, - "ĠtËĪoðo": 1818, - "ËĪɪçt": 1819, - "Ġmɪç": 1820, - "ĠËĪand": 1821, - "ĠkwËĮÉĽl": 1822, - "ĠÊĤËĪaËIJ": 1823, - "ĠnËĪiÉľ": 1824, - "ËĪÉĶp": 1825, - "ËĪiËIJz": 1826, - "ĠÊĤËĪaÊĬ": 1827, - "ĠɾËĮÉĻhi": 1828, - "ĠsËĮÊĬo": 1829, - "ĠÉĽÉ¡": 1830, - "ĠdÅĵ": 1831, - "ĠÉ¡ËĮaËIJÉªÉľ": 1832, - "dɪ": 1833, - "lËĮa": 1834, - "stËĪi": 1835, - "ĠdËĮiËIJz": 1836, - "ĠtËĮÊĬ": 1837, - "θi": 1838, - "ĠËĪɪskËĮoËIJ": 1839, - "ndÉĻn": 1840, - "Ġtsv": 1841, - "ĠhËĪÉĻËIJ": 1842, - "ĠÊĥËĪÊĬ": 1843, - "ÉĻtËĮeËIJ": 1844, - "pËĮÉĽ": 1845, - "ËĪaɾÉĶn": 1846, - "ĠpÉĽÊģ": 1847, - "Ġy": 1848, - "mnËĮeËIJ": 1849, - "ËĪÉĽllo": 1850, - "ĠÉ¡ËĪÉĻ": 1851, - "ĠËĮad": 1852, - "ĠÊĥv": 1853, - "ËĪÊıɾ": 1854, - "rËĪe": 1855, - "yËIJ": 
1856, - "ĠpËĪaËIJs": 1857, - "ĠËĪÉĽn": 1858, - "ɪdÊĴ": 1859, - "ËĪuai": 1860, - "Ġfi": 1861, - "ĠtËĪyÉĻ": 1862, - "ËĪaËIJÉŁ": 1863, - "ĠtjËĪe": 1864, - "ËĪaËIJnaËIJ": 1865, - "stɾ": 1866, - "Êİe": 1867, - "ËĮeɪt": 1868, - "ba": 1869, - "ðas": 1870, - "vÊģ": 1871, - "ĠzËĪÉĻËIJ": 1872, - "ËĪaËIJli": 1873, - "ÉŁÊ°eËIJ": 1874, - "ËĪaËIJteËIJ": 1875, - "ĠvËĪa": 1876, - "Ġsal": 1877, - "ËĪaËIJno": 1878, - "ĠÉ¡ÉĻz": 1879, - "ĠhËĪoËIJti": 1880, - "ĠɲËĪiÉĽ": 1881, - "tÉľ": 1882, - "ĠËĪaËIJp": 1883, - "ĠwËĪÉĽl": 1884, - "ĠmËĪɪl": 1885, - "ĠfyËIJɾ": 1886, - "ËĪÉĽËIJsaËIJ": 1887, - "ĠbËĮiËIJ": 1888, - "ËĪaËIJjaËIJ": 1889, - "ËĪɪp": 1890, - "ĠfÊģ": 1891, - "tsiËĪoËIJne": 1892, - "ĠwËĪuÉľ": 1893, - "Ġvi": 1894, - "ĠwËĪÉijÉľn": 1895, - "ËĪoËIJn": 1896, - "ĠÉĹËĪÉĻɪ": 1897, - "ĠÊĿËĪo": 1898, - "Ġra": 1899, - "mÉĻnt": 1900, - "ËĪaÊĬnd": 1901, - "ĠpÉĽÉ¾": 1902, - "ĠÉĹËĪaËIJÊĬ": 1903, - "oËIJɾ": 1904, - "hËĪo": 1905, - "ĠÉĴn": 1906, - "ĠÊİe": 1907, - "ĠsËĪɪks": 1908, - "É¡n": 1909, - "ĠÉ¡ËĪa": 1910, - "Ġθj": 1911, - "ĠpËĪe": 1912, - "spe": 1913, - "ĠvËĪÉĻ": 1914, - "ĠfËĪɪ": 1915, - "ĠËĮɪntÊĬ": 1916, - "lÉĻn": 1917, - "ĠnËĪiËIJd": 1918, - "ĠsËĮÊĬa": 1919, - "ĠËĪum": 1920, - "ĠdËĪeɪ": 1921, - "ĠËĪÊĮbʰi": 1922, - "ËĪÉijËIJɾ": 1923, - "ĠbËĪiÉĽÉľt": 1924, - "Êİos": 1925, - "ĠtshËĪaiÉľ": 1926, - "ĠËĮɪskËĮaËIJ": 1927, - "ĠaÊĬÉĻ": 1928, - "ĠËĪyæ": 1929, - "Ġdyn": 1930, - "ĠmËĪiËIJn": 1931, - "ĠËĪÊĮcʰËIJ": 1932, - "ĠsÉĽ": 1933, - "ĠnËĪy": 1934, - "ĠnËĮÉĽl": 1935, - "ɡɾ": 1936, - "ÊĥËĪe": 1937, - "ĠÊĤËĮÉĽ": 1938, - "ĠËĪÉĽvɹɪ": 1939, - "ËĪÉĽlp": 1940, - "ĠbËĪak": 1941, - "ĠeËIJ": 1942, - "ĠfËĪaËIJ": 1943, - "ĠkÉĽl": 1944, - "ĠËĪeËIJs": 1945, - "jËĪaËIJd": 1946, - "ĠlËĮi": 1947, - "mbɾe": 1948, - "ktÉĻ": 1949, - "nta": 1950, - "tËĪu": 1951, - "ĠðËĪat": 1952, - "ĠËĪaβ": 1953, - "ÉĻɹi": 1954, - "ĠkwËĮÉĽlla": 1955, - "ĠbÉĻn": 1956, - "rËĮÉĽ": 1957, - "ĠnÉĶ": 1958, - "ĠÉ¡ËĪɪ": 1959, - "ĠËĪap": 1960, - "ɹÉĻ": 1961, - "ËĪaÉľkh": 1962, - "ĠÊIJËĪi": 1963, - "ĠËĪÉijËIJ": 1964, - "ɪɡÉĻn": 1965, - "ĠwËĪai": 
1966, - "ĠpÉĻt": 1967, - "kËIJa": 1968, - "ĠbËĪÉĽËIJ": 1969, - "ËĪeËIJÊĭ": 1970, - "lsÉĻÊĬ": 1971, - "ĠcËĪaËIJhɪËĮeËIJ": 1972, - "ĠkÉĻn": 1973, - "ĠËĮaɪnÉĻm": 1974, - "ËĪuËIJt": 1975, - "ĠhËĪaÊĬ": 1976, - "ĠtËĪanto": 1977, - "ĠhÉIJz": 1978, - "ĠsËĪÊĮɾ": 1979, - "Ġno": 1980, - "ĠtËĪÉĶËIJ": 1981, - "ĠzËĪaɪ": 1982, - "ĠtÉķËĪiÉĽÉľ": 1983, - "ĠkozËĪi": 1984, - "ĠkËĪei": 1985, - "ðËĪÉĶɾ": 1986, - "ËĮÉĶÊģ": 1987, - "ĠtËĪÊĮɾ": 1988, - "ĠÊIJËĪÉĻ": 1989, - "ĠÉķËĪyÉĽÉľ": 1990, - "ĠmËĮÊĬÉŁÊ°eËIJ": 1991, - "mf": 1992, - "ĠvËĪiËIJdÉľ": 1993, - "kËĪa": 1994, - "ĠÉIJÉ¡": 1995, - "kw": 1996, - "ĠÊģÉĽ": 1997, - "xÉĻn": 1998, - "ĠdÊĬ": 1999, - "ĠkËĪÊĮɾnËĮeËIJ": 2000, - "jËĪaËIJdaËIJ": 2001, - "ĠfÉĻ": 2002, - "ĠËĮimp": 2003, - "Ġhɪz": 2004, - "ĠʰÏĩ": 2005, - "ËĪoËIJni": 2006, - "ĠxËĪiÉľ": 2007, - "ËĪeËIJsÊĪ": 2008, - "ÊıbÉľ": 2009, - "ËĮÉĶɾke": 2010, - "ĠÉ¡ËĪÉĻÊĬ": 2011, - "ËĪɪÊĥÉĻn": 2012, - "les": 2013, - "ĠfËĪiËIJ": 2014, - "É¡tÉĻ": 2015, - "ËĪeËIJre": 2016, - "ĠvËĮaËIJ": 2017, - "ĠËĪeɪ": 2018, - "ĠmËĪuÉĻÉľn": 2019, - "ĠÉ¡ËĪÊĬd": 2020, - "ĠmËĮaɪn": 2021, - "zËĪe": 2022, - "ĠlËĪiÉľ": 2023, - "Ġmu": 2024, - "ĠkËĮÉĽl": 2025, - "ĠjËĮÉĻh": 2026, - "ĠfËĮÉĶɾ": 2027, - "fɹ": 2028, - "ĠkËĪaɪn": 2029, - "ĠËĪÉĴlsÉĻÊĬ": 2030, - "θɪÅĭ": 2031, - "ĠthËĪonÉ¡Éľ": 2032, - "tËĪÉij": 2033, - "θjo": 2034, - "mËĪÉĶ": 2035, - "Ġos": 2036, - "ĠsÊĬ": 2037, - "ĠsËĪÊĮmÉĻ": 2038, - "ĠvËĮÉĽn": 2039, - "nËĪo": 2040, - "ĠËĪaktÊĥuËIJ": 2041, - "É£a": 2042, - "Ġtʰi": 2043, - "ĠfËĮi": 2044, - "ĠvËĪÉĽl": 2045, - "ĠtËĪutËIJi": 2046, - "xos": 2047 - }, - "merges": [ - [ - "Ë", - "Ī" - ], - [ - "Ë", - "IJ" - ], - [ - "ËĪ", - "É" - ], - [ - "Ë", - "Į" - ], - [ - "É", - "Ļ" - ], - [ - "ËĪ", - "a" - ], - [ - "ËĪ", - "i" - ], - [ - "Ġ", - "t" - ], - [ - "É", - "ª" - ], - [ - "É", - "¾" - ], - [ - "Ġ", - "É" - ], - [ - "Ġ", - "k" - ], - [ - "É", - "ľ" - ], - [ - "Ġ", - "s" - ], - [ - "ËĪ", - "e" - ], - [ - "É", - "Ľ" - ], - [ - "ËĪ", - "o" - ], - [ - "Ġ", - "l" - ], - [ - "ËĪÉ", - "Ľ" - ], - [ - "Ġ", - "d" - ], - [ - "Ê", - 
"Ĭ" - ], - [ - "ËĪa", - "ËIJ" - ], - [ - "Ġ", - "p" - ], - [ - "Ì", - "ĥ" - ], - [ - "Ġ", - "m" - ], - [ - "ËĪ", - "u" - ], - [ - "Å", - "ĭ" - ], - [ - "Ã", - "°" - ], - [ - "ËĪÉ", - "Ķ" - ], - [ - "Ê", - "Į" - ], - [ - "ËĮ", - "a" - ], - [ - "Ġ", - "h" - ], - [ - "ËĪ", - "ÊĮ" - ], - [ - "Ġ", - "n" - ], - [ - "Ê", - "ģ" - ], - [ - "ËĪÉ", - "ij" - ], - [ - "Ê", - "ĥ" - ], - [ - "e", - "ËIJ" - ], - [ - "Ġ", - "a" - ], - [ - "Ġ", - "b" - ], - [ - "É", - "Ķ" - ], - [ - "ËĪÉ", - "Ļ" - ], - [ - "ÉĻ", - "n" - ], - [ - "Ġ", - "f" - ], - [ - "ËĪÉ", - "ª" - ], - [ - "É", - "¡" - ], - [ - "ËĪe", - "ËIJ" - ], - [ - "Ġ", - "j" - ], - [ - "n", - "t" - ], - [ - "Ġ", - "ð" - ], - [ - "Ġ", - "ËĮ" - ], - [ - "Ġt", - "s" - ], - [ - "ĠÉ", - "¡" - ], - [ - "É", - "ķ" - ], - [ - "ËĪo", - "ËIJ" - ], - [ - "Ê", - "°" - ], - [ - "a", - "ËIJ" - ], - [ - "ËĪ", - "y" - ], - [ - "Ġt", - "Éķ" - ], - [ - "ËĪi", - "ËIJ" - ], - [ - "Ġ", - "Ê" - ], - [ - "Ġ", - "v" - ], - [ - "Ġ", - "w" - ], - [ - "s", - "t" - ], - [ - "É", - "ij" - ], - [ - "n", - "d" - ], - [ - "ËĮ", - "i" - ], - [ - "Ì", - "ª" - ], - [ - "ËĮ", - "e" - ], - [ - "Ġ", - "z" - ], - [ - "ËĪa", - "ɪ" - ], - [ - "ËĪi", - "ÉĽ" - ], - [ - "Î", - "²" - ], - [ - "É", - "¹" - ], - [ - "Ġ", - "ËĮa" - ], - [ - "Î", - "¸" - ], - [ - "Ġh", - "ÉĽ" - ], - [ - "Ê", - "Ī" - ], - [ - "i", - "ËIJ" - ], - [ - "ËĮ", - "o" - ], - [ - "Ġ", - "ɪ" - ], - [ - "Éľ", - "n" - ], - [ - "Ġ", - "x" - ], - [ - "Ġt", - "ÉĻ" - ], - [ - "ËĪu", - "ËIJ" - ], - [ - "ËĮ", - "ÉĻ" - ], - [ - "Ġj", - "ËĪi" - ], - [ - "ËĮ", - "ÉĽ" - ], - [ - "ĠÉ", - "Ľ" - ], - [ - "Ġ", - "ËĪa" - ], - [ - "ËĮa", - "ËIJ" - ], - [ - "Ġl", - "a" - ], - [ - "Ġð", - "e" - ], - [ - "ĠhÉĽ", - "ËIJ" - ], - [ - "Ġ", - "e" - ], - [ - "Ã", - "§" - ], - [ - "ÉĻ", - "l" - ], - [ - "o", - "ËIJ" - ], - [ - "ËĪÉij", - "u" - ], - [ - "Ê", - "Ĵ" - ], - [ - "u", - "ËIJ" - ], - [ - "ĠÉ", - "Ĺ" - ], - [ - "ĠÉ", - "ķ" - ], - [ - "ËĮ", - "eËIJ" - ], - [ - "ĠtÉķ", - "ËĪi" - ], - [ - "o", - "s" - ], - [ - "ËĪÉĶ", - 
"ËIJ" - ], - [ - "a", - "s" - ], - [ - "ËĪ", - "ÊĬ" - ], - [ - "Ġ", - "i" - ], - [ - "ËĪa", - "i" - ], - [ - "É", - "²" - ], - [ - "ɪ", - "n" - ], - [ - "t", - "s" - ], - [ - "Éľ", - "Åĭ" - ], - [ - "ĠÉ", - "Ł" - ], - [ - "Ġ", - "Êĥ" - ], - [ - "ËĪe", - "ɪ" - ], - [ - "ÉĽ", - "ɾ" - ], - [ - "ËĪÉĽ", - "ËIJ" - ], - [ - "ËĪÉĽ", - "ɾ" - ], - [ - "Ġ", - "r" - ], - [ - "t", - "Êĥ" - ], - [ - "ËĮ", - "ÉĶ" - ], - [ - "Ġd", - "ÉĻ" - ], - [ - "t", - "ÉĻ" - ], - [ - "o", - "u" - ], - [ - "ËĪy", - "ÉĻ" - ], - [ - "ĠËĮ", - "i" - ], - [ - "ÉĻ", - "ɾ" - ], - [ - "ËĪÉĻ", - "ÊĬ" - ], - [ - "ËĪÊĮ", - "ɾ" - ], - [ - "ËĪÉ", - "Ĵ" - ], - [ - "Ġt", - "h" - ], - [ - "ËĪo", - "n" - ], - [ - "Ê", - "ĭ" - ], - [ - "ËĪÉij", - "ËIJ" - ], - [ - "ËĪÊĮ", - "h" - ], - [ - "w", - "ËĪa" - ], - [ - "ËĪe", - "i" - ], - [ - "l", - "l" - ], - [ - "ĠÉ", - "IJ" - ], - [ - "Éij", - "ËIJ" - ], - [ - "a", - "n" - ], - [ - "É", - "Ł" - ], - [ - "ĠÊ", - "ĭ" - ], - [ - "Ġk", - "o" - ], - [ - "k", - "h" - ], - [ - "ɪ", - "Åĭ" - ], - [ - "ËĪaËIJ", - "ɪ" - ], - [ - "Ġt", - "Êĥ" - ], - [ - "ËĪaËIJ", - "t" - ], - [ - "ĠËĮ", - "e" - ], - [ - "ĠtÉķ", - "h" - ], - [ - "ËĪu", - "o" - ], - [ - "ËĪon", - "É¡" - ], - [ - "É", - "ĸ" - ], - [ - "a", - "t" - ], - [ - "Ġk", - "e" - ], - [ - "É", - "Ĵ" - ], - [ - "ĠÉķ", - "ËĪi" - ], - [ - "Ã", - "¸" - ], - [ - "ĠÉ", - "ij" - ], - [ - "ËĪeËIJ", - "k" - ], - [ - "Å", - "ĵ" - ], - [ - "r", - "e" - ], - [ - "Ġ", - "ɾ" - ], - [ - "Ġk", - "ÉĶ" - ], - [ - "ËĮ", - "ÊĬ" - ], - [ - "s", - "k" - ], - [ - "Ġ", - "ÊĬ" - ], - [ - "Ġa", - "nd" - ], - [ - "ɪ", - "ç" - ], - [ - "Ġm", - "e" - ], - [ - "ËĪa", - "ɾ" - ], - [ - "Ġ", - "ËĪɪ" - ], - [ - "n", - "a" - ], - [ - "Ġ", - "β" - ], - [ - "Ġl", - "ËĪi" - ], - [ - "j", - "aËIJ" - ], - [ - "l", - "i" - ], - [ - "n", - "o" - ], - [ - "Ġɪ", - "n" - ], - [ - "Ġd", - "ËĮi" - ], - [ - "ĠÉ", - "²" - ], - [ - "t", - "ËIJ" - ], - [ - "ÉĻ", - "m" - ], - [ - "Ġl", - "ÉĻ" - ], - [ - "Ġð", - "ÉĻ" - ], - [ - "ɪ", - "k" - ], - [ - "ËĪÉĽ", - "l" - ], - [ - 
"Éľ", - "t" - ], - [ - "Ġs", - "e" - ], - [ - "e", - "s" - ], - [ - "ËĪo", - "u" - ], - [ - "ËĪa", - "ÊĬ" - ], - [ - "ĠÉ", - "Ķ" - ], - [ - "ɪ", - "t" - ], - [ - "Ġ", - "Åĭ" - ], - [ - "ËĪÉĽ", - "n" - ], - [ - "Ê", - "İ" - ], - [ - "Ġk", - "h" - ], - [ - "ËĪÉĽ", - "nt" - ], - [ - "ËĪaËIJ", - "ɾ" - ], - [ - "Ġk", - "i" - ], - [ - "m", - "p" - ], - [ - "l", - "t" - ], - [ - "É", - "£" - ], - [ - "Ġp", - "a" - ], - [ - "ËĪÉĻ", - "ËIJ" - ], - [ - "ɪ", - "s" - ], - [ - "ĠÉ", - "Ĵ" - ], - [ - "Ġl", - "e" - ], - [ - "ɪ", - "Éľ" - ], - [ - "ËĪÉĽ", - "t" - ], - [ - "Ġd", - "e" - ], - [ - "ĠÉ", - "¹" - ], - [ - "Ġt", - "ËĪoËIJ" - ], - [ - "Ġ", - "Êģ" - ], - [ - "Êĥ", - "ÉĻn" - ], - [ - "ĠÊĬ", - "nt" - ], - [ - "ËĪÉĶ", - "ɾ" - ], - [ - "ËĪa", - "ð" - ], - [ - "Ġa", - "ɪ" - ], - [ - "ĠÊ", - "IJ" - ], - [ - "Ġm", - "ËĪa" - ], - [ - "r", - "a" - ], - [ - "Ġk", - "ËĪɪ" - ], - [ - "k", - "t" - ], - [ - "ËIJ", - "p" - ], - [ - "ĠÊ", - "Ī" - ], - [ - "ËĪaËIJ", - "ÊĬ" - ], - [ - "Ġk", - "ËĪÊĮɾ" - ], - [ - "Ġ", - "ËĪÊĮ" - ], - [ - "ĠÉĴ", - "v" - ], - [ - "Ġe", - "l" - ], - [ - "k", - "s" - ], - [ - "Ġk", - "w" - ], - [ - "ÉĻ", - "t" - ], - [ - "nd", - "o" - ], - [ - "e", - "i" - ], - [ - "ĠËĮa", - "ËIJp" - ], - [ - "s", - "e" - ], - [ - "ÉĻ", - "ɹ" - ], - [ - "ËĪu", - "ei" - ], - [ - "ÉĻ", - "s" - ], - [ - "Ġk", - "ËĮo" - ], - [ - "ĠÊ", - "Ĥ" - ], - [ - "ĠËĮ", - "ÊĬ" - ], - [ - "Ġ", - "c" - ], - [ - "ĠÉĽ", - "n" - ], - [ - "ËĪa", - "nt" - ], - [ - "θ", - "j" - ], - [ - "ËĮo", - "ËIJ" - ], - [ - "Ġ", - "ËĪaËIJ" - ], - [ - "Ġp", - "ɾ" - ], - [ - "s", - "i" - ], - [ - "Ġ", - "ËĪe" - ], - [ - "Ġj", - "uËIJ" - ], - [ - "Ġk", - "ËĮe" - ], - [ - "ËĮ", - "ɪ" - ], - [ - "ÉĶ", - "n" - ], - [ - "Ġs", - "ËĪÊĮ" - ], - [ - "Ġ", - "ËĪu" - ], - [ - "n", - "i" - ], - [ - "Ġs", - "t" - ], - [ - "Ġd", - "iËIJ" - ], - [ - "Ġk", - "eËIJ" - ], - [ - "ĠjËĪi", - "ou" - ], - [ - "ËĪai", - "Éľ" - ], - [ - "Ġd", - "ÊĴ" - ], - [ - "Ġ", - "ËĪÉĶ" - ], - [ - "v", - "a" - ], - [ - "ËIJ", - "ɾ" - ], - [ - "ËĪ", - "ø" 
- ], - [ - "ËĮÉĻ", - "ÊĬ" - ], - [ - "Ġp", - "ËĪu" - ], - [ - "Ġs", - "u" - ], - [ - "Ġm", - "a" - ], - [ - "Ġ", - "ÉĻ" - ], - [ - "d", - "ÊĴ" - ], - [ - "Ġp", - "ʰ" - ], - [ - "l", - "e" - ], - [ - "i", - "n" - ], - [ - "ĠtÉķh", - "ËĪi" - ], - [ - "Ġw", - "ËĪo" - ], - [ - "r", - "o" - ], - [ - "ËĮ", - "y" - ], - [ - "ɾ", - "a" - ], - [ - "Ġs", - "ËĪi" - ], - [ - "ð", - "ÉĻ" - ], - [ - "Ġs", - "eËIJ" - ], - [ - "l", - "a" - ], - [ - "ĠÊ", - "Ĵ" - ], - [ - "m", - "b" - ], - [ - "Ġh", - "ËĪoËIJ" - ], - [ - "Ġb", - "ʰ" - ], - [ - "ĠÉĽ", - "ɾ" - ], - [ - "Ġð", - "at" - ], - [ - "s", - "p" - ], - [ - "ÉĶ", - "ɾ" - ], - [ - "e", - "n" - ], - [ - "Ġs", - "ÉĻ" - ], - [ - "ËĪÉĶ", - "Éľ" - ], - [ - "Ġl", - "ËĮa" - ], - [ - "ĠËĮ", - "ÉĽ" - ], - [ - "Ġ", - "ËĪy" - ], - [ - "É¡", - "aËIJ" - ], - [ - "Ġd", - "ÉĽÉ¾" - ], - [ - "ËĪÉĽ", - "Êģ" - ], - [ - "Éľ", - "kh" - ], - [ - "ËĪi", - "ÉĻ" - ], - [ - "ËĪa", - "n" - ], - [ - "Ġm", - "ËĪo" - ], - [ - "ËĪa", - "β" - ], - [ - "Ġa", - "l" - ], - [ - "Ġ", - "ËĪeËIJ" - ], - [ - "Ġ", - "θ" - ], - [ - "Ġn", - "ËĪi" - ], - [ - "p", - "ʰ" - ], - [ - "ll", - "a" - ], - [ - "Ġp", - "l" - ], - [ - "ËĪ", - "Åĵ" - ], - [ - "j", - "ËĪÉiju" - ], - [ - "Ġa", - "v" - ], - [ - "Ġm", - "ËĪi" - ], - [ - "Ġf", - "ËĪa" - ], - [ - "ËĪÉ", - "ľ" - ], - [ - "m", - "e" - ], - [ - "ËĮÉĻ", - "h" - ], - [ - "ËĪu", - "ÉĻ" - ], - [ - "i", - "t" - ], - [ - "j", - "ËĪe" - ], - [ - "Ġ", - "o" - ], - [ - "ËĪÉľ", - "ËIJ" - ], - [ - "ĠtÉķËĪi", - "ou" - ], - [ - "ÉĶ", - "ËIJ" - ], - [ - "Ġn", - "ÉĻ" - ], - [ - "ËĪÉĻ", - "Éľn" - ], - [ - "Ġm", - "ÉĻ" - ], - [ - "Ġd", - "eËIJ" - ], - [ - "m", - "o" - ], - [ - "s", - "a" - ], - [ - "j", - "ËĪÉĶ" - ], - [ - "ËĪa", - "l" - ], - [ - "ĠtÉķ", - "ËĪiÉĽ" - ], - [ - "ĠÉ¡", - "ÉĻ" - ], - [ - "ð", - "a" - ], - [ - "Ġɪ", - "z" - ], - [ - "Ġs", - "a" - ], - [ - "r", - "i" - ], - [ - "ĠËĮi", - "l" - ], - [ - "ËĮ", - "u" - ], - [ - "Ġk", - "aËIJ" - ], - [ - "ĠÉĻ", - "ËIJ" - ], - [ - "ĠÉ", - "ĸ" - ], - [ - "Ġk", - "a" - ], - [ - "ËĪÊĮh", 
- "i" - ], - [ - "Ġj", - "eËIJ" - ], - [ - "Ġt", - "ʰ" - ], - [ - "n", - "e" - ], - [ - "k", - "ËIJ" - ], - [ - "Ġts", - "ËĪai" - ], - [ - "Ġ", - "ËĪeËIJk" - ], - [ - "n", - "k" - ], - [ - "t", - "i" - ], - [ - "ËĪa", - "Éľn" - ], - [ - "Ġk", - "ËIJ" - ], - [ - "É¡", - "ÉĻn" - ], - [ - "ËĪi", - "a" - ], - [ - "ĠÉĶ", - "ËIJɾ" - ], - [ - "Ê", - "ı" - ], - [ - "ĠËĮ", - "ÊĮ" - ], - [ - "Ġz", - "ËĪaËIJ" - ], - [ - "Ġl", - "os" - ], - [ - "ÉĽ", - "s" - ], - [ - "ËĪÉĶ", - "n" - ], - [ - "ÉĽ", - "nt" - ], - [ - "ÉĽ", - "n" - ], - [ - "ĠÉŁ", - "ËĪoËIJ" - ], - [ - "ç", - "t" - ], - [ - "Ġd", - "as" - ], - [ - "Ġx", - "ËĮo" - ], - [ - "ËĪu", - "Éľ" - ], - [ - "ËĪa", - "s" - ], - [ - "Ġb", - "ËĪÊĮ" - ], - [ - "ËĪiÉĽ", - "Éľn" - ], - [ - "É", - "IJ" - ], - [ - "Ġts", - "uËIJ" - ], - [ - "Ġp", - "ËĮÉĽ" - ], - [ - "Ġn", - "ËĪÉĶ" - ], - [ - "ÊĬ", - "t" - ], - [ - "m", - "a" - ], - [ - "Ġn", - "ËĪo" - ], - [ - "Ġl", - "ËĪɪ" - ], - [ - "ËĪÉĽ", - "s" - ], - [ - "ɪ", - "l" - ], - [ - "ĠÉķ", - "ËĪiÉĽ" - ], - [ - "Ġ", - "ËĪÊĬ" - ], - [ - "ÉĴ", - "t" - ], - [ - "t", - "o" - ], - [ - "Ġ", - "ËĪo" - ], - [ - "ËĮo", - "n" - ], - [ - "Ġk", - "wËĪa" - ], - [ - "Ġɪ", - "t" - ], - [ - "Ġh", - "oËIJ" - ], - [ - "ËĪiËIJ", - "k" - ], - [ - "ĠËĮaËIJp", - "k" - ], - [ - "ËĪaɪ", - "n" - ], - [ - "Ã", - "¦" - ], - [ - "ÉĻn", - "t" - ], - [ - "t", - "a" - ], - [ - "l", - "o" - ], - [ - "Ġn", - "ËĪÉij" - ], - [ - "Ġl", - "ËĪa" - ], - [ - "ËĪi", - "Éľ" - ], - [ - "Ġw", - "ËĪei" - ], - [ - "ÉĽ", - "Êģ" - ], - [ - "Ġt", - "ËĪa" - ], - [ - "Ġɾ", - "ËĮÉĻh" - ], - [ - "ĠÉķËĪi", - "Éij" - ], - [ - "ËĮi", - "ËIJ" - ], - [ - "ËĮÉĽ", - "l" - ], - [ - "ĠtÉĻ", - "Éľ" - ], - [ - "Ġk", - "ËĪuo" - ], - [ - "Ġt", - "ËĪu" - ], - [ - "j", - "ËĪÉĽ" - ], - [ - "ĠËĮi", - "n" - ], - [ - "ɾ", - "e" - ], - [ - "Ġk", - "oËIJ" - ], - [ - "Ġk", - "ËĪa" - ], - [ - "ɾ", - "i" - ], - [ - "ĠtÉķËĪi", - "Éij" - ], - [ - "l", - "ÉĻ" - ], - [ - "Ġk", - "ÉĻ" - ], - [ - "Ġt", - "ËĪi" - ], - [ - "ĠÅĭ", - "ËĪyÉĻ" - ], - [ - "Ġts", - "h" - ], 
- [ - "e", - "r" - ], - [ - "a", - "v" - ], - [ - "ĠkÉĶ", - "n" - ], - [ - "ËĪÉĻ", - "ÉľÅĭ" - ], - [ - "ð", - "o" - ], - [ - "ËĪaËIJ", - "n" - ], - [ - "Ġbʰ", - "ËĪi" - ], - [ - "ĠkËIJ", - "jaËIJ" - ], - [ - "ÉĻ", - "z" - ], - [ - "Ġp", - "Êģ" - ], - [ - "Ġd", - "ËĪɪ" - ], - [ - "Ġz", - "iËIJ" - ], - [ - "É¡", - "eËIJ" - ], - [ - "Ġt", - "ËĪÉĻ" - ], - [ - "ɪ", - "z" - ], - [ - "Ġn", - "ËĮon" - ], - [ - "t", - "aËIJ" - ], - [ - "b", - "l" - ], - [ - "t", - "e" - ], - [ - "n", - "ËĮeËIJ" - ], - [ - "ËĪɪ", - "l" - ], - [ - "s", - "o" - ], - [ - "k", - "o" - ], - [ - "u", - "Êģ" - ], - [ - "ĠÉ", - "£" - ], - [ - "Ġpa", - "Êģ" - ], - [ - "Ġ", - "ËĪÉĽ" - ], - [ - "j", - "ËĪuËIJ" - ], - [ - "ËĮ", - "ÊĮ" - ], - [ - "y", - "n" - ], - [ - "ËĪiËIJ", - "n" - ], - [ - "Ġl", - "ËĪaɪ" - ], - [ - "ËĪɪ", - "Åĭ" - ], - [ - "ĠtÉķh", - "ËĪy" - ], - [ - "Ġn", - "ËĪÊĮhi" - ], - [ - "Ġd", - "ËĮe" - ], - [ - "Ġj", - "ËĪÉiju" - ], - [ - "Ġt", - "ËĪÉiju" - ], - [ - "Ġh", - "ËĪo" - ], - [ - "ɪ", - "d" - ], - [ - "Ġth", - "ËĪÉij" - ], - [ - "m", - "ËĪe" - ], - [ - "Ġ", - "ËĪÉĻ" - ], - [ - "j", - "a" - ], - [ - "Ġp", - "h" - ], - [ - "ÉĽ", - "t" - ], - [ - "Ġk", - "ËĪÊĮ" - ], - [ - "t", - "ÉĻn" - ], - [ - "m", - "ËĪÉij" - ], - [ - "w", - "ËĪe" - ], - [ - "ĠËĮa", - "ɪn" - ], - [ - "Ġð", - "ɪs" - ], - [ - "É¡", - "ÉĻ" - ], - [ - "Ġn", - "ËĪaËIJ" - ], - [ - "Ġb", - "ËĪaËIJ" - ], - [ - "Ġa", - "θ" - ], - [ - "Ġm", - "ËĮa" - ], - [ - "ËĪÊĮh", - "a" - ], - [ - "Ġd", - "ËĮa" - ], - [ - "ËĪ", - "Êı" - ], - [ - "Ġɲ", - "ËĮy" - ], - [ - "Ġp", - "ËĪa" - ], - [ - "ËĪað", - "o" - ], - [ - "d", - "i" - ], - [ - "b", - "Éľ" - ], - [ - "É", - "³" - ], - [ - "Ġw", - "iËIJ" - ], - [ - "Ġn", - "ËĪɪ" - ], - [ - "ĠÉ¡", - "ËĪÉĶÉľ" - ], - [ - "tËIJ", - "o" - ], - [ - "ËĮÉĻ", - "m" - ], - [ - "ËĪaËIJ", - "r" - ], - [ - "Ġm", - "ÉĽ" - ], - [ - "ËĪeËIJ", - "É¡aËIJ" - ], - [ - "Ġs", - "ËĮi" - ], - [ - "Ġl", - "ËĮaËIJ" - ], - [ - "n", - "ËĮaËIJ" - ], - [ - "Ġs", - "p" - ], - [ - "t", - "Êģ" - ], - [ - "ĠÊ", - "İ" - ], - 
[ - "ËĮ", - "ÉijËIJ" - ], - [ - "Ġk", - "l" - ], - [ - "k", - "ʰ" - ], - [ - "i", - "l" - ], - [ - "ĠÊĥ", - "t" - ], - [ - "ĠËĮÊĬ", - "n" - ], - [ - "a", - "l" - ], - [ - "Ġs", - "ËĪÉĽ" - ], - [ - "Ġm", - "ËĪaËIJ" - ], - [ - "Ġ", - "Åĵ" - ], - [ - "ĠÉ¡", - "ËĪÊĮ" - ], - [ - "ĠpËĮÉĽ", - "r" - ], - [ - "ɾ", - "ËĪa" - ], - [ - "ËIJ", - "ÊĪ" - ], - [ - "ËĪaβ", - "a" - ], - [ - "Ġw", - "ËĪÉĴ" - ], - [ - "Ġx", - "ËĪuei" - ], - [ - "Ġkh", - "ËĪo" - ], - [ - "Ġla", - "s" - ], - [ - "ĠÉĹ", - "ËĪo" - ], - [ - "Ġf", - "ÉĽÉ¾" - ], - [ - "Ġj", - "ËĪiÉĽ" - ], - [ - "Ġt", - "ËĪe" - ], - [ - "Ġk", - "ËĮÉĶ" - ], - [ - "ĠdeËIJ", - "n" - ], - [ - "Ġm", - "o" - ], - [ - "Ġp", - "ËĪi" - ], - [ - "Ġt", - "ËĪÉij" - ], - [ - "ËĪÉĽ", - "st" - ], - [ - "w", - "ËĪÉij" - ], - [ - "ËĪaɪ", - "t" - ], - [ - "ÉĻ", - "ÊĬ" - ], - [ - "Ġ", - "ËĪi" - ], - [ - "ɪ", - "j" - ], - [ - "a", - "ɪ" - ], - [ - "ËĪaËIJ", - "Éľ" - ], - [ - "ĠËĪɪ", - "s" - ], - [ - "Ġp", - "ÉĶɾ" - ], - [ - "æ", - "Éľn" - ], - [ - "k", - "a" - ], - [ - "Åĭ", - "É¡" - ], - [ - "b", - "ÉĻn" - ], - [ - "ÊĬ", - "f" - ], - [ - "Ġp", - "ɹ" - ], - [ - "Ġl", - "ËĮe" - ], - [ - "ËĪiËIJ", - "d" - ], - [ - "ËĪaËIJ", - "re" - ], - [ - "Ġm", - "ËĪÊĮ" - ], - [ - "ÉĻ", - "r" - ], - [ - "Ġd", - "Éij" - ], - [ - "ËĪaËIJt", - "o" - ], - [ - "Ġp", - "ËĪeËIJ" - ], - [ - "Ġd", - "ËĪoËIJ" - ], - [ - "Ġs", - "ËĮÊĬ" - ], - [ - "Ġh", - "ËĪi" - ], - [ - "Ġs", - "ËĪa" - ], - [ - "ËĪeËIJ", - "n" - ], - [ - "d", - "ÉĻ" - ], - [ - "Ġp", - "j" - ], - [ - "ËĪÅĵ", - "Êģ" - ], - [ - "l", - "ɪç" - ], - [ - "ÉĴ", - "n" - ], - [ - "ĠËĪÉĻ", - "r" - ], - [ - "t", - "ËĪe" - ], - [ - "Ġi", - "l" - ], - [ - "ËĪaËIJ", - "l" - ], - [ - "Ġs", - "ËĮÉĻÊĬ" - ], - [ - "s", - "ÊĪ" - ], - [ - "Ġd", - "ËĪuËIJ" - ], - [ - "h", - "ËĪÉij" - ], - [ - "Ġx", - "ËĪou" - ], - [ - "Ġl", - "ËĪaiÉľ" - ], - [ - "w", - "ËĪo" - ], - [ - "ËĪÉĽnt", - "e" - ], - [ - "Ġs", - "y" - ], - [ - "Ġz", - "ɪç" - ], - [ - "ĠÉ¡", - "ËĪu" - ], - [ - "ĠÉķ", - "ËĪy" - ], - [ - "ËĪÉĶËIJ", - "l" - ], - [ - "ÉĶ", 
- "l" - ], - [ - "Ġt", - "ËĪo" - ], - [ - "ĠÊĭ", - "oËIJ" - ], - [ - "Ġ", - "iËIJ" - ], - [ - "wËĪa", - "ða" - ], - [ - "ËĪa", - "ndo" - ], - [ - "Ġaθ", - "ÉĽnt" - ], - [ - "Ġaθɼnt", - "wËĪaða" - ], - [ - "Ġt", - "ËĪiÉĽ" - ], - [ - "ËĪei", - "Éľ" - ], - [ - "Ġp", - "ËĮa" - ], - [ - "Ġn", - "ËĪaɪ" - ], - [ - "w", - "a" - ], - [ - "Ġf", - "r" - ], - [ - "ĠÊIJ", - "ËĪÉĻÉľn" - ], - [ - "ËĪu", - "a" - ], - [ - "m", - "i" - ], - [ - "Ġm", - "ËĪÉĽ" - ], - [ - "ËĪeËIJk", - "ʰ" - ], - [ - "c", - "ʰ" - ], - [ - "Ġw", - "ËĪÉij" - ], - [ - "st", - "a" - ], - [ - "Ġt", - "u" - ], - [ - "Ġs", - "k" - ], - [ - "ËĪÉĶ", - "l" - ], - [ - "ËĪeËIJ", - "ÊĪ" - ], - [ - "Ġl", - "ËĪaËIJɪ" - ], - [ - "Ġl", - "ËĪaËIJ" - ], - [ - "ËĪÉĽËIJ", - "s" - ], - [ - "ËĪÉĽÉ¾", - "a" - ], - [ - "ËĪÉĻ", - "Éľt" - ], - [ - "Ġ", - "yn" - ], - [ - "d", - "ÉĻn" - ], - [ - "Ġd", - "i" - ], - [ - "ËĪiËIJ", - "s" - ], - [ - "Ġðe", - "l" - ], - [ - "ËĪÊĮ", - "r" - ], - [ - "Ġh", - "ËĪaËIJ" - ], - [ - "Ġb", - "ÉĻ" - ], - [ - "Ġj", - "ËĪuËIJ" - ], - [ - "ll", - "e" - ], - [ - "st", - "o" - ], - [ - "ËĪɪ", - "t" - ], - [ - "ËĪoËIJ", - "ɾ" - ], - [ - "b", - "ʰ" - ], - [ - "m", - "ÉĻn" - ], - [ - "ËĮu", - "ÉĻ" - ], - [ - "ËĮÉĻ", - "ɾ" - ], - [ - "ËĪÊĮ", - "n" - ], - [ - "ĠlËĪaɪ", - "k" - ], - [ - "Ġb", - "ËĪa" - ], - [ - "ɪ", - "ð" - ], - [ - "Ġl", - "o" - ], - [ - "z", - "i" - ], - [ - "ËĪÊĮ", - "st" - ], - [ - "m", - "ËĪi" - ], - [ - "ÉĶ", - "Êģ" - ], - [ - "ĠnËĪɪ", - "çt" - ], - [ - "Ġt", - "ɾ" - ], - [ - "Ġd", - "ËĪeËIJkʰ" - ], - [ - "Ġs", - "ËĮe" - ], - [ - "Ġn", - "ËĪÉĻÊĬ" - ], - [ - "Ġ", - "u" - ], - [ - "Ġs", - "i" - ], - [ - "Ġɪ", - "ç" - ], - [ - "Ġp", - "r" - ], - [ - "ĠtÉķ", - "ËĪy" - ], - [ - "Ġm", - "ËĪu" - ], - [ - "z", - "a" - ], - [ - "Ġt", - "Êģ" - ], - [ - "Ġw", - "ɪð" - ], - [ - "t", - "ËĪÉĽ" - ], - [ - "Ġp", - "ËĪÊĮɾ" - ], - [ - "Ġk", - "ËĪÉĶ" - ], - [ - "ËĪoËIJ", - "r" - ], - [ - "Ġh", - "ËĮa" - ], - [ - "Ġk", - "ËĪonÉ¡" - ], - [ - "Ġp", - "uÊģ" - ], - [ - "Ġd", - "y" - ], - [ - "ËĪɪ", - "n" - 
], - [ - "nt", - "e" - ], - [ - "Ġk", - "ËĮa" - ], - [ - "ËĪÉĻ", - "ɪ" - ], - [ - "Ġm", - "i" - ], - [ - "ĠÉ¡", - "ËĮuÉĻ" - ], - [ - "ĠÊ", - "²" - ], - [ - "Ġf", - "ËĪÉij" - ], - [ - "Ġv", - "ÉijËIJ" - ], - [ - "ĠËĮa", - "ÊĬ" - ], - [ - "ËĮ", - "uËIJ" - ], - [ - "ĠËĪu", - "n" - ], - [ - "Ġj", - "ËĪÊĮha" - ], - [ - "j", - "uËIJ" - ], - [ - "Ġm", - "ɪt" - ], - [ - "Ġl", - "ËĪÉĽ" - ], - [ - "ËĪeËIJ", - "Êĥ" - ], - [ - "Ġf", - "ÉĶËIJ" - ], - [ - "m", - "ÉĻ" - ], - [ - "ɾ", - "t" - ], - [ - "ĠkËĮo", - "n" - ], - [ - "Ġl", - "ËĪÉĶ" - ], - [ - "Ġx", - "ËĪÉiju" - ], - [ - "p", - "l" - ], - [ - "Ġd", - "ËĪi" - ], - [ - "Ġl", - "ËĪoËIJ" - ], - [ - "s", - "ÉĻ" - ], - [ - "ËĪaËIJ", - "va" - ], - [ - "Ġl", - "ËĪu" - ], - [ - "ĠÉ¡", - "ËĮÉĻÊĬ" - ], - [ - "Ġh", - "av" - ], - [ - "ĠËĮaËIJpk", - "ËĮoËIJ" - ], - [ - "ɾ", - "ËĪi" - ], - [ - "Ġf", - "ËĪÉĻ" - ], - [ - "Ġh", - "ËĮÉĻm" - ], - [ - "ËĪonÉ¡", - "Éľ" - ], - [ - "j", - "o" - ], - [ - "Ġs", - "ÉĶ" - ], - [ - "ËĪaËIJ", - "d" - ], - [ - "w", - "ËĪiÉĻ" - ], - [ - "ËĪa", - "nd" - ], - [ - "ËĮa", - "ɪn" - ], - [ - "t", - "ɾ" - ], - [ - "ĠËĮ", - "ɪ" - ], - [ - "ĠËĪu", - "na" - ], - [ - "Ġx", - "wËĪÉij" - ], - [ - "Ġj", - "ÉĶËIJ" - ], - [ - "Êģ", - "ËĪi" - ], - [ - "ĠkËĪuo", - "Éľ" - ], - [ - "Ġa", - "β" - ], - [ - "ĠÉ¡", - "ËĪaËIJ" - ], - [ - "an", - "o" - ], - [ - "t", - "ÉĻl" - ], - [ - "Ġr", - "ËĮe" - ], - [ - "ËĮÊĮ", - "t" - ], - [ - "ĠjËĪi", - "Éij" - ], - [ - "ĠɾËĮÉĻh", - "aËIJ" - ], - [ - "Ġm", - "ËĪe" - ], - [ - "ĠËĪy", - "Ã¦Éľn" - ], - [ - "Ġf", - "ËĪu" - ], - [ - "Ġb", - "l" - ], - [ - "n", - "ËĪi" - ], - [ - "s", - "ÉĻn" - ], - [ - "Ġa", - "ɪn" - ], - [ - "ËĪi", - "ÊĬ" - ], - [ - "Ġðe", - "ɪ" - ], - [ - "Ġɪ", - "ts" - ], - [ - "Ġ", - "(" - ], - [ - "ËĪy", - "ËIJ" - ], - [ - "ÉĻ", - "d" - ], - [ - "ĠËĮ", - "o" - ], - [ - "ĠÉĽ", - "s" - ], - [ - "Ġv", - "iËIJ" - ], - [ - "ËIJ", - "É¡eËIJ" - ], - [ - "k", - "ËĪe" - ], - [ - "ĠËĪa", - "l" - ], - [ - "ÉĽ", - "l" - ], - [ - "Ġ", - "ÊĮ" - ], - [ - "ËIJ", - "o" - ], - [ - "Ġk", - 
"ËĪo" - ], - [ - "ĠÊĪ", - "ËĪuËIJ" - ], - [ - "Ġs", - "ËĪɪ" - ], - [ - "ËĪeËIJ", - "ɾ" - ], - [ - "Éľ", - "m" - ], - [ - "ËĮ", - "ÉĻn" - ], - [ - "ËĪaËIJ", - "i" - ], - [ - "ËĪoËIJ", - "l" - ], - [ - "ɪ", - "ËĮeËIJ" - ], - [ - "Ġʲ", - "ËĪy" - ], - [ - "Ġk", - "ËĪÉĶËIJ" - ], - [ - "s", - "ËĪi" - ], - [ - "Ġl", - "ËĪe" - ], - [ - "ËĮ", - "ÉĴt" - ], - [ - "ËĪiËIJ", - "p" - ], - [ - "a", - "Êģ" - ], - [ - "Ġθ", - "ËĪɪÅĭ" - ], - [ - "ËĪÉĻËIJ", - "ɪ" - ], - [ - "ËĪÊĮ", - "l" - ], - [ - "ĠhËĪoËIJ", - "taËIJ" - ], - [ - "ËĪo", - "ɪ" - ], - [ - "nt", - "o" - ], - [ - "z", - "h" - ], - [ - "ĠdeËIJ", - "m" - ], - [ - "ĠkÉĶ", - "m" - ], - [ - "ʰ", - "ËĪiËIJk" - ], - [ - "ĠdÊĴ", - "ËĪÊĮst" - ], - [ - "p", - "ɾ" - ], - [ - "Ġl", - "y" - ], - [ - "h", - "ËĪu" - ], - [ - "ËĪÉĶ", - "ø" - ], - [ - "ËĪaËIJ", - "s" - ], - [ - "ĠËĪa", - "n" - ], - [ - "Ġ", - "ËĪÉĴ" - ], - [ - "Ġk", - "an" - ], - [ - "Ġts", - "ËĪuo" - ], - [ - "ËĪeËIJ", - "va" - ], - [ - "ĠÉ¡", - "ɾ" - ], - [ - "Ġp", - "o" - ], - [ - "ĠtÊĥ", - "ËĪÉĶ" - ], - [ - "Êİ", - "a" - ], - [ - "Ġm", - "ËĮi" - ], - [ - "Êĥ", - "t" - ], - [ - "t", - "ËĪi" - ], - [ - "Ġh", - "ËĪÊĮ" - ], - [ - "tÊĥ", - "e" - ], - [ - "Ġf", - "ÉĶn" - ], - [ - "v", - "e" - ], - [ - "Ġn", - "ËĮe" - ], - [ - "ËĪÉĶ", - "Êģ" - ], - [ - "i", - "z" - ], - [ - "Ġs", - "ËĪuo" - ], - [ - "ËĪÉĽËIJ", - "r" - ], - [ - "wËĪa", - "Êģ" - ], - [ - "ËĪað", - "a" - ], - [ - "Åĭ", - "k" - ], - [ - "p", - "o" - ], - [ - "Ġk", - "ËĪi" - ], - [ - "ËĪa", - "d" - ], - [ - "Ġv", - "ËĪi" - ], - [ - "t", - "Éķ" - ], - [ - "Ġk", - "ËĪÉĻ" - ], - [ - "Ġw", - "ËĪu" - ], - [ - "ÉĴ", - "z" - ], - [ - "ĠvÉijËIJ", - "ɾ" - ], - [ - "Êģ", - "ËĪÉĽ" - ], - [ - "Ġk", - "ËĪaËIJ" - ], - [ - "k", - "e" - ], - [ - "n", - "ÉĻ" - ], - [ - "ËĪÊĮ", - "b" - ], - [ - "ËĪuËIJ", - "ɾ" - ], - [ - "ËĮÉĻ", - "ËIJ" - ], - [ - "ĠÊĪ", - "ʰËĪiËIJk" - ], - [ - "Ġk", - "ËĪu" - ], - [ - "Ġb", - "ËĮÊĮt" - ], - [ - "Ġa", - "t" - ], - [ - "Ġf", - "ɹ" - ], - [ - "ËĪa", - "x" - ], - [ - "Ġz", - "oËIJ" - ], - [ - "Ġt", 
- "ËĪaËIJ" - ], - [ - "Ġð", - "ËĮe" - ], - [ - "n", - "eËIJ" - ], - [ - "ĠÉij", - "ËIJ" - ], - [ - "Ġa", - "ÊĬf" - ], - [ - "a", - "m" - ], - [ - "ÊĬ", - "Åĭ" - ], - [ - "ĠÉĶ", - "ËIJ" - ], - [ - "ĠÉķËĪi", - "ÉľÅĭ" - ], - [ - "Ġ", - "ËĪÉĶËIJl" - ], - [ - "ɪ", - "m" - ], - [ - "j", - "ËĪo" - ], - [ - "ËĪiËIJ", - "ÉŁ" - ], - [ - "Ġkw", - "ËĮÉĽ" - ], - [ - "ĠmËĪa", - "s" - ], - [ - "ÉĻ", - "h" - ], - [ - "ĠËĪa", - "ÊĬ" - ], - [ - "ËĪÉĶ", - "ɪ" - ], - [ - "É¡", - "ÉĻɾ" - ], - [ - "r", - "ÉĻn" - ], - [ - "ËĪɪ", - "k" - ], - [ - "s", - "se" - ], - [ - "Ġp", - "ËĪÉij" - ], - [ - "ĠÉĹ", - "ËĮe" - ], - [ - "ĠÉĹ", - "ËĪi" - ], - [ - "Ġa", - "z" - ], - [ - "ĠÉ¡ËĪÊĮ", - "jaËIJ" - ], - [ - "z", - "e" - ], - [ - "ĠÉĹ", - "ËĮaËIJ" - ], - [ - "Ġf", - "ËĪi" - ], - [ - "ĠËĮ", - "ÉĴn" - ], - [ - "Ġx", - "ËĪo" - ], - [ - "ĠËĮÊĬ", - "na" - ], - [ - "Ġtʰ", - "aËIJ" - ], - [ - "Ġs", - "Éij" - ], - [ - "ËĪeɪ", - "ÊĥÉĻn" - ], - [ - "ĠtÉķËĪi", - "Éľ" - ], - [ - "ĠÉŁ", - "aËIJ" - ], - [ - "p", - "ËIJ" - ], - [ - "Ġpl", - "y" - ], - [ - "θ", - "ËĪi" - ], - [ - "ËIJ", - "Éĸ" - ], - [ - "Ġt", - "ËĪuei" - ], - [ - "Ġl", - "ËĪÉĻ" - ], - [ - "Ġd", - "ÉijËIJ" - ], - [ - "f", - "t" - ], - [ - "ËĪa", - "m" - ], - [ - "ĠsËĪÊĮ", - "kt" - ], - [ - "Ġt", - "ËĪou" - ], - [ - "Ġp", - "ËĪiÉĽ" - ], - [ - "ĠËĪa", - "i" - ], - [ - "ĠwËĪÉĴ", - "n" - ], - [ - "Ġz", - "ËĮaɪn" - ], - [ - "Ġe", - "st" - ], - [ - "Ġm", - "ÉĶ" - ], - [ - "ĠtÉķ", - "jËĪÉiju" - ], - [ - "Éľ", - "p" - ], - [ - "ËĪÊĮ", - "z" - ], - [ - "b", - "i" - ], - [ - "ËĪÉĽËIJs", - "eËIJ" - ], - [ - "Ġl", - "ËĪy" - ], - [ - "Ġm", - "ËĮe" - ], - [ - "Ġd", - "ËĮÉĽl" - ], - [ - "ËĪiËIJ", - "l" - ], - [ - "ĠkËĮo", - "mo" - ], - [ - "Ġh", - "ËĪaÉľn" - ], - [ - "ËĪoËIJ", - "ne" - ], - [ - "ĠkËĪÊĮɾ", - "t" - ], - [ - "Ġsy", - "Êģ" - ], - [ - "ËĮÉĶ", - "ɾ" - ], - [ - "Ġɪ", - "f" - ], - [ - "u", - "v" - ], - [ - "z", - "ÉĻn" - ], - [ - "o", - "l" - ], - [ - "Ï", - "ĩ" - ], - [ - "i", - "m" - ], - [ - "Ġm", - "ËĪiÉĽ" - ], - [ - "Ġð", - "ɪ" - ], - [ - "Ġv", - 
"ËĪÉĽ" - ], - [ - "ÊĬ", - "d" - ], - [ - "Ġt", - "r" - ], - [ - "ËĪeËIJ", - "s" - ], - [ - "ð", - "e" - ], - [ - "d", - "e" - ], - [ - "ʰ", - "Ïĩ" - ], - [ - "ÉŁ", - "ʰ" - ], - [ - "ËĮÉĻËIJ", - "ÉªÉľ" - ], - [ - "b", - "ËIJ" - ], - [ - "ËĪÊĬ", - "k" - ], - [ - "ĠnËĪÉĶ", - "ÉªÉľ" - ], - [ - "ĠËĮ", - "iËIJ" - ], - [ - "ËĪÉijËIJ", - "t" - ], - [ - "ËĪiËIJ", - "ɾ" - ], - [ - "Ġt", - "ɹ" - ], - [ - "ɾ", - "ÉĶ" - ], - [ - "Ġw", - "ÉĴz" - ], - [ - "Ġv", - "u" - ], - [ - "b", - "ÉĻl" - ], - [ - "b", - "ÉĻ" - ], - [ - "ɹ", - "i" - ], - [ - "nt", - "s" - ], - [ - "Ġs", - "ËĪaËIJ" - ], - [ - "d", - "ʰ" - ], - [ - "Ġt", - "ÊĬ" - ], - [ - "ĠÊİ", - "ËĮi" - ], - [ - "β", - "a" - ], - [ - "h", - "ËĪÉĻÉľÅĭ" - ], - [ - "Ġs", - "ËĪiËIJ" - ], - [ - "ĠpËĮa", - "ɾa" - ], - [ - "ËĪÉĽÉ¾", - "ÉĶ" - ], - [ - "ËĪɪ", - "s" - ], - [ - "É£", - "o" - ], - [ - "ĠËĮa", - "l" - ], - [ - "o", - "r" - ], - [ - "Ġb", - "ËĪÊĮh" - ], - [ - "Ġk", - "ËĪoËIJ" - ], - [ - "Ġt", - "ËĪÉĽ" - ], - [ - "Ġp", - "ËĪo" - ], - [ - "ĠÊĴ", - "ÉĻ" - ], - [ - "p", - "Êģ" - ], - [ - "Ġ", - "ËĪaɪ" - ], - [ - "hËĪÉij", - "ÉľÅĭ" - ], - [ - "ÉĻl", - "i" - ], - [ - "ËĪeɪ", - "t" - ], - [ - "ĠjËĪiou", - "Éľ" - ], - [ - "Ġd", - "ËĪÉĻ" - ], - [ - "Ġm", - "ËĪÉĶËIJ" - ], - [ - "l", - "ËĪi" - ], - [ - "ËĮy", - "ÉĻ" - ], - [ - "ĠlËĪoËIJ", - "É¡" - ], - [ - "Ġn", - "ËĪÊĮ" - ], - [ - "Ġh", - "ËĪÊĬ" - ], - [ - "Ġn", - "ËĪÉĻÉľÅĭ" - ], - [ - "ĠÊģ", - "ÉĻ" - ], - [ - "z", - "ËĪi" - ], - [ - "Ġt", - "ËĪuËIJ" - ], - [ - "ĠkËĮo", - "me" - ], - [ - "Ġl", - "ËĪeËIJ" - ], - [ - "ËĪaËIJt", - "aËIJ" - ], - [ - "Ġa", - "n" - ], - [ - "ĠËĪy", - "u" - ], - [ - "ĠËĮÊĮ", - "É¡ÉĻɾ" - ], - [ - "ĠËĪɪ", - "n" - ], - [ - "ĠhËĪo", - "ÉĻ" - ], - [ - "v", - "ÉĻ" - ], - [ - "ËĪø", - "ËIJ" - ], - [ - "θj", - "a" - ], - [ - "ËĪuÉĻ", - "Éľn" - ], - [ - "Ġk", - "ÉĻɾ" - ], - [ - "ËĪa", - "t" - ], - [ - "j", - "ËĪø" - ], - [ - "ËĪÉĽt", - "Êģ" - ], - [ - "Ġp", - "ËĪÉiju" - ], - [ - "st", - "ÉĻ" - ], - [ - "Ġw", - "ÉĴt" - ], - [ - "ËĪeËIJ", - "l" - ], - [ - "ÊĪ", - "i" 
- ], - [ - "Ġx", - "ËĪaiÉľ" - ], - [ - "ËĪy", - "Êģ" - ], - [ - "ĠhËĪoËIJ", - "É¡aËIJ" - ], - [ - "Ġts", - "ËĪi" - ], - [ - "ĠËĪÊĮ", - "p" - ], - [ - "Ġn", - "ËĮÉĴt" - ], - [ - "ĠlËĪɪ", - "eËIJ" - ], - [ - "Ġh", - "ËĪa" - ], - [ - "Ġf", - "l" - ], - [ - "Ġn", - "ËĪeËIJ" - ], - [ - "ËĮaËIJ", - "ɪ" - ], - [ - "Ġt", - "ËĪuo" - ], - [ - "tÊĥ", - "ËIJ" - ], - [ - "s", - "ËĪe" - ], - [ - "bʰ", - "i" - ], - [ - "ĠbËĪÊĮh", - "ÊĬt" - ], - [ - "ËĪÉĽ", - "nd" - ], - [ - "Ġs", - "ËĪÉĶ" - ], - [ - "ÉĻn", - "s" - ], - [ - "ËĮÉĻ", - "l" - ], - [ - "ÉĽ", - "Éľ" - ], - [ - "ĠÉ¡", - "l" - ], - [ - "ËĪɪ", - "ɾ" - ], - [ - "ËĪaËIJt", - "a" - ], - [ - "Éľ", - "ËIJ" - ], - [ - "ËĪÉĽnt", - "o" - ], - [ - "sk", - "ËĮoËIJ" - ], - [ - "ËĪÉĽ", - "k" - ], - [ - "ts", - "i" - ], - [ - "Ġt", - "ËĪonÉ¡" - ], - [ - "Ġb", - "iËIJ" - ], - [ - "Ġh", - "ËĪaËIJɪ" - ], - [ - "Ġb", - "ËĪi" - ], - [ - "j", - "j" - ], - [ - "Êİ", - "i" - ], - [ - "Ġk", - "ʰ" - ], - [ - "Ġs", - "ËĪo" - ], - [ - "ll", - "o" - ], - [ - "Ġb", - "aɪ" - ], - [ - "ĠÉĽ", - "nt" - ], - [ - "Ġ", - "ËĪiËIJ" - ], - [ - "ĠÉ¡", - "ËĪo" - ], - [ - "ɾ", - "eËIJ" - ], - [ - "Ġk", - "Êĭ" - ], - [ - "Ġm", - "ËĪeiÉľ" - ], - [ - "ÊĬ", - "ËĪÉĶËIJ" - ], - [ - "Ġt", - "ËĪaɪ" - ], - [ - "Ġsu", - "s" - ], - [ - "Ġr", - "i" - ], - [ - "Ġv", - "ËĮÉĽ" - ], - [ - "ËĪiËIJ", - "no" - ], - [ - "v", - "ano" - ], - [ - "ĠdËĮi", - "ËIJ" - ], - [ - "ĠÊIJ", - "ËĪaÉľn" - ], - [ - "Ê", - "Ĥ" - ], - [ - "ĠÉIJ", - "b" - ], - [ - "ËĪaËIJ", - "h" - ], - [ - "ɪ", - "Êĥ" - ], - [ - "ĠdËĮe", - "lla" - ], - [ - "tËIJ", - "i" - ], - [ - "ĠËĪÊĬ", - "n" - ], - [ - "Ġh", - "iËIJ" - ], - [ - "Ġb", - "ËĪaËIJt" - ], - [ - "Ġth", - "ËĪi" - ], - [ - "Ġa", - "m" - ], - [ - "Ġ", - "ËĪoËIJ" - ], - [ - "Ġh", - "u" - ], - [ - "Ġk", - "ËĪÊĮh" - ], - [ - "Ġz", - "ËĪÉijËIJ" - ], - [ - "ĠÉ¡", - "ËĮÉĶ" - ], - [ - "Ġ", - "ËĪÉĻÊĬ" - ], - [ - "y", - "ËĪi" - ], - [ - "Ġl", - "ËĪÊĮ" - ], - [ - "Ġd", - "ËĪeËIJ" - ], - [ - "Ġs", - "ËĪÉĶËIJ" - ], - [ - "sk", - "ËĮeËIJ" - ], - [ - "ɾ", - "o" - ], 
- [ - "Êģ", - "ËĪÉij" - ], - [ - "t", - "ËĪa" - ], - [ - "Ġk", - "ËĪÊĬ" - ], - [ - "ËĪant", - "e" - ], - [ - "Ġd", - "ÉĶ" - ], - [ - "Ġs", - "ËĪeɪ" - ], - [ - "Ġs", - "ÉĽt" - ], - [ - "ɹ", - "ɪ" - ], - [ - "ĠÉ¡ËĮÉĻÊĬ", - "ɪÅĭ" - ], - [ - "z", - "o" - ], - [ - "Ġj", - "ËĪaËIJ" - ], - [ - "ĠÉĴv", - "ðÉĻ" - ], - [ - "ĠÊ", - "Ŀ" - ], - [ - "ĠÉĽ", - "l" - ], - [ - "Ġs", - "ËĪoËIJ" - ], - [ - "Ġth", - "ËĪiÉľ" - ], - [ - "Ġ", - "ËĪÉĽl" - ], - [ - "Ġly", - "ËĮi" - ], - [ - "nd", - "ÊĴ" - ], - [ - "ĠÉķ", - "jËĪÉiju" - ], - [ - "θ", - "a" - ], - [ - "ĠɾËĮÉĻh", - "eËIJ" - ], - [ - "Ġma", - "ɪ" - ], - [ - "j", - "ÉĻ" - ], - [ - "ĠËĪÊĮ", - "b" - ], - [ - "as", - "jËĪÉĶ" - ], - [ - "d", - "Êģ" - ], - [ - "Ġkh", - "ËĪa" - ], - [ - "ĠËĪe", - "s" - ], - [ - "v", - "i" - ], - [ - "f", - "i" - ], - [ - "ËĮÉĻ", - "b" - ], - [ - "Ġr", - "e" - ], - [ - "Ġav", - "ËĮÉĽ" - ], - [ - "Ġt", - "ËĮi" - ], - [ - "Ġk", - "ɾ" - ], - [ - "Ġb", - "ɪk" - ], - [ - "st", - "e" - ], - [ - "ËĪeËIJÊĥ", - "c" - ], - [ - "p", - "t" - ], - [ - "z", - "ÉĻ" - ], - [ - "Ġw", - "ËĪaËIJ" - ], - [ - "k", - "l" - ], - [ - "ĠsËĪÊĮ", - "m" - ], - [ - "ɪ", - "ÊĪ" - ], - [ - "d", - "z" - ], - [ - "v", - "o" - ], - [ - "ËĮa", - "ÊĬt" - ], - [ - "nd", - "e" - ], - [ - "Ġd", - "ÉĽs" - ], - [ - "ĠÉŁ", - "ËĪaËIJ" - ], - [ - "Ġr", - "ËĮi" - ], - [ - "s", - "ËĮeËIJ" - ], - [ - "É¡", - "i" - ], - [ - "Ġal", - "s" - ], - [ - "ËĪi", - "ðo" - ], - [ - "ĠnËĪi", - "Éľn" - ], - [ - "ÊĬ", - "l" - ], - [ - "ts", - "ËIJ" - ], - [ - "ËĪant", - "o" - ], - [ - "ĠÉĹ", - "ËĪÉĻÊĬ" - ], - [ - "kËIJ", - "i" - ], - [ - "ĠsËĪÊĮ", - "b" - ], - [ - "Ġn", - "ËĪa" - ], - [ - "Ġl", - "ËĮo" - ], - [ - "Ġph", - "ËĪi" - ], - [ - "m", - "ËĮe" - ], - [ - "Ġf", - "a" - ], - [ - "k", - "ÉĻ" - ], - [ - "Ġz", - "ËĪu" - ], - [ - "n", - "s" - ], - [ - "ĠÊģ", - "e" - ], - [ - "Ġb", - "ËĪo" - ], - [ - "ËĪaËIJt", - "i" - ], - [ - "Ġm", - "an" - ], - [ - "ĠlËĪi", - "Éij" - ], - [ - "ĠÉĹ", - "ËĮyÉĻ" - ], - [ - "Ġf", - "ËĪÉĶËIJ" - ], - [ - "ĠkÊĭ", - "ËĪeËIJÊĥc" - ], 
- [ - "Ġx", - "ËĪÉij" - ], - [ - "ĠtÉķ", - "ËĪu" - ], - [ - "j", - "ÉĻɾ" - ], - [ - "Ġɪ", - "st" - ], - [ - "w", - "ËĪi" - ], - [ - "ĠËĮaɪn", - "ÉĻ" - ], - [ - "ɪ", - "É¡" - ], - [ - "Ġs", - "ÊĪ" - ], - [ - "ËĪi", - "ÉĻl" - ], - [ - "Ġn", - "ËĪiÉĽÉľn" - ], - [ - "ĠËĮÉĽ", - "ËIJ" - ], - [ - "ËĪaɪ", - "nd" - ], - [ - "Ġz", - "ËĪi" - ], - [ - "v", - "ÉĻn" - ], - [ - "m", - "z" - ], - [ - "ð", - "os" - ], - [ - "dÊĴ", - "ËIJ" - ], - [ - "j", - "ËĪa" - ], - [ - "ɾ", - "ËĪÉĶ" - ], - [ - "l", - "ËĪe" - ], - [ - "Ê", - "²" - ], - [ - "Ġv", - "ËĪÉĶ" - ], - [ - "Ġl", - "ËĪiÉĽ" - ], - [ - "θ", - "e" - ], - [ - "mËĪe", - "nte" - ], - [ - "Ġɪn", - "ðÉĻ" - ], - [ - "Ġaɪ", - "m" - ], - [ - "n", - "ÉĻn" - ], - [ - "Ġh", - "ÉĻm" - ], - [ - "ɾ", - "aËIJ" - ], - [ - "ĠsËĪuo", - "Éľ" - ], - [ - "Ġɲ", - "ËĪi" - ], - [ - "Ġɹ", - "ËĪiÉĻl" - ], - [ - "l", - "ËĪa" - ], - [ - "Ġb", - "ËĪÉĶ" - ], - [ - "Ġk", - "ËĪai" - ], - [ - "Êģ", - "ËĪa" - ], - [ - "Ġw", - "ËĪÉľËIJ" - ], - [ - "Ġa", - "ËIJ" - ], - [ - "Ġp", - "as" - ], - [ - "ËĪÊĮ", - "s" - ], - [ - "w", - "ËĪÉĽÉ¾" - ], - [ - "ĠÉĹ", - "ËĪe" - ], - [ - "ĠhËĮa", - "tÉĻ" - ], - [ - "a", - "ɪn" - ], - [ - "ĠËĪÉĶ", - "pʰ" - ], - [ - "Êģ", - "ËĪe" - ], - [ - "ĠÉŁaËIJ", - "ËĪeËIJÉ¡aËIJ" - ], - [ - "ĠËĪÊĬ", - "s" - ], - [ - "ĠtÉķhËĪi", - "Éľ" - ], - [ - "nt", - "Êĥ" - ], - [ - "Ġx", - "ËĪuo" - ], - [ - "ËĪu", - "Êģ" - ], - [ - "Ġɪ", - "m" - ], - [ - "ɳ", - "Éĸ" - ], - [ - "ËĪyÉĻ", - "Éľkh" - ], - [ - "ĠËĪy", - "ÉĽ" - ], - [ - "Ġm", - "ËĮaËIJ" - ], - [ - "Åĵ", - "Êģ" - ], - [ - "ĠËĪa", - "lt" - ], - [ - "Ġk", - "ÉĻm" - ], - [ - "Êİ", - "o" - ], - [ - "ĠÉIJ", - "n" - ], - [ - "Ġf", - "y" - ], - [ - "ĠËĮÉĽ", - "ra" - ], - [ - "ĠÉ¡", - "ËĪÊĬ" - ], - [ - "Ġp", - "ËĪÊĮ" - ], - [ - "l", - "s" - ], - [ - "Ġl", - "ËĪiËIJ" - ], - [ - "ĠÊĤ", - "ËĪy" - ], - [ - "Ġbɪk", - "ËĪÊĮz" - ], - [ - "ĠÉ¡", - "ÉĽt" - ], - [ - "Ġb", - "ɾ" - ], - [ - "t", - "ʰ" - ], - [ - "tÉĻl", - "ËĮÉĻb" - ], - [ - "x", - "o" - ], - [ - "sk", - "ËĮaËIJ" - ], - [ - "ɲ", - "ʲ" - ], - [ 
- "ËĪeËIJk", - "ÊĪ" - ], - [ - "r", - "ÉĻ" - ], - [ - "tÊĥ", - "o" - ], - [ - "ĠpÊģ", - "ÉĶ" - ], - [ - "Ġɹ", - "ËĪaɪt" - ], - [ - "Ġp", - "ËĪei" - ], - [ - "ËĮ", - "ɪç" - ], - [ - "j", - "ËĪÉĽÉ¾" - ], - [ - "tËIJ", - "a" - ], - [ - "ĠÉIJb", - "ËĮaÊĬt" - ], - [ - "ĠkÊĭËĪeËIJÊĥc", - "ÉĻn" - ], - [ - "Ġv", - "ËĪe" - ], - [ - "ÊĬ", - "Éľ" - ], - [ - "Ġa", - "kËĪe" - ], - [ - "Ġp", - "ËĪai" - ], - [ - "v", - "ËĪÉĽ" - ], - [ - "Ġθ", - "ɹ" - ], - [ - "ɪ", - "f" - ], - [ - "Ġav", - "ËĪÉĽ" - ], - [ - "Ġk", - "ËĪe" - ], - [ - "d", - "ËĪi" - ], - [ - "ËĪeËIJ", - "Éĸ" - ], - [ - "Ġb", - "ÉĻt" - ], - [ - "ÊĪ", - "ʰ" - ], - [ - "t", - "eËIJ" - ], - [ - "θj", - "ËĪÉĶn" - ], - [ - "d", - "Éľ" - ], - [ - "ĠjËĪi", - "Éľ" - ], - [ - "Ġv", - "e" - ], - [ - "É£", - "ËĪu" - ], - [ - "ËĪÊĮh", - "ÉĻl" - ], - [ - "Ġp", - "ÉĶ" - ], - [ - "ĠÉ¡", - "r" - ], - [ - "Ġð", - "a" - ], - [ - "Ġv", - "ËĪiËIJ" - ], - [ - "ĠËĮ", - "ÉijËIJ" - ], - [ - "ËĪÉĻÊĬ", - "nt" - ], - [ - "Ġb", - "ËĪaËIJɾ" - ], - [ - "ĠmËĪÊĮ", - "tÉĻlËĮÉĻb" - ], - [ - "l", - "d" - ], - [ - "ĠtÉķ", - "ËĮÉĶ" - ], - [ - "p", - "a" - ], - [ - "ð", - "ËĪad" - ], - [ - "ËĪi", - "ɾ" - ], - [ - "Ġx", - "ËĪu" - ], - [ - "ĠlËĪi", - "ÉľÅĭ" - ], - [ - "ËĪeɪ", - "s" - ], - [ - "ĠÉĹËĮe", - "Éľn" - ], - [ - "Ġth", - "ËĪiÉĽ" - ], - [ - "tËIJ", - "e" - ], - [ - "ĠavËĮÉĽ", - "k" - ], - [ - "ĠËĮ", - "ÉĶ" - ], - [ - "Ġk", - "ËĪÉiju" - ], - [ - "ɪ", - "v" - ], - [ - "iËIJ", - "z" - ], - [ - "ËĪo", - "s" - ], - [ - "ĠÉ¡", - "ɹ" - ], - [ - "a", - "nd" - ], - [ - "ĠlËĪi", - "ou" - ], - [ - "ĠËĪo", - "Éľ" - ], - [ - "É¡", - "l" - ], - [ - "Ġp", - "ËĪÉĶËIJ" - ], - [ - "Ġm", - "ËĮeËIJ" - ], - [ - "Ġk", - "ËĪÉĴ" - ], - [ - "n", - "os" - ], - [ - "ç", - "ÉĻn" - ], - [ - "f", - "ÉĻn" - ], - [ - "ĠsËĪÊĮkt", - "ËĮeËIJ" - ], - [ - "Ġ", - "ËĪaɪn" - ], - [ - "ËĪoËIJ", - "re" - ], - [ - "j", - "ËĪÉĽn" - ], - [ - "Ġð", - "ËĪÉĽn" - ], - [ - "ĠtÉķh", - "ËĪiÉĽÉľn" - ], - [ - "Ġh", - "ËĪaɪ" - ], - [ - "ɾ", - "ËĪÉĽ" - ], - [ - "Ġs", - "ËĪu" - ], - [ - "ĠkËĪɪ", - 
"jaËIJ" - ], - [ - "Ġpj", - "ËĮÊĬ" - ], - [ - "ĠhÉĻm", - "ËĮaËIJ" - ], - [ - "ĠËĮÊĮ", - "p" - ], - [ - "Ġp", - "ËĪÊĮhÉĻl" - ], - [ - "Ġx", - "ËĪÉĻ" - ], - [ - "d", - "ËĪe" - ], - [ - "Ġm", - "Éij" - ], - [ - "ĠÊĬ", - "m" - ], - [ - "nd", - "ÉĻ" - ], - [ - "Ġd", - "ËĪÉĻÊĬnt" - ], - [ - "ËĪeËIJ", - "ÊĥÉĻn" - ], - [ - "Ġða", - "ts" - ], - [ - "i", - "s" - ], - [ - "Ġc", - "ËĪaËIJh" - ], - [ - "p", - "e" - ], - [ - "Ġs", - "ËĮo" - ], - [ - "Ġð", - "ËĪe" - ], - [ - "Ġs", - "ËĪaËIJt" - ], - [ - "ËĪa", - "Êģ" - ], - [ - "Ġs", - "ËĪe" - ], - [ - "ÉĻ", - "k" - ], - [ - "ɪ", - "Êĭ" - ], - [ - "ĠkËĪoËIJ", - "i" - ], - [ - "k", - "ÉĶ" - ], - [ - "Ġv", - "ËĪaËIJÊĬ" - ], - [ - "Ġf", - "ËĪei" - ], - [ - "Ġl", - "ËĪeËIJk" - ], - [ - "Ġh", - "ËĪiÉĻ" - ], - [ - "Ġa", - "ÊĬ" - ], - [ - "ËĪÉĽ", - "ndo" - ], - [ - "ËĪe", - "s" - ], - [ - "Ġz", - "ËĪÉĶ" - ], - [ - "Ġ", - "ËĪÉĽÉ¾a" - ], - [ - "nËĪi", - "Éľn" - ], - [ - "ĠkËĪÊĮ", - "m" - ], - [ - "Ġl", - "ËĪÉĴ" - ], - [ - "ɪ", - "st" - ], - [ - "Ġp", - "Éij" - ], - [ - "Ġf", - "ËĪÉĶ" - ], - [ - "Ġth", - "ËĪonÉ¡" - ], - [ - "nk", - "e" - ], - [ - "ËĮ", - "ɪk" - ], - [ - "Ġɲ", - "ËĪÉĻ" - ], - [ - "ËĮÊĮ", - "m" - ], - [ - "ËĪiËIJ", - "t" - ], - [ - "ĠwËĪÉĴ", - "nt" - ], - [ - "ËĪaβ", - "an" - ], - [ - "ĠbËĪÊĮ", - "r" - ], - [ - "ÉĽ", - "nd" - ], - [ - "ĠËĮÉijËIJ", - "bÉľ" - ], - [ - "Ġv", - "ËĪaɪ" - ], - [ - "ĠtÊĥ", - "ËĮi" - ], - [ - "ĠθËĪɪÅĭ", - "k" - ], - [ - "st", - "i" - ], - [ - "Ġk", - "ɹ" - ], - [ - "ĠËĪa", - "ÊĬt" - ], - [ - "st", - "ÉĻn" - ], - [ - "ĠÊĭ", - "ËĪÊĮn" - ], - [ - "ĠÉ¡", - "ËĮaËIJ" - ], - [ - "ËĪaËIJÉľ", - "ɲ" - ], - [ - "Êģ", - "i" - ], - [ - "ĠnËĪÉĶ", - "x" - ], - [ - "ĠɹËĪiÉĻl", - "ɪ" - ], - [ - "Ġv", - "ËĮi" - ], - [ - "Ġðe", - "ÉĻ" - ], - [ - "ËĮɪ", - "tÊĥ" - ], - [ - "Ġv", - "ËĪyÉĻ" - ], - [ - "ĠËĮaËIJpk", - "ËĮaËIJ" - ], - [ - "Ġf", - "ËĮaËIJɪ" - ], - [ - "Ġp", - "ËĪÉĶ" - ], - [ - "ĠnËĪÊĮ", - "mb" - ], - [ - "θ", - "es" - ], - [ - "j", - "ËĪÉĽÊģ" - ], - [ - "ĠkËĪÊĬ", - "cʰ" - ], - [ - "m", - "ËĪÉĽ" - ], - [ - 
"Ġv", - "ËĪu" - ], - [ - "Ġl", - "ÅĵÊģ" - ], - [ - "ĠiËIJ", - "m" - ], - [ - "ÊĪ", - "ÉĻɾ" - ], - [ - "tÊĥ", - "i" - ], - [ - "ËIJ", - "s" - ], - [ - "Ġt", - "ËĪy" - ], - [ - "ĠmËĪi", - "ÉľÅĭ" - ], - [ - "ɾ", - "ËĪe" - ], - [ - "m", - "ËĮa" - ], - [ - "Ġm", - "ËĮiËIJ" - ], - [ - "ĠÉĽ", - "ks" - ], - [ - "ɪ", - "p" - ], - [ - "ĠkËĪÊĮɾ", - "nËĮaËIJ" - ], - [ - "ĠËĮaÊĬ", - "x" - ], - [ - "r", - "ËĪiËIJ" - ], - [ - "Ġc", - "ËĪÊĮl" - ], - [ - "m", - "os" - ], - [ - "ĠkËĪÊĮɾt", - "ËĮeËIJ" - ], - [ - "iËIJ", - "ɾ" - ], - [ - "k", - "ÉĻn" - ], - [ - "Ġd", - "ËĪu" - ], - [ - "n", - "aËIJ" - ], - [ - "Ġp", - "wËĪe" - ], - [ - "ËĮÉĶ", - "ɪ" - ], - [ - "ĠtÉķh", - "ËĪiÉĽ" - ], - [ - "Ġβ", - "ËĪi" - ], - [ - "ËĪiÉĽ", - "Éľt" - ], - [ - "Ġt", - "e" - ], - [ - "ËĪað", - "os" - ], - [ - "m", - "ËĪa" - ], - [ - "Ġv", - "ËĪo" - ], - [ - "Ġm", - "ËĪɪ" - ], - [ - "Ġb", - "ËĮi" - ], - [ - "a", - "d" - ], - [ - "d", - "o" - ], - [ - "Ġn", - "ËĪaÊĬ" - ], - [ - "ĠʲËĪy", - "Éľ" - ], - [ - "w", - "ËĪÉĽ" - ], - [ - "ËĪi", - "s" - ], - [ - "e", - "l" - ], - [ - "Ġpa", - "r" - ], - [ - "Ġt", - "ËĪai" - ], - [ - "ĠdËĪɪ", - "jaËIJ" - ], - [ - "h", - "ËĪi" - ], - [ - "Ġɾ", - "ËĪÊĮ" - ], - [ - "Ġd", - "ËĪe" - ], - [ - "ËĪaɪ", - "d" - ], - [ - "Ġp", - "er" - ], - [ - "Ġs", - "ËĮÉĶ" - ], - [ - "w", - "e" - ], - [ - "ÊĬ", - "m" - ], - [ - "Ġi", - "n" - ], - [ - "ĠjËĪuËIJ", - "z" - ], - [ - "ËĪiËIJp", - "ÉĻl" - ], - [ - "ĠÊĭ", - "ËĪaËIJl" - ], - [ - "Ġe", - "tËĪÉĽ" - ], - [ - "ËĮÉĽ", - "m" - ], - [ - "Ġn", - "ËĪu" - ], - [ - "ËĪÉĽ", - "kt" - ], - [ - "ĠiËIJ", - "ɾ" - ], - [ - "Ġb", - "ɹ" - ], - [ - "Ġtsh", - "ËĪi" - ], - [ - "ĠÉĹ", - "ËĪÉĶÉľ" - ], - [ - "Ġkw", - "ËĮa" - ], - [ - "Ġf", - "ËĪuÉľ" - ], - [ - "w", - "ËĮa" - ], - [ - "Ġd", - "ËĪiËIJ" - ], - [ - "ĠÉ¡", - "ËĪyÉĻ" - ], - [ - "ËĮÉĽ", - "ËIJ" - ], - [ - "r", - "ËĪa" - ], - [ - "Ġn", - "e" - ], - [ - "Ġz", - "ËĪyÉĻ" - ], - [ - "Ġb", - "ËĪaɪ" - ], - [ - "ĠÉŁ", - "ËĪÊĮb" - ], - [ - "ËĪuËIJ", - "to" - ], - [ - "ÊĬ", - "nt" - ], - [ - "Ġc", - "ʰ" - 
], - [ - "ËĪÉĽnt", - "i" - ], - [ - "ËĪo", - "ÉĻ" - ], - [ - "Ġs", - "ËĮÊĮm" - ], - [ - "Ġl", - "Éij" - ], - [ - "ËĮe", - "va" - ], - [ - "ɾ", - "ÉĽ" - ], - [ - "nt", - "Éľ" - ], - [ - "Ġm", - "ËĪÉĽn" - ], - [ - "ËĪÉijËIJ", - "k" - ], - [ - "Ġki", - "l" - ], - [ - "ËĪon", - "es" - ], - [ - "f", - "f" - ], - [ - "Ġm", - "ËĪÉĽËIJ" - ], - [ - "Ġv", - "ËĪÉĻɪ" - ], - [ - "Ġ", - "ËĪÉĶËIJ" - ], - [ - "ĠËĮɪ", - "nt" - ], - [ - "ÊĬ", - "n" - ], - [ - "Ġw", - "ɪl" - ], - [ - "Ġs", - "in" - ], - [ - "ĠËĮa", - "lla" - ], - [ - "Ġaβ", - "ËĪia" - ], - [ - "p", - "i" - ], - [ - "ËĪo", - "Éľ" - ], - [ - "ɪj", - "ËĮaËIJ" - ], - [ - "k", - "u" - ], - [ - "Ġv", - "ËĪɪ" - ], - [ - "Ġtu", - "t" - ], - [ - "ĠtËĪe", - "Éľ" - ], - [ - "Ġh", - "ËĪÉĶ" - ], - [ - "β", - "ɾe" - ], - [ - "s", - "ÉĻɾ" - ], - [ - "Ġkh", - "ËĪai" - ], - [ - "Ġm", - "ËĪÉĶ" - ], - [ - "Ġt", - "a" - ], - [ - "Ġɲ", - "ËĪaËIJ" - ], - [ - "Ġn", - "u" - ], - [ - "ËĪuËIJ", - "n" - ], - [ - "ĠÉĻËIJ", - "Éľ" - ], - [ - "ĠËĪa", - "ÊĬf" - ], - [ - "ËĪiËIJd", - "Éľ" - ], - [ - "nt", - "i" - ], - [ - "Ġp", - "ËĪiËIJpÉĻl" - ], - [ - "Ġk", - "j" - ], - [ - "Ġp", - "e" - ], - [ - "Ġm", - "ËĪÉij" - ], - [ - "ËĮa", - "ɪ" - ], - [ - "ËĪaËIJ", - "le" - ], - [ - "Ġv", - "ËĮÉĻËIJÉªÉľ" - ], - [ - "mp", - "o" - ], - [ - "ĠkËĪɪ", - "t" - ], - [ - "Ġn", - "ËĮÉĽ" - ], - [ - "ĠÉŁ", - "ËĪaËIJtaËIJ" - ], - [ - "ĠsËĪaËIJt", - "ʰ" - ], - [ - "ĠÉŁ", - "ËĪi" - ], - [ - "Ġs", - "o" - ], - [ - "Ġb", - "ËĪÉĽ" - ], - [ - "k", - "ËĪi" - ], - [ - "ɪt", - "i" - ], - [ - "Ġts", - "i" - ], - [ - "Ġk", - "Êģ" - ], - [ - "ËĮ", - "ÉĴ" - ], - [ - "É¡", - "ÉĻl" - ], - [ - "k", - "st" - ], - [ - "Ġm", - "ËĪÉĻËIJ" - ], - [ - "ËĪÊĮ", - "k" - ], - [ - "Ġn", - "ËĪaËIJÊĬ" - ], - [ - "Ġa", - "p" - ], - [ - "ĠlËĪɪ", - "kʰ" - ], - [ - "ll", - "i" - ], - [ - "ĠkwËĪa", - "l" - ], - [ - "Ġ", - "ËĪÉĻËIJ" - ], - [ - "Ġts", - "ËĪuei" - ], - [ - "Ġd", - "o" - ], - [ - "ĠkËIJ", - "jËĪo" - ], - [ - "ÊĬ", - "z" - ], - [ - "Ġp", - "ËĪaËIJ" - ], - [ - "Ġm", - "ËĪuËIJ" - ], - [ - 
"ĠÉ¡ÉĻ", - "v" - ], - [ - "r", - "ËĪi" - ], - [ - "Ġt", - "w" - ], - [ - "ËĮ", - "ɪn" - ], - [ - "d", - "ËĪÉij" - ], - [ - "Ġð", - "ËĪi" - ], - [ - "ĠËĪaËIJ", - "i" - ], - [ - "Ġh", - "ËĪiÉĽ" - ], - [ - "Ġð", - "ËĮÉĽm" - ], - [ - "Ġpʰ", - "ËĪɪɾ" - ], - [ - "ÉĴ", - "m" - ], - [ - "ĠËĮ", - "eËIJ" - ], - [ - "Ġth", - "ËĪaiÉľ" - ], - [ - "Ġv", - "ËĪas" - ], - [ - "Ġn", - "ÉijËIJ" - ], - [ - "p", - "ÉĻn" - ], - [ - "Ġp", - "ËĮÉĻɾ" - ], - [ - "ĠÉĹ", - "ËĪaËIJɪ" - ], - [ - "ËĪou", - "Éľ" - ], - [ - "ĠÊIJ", - "ËĪuÉľ" - ], - [ - "ĠmËĪa", - "n" - ], - [ - "ĠtËĪÉĻ", - "ÉªÉľ" - ], - [ - "Ġl", - "ËĪaËIJÊĬ" - ], - [ - "m", - "ËĪÉĽnte" - ], - [ - "ĠfËĪa", - "m" - ], - [ - "s", - "jËĪÉĶ" - ], - [ - "Ġp", - "ËĪÉĻ" - ], - [ - "ËĪeËIJ", - "m" - ], - [ - "Ġp", - "ËĪÊĮr" - ], - [ - "j", - "ËĪi" - ], - [ - "Ġl", - "ÉĽ" - ], - [ - "Ġt", - "en" - ], - [ - "ËĪoËIJ", - "ra" - ], - [ - "k", - "i" - ], - [ - "ĠÊĤ", - "ËĪaËIJÊĬ" - ], - [ - "k", - "ɪ" - ], - [ - "bËIJ", - "e" - ], - [ - "ËĪa", - "lt" - ], - [ - "ð", - "ɪ" - ], - [ - "p", - "ËĪi" - ], - [ - "ĠËĮÉĽ", - "nt" - ], - [ - "Ġm", - "ËĪei" - ], - [ - "Ġh", - "ËĪÉĻÊĬ" - ], - [ - "Ġh", - "ËĪÉĽÉ¾" - ], - [ - "j", - "ËĪÉij" - ], - [ - "ĠhËĪÊĬ", - "aËIJ" - ], - [ - "m", - "Éľ" - ], - [ - "Ġd", - "ʰ" - ], - [ - "ĠtÊĥ", - "ËĪe" - ], - [ - "l", - "ËĪÉĽ" - ], - [ - "ËĪaËIJt", - "e" - ], - [ - "Ġp", - "ËĪuËIJ" - ], - [ - "Ġm", - "ËĪÊĬ" - ], - [ - "ËĪaËIJɪ", - "ÊĪ" - ], - [ - "d", - "iËIJ" - ], - [ - "Ġfɹ", - "ÉĴm" - ], - [ - "Ġh", - "ËĪÉijËIJ" - ], - [ - "β", - "o" - ], - [ - "ĠmËĪi", - "Éľn" - ], - [ - "Ġð", - "iËIJz" - ], - [ - "Ġk", - "ËĪou" - ], - [ - "ËĪiËIJ", - "na" - ], - [ - "Ġav", - "ËĮeva" - ], - [ - "Ġ", - "ËĪaËIJɾ" - ], - [ - "Ġn", - "ËĪuËIJɾ" - ], - [ - "Ġβ", - "ËĪe" - ], - [ - "Ġz", - "aɪn" - ], - [ - "ËĪÉĽ", - "d" - ], - [ - "É", - "Ĺ" - ], - [ - "ËĪeɪ", - "k" - ], - [ - "s", - "ËĮÉĻÊĬ" - ], - [ - "ËĪeËIJ", - "ÉŁ" - ], - [ - "ĠÊĤ", - "ËĪÉĻËIJ" - ], - [ - "j", - "e" - ], - [ - "cʰ", - "ËIJ" - ], - [ - "ËĪÉĶ", - "r" - ], - [ - "ÉĽ", 
- "ËIJ" - ], - [ - "ĠtÉķhËĪy", - "Ã¦Éľn" - ], - [ - "ĠËĮaɪn", - "ÉĻn" - ], - [ - "ĠiËIJ", - "n" - ], - [ - "ĠbËĪÊĮ", - "c" - ], - [ - "ËĪiËIJ", - "m" - ], - [ - "ɾ", - "as" - ], - [ - "ËĮÉĻ", - "s" - ], - [ - "Ġv", - "ËĪeËIJ" - ], - [ - "ĠËĪÉĻr", - "Éľ" - ], - [ - "Ġd", - "uËIJ" - ], - [ - "nt", - "ÉĻ" - ], - [ - "Ġpɹ", - "ËĪÉĴ" - ], - [ - "Ġb", - "ËĪɪ" - ], - [ - "ĠwËĪo", - "Éľ" - ], - [ - "n", - "ËĮi" - ], - [ - "Ġh", - "ÉIJ" - ], - [ - "Ġk", - "ËĪÉĽ" - ], - [ - "Ġe", - "t" - ], - [ - "jËĪÉĽ", - "ndo" - ], - [ - "ĠËĪai", - "Éľ" - ], - [ - "Ġl", - "i" - ], - [ - "ĠËĪaÊĬ", - "s" - ], - [ - "kËIJ", - "o" - ], - [ - "ĠÉĹ", - "ËĪyÉĻ" - ], - [ - "k", - "eËIJ" - ], - [ - "Ġf", - "ËĪiËIJl" - ], - [ - "Ġbʰ", - "ËĪaËIJi" - ], - [ - "ĠÉ¡ÉĻ", - "Êĥ" - ], - [ - "ÊĴ", - "ËĪe" - ], - [ - "Ġn", - "jËĪuËIJ" - ], - [ - "ĠËĪa", - "k" - ], - [ - "ĠÉĹ", - "ËĪaËIJ" - ], - [ - "z", - "ËĪa" - ], - [ - "v", - "ËĪe" - ], - [ - "ĠhËĮa", - "ÊĬ" - ], - [ - "ÉIJ", - "ç" - ], - [ - "ĠɾËĪÊĮ", - "kʰ" - ], - [ - "p", - "ËĪe" - ], - [ - "ĠtÉĻ", - "bi" - ], - [ - "ĠpËĪÊĮhÉĻl", - "ËĮeËIJ" - ], - [ - "Ġf", - "ËĪÉĽ" - ], - [ - "Ġw", - "ËĮɪtÊĥ" - ], - [ - "ĠtÉķËĪy", - "ÉĽÉľ" - ], - [ - "w", - "ËĮe" - ], - [ - "ËĮa", - "ɪt" - ], - [ - "ĠnÉijËIJ", - "x" - ], - [ - "ĠkËĪÉĶËIJ", - "n" - ], - [ - "ÊĬ", - "k" - ], - [ - "ĠbËĪaËIJ", - "d" - ], - [ - "Åĭ", - "ÉĻn" - ], - [ - "Ġn", - "i" - ], - [ - "Ġb", - "ËĪe" - ], - [ - "Ġm", - "ËĮÊĬ" - ], - [ - "ËĪa", - "r" - ], - [ - "ĠmËĮe", - "ɪk" - ], - [ - "Ġs", - "ËĪaËIJɾ" - ], - [ - "β", - "e" - ], - [ - "ĠtÉķhËĪi", - "ÉľÅĭ" - ], - [ - "it", - "ËĪe" - ], - [ - "k", - "ËĮe" - ], - [ - "ËĪÉĽËIJ", - "l" - ], - [ - "ËĮ", - "ÉĴn" - ], - [ - "ËĮ", - "Éij" - ], - [ - "Ġb", - "ËĪɪl" - ], - [ - "Ġw", - "ÊĬd" - ], - [ - "Ġb", - "ËĪoËIJl" - ], - [ - "r", - "d" - ], - [ - "i", - "ÉĻ" - ], - [ - "Ġd", - "a" - ], - [ - "Ġb", - "ËĪaËIJÊĬ" - ], - [ - "ĠnËĪÊĮmb", - "ÉĻɾ" - ], - [ - "ËĪaËIJɪ", - "Éľ" - ], - [ - "ĠÉĽ", - "m" - ], - [ - "Ġm", - "iËIJɾ" - ], - [ - "ËĪeɪ", - "m" - ], - [ 
- "l", - "os" - ], - [ - "ËĮÉĽ", - "t" - ], - [ - "ĠËĮaÊĬ", - "s" - ], - [ - "ĠmËĪa", - "Éľt" - ], - [ - "Ġw", - "ËĪuÉĻ" - ], - [ - "Ġw", - "ËĪeɪ" - ], - [ - "Ġse", - "ɲ" - ], - [ - "Ġb", - "jËĪÉĽ" - ], - [ - "Ġw", - "ÉĽn" - ], - [ - "f", - "l" - ], - [ - "Ġkh", - "wËĪa" - ], - [ - "d", - "ËĪÉĽ" - ], - [ - "v", - "ɹɪ" - ], - [ - "ĠËĪa", - "ɾ" - ], - [ - "jËĪÉiju", - "Éľ" - ], - [ - "ĠËĮaËIJpk", - "ËĮeËIJ" - ], - [ - "b", - "Êģ" - ], - [ - "ĠtËĪaɪ", - "m" - ], - [ - "Ġ", - "ËĪÉij" - ], - [ - "Ġs", - "ËĮa" - ], - [ - "Ġz", - "ËĪoɪ" - ], - [ - "ËĪÉĶɾ", - "a" - ], - [ - "Ġd", - "ËĪø" - ], - [ - "ËĪÉĶɾ", - "t" - ], - [ - "ĠÅĭ", - "ËĪÉĶ" - ], - [ - "m", - "in" - ], - [ - "Ġl", - "ËĪÊĬk" - ], - [ - "ËĪÉĶËIJ", - "t" - ], - [ - "ĠËĪÉĶ", - "tɾ" - ], - [ - "Ġf", - "ËĪaɪ" - ], - [ - "ĠÉ¡", - "ÉĴt" - ], - [ - "ËĪeËIJ", - "ÉĻn" - ], - [ - "k", - "ËĪÉĶ" - ], - [ - "ĠvËĪÉĽ", - "ɹi" - ], - [ - "m", - "ÉĽ" - ], - [ - "ËĪaɪ", - "z" - ], - [ - "Ġe", - "sp" - ], - [ - "ɲ", - "a" - ], - [ - "Ġl", - "ËĪo" - ], - [ - "ËĪÉĽËIJ", - "ra" - ], - [ - "β", - "ËĪi" - ], - [ - "ou", - "Éľ" - ], - [ - "ËĮÉĻ", - "k" - ], - [ - "tÊĥ", - "uËIJ" - ], - [ - "Ġn", - "ËĪyÉĻ" - ], - [ - "ÊĪ", - "ɾ" - ], - [ - "ĠÉ¡", - "ËĪy" - ], - [ - "ĠtËĪo", - "ðo" - ], - [ - "ËĪɪ", - "çt" - ], - [ - "Ġm", - "ɪç" - ], - [ - "ĠËĪa", - "nd" - ], - [ - "Ġkw", - "ËĮÉĽl" - ], - [ - "ĠÊĤ", - "ËĪaËIJ" - ], - [ - "ĠnËĪi", - "Éľ" - ], - [ - "ËĪÉĶ", - "p" - ], - [ - "ËĪiËIJ", - "z" - ], - [ - "ĠÊĤ", - "ËĪaÊĬ" - ], - [ - "ĠɾËĮÉĻh", - "i" - ], - [ - "ĠsËĮÊĬ", - "o" - ], - [ - "ĠÉĽ", - "É¡" - ], - [ - "Ġd", - "Åĵ" - ], - [ - "ĠÉ¡ËĮaËIJ", - "ÉªÉľ" - ], - [ - "d", - "ɪ" - ], - [ - "l", - "ËĮa" - ], - [ - "st", - "ËĪi" - ], - [ - "ĠdËĮiËIJ", - "z" - ], - [ - "Ġt", - "ËĮÊĬ" - ], - [ - "θ", - "i" - ], - [ - "ĠËĪɪ", - "skËĮoËIJ" - ], - [ - "nd", - "ÉĻn" - ], - [ - "Ġts", - "v" - ], - [ - "Ġh", - "ËĪÉĻËIJ" - ], - [ - "ĠÊĥ", - "ËĪÊĬ" - ], - [ - "ÉĻt", - "ËĮeËIJ" - ], - [ - "p", - "ËĮÉĽ" - ], - [ - "ËĪaɾ", - "ÉĶn" - ], - [ - "Ġp", - "ÉĽÊģ" - 
], - [ - "Ġ", - "y" - ], - [ - "m", - "nËĮeËIJ" - ], - [ - "ËĪÉĽ", - "llo" - ], - [ - "ĠÉ¡", - "ËĪÉĻ" - ], - [ - "ĠËĮa", - "d" - ], - [ - "ĠÊĥ", - "v" - ], - [ - "ËĪÊı", - "ɾ" - ], - [ - "r", - "ËĪe" - ], - [ - "y", - "ËIJ" - ], - [ - "Ġp", - "ËĪaËIJs" - ], - [ - "Ġ", - "ËĪÉĽn" - ], - [ - "ɪ", - "dÊĴ" - ], - [ - "ËĪua", - "i" - ], - [ - "Ġf", - "i" - ], - [ - "Ġt", - "ËĪyÉĻ" - ], - [ - "ËĪaËIJ", - "ÉŁ" - ], - [ - "Ġt", - "jËĪe" - ], - [ - "ËĪaËIJn", - "aËIJ" - ], - [ - "st", - "ɾ" - ], - [ - "Êİ", - "e" - ], - [ - "ËĮe", - "ɪt" - ], - [ - "b", - "a" - ], - [ - "ð", - "as" - ], - [ - "v", - "Êģ" - ], - [ - "Ġz", - "ËĪÉĻËIJ" - ], - [ - "ËĪaËIJ", - "li" - ], - [ - "ÉŁÊ°", - "eËIJ" - ], - [ - "ËĪaËIJt", - "eËIJ" - ], - [ - "Ġv", - "ËĪa" - ], - [ - "Ġsa", - "l" - ], - [ - "ËĪaËIJ", - "no" - ], - [ - "ĠÉ¡ÉĻ", - "z" - ], - [ - "ĠhËĪoËIJ", - "ti" - ], - [ - "Ġɲ", - "ËĪiÉĽ" - ], - [ - "t", - "Éľ" - ], - [ - "ĠËĪaËIJ", - "p" - ], - [ - "Ġw", - "ËĪÉĽl" - ], - [ - "Ġm", - "ËĪɪl" - ], - [ - "Ġfy", - "ËIJɾ" - ], - [ - "ËĪÉĽËIJs", - "aËIJ" - ], - [ - "Ġb", - "ËĮiËIJ" - ], - [ - "ËĪaËIJ", - "jaËIJ" - ], - [ - "ËĪɪ", - "p" - ], - [ - "Ġf", - "Êģ" - ], - [ - "tsi", - "ËĪoËIJne" - ], - [ - "Ġw", - "ËĪuÉľ" - ], - [ - "Ġv", - "i" - ], - [ - "ĠwËĪÉij", - "Éľn" - ], - [ - "ËĪoËIJ", - "n" - ], - [ - "ĠÉĹ", - "ËĪÉĻɪ" - ], - [ - "ĠÊĿ", - "ËĪo" - ], - [ - "Ġr", - "a" - ], - [ - "m", - "ÉĻnt" - ], - [ - "ËĪaÊĬ", - "nd" - ], - [ - "Ġp", - "ÉĽÉ¾" - ], - [ - "ĠÉĹ", - "ËĪaËIJÊĬ" - ], - [ - "oËIJ", - "ɾ" - ], - [ - "h", - "ËĪo" - ], - [ - "ĠÉĴ", - "n" - ], - [ - "ĠÊİ", - "e" - ], - [ - "ĠsËĪɪ", - "ks" - ], - [ - "É¡", - "n" - ], - [ - "ĠÉ¡", - "ËĪa" - ], - [ - "Ġ", - "θj" - ], - [ - "Ġp", - "ËĪe" - ], - [ - "sp", - "e" - ], - [ - "Ġv", - "ËĪÉĻ" - ], - [ - "Ġf", - "ËĪɪ" - ], - [ - "ĠËĮɪnt", - "ÊĬ" - ], - [ - "l", - "ÉĻn" - ], - [ - "Ġn", - "ËĪiËIJd" - ], - [ - "ĠsËĮÊĬ", - "a" - ], - [ - "ĠËĪu", - "m" - ], - [ - "Ġd", - "ËĪeɪ" - ], - [ - "ĠËĪÊĮ", - "bʰi" - ], - [ - "ËĪÉijËIJ", - "ɾ" - ], - [ - "Ġb", 
- "ËĪiÉĽÉľt" - ], - [ - "Êİ", - "os" - ], - [ - "Ġtsh", - "ËĪaiÉľ" - ], - [ - "ĠËĮɪ", - "skËĮaËIJ" - ], - [ - "ĠaÊĬ", - "ÉĻ" - ], - [ - "ĠËĪy", - "æ" - ], - [ - "Ġd", - "yn" - ], - [ - "Ġm", - "ËĪiËIJn" - ], - [ - "ĠËĪÊĮ", - "cʰËIJ" - ], - [ - "Ġs", - "ÉĽ" - ], - [ - "Ġn", - "ËĪy" - ], - [ - "Ġn", - "ËĮÉĽl" - ], - [ - "É¡", - "ɾ" - ], - [ - "Êĥ", - "ËĪe" - ], - [ - "ĠÊĤ", - "ËĮÉĽ" - ], - [ - "ĠËĪÉĽ", - "vɹɪ" - ], - [ - "ËĪÉĽl", - "p" - ], - [ - "ĠbËĪa", - "k" - ], - [ - "Ġ", - "eËIJ" - ], - [ - "Ġf", - "ËĪaËIJ" - ], - [ - "Ġk", - "ÉĽl" - ], - [ - "ĠËĪeËIJ", - "s" - ], - [ - "j", - "ËĪaËIJd" - ], - [ - "Ġl", - "ËĮi" - ], - [ - "mb", - "ɾe" - ], - [ - "k", - "tÉĻ" - ], - [ - "nt", - "a" - ], - [ - "t", - "ËĪu" - ], - [ - "Ġð", - "ËĪat" - ], - [ - "ĠËĪa", - "β" - ], - [ - "ÉĻɹ", - "i" - ], - [ - "ĠkwËĮÉĽ", - "lla" - ], - [ - "Ġb", - "ÉĻn" - ], - [ - "r", - "ËĮÉĽ" - ], - [ - "Ġn", - "ÉĶ" - ], - [ - "ĠÉ¡", - "ËĪɪ" - ], - [ - "ĠËĪa", - "p" - ], - [ - "ɹ", - "ÉĻ" - ], - [ - "ËĪa", - "Éľkh" - ], - [ - "ĠÊIJ", - "ËĪi" - ], - [ - "Ġ", - "ËĪÉijËIJ" - ], - [ - "ɪ", - "É¡ÉĻn" - ], - [ - "Ġw", - "ËĪai" - ], - [ - "Ġp", - "ÉĻt" - ], - [ - "kËIJ", - "a" - ], - [ - "Ġb", - "ËĪÉĽËIJ" - ], - [ - "ËĪeËIJ", - "Êĭ" - ], - [ - "ls", - "ÉĻÊĬ" - ], - [ - "ĠcËĪaËIJh", - "ɪËĮeËIJ" - ], - [ - "Ġk", - "ÉĻn" - ], - [ - "ĠËĮaɪn", - "ÉĻm" - ], - [ - "ËĪuËIJ", - "t" - ], - [ - "Ġh", - "ËĪaÊĬ" - ], - [ - "Ġt", - "ËĪanto" - ], - [ - "ĠhÉIJ", - "z" - ], - [ - "Ġs", - "ËĪÊĮɾ" - ], - [ - "Ġn", - "o" - ], - [ - "Ġt", - "ËĪÉĶËIJ" - ], - [ - "Ġz", - "ËĪaɪ" - ], - [ - "ĠtÉķËĪiÉĽ", - "Éľ" - ], - [ - "Ġko", - "zËĪi" - ], - [ - "Ġk", - "ËĪei" - ], - [ - "ð", - "ËĪÉĶɾ" - ], - [ - "ËĮÉĶ", - "Êģ" - ], - [ - "Ġt", - "ËĪÊĮɾ" - ], - [ - "ĠÊIJ", - "ËĪÉĻ" - ], - [ - "ĠÉķËĪy", - "ÉĽÉľ" - ], - [ - "ĠmËĮÊĬ", - "ÉŁÊ°eËIJ" - ], - [ - "m", - "f" - ], - [ - "Ġv", - "ËĪiËIJdÉľ" - ], - [ - "k", - "ËĪa" - ], - [ - "ĠÉIJ", - "É¡" - ], - [ - "k", - "w" - ], - [ - "ĠÊģ", - "ÉĽ" - ], - [ - "x", - "ÉĻn" - ], - [ - "Ġd", - "ÊĬ" - ], 
- [ - "ĠkËĪÊĮɾ", - "nËĮeËIJ" - ], - [ - "jËĪaËIJd", - "aËIJ" - ], - [ - "Ġf", - "ÉĻ" - ], - [ - "ĠËĮi", - "mp" - ], - [ - "Ġh", - "ɪz" - ], - [ - "Ġ", - "ʰÏĩ" - ], - [ - "ËĪoËIJ", - "ni" - ], - [ - "Ġx", - "ËĪiÉľ" - ], - [ - "ËĪeËIJ", - "sÊĪ" - ], - [ - "Êı", - "bÉľ" - ], - [ - "ËĮÉĶɾ", - "ke" - ], - [ - "ĠÉ¡", - "ËĪÉĻÊĬ" - ], - [ - "ËĪɪ", - "ÊĥÉĻn" - ], - [ - "l", - "es" - ], - [ - "Ġf", - "ËĪiËIJ" - ], - [ - "É¡", - "tÉĻ" - ], - [ - "ËĪeËIJ", - "re" - ], - [ - "Ġv", - "ËĮaËIJ" - ], - [ - "Ġ", - "ËĪeɪ" - ], - [ - "Ġm", - "ËĪuÉĻÉľn" - ], - [ - "ĠÉ¡ËĪÊĬ", - "d" - ], - [ - "ĠmËĮa", - "ɪn" - ], - [ - "z", - "ËĪe" - ], - [ - "ĠlËĪi", - "Éľ" - ], - [ - "Ġm", - "u" - ], - [ - "Ġk", - "ËĮÉĽl" - ], - [ - "Ġj", - "ËĮÉĻh" - ], - [ - "Ġf", - "ËĮÉĶɾ" - ], - [ - "f", - "ɹ" - ], - [ - "Ġk", - "ËĪaɪn" - ], - [ - "ĠËĪÉĴ", - "lsÉĻÊĬ" - ], - [ - "θ", - "ɪÅĭ" - ], - [ - "Ġth", - "ËĪonÉ¡Éľ" - ], - [ - "t", - "ËĪÉij" - ], - [ - "θj", - "o" - ], - [ - "m", - "ËĪÉĶ" - ], - [ - "Ġ", - "os" - ], - [ - "Ġs", - "ÊĬ" - ], - [ - "ĠsËĪÊĮ", - "mÉĻ" - ], - [ - "ĠvËĮÉĽ", - "n" - ], - [ - "n", - "ËĪo" - ], - [ - "ĠËĪak", - "tÊĥuËIJ" - ], - [ - "É£", - "a" - ], - [ - "Ġtʰ", - "i" - ], - [ - "Ġf", - "ËĮi" - ], - [ - "Ġv", - "ËĪÉĽl" - ], - [ - "ĠtËĪu", - "tËIJi" - ], - [ - "x", - "os" - ] - ] - } -} \ No newline at end of file From b70c49502062f0d7f6307ec763218b7f511e8cab Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 13:59:38 -0400 Subject: [PATCH 77/94] remove name from training modes Signed-off-by: Paarth Neekhara --- examples/tts/conf/magpietts/easy_magpietts.yaml | 6 +++--- .../tts/conf/magpietts/easy_magpietts_lhotse.yaml | 6 +++--- nemo/collections/tts/models/easy_magpietts.py | 11 +++++++---- nemo/collections/tts/modules/__init__.py | 6 +----- tests/collections/tts/test_infer_vs_process_batch.py | 1 - 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/examples/tts/conf/magpietts/easy_magpietts.yaml b/examples/tts/conf/magpietts/easy_magpietts.yaml index 
a668686dc28c..c6612499993d 100644 --- a/examples/tts/conf/magpietts/easy_magpietts.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts.yaml @@ -90,10 +90,10 @@ model: # Multi-mode training configuration # The model will randomly select one of the modes for each batch during training. # Each mode has its own task embedding that is prepended to the context. - # During inference, you can specify which mode to use via the 'inference_mode' parameter. + # During inference, you can specify which mode to use via the derived + # 'inference_mode' string: "{text_input_mode}_{streaming_phonemes_delay}_{streaming_speech_delay}". training_modes: - - name: "streaming_0_1" - text_input_mode: "streaming" # Options: "full", "streaming" + - text_input_mode: "streaming" # Options: "full", "streaming" streaming_phonemes_delay: 0 streaming_speech_delay: 1 diff --git a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml index 6eb4d03a98d2..f43814a6a479 100644 --- a/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml +++ b/examples/tts/conf/magpietts/easy_magpietts_lhotse.yaml @@ -85,10 +85,10 @@ model: # Multi-mode training configuration # The model will randomly select one of the modes for each batch during training. # Each mode has its own task embedding that is prepended to the context. - # During inference, you can specify which mode to use via the 'inference_mode' parameter. + # During inference, you can specify which mode to use via the derived + # 'inference_mode' string: "{text_input_mode}_{streaming_phonemes_delay}_{streaming_speech_delay}". 
training_modes: - - name: "streaming_0_1" - text_input_mode: "streaming" # Options: "full", "streaming" + - text_input_mode: "streaming" # Options: "full", "streaming" streaming_phonemes_delay: 0 streaming_speech_delay: 1 diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index e8bb877dfb53..b366a32cd024 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -76,19 +76,24 @@ class TrainingMode: Configuration for a training mode in multi-mode training. Attributes: - name: Unique identifier for this mode (e.g., "full", "streaming_4_8") text_input_mode: Either "full" or "streaming" streaming_phonemes_delay: Delay for phoneme stream (only used in streaming mode) streaming_speech_delay: Delay for speech stream (only used in streaming mode) mode_idx: Index of this mode in the list of modes (used for task embedding lookup) """ - name: str text_input_mode: str streaming_phonemes_delay: int streaming_speech_delay: int mode_idx: int + @property + def name(self) -> str: + """Derived identifier used for inference selection and logging.""" + return ( + f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" + ) + @dataclass class ProcessBatchOutput: @@ -335,7 +340,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): # Create a default training mode for backward compatibility self.training_modes = [ TrainingMode( - name="streaming_4_8", text_input_mode="streaming", streaming_phonemes_delay=4, streaming_speech_delay=8, @@ -347,7 +351,6 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.training_modes = [] for mode_idx, mode_cfg in enumerate(training_modes_cfg): mode = TrainingMode( - name=mode_cfg.name, text_input_mode=mode_cfg.text_input_mode, streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), diff --git 
a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index 866f418dbacd..ceda09492ada 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -14,8 +14,4 @@ import nemo.collections.tts.modules.adapters import nemo.collections.tts.modules.ffn_modules -import nemo.collections.tts.modules.moe_modules -from nemo.collections.tts.modules.tacotron2 import Decoder as Taco2Decoder -from nemo.collections.tts.modules.tacotron2 import Encoder as Taco2Encoder -from nemo.collections.tts.modules.tacotron2 import Postnet as Taco2Postnet -from nemo.collections.tts.modules.waveglow import WaveGlowModule +import nemo.collections.tts.modules.moe_modules \ No newline at end of file diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py index 3741deddf430..0ea66e2870ef 100644 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ b/tests/collections/tts/test_infer_vs_process_batch.py @@ -73,7 +73,6 @@ def build_minimal_config(codecmodel_path: str) -> OmegaConf: # Training modes (single streaming mode) 'training_modes': [ { - 'name': 'streaming_4_8', 'text_input_mode': 'streaming', 'streaming_phonemes_delay': 4, 'streaming_speech_delay': 8, From 00acdb475a3f1b9c94e9934f9180492f9416b96d Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Mon, 9 Mar 2026 19:47:54 +0000 Subject: [PATCH 78/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 4 +--- nemo/collections/tts/modules/__init__.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index b366a32cd024..d999fcc31739 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -90,9 +90,7 @@ class TrainingMode: @property def name(self) -> str: """Derived 
identifier used for inference selection and logging.""" - return ( - f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" - ) + return f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" @dataclass diff --git a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index ceda09492ada..c4dffba34215 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -14,4 +14,4 @@ import nemo.collections.tts.modules.adapters import nemo.collections.tts.modules.ffn_modules -import nemo.collections.tts.modules.moe_modules \ No newline at end of file +import nemo.collections.tts.modules.moe_modules From 49bd6ff7e836bc7fdaeb47fa6caba554b4eefce5 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Mon, 9 Mar 2026 14:14:44 -0700 Subject: [PATCH 79/94] removing some debugging statements Signed-off-by: Shehzeen Hussain --- nemo/collections/tts/data/text_to_speech_dataset_lhotse.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index 356cc8ca4d15..c1ac9975d215 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -205,10 +205,8 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: self.eos_id = self.bos_id + 1 self.pad_id = self.text_tokenizer.pad + # initialize the phoneme tokenizer once per dataset/worker when config is available. 
if self.phoneme_tokenizer is None and self.phoneme_tokenizer_config is not None: - worker_info = torch.utils.data.get_worker_info() - worker_id = worker_info.id if worker_info is not None else 0 - logging.info(f"Worker {worker_id} initializing phoneme tokenizer...") self.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.phoneme_tokenizer_config) # define list to store batched information From fdbf72d059493ff0db2b7be5fa10242d22582e4b Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Mon, 9 Mar 2026 19:24:15 -0700 Subject: [PATCH 80/94] new base class (#68) * new base class Signed-off-by: Paarth Neekhara * Magpie models refactoring Signed-off-by: Paarth Neekhara --------- Signed-off-by: Paarth Neekhara --- examples/tts/magpietts_inference.py | 8 +- nemo/collections/tts/models/__init__.py | 4 + nemo/collections/tts/models/base_magpietts.py | 569 ++++ nemo/collections/tts/models/easy_magpietts.py | 2336 +---------------- .../tts/models/easy_magpietts_inference.py | 2018 ++++++++++++++ nemo/collections/tts/models/magpietts.py | 621 +---- .../tts/modules/magpietts_inference/utils.py | 3 + 7 files changed, 2638 insertions(+), 2921 deletions(-) create mode 100644 nemo/collections/tts/models/base_magpietts.py create mode 100644 nemo/collections/tts/models/easy_magpietts_inference.py diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index d38c093eb1de..f1ed60c27428 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -559,12 +559,8 @@ def main(argv=None): model_inference_parameters[field_name] = arg_from_cmdline if "max_decoder_steps" not in model_inference_parameters: - if args.longform_mode in {'always', 'auto'}: - model_inference_parameters["max_decoder_steps"] = args.longform_max_decoder_steps - elif args.is_decoder_only_model: + if args.is_decoder_only_model: model_inference_parameters["max_decoder_steps"] = 300 - else: - model_inference_parameters["max_decoder_steps"] = 440 
inference_config = InferenceConfig( model_inference_parameters=ModelInferenceParameters.from_dict(model_inference_parameters), @@ -581,8 +577,6 @@ def main(argv=None): phoneme_sampling_method=args.phoneme_sampling_method, dropout_text_input=args.dropout_text_input, legacy_context_stacking=args.legacy_context_stacking, - longform_mode=args.longform_mode, - longform_word_threshold=args.longform_word_threshold, ) eval_config = EvaluationConfig( diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 0783c79bacab..28d49bca1c81 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -14,7 +14,9 @@ from nemo.collections.tts.models.aligner import AlignerModel from nemo.collections.tts.models.audio_codec import AudioCodecModel +from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel +from nemo.collections.tts.models.easy_magpietts_inference import EasyMagpieTTSInferenceModel from nemo.collections.tts.models.easy_magpietts_preference_optimization import EasyMagpieTTSModelOnlinePO from nemo.collections.tts.models.fastpitch import FastPitchModel from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL @@ -30,6 +32,7 @@ __all__ = [ "AlignerModel", "AudioCodecModel", + "BaseMagpieTTSModel", "FastPitchModel", "FastPitchModel_SSL", "SSLDisentangler", @@ -37,6 +40,7 @@ "InferBatchOutput", "MagpieTTSModel", "EasyMagpieTTSModel", + "EasyMagpieTTSInferenceModel", "EasyMagpieTTSModelOnlinePO", "MagpieTTSModelOfflinePODataGen", "MagpieTTSModelOfflinePO", diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py new file mode 100644 index 000000000000..f3eacb945051 --- /dev/null +++ b/nemo/collections/tts/models/base_magpietts.py @@ -0,0 +1,569 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional + +import numpy as np +import torch +from torch.utils.data import get_worker_info + +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( + instantiate_phoneme_tokenizer, + setup_tokenizers, +) +from nemo.collections.tts.modules.magpietts_modules import ( + SpecialAudioToken, + cosine_schedule, +) +from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.core.classes import ModelPT +from nemo.utils import logging + + +def worker_init_fn(worker_id): + """Per-worker init for DataLoader workers. + + Sets up tokenizers for the dataset (text and optionally phoneme) + when using multiprocessing. + """ + logging.info(f"Worker {worker_id} initializing...") + worker_info = get_worker_info() + dataset = worker_info.dataset + tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) + dataset.text_tokenizer = tokenizer + if hasattr(dataset, 'phoneme_tokenizer_config'): + dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) + + +class BaseMagpieTTSModel(ModelPT): + """Base class for MagpieTTS models. + + Contains shared functionality for audio codec helpers, special token + manipulation, local transformer functions, and state dict handling. + Subclasses (EasyMagpieTTSModel, MagpieTTSModel) provide their own + ``__init__``, data loading, training/inference logic, etc. 
+ """ + + # ------------------------------------------------------------------ + # State-dict exclusion – subclasses override + # ------------------------------------------------------------------ + + def _get_state_dict_keys_to_exclude(self) -> List[str]: + """Return list of key substrings to exclude from checkpoint save/load. + + Subclasses should override to specify model-specific exclusions + (e.g. codec model, eval models). + """ + return ['_codec_model'] + + # ------------------------------------------------------------------ + # state_dict / load_state_dict / optimizer param groups + # ------------------------------------------------------------------ + + def state_dict(self, destination=None, prefix='', keep_vars=False): + if hasattr(self, '_no_state_dict') and self._no_state_dict: + return {} + state_dict = super().state_dict(destination, prefix, keep_vars) + keys_substrings_to_exclude = self._get_state_dict_keys_to_exclude() + for key in list(state_dict.keys()): + if any(substring in key for substring in keys_substrings_to_exclude): + del state_dict[key] + return state_dict + + def load_state_dict(self, state_dict, strict=True): + if not strict: + super().load_state_dict(state_dict, strict=False) + modules_to_skip = self._get_state_dict_keys_to_exclude() + for name, child in self.named_children(): + if name in modules_to_skip: + continue + if any(param.numel() > 0 for param in child.parameters()): + new_state_dict = {} + for key in state_dict.keys(): + name_with_dot = f"{name}." 
+ if key.startswith(name_with_dot): + new_state_dict[key[len(name_with_dot):]] = state_dict[key] + child.load_state_dict(new_state_dict) + + def setup_optimizer_param_groups(self): + """Exclude frozen eval/inference-only models from the optimizer.""" + modules_to_exclude = set(self._get_state_dict_keys_to_exclude()) + + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude: + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] + + logging.info( + f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " + f"{len(excluded_param_ids)} params excluded (eval models)" + ) + + self._optimizer_param_groups = [{"params": trainable_params}] + + # ------------------------------------------------------------------ + # Special token helpers + # ------------------------------------------------------------------ + + def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): + # codes: (B, C, T') + codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) + codes_len = codes_len + num_eos_tokens + for idx in range(codes.size(0)): + codes[idx, :, codes_len[idx] - 1] = eos_id + return codes, codes_len + + def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): + # codes: (B, C, T') + codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) + codes_len = codes_len + num_bos_tokens + codes, codes_len = self.add_eos_token( + codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens + ) + return codes, codes_len + + def remove_bos_token(self, codes, codes_len, num_tokens=1): + codes = codes[:, :, num_tokens:] + codes_len = codes_len - num_tokens + return codes, codes_len + + def remove_embedded_bos_token(self, embedded, embedded_len): + embedded = embedded[:, 1:, :] + embedded_len = 
embedded_len - 1 + return embedded, embedded_len + + def remove_eos_token(self, codes, codes_len): + codes_len = codes_len - 1 + codes = codes[:, :, :-1] + mask = get_mask_from_lengths(lengths=codes_len) + codes = codes * mask.unsqueeze(1) + return codes, codes_len + + def remove_embedded_eos_token(self, embedded, embedded_len): + # embedded: (B, T', D) + embedded_len = embedded_len - 1 + embedded = embedded[:, :-1, :] + mask = get_mask_from_lengths(lengths=embedded_len) + embedded = embedded * mask.unsqueeze(2) + return embedded, embedded_len + + def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): + codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) + codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) + return codes, codes_len + + # ------------------------------------------------------------------ + # Audio codec helpers + # ------------------------------------------------------------------ + + def audio_to_codes(self, audio, audio_len, sample_rate=None): + self._codec_model.eval() + with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): + codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) + return codes, codes_len + + def codes_to_audio(self, codes, codes_len): + # codes: (B, C, T') + self._codec_model.eval() + with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): + if self._codec_converter is not None: + codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) + audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) + return audio, audio_len, codes + + # ------------------------------------------------------------------ + # Padding / forbidden-logits helpers + # ------------------------------------------------------------------ + + def pad_audio_codes(self, audio_codes: torch.Tensor): + 
"""Pads the time dimension of the audio codes to a multiple of the frame stacking factor. + + Args: + audio_codes: (B, C, T) + Returns: + (B, C, T_padded) + """ + T = audio_codes.size(2) + T_padded = int(np.ceil(T / self.frame_stacking_factor) * self.frame_stacking_factor) + num_pad = T_padded - T + audio_codes = torch.nn.functional.pad(input=audio_codes, pad=(0, num_pad)) + return audio_codes + + def clear_forbidden_logits(self, logits: torch.Tensor, forbid_audio_eos: bool = False) -> torch.Tensor: + """Sets logits of forbidden tokens to ``-inf`` so they will never be sampled. + + Specifically, we forbid sampling of all special tokens except AUDIO_EOS + which is allowed by default. + + Args: + logits: (B, C, num_audio_tokens_per_codebook) + forbid_audio_eos: If True, also forbid AUDIO_EOS tokens from being sampled. + """ + logits[ + :, + :, + SpecialAudioToken.get_forbidden_tokens(self.codebook_size, forbid_audio_eos=forbid_audio_eos), + ] = float('-inf') + return logits + + # ------------------------------------------------------------------ + # MaskGit helpers + # ------------------------------------------------------------------ + + def maskgit_create_random_mask(self, codes): + """Creates a mask where True indicates positions that should be replaced with MASK_TOKEN.""" + B, C, T = codes.shape + rand_values = torch.rand(B, T, device=codes.device) + frac_masked = cosine_schedule(rand_values) + n_masked = torch.ceil(frac_masked * C).long() + random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) + mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) + mask = mask_indices < n_masked.view(B, 1, T) + mask = torch.gather(mask, 1, random_permutations) + return mask + + def maskgit_apply_random_mask(self, codes): + """Randomly replaces some codes with MASK_TOKEN following the cosine schedule.""" + mask = self.maskgit_create_random_mask(codes) + codes_with_mask = torch.where(mask, self.mask_token_id, codes) + return 
codes_with_mask, mask + + # ------------------------------------------------------------------ + # Local transformer – training + # ------------------------------------------------------------------ + + def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): + """Predicts the logits for all codebooks using the local transformer. + + Used in both autoregressive (AR) and MaskGit (MG) modes during + training and validation (not inference/sampling). + + The sequence layout is slightly different between AR and MG modes, as shown below + (using an 8-codebook setup as an example):: + + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | Input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + | | Latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + | Seq. Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ + + Args: + dec_out: (B, T', E) + audio_codes_target: (B, C, T') + targets_offset_by_one: if False, target for index 0 is codebook 0 (AR); + if True, target for index 1 is codebook 0 (MaskGit). 
+ """ + C = self.num_audio_codebooks + dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) + local_transformer_input = [dec_out_all] + audio_codes_target = self.pad_audio_codes(audio_codes_target).long() + for fs_index in range(self.frame_stacking_factor): + for codebook_num in range(C): + codes = audio_codes_target[:, codebook_num, fs_index :: self.frame_stacking_factor] + codes = codes.reshape(-1) + codebook_embedding = self.audio_embeddings[codebook_num + fs_index * C](codes) + codebook_embedding = self.audio_in_projection(codebook_embedding) + local_transformer_input.append(codebook_embedding) + + local_transformer_input = torch.stack(local_transformer_input, dim=1) + local_transformer_input = self.local_transformer_in_projection(local_transformer_input) + _mask = torch.ones( + local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device + ) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] + if not targets_offset_by_one: + local_transformer_output = local_transformer_output[:, :-1, :] + else: + local_transformer_output = local_transformer_output[:, 1:, :] + + local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output) + + all_code_logits = [] + for fs_index in range(self.frame_stacking_factor): + for codebook_num in range(audio_codes_target.size(1)): + codebook_logits = self.local_transformer_out_projections[codebook_num + fs_index * C]( + local_transformer_output[:, codebook_num + fs_index * C, :] + ) + all_code_logits.append(codebook_logits) + all_code_logits = torch.cat(all_code_logits, dim=1) + + all_code_logits = all_code_logits.view( + audio_codes_target.size(0), audio_codes_target.size(2) // self.frame_stacking_factor, -1 + ) + + return all_code_logits + + # ------------------------------------------------------------------ + # Local transformer – AR sampling + # 
------------------------------------------------------------------ + + def local_transformer_sample_autoregressive( + self, + dec_output: torch.Tensor, + temperature: float = 0.7, + topk: int = 80, + unfinished_items: Dict[int, bool] = {}, + finished_items: Dict[int, bool] = {}, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_kv_cache: bool = True, + forbid_audio_eos: bool = False, + ) -> torch.Tensor: + """Sample audio codes autoregressively across codebooks using the local transformer. + + Uses multinomial sampling with temperature, top-k, and + classifier-free guidance (CFG). + + Args: + dec_output: Decoder output tensor (B, E). + temperature: Sampling temperature. When <= 0, uses argmax. + topk: Number of top-probability tokens to consider. + unfinished_items: Batch indices that have not completed generation (EOS forbidden). + finished_items: Batch indices that are completed (EOS forced). + use_cfg: Whether to use classifier-free guidance (doubled batch). + cfg_scale: Scale factor for CFG. + use_kv_cache: Whether to use key-value caching in the local transformer. + forbid_audio_eos: Whether to globally forbid audio EOS. + + Returns: + Sampled audio codes (B, num_codebooks, frame_stacking_factor). 
+ """ + self.local_transformer.reset_cache(use_cache=use_kv_cache) + dec_output = dec_output.unsqueeze(1) # (B, 1, E) + local_transformer_input = self.local_transformer_in_projection(dec_output) + all_preds = [] + for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): + _mask = torch.ones( + local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device + ) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] + + lt_out_for_proj = self.local_transformer_audio_out_projection(local_transformer_output[:, -1, :]) + codebook_logits = self.local_transformer_out_projections[codebook_num](lt_out_for_proj) + + if use_cfg: + actual_batch_size = codebook_logits.size(0) // 2 + conditional_logits = codebook_logits[:actual_batch_size] + unconditional_logits = codebook_logits[actual_batch_size:] + cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits + codebook_logits[:actual_batch_size] = cfg_logits + + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + + for item_idx in unfinished_items: + codebook_logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + codebook_logits[item_idx, :] = float('-inf') + codebook_logits[item_idx, self.audio_eos_id] = 0.0 + + codebook_logits = self.clear_forbidden_logits( + codebook_logits.unsqueeze(1), forbid_audio_eos=forbid_audio_eos + ).squeeze(1) + + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + if temperature <= 0.0: + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) + else: + codebook_probs = 
torch.softmax(codebook_logits_rescored / temperature, dim=-1) + codebook_preds = torch.multinomial(codebook_probs, 1) + + if use_cfg: + codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] + all_preds.append(codebook_preds) + + next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1) + next_local_transformer_input = self.audio_in_projection(next_local_transformer_input) + next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) + local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) + + all_preds = torch.cat(all_preds, dim=1) # (B, num_codebooks * frame_stacking_factor) + all_preds = all_preds.reshape(-1, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1) + if use_cfg: + all_preds = all_preds[:actual_batch_size] + + return all_preds + + # ------------------------------------------------------------------ + # Local transformer – MaskGit sampling + # ------------------------------------------------------------------ + + def local_transformer_sample_maskgit( + self, + dec_output: torch.Tensor, + temperature: float = 0.7, + topk: int = 80, + unfinished_items: Dict[int, bool] = {}, + finished_items: Dict[int, bool] = {}, + use_cfg: bool = False, + cfg_scale: float = 1.0, + n_steps: int = 3, + noise_scale: float = 0.0, + fixed_schedule: Optional[List[int]] = None, + dynamic_cfg_scale: bool = False, + sampling_type: Optional[str] = None, + forbid_audio_eos: bool = False, + ) -> torch.Tensor: + """Sample audio codes using MaskGit-like iterative prediction with the local transformer. + + If frame-stacking is enabled, the codes for all frames in the stack + are sampled, treated as one long sequence. + + Args: + dec_output: Decoder output tensor (B, E). + temperature: Sampling temperature. + topk: Number of top-probability tokens to consider. 
+ unfinished_items: Batch indices that have not completed generation. + finished_items: Batch indices that are completed. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: Scale factor for CFG. + n_steps: Number of iterative refinement steps. + noise_scale: Scale factor for noise added to confidence scores. + fixed_schedule: Fixed schedule for number of tokens to unmask per step. + dynamic_cfg_scale: Whether to dynamically adjust CFG scale. + sampling_type: Sampling strategy (``"default"``, ``"causal"``, + ``"purity_causal"``, ``"purity_default"``). + forbid_audio_eos: Whether to globally forbid audio EOS. + + Returns: + Sampled audio codes (B, num_codebooks, frame_stacking_factor). + """ + device = dec_output.device + self.local_transformer.reset_cache(use_cache=False) + dec_output = dec_output.unsqueeze(1) + local_transformer_input_init = self.local_transformer_in_projection(dec_output) + codebook_seq_len = self.num_audio_codebooks * self.frame_stacking_factor + B = dec_output.size(0) + + min_confidence = 0 + max_confidence = 5 + confidences = min_confidence * torch.ones(B, codebook_seq_len, device=device) + codes = self.mask_token_id * torch.ones((B, codebook_seq_len), device=device, dtype=torch.long) + sampled_codes = codes.clone() + if fixed_schedule is not None: + n_steps = len(fixed_schedule) + for step in range(n_steps): + progress = step / n_steps + frac_masked = cosine_schedule(torch.tensor(progress)) + if sampling_type == "causal" or sampling_type == "purity_causal": + frac_masked = torch.ones_like(frac_masked) * (1.0 - progress) + if fixed_schedule is None: + n_masked = torch.ceil(codebook_seq_len * frac_masked).long() + else: + n_masked = codebook_seq_len - fixed_schedule[step] + n_unmasked = codebook_seq_len - n_masked + + if sampling_type == "causal" or sampling_type == "purity_causal": + n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1)) + confidences[:, n_frames_to_allow * self.num_audio_codebooks:] = 
min_confidence - 1 + + _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) + if use_cfg: + actual_batch_size = topk_indices.size(0) // 2 + assert ( + topk_indices[actual_batch_size:] == topk_indices[:actual_batch_size] + ).all(), "Topk indices are not the same for conditional and unconditional codes" + + unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) + codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) + + local_transformer_input = local_transformer_input_init + for codebook_num in range(codebook_seq_len): + next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1) + next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) + local_transformer_input = torch.cat( + [local_transformer_input, next_local_transformer_input], dim=1 + ) + + _mask = torch.ones(B, codebook_seq_len + 1, device=device) + local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] + + logits = [] + for codebook_num in range(codebook_seq_len): + codebook_logits = self.local_transformer_out_projections[codebook_num]( + local_transformer_output[:, codebook_num + 1, :] + ) + logits.append(codebook_logits) + logits = torch.stack(logits, dim=1) + + if use_cfg: + actual_batch_size = logits.size(0) // 2 + conditional_logits = logits[:actual_batch_size] + unconditional_logits = logits[actual_batch_size:] + if not dynamic_cfg_scale: + current_cfg_scale = cfg_scale + else: + progress = step / (n_steps - 1) + interp = progress + current_cfg_scale = (cfg_scale - 1) * interp + 1.0 + cfg_logits = current_cfg_scale * conditional_logits + (1.0 - current_cfg_scale) * unconditional_logits + logits[:actual_batch_size] = cfg_logits + + logits = self.clear_forbidden_logits(logits, forbid_audio_eos=forbid_audio_eos) + + for item_idx in unfinished_items: + logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + 
logits[item_idx, :, :] = float('-inf') + logits[item_idx, :, self.audio_eos_id] = 0.0 + + logits_topk = torch.topk(logits, topk, dim=-1)[0] + indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) + logits_rescored = logits.clone() + logits_rescored[indices_to_remove] = float('-inf') + probs = torch.softmax(logits_rescored / temperature, dim=-1) + sampled_codes = torch.multinomial(probs.view(B * codebook_seq_len, -1), 1).view(B, codebook_seq_len) + if use_cfg: + sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] + probs[actual_batch_size:] = probs[:actual_batch_size] + if sampling_type != "purity_causal" and sampling_type != "purity_default": + confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) + else: + confidences = probs.max(dim=2)[0] + sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) + if noise_scale > 0.0: + noise = ( + (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) + ) + confidences += noise + confidences[actual_batch_size:] = confidences[:actual_batch_size] + confidence_eps = 0.1 + assert ( + confidences.max() + confidence_eps < max_confidence + ), f"Predicted confidence is approaching max_confidence: {confidences.max()}" + confidences.scatter_( + index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) + ) + codes = sampled_codes + assert not ( + codes == self.mask_token_id + ).any(), "Codes contain mask tokens after completion of MaskGit sampling" + + codes = codes.reshape(B, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1) + + if use_cfg: + codes = codes[:actual_batch_size] + return codes diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index d999fcc31739..115b8e2d6a99 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -14,10 +14,8 @@ import json import os 
import random -import time from dataclasses import dataclass -from functools import partial -from typing import Any, Dict, List, Optional, Sequence, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np import soundfile as sf @@ -28,9 +26,7 @@ from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger from omegaconf import DictConfig from torch import nn -from torch.utils.data import get_worker_info from torch.utils.data.distributed import DistributedSampler -from transformers import AutoConfig, AutoModel, AutoModelForCausalLM import nemo.collections.asr as nemo_asr from nemo.collections.asr.metrics.wer import word_error_rate @@ -41,15 +37,15 @@ instantiate_phoneme_tokenizer, setup_tokenizers, ) -from nemo.collections.tts.models import AudioCodecModel -from nemo.collections.tts.modules import transformer_2501 -from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter -from nemo.collections.tts.modules.magpietts_modules import ( - CharAwareSubwordEncoder, - LocalTransformerType, - SpecialAudioToken, - cosine_schedule, +from nemo.collections.tts.models.base_magpietts import worker_init_fn +from nemo.collections.tts.models.easy_magpietts_inference import ( + EasyMagpieTTSInferenceModel, + InferBatchOutput, + StreamingFinalizeOutput, + StreamingState, + TrainingMode, ) +from nemo.collections.tts.modules.magpietts_modules import LocalTransformerType from nemo.collections.tts.parts.utils.helpers import ( compute_utmos_scores_from_filepaths, get_mask_from_lengths, @@ -58,8 +54,6 @@ transcribe_with_whisper, transcribe_with_whisper_from_filepaths, ) -from nemo.core.classes import ModelPT -from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging try: @@ -70,29 +64,6 @@ HAVE_UTMOSV2 = False -@dataclass -class TrainingMode: - """ - Configuration for a training mode in multi-mode training. 
- - Attributes: - text_input_mode: Either "full" or "streaming" - streaming_phonemes_delay: Delay for phoneme stream (only used in streaming mode) - streaming_speech_delay: Delay for speech stream (only used in streaming mode) - mode_idx: Index of this mode in the list of modes (used for task embedding lookup) - """ - - text_input_mode: str - streaming_phonemes_delay: int - streaming_speech_delay: int - mode_idx: int - - @property - def name(self) -> str: - """Derived identifier used for inference selection and logging.""" - return f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" - - @dataclass class ProcessBatchOutput: """ @@ -100,19 +71,19 @@ class ProcessBatchOutput: Attributes: loss: Total combined loss (codebook_loss + phoneme_loss + local_transformer_loss) - codebook_loss: Loss for audio codebook prediction - phoneme_loss: Loss for phoneme prediction (None if phoneme_tokenizer is not used) - local_transformer_loss: Loss from local transformer (None if not using local transformer) - local_transformer_logits: Logits from local transformer, shape (B, T', num_codebooks * num_tokens_per_codebook) - logits: Predicted logits from the main decoder, shape (B, T', num_codebooks * num_tokens_per_codebook) - phoneme_logits: Predicted phoneme logits, shape (B, T', phoneme_stacking_factor * phoneme_vocab_size). None if no phoneme tokenizer. - phoneme_tokens_target: Target phoneme tokens (shifted), shape (B, S, T'). None if no phoneme tokenizer. - phoneme_tokens_lens_target: Length of target phoneme tokens (B,). None if no phoneme tokenizer. 
- audio_codes_target: Target audio codes for the decoder, shape (B, C, T') - audio_codes_lens_target: Length of target audio codes for each batch item, shape (B,) - context_audio_codes: Audio codes extracted from context audio, shape (B, C, T') - context_audio_codes_lens: Length of context audio codes for each batch item, shape (B,) - selected_training_mode: Name of the selected training mode (None if multi_mode_training is disabled) + codebook_loss: Cross-entropy loss for parallel audio codebook prediction + phoneme_loss: Cross-entropy loss for phoneme prediction (None if no phoneme tokenizer) + local_transformer_loss: Loss from local transformer (None if not used) + local_transformer_logits: Logits from local transformer (None if not used) + logits: Predicted logits for audio codes (B, T', num_codebooks * num_tokens_per_codebook) + phoneme_logits: Predicted logits for phoneme tokens (None if no phoneme tokenizer) + phoneme_tokens_target: Target phoneme tokens for loss computation + phoneme_tokens_lens_target: Lengths of target phoneme tokens + audio_codes_target: Target audio codes for loss computation (B, C, T'-1) + audio_codes_lens_target: Lengths of target audio codes (B,) + context_audio_codes: Processed context audio codes (B, C, T') + context_audio_codes_lens: Length of processed context audio codes (B,) + selected_training_mode: Name of the training mode used for this batch (e.g., "streaming_4_8") """ loss: torch.Tensor @@ -128,260 +99,22 @@ class ProcessBatchOutput: audio_codes_lens_target: torch.Tensor context_audio_codes: torch.Tensor context_audio_codes_lens: torch.Tensor - selected_training_mode: Optional[str] = None + selected_training_mode: Optional[str] -@dataclass -class StreamingState: +class EasyMagpieTTSModel(EasyMagpieTTSInferenceModel): """ - State for streaming TTS inference with batch support. - - This dataclass maintains all the necessary state for autoregressive streaming - generation, allowing text tokens to be fed incrementally. 
Supports arbitrary - batch sizes where each batch item can have different context lengths and be - in different phases. - - The streaming operates in four phases (per batch item): - 1. Context phase (context_position < full_context_lens): Processing remaining context - 2. Prompt phase (text_tokens_seen < phoneme_delay): Only text, no predictions - 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): Phoneme predictions only - 4. Audio phase (text_tokens_seen >= speech_delay): Both phoneme and audio predictions - - Attributes: - batch_size: Number of items in the batch. - past_key_values: KV cache from the transformer for efficient autoregressive decoding. - cache_seq_len: Current sequence length in the cache. - all_predictions: List of predicted audio codes at each timestep, each tensor is (B, C, S) unstacked. - all_phoneme_predictions: List of predicted phoneme tokens at each timestep, each tensor is (B, phoneme_stacking_factor). - context_audio_codes: Processed context audio codes with special tokens. - context_audio_codes_lens: Length of context audio codes. - context_lens: Total context length (task_embedding + context_audio + context_text). - full_context_embedding: Full context embedding for each batch item (B, T_max_context, E). - full_context_lens: Full context length for each batch item (B,). - context_position: How much context has been processed per batch item (B,). - text_tokens_seen: Number of text tokens processed so far per batch item (B,). - phoneme_steps: Number of phoneme prediction steps taken per batch item (B,). - audio_steps: Number of audio prediction steps taken per batch item (B,). - phoneme_stream_ended: Whether the phoneme stream has ended per batch item (B,) bool tensor. - phoneme_eos_detected: Whether the phoneme EOS has been predicted per batch item (B,) bool tensor. - finished: Whether generation is complete per batch item (B,) bool tensor. - device: Device tensors are on. 
- training_mode: The training mode being used for inference. - use_cfg: Whether classifier-free guidance is enabled. - cfg_scale: CFG scale factor. - use_local_transformer: Whether to use local transformer for inference. - temperature: Sampling temperature. - topk: Top-k sampling parameter. - dummy_context_embedding_unconditional: Unconditional embedding for CFG (if enabled). - last_hidden: Last hidden state from transformer. - text_finished: Whether text input has finished per batch item (B,) bool tensor. - phoneme_input_type: 'gt' or 'pred' for phoneme tokens. - phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. - last_phoneme_tokens: Last predicted phoneme tokens (B, phoneme_stacking_factor). - last_audio_codes: Last predicted audio codes (B, num_codebooks). - audio_prediction_start_idx: Global frame index where audio predictions start per batch item (B,). - audio_prediction_end_idx: Global frame index where audio predictions end per batch item (B,), -1 if not ended. - phoneme_prediction_start_idx: Global step index where phoneme predictions start per batch item (B,). - phoneme_prediction_end_idx: Global step index where phoneme predictions end per batch item (B,), -1 if not ended. 
- """ - - batch_size: int - past_key_values: Optional[Tuple] - cache_seq_len: int - all_predictions: List[torch.Tensor] - all_phoneme_predictions: List[torch.Tensor] - context_audio_codes: torch.Tensor - context_audio_codes_lens: torch.Tensor - context_lens: torch.Tensor - full_context_embedding: torch.Tensor - full_context_lens: torch.Tensor - context_position: torch.Tensor - text_tokens_seen: torch.Tensor - phoneme_steps: torch.Tensor - audio_steps: torch.Tensor - phoneme_stream_ended: torch.Tensor - phoneme_eos_detected: torch.Tensor - finished: torch.Tensor - device: torch.device - training_mode: TrainingMode - use_cfg: bool - cfg_scale: float - use_local_transformer: bool - temperature: float - topk: int - dummy_context_embedding_unconditional: Optional[torch.Tensor] - last_hidden: torch.Tensor - text_finished: torch.Tensor - phoneme_input_type: str - phoneme_sampling_method: str - last_phoneme_tokens: Optional[torch.Tensor] - last_audio_codes: Optional[torch.Tensor] - audio_prediction_start_idx: torch.Tensor - audio_prediction_end_idx: torch.Tensor - phoneme_prediction_start_idx: torch.Tensor - phoneme_prediction_end_idx: torch.Tensor - gt_phoneme_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT embeddings - gt_phoneme_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking - gt_audio_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT audio embeddings - gt_audio_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking - - -@dataclass -class StreamingFinalizeOutput: - """Output from streaming_finalize containing audio and phoneme predictions.""" + Magpie-TTS Model Decoder Only Model with training support. 
- audio: torch.Tensor # (B, max_audio_len) generated audio waveform - audio_len: torch.Tensor # (B,) length of audio per batch item - audio_codes: torch.Tensor # (B, num_codebooks, T) generated audio codes - audio_codes_len: torch.Tensor # (B,) length of codes per batch item - phoneme_tokens: List[List[int]] # List of phoneme token sequences per batch item - phoneme_text: List[str] # Decoded phoneme strings per batch item - - -@dataclass -class InferBatchOutput: - """Output dataclass for EasyMagpieTTS infer_batch method.""" - - predicted_audio: torch.Tensor # (B, T_audio) - predicted_audio_lens: torch.Tensor # (B,) - predicted_codes: torch.Tensor # (B, num_codebooks, T_frames) - predicted_codes_lens: torch.Tensor # (B,) - rtf_metrics: Dict[str, Any] - predicted_phoneme_tokens: Optional[torch.Tensor] = None # (B, phoneme_stacking_factor, T_phoneme_steps) - predicted_phoneme_tokens_lens: Optional[torch.Tensor] = None # (B,) number of valid phoneme steps per item - phoneme_prediction_start_idx: Optional[torch.Tensor] = None # (B,) start index into predicted_phoneme_tokens - - -def worker_init_fn(worker_id): - # For mp.set_start_method("spawn", force=True) - # The dataset class should be picklable, so we initialize non-picklable objects here - logging.info(f"Worker {worker_id} initializing...") - worker_info = get_worker_info() - dataset = worker_info.dataset # Get the dataset instance in this worker - tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) - dataset.text_tokenizer = tokenizer - if hasattr(dataset, 'phoneme_tokenizer_config'): - dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) - - -class EasyMagpieTTSModel(ModelPT): - """ - Magpie-TTS Model Decoder Only Model - audio/text + Subclasses EasyMagpieTTSInferenceModel to add training_step, validation_step, + process_batch, data loading, and training-specific configuration (loss weights, + phoneme corruption, eval models for validation 
metrics). """ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): - self.world_size = 1 - if trainer is not None: - self.world_size = trainer.num_nodes * trainer.num_devices - - # load codec - codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) - self.sample_rate = codec_model.sample_rate - self.output_sample_rate = codec_model.output_sample_rate - - if hasattr(codec_model, "discriminator"): - # del codec discriminator to free memory - del codec_model.discriminator - - # Set up codebook configuration - vector_quantizer = cfg.get('vector_quantizer') - if vector_quantizer is not None: - vector_quantizer = instantiate(vector_quantizer) - num_audio_codebooks = vector_quantizer.num_codebooks - codebook_size = vector_quantizer.codebook_size - codec_converter = VectorQuantizerIndexConverter( - vector_quantizer_original=codec_model.vector_quantizer, - vector_quantizer_new=vector_quantizer, - ) - data_num_audio_codebooks = codec_model.vector_quantizer.num_codebooks - else: - num_audio_codebooks = codec_model.num_codebooks - data_num_audio_codebooks = num_audio_codebooks - codebook_size = codec_model.codebook_size - codec_converter = None - - # The dataloader needs to know the number of codebooks that the context codes were stored in - # In the case where there are no context codes saved, and there is no context audio (in the text context path), - # We create a dummy context code tensor that is only [context_BOS, context_EOS] that is repeated for - # data_num_audio_codebooks - self.data_num_audio_codebooks = data_num_audio_codebooks - self.num_audio_codebooks = num_audio_codebooks - self.codebook_size = codebook_size - - self.codec_model_samples_per_frame = codec_model.samples_per_frame - # Our codebooks start with actual audio codec tokens, followed by special tokens. - # The `forced_*` options are for backward compatibility for models trained with older code. 
- # Our codebooks start with actual audio codec tokens, followed by special tokens. - # The `forced_*` options are for backward compatibility for models trained with older code. - get_token_index = partial(SpecialAudioToken.get_index, base_codebook_size=self.codebook_size) - self.audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_BOS) - self.audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_EOS) - self.context_audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_BOS) - self.context_audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_EOS) - self.mask_token_id = get_token_index(SpecialAudioToken.MASK_TOKEN) - self.num_all_tokens_per_codebook = self.codebook_size + len(SpecialAudioToken) - self.use_bpe_char_tokenizer = cfg.get('use_bpe_char_tokenizer', False) - - # If specified, use this as the text conditioning tokenizer. Otherwise, use the first tokenizer. - self.text_conditioning_tokenizer_name = cfg.get('text_conditioning_tokenizer_name', None) - if self.text_conditioning_tokenizer_name is None: - self.text_conditioning_tokenizer_name = list(cfg.text_tokenizers.keys())[0] - - self.cfg_unconditional_prob = cfg.get('cfg_unconditional_prob', 0.0) - - # Multi-mode training configuration - # The model trains with multiple text input modes (full, streaming with various delays) - # Each mode has its own task embedding that is prepended to the context - training_modes_cfg = cfg.get('training_modes', None) - if training_modes_cfg is None: - # Create a default training mode for backward compatibility - self.training_modes = [ - TrainingMode( - text_input_mode="streaming", - streaming_phonemes_delay=4, - streaming_speech_delay=8, - mode_idx=0, - ) - ] - - else: - self.training_modes = [] - for mode_idx, mode_cfg in enumerate(training_modes_cfg): - mode = TrainingMode( - text_input_mode=mode_cfg.text_input_mode, - streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), - streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 
0), - mode_idx=mode_idx, - ) - self.training_modes.append(mode) - - logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") - for mode in self.training_modes: - logging.info( - f" - {mode.name}: text_input_mode={mode.text_input_mode}, " - f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " - f"streaming_speech_delay={mode.streaming_speech_delay}" - ) - - # Create a mapping from mode name to mode object for easy lookup during inference - self.mode_name_to_mode = {mode.name: mode for mode in self.training_modes} - # Default mode for inference if not specified (first mode in the list) - self.default_inference_mode = self.training_modes[0].name - - self.frame_stacking_factor = cfg.get('frame_stacking_factor', 1) - - self.tokenizer = setup_tokenizers( - all_tokenizers_config=cfg.text_tokenizers, - mode='train', - ) + super().__init__(cfg=cfg, trainer=trainer) - num_tokens_tokenizer = len(self.tokenizer.tokens) - num_tokens = num_tokens_tokenizer + 3 # +3 for BOS, EOS, CFG_UNK - self.bos_id = num_tokens - 3 - self.eos_id = num_tokens - 2 - self.cfg_unk_token_id = num_tokens - 1 - self.phoneme_tokenizer = None + # Training-specific configuration self.dropout_text_input_prob = cfg.get('dropout_text_input_prob', 0.0) self.phoneme_corruption_batch_prob = cfg.get('phoneme_corruption_batch_prob', 0.0) self.phoneme_corruption_timestep_ratio = cfg.get('phoneme_corruption_timestep_ratio', 0.0) @@ -390,163 +123,9 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.phoneme_loss_weight = cfg.get('phoneme_loss_weight', 1.0) self.parallel_codebook_loss_scale = cfg.get('parallel_codebook_loss_scale', 1.0) self.local_transformer_loss_scale = cfg.get('local_transformer_loss_scale', 1.0) - if cfg.get('phoneme_tokenizer', None) is not None: - self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) - self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) - self.phoneme_vocab_size = 
self.phoneme_tokenizer.vocab_size - if cfg.get('phoneme_corruption_batch_prob', None) is None: - # Legacy mode: remove the UNK token from the phoneme vocabulary - # TODO: Remove this. - self.phoneme_vocab_size -= 1 - # If max phoneme probability is below this threshold at inference-time, - # replace the predicted timestep with UNK to reduce error propagation. - self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) - - self.pad_context_text_to_max_duration = False - self.add_language_to_context_text = cfg.get('add_language_to_context_text', False) - - super().__init__(cfg=cfg, trainer=trainer) - - # This needs to happen after super().__init__() - self._codec_model = codec_model - self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() - self._codec_converter = codec_converter - - # Audio embedding dimension - can be smaller than hidden_dim to reduce parameters - self.audio_embedding_dim = cfg.get('audio_embedding_dim', cfg.hidden_dim) - - audio_embeddings = [] - for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): - audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, self.audio_embedding_dim)) - self.audio_embeddings = nn.ModuleList(audio_embeddings) - # Projection from audio_embedding_dim to embedding_dim (Identity if same) - if self.audio_embedding_dim != cfg.embedding_dim: - self.audio_in_projection = nn.Linear(self.audio_embedding_dim, cfg.embedding_dim) - else: - self.audio_in_projection = nn.Identity() - - if self.phoneme_tokenizer is not None: - phoneme_embeddings = [] - for _ in range(self.phoneme_stacking_factor): - phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) - self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) - self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) - - # Decoder backend selection - supports HuggingFace models or NemotronH - self.decoder_type 
= cfg.get('decoder_type', 'huggingface') # backward compatible default - logging.info(f"Using decoder type: {self.decoder_type}") - - if self.decoder_type == 'huggingface': - # Existing HuggingFace path - self.transformer_backend_config = AutoConfig.from_pretrained( - cfg.transformer_hf_backend, - trust_remote_code=True, - ) - hf_transformer = AutoModelForCausalLM.from_config(self.transformer_backend_config) - self.decoder = hf_transformer.model - self.lm_text_head = hf_transformer.lm_head - - elif self.decoder_type == 'nemotron_h': - # NemotronH hybrid Mamba2/Attention backend - from nemo.collections.tts.modules.nemotron_h_decoder import NemotronHConfig, NemotronHForCausalLM - - # Build config from YAML parameters - nemotron_h_config_dict = dict(cfg.get('nemotron_h_config', {})) - # Ensure hidden_size matches embedding_dim for compatibility - if 'hidden_size' not in nemotron_h_config_dict: - nemotron_h_config_dict['hidden_size'] = cfg.embedding_dim - nemotron_config = NemotronHConfig(**nemotron_h_config_dict) - nemotron_model = NemotronHForCausalLM(nemotron_config) - self.decoder = nemotron_model.backbone - self.lm_text_head = nemotron_model.lm_head - logging.info( - f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}..." - ) - - else: - raise ValueError(f"Unknown decoder_type: {self.decoder_type}. 
Supported: 'huggingface', 'nemotron_h'") - - self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) - self.decoder.set_input_embeddings(self.text_embedding) - # self.decoder.float() - - # Task embedding for multi-mode training - # Each mode has a unique task embedding that is prepended to the context - # Only create task embedding if there are multiple modes - num_modes = len(self.training_modes) - if num_modes > 1: - self.task_embedding = nn.Embedding(num_modes, cfg.embedding_dim) - logging.info(f"Created task embedding with {num_modes} modes, embedding_dim={cfg.embedding_dim}") - else: - self.task_embedding = None - logging.info(f"Single training mode '{self.training_modes[0].name}', skipping task embedding") - - if self.use_bpe_char_tokenizer: - # BPE char tokenizer - assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" - tokenizer_name = self.tokenizer.tokenizer_names[0] - tokenizer = self.tokenizer.tokenizers[tokenizer_name] - subword_vocab = tokenizer.get_vocab() - # special tokens will be stored as it is in the char_vocab - # Each special token will only be mapped to one char id - special_vocab = { - '': self.bos_id, - '': self.eos_id, - '': self.cfg_unk_token_id, - } - self.cas_encoder = CharAwareSubwordEncoder( - d_embed=cfg.embedding_dim, - llm_tokenizer_vocab=subword_vocab, - subword_padding_idx=self.tokenizer.pad, - special_vocab=special_vocab, - ) - - # Projection from hidden_dim to audio_embedding_dim before final_proj (Identity if same) - if self.audio_embedding_dim != cfg.hidden_dim: - self.audio_out_projection = nn.Linear(cfg.hidden_dim, self.audio_embedding_dim) - else: - self.audio_out_projection = nn.Identity() - - self.final_proj = nn.Linear( - self.audio_embedding_dim, - self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor, - ) self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') - self.local_transformer_type = 
LocalTransformerType(cfg.get('local_transformer_type', 'none').lower()) - logging.info(f"Local transformer type: {self.local_transformer_type}") - if self.local_transformer_type != LocalTransformerType.NO_LT: - local_transformer_hidden_dim = cfg.get('local_transformer_hidden_dim', 256) - if local_transformer_hidden_dim != cfg.hidden_dim: - self.local_transformer_in_projection = nn.Linear(cfg.hidden_dim, local_transformer_hidden_dim) - else: - self.local_transformer_in_projection = nn.Identity() - self.local_transformer = transformer_2501.Transformer( - n_layers=self.cfg.get('local_transformer_n_layers', 2), - d_model=local_transformer_hidden_dim, - d_ffn=local_transformer_hidden_dim * 4, - sa_n_heads=self.cfg.get('local_transformer_n_heads', 1), - kernel_size=1, - is_causal=self.local_transformer_type == LocalTransformerType.AR, - max_length_causal_mask=self.num_audio_codebooks * self.frame_stacking_factor + 2, - use_learnable_pos_emb=True, - ) - # Projection from local_transformer_hidden_dim to audio_embedding_dim (Identity if same) - if self.audio_embedding_dim != local_transformer_hidden_dim: - self.local_transformer_audio_out_projection = nn.Linear( - local_transformer_hidden_dim, self.audio_embedding_dim - ) - else: - self.local_transformer_audio_out_projection = nn.Identity() - local_transformer_out_projections = [] - for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): - # Have a separate projection layer for each codebook, to distinguish between them - local_transformer_out_projections.append( - nn.Linear(self.audio_embedding_dim, self.num_all_tokens_per_codebook) - ) - self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) - # Validation inference with metrics (optional) self.run_val_inference = cfg.get('run_val_inference', False) self.use_multilingual_asr = cfg.get('use_multilingual_asr', False) @@ -584,270 +163,15 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._utmos_calculator 
= UTMOSv2Calculator(device='cpu') logging.info("UTMOSv2 calculator initialized for validation naturalness scoring") - def setup_optimizer_param_groups(self): - """ - Override to exclude frozen eval/inference-only models from the optimizer. - This prevents optimizer state mismatch errors when resuming from checkpoints - that were saved before these eval models were added. - """ - modules_to_exclude = { - '_speaker_verification_model', - '_codec_model', - '_eval_asr_model', - '_eval_speaker_verification_model', - 'whisper_model', - 'whisper_processor', - '_utmos_calculator', - } - - # Collect parameter ids to exclude - excluded_param_ids = set() - for name, module in self.named_children(): - if name in modules_to_exclude: - for param in module.parameters(): - excluded_param_ids.add(id(param)) - - # Build param group with only trainable (non-excluded) parameters - trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] - - logging.info( - f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " - f"{len(excluded_param_ids)} params excluded (eval models)" - ) - - self._optimizer_param_groups = [{"params": trainable_params}] - - def state_dict(self, destination=None, prefix='', keep_vars=False): - """ - Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model - from the checkpoint. The codec model is saved in a separate checkpoint. 
- """ - if hasattr(self, '_no_state_dict') and self._no_state_dict: - return {} - # Don't save the speaker verification and codec model in the state dict - state_dict = super().state_dict(destination, prefix, keep_vars) - keys_substrings_to_exclude = [ + def _get_state_dict_keys_to_exclude(self): + return super()._get_state_dict_keys_to_exclude() + [ '_speaker_verification_model', - '_codec_model', '_eval_asr_model', '_eval_speaker_verification_model', 'whisper_model', 'whisper_processor', '_utmos_calculator', ] - for key in list(state_dict.keys()): - if any([substring in key for substring in keys_substrings_to_exclude]): - del state_dict[key] - return state_dict - - def load_state_dict(self, state_dict, strict=True): - """ - Modify load_state_dict so that we don't restore weights to _speaker_verification_model and _codec_model when - strict is True. - When strict is False, we can call pytorch's load_state_dict. - When strict is True, we loop through all parameters and rename them to enable loading. - """ - if strict == False: - super().load_state_dict(state_dict, strict=False) - for name, child in self.named_children(): - if name in [ - '_speaker_verification_model', - '_codec_model', - '_eval_asr_model', - '_eval_speaker_verification_model', - 'whisper_model', - 'whisper_processor', - '_utmos_calculator', - ]: - continue - if any(param.numel() > 0 for param in child.parameters()): - # If the module has parameters, we want to change the default mapping so that the state_dict gets - # loaded. - # Ex: state_dict[encoder.position_embeddings.weight] -> new_state_dict[position_embeddings.weight] - new_state_dict = {} - for key in state_dict.keys(): - name_with_dot = f"{name}." 
- if key.startswith(name_with_dot): - new_state_dict[key[len(name_with_dot) :]] = state_dict[key] - child.load_state_dict(new_state_dict) - - def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) - codes_len = codes_len + num_eos_tokens - # Insert EOS token at new final token entry - for idx in range(codes.size(0)): - codes[idx, :, codes_len[idx] - 1] = eos_id - - return codes, codes_len - - def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) - codes_len = codes_len + num_bos_tokens - codes, codes_len = self.add_eos_token( - codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens - ) - return codes, codes_len - - def remove_bos_token(self, codes, codes_len, num_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = codes[:, :, num_tokens:] - codes_len = codes_len - num_tokens - return codes, codes_len - - def remove_embedded_bos_token(self, embedded, embedded_len): - # codes: (B, T', C) - # codes_len: (B,) - embedded = embedded[:, 1:, :] - embedded_len = embedded_len - 1 - return embedded, embedded_len - - def remove_eos_token(self, codes, codes_len): - # codes: (B, C, T') - # codes_len: (B,) - codes_len = codes_len - 1 - codes = codes[:, :, :-1] - mask = get_mask_from_lengths(lengths=codes_len) - codes = codes * mask.unsqueeze(1) - return codes, codes_len - - def remove_embedded_eos_token(self, embedded, embedded_len): - # embedded: (B, T', D) - # embedded_len: (B,) - embedded_len = embedded_len - 1 - embedded = embedded[:, :-1, :] - mask = get_mask_from_lengths(lengths=embedded_len) - embedded = embedded * mask.unsqueeze(2) - return embedded, embedded_len - - def remove_special_tokens(self, codes, codes_len, 
num_bos_tokens=1): - codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) - codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) - return codes, codes_len - - def audio_to_codes(self, audio, audio_len, sample_rate=None): - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): - codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) - return codes, codes_len - - def codes_to_audio(self, codes, codes_len): - # codes: (B, C, T') - # codes_len: (B,) - self._codec_model.eval() - if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: - # Unstack the audio codes if they are stacked - codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) - - with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - # Pass the modified integer token IDs - if self._codec_converter is not None: - codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) - if codes_len.min() < 4: - # Pad the codes with 0s to make the minimum length 4 - # codes is (B, C, T) - codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) - # Updates all lens less than 4 to 4 - codes_len = torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) - codes = codes[:, :, : codes_len.max()] - - audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) - # audio: (B, T) - # audio_len: (B,) - return audio, audio_len, codes - - def embed_audio_tokens(self, audio_tokens): - # audio_tokens: (B, C, T') - # Add and average the embeddings of the audio tokens across the codebooks - audio_embedding = None - for c in range(audio_tokens.size(1)): - embedding = self.audio_embeddings[c](audio_tokens[:, c, :]) - if audio_embedding is None: - 
audio_embedding = embedding - else: - audio_embedding = audio_embedding + embedding - audio_embedding = audio_embedding / audio_tokens.size(1) - # Project from audio_embedding_dim to embedding_dim - audio_embedding = self.audio_in_projection(audio_embedding) - return audio_embedding - - def embed_phoneme_tokens(self, phoneme_tokens): - # phoneme_tokens: (B, S, T') - phoneme_embedding = None - for c in range(phoneme_tokens.size(1)): - embedding = self.phoneme_embeddings[c](phoneme_tokens[:, c, :]) - if phoneme_embedding is None: - phoneme_embedding = embedding - else: - phoneme_embedding = phoneme_embedding + embedding - phoneme_embedding = phoneme_embedding / phoneme_tokens.size(1) - return phoneme_embedding - - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): - """ - Predicts the logits for all codebooks using the local transformer. Used in both autoregressive (AR) and MaskGit (MG) modes. - This function is used in training and validation, not inference/sampling. - The sequence layout is slightly different between AR and MG modes, as shown in the diagram below, - (using an 8-codebook setup as an example): - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | Input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - | | Latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | Seq. 
Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - - dec_out: (B, T', E) - audio_codes_target: (B, C, T') - targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) - if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. (MaskGit) - """ - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', hidden_dim) - local_transformer_input = [dec_out_all] - for codebook_num in range(audio_codes_target.size(1)): - codes = audio_codes_target[:, codebook_num] # (B, T') - codes = codes.reshape(-1) # (B*T',) - codebook_embedding = self.audio_embeddings[codebook_num](codes) # (B*T', audio_embedding_dim) - # Project from audio_embedding_dim to embedding_dim - codebook_embedding = self.audio_in_projection(codebook_embedding) - local_transformer_input.append(codebook_embedding) - - local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) - local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) - if not targets_offset_by_one: - # for autoregressive local transformer the target for index 0 is codebook 0, for index 1 is codebook 1, etc. - local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) - else: - # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. 
- local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) - # Project from local_transformer_hidden_dim to audio_embedding_dim - local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output) - all_code_logits = [] - for codebook_num in range(audio_codes_target.size(1)): - # Using a separate projection layer for each codebook (to distinguish between them) - # Checked the time - this loop is not taking much time (compared to the local transformer forward pass) - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, codebook_num, :] - ) # (B*T', num_all_tokens_per_codebook) - all_code_logits.append(codebook_logits) - all_code_logits = torch.cat(all_code_logits, dim=1) # (B*T', num_codebooks * num_all_tokens_per_codebook) - - all_code_logits = all_code_logits.view( - audio_codes_target.size(0), audio_codes_target.size(2), -1 - ) # (B, T', C * num_all_tokens_per_codebook) - - return all_code_logits def compute_loss(self, logits, audio_codes, audio_codes_lens): """ @@ -898,192 +222,6 @@ def compute_phoneme_loss(self, logits, phoneme_tokens, phoneme_tokens_lens): total_phoneme_loss = total_phoneme_loss / self.phoneme_stacking_factor return total_phoneme_loss, loss_mask - def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None, cache_position=None): - # Only pass cache_position for NemotronH (HF transformers may not accept it) - if self.decoder_type == 'nemotron_h': - backend_out = self.decoder( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - use_cache=use_cache, - past_key_values=past_key_values, - cache_position=cache_position, - ) - else: - backend_out = self.decoder( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - use_cache=use_cache, - past_key_values=past_key_values, - ) - # hidden_states = backend_out.last_hidden_state # (B, T_total, H) - return backend_out - - def logits_to_audio_codes(self, 
all_code_logits, audio_codes_lens): - # all_code_logits: (B, T', num_codebooks * num_tokens_per_codebook) - # audio_codes_lens: (B,) - all_preds = [] - for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): - si = idx * self.num_all_tokens_per_codebook - ei = si + self.num_all_tokens_per_codebook - codebook_logits = all_code_logits[:, :, si:ei] - codebook_probs = torch.softmax(codebook_logits, dim=-1) # (B, T', num_tokens_per_codebook) - # argmax to get the tokens - codebook_preds = torch.argmax(codebook_probs, dim=-1) # (B, T') - all_preds.append(codebook_preds) - - all_preds = torch.stack(all_preds, dim=1) # (B, C, T') - audio_mask = get_mask_from_lengths(audio_codes_lens) - all_preds = all_preds * audio_mask.unsqueeze(1) - - return all_preds - - def local_transformer_sample_autoregressive( - self, - dec_output, - temperature=0.7, - topk=80, - unfinished_items={}, - finished_items={}, - use_cfg=False, - cfg_scale=1.0, - ): - # dec_output: (B, E) - self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) - all_preds = [] - for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) - # Project from local_transformer_hidden_dim to audio_embedding_dim - local_transformer_output_projected = self.local_transformer_audio_out_projection( - local_transformer_output[:, -1, :] - ) - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output_projected - ) # (B, num_all_tokens_per_codebook) - if use_cfg: - actual_batch_size = codebook_logits.size(0) // 2 - conditional_logits = codebook_logits[:actual_batch_size] - 
unconditional_logits = codebook_logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - codebook_logits[:actual_batch_size] = cfg_logits - - # Replace NaN/inf then clamp to prevent extreme values (e.g. from CFG) causing NaN in softmax - # print("codebook_logits stats before nan_to_num") - # print(f"min: {codebook_logits.min()}, max: {codebook_logits.max()}, mean: {codebook_logits.mean()}, std: {codebook_logits.std()}") - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) - - for item_idx in unfinished_items: - codebook_logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - codebook_logits[item_idx, :] = float('-inf') - codebook_logits[item_idx, self.audio_eos_id] = 0.0 - - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( - -1 - ) # (B, num_tokens_per_codebook) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - - if temperature <= 0.0: - # Argmax sampling for deterministic output - codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) - else: - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) - if use_cfg: - codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] - all_preds.append(codebook_preds) - next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze( - 1 - ) # (B, 1, audio_embedding_dim) - # Project from audio_embedding_dim to embedding_dim, then to local_transformer_hidden_dim - next_local_transformer_input = self.audio_in_projection(next_local_transformer_input) - 
next_local_transformer_input = self.local_transformer_in_projection( - next_local_transformer_input - ) # (B, 1, local_transformer_hidden_dim) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, T+1, local_transformer_hidden_dim) - - all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) - if use_cfg: - all_preds = all_preds[:actual_batch_size] - - return all_preds - - def sample_codes_from_logits( - self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={} - ): - # all_code_logits_t: (B, num_codebooks * num_tokens_per_codebook), logits at a given timestep - all_preds = [] - for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): - si = idx * self.num_all_tokens_per_codebook - ei = si + self.num_all_tokens_per_codebook - codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) - # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) - for item_idx in unfinished_items: - codebook_logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - codebook_logits[item_idx, :] = float('-inf') - codebook_logits[item_idx, self.audio_eos_id] = 0.0 - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( - -1 - ) # (B, num_tokens_per_codebook) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - - if temperature <= 0.0: - # Argmax sampling for deterministic output - codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) - else: - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - 
codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) - all_preds.append(codebook_preds) - all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) - return all_preds - - def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, topk=80): - # all_code_logits_t: (B, phoneme_stacking_factor * phoneme_vocab_size), logits at a given timestep - all_preds = [] - for idx in range(self.phoneme_stacking_factor): - si = idx * self.phoneme_vocab_size - ei = si + self.phoneme_vocab_size - codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) - # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( - -1 - ) # (B, num_tokens_per_codebook) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - - if temperature <= 0.0: - # Argmax sampling for deterministic output - codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) - else: - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) - all_preds.append(codebook_preds) - all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) - return all_preds - def log_val_audio_example( self, logits, @@ -1169,181 +307,6 @@ def log_val_audio_example( return wandb_audio_log - def join_embeddings_temporally( - self, - embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] - lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` - pad_embed: torch.Tensor | None = None, # (E,) defaults to zeros - ) -> 
Tuple[torch.Tensor, torch.Tensor]: - """ - Merges Multiple Embedding sequences into a single Embedding Sequence. - - Args: - embeddings : Sequence of tensors, each of shape (B, Ti, E) — batch, time, embedding - lengths : Sequence of tensors, each of shape (B,) - pad_embed : (E,) — embedding to use for padding, defaults to zeros - - Returns: - joined : (B, max_sum_len, E) — merged & padded - out_lengths : (B,) — total lengths of each batch element after merging - """ - if len(embeddings) == 0: - raise ValueError("contexts must be non-empty") - - B, _, E = embeddings[0].shape - device = embeddings[0].device - dtype = embeddings[0].dtype - - # 1. compute output sizes - len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) - out_lengths = len_stack.sum(0) - max_len = int(out_lengths.max()) - - if pad_embed is None: - pad_embed = torch.zeros(E, dtype=dtype, device=device) - - joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) - - # batch row indices - batch_rows = torch.arange(B, device=device).unsqueeze(1) # (B,1) - - # running offset keeps “write cursor” for each row - offset = torch.zeros(B, dtype=torch.long, device=device) # (B,) - - for i, (embedding_i, len_i) in enumerate(zip(embeddings, lengths)): - Ti = embedding_i.shape[1] - t_idx = torch.arange(Ti, device=device) # (Ti,) - mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) - - # destination columns: offset + t - dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) - - # Assign embedding_i to the correct positions in joined - # Ensure dtype matches to avoid errors during mixed-precision training - joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask].to(joined.dtype) - - # move cursor past this segment - offset += len_i - - return joined, out_lengths - - def prepare_context_tensors( - self, - context_text_tokens: torch.Tensor, - context_text_tokens_lens: torch.Tensor, - context_audio_codes: Optional[torch.Tensor] = None, - context_audio_codes_lens: Optional[torch.Tensor] 
= None, - context_audio: Optional[torch.Tensor] = None, - context_audio_lens: Optional[torch.Tensor] = None, - training_mode: Optional[TrainingMode] = None, - dropout_conditional_input: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Prepare context tensors (without text) for the simplified process_batch. - - This function processes context audio and context text to create the combined - context embedding. - Args: - context_text_tokens: Context text token IDs for speaker/style conditioning (B, L) - context_text_tokens_lens: Length of context text for each batch item (B,) - context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). - If None, will be computed from context_audio. - context_audio_codes_lens: Length of context audio codes (B,). - Required if context_audio_codes is provided. - context_audio: Raw context audio waveform (B, T). - Used to compute context_audio_codes if not provided. - context_audio_lens: Length of context audio (B,). - Required if context_audio is provided. - training_mode: Optional TrainingMode object specifying the mode to use. - If None, uses the first mode from training_modes as default. - dropout_conditional_input: If True, replace context with CFG unconditional token. 
- - Returns: - Tuple of: - - context_embedding: Combined context embedding (B, T_context, E) - - context_lens: Total context length per batch item (B,) - - context_audio_codes: Processed audio codes with special tokens (B, C, T') - - context_audio_codes_lens: Length of processed context audio codes (B,) - """ - # Determine the mode parameters to use - if training_mode is None: - training_mode = self.training_modes[0] - - current_mode_idx = training_mode.mode_idx - batch_size = context_text_tokens.size(0) - device = context_text_tokens.device - - # Context Audio - if context_audio_codes is None: - if context_audio is None: - raise ValueError("Either context_audio_codes or context_audio must be provided") - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - - if self._codec_converter is not None: - context_audio_codes = self._codec_converter.convert_original_to_new( - audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens - ).long() - - context_audio_codes, context_audio_codes_lens = self.add_special_tokens( - codes=context_audio_codes, - codes_len=context_audio_codes_lens, - bos_id=self.context_audio_bos_id, - eos_id=self.context_audio_eos_id, - ) - - # Use legacy audio_bos_id/audio_eos_id if flag is set - stack_bos_id = ( - self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id - ) - stack_eos_id = ( - self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id - ) - - context_audio_codes, context_audio_codes_lens = self.stack_codes( - context_audio_codes, - context_audio_codes_lens, - stack_bos_id, - stack_eos_id, - self.frame_stacking_factor, - self.num_audio_codebooks, - ) - context_audio_embedded = self.embed_audio_tokens(context_audio_codes) # (B, T', E) - - # Context Text - context_text_lens = context_text_tokens_lens - context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, 
E) - - # Prepare task embedding for multi-mode training - task_embedding = None - task_embedding_lens = None - if self.task_embedding is not None and current_mode_idx is not None: - mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=device) - task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) - task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=device) # (B,) - - # Combine context embeddings: [task_embedding | context_audio | context_text] - if task_embedding is not None: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[task_embedding, context_audio_embedded, context_text_embedded], - lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens], - ) - else: - context_embedding, context_lens = self.join_embeddings_temporally( - embeddings=[context_audio_embedded, context_text_embedded], - lengths=[context_audio_codes_lens, context_text_lens], - ) - - # Handle CFG unconditional dropout - if dropout_conditional_input: - cfg_token_id = self.cfg_unk_token_id - cfg_token_embedding = self.decoder.get_input_embeddings()( - torch.full((batch_size, 1), cfg_token_id, device=device) - ) # (B, 1, E) - # Expand CFG token to match context embedding size - context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_context, E) - - return context_embedding, context_lens, context_audio_codes, context_audio_codes_lens - def prepare_text_channel_embeddings( self, text: torch.Tensor, @@ -1652,99 +615,6 @@ def slice_pred_embeddings(self, transformer_out, context_lens, target_lens): sliced = torch.gather(transformer_out, dim=1, index=gather_indices_exp) return sliced - def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): - """ - Stack multiple time steps into the channel dimension to reduce sequence length. 
- - This function reshapes audio/phoneme codes by grouping consecutive time steps together - and placing them in the channel dimension. This allows the model to process multiple - frames in parallel while reducing the sequence length. - - Args: - codes: Input codes tensor of shape (B, C, T) where B is batch size, - C is number of codebooks, and T is sequence length. - codes_lens: Length of valid codes for each batch item, shape (B,). - bos_id: Beginning-of-sequence token ID used to detect and handle BOS tokens. - eos_id: End-of-sequence token ID used for padding. - stacking_factor: Number of time steps to stack together. If 1, no stacking is performed. - num_codebooks: Number of codebooks in the input. - - Returns: - Tuple of: - - stacked_codes: Reshaped codes of shape (B, C * stacking_factor, T // stacking_factor). - If input contains BOS tokens, they are preserved at the beginning. - - new_lens: Updated sequence lengths after stacking, shape (B,). - """ - if stacking_factor == 1: - return codes, codes_lens - - contains_bos = codes[0, 0, 0].item() == bos_id - if contains_bos: - bos_tensor_repeated = torch.full( - (codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device - ) # (B,stacking_factor*C, 1) - codes = codes[:, :, 1:] # Remove the bos token - codes_lens = codes_lens - 1 # Remove the bos token - B, C, T = codes.shape - s = int(stacking_factor) - - # --- Compute max padding needed --- - pad_t = (-T) % s # pad so that T' is divisible by s - pad_tail = torch.full((B, C, pad_t), eos_id, dtype=codes.dtype, device=codes.device) - codes = torch.cat([codes, pad_tail], dim=-1) - - # --- Stack time into channel dimension --- - Tp = codes.shape[-1] - T_out = Tp // s - codes = codes.view(B, C, T_out, s) - codes = codes.permute(0, 1, 3, 2).reshape(B, C * s, T_out) - - new_lens = torch.div(codes_lens + s - 1, s, rounding_mode='floor') - if contains_bos: - codes = torch.cat([bos_tensor_repeated, codes], dim=2) - new_lens = new_lens + 1 - - return 
codes, new_lens - - def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): - """ - Reverse the stacking operation to recover the original time dimension. - - This is the inverse of `stack_codes`. It takes codes that have been stacked - in the channel dimension and expands them back into the time dimension. - - Args: - stacked_codes: Stacked codes tensor of shape (B, C * stacking_factor, T_stacked) - where T_stacked = T_original // stacking_factor. - stacked_lens: Length of valid stacked sequences for each batch item, shape (B,). - stacking_factor: The stacking factor used in the original `stack_codes` call. - If 1, no unstacking is performed. - - Returns: - Tuple of: - - unstacked_codes: Codes with restored time dimension, shape (B, C, T_stacked * stacking_factor). - - orig_lens: Recovered sequence lengths, shape (B,). Note that these are the - maximum possible lengths; actual valid lengths may be shorter due to - padding applied during stacking. - """ - if stacking_factor == 1: - return stacked_codes, stacked_lens - - B, CxS, T_out = stacked_codes.shape - s = int(stacking_factor) - assert CxS % s == 0, f"Channel dim ({CxS}) must be divisible by stacking_factor ({s})" - - C = CxS // s - # Reshape: split channels back into (C, s) - x = stacked_codes.view(B, C, s, T_out) - # Bring s back into time dimension - x = x.permute(0, 1, 3, 2).reshape(B, C, T_out * s) - - # Recover original lengths (before padding) - orig_lens = stacked_lens * s - - return x, orig_lens - def process_batch( self, text: torch.Tensor, @@ -2660,1145 +1530,3 @@ def val_dataloader(self): self._val_dl_wrapped_with_dist_sampler = True return self._validation_dl - - def _sample_audio_codes( - self, - last_hidden: torch.Tensor, - all_code_logits_t: torch.Tensor, - temperature: float, - topk: int, - use_local_transformer_for_inference: bool, - use_cfg: bool, - cfg_scale: float, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Sample audio codes from logits using either local transformer 
or parallel sampling. - - Returns: - audio_codes_next: Sampled codes with temperature/topk (B, num_codebooks) - all_codes_next_argmax: Argmax sampled codes for EOS detection (B, num_codebooks) - """ - if use_local_transformer_for_inference: - if self.local_transformer_type == LocalTransformerType.AR: - audio_codes_next = self.local_transformer_sample_autoregressive( - dec_output=last_hidden[:, -1, :], - temperature=temperature, - topk=topk, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - ) - else: - raise ValueError( - f"Local transformer inference requested but local transformer type is {self.local_transformer_type}" - ) - # TODO @rfejgin: should we add argmax sampling for EOS here too? - all_codes_next_argmax = audio_codes_next - else: - # Parallel sampling from all codebook logits - audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) - # Argmax sampling for reliable EOS detection - if temperature <= 0.0: - all_codes_next_argmax = audio_codes_next # already argmax - else: - all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) - - return audio_codes_next, all_codes_next_argmax - - def streaming_init( - self, - context_audio_codes: torch.Tensor, - context_audio_codes_lens: torch.Tensor, - context_text_tokens: torch.Tensor, - context_text_tokens_lens: torch.Tensor, - inference_mode: Optional[str] = None, - use_cfg: bool = False, - cfg_scale: float = 1.0, - use_local_transformer: bool = False, - temperature: float = 0.7, - topk: int = 80, - phoneme_input_type: str = 'predicted', - phoneme_sampling_method: str = 'argmax', - gt_phoneme_tokens: Optional[torch.Tensor] = None, - gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, - gt_audio_codes: Optional[torch.Tensor] = None, - gt_audio_codes_lens: Optional[torch.Tensor] = None, - use_inference_mode: bool = True, - ) -> StreamingState: - """ - Initialize streaming TTS inference state. 
- - This prepares the model for streaming inference by processing the context - (audio + context text) and returning a StreamingState that can be used - with streaming_step() to incrementally generate audio. - - Note: This function does NOT take the main text input. Text tokens are - provided incrementally via streaming_step(). - - For batched inference, each batch item can have a different context length. - This function processes only up to the minimum context length across the batch, - storing the remaining context to be processed in streaming_step's context phase. - - The streaming inference follows phases (per batch item): - 1. Context phase: Processing remaining context (if any) for items with longer context. - 2. Prompt phase: First `streaming_speech_delay` text tokens are processed - without generating audio (building up context). - 3. Generation phase: Audio BOS is added and audio codes are generated - autoregressively, with remaining text tokens added to audio embeddings. - - Args: - context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). - context_audio_codes_lens: Length of context audio codes (B,). - context_text_tokens: Context text token IDs for speaker/style conditioning (B, L). - context_text_tokens_lens: Length of context text (B,). - inference_mode: Name of the inference mode to use (e.g., "streaming_4_8"). - If None, uses the default inference mode. - use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor (higher = stronger conditioning). - use_local_transformer: Whether to use local transformer for AR sampling. - temperature: Sampling temperature for audio codes. - topk: Top-k sampling parameter. - phoneme_input_type: 'gt' or 'predicted' for phoneme tokens (use 'predicted' for streaming). - phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. - gt_phoneme_tokens: Optional GT phoneme tokens (B, L) with BOS/EOS for teacher forcing. 
- gt_phoneme_tokens_lens: Lengths of GT phoneme tokens (B,). - gt_audio_codes: Optional GT audio codes (B, C*S, T) already stacked with BOS/EOS, - input portion ([:, :, :-1]) for teacher forcing. Pre-processed by caller. - gt_audio_codes_lens: Lengths of GT audio codes (B,) after stacking. - - Returns: - StreamingState: Initial state for streaming inference. - """ - grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad - with grad_ctx(): - batch_size = context_audio_codes.size(0) - device = context_audio_codes.device - - # Resolve inference mode - mode_name = inference_mode if inference_mode is not None else self.default_inference_mode - if mode_name not in self.mode_name_to_mode: - available_modes = list(self.mode_name_to_mode.keys()) - raise ValueError(f"Unknown inference mode '{mode_name}'. Available modes: {available_modes}") - - selected_training_mode = self.mode_name_to_mode[mode_name] - - # Prepare context embedding using shared helper - context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = ( - self.prepare_context_tensors( - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - training_mode=selected_training_mode, - dropout_conditional_input=False, - ) - ) - - # Store full context embedding and lens before any CFG manipulation - full_context_embedding = context_embedding.clone() # (B, T_max, E) - full_context_lens = context_lens.clone() # (B,) - - # Compute min context length - we only process up to this in init - min_context_len = context_lens.min().item() - - # Setup classifier-free guidance if enabled - dummy_context_embedding_unconditional = None - if use_cfg: - dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( - torch.full((1, 1), self.cfg_unk_token_id, device=device) - ) - # Create unconditional context (same length as conditional) - 
dummy_context_expanded = dummy_context_embedding_unconditional.expand( - batch_size, context_embedding.size(1), -1 - ) - # Concatenate conditional and unconditional: (2*B, T, E) - context_embedding = torch.cat([context_embedding, dummy_context_expanded], dim=0) - - # First forward pass to process context - only up to min_context_len - cache_position = torch.arange(min_context_len, device=device) - transformer_out = self.forward( - inputs_embeds=context_embedding[:, :min_context_len, :], - attention_mask=None, - use_cache=True, - past_key_values=None, - cache_position=cache_position, - ) - - last_hidden = transformer_out.last_hidden_state - past_kv = transformer_out.past_key_values - current_cache_seq_len = min_context_len - - # Process GT phoneme tokens if provided (for teacher forcing) - gt_phoneme_embeddings = None - gt_phoneme_lens = None - if gt_phoneme_tokens is not None and gt_phoneme_tokens_lens is not None: - gt_phoneme_expanded = gt_phoneme_tokens.unsqueeze(1) # (B, 1, L) - gt_phoneme_stacked, gt_phoneme_lens = self.stack_codes( - gt_phoneme_expanded, - gt_phoneme_tokens_lens, - self.phoneme_tokenizer.bos_token_id, - self.phoneme_tokenizer.eos_token_id, - self.phoneme_stacking_factor, - 1, - ) - gt_phoneme_embeddings = self.embed_phoneme_tokens(gt_phoneme_stacked) # (B, T', E) - - # Process GT audio codes if provided (for teacher forcing) - gt_audio_embeddings = None - gt_audio_lens_state = None - if gt_audio_codes is not None and gt_audio_codes_lens is not None: - gt_audio_embeddings = self.embed_audio_tokens(gt_audio_codes) # (B, T', E) - gt_audio_lens_state = gt_audio_codes_lens - - # Initialize streaming state with batch support - state = StreamingState( - batch_size=batch_size, - past_key_values=past_kv, - cache_seq_len=current_cache_seq_len, - all_predictions=[], - all_phoneme_predictions=[], - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_lens=context_lens, - 
full_context_embedding=full_context_embedding, - full_context_lens=full_context_lens, - context_position=torch.full((batch_size,), min_context_len, dtype=torch.long, device=device), - text_tokens_seen=torch.zeros(batch_size, dtype=torch.long, device=device), - phoneme_steps=torch.zeros(batch_size, dtype=torch.long, device=device), - audio_steps=torch.zeros(batch_size, dtype=torch.long, device=device), - phoneme_stream_ended=torch.zeros(batch_size, dtype=torch.bool, device=device), - phoneme_eos_detected=torch.zeros(batch_size, dtype=torch.bool, device=device), - finished=torch.zeros(batch_size, dtype=torch.bool, device=device), - device=device, - training_mode=selected_training_mode, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer=use_local_transformer, - temperature=temperature, - topk=topk, - dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, - last_hidden=last_hidden, - text_finished=torch.zeros(batch_size, dtype=torch.bool, device=device), - phoneme_input_type=phoneme_input_type, - phoneme_sampling_method=phoneme_sampling_method, - last_phoneme_tokens=None, - last_audio_codes=None, - audio_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), - audio_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), - phoneme_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), - phoneme_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), - gt_phoneme_embeddings=gt_phoneme_embeddings, - gt_phoneme_lens=gt_phoneme_lens, - gt_audio_embeddings=gt_audio_embeddings, - gt_audio_lens=gt_audio_lens_state, - ) - - return state - - def streaming_step( - self, - state: StreamingState, - text_tokens: Optional[torch.Tensor] = None, - force_dropout_text: bool = False, - use_inference_mode: bool = True, - ) -> Tuple[StreamingState, Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform one streaming inference step 
with batch support. - - This function processes one text token per batch item (or signals end of text with None) - and generates predictions according to the streaming delays. Each batch item can be - in a different phase. - - The streaming operates in four phases per batch item: - 1. Context phase (context_position < full_context_lens): - - Still processing remaining context from streaming_init - - Uses context embedding, ignores text_tokens for this item - 2. Prompt phase (text_tokens_seen < phoneme_delay): - - Only text tokens are processed, KV cache is extended - - No phoneme or audio predictions - 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): - - Starts with phoneme BOS on first step - - Only phoneme predictions (no audio) - - Input: text embedding + phoneme embedding - 4. Audio phase (text_tokens_seen >= speech_delay): - - Starts with audio BOS on first step - - Both phoneme and audio predictions - - Input: text embedding + phoneme embedding + audio embedding - - IMPORTANT: Only ONE forward call to the decoder per streaming_step. - - Args: - state: Current StreamingState from streaming_init or previous streaming_step. - text_tokens: Next text token for each batch item, shape (B,), or None if text has finished. - For items still in context phase, the text_token value is ignored (can be 0). - When None is passed, the model continues generating until EOS. 
- - Returns: - Tuple of: - - Updated StreamingState - - Predicted audio codes for this step (B, C, S) unstacked, or None if no items in audio phase - where C = num_audio_codebooks and S = frame_stacking_factor - - Predicted phoneme tokens for this step (B, phoneme_stacking_factor) or None if no items in phoneme phase - """ - if state.finished.all(): - return state, None, None - - grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad - with grad_ctx(): - device = state.device - batch_size = state.batch_size - streaming_speech_delay = state.training_mode.streaming_speech_delay - streaming_phonemes_delay = state.training_mode.streaming_phonemes_delay - - # ==================== DETERMINE PHASES PER BATCH ITEM ==================== - needs_context = state.context_position < state.full_context_lens # (B,) bool - needs_text = (~needs_context) & (~state.text_finished) - needs_phoneme = ( - (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) - ) - needs_audio = (~needs_context) & (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) - - next_input = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) - # --- Context phase items: use next context embedding --- - if needs_context.any(): - # Gather context embeddings at current position for each item - # context_position: (B,) - position indices - # full_context_embedding: (B, T_max, E) - ctx_positions = state.context_position.clone() # (B,) - # Clamp positions to valid range for gathering - ctx_positions = ctx_positions.clamp(max=state.full_context_embedding.size(1) - 1) - # Gather: need (B, 1, E) from (B, T, E) at positions (B,) - ctx_emb = state.full_context_embedding[ - torch.arange(batch_size, device=device), ctx_positions, : - ].unsqueeze( - 1 - ) # (B, 1, E) - # Only apply to items in context phase - context_mask = needs_context.view(batch_size, 1, 1).float() - next_input = next_input + ctx_emb * context_mask - - # --- 
Non-context phase items: handle text embedding --- - text_embedded = None - if text_tokens is not None and needs_text.any(): - # Embed text tokens for all items (will be masked later) - text_tokens_2d = text_tokens.unsqueeze(1) # (B, 1) - text_embedded = self.decoder.get_input_embeddings()(text_tokens_2d) # (B, 1, E) - - # Handle BPE char tokenizer - if self.use_bpe_char_tokenizer: - text_mask = torch.ones_like(text_tokens_2d, dtype=torch.bool) - cas_embedding = self.cas_encoder(text_tokens_2d, subword_mask=text_mask) # (B, 1, E) - text_embedded = text_embedded + cas_embedding - - if force_dropout_text: - text_embedded = text_embedded * 0 - - # Check for EOS tokens - mark those items as text_finished - # The EOS token itself IS embedded normally (matching process_batch behavior - # where EOS is part of the text sequence). After this step, text_finished is set - # so subsequent steps won't add any text embedding. - is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool - text_add_mask = needs_text.view(batch_size, 1, 1).float() - next_input = next_input + text_embedded * text_add_mask - state.text_finished = state.text_finished | is_eos_token - - elif text_tokens is None: - # Text finished signal for items not in context phase - state.text_finished = state.text_finished | ~needs_context - - # --- Phoneme embedding for phoneme and audio phase items --- - if self.phoneme_tokenizer is not None: - if needs_phoneme.any(): - phoneme_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) - - if state.phoneme_input_type == 'gt' and state.gt_phoneme_embeddings is not None: - # Teacher forcing: use pre-computed GT phoneme embeddings - # Only use GT embedding if within valid length, otherwise zero - within_gt_len = state.phoneme_steps < state.gt_phoneme_lens # (B,) - positions = state.phoneme_steps.clamp(max=state.gt_phoneme_embeddings.size(1) - 1) - gt_emb = state.gt_phoneme_embeddings[ - torch.arange(batch_size, device=device), positions, : - 
].unsqueeze( - 1 - ) # (B, 1, E) - phoneme_mask = (needs_phoneme & within_gt_len).view(batch_size, 1, 1).float() - phoneme_emb = phoneme_emb + gt_emb * phoneme_mask - else: - # Prediction mode: use BOS or last predicted phoneme - first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) - has_last_phoneme = ( - needs_phoneme & (~first_phoneme_step) & (state.last_phoneme_tokens is not None) - ) - - if first_phoneme_step.any(): - phoneme_bos = torch.full( - (batch_size, self.phoneme_stacking_factor, 1), - self.phoneme_tokenizer.bos_token_id, - device=device, - ).long() - phoneme_bos_emb = self.embed_phoneme_tokens(phoneme_bos) # (B, 1, E) - first_mask = first_phoneme_step.view(batch_size, 1, 1).float() - phoneme_emb = phoneme_emb + phoneme_bos_emb * first_mask - - if has_last_phoneme.any() and state.last_phoneme_tokens is not None: - last_phoneme_emb = self.embed_phoneme_tokens( - state.last_phoneme_tokens.unsqueeze(2) - ) # (B, 1, E) - last_mask = has_last_phoneme.view(batch_size, 1, 1).float() - phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask - - # Only end phoneme stream in prediction mode when the phoneme EOS is detected - state.phoneme_stream_ended = state.phoneme_stream_ended | state.phoneme_eos_detected - - next_input = next_input + phoneme_emb - - # --- Audio embedding for audio phase items --- - if needs_audio.any(): - audio_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) - - if state.gt_audio_embeddings is not None: - # Teacher forcing: use pre-computed GT audio embeddings - # Only use GT embedding if within valid length, otherwise zero - within_gt_len = state.audio_steps < state.gt_audio_lens # (B,) - positions = state.audio_steps.clamp(max=state.gt_audio_embeddings.size(1) - 1) - gt_emb = state.gt_audio_embeddings[ - torch.arange(batch_size, device=device), positions, : - ].unsqueeze( - 1 - ) # (B, 1, E) - audio_mask = (needs_audio & within_gt_len).view(batch_size, 1, 1).float() - audio_emb = audio_emb + gt_emb * 
audio_mask - else: - # Prediction mode: use BOS or last predicted audio - first_audio_step = needs_audio & (state.audio_steps == 0) - has_last_audio = needs_audio & ~first_audio_step & (state.last_audio_codes is not None) - - if first_audio_step.any(): - # Create BOS for items at first audio step - audio_bos = torch.full( - (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), - self.audio_bos_id, - device=device, - ).long() - audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) - first_mask = first_audio_step.view(batch_size, 1, 1).float() - audio_emb = audio_emb + audio_bos_emb * first_mask - - if has_last_audio.any() and state.last_audio_codes is not None: - # Use last predicted audio - last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) - last_mask = has_last_audio.view(batch_size, 1, 1).float() - audio_emb = audio_emb + last_audio_emb * last_mask - - next_input = next_input + audio_emb - - # ==================== HANDLE CFG ==================== - if state.use_cfg: - # For unconditional branch, use dummy embedding for non-audio items - # and audio-only embedding for audio items - next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand( - batch_size, 1, -1 - ) - # After the context is finished, we use zero embedding for the unconditional branch until audio phase starts - next_input_unconditional_zeros = torch.zeros_like(next_input_unconditional_context) - context_mask = needs_context.view(batch_size, 1, 1).float() - next_input_unconditional = ( - context_mask * next_input_unconditional_context - + (1 - context_mask) * next_input_unconditional_zeros - ) - - # For audio phase items, we use audio embedding for the unconditional branch - if needs_audio.any(): - audio_mask = needs_audio.view(batch_size, 1, 1).float() - next_input_unconditional = next_input_unconditional * (1 - audio_mask) + audio_emb * audio_mask - - # Concatenate conditional and unconditional: (2*B, 1, E) 
- next_input = torch.cat([next_input, next_input_unconditional], dim=0) - - # ==================== FORWARD PASS ==================== - cache_position = torch.tensor([state.cache_seq_len], device=device) - transformer_out = self.forward( - inputs_embeds=next_input, - attention_mask=None, - use_cache=True, - past_key_values=state.past_key_values, - cache_position=cache_position, - ) - - state.last_hidden = transformer_out.last_hidden_state - state.past_key_values = transformer_out.past_key_values - state.cache_seq_len += 1 - - # ==================== UPDATE STATE ==================== - # Update context_position for items in context phase - state.context_position = state.context_position + needs_context.long() - # Keep updating text_tokens_seen for items once the context is finished - # This is because this counter is used to determine when to start predicting phonemes and audio - state.text_tokens_seen = state.text_tokens_seen + (~needs_context).long() - - # Update phoneme_steps for items in phoneme or audio phase - state.phoneme_steps = state.phoneme_steps + needs_phoneme.long() - - # Update audio_steps for items in audio phase - state.audio_steps = state.audio_steps + needs_audio.long() - - # ==================== PREDICTIONS ==================== - pred_phoneme_tokens = None - audio_codes_next = None - - # Phoneme predictions for items in phoneme or audio phase - if needs_phoneme.any() and self.phoneme_tokenizer is not None: - # Track phoneme prediction start index for items just entering phoneme phase - first_phoneme_step = needs_phoneme & (state.phoneme_prediction_start_idx == -1) - if first_phoneme_step.any(): - current_phoneme_step_idx = len(state.all_phoneme_predictions) # before append - state.phoneme_prediction_start_idx = torch.where( - first_phoneme_step, - torch.full_like(state.phoneme_prediction_start_idx, current_phoneme_step_idx), - state.phoneme_prediction_start_idx, - ) - - # Check which items should predict phonemes (not ended) - pred_phoneme_tokens = 
self._predict_phoneme_tokens(state) # (B, phoneme_stacking_factor) - state.last_phoneme_tokens = pred_phoneme_tokens - state.all_phoneme_predictions.append(pred_phoneme_tokens) - - # Check for phoneme EOS per item - phoneme_eos_detected = needs_phoneme & ( - pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id - ).any( - dim=1 - ) # (B,) - - state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected - - # Track phoneme prediction end index for items that just ended - newly_ended_phoneme = phoneme_eos_detected & (state.phoneme_prediction_end_idx == -1) - if newly_ended_phoneme.any(): - current_phoneme_step_idx = len(state.all_phoneme_predictions) # after append - state.phoneme_prediction_end_idx = torch.where( - newly_ended_phoneme, - torch.full_like(state.phoneme_prediction_end_idx, current_phoneme_step_idx), - state.phoneme_prediction_end_idx, - ) - - # Audio predictions for items in audio phase - if needs_audio.any(): - # Track audio prediction start index for items just entering audio phase - first_audio_step = needs_audio & (state.audio_prediction_start_idx == -1) - if first_audio_step.any(): - # Track start in terms of frames (not steps) - current_frame_idx = sum(p.size(-1) for p in state.all_predictions) # total frames so far - state.audio_prediction_start_idx = torch.where( - first_audio_step, - torch.full_like(state.audio_prediction_start_idx, current_frame_idx), - state.audio_prediction_start_idx, - ) - - audio_codes_next_stacked, all_codes_next_argmax = self._predict_audio_codes(state) # (B, C*S) - - # Unstack immediately: (B, C*S) -> (B, C, S) where S = frame_stacking_factor - S = self.frame_stacking_factor - C = self.num_audio_codebooks - audio_codes_unstacked = audio_codes_next_stacked.view(batch_size, C, S) # (B, C, S) - - # Update last_audio_codes with stacked format (needed for next step's embedding) - if state.last_audio_codes is None: - state.last_audio_codes = audio_codes_next_stacked - else: - update_mask = 
needs_audio.view(batch_size, 1).expand_as(audio_codes_next_stacked) - state.last_audio_codes = torch.where(update_mask, audio_codes_next_stacked, state.last_audio_codes) - - # Check for EOS in each frame and track exact end position - # Skip EOS detection in teacher-forced mode - rely on GT exhaustion instead - if state.gt_audio_embeddings is None: - # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S) - all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S) - - # For each batch item, find if/where EOS occurs in this step's frames - eos_in_sampled = audio_codes_unstacked == self.audio_eos_id # (B, C, S) - eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id # (B, C, S) - eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1) # (B, S) - - # Find first frame with EOS per batch item (or S if none) - eos_frame_idx = torch.where( - eos_any_codebook.any(dim=1), - eos_any_codebook.int().argmax(dim=1), # first frame with EOS - torch.full((batch_size,), S, device=device), # no EOS in this step - ) # (B,) - - audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio - state.finished = state.finished | audio_eos_detected - - # Track audio prediction end index (in frames) for items that just ended - newly_ended_audio = audio_eos_detected & (state.audio_prediction_end_idx == -1) - if newly_ended_audio.any(): - # End index = current frame count + frame offset where EOS was found - current_frame_count = len(state.all_predictions) * self.frame_stacking_factor - end_frame_idx = current_frame_count + eos_frame_idx - state.audio_prediction_end_idx = torch.where( - newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx - ) - - # Store unstacked codes - state.all_predictions.append(audio_codes_unstacked) - audio_codes_next = audio_codes_unstacked - - # Force-finish items when GT audio is exhausted (teacher forcing). - # This is checked AFTER predictions so the last valid prediction is still made. 
- # audio_steps was already incremented above. When audio_steps >= gt_audio_lens, - # we've consumed all GT input positions and made all corresponding predictions. - if state.gt_audio_embeddings is not None and state.gt_audio_lens is not None: - gt_exhausted = needs_audio & (state.audio_steps >= state.gt_audio_lens) - state.finished = state.finished | gt_exhausted - - return state, audio_codes_next, pred_phoneme_tokens - - def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor: - """Predict phoneme tokens from the last hidden state.""" - actual_batch_size = state.batch_size - last_hidden = state.last_hidden - - # Get phoneme logits - all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :]) - all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size] - phoneme_logits = all_code_logits_t_phoneme.view( - actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size - ) - max_probs = torch.softmax(phoneme_logits, dim=-1).max(dim=-1).values # (B, phoneme_stacking_factor) - - # Sample phonemes - if state.phoneme_sampling_method == 'argmax': - pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.0) - else: - pred_phoneme_tokens = self.sample_codes_from_logits_phoneme( - all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk - ) - - # In prediction mode, low-confidence phoneme steps are replaced with UNK across - # all stacked channels (except steps where EOS is predicted). 
- if ( - state.phoneme_input_type != 'gt' - and hasattr(self.phoneme_tokenizer, 'unk_token_id') - and self.phoneme_confidence_unk_threshold > 0.0 - ): - underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any( - dim=1, keepdim=True - ) # (B, 1) - eos_predicted_step = (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1, keepdim=True) - replace_with_unk = underconfident_step & (~eos_predicted_step) - if replace_with_unk.any(): - unk_tokens = torch.full_like(pred_phoneme_tokens, self.phoneme_tokenizer.unk_token_id) - pred_phoneme_tokens = torch.where(replace_with_unk, unk_tokens, pred_phoneme_tokens) - # (B, phoneme_stacking_factor) - return pred_phoneme_tokens - - def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, torch.Tensor]: - """Predict audio codes from the last hidden state.""" - actual_batch_size = state.batch_size - last_hidden = state.last_hidden - - # Compute audio logits - last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :]) - all_code_logits_t = self.final_proj(last_hidden_audio) - - # Apply CFG if enabled - if state.use_cfg: - conditional_logits = all_code_logits_t[:actual_batch_size] - unconditional_logits = all_code_logits_t[actual_batch_size:] - all_code_logits_t = state.cfg_scale * conditional_logits + (1.0 - state.cfg_scale) * unconditional_logits - - # Sample audio codes - audio_codes_next, all_codes_next_argmax = self._sample_audio_codes( - last_hidden=last_hidden, - all_code_logits_t=all_code_logits_t, - temperature=state.temperature, - topk=state.topk, - use_local_transformer_for_inference=state.use_local_transformer, - use_cfg=state.use_cfg, - cfg_scale=state.cfg_scale, - ) - - return audio_codes_next, all_codes_next_argmax - - def streaming_finalize( - self, - state: StreamingState, - use_inference_mode: bool = True, - ) -> StreamingFinalizeOutput: - """ - Finalize streaming and return the complete generated audio and phoneme predictions. 
- - This function should be called after all streaming_step() calls are complete - (i.e., when state.finished.all() is True or max steps reached). - - Args: - state: Final StreamingState after streaming is complete. - - Returns: - StreamingFinalizeOutput containing audio, codes, and phoneme predictions. - """ - batch_size = state.batch_size - - # Extract and decode phoneme predictions - phoneme_tokens_list: List[List[int]] = [] - phoneme_text_list: List[str] = [] - if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: - # Stack phoneme predictions: each is (B, phoneme_stacking_factor) - all_phonemes = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) - for i in range(batch_size): - start = max(0, state.phoneme_prediction_start_idx[i].item()) - end = state.phoneme_prediction_end_idx[i].item() - if end < 0: - end = all_phonemes.size(-1) - # Flatten stacked phonemes back to sequence - tokens = all_phonemes[i, :, start:end].T.reshape(-1).tolist() - # Remove special tokens (BOS, EOS, PAD) - special = {self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.eos_token_id} - if hasattr(self.phoneme_tokenizer, 'pad_token_id'): - special.add(self.phoneme_tokenizer.pad_token_id) - tokens = [t for t in tokens if t not in special] - phoneme_tokens_list.append(tokens) - phoneme_text_list.append(self.phoneme_tokenizer.decode(tokens)) - else: - phoneme_tokens_list = [[] for _ in range(batch_size)] - phoneme_text_list = ["" for _ in range(batch_size)] - - if len(state.all_predictions) == 0: - return StreamingFinalizeOutput( - audio=torch.zeros(batch_size, 0, device=state.device), - audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), - audio_codes=torch.zeros(batch_size, self.num_audio_codebooks, 0, device=state.device), - audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), - phoneme_tokens=phoneme_tokens_list, - phoneme_text=phoneme_text_list, - ) - - grad_ctx = torch.inference_mode if 
use_inference_mode else torch.no_grad - with grad_ctx(): - # Concatenate all predictions - each is (B, C, S), concat gives (B, C, T_total_frames) - all_codes = torch.cat(state.all_predictions, dim=-1) # (B, C, T_total_frames) - total_frames = all_codes.size(-1) - num_codebooks = all_codes.size(1) - - # Start and end indices are in frames (not steps) - # If start_idx is -1, item never started audio predictions - use 0 - # If end_idx is -1, item never ended - use total_frames - start_indices = torch.clamp(state.audio_prediction_start_idx, min=0) - end_indices = torch.where( - state.audio_prediction_end_idx >= 0, - state.audio_prediction_end_idx, - torch.full_like(state.audio_prediction_end_idx, total_frames), - ) - - # Calculate per-item lengths (in frames) - predicted_codes_lens = end_indices - start_indices - max_len = predicted_codes_lens.max().item() - - # Handle case where all items have zero-length predictions - if max_len == 0: - return StreamingFinalizeOutput( - audio=torch.zeros(batch_size, 0, device=state.device), - audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), - audio_codes=torch.zeros(batch_size, num_codebooks, 0, device=state.device, dtype=all_codes.dtype), - audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device), - phoneme_tokens=phoneme_tokens_list, - phoneme_text=phoneme_text_list, - ) - - # Create padded output tensor and slice each item's valid predictions - predicted_codes = torch.zeros( - batch_size, num_codebooks, max_len, dtype=all_codes.dtype, device=state.device - ) - for i in range(batch_size): - start = start_indices[i].item() - end = end_indices[i].item() - length = end - start - if length > 0: - predicted_codes[i, :, :length] = all_codes[i, :, start:end] - - # No need to remove EOS - end_indices already point to the frame before EOS - # Decode to audio (codes are already unstacked: B, C, T) - audio, audio_len, decoded_codes = self.codes_to_audio(predicted_codes, predicted_codes_lens) - - 
return StreamingFinalizeOutput( - audio=audio, - audio_len=audio_len, - audio_codes=predicted_codes, - audio_codes_len=predicted_codes_lens, - phoneme_tokens=phoneme_tokens_list, - phoneme_text=phoneme_text_list, - ) - - def infer_batch( - self, - batch: Dict[str, torch.Tensor], - max_decoder_steps: int = 500, - temperature: float = 0.7, - topk: int = 80, - use_cfg: bool = False, - cfg_scale: float = 1.0, - use_local_transformer_for_inference: bool = False, - phoneme_input_type: str = 'pred', - phoneme_sampling_method: str = 'argmax', - force_dropout_text: bool = False, - use_teacher_forced: bool = False, - use_inference_mode: bool = True, - ) -> InferBatchOutput: - """ - Batch inference using streaming infrastructure. - - This is a simple wrapper around streaming_init, streaming_step, and streaming_finalize - that processes a batch dictionary similar to training_step/validation_step. - - Args: - batch: Dictionary containing: - - text: Text token IDs (B, L) - - text_lens: Lengths (B,) - - context_text_tokens: Context text tokens (B, L') - - context_text_tokens_lens: Lengths (B,) - - context_audio_codes: Context audio codes (B, C, T) OR - - context_audio / context_audio_lens: Raw context audio to encode - - phoneme_tokens (optional): GT phoneme tokens (B, L'') - - phoneme_tokens_lens (optional): Lengths (B,) - For teacher forcing (use_teacher_forced=True), also requires: - - audio_codes / audio_codes_lens: GT audio codes (B, C, T) OR - - audio / audio_lens: Raw audio waveforms to encode - max_decoder_steps: Maximum number of decoder steps. - temperature: Sampling temperature for audio codes. Use 0.0 for argmax. - topk: Top-k sampling parameter. - use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor. - use_local_transformer_for_inference: Whether to use local transformer. - phoneme_input_type: 'gt' or 'pred' for phoneme tokens. - phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. 
- force_dropout_text: Whether to dropout text embeddings. - use_teacher_forced: If True, feed GT audio codes (and force GT phonemes, argmax sampling) - instead of predicted codes at each streaming step. - - Returns: - InferBatchOutput containing predicted audio, codes, and RTF metrics. - """ - grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad - with grad_ctx(): - start_time = time.time() - - # Extract tensors from batch - text = batch['text'] - text_lens = batch['text_lens'] - context_text_tokens = batch['context_text_tokens'] - context_text_tokens_lens = batch['context_text_tokens_lens'] - - # Handle context audio - either use codes directly or encode from audio - if 'context_audio_codes' in batch: - context_audio_codes = batch['context_audio_codes'] - context_audio_codes_lens = batch['context_audio_codes_lens'] - else: - context_audio = batch['context_audio'] - context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - - # Optional GT phoneme tokens for teacher forcing - gt_phoneme_tokens = batch.get('phoneme_tokens') - gt_phoneme_tokens_lens = batch.get('phoneme_tokens_lens') - - # Prepare GT audio codes for teacher forcing if requested - gt_audio_codes_for_init = None - gt_audio_codes_lens_for_init = None - if use_teacher_forced: - # Force GT phoneme input and argmax sampling - phoneme_input_type = 'gt' - temperature = 0.0 - - # Get GT audio codes - support both codes and raw audio - if 'audio_codes' in batch: - gt_audio_codes_raw = batch['audio_codes'] - gt_audio_codes_lens_raw = batch['audio_codes_lens'] - elif 'audio' in batch: - gt_audio_codes_raw, gt_audio_codes_lens_raw = self.audio_to_codes( - batch['audio'], batch['audio_lens'] - ) - else: - raise ValueError( - "Teacher forcing requires 'audio_codes'/'audio_codes_lens' or 'audio'/'audio_lens' in batch." 
- ) - - # Pre-process GT audio codes same as prepare_audio_channel_embeddings: - # codec convert, add BOS/EOS, stack, then take input portion ([:, :, :-1]) - if self._codec_converter is not None: - gt_audio_codes_raw = self._codec_converter.convert_original_to_new( - audio_tokens=gt_audio_codes_raw, audio_lens=gt_audio_codes_lens_raw - ).long() - - gt_audio_codes_processed, gt_audio_codes_lens_processed = self.add_special_tokens( - codes=gt_audio_codes_raw, - codes_len=gt_audio_codes_lens_raw, - bos_id=self.audio_bos_id, - eos_id=self.audio_eos_id, - ) - gt_audio_codes_processed, gt_audio_codes_lens_processed = self.stack_codes( - gt_audio_codes_processed, - gt_audio_codes_lens_processed, - self.audio_bos_id, - self.audio_eos_id, - self.frame_stacking_factor, - self.num_audio_codebooks, - ) - - # Input portion: all tokens except the last (teacher forcing shift) - gt_audio_codes_for_init = gt_audio_codes_processed[:, :, :-1] - gt_audio_codes_lens_for_init = gt_audio_codes_lens_processed - 1 - - batch_size = text.size(0) - - # Initialize streaming state - state = self.streaming_init( - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer=use_local_transformer_for_inference, - temperature=temperature, - topk=topk, - phoneme_input_type=phoneme_input_type, - phoneme_sampling_method=phoneme_sampling_method, - gt_phoneme_tokens=gt_phoneme_tokens, - gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, - gt_audio_codes=gt_audio_codes_for_init, - gt_audio_codes_lens=gt_audio_codes_lens_for_init, - use_inference_mode=use_inference_mode, - ) - - time_to_first_prediction = None - generation_start_time = time.time() - device = text.device - - # Generate until all items are finished or max steps reached - print("Generation started") - gen_step = 0 - while not state.finished.all() and 
len(state.all_predictions) < max_decoder_steps: - gen_step += 1 - if gen_step % 10 == 0: - print(f"Generation step {gen_step} ") - # Gather the correct text token for each batch item based on text_tokens_seen - # Items in context phase will have their token ignored by streaming_step - positions = state.text_tokens_seen.clamp(max=text.size(1) - 1) - current_tokens = text[torch.arange(batch_size, device=device), positions] - - # For items that have exhausted their text, provide EOS token - text_exhausted = state.text_tokens_seen >= text_lens - current_tokens = torch.where( - text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens - ) - - state, audio_codes, phoneme_tokens = self.streaming_step( - state=state, - text_tokens=current_tokens, - force_dropout_text=force_dropout_text, - use_inference_mode=use_inference_mode, - ) - - # Record time to first audio prediction - if time_to_first_prediction is None and audio_codes is not None: - time_to_first_prediction = time.time() - start_time - - tts_generation_time = time.time() - generation_start_time - - # Finalize and decode audio - finalize_output = self.streaming_finalize(state, use_inference_mode=use_inference_mode) - - end_time = time.time() - total_time = end_time - start_time - - # Compute RTF metrics - total_audio_samples = finalize_output.audio_len.sum().item() - total_audio_duration = total_audio_samples / self.output_sample_rate - num_frames = len(state.all_predictions) - tts_generation_time_per_frame = tts_generation_time / num_frames if num_frames > 0 else 0.0 - - rtf_metrics = { - 'rtf': total_audio_duration / total_time if total_time > 0 else 0.0, - 'time_to_first_prediction': time_to_first_prediction, - 'tts_generation_time': tts_generation_time, - 'max_frames_generated': num_frames, - 'tts_generation_time_per_frame': tts_generation_time_per_frame, - 'batch_size': batch_size, - } - - # Extract raw phoneme predictions from state - ib_phoneme_tokens = None - ib_phoneme_tokens_lens = None 
- if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: - # Stack: each element is (B, phoneme_stacking_factor), stack along time -> (B, S, T) - ib_phoneme_tokens = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) - # Compute per-item lengths using start/end indices - ib_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) - for i in range(batch_size): - start = max(0, state.phoneme_prediction_start_idx[i].item()) - end = state.phoneme_prediction_end_idx[i].item() - if end < 0: - end = ib_phoneme_tokens.size(-1) - ib_phoneme_tokens_lens[i] = end - start - - return InferBatchOutput( - predicted_audio=finalize_output.audio, - predicted_audio_lens=finalize_output.audio_len, - predicted_codes=finalize_output.audio_codes, - predicted_codes_lens=finalize_output.audio_codes_len, - rtf_metrics=rtf_metrics, - predicted_phoneme_tokens=ib_phoneme_tokens, - predicted_phoneme_tokens_lens=ib_phoneme_tokens_lens, - phoneme_prediction_start_idx=( - state.phoneme_prediction_start_idx.clone() if ib_phoneme_tokens is not None else None - ), - ) - - @staticmethod - def _load_audio_for_inference(audio_path: str, target_sample_rate: int) -> torch.Tensor: - """ - Load context audio and resample if needed. - Returns tensor of shape (1, num_samples). - """ - audio, sr = sf.read(audio_path, dtype='float32') - if len(audio.shape) > 1: - audio = audio.mean(axis=1) - if sr != target_sample_rate: - import librosa - - audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) - return torch.from_numpy(audio).unsqueeze(0) - - @staticmethod - def _adjust_audio_to_duration_for_inference( - audio: torch.Tensor, - sample_rate: int, - target_duration: float, - codec_model_samples_per_frame: int, - ) -> torch.Tensor: - """ - Match the same duration-alignment logic used in magpietts_streaming_inference.py. 
- """ - num_codec_frames = int(target_duration * sample_rate / codec_model_samples_per_frame) - target_num_samples = num_codec_frames * codec_model_samples_per_frame - current_num_samples = audio.size(1) - - if current_num_samples >= target_num_samples: - audio = audio[:, :target_num_samples] - else: - num_repeats = int(np.ceil(target_num_samples / current_num_samples)) - audio_repeated = audio.repeat(1, num_repeats) - audio = audio_repeated[:, :target_num_samples] - return audio - - def do_tts( - self, - transcript: str, - context_audio_file_path: Optional[str] = None, - context_text: str = "[NO TEXT CONTEXT]", - main_tokenizer_name: Optional[str] = None, - context_audio_duration: float = 5.0, - use_cfg: bool = True, - cfg_scale: float = 2.5, - use_local_transformer: bool = True, - temperature: float = 0.7, - topk: int = 80, - max_steps: int = 330, - gt_phoneme_text: Optional[str] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Generate speech from transcript using EasyMagpie inference with optional context text/audio. - Optionally accepts ground-truth phoneme text (IPA string) for decoder-only inference. - """ - if transcript is None or transcript.strip() == "": - raise ValueError("`transcript` must be a non-empty string.") - - device = next(self.parameters()).device - transcript = transcript.strip() - context_text = (context_text or "[NO TEXT CONTEXT]").strip() - - if main_tokenizer_name is None: - # Match model init behavior: default to first configured tokenizer. - main_tokenizer_name = list(self.cfg.text_tokenizers.keys())[0] - if main_tokenizer_name not in self.tokenizer.tokenizers: - raise ValueError( - f"Unknown main_tokenizer_name='{main_tokenizer_name}'. 
" - f"Available tokenizers: {list(self.tokenizer.tokenizers.keys())}" - ) - - text_tokens = self.tokenizer.encode(transcript, tokenizer_name=main_tokenizer_name) + [self.eos_id] - text = torch.tensor([text_tokens], dtype=torch.long, device=device) - text_lens = torch.tensor([len(text_tokens)], dtype=torch.long, device=device) - - context_text_tokens = self.tokenizer.encode(context_text, tokenizer_name=self.text_conditioning_tokenizer_name) - context_text_tensor = torch.tensor([context_text_tokens], dtype=torch.long, device=device) - context_text_lens = torch.tensor([len(context_text_tokens)], dtype=torch.long, device=device) - - if context_audio_file_path is not None and context_audio_file_path.strip() != "": - context_audio = self._load_audio_for_inference(context_audio_file_path, self.sample_rate) - context_audio = self._adjust_audio_to_duration_for_inference( - context_audio, - self.sample_rate, - context_audio_duration, - self.codec_model_samples_per_frame, - ) - context_audio = context_audio.to(device) - context_audio_lens = torch.tensor([context_audio.size(1)], dtype=torch.long, device=device) - with torch.inference_mode(): - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) - else: - context_audio_codes = torch.zeros( - 1, - self.data_num_audio_codebooks, - 0, - dtype=torch.long, - device=device, - ) - context_audio_codes_lens = torch.zeros(1, dtype=torch.long, device=device) - - batch = { - 'text': text, - 'text_lens': text_lens, - 'context_text_tokens': context_text_tensor, - 'context_text_tokens_lens': context_text_lens, - 'context_audio_codes': context_audio_codes, - 'context_audio_codes_lens': context_audio_codes_lens, - } - phoneme_input_type = 'pred' - if gt_phoneme_text is not None: - if self.phoneme_tokenizer is None: - raise ValueError( - "Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided." 
- ) - gt_phoneme_text = gt_phoneme_text.strip() - if gt_phoneme_text == "": - raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") - gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) - gt_phoneme_tokens = ( - [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] - ) - if len(gt_phoneme_tokens) == 0: - raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") - batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) - batch['phoneme_tokens_lens'] = torch.tensor([len(gt_phoneme_tokens)], dtype=torch.long, device=device) - phoneme_input_type = 'gt' - - with torch.inference_mode(): - output = self.infer_batch( - batch=batch, - max_decoder_steps=max_steps, - temperature=temperature, - topk=topk, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer_for_inference=use_local_transformer, - phoneme_input_type=phoneme_input_type, - phoneme_sampling_method='argmax', - use_teacher_forced=False, - use_inference_mode=True, - ) - return output.predicted_audio, output.predicted_audio_lens - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - return [] diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py new file mode 100644 index 000000000000..5bab45559174 --- /dev/null +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -0,0 +1,2018 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time +from dataclasses import dataclass +from functools import partial +from typing import Any, Dict, List, Optional, Sequence, Tuple + +import numpy as np +import soundfile as sf +import torch +from hydra.utils import instantiate +from lightning.pytorch import Trainer +from omegaconf import DictConfig +from torch import nn +from transformers import AutoConfig, AutoModelForCausalLM + +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( + instantiate_phoneme_tokenizer, + setup_tokenizers, +) +from nemo.collections.tts.models import AudioCodecModel +from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel +from nemo.collections.tts.modules import transformer_2501 +from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter +from nemo.collections.tts.modules.magpietts_modules import ( + CharAwareSubwordEncoder, + LocalTransformerType, + SpecialAudioToken, +) +from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.core.classes.common import PretrainedModelInfo +from nemo.utils import logging + + +@dataclass +class TrainingMode: + """ + Configuration for a training mode in multi-mode training. 
+ + Attributes: + text_input_mode: Either "full" or "streaming" + streaming_phonemes_delay: Delay for phoneme stream (only used in streaming mode) + streaming_speech_delay: Delay for speech stream (only used in streaming mode) + mode_idx: Index of this mode in the list of modes (used for task embedding lookup) + """ + + text_input_mode: str + streaming_phonemes_delay: int + streaming_speech_delay: int + mode_idx: int + + @property + def name(self) -> str: + """Derived identifier used for inference selection and logging.""" + return f"{self.text_input_mode}_{self.streaming_phonemes_delay}_{self.streaming_speech_delay}" + + +@dataclass +class StreamingState: + """ + State for streaming TTS inference with batch support. + + This dataclass maintains all the necessary state for autoregressive streaming + generation, allowing text tokens to be fed incrementally. Supports arbitrary + batch sizes where each batch item can have different context lengths and be + in different phases. + + The streaming operates in four phases (per batch item): + 1. Context phase (context_position < full_context_lens): Processing remaining context + 2. Prompt phase (text_tokens_seen < phoneme_delay): Only text, no predictions + 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): Phoneme predictions only + 4. Audio phase (text_tokens_seen >= speech_delay): Both phoneme and audio predictions + + Attributes: + batch_size: Number of items in the batch. + past_key_values: KV cache from the transformer for efficient autoregressive decoding. + cache_seq_len: Current sequence length in the cache. + all_predictions: List of predicted audio codes at each timestep, each tensor is (B, C, S) unstacked. + all_phoneme_predictions: List of predicted phoneme tokens at each timestep, each tensor is (B, phoneme_stacking_factor). + context_audio_codes: Processed context audio codes with special tokens. + context_audio_codes_lens: Length of context audio codes. 
+ context_lens: Total context length (task_embedding + context_audio + context_text). + full_context_embedding: Full context embedding for each batch item (B, T_max_context, E). + full_context_lens: Full context length for each batch item (B,). + context_position: How much context has been processed per batch item (B,). + text_tokens_seen: Number of text tokens processed so far per batch item (B,). + phoneme_steps: Number of phoneme prediction steps taken per batch item (B,). + audio_steps: Number of audio prediction steps taken per batch item (B,). + phoneme_stream_ended: Whether the phoneme stream has ended per batch item (B,) bool tensor. + phoneme_eos_detected: Whether the phoneme EOS has been predicted per batch item (B,) bool tensor. + finished: Whether generation is complete per batch item (B,) bool tensor. + device: Device tensors are on. + training_mode: The training mode being used for inference. + use_cfg: Whether classifier-free guidance is enabled. + cfg_scale: CFG scale factor. + use_local_transformer: Whether to use local transformer for inference. + temperature: Sampling temperature. + topk: Top-k sampling parameter. + dummy_context_embedding_unconditional: Unconditional embedding for CFG (if enabled). + last_hidden: Last hidden state from transformer. + text_finished: Whether text input has finished per batch item (B,) bool tensor. + phoneme_input_type: 'gt' or 'pred' for phoneme tokens. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + last_phoneme_tokens: Last predicted phoneme tokens (B, phoneme_stacking_factor). + last_audio_codes: Last predicted audio codes (B, num_codebooks). + audio_prediction_start_idx: Global frame index where audio predictions start per batch item (B,). + audio_prediction_end_idx: Global frame index where audio predictions end per batch item (B,), -1 if not ended. + phoneme_prediction_start_idx: Global step index where phoneme predictions start per batch item (B,). 
+ phoneme_prediction_end_idx: Global step index where phoneme predictions end per batch item (B,), -1 if not ended. + """ + + batch_size: int + past_key_values: Optional[Tuple] + cache_seq_len: int + all_predictions: List[torch.Tensor] + all_phoneme_predictions: List[torch.Tensor] + context_audio_codes: torch.Tensor + context_audio_codes_lens: torch.Tensor + context_lens: torch.Tensor + full_context_embedding: torch.Tensor + full_context_lens: torch.Tensor + context_position: torch.Tensor + text_tokens_seen: torch.Tensor + phoneme_steps: torch.Tensor + audio_steps: torch.Tensor + phoneme_stream_ended: torch.Tensor + phoneme_eos_detected: torch.Tensor + finished: torch.Tensor + device: torch.device + training_mode: TrainingMode + use_cfg: bool + cfg_scale: float + use_local_transformer: bool + temperature: float + topk: int + dummy_context_embedding_unconditional: Optional[torch.Tensor] + last_hidden: torch.Tensor + text_finished: torch.Tensor + phoneme_input_type: str + phoneme_sampling_method: str + last_phoneme_tokens: Optional[torch.Tensor] + last_audio_codes: Optional[torch.Tensor] + audio_prediction_start_idx: torch.Tensor + audio_prediction_end_idx: torch.Tensor + phoneme_prediction_start_idx: torch.Tensor + phoneme_prediction_end_idx: torch.Tensor + gt_phoneme_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT embeddings + gt_phoneme_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking + gt_audio_embeddings: Optional[torch.Tensor] = None # (B, T', E) pre-computed GT audio embeddings + gt_audio_lens: Optional[torch.Tensor] = None # (B,) lengths after stacking + + +@dataclass +class StreamingFinalizeOutput: + """Output from streaming_finalize containing audio and phoneme predictions.""" + + audio: torch.Tensor # (B, max_audio_len) generated audio waveform + audio_len: torch.Tensor # (B,) length of audio per batch item + audio_codes: torch.Tensor # (B, num_codebooks, T) generated audio codes + audio_codes_len: torch.Tensor # 
(B,) length of codes per batch item + phoneme_tokens: List[List[int]] # List of phoneme token sequences per batch item + phoneme_text: List[str] # Decoded phoneme strings per batch item + + +@dataclass +class InferBatchOutput: + """Output dataclass for EasyMagpieTTS infer_batch method.""" + + predicted_audio: torch.Tensor # (B, T_audio) + predicted_audio_lens: torch.Tensor # (B,) + predicted_codes: torch.Tensor # (B, num_codebooks, T_frames) + predicted_codes_lens: torch.Tensor # (B,) + rtf_metrics: Dict[str, Any] + predicted_phoneme_tokens: Optional[torch.Tensor] = None # (B, phoneme_stacking_factor, T_phoneme_steps) + predicted_phoneme_tokens_lens: Optional[torch.Tensor] = None # (B,) number of valid phoneme steps per item + phoneme_prediction_start_idx: Optional[torch.Tensor] = None # (B,) start index into predicted_phoneme_tokens + + +class EasyMagpieTTSInferenceModel(BaseMagpieTTSModel): + """ + Inference-only base class for EasyMagpieTTS decoder-only model. + + Contains the model architecture (codec, embeddings, decoder, local transformer), + shared building-block methods, and all inference methods (streaming_init, + streaming_step, streaming_finalize, infer_batch, do_tts). + + EasyMagpieTTSModel subclasses this to add training, validation, and data loading. 
+ """ + + def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): + self.world_size = 1 + if trainer is not None: + self.world_size = trainer.num_nodes * trainer.num_devices + + # load codec + codec_model = AudioCodecModel.restore_from(cfg.get('codecmodel_path'), strict=False) + self.sample_rate = codec_model.sample_rate + self.output_sample_rate = codec_model.output_sample_rate + + if hasattr(codec_model, "discriminator"): + # del codec discriminator to free memory + del codec_model.discriminator + + # Set up codebook configuration + vector_quantizer = cfg.get('vector_quantizer') + if vector_quantizer is not None: + vector_quantizer = instantiate(vector_quantizer) + num_audio_codebooks = vector_quantizer.num_codebooks + codebook_size = vector_quantizer.codebook_size + codec_converter = VectorQuantizerIndexConverter( + vector_quantizer_original=codec_model.vector_quantizer, + vector_quantizer_new=vector_quantizer, + ) + data_num_audio_codebooks = codec_model.vector_quantizer.num_codebooks + else: + num_audio_codebooks = codec_model.num_codebooks + data_num_audio_codebooks = num_audio_codebooks + codebook_size = codec_model.codebook_size + codec_converter = None + + # The dataloader needs to know the number of codebooks that the context codes were stored in + # In the case where there are no context codes saved, and there is no context audio (in the text context path), + # We create a dummy context code tensor that is only [context_BOS, context_EOS] that is repeated for + # data_num_audio_codebooks + self.data_num_audio_codebooks = data_num_audio_codebooks + self.num_audio_codebooks = num_audio_codebooks + self.codebook_size = codebook_size + + self.codec_model_samples_per_frame = codec_model.samples_per_frame + # Our codebooks start with actual audio codec tokens, followed by special tokens. + # The `forced_*` options are for backward compatibility for models trained with older code. 
+ get_token_index = partial(SpecialAudioToken.get_index, base_codebook_size=self.codebook_size) + self.audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_BOS) + self.audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_EOS) + self.context_audio_bos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_BOS) + self.context_audio_eos_id = get_token_index(SpecialAudioToken.AUDIO_CONTEXT_EOS) + self.mask_token_id = get_token_index(SpecialAudioToken.MASK_TOKEN) + self.num_all_tokens_per_codebook = self.codebook_size + len(SpecialAudioToken) + self.use_bpe_char_tokenizer = cfg.get('use_bpe_char_tokenizer', False) + + # If specified, use this as the text conditioning tokenizer. Otherwise, use the first tokenizer. + self.text_conditioning_tokenizer_name = cfg.get('text_conditioning_tokenizer_name', None) + if self.text_conditioning_tokenizer_name is None: + self.text_conditioning_tokenizer_name = list(cfg.text_tokenizers.keys())[0] + + self.cfg_unconditional_prob = cfg.get('cfg_unconditional_prob', 0.0) + + # Multi-mode training configuration + # The model trains with multiple text input modes (full, streaming with various delays) + # Each mode has its own task embedding that is prepended to the context + training_modes_cfg = cfg.get('training_modes', None) + if training_modes_cfg is None: + # Create a default training mode for backward compatibility + self.training_modes = [ + TrainingMode( + text_input_mode="streaming", + streaming_phonemes_delay=4, + streaming_speech_delay=8, + mode_idx=0, + ) + ] + + else: + self.training_modes = [] + for mode_idx, mode_cfg in enumerate(training_modes_cfg): + mode = TrainingMode( + text_input_mode=mode_cfg.text_input_mode, + streaming_phonemes_delay=mode_cfg.get('streaming_phonemes_delay', 0), + streaming_speech_delay=mode_cfg.get('streaming_speech_delay', 0), + mode_idx=mode_idx, + ) + self.training_modes.append(mode) + + logging.info(f"Multi-mode training with {len(self.training_modes)} modes:") + for mode in self.training_modes: 
+ logging.info( + f" - {mode.name}: text_input_mode={mode.text_input_mode}, " + f"streaming_phonemes_delay={mode.streaming_phonemes_delay}, " + f"streaming_speech_delay={mode.streaming_speech_delay}" + ) + + # Create a mapping from mode name to mode object for easy lookup during inference + self.mode_name_to_mode = {mode.name: mode for mode in self.training_modes} + # Default mode for inference if not specified (first mode in the list) + self.default_inference_mode = self.training_modes[0].name + + self.frame_stacking_factor = cfg.get('frame_stacking_factor', 1) + + self.tokenizer = setup_tokenizers( + all_tokenizers_config=cfg.text_tokenizers, + mode='train', + ) + + num_tokens_tokenizer = len(self.tokenizer.tokens) + num_tokens = num_tokens_tokenizer + 3 # +3 for BOS, EOS, CFG_UNK + self.bos_id = num_tokens - 3 + self.eos_id = num_tokens - 2 + self.cfg_unk_token_id = num_tokens - 1 + self.phoneme_tokenizer = None + if cfg.get('phoneme_tokenizer', None) is not None: + self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) + self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) + self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size + if cfg.get('phoneme_corruption_batch_prob', None) is None: + # Legacy mode: remove the UNK token from the phoneme vocabulary + # TODO: Remove this. + self.phoneme_vocab_size -= 1 + # If max phoneme probability is below this threshold at inference-time, + # replace the predicted timestep with UNK to reduce error propagation. 
+ self.phoneme_confidence_unk_threshold = cfg.get('phoneme_confidence_unk_threshold', 0.0) + + self.pad_context_text_to_max_duration = False + self.add_language_to_context_text = cfg.get('add_language_to_context_text', False) + + super().__init__(cfg=cfg, trainer=trainer) + + # This needs to happen after super().__init__() + self._codec_model = codec_model + self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() + self._codec_converter = codec_converter + + # Audio embedding dimension - can be smaller than hidden_dim to reduce parameters + self.audio_embedding_dim = cfg.get('audio_embedding_dim', cfg.hidden_dim) + + audio_embeddings = [] + for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): + audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, self.audio_embedding_dim)) + self.audio_embeddings = nn.ModuleList(audio_embeddings) + + # Projection from audio_embedding_dim to embedding_dim (Identity if same) + if self.audio_embedding_dim != cfg.embedding_dim: + self.audio_in_projection = nn.Linear(self.audio_embedding_dim, cfg.embedding_dim) + else: + self.audio_in_projection = nn.Identity() + + if self.phoneme_tokenizer is not None: + phoneme_embeddings = [] + for _ in range(self.phoneme_stacking_factor): + phoneme_embeddings.append(nn.Embedding(self.phoneme_vocab_size, cfg.embedding_dim)) + self.phoneme_embeddings = nn.ModuleList(phoneme_embeddings) + self.phoneme_final_proj = nn.Linear(cfg.hidden_dim, self.phoneme_vocab_size * self.phoneme_stacking_factor) + + # Decoder backend selection - supports HuggingFace models or NemotronH + self.decoder_type = cfg.get('decoder_type', 'huggingface') # backward compatible default + logging.info(f"Using decoder type: {self.decoder_type}") + + if self.decoder_type == 'huggingface': + # Existing HuggingFace path + self.transformer_backend_config = AutoConfig.from_pretrained( + cfg.transformer_hf_backend, + trust_remote_code=True, + ) + hf_transformer = 
AutoModelForCausalLM.from_config(self.transformer_backend_config) + self.decoder = hf_transformer.model + self.lm_text_head = hf_transformer.lm_head + + elif self.decoder_type == 'nemotron_h': + # NemotronH hybrid Mamba2/Attention backend + from nemo.collections.tts.modules.nemotron_h_decoder import NemotronHConfig, NemotronHForCausalLM + + # Build config from YAML parameters + nemotron_h_config_dict = dict(cfg.get('nemotron_h_config', {})) + # Ensure hidden_size matches embedding_dim for compatibility + if 'hidden_size' not in nemotron_h_config_dict: + nemotron_h_config_dict['hidden_size'] = cfg.embedding_dim + nemotron_config = NemotronHConfig(**nemotron_h_config_dict) + nemotron_model = NemotronHForCausalLM(nemotron_config) + self.decoder = nemotron_model.backbone + self.lm_text_head = nemotron_model.lm_head + logging.info( + f"NemotronH config: {nemotron_config.num_hidden_layers} layers, pattern={nemotron_config.hybrid_override_pattern[:20]}..." + ) + + else: + raise ValueError(f"Unknown decoder_type: {self.decoder_type}. 
Supported: 'huggingface', 'nemotron_h'") + + self.text_embedding = nn.Embedding(num_tokens, cfg.embedding_dim) + self.decoder.set_input_embeddings(self.text_embedding) + + # Task embedding for multi-mode training + # Each mode has a unique task embedding that is prepended to the context + # Only create task embedding if there are multiple modes + num_modes = len(self.training_modes) + if num_modes > 1: + self.task_embedding = nn.Embedding(num_modes, cfg.embedding_dim) + logging.info(f"Created task embedding with {num_modes} modes, embedding_dim={cfg.embedding_dim}") + else: + self.task_embedding = None + logging.info(f"Single training mode '{self.training_modes[0].name}', skipping task embedding") + + if self.use_bpe_char_tokenizer: + # BPE char tokenizer + assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" + tokenizer_name = self.tokenizer.tokenizer_names[0] + tokenizer = self.tokenizer.tokenizers[tokenizer_name] + subword_vocab = tokenizer.get_vocab() + # special tokens will be stored as it is in the char_vocab + # Each special token will only be mapped to one char id + special_vocab = { + '': self.bos_id, + '': self.eos_id, + '': self.cfg_unk_token_id, + } + self.cas_encoder = CharAwareSubwordEncoder( + d_embed=cfg.embedding_dim, + llm_tokenizer_vocab=subword_vocab, + subword_padding_idx=self.tokenizer.pad, + special_vocab=special_vocab, + ) + + # Projection from hidden_dim to audio_embedding_dim before final_proj (Identity if same) + if self.audio_embedding_dim != cfg.hidden_dim: + self.audio_out_projection = nn.Linear(cfg.hidden_dim, self.audio_embedding_dim) + else: + self.audio_out_projection = nn.Identity() + + self.final_proj = nn.Linear( + self.audio_embedding_dim, + self.num_audio_codebooks * self.num_all_tokens_per_codebook * self.frame_stacking_factor, + ) + + self.local_transformer_type = LocalTransformerType(cfg.get('local_transformer_type', 'none').lower()) + logging.info(f"Local transformer type: 
{self.local_transformer_type}") + if self.local_transformer_type != LocalTransformerType.NO_LT: + local_transformer_hidden_dim = cfg.get('local_transformer_hidden_dim', 256) + if local_transformer_hidden_dim != cfg.hidden_dim: + self.local_transformer_in_projection = nn.Linear(cfg.hidden_dim, local_transformer_hidden_dim) + else: + self.local_transformer_in_projection = nn.Identity() + self.local_transformer = transformer_2501.Transformer( + n_layers=self.cfg.get('local_transformer_n_layers', 2), + d_model=local_transformer_hidden_dim, + d_ffn=local_transformer_hidden_dim * 4, + sa_n_heads=self.cfg.get('local_transformer_n_heads', 1), + kernel_size=1, + is_causal=self.local_transformer_type == LocalTransformerType.AR, + max_length_causal_mask=self.num_audio_codebooks * self.frame_stacking_factor + 2, + use_learnable_pos_emb=True, + ) + # Projection from local_transformer_hidden_dim to audio_embedding_dim (Identity if same) + if self.audio_embedding_dim != local_transformer_hidden_dim: + self.local_transformer_audio_out_projection = nn.Linear( + local_transformer_hidden_dim, self.audio_embedding_dim + ) + else: + self.local_transformer_audio_out_projection = nn.Identity() + local_transformer_out_projections = [] + for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): + # Have a separate projection layer for each codebook, to distinguish between them + local_transformer_out_projections.append( + nn.Linear(self.audio_embedding_dim, self.num_all_tokens_per_codebook) + ) + self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + + def _get_state_dict_keys_to_exclude(self): + return [ + '_codec_model', + ] + + def codes_to_audio(self, codes, codes_len): + # codes: (B, C, T') + self._codec_model.eval() + if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: + codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) + + with 
torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): + if self._codec_converter is not None: + codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) + if codes_len.min() < 4: + codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) + codes_len = torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) + codes = codes[:, :, : codes_len.max()] + + audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) + return audio, audio_len, codes + + def embed_audio_tokens(self, audio_tokens): + # audio_tokens: (B, C, T') + # Add and average the embeddings of the audio tokens across the codebooks + audio_embedding = None + for c in range(audio_tokens.size(1)): + embedding = self.audio_embeddings[c](audio_tokens[:, c, :]) + if audio_embedding is None: + audio_embedding = embedding + else: + audio_embedding = audio_embedding + embedding + audio_embedding = audio_embedding / audio_tokens.size(1) + # Project from audio_embedding_dim to embedding_dim + audio_embedding = self.audio_in_projection(audio_embedding) + return audio_embedding + + def embed_phoneme_tokens(self, phoneme_tokens): + # phoneme_tokens: (B, S, T') + phoneme_embedding = None + for c in range(phoneme_tokens.size(1)): + embedding = self.phoneme_embeddings[c](phoneme_tokens[:, c, :]) + if phoneme_embedding is None: + phoneme_embedding = embedding + else: + phoneme_embedding = phoneme_embedding + embedding + phoneme_embedding = phoneme_embedding / phoneme_tokens.size(1) + return phoneme_embedding + + def forward(self, inputs_embeds, attention_mask, use_cache=False, past_key_values=None, cache_position=None): + # Only pass cache_position for NemotronH (HF transformers may not accept it) + if self.decoder_type == 'nemotron_h': + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + 
cache_position=cache_position, + ) + else: + backend_out = self.decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + ) + return backend_out + + def logits_to_audio_codes(self, all_code_logits, audio_codes_lens): + # all_code_logits: (B, T', num_codebooks * num_tokens_per_codebook) + # audio_codes_lens: (B,) + all_preds = [] + for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): + si = idx * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = all_code_logits[:, :, si:ei] + codebook_probs = torch.softmax(codebook_logits, dim=-1) # (B, T', num_tokens_per_codebook) + # argmax to get the tokens + codebook_preds = torch.argmax(codebook_probs, dim=-1) # (B, T') + all_preds.append(codebook_preds) + + all_preds = torch.stack(all_preds, dim=1) # (B, C, T') + audio_mask = get_mask_from_lengths(audio_codes_lens) + all_preds = all_preds * audio_mask.unsqueeze(1) + + return all_preds + + def sample_codes_from_logits( + self, all_code_logits_t, temperature=0.7, topk=80, unfinished_items={}, finished_items={} + ): + # all_code_logits_t: (B, num_codebooks * num_tokens_per_codebook), logits at a given timestep + all_preds = [] + for idx in range(self.num_audio_codebooks * self.frame_stacking_factor): + si = idx * self.num_all_tokens_per_codebook + ei = si + self.num_all_tokens_per_codebook + codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + for item_idx in unfinished_items: + codebook_logits[item_idx, self.audio_eos_id] = float('-inf') + for item_idx in finished_items: + codebook_logits[item_idx, :] = float('-inf') + codebook_logits[item_idx, self.audio_eos_id] = 0.0 + 
codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # (B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + all_preds.append(codebook_preds) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + return all_preds + + def sample_codes_from_logits_phoneme(self, all_code_logits_t, temperature=0.7, topk=80): + # all_code_logits_t: (B, phoneme_stacking_factor * phoneme_vocab_size), logits at a given timestep + all_preds = [] + for idx in range(self.phoneme_stacking_factor): + si = idx * self.phoneme_vocab_size + ei = si + self.phoneme_vocab_size + codebook_logits = all_code_logits_t[:, si:ei] # (B, num_tokens_per_codebook) + # Replace NaN/inf then clamp to prevent extreme values causing NaN in softmax + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) + indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( + -1 + ) # (B, num_tokens_per_codebook) + codebook_logits_rescored = codebook_logits.clone() + codebook_logits_rescored[indices_to_remove] = float('-inf') + + if temperature <= 0.0: + # Argmax sampling for deterministic output + codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) # (B, 1) + else: + codebook_probs = torch.softmax( + codebook_logits_rescored / temperature, dim=-1 + ) # 
(B, num_tokens_per_codebook) + codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) + all_preds.append(codebook_preds) + all_preds = torch.cat(all_preds, dim=1).long() # (B, num_codebooks) + return all_preds + + def join_embeddings_temporally( + self, + embeddings: Sequence[torch.Tensor], # [ (B, Ti, E), … ] + lengths: Sequence[torch.Tensor], # [ (B,), … ] same order/size as `embeddings` + pad_embed: torch.Tensor | None = None, # (E,) defaults to zeros + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Merges Multiple Embedding sequences into a single Embedding Sequence. + + Args: + embeddings : Sequence of tensors, each of shape (B, Ti, E) — batch, time, embedding + lengths : Sequence of tensors, each of shape (B,) + pad_embed : (E,) — embedding to use for padding, defaults to zeros + + Returns: + joined : (B, max_sum_len, E) — merged & padded + out_lengths : (B,) — total lengths of each batch element after merging + """ + if len(embeddings) == 0: + raise ValueError("contexts must be non-empty") + + B, _, E = embeddings[0].shape + device = embeddings[0].device + dtype = embeddings[0].dtype + + # 1. 
compute output sizes + len_stack = torch.stack(tuple(lengths), dim=0) # (N, B) + out_lengths = len_stack.sum(0) + max_len = int(out_lengths.max()) + + if pad_embed is None: + pad_embed = torch.zeros(E, dtype=dtype, device=device) + + joined = pad_embed.expand(B, max_len, E).clone() # (B,max_len,E) + + # batch row indices + batch_rows = torch.arange(B, device=device).unsqueeze(1) # (B,1) + + # running offset keeps "write cursor" for each row + offset = torch.zeros(B, dtype=torch.long, device=device) # (B,) + + for i, (embedding_i, len_i) in enumerate(zip(embeddings, lengths)): + Ti = embedding_i.shape[1] + t_idx = torch.arange(Ti, device=device) # (Ti,) + mask = t_idx.unsqueeze(0) < len_i.unsqueeze(1) # (B,Ti) + + # destination columns: offset + t + dest_cols = offset.unsqueeze(1) + t_idx # (B,Ti) + + # Assign embedding_i to the correct positions in joined + # Ensure dtype matches to avoid errors during mixed-precision training + joined[batch_rows.expand_as(mask)[mask], dest_cols[mask]] = embedding_i[mask].to(joined.dtype) + + # move cursor past this segment + offset += len_i + + return joined, out_lengths + + def prepare_context_tensors( + self, + context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + context_audio_codes: Optional[torch.Tensor] = None, + context_audio_codes_lens: Optional[torch.Tensor] = None, + context_audio: Optional[torch.Tensor] = None, + context_audio_lens: Optional[torch.Tensor] = None, + training_mode: Optional[TrainingMode] = None, + dropout_conditional_input: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Prepare context tensors (without text) for the simplified process_batch. + + This function processes context audio and context text to create the combined + context embedding. 
+ Args: + context_text_tokens: Context text token IDs for speaker/style conditioning (B, L) + context_text_tokens_lens: Length of context text for each batch item (B,) + context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). + If None, will be computed from context_audio. + context_audio_codes_lens: Length of context audio codes (B,). + Required if context_audio_codes is provided. + context_audio: Raw context audio waveform (B, T). + Used to compute context_audio_codes if not provided. + context_audio_lens: Length of context audio (B,). + Required if context_audio is provided. + training_mode: Optional TrainingMode object specifying the mode to use. + If None, uses the first mode from training_modes as default. + dropout_conditional_input: If True, replace context with CFG unconditional token. + + Returns: + Tuple of: + - context_embedding: Combined context embedding (B, T_context, E) + - context_lens: Total context length per batch item (B,) + - context_audio_codes: Processed audio codes with special tokens (B, C, T') + - context_audio_codes_lens: Length of processed context audio codes (B,) + """ + # Determine the mode parameters to use + if training_mode is None: + training_mode = self.training_modes[0] + + current_mode_idx = training_mode.mode_idx + batch_size = context_text_tokens.size(0) + device = context_text_tokens.device + + # Context Audio + if context_audio_codes is None: + if context_audio is None: + raise ValueError("Either context_audio_codes or context_audio must be provided") + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + + if self._codec_converter is not None: + context_audio_codes = self._codec_converter.convert_original_to_new( + audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens + ).long() + + context_audio_codes, context_audio_codes_lens = self.add_special_tokens( + codes=context_audio_codes, + codes_len=context_audio_codes_lens, + 
bos_id=self.context_audio_bos_id, + eos_id=self.context_audio_eos_id, + ) + + # Use legacy audio_bos_id/audio_eos_id if flag is set + stack_bos_id = ( + self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id + ) + stack_eos_id = ( + self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id + ) + + context_audio_codes, context_audio_codes_lens = self.stack_codes( + context_audio_codes, + context_audio_codes_lens, + stack_bos_id, + stack_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) + context_audio_embedded = self.embed_audio_tokens(context_audio_codes) # (B, T', E) + + # Context Text + context_text_lens = context_text_tokens_lens + context_text_embedded = self.decoder.get_input_embeddings()(context_text_tokens) # (B, L, E) + + # Prepare task embedding for multi-mode training + task_embedding = None + task_embedding_lens = None + if self.task_embedding is not None and current_mode_idx is not None: + mode_idx_tensor = torch.full((batch_size,), current_mode_idx, dtype=torch.long, device=device) + task_embedding = self.task_embedding(mode_idx_tensor).unsqueeze(1) # (B, 1, E) + task_embedding_lens = torch.ones(batch_size, dtype=torch.long, device=device) # (B,) + + # Combine context embeddings: [task_embedding | context_audio | context_text] + if task_embedding is not None: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[task_embedding, context_audio_embedded, context_text_embedded], + lengths=[task_embedding_lens, context_audio_codes_lens, context_text_lens], + ) + else: + context_embedding, context_lens = self.join_embeddings_temporally( + embeddings=[context_audio_embedded, context_text_embedded], + lengths=[context_audio_codes_lens, context_text_lens], + ) + + # Handle CFG unconditional dropout + if dropout_conditional_input: + cfg_token_id = self.cfg_unk_token_id + cfg_token_embedding = self.decoder.get_input_embeddings()( 
+ torch.full((batch_size, 1), cfg_token_id, device=device) + ) # (B, 1, E) + # Expand CFG token to match context embedding size + context_embedding = cfg_token_embedding.expand(-1, context_embedding.size(1), -1) # (B, T_context, E) + + return context_embedding, context_lens, context_audio_codes, context_audio_codes_lens + + def stack_codes(self, codes, codes_lens, bos_id, eos_id, stacking_factor, num_codebooks): + """ + Stack multiple time steps into the channel dimension to reduce sequence length. + + This function reshapes audio/phoneme codes by grouping consecutive time steps together + and placing them in the channel dimension. This allows the model to process multiple + frames in parallel while reducing the sequence length. + + Args: + codes: Input codes tensor of shape (B, C, T) where B is batch size, + C is number of codebooks, and T is sequence length. + codes_lens: Length of valid codes for each batch item, shape (B,). + bos_id: Beginning-of-sequence token ID used to detect and handle BOS tokens. + eos_id: End-of-sequence token ID used for padding. + stacking_factor: Number of time steps to stack together. If 1, no stacking is performed. + num_codebooks: Number of codebooks in the input. + + Returns: + Tuple of: + - stacked_codes: Reshaped codes of shape (B, C * stacking_factor, T // stacking_factor). + If input contains BOS tokens, they are preserved at the beginning. + - new_lens: Updated sequence lengths after stacking, shape (B,). 
+ """ + if stacking_factor == 1: + return codes, codes_lens + + contains_bos = codes[0, 0, 0].item() == bos_id + if contains_bos: + bos_tensor_repeated = torch.full( + (codes.size(0), (stacking_factor) * num_codebooks, 1), bos_id, device=codes.device + ) # (B,stacking_factor*C, 1) + codes = codes[:, :, 1:] # Remove the bos token + codes_lens = codes_lens - 1 # Remove the bos token + B, C, T = codes.shape + s = int(stacking_factor) + + # --- Compute max padding needed --- + pad_t = (-T) % s # pad so that T' is divisible by s + pad_tail = torch.full((B, C, pad_t), eos_id, dtype=codes.dtype, device=codes.device) + codes = torch.cat([codes, pad_tail], dim=-1) + + # --- Stack time into channel dimension --- + Tp = codes.shape[-1] + T_out = Tp // s + codes = codes.view(B, C, T_out, s) + codes = codes.permute(0, 1, 3, 2).reshape(B, C * s, T_out) + + new_lens = torch.div(codes_lens + s - 1, s, rounding_mode='floor') + if contains_bos: + codes = torch.cat([bos_tensor_repeated, codes], dim=2) + new_lens = new_lens + 1 + + return codes, new_lens + + def unstack_codes(self, stacked_codes, stacked_lens, stacking_factor): + """ + Reverse the stacking operation to recover the original time dimension. + + This is the inverse of `stack_codes`. It takes codes that have been stacked + in the channel dimension and expands them back into the time dimension. + + Args: + stacked_codes: Stacked codes tensor of shape (B, C * stacking_factor, T_stacked) + where T_stacked = T_original // stacking_factor. + stacked_lens: Length of valid stacked sequences for each batch item, shape (B,). + stacking_factor: The stacking factor used in the original `stack_codes` call. + If 1, no unstacking is performed. + + Returns: + Tuple of: + - unstacked_codes: Codes with restored time dimension, shape (B, C, T_stacked * stacking_factor). + - orig_lens: Recovered sequence lengths, shape (B,). 
Note that these are the + maximum possible lengths; actual valid lengths may be shorter due to + padding applied during stacking. + """ + if stacking_factor == 1: + return stacked_codes, stacked_lens + + B, CxS, T_out = stacked_codes.shape + s = int(stacking_factor) + assert CxS % s == 0, f"Channel dim ({CxS}) must be divisible by stacking_factor ({s})" + + C = CxS // s + # Reshape: split channels back into (C, s) + x = stacked_codes.view(B, C, s, T_out) + # Bring s back into time dimension + x = x.permute(0, 1, 3, 2).reshape(B, C, T_out * s) + + # Recover original lengths (before padding) + orig_lens = stacked_lens * s + + return x, orig_lens + + def _sample_audio_codes( + self, + last_hidden: torch.Tensor, + all_code_logits_t: torch.Tensor, + temperature: float, + topk: int, + use_local_transformer_for_inference: bool, + use_cfg: bool, + cfg_scale: float, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Sample audio codes from logits using either local transformer or parallel sampling. + + Returns: + audio_codes_next: Sampled codes with temperature/topk (B, num_codebooks) + all_codes_next_argmax: Argmax sampled codes for EOS detection (B, num_codebooks) + """ + if use_local_transformer_for_inference: + if self.local_transformer_type == LocalTransformerType.AR: + audio_codes_next = self.local_transformer_sample_autoregressive( + dec_output=last_hidden[:, -1, :], + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + ) + # Base class returns (B, C, S); flatten to (B, C*S) for downstream code + audio_codes_next = audio_codes_next.permute(0, 2, 1) + audio_codes_next = audio_codes_next.reshape(audio_codes_next.size(0), -1) + else: + raise ValueError( + f"Local transformer inference requested but local transformer type is {self.local_transformer_type}" + ) + # TODO @rfejgin: should we add argmax sampling for EOS here too? 
+ all_codes_next_argmax = audio_codes_next + else: + # Parallel sampling from all codebook logits + audio_codes_next = self.sample_codes_from_logits(all_code_logits_t, temperature=temperature, topk=topk) + # Argmax sampling for reliable EOS detection + if temperature <= 0.0: + all_codes_next_argmax = audio_codes_next # already argmax + else: + all_codes_next_argmax = self.sample_codes_from_logits(all_code_logits_t, temperature=0.01) + + return audio_codes_next, all_codes_next_argmax + + def streaming_init( + self, + context_audio_codes: torch.Tensor, + context_audio_codes_lens: torch.Tensor, + context_text_tokens: torch.Tensor, + context_text_tokens_lens: torch.Tensor, + inference_mode: Optional[str] = None, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_local_transformer: bool = False, + temperature: float = 0.7, + topk: int = 80, + phoneme_input_type: str = 'predicted', + phoneme_sampling_method: str = 'argmax', + gt_phoneme_tokens: Optional[torch.Tensor] = None, + gt_phoneme_tokens_lens: Optional[torch.Tensor] = None, + gt_audio_codes: Optional[torch.Tensor] = None, + gt_audio_codes_lens: Optional[torch.Tensor] = None, + use_inference_mode: bool = True, + ) -> StreamingState: + """ + Initialize streaming TTS inference state. + + This prepares the model for streaming inference by processing the context + (audio + context text) and returning a StreamingState that can be used + with streaming_step() to incrementally generate audio. + + Note: This function does NOT take the main text input. Text tokens are + provided incrementally via streaming_step(). + + For batched inference, each batch item can have a different context length. + This function processes only up to the minimum context length across the batch, + storing the remaining context to be processed in streaming_step's context phase. + + The streaming inference follows phases (per batch item): + 1. Context phase: Processing remaining context (if any) for items with longer context. + 2. 
Prompt phase: First `streaming_speech_delay` text tokens are processed + without generating audio (building up context). + 3. Generation phase: Audio BOS is added and audio codes are generated + autoregressively, with remaining text tokens added to audio embeddings. + + Args: + context_audio_codes: Pre-computed audio codes for context audio (B, C, T'). + context_audio_codes_lens: Length of context audio codes (B,). + context_text_tokens: Context text token IDs for speaker/style conditioning (B, L). + context_text_tokens_lens: Length of context text (B,). + inference_mode: Name of the inference mode to use (e.g., "streaming_4_8"). + If None, uses the default inference mode. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor (higher = stronger conditioning). + use_local_transformer: Whether to use local transformer for AR sampling. + temperature: Sampling temperature for audio codes. + topk: Top-k sampling parameter. + phoneme_input_type: 'gt' or 'predicted' for phoneme tokens (use 'predicted' for streaming). + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. + gt_phoneme_tokens: Optional GT phoneme tokens (B, L) with BOS/EOS for teacher forcing. + gt_phoneme_tokens_lens: Lengths of GT phoneme tokens (B,). + gt_audio_codes: Optional GT audio codes (B, C*S, T) already stacked with BOS/EOS, + input portion ([:, :, :-1]) for teacher forcing. Pre-processed by caller. + gt_audio_codes_lens: Lengths of GT audio codes (B,) after stacking. + + Returns: + StreamingState: Initial state for streaming inference. 
+ """ + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): + batch_size = context_audio_codes.size(0) + device = context_audio_codes.device + + # Resolve inference mode + mode_name = inference_mode if inference_mode is not None else self.default_inference_mode + if mode_name not in self.mode_name_to_mode: + available_modes = list(self.mode_name_to_mode.keys()) + raise ValueError(f"Unknown inference mode '{mode_name}'. Available modes: {available_modes}") + + selected_training_mode = self.mode_name_to_mode[mode_name] + + # Prepare context embedding using shared helper + context_embedding, context_lens, context_audio_codes, context_audio_codes_lens = ( + self.prepare_context_tensors( + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + training_mode=selected_training_mode, + dropout_conditional_input=False, + ) + ) + + # Store full context embedding and lens before any CFG manipulation + full_context_embedding = context_embedding.clone() # (B, T_max, E) + full_context_lens = context_lens.clone() # (B,) + + # Compute min context length - we only process up to this in init + min_context_len = context_lens.min().item() + + # Setup classifier-free guidance if enabled + dummy_context_embedding_unconditional = None + if use_cfg: + dummy_context_embedding_unconditional = self.decoder.get_input_embeddings()( + torch.full((1, 1), self.cfg_unk_token_id, device=device) + ) + # Create unconditional context (same length as conditional) + dummy_context_expanded = dummy_context_embedding_unconditional.expand( + batch_size, context_embedding.size(1), -1 + ) + # Concatenate conditional and unconditional: (2*B, T, E) + context_embedding = torch.cat([context_embedding, dummy_context_expanded], dim=0) + + # First forward pass to process context - only up to min_context_len + cache_position = 
torch.arange(min_context_len, device=device) + transformer_out = self.forward( + inputs_embeds=context_embedding[:, :min_context_len, :], + attention_mask=None, + use_cache=True, + past_key_values=None, + cache_position=cache_position, + ) + + last_hidden = transformer_out.last_hidden_state + past_kv = transformer_out.past_key_values + current_cache_seq_len = min_context_len + + # Process GT phoneme tokens if provided (for teacher forcing) + gt_phoneme_embeddings = None + gt_phoneme_lens = None + if gt_phoneme_tokens is not None and gt_phoneme_tokens_lens is not None: + gt_phoneme_expanded = gt_phoneme_tokens.unsqueeze(1) # (B, 1, L) + gt_phoneme_stacked, gt_phoneme_lens = self.stack_codes( + gt_phoneme_expanded, + gt_phoneme_tokens_lens, + self.phoneme_tokenizer.bos_token_id, + self.phoneme_tokenizer.eos_token_id, + self.phoneme_stacking_factor, + 1, + ) + gt_phoneme_embeddings = self.embed_phoneme_tokens(gt_phoneme_stacked) # (B, T', E) + + # Process GT audio codes if provided (for teacher forcing) + gt_audio_embeddings = None + gt_audio_lens_state = None + if gt_audio_codes is not None and gt_audio_codes_lens is not None: + gt_audio_embeddings = self.embed_audio_tokens(gt_audio_codes) # (B, T', E) + gt_audio_lens_state = gt_audio_codes_lens + + # Initialize streaming state with batch support + state = StreamingState( + batch_size=batch_size, + past_key_values=past_kv, + cache_seq_len=current_cache_seq_len, + all_predictions=[], + all_phoneme_predictions=[], + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_lens=context_lens, + full_context_embedding=full_context_embedding, + full_context_lens=full_context_lens, + context_position=torch.full((batch_size,), min_context_len, dtype=torch.long, device=device), + text_tokens_seen=torch.zeros(batch_size, dtype=torch.long, device=device), + phoneme_steps=torch.zeros(batch_size, dtype=torch.long, device=device), + audio_steps=torch.zeros(batch_size, 
dtype=torch.long, device=device), + phoneme_stream_ended=torch.zeros(batch_size, dtype=torch.bool, device=device), + phoneme_eos_detected=torch.zeros(batch_size, dtype=torch.bool, device=device), + finished=torch.zeros(batch_size, dtype=torch.bool, device=device), + device=device, + training_mode=selected_training_mode, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer, + temperature=temperature, + topk=topk, + dummy_context_embedding_unconditional=dummy_context_embedding_unconditional, + last_hidden=last_hidden, + text_finished=torch.zeros(batch_size, dtype=torch.bool, device=device), + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + last_phoneme_tokens=None, + last_audio_codes=None, + audio_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + audio_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + phoneme_prediction_start_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + phoneme_prediction_end_idx=torch.full((batch_size,), -1, dtype=torch.long, device=device), + gt_phoneme_embeddings=gt_phoneme_embeddings, + gt_phoneme_lens=gt_phoneme_lens, + gt_audio_embeddings=gt_audio_embeddings, + gt_audio_lens=gt_audio_lens_state, + ) + + return state + + def streaming_step( + self, + state: StreamingState, + text_tokens: Optional[torch.Tensor] = None, + force_dropout_text: bool = False, + use_inference_mode: bool = True, + ) -> Tuple[StreamingState, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform one streaming inference step with batch support. + + This function processes one text token per batch item (or signals end of text with None) + and generates predictions according to the streaming delays. Each batch item can be + in a different phase. + + The streaming operates in four phases per batch item: + 1. 
Context phase (context_position < full_context_lens): + - Still processing remaining context from streaming_init + - Uses context embedding, ignores text_tokens for this item + 2. Prompt phase (text_tokens_seen < phoneme_delay): + - Only text tokens are processed, KV cache is extended + - No phoneme or audio predictions + 3. Phoneme-only phase (phoneme_delay <= text_tokens_seen < speech_delay): + - Starts with phoneme BOS on first step + - Only phoneme predictions (no audio) + - Input: text embedding + phoneme embedding + 4. Audio phase (text_tokens_seen >= speech_delay): + - Starts with audio BOS on first step + - Both phoneme and audio predictions + - Input: text embedding + phoneme embedding + audio embedding + + IMPORTANT: Only ONE forward call to the decoder per streaming_step. + + Args: + state: Current StreamingState from streaming_init or previous streaming_step. + text_tokens: Next text token for each batch item, shape (B,), or None if text has finished. + For items still in context phase, the text_token value is ignored (can be 0). + When None is passed, the model continues generating until EOS. 
+ + Returns: + Tuple of: + - Updated StreamingState + - Predicted audio codes for this step (B, C, S) unstacked, or None if no items in audio phase + where C = num_audio_codebooks and S = frame_stacking_factor + - Predicted phoneme tokens for this step (B, phoneme_stacking_factor) or None if no items in phoneme phase + """ + if state.finished.all(): + return state, None, None + + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): + device = state.device + batch_size = state.batch_size + streaming_speech_delay = state.training_mode.streaming_speech_delay + streaming_phonemes_delay = state.training_mode.streaming_phonemes_delay + + # ==================== DETERMINE PHASES PER BATCH ITEM ==================== + needs_context = state.context_position < state.full_context_lens # (B,) bool + needs_text = (~needs_context) & (~state.text_finished) + needs_phoneme = ( + (~needs_context) & (state.text_tokens_seen >= streaming_phonemes_delay) & (~state.phoneme_stream_ended) + ) + needs_audio = (~needs_context) & (state.text_tokens_seen >= streaming_speech_delay) & (~state.finished) + + next_input = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + # --- Context phase items: use next context embedding --- + if needs_context.any(): + # Gather context embeddings at current position for each item + # context_position: (B,) - position indices + # full_context_embedding: (B, T_max, E) + ctx_positions = state.context_position.clone() # (B,) + # Clamp positions to valid range for gathering + ctx_positions = ctx_positions.clamp(max=state.full_context_embedding.size(1) - 1) + # Gather: need (B, 1, E) from (B, T, E) at positions (B,) + ctx_emb = state.full_context_embedding[ + torch.arange(batch_size, device=device), ctx_positions, : + ].unsqueeze( + 1 + ) # (B, 1, E) + # Only apply to items in context phase + context_mask = needs_context.view(batch_size, 1, 1).float() + next_input = next_input + ctx_emb * context_mask + + # --- 
Non-context phase items: handle text embedding --- + text_embedded = None + if text_tokens is not None and needs_text.any(): + # Embed text tokens for all items (will be masked later) + text_tokens_2d = text_tokens.unsqueeze(1) # (B, 1) + text_embedded = self.decoder.get_input_embeddings()(text_tokens_2d) # (B, 1, E) + + # Handle BPE char tokenizer + if self.use_bpe_char_tokenizer: + text_mask = torch.ones_like(text_tokens_2d, dtype=torch.bool) + cas_embedding = self.cas_encoder(text_tokens_2d, subword_mask=text_mask) # (B, 1, E) + text_embedded = text_embedded + cas_embedding + + if force_dropout_text: + text_embedded = text_embedded * 0 + + # Check for EOS tokens - mark those items as text_finished + # The EOS token itself IS embedded normally (matching process_batch behavior + # where EOS is part of the text sequence). After this step, text_finished is set + # so subsequent steps won't add any text embedding. + is_eos_token = (text_tokens == self.eos_id) & needs_text # (B,) bool + text_add_mask = needs_text.view(batch_size, 1, 1).float() + next_input = next_input + text_embedded * text_add_mask + state.text_finished = state.text_finished | is_eos_token + + elif text_tokens is None: + # Text finished signal for items not in context phase + state.text_finished = state.text_finished | ~needs_context + + # --- Phoneme embedding for phoneme and audio phase items --- + if self.phoneme_tokenizer is not None: + if needs_phoneme.any(): + phoneme_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + + if state.phoneme_input_type == 'gt' and state.gt_phoneme_embeddings is not None: + # Teacher forcing: use pre-computed GT phoneme embeddings + # Only use GT embedding if within valid length, otherwise zero + within_gt_len = state.phoneme_steps < state.gt_phoneme_lens # (B,) + positions = state.phoneme_steps.clamp(max=state.gt_phoneme_embeddings.size(1) - 1) + gt_emb = state.gt_phoneme_embeddings[ + torch.arange(batch_size, device=device), positions, : + 
].unsqueeze( + 1 + ) # (B, 1, E) + phoneme_mask = (needs_phoneme & within_gt_len).view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + gt_emb * phoneme_mask + else: + # Prediction mode: use BOS or last predicted phoneme + first_phoneme_step = needs_phoneme & (state.phoneme_steps == 0) + has_last_phoneme = ( + needs_phoneme & (~first_phoneme_step) & (state.last_phoneme_tokens is not None) + ) + + if first_phoneme_step.any(): + phoneme_bos = torch.full( + (batch_size, self.phoneme_stacking_factor, 1), + self.phoneme_tokenizer.bos_token_id, + device=device, + ).long() + phoneme_bos_emb = self.embed_phoneme_tokens(phoneme_bos) # (B, 1, E) + first_mask = first_phoneme_step.view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + phoneme_bos_emb * first_mask + + if has_last_phoneme.any() and state.last_phoneme_tokens is not None: + last_phoneme_emb = self.embed_phoneme_tokens( + state.last_phoneme_tokens.unsqueeze(2) + ) # (B, 1, E) + last_mask = has_last_phoneme.view(batch_size, 1, 1).float() + phoneme_emb = phoneme_emb + last_phoneme_emb * last_mask + + # Only end phoneme stream in prediction mode when the phoneme EOS is detected + state.phoneme_stream_ended = state.phoneme_stream_ended | state.phoneme_eos_detected + + next_input = next_input + phoneme_emb + + # --- Audio embedding for audio phase items --- + if needs_audio.any(): + audio_emb = torch.zeros(batch_size, 1, self.cfg.embedding_dim, device=device) + + if state.gt_audio_embeddings is not None: + # Teacher forcing: use pre-computed GT audio embeddings + # Only use GT embedding if within valid length, otherwise zero + within_gt_len = state.audio_steps < state.gt_audio_lens # (B,) + positions = state.audio_steps.clamp(max=state.gt_audio_embeddings.size(1) - 1) + gt_emb = state.gt_audio_embeddings[ + torch.arange(batch_size, device=device), positions, : + ].unsqueeze( + 1 + ) # (B, 1, E) + audio_mask = (needs_audio & within_gt_len).view(batch_size, 1, 1).float() + audio_emb = audio_emb + gt_emb * 
audio_mask + else: + # Prediction mode: use BOS or last predicted audio + first_audio_step = needs_audio & (state.audio_steps == 0) + has_last_audio = needs_audio & ~first_audio_step & (state.last_audio_codes is not None) + + if first_audio_step.any(): + # Create BOS for items at first audio step + audio_bos = torch.full( + (batch_size, self.num_audio_codebooks * self.frame_stacking_factor, 1), + self.audio_bos_id, + device=device, + ).long() + audio_bos_emb = self.embed_audio_tokens(audio_bos) # (B, 1, E) + first_mask = first_audio_step.view(batch_size, 1, 1).float() + audio_emb = audio_emb + audio_bos_emb * first_mask + + if has_last_audio.any() and state.last_audio_codes is not None: + # Use last predicted audio + last_audio_emb = self.embed_audio_tokens(state.last_audio_codes.unsqueeze(2)) # (B, 1, E) + last_mask = has_last_audio.view(batch_size, 1, 1).float() + audio_emb = audio_emb + last_audio_emb * last_mask + + next_input = next_input + audio_emb + + # ==================== HANDLE CFG ==================== + if state.use_cfg: + # For unconditional branch, use dummy embedding for non-audio items + # and audio-only embedding for audio items + next_input_unconditional_context = state.dummy_context_embedding_unconditional.expand( + batch_size, 1, -1 + ) + # After the context is finished, we use zero embedding for the unconditional branch until audio phase starts + next_input_unconditional_zeros = torch.zeros_like(next_input_unconditional_context) + context_mask = needs_context.view(batch_size, 1, 1).float() + next_input_unconditional = ( + context_mask * next_input_unconditional_context + + (1 - context_mask) * next_input_unconditional_zeros + ) + + # For audio phase items, we use audio embedding for the unconditional branch + if needs_audio.any(): + audio_mask = needs_audio.view(batch_size, 1, 1).float() + next_input_unconditional = next_input_unconditional * (1 - audio_mask) + audio_emb * audio_mask + + # Concatenate conditional and unconditional: (2*B, 1, E) 
+ next_input = torch.cat([next_input, next_input_unconditional], dim=0) + + # ==================== FORWARD PASS ==================== + cache_position = torch.tensor([state.cache_seq_len], device=device) + transformer_out = self.forward( + inputs_embeds=next_input, + attention_mask=None, + use_cache=True, + past_key_values=state.past_key_values, + cache_position=cache_position, + ) + + state.last_hidden = transformer_out.last_hidden_state + state.past_key_values = transformer_out.past_key_values + state.cache_seq_len += 1 + + # ==================== UPDATE STATE ==================== + # Update context_position for items in context phase + state.context_position = state.context_position + needs_context.long() + # Keep updating text_tokens_seen for items once the context is finished + # This is because this counter is used to determine when to start predicting phonemes and audio + state.text_tokens_seen = state.text_tokens_seen + (~needs_context).long() + + # Update phoneme_steps for items in phoneme or audio phase + state.phoneme_steps = state.phoneme_steps + needs_phoneme.long() + + # Update audio_steps for items in audio phase + state.audio_steps = state.audio_steps + needs_audio.long() + + # ==================== PREDICTIONS ==================== + pred_phoneme_tokens = None + audio_codes_next = None + + # Phoneme predictions for items in phoneme or audio phase + if needs_phoneme.any() and self.phoneme_tokenizer is not None: + # Track phoneme prediction start index for items just entering phoneme phase + first_phoneme_step = needs_phoneme & (state.phoneme_prediction_start_idx == -1) + if first_phoneme_step.any(): + current_phoneme_step_idx = len(state.all_phoneme_predictions) # before append + state.phoneme_prediction_start_idx = torch.where( + first_phoneme_step, + torch.full_like(state.phoneme_prediction_start_idx, current_phoneme_step_idx), + state.phoneme_prediction_start_idx, + ) + + # Check which items should predict phonemes (not ended) + pred_phoneme_tokens = 
self._predict_phoneme_tokens(state)  # (B, phoneme_stacking_factor)
                state.last_phoneme_tokens = pred_phoneme_tokens
                state.all_phoneme_predictions.append(pred_phoneme_tokens)

                # Check for phoneme EOS per item
                # A step counts as EOS if ANY of the stacked phoneme channels predicted EOS.
                phoneme_eos_detected = needs_phoneme & (
                    pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id
                ).any(
                    dim=1
                )  # (B,)

                state.phoneme_eos_detected = state.phoneme_eos_detected | phoneme_eos_detected

                # Track phoneme prediction end index for items that just ended
                # (-1 is the sentinel for "not yet ended"; only the first EOS sets it).
                newly_ended_phoneme = phoneme_eos_detected & (state.phoneme_prediction_end_idx == -1)
                if newly_ended_phoneme.any():
                    current_phoneme_step_idx = len(state.all_phoneme_predictions)  # after append
                    state.phoneme_prediction_end_idx = torch.where(
                        newly_ended_phoneme,
                        torch.full_like(state.phoneme_prediction_end_idx, current_phoneme_step_idx),
                        state.phoneme_prediction_end_idx,
                    )

            # Audio predictions for items in audio phase
            if needs_audio.any():
                # Track audio prediction start index for items just entering audio phase
                first_audio_step = needs_audio & (state.audio_prediction_start_idx == -1)
                if first_audio_step.any():
                    # Track start in terms of frames (not steps)
                    current_frame_idx = sum(p.size(-1) for p in state.all_predictions)  # total frames so far
                    state.audio_prediction_start_idx = torch.where(
                        first_audio_step,
                        torch.full_like(state.audio_prediction_start_idx, current_frame_idx),
                        state.audio_prediction_start_idx,
                    )

                audio_codes_next_stacked, all_codes_next_argmax = self._predict_audio_codes(state)  # (B, C*S)

                # Unstack immediately: (B, C*S) -> (B, C, S) where S = frame_stacking_factor
                S = self.frame_stacking_factor
                C = self.num_audio_codebooks
                audio_codes_unstacked = audio_codes_next_stacked.view(batch_size, C, S)  # (B, C, S)

                # Update last_audio_codes with stacked format (needed for next step's embedding)
                # Items not in audio phase keep their previous codes via the mask below.
                if state.last_audio_codes is None:
                    state.last_audio_codes = audio_codes_next_stacked
                else:
                    update_mask = needs_audio.view(batch_size, 1).expand_as(audio_codes_next_stacked)
                    state.last_audio_codes = torch.where(update_mask, audio_codes_next_stacked, state.last_audio_codes)

                # Check for EOS in each frame and track exact end position
                # Skip EOS detection in teacher-forced mode - rely on GT exhaustion instead
                if state.gt_audio_embeddings is None:
                    # all_codes_next_argmax is also (B, C*S), reshape to (B, C, S)
                    all_codes_argmax_unstacked = all_codes_next_argmax.view(batch_size, C, S)

                    # For each batch item, find if/where EOS occurs in this step's frames
                    # EOS from EITHER the sampled codes or the argmax codes ends the stream,
                    # so the argmax acts as an early-stop signal even when sampling missed EOS.
                    eos_in_sampled = audio_codes_unstacked == self.audio_eos_id  # (B, C, S)
                    eos_in_argmax = all_codes_argmax_unstacked == self.audio_eos_id  # (B, C, S)
                    eos_any_codebook = eos_in_sampled.any(dim=1) | eos_in_argmax.any(dim=1)  # (B, S)

                    # Find first frame with EOS per batch item (or S if none)
                    eos_frame_idx = torch.where(
                        eos_any_codebook.any(dim=1),
                        eos_any_codebook.int().argmax(dim=1),  # first frame with EOS
                        torch.full((batch_size,), S, device=device),  # no EOS in this step
                    )  # (B,)

                    audio_eos_detected = eos_any_codebook.any(dim=1) & needs_audio
                    state.finished = state.finished | audio_eos_detected

                    # Track audio prediction end index (in frames) for items that just ended
                    newly_ended_audio = audio_eos_detected & (state.audio_prediction_end_idx == -1)
                    if newly_ended_audio.any():
                        # End index = current frame count + frame offset where EOS was found
                        # NOTE(review): start idx above uses sum(p.size(-1)) while this uses
                        # len(all_predictions) * frame_stacking_factor; they agree only because
                        # every step appends exactly frame_stacking_factor frames — confirm.
                        current_frame_count = len(state.all_predictions) * self.frame_stacking_factor
                        end_frame_idx = current_frame_count + eos_frame_idx
                        state.audio_prediction_end_idx = torch.where(
                            newly_ended_audio, end_frame_idx, state.audio_prediction_end_idx
                        )

                # Store unstacked codes
                state.all_predictions.append(audio_codes_unstacked)
                audio_codes_next = audio_codes_unstacked

            # Force-finish items when GT audio is exhausted (teacher forcing).
            # This is checked AFTER predictions so the last valid prediction is still made.
            # audio_steps was already incremented above. When audio_steps >= gt_audio_lens,
            # we've consumed all GT input positions and made all corresponding predictions.
            if state.gt_audio_embeddings is not None and state.gt_audio_lens is not None:
                gt_exhausted = needs_audio & (state.audio_steps >= state.gt_audio_lens)
                state.finished = state.finished | gt_exhausted

        return state, audio_codes_next, pred_phoneme_tokens

    def _predict_phoneme_tokens(self, state: StreamingState) -> torch.Tensor:
        """Predict phoneme tokens from the last hidden state."""
        actual_batch_size = state.batch_size
        last_hidden = state.last_hidden

        # Get phoneme logits
        # With CFG enabled the hidden states are [conditional; unconditional] stacked on
        # the batch dim; slicing to actual_batch_size keeps only the conditional half,
        # so phoneme prediction never uses the unconditional branch.
        all_code_logits_t_phoneme = self.phoneme_final_proj(last_hidden[:, -1, :])
        all_code_logits_t_phoneme = all_code_logits_t_phoneme[:actual_batch_size]
        phoneme_logits = all_code_logits_t_phoneme.view(
            actual_batch_size, self.phoneme_stacking_factor, self.phoneme_vocab_size
        )
        # Per-channel max softmax probability, used below as a confidence score.
        max_probs = torch.softmax(phoneme_logits, dim=-1).max(dim=-1).values  # (B, phoneme_stacking_factor)

        # Sample phonemes
        # NOTE(review): temperature=0.0 presumably makes sample_codes_from_logits_phoneme
        # fall back to argmax — confirm against its implementation.
        if state.phoneme_sampling_method == 'argmax':
            pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(all_code_logits_t_phoneme, temperature=0.0)
        else:
            pred_phoneme_tokens = self.sample_codes_from_logits_phoneme(
                all_code_logits_t_phoneme, temperature=state.temperature, topk=state.topk
            )

        # In prediction mode, low-confidence phoneme steps are replaced with UNK across
        # all stacked channels (except steps where EOS is predicted).
        if (
            state.phoneme_input_type != 'gt'
            and hasattr(self.phoneme_tokenizer, 'unk_token_id')
            and self.phoneme_confidence_unk_threshold > 0.0
        ):
            # A step is under-confident if ANY stacked channel falls below the threshold.
            underconfident_step = (max_probs < self.phoneme_confidence_unk_threshold).any(
                dim=1, keepdim=True
            )  # (B, 1)
            # EOS steps are exempt so termination is never masked by UNK substitution.
            eos_predicted_step = (pred_phoneme_tokens == self.phoneme_tokenizer.eos_token_id).any(dim=1, keepdim=True)
            replace_with_unk = underconfident_step & (~eos_predicted_step)
            if replace_with_unk.any():
                unk_tokens = torch.full_like(pred_phoneme_tokens, self.phoneme_tokenizer.unk_token_id)
                pred_phoneme_tokens = torch.where(replace_with_unk, unk_tokens, pred_phoneme_tokens)
        # (B, phoneme_stacking_factor)
        return pred_phoneme_tokens

    def _predict_audio_codes(self, state: StreamingState) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predict audio codes from the last hidden state."""
        actual_batch_size = state.batch_size
        last_hidden = state.last_hidden

        # Compute audio logits
        last_hidden_audio = self.audio_out_projection(last_hidden[:, -1, :])
        all_code_logits_t = self.final_proj(last_hidden_audio)

        # Apply CFG if enabled
        # Standard CFG extrapolation: scale * cond + (1 - scale) * uncond;
        # cfg_scale == 1.0 reduces to the conditional logits.
        if state.use_cfg:
            conditional_logits = all_code_logits_t[:actual_batch_size]
            unconditional_logits = all_code_logits_t[actual_batch_size:]
            all_code_logits_t = state.cfg_scale * conditional_logits + (1.0 - state.cfg_scale) * unconditional_logits

        # Sample audio codes
        # NOTE(review): last_hidden still contains the unconditional rows when CFG is on;
        # _sample_audio_codes receives use_cfg/cfg_scale so it is presumably handled
        # there (e.g. by the local transformer path) — confirm.
        audio_codes_next, all_codes_next_argmax = self._sample_audio_codes(
            last_hidden=last_hidden,
            all_code_logits_t=all_code_logits_t,
            temperature=state.temperature,
            topk=state.topk,
            use_local_transformer_for_inference=state.use_local_transformer,
            use_cfg=state.use_cfg,
            cfg_scale=state.cfg_scale,
        )

        return audio_codes_next, all_codes_next_argmax

    def streaming_finalize(
        self,
        state: StreamingState,
        use_inference_mode: bool = True,
    ) -> StreamingFinalizeOutput:
        """
        Finalize streaming and return the complete generated audio and phoneme predictions.

        This function should be called after all streaming_step() calls are complete
        (i.e., when state.finished.all() is True or max steps reached).

        Args:
            state: Final StreamingState after streaming is complete.

        Returns:
            StreamingFinalizeOutput containing audio, codes, and phoneme predictions.
        """
        batch_size = state.batch_size

        # Extract and decode phoneme predictions
        phoneme_tokens_list: List[List[int]] = []
        phoneme_text_list: List[str] = []
        if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0:
            # Stack phoneme predictions: each is (B, phoneme_stacking_factor)
            all_phonemes = torch.stack(state.all_phoneme_predictions, dim=-1)  # (B, S, T)
            for i in range(batch_size):
                # start/end index into the step axis; -1 end sentinel means "never ended".
                start = max(0, state.phoneme_prediction_start_idx[i].item())
                end = state.phoneme_prediction_end_idx[i].item()
                if end < 0:
                    end = all_phonemes.size(-1)
                # Flatten stacked phonemes back to sequence
                # .T interleaves the stacked channels back into temporal order.
                tokens = all_phonemes[i, :, start:end].T.reshape(-1).tolist()
                # Remove special tokens (BOS, EOS, PAD)
                special = {self.phoneme_tokenizer.bos_token_id, self.phoneme_tokenizer.eos_token_id}
                if hasattr(self.phoneme_tokenizer, 'pad_token_id'):
                    special.add(self.phoneme_tokenizer.pad_token_id)
                tokens = [t for t in tokens if t not in special]
                phoneme_tokens_list.append(tokens)
                phoneme_text_list.append(self.phoneme_tokenizer.decode(tokens))
        else:
            phoneme_tokens_list = [[] for _ in range(batch_size)]
            phoneme_text_list = ["" for _ in range(batch_size)]

        if len(state.all_predictions) == 0:
            # No audio was ever predicted — return empty outputs (phonemes may still exist).
            # NOTE(review): audio_codes here defaults to float dtype, while the non-empty
            # path returns tensors with the codes' (integer) dtype — consider dtype=torch.long.
            return StreamingFinalizeOutput(
                audio=torch.zeros(batch_size, 0, device=state.device),
                audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device),
                audio_codes=torch.zeros(batch_size, self.num_audio_codebooks, 0, device=state.device),
                audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device),
                phoneme_tokens=phoneme_tokens_list,
                phoneme_text=phoneme_text_list,
            )

        grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad
        with grad_ctx():
            # Concatenate all predictions - each is (B, C, S), concat gives (B, C, T_total_frames)
            all_codes = torch.cat(state.all_predictions, dim=-1)  # (B, C, T_total_frames)
            total_frames = all_codes.size(-1)
            num_codebooks = all_codes.size(1)

            # Start and end indices are in frames (not steps)
            # If start_idx is -1, item never started audio predictions - use 0
            # If end_idx is -1, item never ended - use total_frames
            start_indices = torch.clamp(state.audio_prediction_start_idx, min=0)
            end_indices = torch.where(
                state.audio_prediction_end_idx >= 0,
                state.audio_prediction_end_idx,
                torch.full_like(state.audio_prediction_end_idx, total_frames),
            )

            # Calculate per-item lengths (in frames)
            predicted_codes_lens = end_indices - start_indices
            max_len = predicted_codes_lens.max().item()

            # Handle case where all items have zero-length predictions
            if max_len == 0:
                return StreamingFinalizeOutput(
                    audio=torch.zeros(batch_size, 0, device=state.device),
                    audio_len=torch.zeros(batch_size, dtype=torch.long, device=state.device),
                    audio_codes=torch.zeros(batch_size, num_codebooks, 0, device=state.device, dtype=all_codes.dtype),
                    audio_codes_len=torch.zeros(batch_size, dtype=torch.long, device=state.device),
                    phoneme_tokens=phoneme_tokens_list,
                    phoneme_text=phoneme_text_list,
                )

            # Create padded output tensor and slice each item's valid predictions
            predicted_codes = torch.zeros(
                batch_size, num_codebooks, max_len, dtype=all_codes.dtype, device=state.device
            )
            for i in range(batch_size):
                start = start_indices[i].item()
                end = end_indices[i].item()
                length = end - start
                if length > 0:
                    predicted_codes[i, :, :length] = all_codes[i, :, start:end]

            # No need to remove EOS - end_indices already point to the frame before EOS
            # Decode to audio (codes are already unstacked: B, C, T)
            # NOTE(review): decoded_codes appears unused in this function — confirm
            # whether the post-decode codes should be returned instead of predicted_codes.
            audio, audio_len, decoded_codes = self.codes_to_audio(predicted_codes, predicted_codes_lens)

return StreamingFinalizeOutput( + audio=audio, + audio_len=audio_len, + audio_codes=predicted_codes, + audio_codes_len=predicted_codes_lens, + phoneme_tokens=phoneme_tokens_list, + phoneme_text=phoneme_text_list, + ) + + def infer_batch( + self, + batch: Dict[str, torch.Tensor], + max_decoder_steps: int = 500, + temperature: float = 0.7, + topk: int = 80, + use_cfg: bool = False, + cfg_scale: float = 1.0, + use_local_transformer_for_inference: bool = False, + phoneme_input_type: str = 'pred', + phoneme_sampling_method: str = 'argmax', + force_dropout_text: bool = False, + use_teacher_forced: bool = False, + use_inference_mode: bool = True, + ) -> InferBatchOutput: + """ + Batch inference using streaming infrastructure. + + This is a simple wrapper around streaming_init, streaming_step, and streaming_finalize + that processes a batch dictionary similar to training_step/validation_step. + + Args: + batch: Dictionary containing: + - text: Text token IDs (B, L) + - text_lens: Lengths (B,) + - context_text_tokens: Context text tokens (B, L') + - context_text_tokens_lens: Lengths (B,) + - context_audio_codes: Context audio codes (B, C, T) OR + - context_audio / context_audio_lens: Raw context audio to encode + - phoneme_tokens (optional): GT phoneme tokens (B, L'') + - phoneme_tokens_lens (optional): Lengths (B,) + For teacher forcing (use_teacher_forced=True), also requires: + - audio_codes / audio_codes_lens: GT audio codes (B, C, T) OR + - audio / audio_lens: Raw audio waveforms to encode + max_decoder_steps: Maximum number of decoder steps. + temperature: Sampling temperature for audio codes. Use 0.0 for argmax. + topk: Top-k sampling parameter. + use_cfg: Whether to use classifier-free guidance. + cfg_scale: CFG scale factor. + use_local_transformer_for_inference: Whether to use local transformer. + phoneme_input_type: 'gt' or 'pred' for phoneme tokens. + phoneme_sampling_method: 'argmax' or 'sample' for phoneme token selection. 
+ force_dropout_text: Whether to dropout text embeddings. + use_teacher_forced: If True, feed GT audio codes (and force GT phonemes, argmax sampling) + instead of predicted codes at each streaming step. + + Returns: + InferBatchOutput containing predicted audio, codes, and RTF metrics. + """ + grad_ctx = torch.inference_mode if use_inference_mode else torch.no_grad + with grad_ctx(): + start_time = time.time() + + # Extract tensors from batch + text = batch['text'] + text_lens = batch['text_lens'] + context_text_tokens = batch['context_text_tokens'] + context_text_tokens_lens = batch['context_text_tokens_lens'] + + # Handle context audio - either use codes directly or encode from audio + if 'context_audio_codes' in batch: + context_audio_codes = batch['context_audio_codes'] + context_audio_codes_lens = batch['context_audio_codes_lens'] + else: + context_audio = batch['context_audio'] + context_audio_lens = batch['context_audio_lens'] + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + + # Optional GT phoneme tokens for teacher forcing + gt_phoneme_tokens = batch.get('phoneme_tokens') + gt_phoneme_tokens_lens = batch.get('phoneme_tokens_lens') + + # Prepare GT audio codes for teacher forcing if requested + gt_audio_codes_for_init = None + gt_audio_codes_lens_for_init = None + if use_teacher_forced: + # Force GT phoneme input and argmax sampling + phoneme_input_type = 'gt' + temperature = 0.0 + + # Get GT audio codes + if 'audio_codes' in batch: + gt_audio_codes = batch['audio_codes'] + gt_audio_codes_lens = batch['audio_codes_lens'] + elif 'audio' in batch: + gt_audio = batch['audio'] + gt_audio_lens = batch['audio_lens'] + gt_audio_codes, gt_audio_codes_lens = self.audio_to_codes(gt_audio, gt_audio_lens) + else: + raise ValueError("Teacher forcing requires 'audio_codes' or 'audio' in batch") + + # Convert and add special tokens, then stack + if self._codec_converter is not None: + gt_audio_codes = 
self._codec_converter.convert_original_to_new( + audio_tokens=gt_audio_codes, audio_lens=gt_audio_codes_lens + ).long() + + gt_audio_codes_processed, gt_audio_codes_lens_processed = self.add_special_tokens( + codes=gt_audio_codes, + codes_len=gt_audio_codes_lens, + bos_id=self.audio_bos_id, + eos_id=self.audio_eos_id, + ) + gt_audio_codes_processed, gt_audio_codes_lens_processed = self.stack_codes( + gt_audio_codes_processed, + gt_audio_codes_lens_processed, + self.audio_bos_id, + self.audio_eos_id, + self.frame_stacking_factor, + self.num_audio_codebooks, + ) + + # Input portion: all tokens except the last (teacher forcing shift) + gt_audio_codes_for_init = gt_audio_codes_processed[:, :, :-1] + gt_audio_codes_lens_for_init = gt_audio_codes_lens_processed - 1 + + batch_size = text.size(0) + + # Initialize streaming state + state = self.streaming_init( + context_audio_codes=context_audio_codes, + context_audio_codes_lens=context_audio_codes_lens, + context_text_tokens=context_text_tokens, + context_text_tokens_lens=context_text_tokens_lens, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer=use_local_transformer_for_inference, + temperature=temperature, + topk=topk, + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + gt_phoneme_tokens=gt_phoneme_tokens, + gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, + gt_audio_codes=gt_audio_codes_for_init, + gt_audio_codes_lens=gt_audio_codes_lens_for_init, + use_inference_mode=use_inference_mode, + ) + + time_to_first_prediction = None + generation_start_time = time.time() + device = text.device + + # Generate until all items are finished or max steps reached + print("Generation started") + gen_step = 0 + while not state.finished.all() and len(state.all_predictions) < max_decoder_steps: + gen_step += 1 + if gen_step % 10 == 0: + print(f"Generation step {gen_step} ") + # Gather the correct text token for each batch item based on text_tokens_seen + # Items in context phase 
will have their token ignored by streaming_step + positions = state.text_tokens_seen.clamp(max=text.size(1) - 1) + current_tokens = text[torch.arange(batch_size, device=device), positions] + + # For items that have exhausted their text, provide EOS token + text_exhausted = state.text_tokens_seen >= text_lens + current_tokens = torch.where( + text_exhausted, torch.full_like(current_tokens, self.eos_id), current_tokens + ) + + state, audio_codes, phoneme_tokens = self.streaming_step( + state=state, + text_tokens=current_tokens, + force_dropout_text=force_dropout_text, + use_inference_mode=use_inference_mode, + ) + + # Record time to first audio prediction + if time_to_first_prediction is None and audio_codes is not None: + time_to_first_prediction = time.time() - start_time + + tts_generation_time = time.time() - generation_start_time + + # Finalize and decode audio + finalize_output = self.streaming_finalize(state, use_inference_mode=use_inference_mode) + + end_time = time.time() + total_time = end_time - start_time + + # Compute RTF metrics + total_audio_samples = finalize_output.audio_len.sum().item() + total_audio_duration = total_audio_samples / self.output_sample_rate + num_frames = len(state.all_predictions) + tts_generation_time_per_frame = tts_generation_time / num_frames if num_frames > 0 else 0.0 + + rtf_metrics = { + 'rtf': total_audio_duration / total_time if total_time > 0 else 0.0, + 'time_to_first_prediction': time_to_first_prediction, + 'tts_generation_time': tts_generation_time, + 'total_time': total_time, + 'total_audio_duration': total_audio_duration, + 'total_audio_samples': total_audio_samples, + 'num_decoder_steps': num_frames, + 'tts_generation_time_per_frame': tts_generation_time_per_frame, + } + + # Prepare phoneme token output if available + predicted_phoneme_tokens = None + predicted_phoneme_tokens_lens = None + phoneme_prediction_start_idx_out = None + if self.phoneme_tokenizer is not None and len(state.all_phoneme_predictions) > 0: + 
predicted_phoneme_tokens = torch.stack(state.all_phoneme_predictions, dim=-1) # (B, S, T) + # Per-item valid phoneme prediction lengths + phoneme_start = torch.clamp(state.phoneme_prediction_start_idx, min=0) + phoneme_end = torch.where( + state.phoneme_prediction_end_idx >= 0, + state.phoneme_prediction_end_idx, + torch.full_like( + state.phoneme_prediction_end_idx, predicted_phoneme_tokens.size(-1) + ), + ) + predicted_phoneme_tokens_lens = phoneme_end - phoneme_start + phoneme_prediction_start_idx_out = phoneme_start + + return InferBatchOutput( + predicted_audio=finalize_output.audio, + predicted_audio_lens=finalize_output.audio_len, + predicted_codes=finalize_output.audio_codes, + predicted_codes_lens=finalize_output.audio_codes_len, + rtf_metrics=rtf_metrics, + predicted_phoneme_tokens=predicted_phoneme_tokens, + predicted_phoneme_tokens_lens=predicted_phoneme_tokens_lens, + phoneme_prediction_start_idx=phoneme_prediction_start_idx_out, + ) + + @staticmethod + def _load_audio_for_inference(audio_path: str, target_sample_rate: int) -> torch.Tensor: + audio_data, sr = sf.read(audio_path, dtype='float32') + if len(audio_data.shape) > 1: + audio_data = audio_data[:, 0] + audio_tensor = torch.tensor(audio_data).unsqueeze(0) + if sr != target_sample_rate: + import torchaudio + + audio_tensor = torchaudio.functional.resample(audio_tensor, sr, target_sample_rate) + return audio_tensor.unsqueeze(0) + + @staticmethod + def _adjust_audio_to_duration_for_inference( + audio: torch.Tensor, sample_rate: int, target_seconds: float, codec_model_samples_per_frame: int + ) -> torch.Tensor: + target_samples = int(target_seconds * sample_rate) + target_samples = (target_samples // codec_model_samples_per_frame) * codec_model_samples_per_frame + if audio.size(-1) > target_samples: + audio = audio[:, :, :target_samples] + elif audio.size(-1) < target_samples: + # repeat to fill + repeats = target_samples // audio.size(-1) + 1 + audio = audio.repeat(1, 1, repeats)[:, :, 
:target_samples] + return audio + + def do_tts( + self, + transcript: str, + context_audio_file_path: Optional[str] = None, + context_text: str = "[NO TEXT CONTEXT]", + main_tokenizer_name: Optional[str] = None, + context_audio_duration: float = 5.0, + use_cfg: bool = True, + cfg_scale: float = 2.5, + use_local_transformer: bool = True, + temperature: float = 0.7, + topk: int = 80, + max_steps: int = 330, + gt_phoneme_text: Optional[str] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Generate speech from transcript using EasyMagpie inference with optional context text/audio. + Optionally accepts ground-truth phoneme text (IPA string) for decoder-only inference. + """ + if transcript is None or transcript.strip() == "": + raise ValueError("`transcript` must be a non-empty string.") + + device = next(self.parameters()).device + transcript = transcript.strip() + context_text = (context_text or "[NO TEXT CONTEXT]").strip() + + if main_tokenizer_name is None: + # Match model init behavior: default to first configured tokenizer. + main_tokenizer_name = list(self.cfg.text_tokenizers.keys())[0] + if main_tokenizer_name not in self.tokenizer.tokenizers: + raise ValueError( + f"Unknown main_tokenizer_name='{main_tokenizer_name}'. 
" + f"Available tokenizers: {list(self.tokenizer.tokenizers.keys())}" + ) + + text_tokens = self.tokenizer.encode(transcript, tokenizer_name=main_tokenizer_name) + [self.eos_id] + text = torch.tensor([text_tokens], dtype=torch.long, device=device) + text_lens = torch.tensor([len(text_tokens)], dtype=torch.long, device=device) + + context_text_tokens = self.tokenizer.encode(context_text, tokenizer_name=self.text_conditioning_tokenizer_name) + context_text_tensor = torch.tensor([context_text_tokens], dtype=torch.long, device=device) + context_text_lens = torch.tensor([len(context_text_tokens)], dtype=torch.long, device=device) + + if context_audio_file_path is not None and context_audio_file_path.strip() != "": + context_audio = self._load_audio_for_inference(context_audio_file_path, self.sample_rate) + context_audio = self._adjust_audio_to_duration_for_inference( + context_audio, + self.sample_rate, + context_audio_duration, + self.codec_model_samples_per_frame, + ) + context_audio = context_audio.to(device) + context_audio_lens = torch.tensor([context_audio.size(1)], dtype=torch.long, device=device) + with torch.inference_mode(): + context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + else: + context_audio_codes = torch.zeros( + 1, + self.data_num_audio_codebooks, + 0, + dtype=torch.long, + device=device, + ) + context_audio_codes_lens = torch.zeros(1, dtype=torch.long, device=device) + + batch = { + 'text': text, + 'text_lens': text_lens, + 'context_text_tokens': context_text_tensor, + 'context_text_tokens_lens': context_text_lens, + 'context_audio_codes': context_audio_codes, + 'context_audio_codes_lens': context_audio_codes_lens, + } + phoneme_input_type = 'pred' + if gt_phoneme_text is not None: + if self.phoneme_tokenizer is None: + raise ValueError( + "Model does not have a phoneme tokenizer configured, but gt_phoneme_text was provided." 
+ ) + gt_phoneme_text = gt_phoneme_text.strip() + if gt_phoneme_text == "": + raise ValueError("`gt_phoneme_text` must be a non-empty string when provided.") + gt_phoneme_tokens = self.phoneme_tokenizer.encode(gt_phoneme_text) + gt_phoneme_tokens = ( + [self.phoneme_tokenizer.bos_token_id] + gt_phoneme_tokens + [self.phoneme_tokenizer.eos_token_id] + ) + if len(gt_phoneme_tokens) == 0: + raise ValueError("Failed to encode `gt_phoneme_text` into phoneme tokens.") + batch['phoneme_tokens'] = torch.tensor([gt_phoneme_tokens], dtype=torch.long, device=device) + batch['phoneme_tokens_lens'] = torch.tensor([len(gt_phoneme_tokens)], dtype=torch.long, device=device) + phoneme_input_type = 'gt' + + with torch.inference_mode(): + output = self.infer_batch( + batch=batch, + max_decoder_steps=max_steps, + temperature=temperature, + topk=topk, + use_cfg=use_cfg, + cfg_scale=cfg_scale, + use_local_transformer_for_inference=use_local_transformer, + phoneme_input_type=phoneme_input_type, + phoneme_sampling_method='argmax', + use_teacher_forced=False, + use_inference_mode=True, + ) + return output.predicted_audio, output.predicted_audio_lens + + @classmethod + def list_available_models(cls) -> List[PretrainedModelInfo]: + return [] diff --git a/nemo/collections/tts/models/magpietts.py b/nemo/collections/tts/models/magpietts.py index 4d34471af5a1..28af39542f21 100644 --- a/nemo/collections/tts/models/magpietts.py +++ b/nemo/collections/tts/models/magpietts.py @@ -33,13 +33,12 @@ from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from torch import nn -from torch.utils.data import get_worker_info - from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.tts.data.text_to_speech_dataset_lhotse import MagpieTTSLhotseDataset, setup_tokenizers from nemo.collections.tts.losses.aligner_loss import ForwardSumLoss from nemo.collections.tts.losses.moe_loss import 
MoEAuxiliaryLoss, compute_expert_usage from nemo.collections.tts.models import AudioCodecModel +from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel, worker_init_fn from nemo.collections.tts.modules import transformer_2501 from nemo.collections.tts.modules.aligner import AlignmentEncoder from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter @@ -48,7 +47,6 @@ EOSDetectionMethod, LocalTransformerType, SpecialAudioToken, - cosine_schedule, ) from nemo.collections.tts.parts.utils.helpers import ( binarize_attention_parallel, @@ -61,7 +59,6 @@ get_tokenizer_for_language, stack_tensors, ) -from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -302,17 +299,7 @@ def from_dict(cls, data: dict) -> 'ModelInferenceParameters': return cls(**filtered_data) -def worker_init_fn(worker_id): - # For mp.set_start_method("spawn", force=True) - # The dataset class should be picklable, so we initialize non-picklable objects here - logging.info(f"Worker {worker_id} initializing...") - worker_info = get_worker_info() - dataset = worker_info.dataset # Get the dataset instance in this worker - tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) - dataset.text_tokenizer = tokenizer - - -class MagpieTTSModel(ModelPT): +class MagpieTTSModel(BaseMagpieTTSModel): """ Magpie-TTS Model Base Class used for training a TTS model that can generate audio codes from transcript and a context audio/text @@ -489,6 +476,11 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) self.audio_embeddings = nn.ModuleList(audio_embeddings) + # Identity projections required by BaseMagpieTTSModel local transformer methods. + # MagpieTTSModel embeds directly in embedding_dim, so no projection is needed. 
+ self.audio_in_projection = nn.Identity() + self.local_transformer_audio_out_projection = nn.Identity() + if self.use_bpe_char_tokenizer: # BPE char tokenizer assert len(self.tokenizer.tokenizers) == 1, "BPE char tokenizer should only be used with one tokenizer" @@ -753,29 +745,11 @@ def _setup_inference_parameters(self, cfg: DictConfig) -> None: """ self.inference_parameters = ModelInferenceParameters.from_dict(cfg.get("inference_parameters", {})) - def state_dict(self, destination=None, prefix='', keep_vars=False): - """ - Only used for saving checkpoints. On save, we remove _speaker_verification_model and _codec_model - from the checkpoint. The codec model is saved in a separate checkpoint. - - _speaker_verification_model is only included in older checkpoints with the older single_encoder_sv_tts - model_type that is no longer supported and can likely be removed in a future version. - - If the model has a baked context embedding, the context_encoder weights are also excluded - since they are no longer needed for inference. 
- """ - if hasattr(self, '_no_state_dict') and self._no_state_dict: - return {} - # Don't save the speaker verification and codec model in the state dict - state_dict = super().state_dict(destination, prefix, keep_vars) - keys_substrings_to_exclude = ['_speaker_verification_model', '_codec_model'] - # If we have a baked context embedding, exclude context_encoder weights + def _get_state_dict_keys_to_exclude(self): + keys = ['_speaker_verification_model', '_codec_model'] if self.has_baked_context_embedding: - keys_substrings_to_exclude.append('context_encoder') - for key in list(state_dict.keys()): - if any([substring in key for substring in keys_substrings_to_exclude]): - del state_dict[key] - return state_dict + keys.append('context_encoder') + return keys def check_frame_stacking_config_validity(self): """ @@ -1021,83 +995,6 @@ def load_state_dict(self, state_dict, strict=True): new_state_dict[key[len(name_with_dot) :]] = state_dict[key] child.load_state_dict(new_state_dict) - def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) - codes_len = codes_len + num_eos_tokens - # Insert EOS token at new final token entry - for idx in range(codes.size(0)): - codes[idx, :, codes_len[idx] - 1] = eos_id - - return codes, codes_len - - def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) - codes_len = codes_len + num_bos_tokens - codes, codes_len = self.add_eos_token( - codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens - ) - return codes, codes_len - - def remove_bos_token(self, codes, codes_len, num_tokens=1): - # codes: (B, C, T') - # codes_len: (B,) - codes = codes[:, :, num_tokens:] - codes_len = codes_len - num_tokens - return codes, 
codes_len - - def remove_embedded_bos_token(self, embedded, embedded_len): - # codes: (B, T', C) - # codes_len: (B,) - embedded = embedded[:, 1:, :] - embedded_len = embedded_len - 1 - return embedded, embedded_len - - def remove_eos_token(self, codes, codes_len): - # codes: (B, C, T') - # codes_len: (B,) - codes_len = codes_len - 1 - codes = codes[:, :, :-1] - mask = get_mask_from_lengths(lengths=codes_len) - codes = codes * mask.unsqueeze(1) - return codes, codes_len - - def remove_embedded_eos_token(self, embedded, embedded_len): - # embedded: (B, T', D) - # embedded_len: (B,) - embedded_len = embedded_len - 1 - embedded = embedded[:, :-1, :] - mask = get_mask_from_lengths(lengths=embedded_len) - embedded = embedded * mask.unsqueeze(2) - return embedded, embedded_len - - def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): - codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) - codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) - return codes, codes_len - - def audio_to_codes(self, audio, audio_len, sample_rate=None): - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): - codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) - return codes, codes_len - - def codes_to_audio(self, codes, codes_len): - # codes: (B, C, T') - # codes_len: (B,) - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - # Pass the modified integer token IDs - if self._codec_converter is not None: - codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) - audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) - # audio: (B, T) - # audio_len: (B,) - return audio, audio_len, codes - def embed_audio_tokens(self, audio_tokens, audio_tokens_lens): B, C, T = 
audio_tokens.shape audio_tokens = self.pad_audio_codes(audio_tokens).long() @@ -1118,116 +1015,6 @@ def embed_audio_tokens(self, audio_tokens, audio_tokens_lens): return audio_embedding, audio_embedding_lens - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): - """ - Predicts the logits for all codebooks using the local transformer. Used in both autoregressive (AR) and MaskGit (MG) modes. - This function is used in training and validation, not inference/sampling. - The sequence layout is slightly different between AR and MG modes, as shown in the diagram below, - (using an 8-codebook setup as an example): - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | - | codebook | | | | | | | | | | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - | codebook | | | | | | | | | | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - | codebook | latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | seq. index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - - Args: - dec_out: (B, T', E) - audio_codes_target: (B, C, T') - targets_offset_by_one: bool, if False, the target for index 0 is codebook 0, for index 1 is codebook 1, etc. (autoregressive) - if True, the target for index 1 is codebook 0, for index 2 is codebook 1, etc. 
(MaskGit) - """ - C = self.num_audio_codebooks - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) - local_transformer_input = [dec_out_all] - audio_codes_target = self.pad_audio_codes(audio_codes_target).long() - # Build the teacher-forced input to the LT. - for fs_index in range(self.frame_stacking_factor): - for codebook_num in range(C): - # Collect ground truth codes for the current codebook and frame stack index combintation. - codes = audio_codes_target[:, codebook_num, fs_index :: self.frame_stacking_factor] # (B, T') - # Individual timesteps are independently handled by the LT fold time into the batch dimension. - codes = codes.reshape(-1) # (B*T',) - # Embed the codes - codebook_embedding = self.audio_embeddings[codebook_num + fs_index * C](codes) # (B*T', E) - local_transformer_input.append(codebook_embedding) - # Stack the input codes along dimension 1 (codebooks). This is the dimension along which the LT predicts iteratively. - local_transformer_input = torch.stack(local_transformer_input, dim=1) # (B*T', C+1, E) - local_transformer_input = self.local_transformer_in_projection(local_transformer_input) # (B*T', C+1, 128) - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B*T', C+1, E) - if not targets_offset_by_one: - # for autoregressive local transformer the target for index 0 is codebook 0, for index 1 is codebook 1, etc. - local_transformer_output = local_transformer_output[:, :-1, :] # (B*T', C, E) - else: - # for MaskGit the target for index **1** is codebook 0, for index 2 is codebook 1, etc. 
- local_transformer_output = local_transformer_output[:, 1:, :] # (B*T', C, E) - all_code_logits = [] - for fs_index in range(self.frame_stacking_factor): - for codebook_num in range(audio_codes_target.size(1)): - # Using a separate projection layer for each codebook (to distinguish between them) - # Checked the time - this loop is not taking much time (compared to the local transformer forward pass) - codebook_logits = self.local_transformer_out_projections[codebook_num + fs_index * C]( - local_transformer_output[:, codebook_num + fs_index * C, :] - ) # (B*T', num_all_tokens_per_codebook) - all_code_logits.append(codebook_logits) - all_code_logits = torch.cat( - all_code_logits, dim=1 - ) # (B*T'/frame_stacking_factor, num_codebooks * num_all_tokens_per_codebook * frame_stacking_factor) - - all_code_logits = all_code_logits.view( - audio_codes_target.size(0), audio_codes_target.size(2) // self.frame_stacking_factor, -1 - ) # (B, T'/frame_stacking_factor, C * num_all_tokens_per_codebook * frame_stacking_factor) - - return all_code_logits - - def maskgit_create_random_mask(self, codes): - """ - Creates a mask where True indicates the positions that should be replaced with a MASK_TOKEN. - """ - # Codes: (B, C, T) - B, C, T = codes.shape - # get a uniform random vector uniformly sampled from [0,1) ## Todo does it need to be inclusive on the right? 
- rand_values = torch.rand(B, T, device=codes.device) - # apply the cosine schedule - frac_masked = cosine_schedule(rand_values) - # how many positions to mask - n_masked = torch.ceil(frac_masked * C).long() # B,T - # The code further below is the vectorized version of this: - # for b in range(B): - # for t in range(T): - # if n_masked[b,t] > 0: - # # get a random permutation of the codebook indices - # perm = torch.randperm(C) - # # mask the top n_masked positions - # mask[b, perm[:n_masked[b,t]], t] = True - # - # Create random permutations - random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) # (B, C, T) - # Create a mask tensor where each position indicates if it should be masked - mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) - mask = mask_indices < n_masked.view(B, 1, T) # (B, C, T) - # Apply the random permutations to the mask - mask = torch.gather(mask, 1, random_permutations) - - return mask # (B, C, T) - - def maskgit_apply_random_mask(self, codes): - # Randomly replaces some codes with the MASK_TOKEN with a proportion following the cosine schedule. - # Codes: (B, C, T) - mask = self.maskgit_create_random_mask(codes) - # replace some tokens with MASK_TOKEN - codes_with_mask = torch.where(mask, self.mask_token_id, codes) - return codes_with_mask, mask - def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=None, frame_stacking_factor=1): """ Computes the audio codebook loss. Used by: @@ -1373,376 +1160,6 @@ def code_to_str(code): output_str += c logging.debug(output_str) - def clear_forbidden_logits(self, logits: torch.Tensor, forbid_audio_eos: bool = False) -> torch.Tensor: - """ - Sets logits of forbidden tokens to `-inf` so they will never be sampled. - Specifically, we forbid sampling of all special tokens except AUDIO_EOS - which is allowed by default. 
- - Args: - logits: (B, C, num_audio_tokens_per_codebook) - forbid_audio_eos (bool, optional): If True, also forbid AUDIO_EOS tokens - from being sampled. Default: False. - """ - logits[ - :, - :, - SpecialAudioToken.get_forbidden_tokens(self.codebook_size, forbid_audio_eos=forbid_audio_eos), - ] = float('-inf') - return logits - - def local_transformer_sample_maskgit( - self, - dec_output: torch.Tensor, - temperature: float = 0.7, - topk: int = 80, - unfinished_items: Dict[int, bool] = {}, - finished_items: Dict[int, bool] = {}, - use_cfg: bool = False, - cfg_scale: float = 1.0, - n_steps: int = 3, - noise_scale: float = 0.0, - fixed_schedule: Optional[List[int]] = None, - dynamic_cfg_scale: bool = False, - sampling_type: Optional[str] = None, - forbid_audio_eos: bool = False, - ) -> torch.Tensor: - """ - Sample audio codes for the current timestep using MaskGit-like iterative - prediction with the local transformer. If frame-stacking is enabled, the - codes for all frames in the stack are sampled, treated as one long sequence. - - The MaskGit process starts with all positions masked and iteratively unmasks the - most confident positions over multiple steps. By "masked" we mean that a - dedicated MASK token is used (as opposed to attention masking). The LT in this - case is a non-causal transformer decoder. At each step the model predicts all - positions at once. Of those predictions, a subset of the most confident - previously-masked positions is kept and unmasked in the next step. The number of - positions that are unmasked at each step is determined by the unmasking - schedule. We support a cosine schedule and a fixed schedule provided by the - user. - - Uses multinomial sampling with temperature, top-k, and classifier-free guidance (CFG). - - Special handling: - - * forbids special tokens (like AUDIO_BOS, AUDIO_CONTEXT_EOS, etc.) 
from being sampled - * forces / forbids EOS for finished / unfinished items respectively - * optionally, globally forbids audio EOS for all items in the batch. - This is useful early in the generation process. - * supports different unmasking methods, see `sampling_type` argument for details. - - Args: - dec_output (torch.Tensor): Decoder output tensor with shape (B, E) where B is batch size - and E is primary decoder's embedding dimension. - temperature (float, optional): Sampling temperature - topk (int, optional): Number of top-probability tokens to consider in sampling. - unfinished_items (dict, optional): Dictionary containing indices of batch - items that we are confident have not completed generation. For these items, audio EOS - sampling is forbidden. - finished_items (dict, optional): Dictionary containing indices of batch - items that we are confident are completed. For these items, audio EOS sampling - is forced. - use_cfg (bool, optional): Whether to use classifier-free guidance. If True, expects batch size - to be doubled with conditional and unconditional outputs from the primary decoder. - cfg_scale (float, optional): Scale factor for classifier-free guidance. Only used if use_cfg=True. - n_steps (int, optional): Number of iterative refinement steps for MaskGit sampling. - noise_scale (float, optional): Scale factor for noise to add to confidence scores - during sampling (experimental). - fixed_schedule (list, optional): Fixed schedule for number of tokens to unmask at each step. - If None, uses cosine schedule. - dynamic_cfg_scale (bool, optional): Whether to dynamically adjust CFG scale during - sampling (experimental). - sampling_type (str, optional): Type of sampling strategy. Options are: - ["default", "causal", "purity_causal", "purity_default"]. - - * Purity refers to "purity sampling" from https://arxiv.org/abs/2304.01515. If "purity" - is not specified, confidence sampling is used as in the original MaskGit paper. 
- * "default"/"causal": Controls the order of unmasking across frames when frame-stacking is enabled. - If "causal" is specified, frames are unmasked in causal order. "default" - doesn't impose any constraints on the unmasking order. - forbid_audio_eos (bool, optional): Whether to globally forbid audio EOS for the entire - batch. - - Returns: - torch.Tensor: Sampled audio codes with shape (B, num_codebooks, frame_stacking_factor) - """ - # dec_output: (B, E) - device = dec_output.device - # disable KV cache since our transformer is not causal - self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input_init = self.local_transformer_in_projection( - dec_output - ) # (B, 1, D) where D is the dimension of the local transformer - codebook_seq_len = self.num_audio_codebooks * self.frame_stacking_factor - B = dec_output.size(0) - - min_confidence = 0 - # this needs to be large enough that unmasked items will always remain unmasked (even after noise addition) - # Setting it smaller could allow "regret", i.e. 
re-masking a codebook that was previously unmasked; we might want to try that - max_confidence = 5 - confidences = min_confidence * torch.ones(B, codebook_seq_len, device=device) - # initialize to all masked - codes = self.mask_token_id * torch.ones((B, codebook_seq_len), device=device, dtype=torch.long) - sampled_codes = codes.clone() - if fixed_schedule is not None: - n_steps = len(fixed_schedule) - for step in range(n_steps): - # how far along we are in the unmasking process - progress = step / n_steps - # get mask fraction - frac_masked = cosine_schedule(torch.tensor(progress)) - if sampling_type == "causal" or sampling_type == "purity_causal": - frac_masked = torch.ones_like(frac_masked) * (1.0 - progress) - # how many codebooks to mask - if fixed_schedule is None: - n_masked = torch.ceil(codebook_seq_len * frac_masked).long() - else: - n_masked = codebook_seq_len - fixed_schedule[step] - n_unmasked = codebook_seq_len - n_masked - - if ( - sampling_type == "causal" or sampling_type == "purity_causal" - ): # and n_unmasked <= self.num_audio_codebooks: - # force second frame not to be unmasked - n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1)) - confidences[:, n_frames_to_allow * self.num_audio_codebooks :] = ( - min_confidence - 1 - ) # only tested for frame_stacking_factor=2 - - # pick top-confidence codebooks up to n_unmasked - _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) - if use_cfg: - actual_batch_size = topk_indices.size(0) // 2 - assert ( - topk_indices[actual_batch_size:] == topk_indices[:actual_batch_size] - ).all(), "Topk indices are not the same for conditional and unconditional codes" - - # replace masks of the top-k confident codebooks with the codes that were sampled for them - unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) - codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - - # build transformer input - local_transformer_input = local_transformer_input_init - for 
codebook_num in range(codebook_seq_len): - next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze( - 1 - ) # (B, 1, 768) - next_local_transformer_input = self.local_transformer_in_projection( - next_local_transformer_input - ) # (B, 1, d_local) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, codebook_num+1, d_local) - - # run transformer - _mask = torch.ones(B, codebook_seq_len + 1, device=device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)[ - 'output' - ] # (B, C+1, d_local) - - # get logits - logits = [] - for codebook_num in range(codebook_seq_len): - # The `codebook_num+1` is to drop first position which corresponds to the magpie latent - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, codebook_num + 1, :] - ) # (B, num_audio_tokens_per_codebook) - logits.append(codebook_logits) - logits = torch.stack(logits, dim=1) # (B, C*frame_stacking_factor, num_audio_tokens_per_codebook) - - # apply CFG - if use_cfg: - actual_batch_size = logits.size(0) // 2 - conditional_logits = logits[:actual_batch_size] - unconditional_logits = logits[actual_batch_size:] - if not dynamic_cfg_scale: - current_cfg_scale = cfg_scale - else: - # gradually increase the scale until mid point through sampling, then reduce it again - progress = step / (n_steps - 1) - # interp = -abs(progress-0.5)+0.5 # increase from 0..1 in the interval from start to midpoint and then go back to zero - # interp = 1.0 - progress # decrease from 1 to 0 - interp = progress # gradually increase from 0 to 1 - current_cfg_scale = (cfg_scale - 1) * interp + 1.0 # 1.0 --> cfg_scale --> 1.0 - cfg_logits = current_cfg_scale * conditional_logits + (1.0 - current_cfg_scale) * unconditional_logits - logits[:actual_batch_size] = cfg_logits - - # Disallow generation of special tokens - logits = 
self.clear_forbidden_logits(logits, forbid_audio_eos=forbid_audio_eos) - - # handle unfinished and finished items - for item_idx in unfinished_items: - logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - logits[item_idx, :, :] = float('-inf') - logits[item_idx, :, self.audio_eos_id] = 0.0 - - # sample with top-k - logits_topk = torch.topk(logits, topk, dim=-1)[0] # (B, C, topk) - indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) # (B, C, num_audio_tokens_per_codebook) - logits_rescored = logits.clone() - logits_rescored[indices_to_remove] = float('-inf') - probs = torch.softmax(logits_rescored / temperature, dim=-1) # (B, C, num_audio_tokens_per_codebook) - sampled_codes = torch.multinomial(probs.view(B * codebook_seq_len, -1), 1).view(B, codebook_seq_len) - if use_cfg: - sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] - probs[actual_batch_size:] = probs[:actual_batch_size] - if sampling_type != "purity_causal" and sampling_type != "purity_default": - confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) - else: - # use the max probability across all tokens for each codebook as the confidence for each codebook; known as "purity sampling" - confidences = probs.max(dim=2)[0] - # replace entries in sampled_codes with previously unmasked codebooks - sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - # add noise to confidences (as in token-critic paper, https://arxiv.org/abs/2209.04439) - if noise_scale > 0.0: - # get noise from uniform distribution in the interval [-0.5, 0.5), scale it by `noise_scale`, - # and anneal it to 0 as we approach the end of the unmasking process - noise = ( - (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) - ) # the +2 makes sure that by the last iteration the noise is exactly 0 - confidences += noise - # the conditional and unconditional get different noise and must be fixed to be 
the same again - confidences[actual_batch_size:] = confidences[:actual_batch_size] - confidence_eps = 0.1 - assert ( - confidences.max() + confidence_eps < max_confidence - ), f"Predicted confidence is approaching max_confidence: {confidences.max()}" - # for unmasked codebooks, set confidence to max so that they will remain unmasked - confidences.scatter_( - index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) - ) - codes = sampled_codes - assert not ( - codes == self.mask_token_id - ).any(), "Codes contain mask tokens after completion of MaskGit sampling" - - # break stacked groups of frames into individual frames - codes = codes.reshape(B, self.frame_stacking_factor, self.num_audio_codebooks).permute( - 0, 2, 1 - ) # B, C, frame_stacking_factor - - if use_cfg: - # drop unconditional codes - codes = codes[:actual_batch_size] - return codes - - def local_transformer_sample_autoregressive( - self, - dec_output: torch.Tensor, - temperature: float = 0.7, - topk: int = 80, - unfinished_items: Dict[int, bool] = {}, - finished_items: Dict[int, bool] = {}, - use_cfg: bool = False, - cfg_scale: float = 1.0, - use_kv_cache: bool = True, - forbid_audio_eos: bool = False, - ) -> torch.Tensor: - """ - Sample audio codes autoregressively across codebooks using the local - transformer. Uses multinomial sampling with temperature, top-k, and - classifier-free guidance (CFG). - - The sequence is initialized with the primary decoder's hidden output as the only - input and is gradually extended a code for one codebook at a time, appending the - sampled code as input sequence for the next step. At the last step the sequence - is `num_codebooks` long. If frame stacking is enabled, codes for all frames in - the stack are sampled as one long sequence and the final sequence length is - `num_codebooks * frame_stacking_factor` codes long. - - Special handling: - * forbids special tokens (like AUDIO_BOS, AUDIO_CONTEXT_EOS, etc.) 
from being sampled - * forces / forbids EOS for finished / unfinished items respectively - * optionally, globally forbids audio EOS (useful early in the generation process) - - Args: - dec_output (torch.Tensor): Decoder output tensor with shape (B, E) where B is batch size - and E is primary decoder's embedding dimension. - temperature (float, optional): Sampling temperature. - topk (int, optional): Number of top-probability tokens to consider in sampling. - unfinished_items (dict, optional): Dictionary containing indices of batch - items that we are confident have not completed generation. For these items, audio EOS - sampling is forbidden. - finished_items (dict, optional): Dictionary containing indices of batch - items that we are confident are completed. For these items, audio EOS sampling - is forced. - use_cfg (bool, optional): Whether to use classifier-free guidance. If True, expects batch size - to be doubled with conditional and unconditional outputs from the primary decoder. - cfg_scale (float, optional): Scale factor for classifier-free guidance. Only used if use_cfg=True. - use_kv_cache (bool, optional): Whether to use key-value caching in the transformer. - forbid_audio_eos (bool, optional): Whether to globally forbid audio EOS for the entire - batch. - - Returns: - torch.Tensor: Sampled audio codes with shape (B, num_codebooks, frame_stacking_factor) - where B is batch size (or actual_batch_size if use_cfg=True). 
- """ - - self.local_transformer.reset_cache(use_cache=use_kv_cache) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input = self.local_transformer_in_projection(dec_output) # (B, 1, 128) - all_preds = [] - for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] # (B, T, 128) - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, -1, :] - ) # (B, num_all_tokens_per_codebook) - if use_cfg: - actual_batch_size = codebook_logits.size(0) // 2 - conditional_logits = codebook_logits[:actual_batch_size] - unconditional_logits = codebook_logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - codebook_logits[:actual_batch_size] = cfg_logits - - for item_idx in unfinished_items: - codebook_logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - codebook_logits[item_idx, :] = float('-inf') - codebook_logits[item_idx, self.audio_eos_id] = 0.0 - - # Disallow generation of special tokens - codebook_logits = self.clear_forbidden_logits( - codebook_logits.unsqueeze(1), forbid_audio_eos=forbid_audio_eos - ).squeeze(1) - - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze( - -1 - ) # (B, num_tokens_per_codebook) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - codebook_probs = torch.softmax( - codebook_logits_rescored / temperature, dim=-1 - ) # (B, num_tokens_per_codebook) - codebook_preds = torch.multinomial(codebook_probs, 1) # (B, 1) - if use_cfg: - codebook_preds[actual_batch_size:] = 
codebook_preds[:actual_batch_size] - all_preds.append(codebook_preds) - next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze( - 1 - ) # (B, 1, 128) - next_local_transformer_input = self.local_transformer_in_projection( - next_local_transformer_input - ) # (B, 1, 128) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) # (B, T+1, 128) - - all_preds = torch.cat(all_preds, dim=1) # (B, num_codebooks * frame_stacking_factor) - all_preds = all_preds.reshape(-1, self.frame_stacking_factor, self.num_audio_codebooks).permute( - 0, 2, 1 - ) # (B, num_codebooks, frame_stacking_factor) - if use_cfg: - all_preds = all_preds[:actual_batch_size] - - return all_preds - def sample_codes_from_logits( self, all_code_logits_t: torch.Tensor, @@ -2079,22 +1496,6 @@ def compute_alignment_loss(self, attention_scores, text_lens, audio_lens, dec_co ) return alignment_loss - def pad_audio_codes(self, audio_codes: torch.Tensor): - """ - Pads the time dimension of the audio codes to a multiple of the frame stacking factor. - Args: - audio_codes (torch.Tensor): B, C, T - frame_stacking_factor (int): The factor that frames will be stacked by. - pad_token (int): The token ID to pad with. 
- Returns: - B, C, T_padded - """ - T = audio_codes.size(2) - T_padded = int(np.ceil(T / self.frame_stacking_factor) * self.frame_stacking_factor) - num_pad = T_padded - T - audio_codes = torch.nn.functional.pad(input=audio_codes, pad=(0, num_pad)) - return audio_codes - def embed_context_text(self, context_text_tokens): if self.legacy_text_conditioning: context_text_tokens = ( diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index d7dd672867c3..580a6e32ebc7 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -428,6 +428,9 @@ def log_model_architecture_summary(model: MagpieTTSModel) -> Tuple[str, Dict[str - moe_info: String for checkpoint naming (e.g., "MoE_8x2_d2048_softmax_"), empty for dense models - flops_per_component: Dict mapping component name (e.g., "decoder") to its FLOPs metrics dict """ + if isinstance(model, EasyMagpieTTSModel): + return "", {} + logging.info("=" * 60) logging.info("MODEL ARCHITECTURE SUMMARY") logging.info("=" * 60) From f62752322ae38f7e362e00766105e0541144f590 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Tue, 10 Mar 2026 02:25:00 +0000 Subject: [PATCH 81/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/base_magpietts.py | 22 +++++-------------- .../tts/models/easy_magpietts_inference.py | 9 ++------ 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py index f3eacb945051..c2372a1f7980 100644 --- a/nemo/collections/tts/models/base_magpietts.py +++ b/nemo/collections/tts/models/base_magpietts.py @@ -18,14 +18,8 @@ import torch from torch.utils.data import get_worker_info -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( - instantiate_phoneme_tokenizer, - setup_tokenizers, -) -from 
nemo.collections.tts.modules.magpietts_modules import ( - SpecialAudioToken, - cosine_schedule, -) +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import instantiate_phoneme_tokenizer, setup_tokenizers +from nemo.collections.tts.modules.magpietts_modules import SpecialAudioToken, cosine_schedule from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths from nemo.core.classes import ModelPT from nemo.utils import logging @@ -93,7 +87,7 @@ def load_state_dict(self, state_dict, strict=True): for key in state_dict.keys(): name_with_dot = f"{name}." if key.startswith(name_with_dot): - new_state_dict[key[len(name_with_dot):]] = state_dict[key] + new_state_dict[key[len(name_with_dot) :]] = state_dict[key] child.load_state_dict(new_state_dict) def setup_optimizer_param_groups(self): @@ -478,7 +472,7 @@ def local_transformer_sample_maskgit( if sampling_type == "causal" or sampling_type == "purity_causal": n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1)) - confidences[:, n_frames_to_allow * self.num_audio_codebooks:] = min_confidence - 1 + confidences[:, n_frames_to_allow * self.num_audio_codebooks :] = min_confidence - 1 _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) if use_cfg: @@ -494,9 +488,7 @@ def local_transformer_sample_maskgit( for codebook_num in range(codebook_seq_len): next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1) next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) - local_transformer_input = torch.cat( - [local_transformer_input, next_local_transformer_input], dim=1 - ) + local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) _mask = torch.ones(B, codebook_seq_len + 1, device=device) local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] @@ -545,9 +537,7 @@ def local_transformer_sample_maskgit( confidences 
= probs.max(dim=2)[0] sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) if noise_scale > 0.0: - noise = ( - (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) - ) + noise = (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) confidences += noise confidences[actual_batch_size:] = confidences[:actual_batch_size] confidence_eps = 0.1 diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index 5bab45559174..a58f12c19b89 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -25,10 +25,7 @@ from torch import nn from transformers import AutoConfig, AutoModelForCausalLM -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( - instantiate_phoneme_tokenizer, - setup_tokenizers, -) +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import instantiate_phoneme_tokenizer, setup_tokenizers from nemo.collections.tts.models import AudioCodecModel from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel from nemo.collections.tts.modules import transformer_2501 @@ -1861,9 +1858,7 @@ def infer_batch( phoneme_end = torch.where( state.phoneme_prediction_end_idx >= 0, state.phoneme_prediction_end_idx, - torch.full_like( - state.phoneme_prediction_end_idx, predicted_phoneme_tokens.size(-1) - ), + torch.full_like(state.phoneme_prediction_end_idx, predicted_phoneme_tokens.size(-1)), ) predicted_phoneme_tokens_lens = phoneme_end - phoneme_start phoneme_prediction_start_idx_out = phoneme_start From c8437acc31e26de3fa9a1f3a27571dcb54683556 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 10 Mar 2026 02:10:45 -0400 Subject: [PATCH 82/94] cleanup Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/base_magpietts.py | 36 ------------------- 1 file changed, 36 deletions(-) diff --git 
a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py index c2372a1f7980..63db14afd264 100644 --- a/nemo/collections/tts/models/base_magpietts.py +++ b/nemo/collections/tts/models/base_magpietts.py @@ -49,10 +49,6 @@ class BaseMagpieTTSModel(ModelPT): ``__init__``, data loading, training/inference logic, etc. """ - # ------------------------------------------------------------------ - # State-dict exclusion – subclasses override - # ------------------------------------------------------------------ - def _get_state_dict_keys_to_exclude(self) -> List[str]: """Return list of key substrings to exclude from checkpoint save/load. @@ -61,10 +57,6 @@ def _get_state_dict_keys_to_exclude(self) -> List[str]: """ return ['_codec_model'] - # ------------------------------------------------------------------ - # state_dict / load_state_dict / optimizer param groups - # ------------------------------------------------------------------ - def state_dict(self, destination=None, prefix='', keep_vars=False): if hasattr(self, '_no_state_dict') and self._no_state_dict: return {} @@ -109,10 +101,6 @@ def setup_optimizer_param_groups(self): self._optimizer_param_groups = [{"params": trainable_params}] - # ------------------------------------------------------------------ - # Special token helpers - # ------------------------------------------------------------------ - def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): # codes: (B, C, T') codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) @@ -160,10 +148,6 @@ def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) return codes, codes_len - # ------------------------------------------------------------------ - # Audio codec helpers - # ------------------------------------------------------------------ - def audio_to_codes(self, audio, audio_len, sample_rate=None): 
self._codec_model.eval() with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): @@ -179,10 +163,6 @@ def codes_to_audio(self, codes, codes_len): audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) return audio, audio_len, codes - # ------------------------------------------------------------------ - # Padding / forbidden-logits helpers - # ------------------------------------------------------------------ - def pad_audio_codes(self, audio_codes: torch.Tensor): """Pads the time dimension of the audio codes to a multiple of the frame stacking factor. @@ -214,10 +194,6 @@ def clear_forbidden_logits(self, logits: torch.Tensor, forbid_audio_eos: bool = ] = float('-inf') return logits - # ------------------------------------------------------------------ - # MaskGit helpers - # ------------------------------------------------------------------ - def maskgit_create_random_mask(self, codes): """Creates a mask where True indicates positions that should be replaced with MASK_TOKEN.""" B, C, T = codes.shape @@ -236,10 +212,6 @@ def maskgit_apply_random_mask(self, codes): codes_with_mask = torch.where(mask, self.mask_token_id, codes) return codes_with_mask, mask - # ------------------------------------------------------------------ - # Local transformer – training - # ------------------------------------------------------------------ - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): """Predicts the logits for all codebooks using the local transformer. 
@@ -306,10 +278,6 @@ def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_ return all_code_logits - # ------------------------------------------------------------------ - # Local transformer – AR sampling - # ------------------------------------------------------------------ - def local_transformer_sample_autoregressive( self, dec_output: torch.Tensor, @@ -401,10 +369,6 @@ def local_transformer_sample_autoregressive( return all_preds - # ------------------------------------------------------------------ - # Local transformer – MaskGit sampling - # ------------------------------------------------------------------ - def local_transformer_sample_maskgit( self, dec_output: torch.Tensor, From dc52f0aecc41c6899d44e9f41e6bbf6c0a635c77 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 10 Mar 2026 02:39:37 -0400 Subject: [PATCH 83/94] sanitize logits only for easy magpie to preserve magpietts functionality Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/base_magpietts.py | 7 +++++-- nemo/collections/tts/models/easy_magpietts_inference.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py index 63db14afd264..27073282da6c 100644 --- a/nemo/collections/tts/models/base_magpietts.py +++ b/nemo/collections/tts/models/base_magpietts.py @@ -289,6 +289,7 @@ def local_transformer_sample_autoregressive( cfg_scale: float = 1.0, use_kv_cache: bool = True, forbid_audio_eos: bool = False, + sanitize_logits: bool = False, ) -> torch.Tensor: """Sample audio codes autoregressively across codebooks using the local transformer. @@ -305,6 +306,7 @@ def local_transformer_sample_autoregressive( cfg_scale: Scale factor for CFG. use_kv_cache: Whether to use key-value caching in the local transformer. forbid_audio_eos: Whether to globally forbid audio EOS. + sanitize_logits: Whether to clamp/clean logits before sampling. 
Returns: Sampled audio codes (B, num_codebooks, frame_stacking_factor). @@ -329,8 +331,9 @@ def local_transformer_sample_autoregressive( cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits codebook_logits[:actual_batch_size] = cfg_logits - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) + if sanitize_logits: + codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) + codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) for item_idx in unfinished_items: codebook_logits[item_idx, self.audio_eos_id] = float('-inf') diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index a58f12c19b89..c5b0bf56112b 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -905,6 +905,7 @@ def _sample_audio_codes( topk=topk, use_cfg=use_cfg, cfg_scale=cfg_scale, + sanitize_logits=True, ) # Base class returns (B, C, S); flatten to (B, C*S) for downstream code audio_codes_next = audio_codes_next.permute(0, 2, 1) From f680b8f62a6c602657e804feb10cf68691617677 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Tue, 10 Mar 2026 12:29:10 -0400 Subject: [PATCH 84/94] remove custom phoneme tokenizer instantiation and handle it in the tokenizer class Signed-off-by: Paarth Neekhara --- .../tokenizers/text_to_speech/tts_tokenizers.py | 8 ++++++++ .../tts/data/text_to_speech_dataset_lhotse.py | 12 +----------- nemo/collections/tts/models/base_magpietts.py | 5 +++-- nemo/collections/tts/models/easy_magpietts.py | 5 ++--- .../tts/models/easy_magpietts_inference.py | 4 ++-- 5 files changed, 16 insertions(+), 18 deletions(-) diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py 
b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 65b27bc6b62f..0b6988e3e9a8 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -1192,6 +1192,14 @@ def __init__(self, tokenizer_path: str): self._tokenizer = Tokenizer.from_file(tokenizer_file) self.tokens = self._tokenizer.get_vocab() + phoneme_vocab_size = len(self.tokens) + self.bos_token_id = phoneme_vocab_size + self.eos_token_id = phoneme_vocab_size + 1 + self.unk_token_id = phoneme_vocab_size + 2 + self.vocab_size = phoneme_vocab_size + 3 + self.tokens[""] = self.bos_token_id + self.tokens[""] = self.eos_token_id + self.tokens[""] = self.unk_token_id self.pad = self.tokens.get("", None) def encode(self, text: str) -> List[int]: diff --git a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py index c1ac9975d215..f2de40bdb180 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py +++ b/nemo/collections/tts/data/text_to_speech_dataset_lhotse.py @@ -60,16 +60,6 @@ def setup_tokenizers(all_tokenizers_config, mode='train'): return aggregated_tokenizer -def instantiate_phoneme_tokenizer(phoneme_tokenizer_config): - phoneme_tokenizer = instantiate(phoneme_tokenizer_config) - phoneme_vocab_size = len(phoneme_tokenizer.tokens) - phoneme_tokenizer.bos_token_id = phoneme_vocab_size - phoneme_tokenizer.eos_token_id = phoneme_vocab_size + 1 - phoneme_tokenizer.unk_token_id = phoneme_vocab_size + 2 - phoneme_tokenizer.vocab_size = phoneme_vocab_size + 3 - return phoneme_tokenizer - - def check_speaker_format(item: str): # enforce the format as example like "| Language:en Dataset:HiFiTTS Speaker:9136_other |". 
pattern = r"\| Language:\w+ Dataset:[\w\d\W]+ Speaker:[\w\d\W]+ \|" @@ -207,7 +197,7 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List]]: # initialize the phoneme tokenizer once per dataset/worker when config is available. if self.phoneme_tokenizer is None and self.phoneme_tokenizer_config is not None: - self.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.phoneme_tokenizer_config) + self.phoneme_tokenizer = instantiate(self.phoneme_tokenizer_config) # define list to store batched information dataset_name_list = [] diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py index 27073282da6c..f031ebf98fab 100644 --- a/nemo/collections/tts/models/base_magpietts.py +++ b/nemo/collections/tts/models/base_magpietts.py @@ -16,9 +16,10 @@ import numpy as np import torch +from hydra.utils import instantiate from torch.utils.data import get_worker_info -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import instantiate_phoneme_tokenizer, setup_tokenizers +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers from nemo.collections.tts.modules.magpietts_modules import SpecialAudioToken, cosine_schedule from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths from nemo.core.classes import ModelPT @@ -37,7 +38,7 @@ def worker_init_fn(worker_id): tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) dataset.text_tokenizer = tokenizer if hasattr(dataset, 'phoneme_tokenizer_config'): - dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(dataset.phoneme_tokenizer_config) + dataset.phoneme_tokenizer = instantiate(dataset.phoneme_tokenizer_config) class BaseMagpieTTSModel(ModelPT): diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 115b8e2d6a99..97e47284aac9 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ 
b/nemo/collections/tts/models/easy_magpietts.py @@ -34,7 +34,6 @@ from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( MagpieTTSLhotseDataset, - instantiate_phoneme_tokenizer, setup_tokenizers, ) from nemo.collections.tts.models.base_magpietts import worker_init_fn @@ -1428,7 +1427,7 @@ def setup_training_data(self, dataset_cfg): mode='train', ) if self.cfg.get("phoneme_tokenizer", None) is not None: - dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) + dataset.phoneme_tokenizer = instantiate(self.cfg.phoneme_tokenizer) self._train_dl = torch.utils.data.DataLoader( dataset, @@ -1450,7 +1449,7 @@ def _setup_test_dataloader(self, dataset_cfg) -> torch.utils.data.DataLoader: # For num workers > 0 tokenizer will be assigned in worker_init_fn (since it is not picklable) dataset.text_tokenizer = setup_tokenizers(all_tokenizers_config=self.cfg.text_tokenizers, mode='test') if self.cfg.get("phoneme_tokenizer", None) is not None: - dataset.phoneme_tokenizer = instantiate_phoneme_tokenizer(self.cfg.phoneme_tokenizer) + dataset.phoneme_tokenizer = instantiate(self.cfg.phoneme_tokenizer) data_loader = torch.utils.data.DataLoader( dataset, diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index c5b0bf56112b..765c234e2683 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -25,7 +25,7 @@ from torch import nn from transformers import AutoConfig, AutoModelForCausalLM -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import instantiate_phoneme_tokenizer, setup_tokenizers +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers from nemo.collections.tts.models import AudioCodecModel from nemo.collections.tts.models.base_magpietts import 
BaseMagpieTTSModel from nemo.collections.tts.modules import transformer_2501 @@ -306,7 +306,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.cfg_unk_token_id = num_tokens - 1 self.phoneme_tokenizer = None if cfg.get('phoneme_tokenizer', None) is not None: - self.phoneme_tokenizer = instantiate_phoneme_tokenizer(cfg.phoneme_tokenizer) + self.phoneme_tokenizer = instantiate(cfg.phoneme_tokenizer) self.phoneme_stacking_factor = cfg.get('phoneme_stacking_factor', 1) self.phoneme_vocab_size = self.phoneme_tokenizer.vocab_size if cfg.get('phoneme_corruption_batch_prob', None) is None: From 40fb7ebbc12bdc1fa4a819fe98ba45d7c71df783 Mon Sep 17 00:00:00 2001 From: paarthneekhara Date: Tue, 10 Mar 2026 16:30:23 +0000 Subject: [PATCH 85/94] Apply isort and black reformatting Signed-off-by: paarthneekhara --- nemo/collections/tts/models/easy_magpietts.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 97e47284aac9..5a117432b986 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -32,10 +32,7 @@ from nemo.collections.asr.metrics.wer import word_error_rate from nemo.collections.asr.parts.mixins.transcription import TranscribeConfig from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import ( - MagpieTTSLhotseDataset, - setup_tokenizers, -) +from nemo.collections.tts.data.text_to_speech_dataset_lhotse import MagpieTTSLhotseDataset, setup_tokenizers from nemo.collections.tts.models.base_magpietts import worker_init_fn from nemo.collections.tts.models.easy_magpietts_inference import ( EasyMagpieTTSInferenceModel, From c8ad57a3dc27f42b44cf68a4aa1d19a205add836 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Tue, 10 Mar 2026 15:57:51 -0700 Subject: [PATCH 86/94] remove streaming 
inference script Signed-off-by: Shehzeen Hussain --- examples/tts/magpietts_streaming_inference.py | 1030 ----------------- 1 file changed, 1030 deletions(-) delete mode 100644 examples/tts/magpietts_streaming_inference.py diff --git a/examples/tts/magpietts_streaming_inference.py b/examples/tts/magpietts_streaming_inference.py deleted file mode 100644 index d25172d4e1f6..000000000000 --- a/examples/tts/magpietts_streaming_inference.py +++ /dev/null @@ -1,1030 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -MagpieTTS Streaming Inference Test Script. - -This script tests the streaming TTS inference functionality, supporting both -single sample (batch_size=1) and batched inference (batch_size>1). - -For batched inference, each item in the batch can have different context lengths -and be in different processing phases (context, prompt, phoneme-only, audio). - -Example usage: - # Single sample inference from checkpoint - python examples/tts/magpietts_streaming_inference.py \ - --hparams_file /path/to/hparams.yaml \ - --checkpoint_file /path/to/model.ckpt \ - --codecmodel_path /path/to/codec.nemo \ - --context_audio /path/to/context.wav \ - --text "Hello, this is a test of streaming TTS inference." 
\ - --output_path /path/to/output.wav - - # Batched inference with multiple context audios - python examples/tts/magpietts_streaming_inference.py \ - --nemo_file /path/to/model.nemo \ - --codecmodel_path /path/to/codec.nemo \ - --context_audio /path/to/context1.wav /path/to/context2.wav \ - --context_duration 3.0 5.0 \ - --text "First text to synthesize." "Second text to synthesize." \ - --output_path /path/to/output.wav -""" -from __future__ import annotations - -import argparse -import os -import time -from typing import Optional - -import numpy as np -import soundfile as sf -import torch -from omegaconf import OmegaConf, open_dict - -from nemo.collections.tts.models import EasyMagpieTTSModel -from nemo.utils import logging - - -def load_model( - hparams_file: Optional[str], - checkpoint_file: Optional[str], - nemo_file: Optional[str], - codecmodel_path: str, - device: str = "cuda", -) -> EasyMagpieTTSModel: - """ - Load an EasyMagpieTTSModel from checkpoint or .nemo file. - - Args: - hparams_file: Path to hparams.yaml (required with checkpoint_file). - checkpoint_file: Path to .ckpt file (required with hparams_file). - nemo_file: Path to .nemo file (alternative to hparams + checkpoint). - codecmodel_path: Path to the audio codec model. - device: Device to load model on. - - Returns: - Loaded model ready for inference. 
- """ - if hparams_file is not None and checkpoint_file is not None: - # Load from hparams + checkpoint - logging.info(f"Loading model from checkpoint: {checkpoint_file}") - model_cfg = OmegaConf.load(hparams_file) - - # Handle different config structures - if "cfg" in model_cfg: - model_cfg = model_cfg.cfg - - with open_dict(model_cfg): - # Override codec model path - model_cfg.codecmodel_path = codecmodel_path - - # Disable training datasets - model_cfg.train_ds = None - model_cfg.validation_ds = None - - model = EasyMagpieTTSModel(cfg=model_cfg) - - # Load weights - ckpt = torch.load(checkpoint_file, weights_only=False) - state_dict = ckpt['state_dict'] - model.load_state_dict(state_dict) - - elif nemo_file is not None: - # Load from .nemo file - logging.info(f"Loading model from NeMo archive: {nemo_file}") - model_cfg = EasyMagpieTTSModel.restore_from(nemo_file, return_config=True) - - with open_dict(model_cfg): - model_cfg.codecmodel_path = codecmodel_path - model_cfg.train_ds = None - model_cfg.validation_ds = None - - model = EasyMagpieTTSModel.restore_from(nemo_file, override_config_path=model_cfg) - - else: - raise ValueError("Must provide either (hparams_file + checkpoint_file) or nemo_file") - - model.to(device) - model.eval() - logging.info("Model loaded and ready for streaming inference.") - - return model - - -def load_audio(audio_path: str, target_sample_rate: int) -> torch.Tensor: - """ - Load audio file and resample if needed. - - Args: - audio_path: Path to audio file. - target_sample_rate: Target sample rate. - - Returns: - Audio tensor of shape (1, num_samples). 
- """ - audio, sr = sf.read(audio_path, dtype='float32') - - # Convert to mono if stereo - if len(audio.shape) > 1: - audio = audio.mean(axis=1) - - # Resample if needed - if sr != target_sample_rate: - import librosa - - audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sample_rate) - - return torch.from_numpy(audio).unsqueeze(0) # (1, num_samples) - - -def adjust_audio_to_duration( - audio: torch.Tensor, - sample_rate: int, - target_duration: float, - codec_model_samples_per_frame: int, -) -> torch.Tensor: - """ - Adjust audio to target_duration seconds, aligned to codec frame boundaries. - - The target number of samples is calculated to align with codec frame boundaries: - 1. Convert target_duration to number of codec frames - 2. Convert codec frames back to samples - - If audio is longer than target, take the first target_duration seconds. - If audio is shorter, repeat it until it reaches target_duration seconds. - - Args: - audio: Audio tensor of shape (1, num_samples). - sample_rate: Sample rate of the audio. - target_duration: Target duration in seconds. - codec_model_samples_per_frame: Number of audio samples per codec frame - (codec downsampling factor). - - Returns: - Audio tensor of shape (1, target_num_samples) where target_num_samples - is aligned to codec frame boundaries. 
- """ - # Calculate target samples aligned to codec frame boundaries - # Same logic as text_to_speech_dataset.py - num_codec_frames = int(target_duration * sample_rate / codec_model_samples_per_frame) - target_num_samples = num_codec_frames * codec_model_samples_per_frame - current_num_samples = audio.size(1) - - if current_num_samples >= target_num_samples: - # Audio is longer than target - take the first target_duration seconds - audio = audio[:, :target_num_samples] - else: - # Audio is shorter - repeat until we have enough samples - num_repeats = int(np.ceil(target_num_samples / current_num_samples)) - audio_repeated = audio.repeat(1, num_repeats) - audio = audio_repeated[:, :target_num_samples] - - return audio - - -def run_streaming_inference( - model: EasyMagpieTTSModel, - context_audio: torch.Tensor, - context_audio_lens: torch.Tensor, - context_text: str, - text: str, - phoneme_text: Optional[str] = None, - use_gt_phonemes: bool = False, - inference_mode: Optional[str] = None, - use_cfg: bool = False, - cfg_scale: float = 1.5, - use_local_transformer: bool = False, - temperature: float = 0.7, - topk: int = 80, - max_steps: int = 500, - verbose: bool = True, - force_dropout_text: bool = False, -) -> tuple: - """ - Run streaming TTS inference. - - Args: - model: The loaded EasyMagpieTTSModel. - context_audio: Context audio tensor (1, num_samples). - context_audio_lens: Length of context audio (1,). - context_text: Context text for speaker conditioning. - text: Main text to synthesize. - phoneme_text: Optional phoneme text for GT conditioning. If None, uses text. - use_gt_phonemes: If True, use GT phonemes as decoder input (teacher forcing). - inference_mode: Inference mode name (e.g., "streaming_4_8"). - use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor. - use_local_transformer: Whether to use local transformer. - temperature: Sampling temperature. - topk: Top-k sampling parameter. - max_steps: Maximum generation steps. 
- verbose: Whether to print progress. - - Returns: - Tuple of (output, timing_info, context_audio_decoded, context_audio_decoded_lens). - output is StreamingFinalizeOutput with audio, codes, and phoneme predictions. - context_audio_decoded is the decoded context audio from the model's internal codes (for sanity checking). - """ - device = next(model.parameters()).device - - # Encode context audio to codes - context_audio = context_audio.to(device) - context_audio_lens = context_audio_lens.to(device) - - with torch.inference_mode(): - context_audio_codes, context_audio_codes_lens = model.audio_to_codes(context_audio, context_audio_lens) - - # Tokenize context text - # Use the text conditioning tokenizer - tokenizer_name = model.text_conditioning_tokenizer_name - context_text_tokens = model.tokenizer.encode(context_text, tokenizer_name=tokenizer_name) - context_text_tokens = torch.tensor([context_text_tokens], dtype=torch.long, device=device) - context_text_tokens_lens = torch.tensor([context_text_tokens.size(1)], dtype=torch.long, device=device) - - # Tokenize main text - # Get the appropriate tokenizer name for main text - if hasattr(model.tokenizer, 'tokenizers') and 'english_phoneme' in model.tokenizer.tokenizers: - main_tokenizer_name = 'english_phoneme' - else: - main_tokenizer_name = tokenizer_name - - text_tokens = model.tokenizer.encode(text, tokenizer_name=main_tokenizer_name) - text_tokens = text_tokens + [model.eos_id] - text_tokens = torch.tensor(text_tokens, dtype=torch.long, device=device) - - # Tokenize phoneme text if provided (for GT phoneme conditioning) - gt_phoneme_tokens = None - gt_phoneme_tokens_lens = None - if model.phoneme_tokenizer is not None: - phoneme_source = phoneme_text if phoneme_text is not None else text - phoneme_tokens_list = model.phoneme_tokenizer.encode(phoneme_source) - # Add BOS and EOS - bos_id = model.phoneme_tokenizer.bos_token_id - eos_id = model.phoneme_tokenizer.eos_token_id - phoneme_tokens_list = [bos_id] + 
phoneme_tokens_list + [eos_id] - gt_phoneme_tokens = torch.tensor([phoneme_tokens_list], dtype=torch.long, device=device) - gt_phoneme_tokens_lens = torch.tensor([len(phoneme_tokens_list)], dtype=torch.long, device=device) - - phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' - - # Get streaming delays for logging - mode_name = inference_mode or model.default_inference_mode - training_mode = model.mode_name_to_mode.get(mode_name, model.training_modes[0]) - phoneme_delay = training_mode.streaming_phonemes_delay - speech_delay = training_mode.streaming_speech_delay - - if verbose: - logging.info(f"Context audio codes shape: {context_audio_codes.shape}") - logging.info(f"Context text tokens: {context_text_tokens.shape}") - logging.info(f"Main text tokens: {text_tokens.shape} ({len(text_tokens)} tokens)") - if gt_phoneme_tokens is not None: - logging.info(f"GT phoneme tokens: {gt_phoneme_tokens.shape} ({gt_phoneme_tokens_lens[0].item()} tokens)") - logging.info(f"Phoneme input type: {phoneme_input_type}") - logging.info(f"Using inference mode: {mode_name}") - logging.info(f"Phoneme delay: {phoneme_delay}, Speech delay: {speech_delay}") - logging.info("Phases: Prompt (0 to phoneme_delay) -> Phoneme-only (phoneme_delay to speech_delay) -> Audio") - - # Initialize streaming state - start_time = time.time() - - state = model.streaming_init( - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - inference_mode=inference_mode, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer=use_local_transformer, - temperature=temperature, - topk=topk, - phoneme_input_type=phoneme_input_type, - gt_phoneme_tokens=gt_phoneme_tokens, - gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, - ) - - init_time = time.time() - start_time - if verbose: - logging.info(f"Streaming init completed in {init_time:.3f}s") - - # Decode and return context 
audio for sanity check - # The context_audio_codes in state have special tokens and are stacked - # We need to remove special tokens and decode them - with torch.inference_mode(): - ctx_codes = state.context_audio_codes.clone() - ctx_codes_lens = state.context_audio_codes_lens.clone() - # Remove special tokens (BOS and EOS) - ctx_codes, ctx_codes_lens = model.remove_special_tokens( - codes=ctx_codes, - codes_len=ctx_codes_lens, - ) - # codes_to_audio will handle unstacking internally - context_audio_decoded, context_audio_decoded_lens, _ = model.codes_to_audio(ctx_codes, ctx_codes_lens) - - # Feed text tokens one at a time - generation_start = time.time() - num_audio_frames = 0 - num_phoneme_frames = 0 - prompt_phase_tokens = 0 - phoneme_only_phase_tokens = 0 - - for i, token in enumerate(text_tokens): - state, audio_codes, phoneme_tokens = model.streaming_step( - state, text_tokens=token.unsqueeze(0), force_dropout_text=force_dropout_text - ) - - # Track which phase we're in - if audio_codes is None and phoneme_tokens is None: - prompt_phase_tokens += 1 - elif audio_codes is None and phoneme_tokens is not None: - phoneme_only_phase_tokens += 1 - num_phoneme_frames += 1 - else: - if audio_codes is not None: - num_audio_frames += 1 - if phoneme_tokens is not None: - num_phoneme_frames += 1 - - if verbose and (i + 1) % 10 == 0: - phase = ( - "prompt" - if audio_codes is None and phoneme_tokens is None - else ("phoneme-only" if audio_codes is None else "audio") - ) - logging.info( - f"Processed {i + 1}/{len(text_tokens)} text tokens (phase: {phase}), " - f"audio frames: {num_audio_frames}, phoneme frames: {num_phoneme_frames}" - ) - - if state.finished: - if verbose: - logging.info(f"EOS detected at text token {i + 1}") - break - - # Continue generating until finished (text has ended) - continuation_steps = 0 - while not state.finished and continuation_steps < max_steps: - state, audio_codes, phoneme_tokens = model.streaming_step( - state, text_tokens=None, 
force_dropout_text=force_dropout_text - ) - - if audio_codes is not None: - num_audio_frames += 1 - if phoneme_tokens is not None: - num_phoneme_frames += 1 - - continuation_steps += 1 - - if verbose and continuation_steps % 20 == 0: - logging.info( - f"Continuation step {continuation_steps}, " - f"audio frames: {num_audio_frames}, phoneme frames: {num_phoneme_frames}" - ) - - generation_time = time.time() - generation_start - - if verbose: - logging.info(f"Generation completed in {generation_time:.3f}s") - logging.info(f"Prompt phase tokens: {prompt_phase_tokens}") - logging.info(f"Phoneme-only phase tokens: {phoneme_only_phase_tokens}") - logging.info(f"Audio frames generated: {num_audio_frames}") - logging.info(f"Phoneme frames generated: {num_phoneme_frames}") - logging.info(f"Continuation steps: {continuation_steps}") - - # Finalize and get complete audio - output = model.streaming_finalize(state) - - total_time = time.time() - start_time - - if verbose and output.phoneme_text: - logging.info(f"Predicted phoneme text: {output.phoneme_text[0]}") - - timing_info = { - 'init_time': init_time, - 'generation_time': generation_time, - 'total_time': total_time, - 'num_text_tokens': len(text_tokens), - 'prompt_phase_tokens': prompt_phase_tokens, - 'phoneme_only_phase_tokens': phoneme_only_phase_tokens, - 'num_audio_frames': num_audio_frames, - 'num_phoneme_frames': num_phoneme_frames, - 'continuation_steps': continuation_steps, - } - - return output, timing_info, context_audio_decoded, context_audio_decoded_lens - - -def run_batched_streaming_inference( - model: EasyMagpieTTSModel, - context_audios: list[torch.Tensor], - context_audio_lens_list: list[torch.Tensor], - context_texts: list[str], - texts: list[str], - phoneme_texts: Optional[list[str]] = None, - use_gt_phonemes: bool = False, - inference_mode: Optional[str] = None, - use_cfg: bool = False, - cfg_scale: float = 1.5, - use_local_transformer: bool = False, - temperature: float = 0.7, - topk: int = 80, - 
max_steps: int = 500, - verbose: bool = True, - force_dropout_text: bool = False, -) -> tuple: - """ - Run batched streaming TTS inference. - - Each batch item can have different context lengths. The streaming processes - only the minimum context length initially, then continues processing remaining - context per-item in the "context phase" before moving to prompt/audio phases. - - Args: - model: The loaded EasyMagpieTTSModel. - context_audios: List of context audio tensors, each (1, num_samples). - context_audio_lens_list: List of context audio lengths, each (1,). - context_texts: List of context texts for speaker conditioning. - texts: List of main texts to synthesize. - phoneme_texts: Optional list of phoneme texts for GT conditioning. If None, uses texts. - use_gt_phonemes: If True, use GT phonemes as decoder input (teacher forcing). - inference_mode: Inference mode name (e.g., "streaming_4_8"). - use_cfg: Whether to use classifier-free guidance. - cfg_scale: CFG scale factor. - use_local_transformer: Whether to use local transformer. - temperature: Sampling temperature. - topk: Top-k sampling parameter. - max_steps: Maximum generation steps. - verbose: Whether to print progress. - - Returns: - Tuple of (output, timing_info) where output is StreamingFinalizeOutput. 
- """ - device = next(model.parameters()).device - batch_size = len(context_audios) - - assert len(context_texts) == batch_size, "Number of context texts must match batch size" - assert len(texts) == batch_size, "Number of texts must match batch size" - - # Encode context audio to codes for each item - context_audio_codes_list = [] - context_audio_codes_lens_list = [] - - with torch.inference_mode(): - for i in range(batch_size): - context_audio = context_audios[i].to(device) - context_audio_lens = context_audio_lens_list[i].to(device) - codes, codes_lens = model.audio_to_codes(context_audio, context_audio_lens) - context_audio_codes_list.append(codes) - context_audio_codes_lens_list.append(codes_lens) - - # Pad and batch context audio codes - max_context_len = max(c.size(-1) for c in context_audio_codes_list) - num_codebooks = context_audio_codes_list[0].size(1) - - context_audio_codes = torch.zeros(batch_size, num_codebooks, max_context_len, dtype=torch.long, device=device) - context_audio_codes_lens = torch.zeros(batch_size, dtype=torch.long, device=device) - - for i in range(batch_size): - codes = context_audio_codes_list[i] - codes_len = context_audio_codes_lens_list[i] - context_audio_codes[i, :, : codes.size(-1)] = codes[0] - context_audio_codes_lens[i] = codes_len[0] - - # Tokenize context texts - tokenizer_name = model.text_conditioning_tokenizer_name - context_text_tokens_list = [] - for ctx_text in context_texts: - tokens = model.tokenizer.encode(ctx_text, tokenizer_name=tokenizer_name) - context_text_tokens_list.append(tokens) - - # Pad and batch context text tokens - max_context_text_len = max(len(t) for t in context_text_tokens_list) - context_text_tokens = torch.zeros(batch_size, max_context_text_len, dtype=torch.long, device=device) - context_text_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) - - for i, tokens in enumerate(context_text_tokens_list): - context_text_tokens[i, : len(tokens)] = torch.tensor(tokens, 
dtype=torch.long, device=device) - context_text_tokens_lens[i] = len(tokens) - - # Tokenize main texts - if hasattr(model.tokenizer, 'tokenizers') and 'english_phoneme' in model.tokenizer.tokenizers: - main_tokenizer_name = 'english_phoneme' - else: - main_tokenizer_name = tokenizer_name - - text_tokens_list = [] - for text in texts: - tokens = model.tokenizer.encode(text, tokenizer_name=main_tokenizer_name) - tokens = tokens + [model.eos_id] - text_tokens_list.append(torch.tensor(tokens, dtype=torch.long, device=device)) - - max_text_len = max(len(t) for t in text_tokens_list) - - # Tokenize phoneme texts if model has phoneme tokenizer - gt_phoneme_tokens = None - gt_phoneme_tokens_lens = None - if model.phoneme_tokenizer is not None: - phoneme_sources = phoneme_texts if phoneme_texts is not None else texts - bos_id = model.phoneme_tokenizer.bos_token_id - eos_id = model.phoneme_tokenizer.eos_token_id - phoneme_tokens_lists = [] - for ptext in phoneme_sources: - tokens = model.phoneme_tokenizer.encode(ptext) - tokens = [bos_id] + tokens + [eos_id] - phoneme_tokens_lists.append(tokens) - max_phoneme_len = max(len(t) for t in phoneme_tokens_lists) - gt_phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) - gt_phoneme_tokens_lens = torch.zeros(batch_size, dtype=torch.long, device=device) - for i, tokens in enumerate(phoneme_tokens_lists): - gt_phoneme_tokens[i, : len(tokens)] = torch.tensor(tokens, dtype=torch.long, device=device) - gt_phoneme_tokens_lens[i] = len(tokens) - - phoneme_input_type = 'gt' if use_gt_phonemes else 'pred' - - # Get streaming delays for logging - mode_name = inference_mode or model.default_inference_mode - training_mode = model.mode_name_to_mode.get(mode_name, model.training_modes[0]) - phoneme_delay = training_mode.streaming_phonemes_delay - speech_delay = training_mode.streaming_speech_delay - - if verbose: - logging.info(f"Batch size: {batch_size}") - logging.info(f"Context audio codes shape: 
{context_audio_codes.shape}") - logging.info(f"Context audio codes lens: {context_audio_codes_lens.tolist()}") - logging.info(f"Context text tokens shape: {context_text_tokens.shape}") - logging.info(f"Context text tokens lens: {context_text_tokens_lens.tolist()}") - logging.info(f"Max text tokens: {max_text_len}") - logging.info(f"Text tokens per item: {[len(t) for t in text_tokens_list]}") - if gt_phoneme_tokens is not None: - logging.info(f"GT phoneme tokens shape: {gt_phoneme_tokens.shape}") - logging.info(f"GT phoneme tokens lens: {gt_phoneme_tokens_lens.tolist()}") - logging.info(f"Phoneme input type: {phoneme_input_type}") - logging.info(f"Using inference mode: {mode_name}") - logging.info(f"Phoneme delay: {phoneme_delay}, Speech delay: {speech_delay}") - - # Initialize streaming state - start_time = time.time() - - state = model.streaming_init( - context_audio_codes=context_audio_codes, - context_audio_codes_lens=context_audio_codes_lens, - context_text_tokens=context_text_tokens, - context_text_tokens_lens=context_text_tokens_lens, - inference_mode=inference_mode, - use_cfg=use_cfg, - cfg_scale=cfg_scale, - use_local_transformer=use_local_transformer, - temperature=temperature, - topk=topk, - phoneme_input_type=phoneme_input_type, - gt_phoneme_tokens=gt_phoneme_tokens, - gt_phoneme_tokens_lens=gt_phoneme_tokens_lens, - ) - - init_time = time.time() - start_time - if verbose: - logging.info(f"Streaming init completed in {init_time:.3f}s") - logging.info(f"Initial context_position: {state.context_position.tolist()}") - logging.info(f"Full context lens: {state.full_context_lens.tolist()}") - - # Feed text tokens one at a time - generation_start = time.time() - step_count = 0 - num_audio_frames = 0 - - # Track which items have finished their text - text_positions = torch.zeros(batch_size, dtype=torch.long, device=device) - text_finished_mask = torch.zeros(batch_size, dtype=torch.bool, device=device) - - # Main streaming loop - while not state.finished.all() 
and step_count < max_steps + max_text_len: - # Determine which items are in context phase - in_context_phase = state.context_position < state.full_context_lens - - # Prepare text tokens for this step - # Items in context phase: use 0 (will be ignored) - # Items not in context phase: use their next text token or 0 if text finished - text_tokens_batch = torch.zeros(batch_size, dtype=torch.long, device=device) - - for i in range(batch_size): - if not in_context_phase[i] and not text_finished_mask[i]: - if text_positions[i] < len(text_tokens_list[i]): - text_tokens_batch[i] = text_tokens_list[i][text_positions[i]] - text_positions[i] += 1 - else: - text_finished_mask[i] = True - - # Determine if we should pass None (all items have finished text and exited context) - all_text_done = text_finished_mask.all() and not in_context_phase.any() - - if all_text_done: - state, audio_codes, phoneme_tokens = model.streaming_step( - state, text_tokens=None, force_dropout_text=force_dropout_text - ) - else: - state, audio_codes, phoneme_tokens = model.streaming_step( - state, text_tokens=text_tokens_batch, force_dropout_text=force_dropout_text - ) - - if audio_codes is not None: - num_audio_frames += 1 - - step_count += 1 - - if verbose and step_count % 20 == 0: - in_ctx = state.context_position < state.full_context_lens - logging.info( - f"Step {step_count}: " - f"in_context_phase={in_ctx.tolist()}, " - f"text_positions={text_positions.tolist()}, " - f"audio_frames={num_audio_frames}, " - f"finished={state.finished.tolist()}" - ) - - generation_time = time.time() - generation_start - - if verbose: - logging.info(f"Generation completed in {generation_time:.3f}s") - logging.info(f"Total steps: {step_count}") - logging.info(f"Audio frames generated: {num_audio_frames}") - - # Finalize and get complete audio - output = model.streaming_finalize(state) - - total_time = time.time() - start_time - - if verbose and output.phoneme_text: - for i, ptext in enumerate(output.phoneme_text): - 
logging.info(f"Predicted phoneme text [{i}]: {ptext}") - - timing_info = { - 'init_time': init_time, - 'generation_time': generation_time, - 'total_time': total_time, - 'num_text_tokens': [len(t) for t in text_tokens_list], - 'num_audio_frames': num_audio_frames, - 'total_steps': step_count, - } - - return output, timing_info - - -def main(): - parser = argparse.ArgumentParser( - description="MagpieTTS Streaming Inference Test Script", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - # Model loading arguments - model_group = parser.add_argument_group('Model Loading') - model_group.add_argument( - '--hparams_file', - type=str, - default=None, - help='Path to hparams.yaml file', - ) - model_group.add_argument( - '--checkpoint_file', - type=str, - default=None, - help='Path to .ckpt checkpoint file', - ) - model_group.add_argument( - '--nemo_file', - type=str, - default=None, - help='Path to .nemo model file', - ) - model_group.add_argument( - '--codecmodel_path', - type=str, - required=True, - help='Path to audio codec model (.nemo)', - ) - - # Input arguments - input_group = parser.add_argument_group('Input') - input_group.add_argument( - '--context_audio', - type=str, - nargs='+', - required=True, - help='Path(s) to context audio file(s) for speaker cloning. ' 'Multiple files enable batched inference.', - ) - input_group.add_argument( - '--context_text', - type=str, - nargs='+', - default=["[NO TEXT CONTEXT]"], - help='Context text(s) for speaker conditioning. Provide one per context audio, ' - 'or a single value to use for all. (default: "[NO TEXT CONTEXT]")', - ) - input_group.add_argument( - '--context_duration', - type=float, - nargs='+', - default=[5.0], - help='Target duration(s) for context audio in seconds. Provide one per context audio, ' - 'or a single value to use for all. If audio is longer, ' - 'first N seconds are used. If shorter, audio is repeated. 
(default: 5.0)', - ) - input_group.add_argument( - '--text', - type=str, - nargs='+', - required=True, - help='Text(s) to synthesize. Provide one per context audio for batched inference.', - ) - input_group.add_argument( - '--phoneme_text', - type=str, - nargs='+', - default=None, - help='Phoneme text(s) for GT phoneme conditioning. If not provided, uses --text. ' - 'Provide one per context audio for batched inference.', - ) - input_group.add_argument( - '--use_gt_phonemes', - action='store_true', - help='Use ground-truth phonemes as decoder input (teacher forcing). ' - 'If not set, uses model-predicted phonemes.', - ) - - # Output arguments - output_group = parser.add_argument_group('Output') - output_group.add_argument( - '--output_path', - type=str, - default='streaming_output.wav', - help='Path for output audio file', - ) - - # Inference arguments - infer_group = parser.add_argument_group('Inference Parameters') - infer_group.add_argument( - '--inference_mode', - type=str, - default=None, - help='Inference mode name (e.g., "streaming_4_8"). 
Uses model default if not specified.', - ) - infer_group.add_argument( - '--use_cfg', - action='store_true', - help='Enable classifier-free guidance', - ) - infer_group.add_argument( - '--cfg_scale', - type=float, - default=1.5, - help='CFG scale factor (higher = stronger conditioning)', - ) - infer_group.add_argument( - '--use_local_transformer', - action='store_true', - help='Use local transformer for inference', - ) - infer_group.add_argument( - '--temperature', - type=float, - default=0.7, - help='Sampling temperature', - ) - infer_group.add_argument( - '--topk', - type=int, - default=80, - help='Top-k sampling parameter', - ) - infer_group.add_argument( - '--max_steps', - type=int, - default=500, - help='Maximum generation steps after text ends', - ) - infer_group.add_argument( - '--device', - type=str, - default='cuda', - choices=['cuda', 'cpu'], - help='Device to run inference on', - ) - infer_group.add_argument( - '--verbose', - action='store_true', - help='Print detailed progress information', - ) - infer_group.add_argument( - '--force_dropout_text', - action='store_true', - help='Force dropout of text embeddings (pass zeros) to test phoneme-only inference', - ) - - args = parser.parse_args() - - # Validate arguments - has_ckpt_mode = args.hparams_file is not None and args.checkpoint_file is not None - has_nemo_mode = args.nemo_file is not None - - if not (has_ckpt_mode or has_nemo_mode): - parser.error("Must provide either (--hparams_file and --checkpoint_file) or --nemo_file") - - # Load model - model = load_model( - hparams_file=args.hparams_file, - checkpoint_file=args.checkpoint_file, - nemo_file=args.nemo_file, - codecmodel_path=args.codecmodel_path, - device=args.device, - ) - - model = model.float() - - # Determine batch size from number of context audios - batch_size = len(args.context_audio) - - # Expand context_text, context_duration, and text to match batch_size - context_texts = args.context_text - if len(context_texts) == 1 and batch_size > 
1: - context_texts = context_texts * batch_size - elif len(context_texts) != batch_size: - parser.error( - f"Number of context_texts ({len(context_texts)}) must match number of context_audios ({batch_size}) or be 1" - ) - - context_durations = args.context_duration - if len(context_durations) == 1 and batch_size > 1: - context_durations = context_durations * batch_size - elif len(context_durations) != batch_size: - parser.error( - f"Number of context_durations ({len(context_durations)}) must match number of context_audios ({batch_size}) or be 1" - ) - - texts = args.text - if len(texts) == 1 and batch_size > 1: - texts = texts * batch_size - elif len(texts) != batch_size: - parser.error(f"Number of texts ({len(texts)}) must match number of context_audios ({batch_size}) or be 1") - - # Handle phoneme_text - default to text if not provided - phoneme_texts = args.phoneme_text - if phoneme_texts is None: - phoneme_texts = texts - elif len(phoneme_texts) == 1 and batch_size > 1: - phoneme_texts = phoneme_texts * batch_size - elif len(phoneme_texts) != batch_size: - parser.error( - f"Number of phoneme_texts ({len(phoneme_texts)}) must match number of context_audios ({batch_size}) or be 1" - ) - - # Load and process context audios - context_audios = [] - context_audio_lens_list = [] - - for i, (audio_path, duration) in enumerate(zip(args.context_audio, context_durations)): - logging.info(f"Loading context audio {i+1}/{batch_size} from: {audio_path}") - audio = load_audio(audio_path, model.sample_rate) - original_duration = audio.size(1) / model.sample_rate - logging.info(f" Original duration: {original_duration:.2f}s") - - # Adjust to target duration (aligned to codec frame boundaries) - audio = adjust_audio_to_duration(audio, model.sample_rate, duration, model.codec_model_samples_per_frame) - adjusted_duration = audio.size(1) / model.sample_rate - logging.info(f" Adjusted duration: {adjusted_duration:.2f}s (target: {duration}s, codec-aligned)") - - 
context_audios.append(audio) - context_audio_lens_list.append(torch.tensor([audio.size(1)], dtype=torch.long)) - - logging.info(f"\nBatch size: {batch_size}") - logging.info(f"Context texts: {context_texts}") - logging.info(f"Texts to synthesize: {texts}") - logging.info(f"Phoneme texts: {phoneme_texts}") - logging.info(f"Use GT phonemes: {args.use_gt_phonemes}") - - # Use single-sample or batched inference - if batch_size == 1: - logging.info("\n=== Running single-sample streaming inference ===") - output, timing_info, context_audio_decoded, context_audio_decoded_lens = run_streaming_inference( - model=model, - context_audio=context_audios[0], - context_audio_lens=context_audio_lens_list[0], - context_text=context_texts[0], - text=texts[0], - phoneme_text=phoneme_texts[0], - use_gt_phonemes=args.use_gt_phonemes, - inference_mode=args.inference_mode, - use_cfg=args.use_cfg, - cfg_scale=args.cfg_scale, - use_local_transformer=args.use_local_transformer, - temperature=args.temperature, - topk=args.topk, - max_steps=args.max_steps, - verbose=args.verbose, - force_dropout_text=args.force_dropout_text, - ) - - # Save output - output_dir = os.path.dirname(args.output_path) - if output_dir and not os.path.exists(output_dir): - os.makedirs(output_dir) - - audio_np = output.audio[0, : output.audio_len[0].item()].cpu().numpy() - sf.write(args.output_path, audio_np, model.output_sample_rate) - logging.info(f"Output saved to: {args.output_path}") - - # Save decoded context audio for sanity check - output_base, output_ext = os.path.splitext(args.output_path) - context_output_path = f"{output_base}_context_decoded{output_ext}" - context_audio_np = context_audio_decoded[0, : context_audio_decoded_lens[0].item()].cpu().numpy() - sf.write(context_output_path, context_audio_np, model.output_sample_rate) - - logging.info(f"Context audio (decoded from codes) saved to: {context_output_path}") - logging.info(f"Context audio duration: {context_audio_decoded_lens[0].item() / 
model.output_sample_rate:.2f}s") - logging.info(f"Audio duration: {output.audio_len[0].item() / model.output_sample_rate:.2f}s") - logging.info(f"Generated codes shape: {output.audio_codes.shape}") - if output.phoneme_text: - logging.info(f"Predicted phoneme text: {output.phoneme_text[0]}") - - # Print timing summary - logging.info("\n=== Timing Summary ===") - logging.info(f"Init time: {timing_info['init_time']:.3f}s") - logging.info(f"Generation time: {timing_info['generation_time']:.3f}s") - logging.info(f"Total time: {timing_info['total_time']:.3f}s") - logging.info(f"Text tokens processed: {timing_info['num_text_tokens']}") - logging.info(f" - Prompt phase tokens: {timing_info['prompt_phase_tokens']}") - logging.info(f" - Phoneme-only phase tokens: {timing_info['phoneme_only_phase_tokens']}") - logging.info(f"Audio frames generated: {timing_info['num_audio_frames']}") - logging.info(f"Phoneme frames generated: {timing_info['num_phoneme_frames']}") - logging.info(f"Continuation steps: {timing_info['continuation_steps']}") - - # Calculate RTF - audio_duration = output.audio_len[0].item() / model.output_sample_rate - rtf = audio_duration / timing_info['total_time'] - logging.info(f"Real-time factor (RTF): {rtf:.2f}x") - - else: - logging.info(f"\n=== Running batched streaming inference (batch_size={batch_size}) ===") - output, timing_info = run_batched_streaming_inference( - model=model, - context_audios=context_audios, - context_audio_lens_list=context_audio_lens_list, - context_texts=context_texts, - texts=texts, - phoneme_texts=phoneme_texts, - use_gt_phonemes=args.use_gt_phonemes, - inference_mode=args.inference_mode, - use_cfg=args.use_cfg, - cfg_scale=args.cfg_scale, - use_local_transformer=args.use_local_transformer, - temperature=args.temperature, - topk=args.topk, - max_steps=args.max_steps, - verbose=args.verbose, - force_dropout_text=args.force_dropout_text, - ) - - # Save outputs for each batch item - output_dir = os.path.dirname(args.output_path) - 
if output_dir and not os.path.exists(output_dir): - os.makedirs(output_dir) - - output_base, output_ext = os.path.splitext(args.output_path) - - for i in range(batch_size): - output_path_i = f"{output_base}_{i}{output_ext}" - audio_np = output.audio[i, : output.audio_len[i].item()].cpu().numpy() - sf.write(output_path_i, audio_np, model.output_sample_rate) - audio_duration_i = output.audio_len[i].item() / model.output_sample_rate - logging.info(f"Output {i+1}/{batch_size} saved to: {output_path_i} (duration: {audio_duration_i:.2f}s)") - if output.phoneme_text and i < len(output.phoneme_text): - logging.info(f" Predicted phoneme text: {output.phoneme_text[i]}") - - logging.info(f"\nGenerated codes shape: {output.audio_codes.shape}") - - # Print timing summary - logging.info("\n=== Timing Summary ===") - logging.info(f"Init time: {timing_info['init_time']:.3f}s") - logging.info(f"Generation time: {timing_info['generation_time']:.3f}s") - logging.info(f"Total time: {timing_info['total_time']:.3f}s") - logging.info(f"Text tokens per item: {timing_info['num_text_tokens']}") - logging.info(f"Audio frames generated: {timing_info['num_audio_frames']}") - logging.info(f"Total steps: {timing_info['total_steps']}") - - # Calculate average RTF - total_audio_duration = sum(output.audio_len[i].item() for i in range(batch_size)) / model.output_sample_rate - avg_rtf = total_audio_duration / timing_info['total_time'] - logging.info(f"Average real-time factor (RTF): {avg_rtf:.2f}x") - logging.info(f"Total audio duration (all items): {total_audio_duration:.2f}s") - - -if __name__ == "__main__": - main() From cfa582f81fca415dc23514bf095ddcd76e7e85f1 Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 11 Mar 2026 10:30:34 -0700 Subject: [PATCH 87/94] Magpietts decoderonly 2601 inference refactoring (#69) * undo model pt Signed-off-by: Shehzeen Hussain * remove test infer vs proces batch Signed-off-by: Shehzeen Hussain * undo inference changes for easy magpie to start fresh 
Signed-off-by: Shehzeen Hussain * inference refactoring Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen Hussain --- docs/source/tts/magpietts-longform.rst | 6 +- docs/source/tts/magpietts.rst | 2 +- examples/tts/evalset_config.json | 1 + .../{magpietts_inference.py => tts_infer.py} | 421 ++++++----- nemo/collections/tts/models/easy_magpietts.py | 25 +- .../modules/magpietts_inference/__init__.py | 54 +- .../modules/magpietts_inference/inference.py | 682 +++++++++--------- .../tts/modules/magpietts_inference/utils.py | 113 ++- nemo/core/classes/modelPT.py | 2 +- .../tts/test_infer_vs_process_batch.py | 491 ------------- ...S_InferEvaluate_Magpietts_FrameStacking.sh | 2 +- ...TS_InferEvaluate_Magpietts_MoE_ZeroShot.sh | 2 +- ...TS_InferEvaluate_Magpietts_SeenSpeakers.sh | 2 +- ...L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh | 2 +- ...Evaluatelongform_Magpietts_MoE_ZeroShot.sh | 2 +- ...nferEvaluatelongform_Magpietts_ZeroShot.sh | 2 +- 16 files changed, 736 insertions(+), 1073 deletions(-) rename examples/tts/{magpietts_inference.py => tts_infer.py} (68%) delete mode 100644 tests/collections/tts/test_infer_vs_process_batch.py diff --git a/docs/source/tts/magpietts-longform.rst b/docs/source/tts/magpietts-longform.rst index 33aef42a5abe..fb3eeb659d33 100644 --- a/docs/source/tts/magpietts-longform.rst +++ b/docs/source/tts/magpietts-longform.rst @@ -169,7 +169,7 @@ The ``do_tts`` method automatically detects whether longform inference is needed sf.write("output.wav", long_audio[0].cpu().numpy(), 22050) -Method 2: Using CLI (``magpietts_inference.py``) +Method 2: Using CLI (``tts_infer.py``) ------------------------------------------------ For batch inference from manifests: @@ -177,7 +177,7 @@ For batch inference from manifests: .. 
code-block:: bash # Auto-detect longform based on text length (default) - python examples/tts/magpietts_inference.py \ + python examples/tts/tts_infer.py \ --nemo_files /path/to/magpietts.nemo \ --datasets_json_path /path/to/evalset_config.json \ --out_dir /path/to/output \ @@ -185,7 +185,7 @@ For batch inference from manifests: --longform_mode auto # Force longform inference for all inputs - python examples/tts/magpietts_inference.py \ + python examples/tts/tts_infer.py \ --nemo_files /path/to/magpietts.nemo \ --datasets_json_path /path/to/evalset_config.json \ --out_dir /path/to/output \ diff --git a/docs/source/tts/magpietts.rst b/docs/source/tts/magpietts.rst index b79c11ea88ff..6d297a694596 100644 --- a/docs/source/tts/magpietts.rst +++ b/docs/source/tts/magpietts.rst @@ -130,7 +130,7 @@ Several parameters control the generation behavior. The temperature setting affe .. code-block:: bash - python examples/tts/magpietts_inference.py \ + python examples/tts/tts_infer.py \ --nemo_files /path/to/magpietts_model.nemo \ --codecmodel_path /path/to/audio_codec.nemo \ --datasets your_evaluation_set \ diff --git a/examples/tts/evalset_config.json b/examples/tts/evalset_config.json index 4be3056020ce..2d61a601f880 100644 --- a/examples/tts/evalset_config.json +++ b/examples/tts/evalset_config.json @@ -15,3 +15,4 @@ "feature_dir": null } } + diff --git a/examples/tts/magpietts_inference.py b/examples/tts/tts_infer.py similarity index 68% rename from examples/tts/magpietts_inference.py rename to examples/tts/tts_infer.py index f1ed60c27428..2c3bec0aa7f7 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/tts_infer.py @@ -12,25 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -MagpieTTS Inference and Evaluation Script. +TTS Inference and Evaluation Script. 
-Supports both standard and Mixture of Experts (MoE) models with: +Supports both encoder-decoder MagpieTTS and decoder-only EasyMagpieTTS models +with: - Automatic MoE detection and FLOPs calculation - Comprehensive evaluation metrics (RTF, FLOPs, CER, SSIM, etc.) -This script provides a clean CLI for running MagpieTTS inference with optional evaluation. -It decouples inference and evaluation into separate modules for better maintainability. +This script provides a clean CLI for running TTS inference with optional +evaluation. Model-specific behaviour (dataset creation, inference loop, CLI +arguments) is handled by separate runner classes so there is no scattered +if/else branching. Example usage: - # Inference only (from .nemo file) - default behavior - python examples/tts/magpietts_inference.py \\ + # MagpieTTS inference (encoder-decoder, default) + python examples/tts/tts_infer.py \\ + --model_type magpie \\ --nemo_files /path/to/model.nemo \\ --datasets_json_path /path/to/evalset_config.json \\ --out_dir /path/to/output \\ --codecmodel_path /path/to/codec.nemo - # Inference with evaluation (from checkpoint) - python examples/tts/magpietts_inference.py \\ + # EasyMagpieTTS inference (decoder-only) + python examples/tts/tts_infer.py \\ + --model_type easy_magpie \\ + --nemo_files /path/to/model.nemo \\ + --datasets_json_path /path/to/evalset_config.json \\ + --out_dir /path/to/output \\ + --codecmodel_path /path/to/codec.nemo + + # With evaluation + python examples/tts/tts_infer.py \\ + --model_type magpie \\ --hparams_files /path/to/hparams.yaml \\ --checkpoint_files /path/to/model.ckpt \\ --datasets_json_path /path/to/evalset_config.json \\ @@ -53,20 +66,27 @@ import numpy as np from nemo.collections.asr.parts.utils.manifest_utils import read_manifest +from nemo.collections.tts.models.easy_magpietts import EasyModelInferenceParameters from nemo.collections.tts.models.magpietts import ModelInferenceParameters from 
nemo.collections.tts.modules.magpietts_inference.evaluate_generated_audio import load_evalset_config - -# Import the modular components from nemo.collections.tts.modules.magpietts_inference.evaluation import ( DEFAULT_VIOLIN_METRICS, EvaluationConfig, compute_mean_with_confidence_interval, evaluate_generated_audio_dir, ) -from nemo.collections.tts.modules.magpietts_inference.inference import InferenceConfig, MagpieInferenceRunner +from nemo.collections.tts.modules.magpietts_inference.inference import ( + BaseInferenceConfig, + BaseInferenceRunner, + EasyMagpieInferenceConfig, + EasyMagpieInferenceRunner, + MagpieInferenceConfig, + MagpieInferenceRunner, +) from nemo.collections.tts.modules.magpietts_inference.utils import ( ModelLoadConfig, get_experiment_name_from_checkpoint_path, + load_easy_magpie_model, load_magpie_model, log_model_architecture_summary, ) @@ -132,50 +152,54 @@ def create_formatted_metrics_mean_ci(metrics_mean_ci: dict) -> dict: def filter_datasets(dataset_meta_info: dict, datasets: Optional[List[str]]) -> List[str]: """Select datasets from the dataset meta info.""" if datasets is None: - # Dataset filtering not specified, return all datasets return list(dataset_meta_info.keys()) else: datasets = datasets.split(",") - # Check if datasets are valid for dataset in datasets: if dataset not in dataset_meta_info: raise ValueError(f"Dataset {dataset} not found in dataset meta info") - # Return all requsted datasets return datasets +# --------------------------------------------------------------------------- +# Core inference + evaluation orchestration (model-type agnostic) +# --------------------------------------------------------------------------- + + def run_inference_and_evaluation( - model_config: ModelLoadConfig, - inference_config: InferenceConfig, + runner: BaseInferenceRunner, + checkpoint_name: str, + inference_config: BaseInferenceConfig, eval_config: EvaluationConfig, dataset_meta_info: dict, - datasets: Optional[List[str]], + datasets: 
List[str], out_dir: str, + flops_per_component: dict, + moe_info: str, num_repeats: int = 1, confidence_level: float = 0.95, violin_plot_metrics: Optional[List[str]] = None, - log_exp_name: bool = False, clean_up_disk: bool = False, skip_evaluation: bool = False, ) -> Tuple[Optional[float], Optional[float]]: """Run inference and optional evaluation on specified datasets. - Uses unified inference path with automatic text chunking based on - per-sample language thresholds. Short texts are processed as single chunks, - long texts are automatically split into sentences. + This function is model-type agnostic -- it delegates dataset creation + and batch inference to the provided ``runner``. Args: - model_config: Configuration for loading the model. + runner: Concrete inference runner (MagpieInferenceRunner or EasyMagpieInferenceRunner). + checkpoint_name: Human-readable checkpoint identifier for output naming. inference_config: Configuration for inference. eval_config: Configuration for evaluation. dataset_meta_info: Dictionary containing dataset metadata. - datasets: List of dataset names to run inference and evaluation on. If None, all datasets in the - dataset meta info will be processed. + datasets: List of dataset names to process. out_dir: Output directory for results. + flops_per_component: FLOPs info dict from log_model_architecture_summary. + moe_info: MoE identifier string from log_model_architecture_summary. num_repeats: Number of times to repeat inference (for CI estimation). confidence_level: Confidence level for CI calculation. violin_plot_metrics: Metrics to include in violin plots. - log_exp_name: Whether to include experiment name in output paths. clean_up_disk: Whether to clean up output directory after completion. skip_evaluation: Whether to skip evaluation (inference only mode). 
@@ -185,40 +209,17 @@ def run_inference_and_evaluation( if violin_plot_metrics is None: violin_plot_metrics = list(DEFAULT_VIOLIN_METRICS) - # Remove UTMOSv2 from plots if disabled if not eval_config.with_utmosv2 and 'utmosv2' in violin_plot_metrics: violin_plot_metrics.remove('utmosv2') - # Load model - model, checkpoint_name = load_magpie_model( - model_config, is_decoder_only_model=inference_config.is_decoder_only_model - ) - # change model to fp32 for inference - model = model.float() - - # Log architecture summary and get MoE info + FLOPs metrics - moe_info, flops_per_component = log_model_architecture_summary(model) - - # Add experiment name prefix if requested - if log_exp_name and model_config.checkpoint_file: - exp_name = get_experiment_name_from_checkpoint_path(model_config.checkpoint_file) - checkpoint_name = f"{exp_name}__{checkpoint_name}" - - # Build full checkpoint identifier (include MoE info if present) full_checkpoint_name = ( f"{checkpoint_name}_{moe_info}{inference_config.build_identifier()}_SV_{eval_config.sv_model}" ) - # Create inference runner (uses unified path with automatic text chunking) - logging.info("Using unified inference with automatic text chunking based on language thresholds") - runner = MagpieInferenceRunner(model, inference_config) - - # Tracking metrics across datasets ssim_per_dataset = [] cer_per_dataset = [] all_datasets_filewise_metrics = {} - # CSV headers csv_header = ( "checkpoint_name,dataset,cer_filewise_avg,wer_filewise_avg,cer_cumulative," "wer_cumulative,ssim_pred_gt_avg,ssim_pred_context_avg,ssim_gt_context_avg," @@ -234,17 +235,14 @@ def run_inference_and_evaluation( manifest_records = read_manifest(meta['manifest_path']) language = meta.get('whisper_language', 'en') - # Prepare dataset metadata (remove evaluation-specific keys) dataset_meta_for_dl = copy.deepcopy(meta) for key in ["whisper_language", "load_cached_codes_if_available"]: dataset_meta_for_dl.pop(key, None) - # Setup output directories eval_dir = 
os.path.join(out_dir, f"{full_checkpoint_name}_{dataset}") audio_dir = os.path.join(eval_dir, "audio") os.makedirs(eval_dir, exist_ok=True) - # Setup CSV files per_run_csv = os.path.join(eval_dir, "all_experiment_metrics.csv") write_csv_header_if_needed(per_run_csv, csv_header) @@ -257,7 +255,6 @@ def run_inference_and_evaluation( repeat_audio_dir = os.path.join(audio_dir, f"repeat_{repeat_idx}") os.makedirs(repeat_audio_dir, exist_ok=True) - # Create dataset and run inference test_dataset = runner.create_dataset({dataset: dataset_meta_for_dl}) if len(test_dataset) != len(manifest_records): @@ -271,14 +268,12 @@ def run_inference_and_evaluation( manifest_records=manifest_records, audio_base_dir=meta['audio_dir'], save_cross_attention_maps=True, - save_context_audio=(repeat_idx == 0), # Only save context audio once - save_predicted_codes=eval_config.with_fcd, # Code files are only needed for FCD computation + save_context_audio=(repeat_idx == 0), + save_predicted_codes=eval_config.with_fcd, ) - # Compute mean RTF metrics mean_rtf = runner.compute_mean_rtf_metrics(rtf_metrics_list) - # Add FLOPs metrics per component for component_name, component_flops in flops_per_component.items(): for key, value in component_flops.items(): mean_rtf[f"{component_name}_{key}"] = value @@ -291,7 +286,6 @@ def run_inference_and_evaluation( logging.info("Skipping evaluation as requested.") continue - # Run evaluation eval_config_for_dataset = EvaluationConfig( sv_model=eval_config.sv_model, asr_model_name=eval_config.asr_model_name, @@ -312,7 +306,6 @@ def run_inference_and_evaluation( metrics_all_repeats.append(metrics) filewise_metrics_all_repeats.extend(filewise_metrics) - # Save metrics with open(os.path.join(eval_dir, f"{dataset}_metrics_{repeat_idx}.json"), "w") as f: json.dump(metrics, f, indent=4) @@ -320,24 +313,19 @@ def run_inference_and_evaluation( with open(os.path.join(eval_dir, f"{dataset}_filewise_metrics_{repeat_idx}.json"), "w") as f: json.dump(sorted_filewise, f, 
indent=4) - # Append to per-run CSV append_metrics_to_csv(per_run_csv, full_checkpoint_name, dataset, metrics) - # Create violin plot for this repeat violin_path = Path(eval_dir) / f"{dataset}_violin_{repeat_idx}.png" create_violin_plot(filewise_metrics, violin_plot_metrics, violin_path) - # Delete temporary predicted codes files for codec_file_path in codec_file_paths: os.remove(codec_file_path) if skip_evaluation or not metrics_all_repeats: continue - # Store for combined plot all_datasets_filewise_metrics[dataset] = filewise_metrics_all_repeats - # Compute mean with confidence interval across repeats metrics_mean_ci = compute_mean_with_confidence_interval( metrics_all_repeats, confidence=confidence_level, @@ -345,42 +333,76 @@ def run_inference_and_evaluation( formatted_metrics_mean_ci = create_formatted_metrics_mean_ci(metrics_mean_ci) - # Write to aggregated CSV ci_csv = os.path.join(out_dir, "all_experiment_metrics_with_ci.csv") write_csv_header_if_needed(ci_csv, csv_header) append_metrics_to_csv(ci_csv, full_checkpoint_name, dataset, formatted_metrics_mean_ci) - # Track per-dataset means ssim_values = [m['ssim_pred_context_avg'] for m in metrics_all_repeats] cer_values = [m['cer_cumulative'] for m in metrics_all_repeats] ssim_per_dataset.append(np.mean(ssim_values)) cer_per_dataset.append(np.mean(cer_values)) - # Create combined plot if we have multiple datasets if len(all_datasets_filewise_metrics) > 1: combined_plot_path = os.path.join(out_dir, f"{full_checkpoint_name}_combined_violin_plot.png") create_combined_box_plot(all_datasets_filewise_metrics, violin_plot_metrics, combined_plot_path) - # Clean up if requested if clean_up_disk: logging.info(f"Cleaning up output directory: {out_dir}") shutil.rmtree(out_dir) - # Return averaged metrics if ssim_per_dataset and cer_per_dataset: return np.mean(cer_per_dataset), np.mean(ssim_per_dataset) return None, None -def create_argument_parser() -> argparse.ArgumentParser: - """Create the CLI argument parser.""" - 
parser = argparse.ArgumentParser( - description='MagpieTTS Inference and Evaluation', - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, +# --------------------------------------------------------------------------- +# CLI argument parser +# --------------------------------------------------------------------------- + + +def _add_inference_param_fields( + group: argparse._ArgumentGroup, + param_cls: type, + skip_fields: Optional[set] = None, +) -> None: + """Auto-generate argparse arguments from fields of a dataclass. + + Args: + group: The argparse argument group to add arguments to. + param_cls: The dataclass whose fields to add. + skip_fields: Field names to skip (already added by another group). + """ + if skip_fields is None: + skip_fields = set() + for f in fields(param_cls): + if f.name in skip_fields: + continue + extra_args: dict = {"type": f.type} + if f.type == bool: + extra_args = {"action": "store_true"} + if f.name in ("estimate_alignment_from_layers", "apply_prior_to_layers"): + extra_args = { + "help": "Must be a comma separated string. 
Not enclosed in brackets", + "type": str, + } + elif f.name == "eos_detection_method": + extra_args["choices"] = [m.value for m in EOSDetectionMethod] + group.add_argument(f"--{f.name}", **extra_args) + + +def _add_common_args(parser: argparse.ArgumentParser) -> None: + """Add arguments shared by all model types.""" + + parser.add_argument( + '--model_type', + type=str, + default='magpie', + choices=['magpie', 'easy_magpie'], + help='Model type: "magpie" for encoder-decoder MagpieTTSModel, ' + '"easy_magpie" for decoder-only EasyMagpieTTSModel', ) - # Model loading arguments + # Model loading model_group = parser.add_argument_group('Model Loading') model_group.add_argument( '--hparams_files', @@ -422,73 +444,37 @@ def create_argument_parser() -> argparse.ArgumentParser: help='Use legacy text conditioning (for old checkpoints)', ) - # Dataset and output arguments + # Dataset and output data_group = parser.add_argument_group('Dataset and Output') data_group.add_argument( '--datasets_json_path', type=str, required=True, default=None, - help='Path to dataset configuration JSON file (will process all datasets in the file if --datasets is not specified)', + help='Path to dataset configuration JSON file', ) data_group.add_argument( '--datasets', type=str, default=None, - help='Comma-separated list of dataset names to process using names from the datasets_json_path file. 
If not specified, all datasets in the datasets_json_path will be processed.', - ) - data_group.add_argument( - '--out_dir', - type=str, - required=True, - help='Output directory for generated audio and metrics', - ) - data_group.add_argument( - '--log_exp_name', - action='store_true', - help='Include experiment name in output folder name', - ) - data_group.add_argument( - '--clean_up_disk', - action='store_true', - help='Delete output directory after completion', + help='Comma-separated list of dataset names to process', ) + data_group.add_argument('--out_dir', type=str, required=True, help='Output directory') + data_group.add_argument('--log_exp_name', action='store_true') + data_group.add_argument('--clean_up_disk', action='store_true') - # Inference arguments - infer_group = parser.add_argument_group('Inference Parameters') - # Add model specific parameters - for field in fields(ModelInferenceParameters): - extra_args = {"type": field.type} - if field.type == bool: - extra_args["action"] = "store_true" - del extra_args["type"] - if field.name == "estimate_alignment_from_layers" or field.name == "apply_prior_to_layers": - extra_args["help"] = "Must be a comma separate string. 
Not enclosed in brackets" - extra_args["type"] = str - elif field.name == "eos_detection_method": - extra_args["choices"] = [m.value for m in EOSDetectionMethod] - infer_group.add_argument(f"--{field.name}", **extra_args) + # Common inference parameters + infer_group = parser.add_argument_group('Common Inference Parameters') infer_group.add_argument('--batch_size', type=int, default=32) infer_group.add_argument('--use_cfg', action='store_true', help='Enable classifier-free guidance') - - # Local transformer / MaskGit arguments infer_group.add_argument('--use_local_transformer', action='store_true') - infer_group.add_argument('--maskgit_n_steps', type=int, default=3) - infer_group.add_argument('--maskgit_noise_scale', type=float, default=0.0) - infer_group.add_argument('--maskgit_fixed_schedule', type=int, nargs='+', default=None) - infer_group.add_argument( - '--maskgit_sampling_type', - default=None, - choices=["default", "causal", "purity_causal", "purity_default"], - ) - # Evaluation arguments + # Shared model inference parameters (max_decoder_steps, temperature, topk, cfg_scale) + _add_inference_param_fields(infer_group, EasyModelInferenceParameters) + + # Evaluation eval_group = parser.add_argument_group('Evaluation') - eval_group.add_argument( - '--run_evaluation', - action='store_true', - help='Run evaluation after inference (default: False, inference only)', - ) + eval_group.add_argument('--run_evaluation', action='store_true', help='Run evaluation after inference') eval_group.add_argument('--sv_model', type=str, default="titanet", choices=["titanet", "wavlm"]) eval_group.add_argument('--asr_model_name', type=str, default="nvidia/parakeet-tdt-1.1b") eval_group.add_argument('--num_repeats', type=int, default=1) @@ -500,70 +486,92 @@ def create_argument_parser() -> argparse.ArgumentParser: nargs='*', default=['cer', 'pred_context_ssim', 'utmosv2'], ) - eval_group.add_argument('--disable_fcd', action='store_true', help="Disable Frechet Codec Distance 
computation") + eval_group.add_argument('--disable_fcd', action='store_true') - # Quality targets (for CI/CD) + # Quality targets target_group = parser.add_argument_group('Quality Targets') target_group.add_argument('--cer_target', type=float, default=None) target_group.add_argument('--ssim_target', type=float, default=None) - target_group.add_argument('--is_decoder_only_model', action='store_true') - target_group.add_argument( - '--legacy_context_stacking', - action='store_true', - help='Use audio_bos_id/audio_eos_id instead of context_audio_bos_id/context_audio_eos_id for context stacking', - ) - target_group.add_argument('--phoneme_input_type', type=str, default='gt', choices=['predicted', 'gt']) - target_group.add_argument( - '--phoneme_sampling_method', type=str, default='argmax', choices=['argmax', 'multinomial'] - ) - target_group.add_argument('--dropout_text_input', action='store_true') - return parser +def _add_magpie_args(parser: argparse.ArgumentParser) -> None: + """Add arguments specific to encoder-decoder MagpieTTSModel.""" + group = parser.add_argument_group('MagpieTTS-specific Parameters') -def main(argv=None): - """Entry point for MagpieTTS inference and evaluation. + # MagpieTTS-specific model inference parameters (attention prior, EOS, etc.) + # Skip fields already added by the common inference group. + shared_field_names = {f.name for f in fields(EasyModelInferenceParameters)} + _add_inference_param_fields(group, ModelInferenceParameters, skip_fields=shared_field_names) - Args: - argv: Command-line arguments. If None, uses sys.argv. 
- """ - parser = create_argument_parser() - args = parser.parse_args(argv) + group.add_argument('--maskgit_n_steps', type=int, default=3) + group.add_argument('--maskgit_noise_scale', type=float, default=0.0) + group.add_argument('--maskgit_fixed_schedule', type=int, nargs='+', default=None) + group.add_argument( + '--maskgit_sampling_type', + default=None, + choices=["default", "causal", "purity_causal", "purity_default"], + ) - dataset_meta_info = load_evalset_config(args.datasets_json_path) - datasets = filter_datasets(dataset_meta_info, args.datasets) - logging.info(f"Loaded {len(datasets)} datasets: {', '.join(datasets)}") +def _add_easy_magpie_args(parser: argparse.ArgumentParser) -> None: + """Add arguments specific to decoder-only EasyMagpieTTSModel.""" + group = parser.add_argument_group('EasyMagpieTTS-specific Parameters') + group.add_argument( + '--phoneme_input_type', + type=str, + default='gt', + choices=['gt', 'predicted'], + help='Source of phoneme input for decoder-only model', + ) + group.add_argument( + '--phoneme_sampling_method', + type=str, + default='argmax', + choices=['argmax', 'multinomial'], + help='Sampling method for phoneme prediction', + ) + group.add_argument('--dropout_text_input', action='store_true', help='Force dropout on text input') + group.add_argument( + '--legacy_context_stacking', + action='store_true', + help='Use audio_bos_id/audio_eos_id for context stacking', + ) - # Determine mode and validate - has_checkpoint_mode = ( - args.hparams_files is not None - and args.checkpoint_files is not None - and args.hparams_files != "null" - and args.checkpoint_files != "null" + +def create_argument_parser() -> argparse.ArgumentParser: + """Create the CLI argument parser with all argument groups.""" + parser = argparse.ArgumentParser( + description='TTS Inference and Evaluation (MagpieTTS & EasyMagpieTTS)', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, ) - has_nemo_mode = args.nemo_files is not None and 
args.nemo_files != "null" + _add_common_args(parser) + _add_magpie_args(parser) + _add_easy_magpie_args(parser) + return parser - if not has_checkpoint_mode and not has_nemo_mode: - parser.error("You must provide either:\n 1. --hparams_files and --checkpoint_files\n 2. --nemo_files") - # Build configurations - model_inference_parameters = {} - for field in fields(ModelInferenceParameters): - field_name = field.name - arg_from_cmdline = vars(args)[field_name] - if arg_from_cmdline is not None: - if field_name in ["estimate_alignment_from_layers", "apply_prior_to_layers"]: - model_inference_parameters[field_name] = parse_layer_list(arg_from_cmdline) +# --------------------------------------------------------------------------- +# Config builders (one per model type) +# --------------------------------------------------------------------------- + + +def _build_inference_params_from_args(param_cls: type, args): + """Extract inference parameters from parsed CLI args for the given dataclass.""" + params = {} + for f in fields(param_cls): + arg_val = vars(args).get(f.name) + if arg_val is not None: + if f.name in ("estimate_alignment_from_layers", "apply_prior_to_layers"): + params[f.name] = parse_layer_list(arg_val) else: - model_inference_parameters[field_name] = arg_from_cmdline + params[f.name] = arg_val + return param_cls.from_dict(params) - if "max_decoder_steps" not in model_inference_parameters: - if args.is_decoder_only_model: - model_inference_parameters["max_decoder_steps"] = 300 - inference_config = InferenceConfig( - model_inference_parameters=ModelInferenceParameters.from_dict(model_inference_parameters), +def _build_magpie_config(args) -> MagpieInferenceConfig: + return MagpieInferenceConfig( + model_inference_parameters=_build_inference_params_from_args(ModelInferenceParameters, args), batch_size=args.batch_size, use_cfg=args.use_cfg, apply_attention_prior=args.apply_attention_prior, @@ -572,13 +580,54 @@ def main(argv=None): 
maskgit_noise_scale=args.maskgit_noise_scale, maskgit_fixed_schedule=args.maskgit_fixed_schedule, maskgit_sampling_type=args.maskgit_sampling_type, - is_decoder_only_model=args.is_decoder_only_model, + ) + + +def _build_easy_magpie_config(args) -> EasyMagpieInferenceConfig: + return EasyMagpieInferenceConfig( + model_inference_parameters=_build_inference_params_from_args(EasyModelInferenceParameters, args), + batch_size=args.batch_size, + use_cfg=args.use_cfg, + use_local_transformer=args.use_local_transformer, phoneme_input_type=args.phoneme_input_type, phoneme_sampling_method=args.phoneme_sampling_method, dropout_text_input=args.dropout_text_input, legacy_context_stacking=args.legacy_context_stacking, ) + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main(argv=None): + """Entry point for TTS inference and evaluation.""" + parser = create_argument_parser() + args = parser.parse_args(argv) + + dataset_meta_info = load_evalset_config(args.datasets_json_path) + datasets = filter_datasets(dataset_meta_info, args.datasets) + logging.info(f"Loaded {len(datasets)} datasets: {', '.join(datasets)}") + + # Validate model loading args + has_checkpoint_mode = ( + args.hparams_files is not None + and args.checkpoint_files is not None + and args.hparams_files != "null" + and args.checkpoint_files != "null" + ) + has_nemo_mode = args.nemo_files is not None and args.nemo_files != "null" + + if not has_checkpoint_mode and not has_nemo_mode: + parser.error("You must provide either:\n 1. --hparams_files and --checkpoint_files\n 2. 
--nemo_files") + + # Select model loader and config builder based on --model_type + is_easy_magpie = args.model_type == 'easy_magpie' + load_fn = load_easy_magpie_model if is_easy_magpie else load_magpie_model + inference_config = _build_easy_magpie_config(args) if is_easy_magpie else _build_magpie_config(args) + runner_cls = EasyMagpieInferenceRunner if is_easy_magpie else MagpieInferenceRunner + eval_config = EvaluationConfig( sv_model=args.sv_model, asr_model_name=args.asr_model_name, @@ -589,7 +638,7 @@ def main(argv=None): cer, ssim = None, None - # Run for each model (checkpoint or nemo) + # Iterate over model files (checkpoint or nemo) if has_checkpoint_mode: hparam_files = args.hparams_files.split(",") checkpoint_files = args.checkpoint_files.split(",") @@ -609,17 +658,28 @@ def main(argv=None): hparams_from_wandb=args.hparams_file_from_wandb, ) + model, checkpoint_name = load_fn(model_config) + moe_info, flops_per_component = log_model_architecture_summary(model) + + if args.log_exp_name and model_config.checkpoint_file: + exp_name = get_experiment_name_from_checkpoint_path(model_config.checkpoint_file) + checkpoint_name = f"{exp_name}__{checkpoint_name}" + + runner = runner_cls(model, inference_config) + cer, ssim = run_inference_and_evaluation( - model_config=model_config, + runner=runner, + checkpoint_name=checkpoint_name, inference_config=inference_config, eval_config=eval_config, dataset_meta_info=dataset_meta_info, datasets=datasets, out_dir=args.out_dir, + flops_per_component=flops_per_component, + moe_info=moe_info, num_repeats=args.num_repeats, confidence_level=args.confidence_level, violin_plot_metrics=args.violin_plot_metrics, - log_exp_name=args.log_exp_name, clean_up_disk=args.clean_up_disk, skip_evaluation=not args.run_evaluation, ) @@ -635,17 +695,24 @@ def main(argv=None): legacy_text_conditioning=args.legacy_text_conditioning, ) + model, checkpoint_name = load_fn(model_config) + moe_info, flops_per_component = 
log_model_architecture_summary(model) + + runner = runner_cls(model, inference_config) + cer, ssim = run_inference_and_evaluation( - model_config=model_config, + runner=runner, + checkpoint_name=checkpoint_name, inference_config=inference_config, eval_config=eval_config, dataset_meta_info=dataset_meta_info, datasets=datasets, out_dir=args.out_dir, + flops_per_component=flops_per_component, + moe_info=moe_info, num_repeats=args.num_repeats, confidence_level=args.confidence_level, violin_plot_metrics=args.violin_plot_metrics, - log_exp_name=args.log_exp_name, clean_up_disk=args.clean_up_disk, skip_evaluation=not args.run_evaluation, ) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 5a117432b986..19705eed1ad3 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -14,7 +14,7 @@ import json import os import random -from dataclasses import dataclass +from dataclasses import dataclass, fields from typing import Dict, List, Optional, Tuple import numpy as np @@ -98,6 +98,29 @@ class ProcessBatchOutput: selected_training_mode: Optional[str] +@dataclass +class EasyModelInferenceParameters: + """Inference parameters for the decoder-only EasyMagpieTTS model. + + Attributes: + max_decoder_steps: Maximum number of decoder steps. + temperature: Sampling temperature. + topk: Number of top-probability tokens to consider in sampling. + cfg_scale: Scale factor for classifier-free guidance. + """ + + max_decoder_steps: int = 500 + temperature: float = 0.7 + topk: int = 80 + cfg_scale: float = 2.5 + + @classmethod + def from_dict(cls, data: dict) -> 'EasyModelInferenceParameters': + field_names = {field.name for field in fields(cls)} + filtered_data = {k: v for k, v in data.items() if k in field_names} + return cls(**filtered_data) + + class EasyMagpieTTSModel(EasyMagpieTTSInferenceModel): """ Magpie-TTS Model Decoder Only Model with training support. 
diff --git a/nemo/collections/tts/modules/magpietts_inference/__init__.py b/nemo/collections/tts/modules/magpietts_inference/__init__.py index fd99780f21b2..b1ff0aefe91e 100644 --- a/nemo/collections/tts/modules/magpietts_inference/__init__.py +++ b/nemo/collections/tts/modules/magpietts_inference/__init__.py @@ -12,35 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -MagpieTTS inference and evaluation subpackage. +TTS inference and evaluation subpackage. This package provides modular components for: - Model loading and configuration (utils.py) -- Batch inference (inference.py) +- Batch inference (inference.py) for both MagpieTTS and EasyMagpieTTS - Audio quality evaluation (evaluation.py) - Metrics visualization (visualization.py) -Example Usage: - from examples.tts.magpietts import ( - InferenceConfig, +Example Usage (MagpieTTS - encoder-decoder): + from nemo.collections.tts.modules.magpietts_inference import ( + MagpieInferenceConfig, MagpieInferenceRunner, load_magpie_model, ModelLoadConfig, ) - # Load model - model_config = ModelLoadConfig( - nemo_file="/path/to/model.nemo", - codecmodel_path="/path/to/codec.nemo", - ) - model, checkpoint_name = load_magpie_model(model_config) + model_config = ModelLoadConfig(nemo_file="/path/to/model.nemo", codecmodel_path="/path/to/codec.nemo") + model, name = load_magpie_model(model_config) + runner = MagpieInferenceRunner(model, MagpieInferenceConfig()) - # Log architecture summary and retrieve MoE info + FLOPs metrics - moe_info, flops_per_component = log_model_architecture_summary(model) +Example Usage (EasyMagpieTTS - decoder-only): + from nemo.collections.tts.modules.magpietts_inference import ( + EasyMagpieInferenceConfig, + EasyMagpieInferenceRunner, + load_easy_magpie_model, + ModelLoadConfig, + ) - # Create runner and run inference - inference_config = InferenceConfig() - runner = MagpieInferenceRunner(model, inference_config) + model_config = 
ModelLoadConfig(nemo_file="/path/to/model.nemo", codecmodel_path="/path/to/codec.nemo") + model, name = load_easy_magpie_model(model_config) + runner = EasyMagpieInferenceRunner(model, EasyMagpieInferenceConfig()) """ from nemo.collections.tts.modules.magpietts_inference.evaluation import ( @@ -49,11 +51,20 @@ compute_mean_with_confidence_interval, evaluate_generated_audio_dir, ) -from nemo.collections.tts.modules.magpietts_inference.inference import InferenceConfig, MagpieInferenceRunner +from nemo.collections.tts.modules.magpietts_inference.inference import ( + BaseInferenceConfig, + BaseInferenceRunner, + EasyMagpieInferenceConfig, + EasyMagpieInferenceRunner, + InferenceConfig, + MagpieInferenceConfig, + MagpieInferenceRunner, +) from nemo.collections.tts.modules.magpietts_inference.utils import ( ModelLoadConfig, compute_ffn_flops_per_token, get_experiment_name_from_checkpoint_path, + load_easy_magpie_model, load_magpie_model, log_model_architecture_summary, ) @@ -63,12 +74,19 @@ # Utils "ModelLoadConfig", "load_magpie_model", + "load_easy_magpie_model", "compute_ffn_flops_per_token", "get_experiment_name_from_checkpoint_path", "log_model_architecture_summary", - # Inference + # Inference configs + "BaseInferenceConfig", + "MagpieInferenceConfig", + "EasyMagpieInferenceConfig", "InferenceConfig", + # Inference runners + "BaseInferenceRunner", "MagpieInferenceRunner", + "EasyMagpieInferenceRunner", # Evaluation "EvaluationConfig", "evaluate_generated_audio_dir", diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index cf325b91d71c..d5d34537e088 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -12,15 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Core inference logic for MagpieTTS. 
+Core inference logic for MagpieTTS models. -This module provides: -- InferenceConfig: Dataclass for inference hyperparameters -- MagpieInferenceRunner: Class for running batch inference with a loaded model - (uses unified inference path with automatic text chunking based on language thresholds) +This module provides a strategy-pattern based inference framework with: +- BaseInferenceConfig / MagpieInferenceConfig / EasyMagpieInferenceConfig +- BaseInferenceRunner / MagpieInferenceRunner / EasyMagpieInferenceRunner + +MagpieInferenceRunner handles the encoder-decoder MagpieTTSModel +(chunked text, generate_speech + codes_to_audio). + +EasyMagpieInferenceRunner handles the decoder-only EasyMagpieTTSModel +(infer_batch, returns audio directly). """ from __future__ import annotations +import abc import glob import os import shutil @@ -34,65 +40,56 @@ from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset -from nemo.collections.tts.models import EasyMagpieTTSModel, MagpieTTSModel +from nemo.collections.tts.models.easy_magpietts import EasyModelInferenceParameters from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging +# --------------------------------------------------------------------------- +# Inference config hierarchy +# --------------------------------------------------------------------------- + + @dataclass -class InferenceConfig: - """Configuration for MagpieTTS inference. - - Attributes: - batch_size: Batch size for inference. - use_cfg: Whether to use classifier-free guidance. - apply_attention_prior: Whether to apply attention prior during decoding. 
- - # Model specific inference parameters - model_inference_parameters: See ModelInferenceParameters dataclass - - # Local transformer / MaskGit parameters - use_local_transformer: Whether to use local transformer for inference. - maskgit_n_steps: Number of MaskGit refinement steps. - maskgit_noise_scale: Noise scale for MaskGit sampling. - maskgit_fixed_schedule: Fixed schedule for MaskGit (optional). - maskgit_sampling_type: Type of MaskGit sampling. +class BaseInferenceConfig(abc.ABC): + """Shared inference configuration fields. + + Subclasses must declare their own ``model_inference_parameters`` field + with the appropriate type (ModelInferenceParameters or + EasyModelInferenceParameters). """ - # Core sampling parameters batch_size: int = 32 use_cfg: bool = False - apply_attention_prior: bool = False + use_local_transformer: bool = False + + @abc.abstractmethod + def build_identifier(self) -> str: + """Build a unique identifier string for naming output directories.""" + ... + + @staticmethod + def _format_layer_list(layers: Optional[List[int]]) -> str: + if layers is None: + return "None" + return "".join(str(_layer) for _layer in layers) + + +@dataclass +class MagpieInferenceConfig(BaseInferenceConfig): + """Configuration for encoder-decoder MagpieTTSModel inference.""" + model_inference_parameters: ModelInferenceParameters = field(default_factory=ModelInferenceParameters) + apply_attention_prior: bool = False - # Local transformer / MaskGit parameters - use_local_transformer: bool = False + # MaskGit parameters maskgit_n_steps: int = 3 maskgit_noise_scale: float = 0.0 maskgit_fixed_schedule: Optional[List[int]] = None maskgit_sampling_type: Optional[str] = None - # Decoder-only inference options - phoneme_input_type: str = "gt" # gt or predicted - phoneme_sampling_method: str = "argmax" # argmax or multinomial - dropout_text_input: bool = False - legacy_context_stacking: bool = False # Use audio_bos_id/audio_eos_id for context stacking - - # Longform 
inference mode - longform_mode: str = "auto" # "auto" | "always" | "never" - longform_word_threshold: int = 40 # Word threshold for auto-detection - - is_decoder_only_model: bool = False - def build_identifier(self) -> str: - """Build a unique identifier string for this configuration. - - Used for naming output directories and files. - - Returns: - String identifier incorporating key config values. - """ parts = [ f"Temp{self.model_inference_parameters.temperature}", f"Topk{self.model_inference_parameters.topk}", @@ -123,134 +120,69 @@ def build_identifier(self) -> str: return "_".join(parts) - @staticmethod - def _format_layer_list(layers: Optional[List[int]]) -> str: - """Format a list of layer indices as a compact string.""" - if layers is None: - return "None" - return "".join(str(_layer) for _layer in layers) + +@dataclass +class EasyMagpieInferenceConfig(BaseInferenceConfig): + """Configuration for decoder-only EasyMagpieTTSModel inference.""" + + model_inference_parameters: EasyModelInferenceParameters = field( + default_factory=EasyModelInferenceParameters + ) + phoneme_input_type: str = "gt" + phoneme_sampling_method: str = "argmax" + dropout_text_input: bool = False + legacy_context_stacking: bool = False + + def build_identifier(self) -> str: + parts = [ + f"Temp{self.model_inference_parameters.temperature}", + f"Topk{self.model_inference_parameters.topk}", + f"Cfg_{self.use_cfg}_{self.model_inference_parameters.cfg_scale}", + f"LT_{self.use_local_transformer}", + f"Phoneme_{self.phoneme_input_type}_{self.phoneme_sampling_method}", + ] + return "_".join(parts) + + +# Backwards-compatible aliases +InferenceConfig = MagpieInferenceConfig + + +# --------------------------------------------------------------------------- +# Inference runner hierarchy +# --------------------------------------------------------------------------- -class MagpieInferenceRunner: - """Runner class for MagpieTTS batch inference. 
+class BaseInferenceRunner(abc.ABC): + """Abstract base for TTS inference runners. - Encapsulates the logic for running inference on a dataset, saving outputs, - and collecting metrics. + Provides shared utilities (batch-to-cuda, file cleanup, reference audio + copying, RTF metrics) and declares the interface that concrete runners + must implement. """ - def __init__( - self, # model can be MagpieTTSModel or DecoderOnlyMagpieTTSModel - model: Union[MagpieTTSModel, EasyMagpieTTSModel], - config: InferenceConfig, - ): - """Initialize the inference runner. - - Args: - model: Loaded MagpieTTS model (should be on GPU and in eval mode). - config: Inference configuration. - """ + def __init__(self, model, config: BaseInferenceConfig): self.model = model self.config = config - - # Set legacy context stacking flag on model - self.model.legacy_context_stacking = config.legacy_context_stacking - - # Set phoneme probability to 1 for inference self._configure_tokenizer() - - # Cached state from create_dataset (set when create_dataset is called) self._manifest_records: Optional[List[dict]] = None self._audio_base_dir: Optional[str] = None - def _configure_tokenizer(self) -> None: - """Configure the tokenizer for inference (phoneme prob = 1.0).""" - g2p = None - if isinstance(self.model.tokenizer, AggregatedTTSTokenizer): - if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr( - self.model.tokenizer.tokenizers["english_phoneme"], "g2p" - ): - g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p - elif isinstance(self.model.tokenizer, IPATokenizer): - g2p = self.model.tokenizer.g2p - - if g2p is not None: - g2p.phoneme_probability = 1.0 + # -- interface ----------------------------------------------------------- + @abc.abstractmethod def create_dataset( self, dataset_meta: dict, context_duration_min: Optional[float] = None, context_duration_max: Optional[float] = None, ) -> Union[ChunkedTTSInferenceDataset, MagpieTTSDataset]: - """Create an inference 
dataset. - - Standard MagpieTTS uses the chunked inference dataset from `main`. - Decoder-only MagpieTTS uses the regular dataset and its dedicated - `infer_batch()` inference path. - - Args: - dataset_meta: Dataset metadata dictionary with 'manifest_path' and 'audio_dir'. - context_duration_min: Minimum context duration (uses model default if None). - context_duration_max: Maximum context duration (uses model default if None). - - Returns: - Configured ChunkedTTSInferenceDataset instance. - """ - # Use model defaults if not specified - if context_duration_min is None: - context_duration_min = self.model.cfg.get('context_duration_min', 5.0) - if context_duration_max is None: - context_duration_max = self.model.cfg.get('context_duration_max', 5.0) - - # For multi-encoder models, use fixed 5s context for fair evaluation - if context_duration_min < 5.0 and context_duration_max > 5.0: - context_duration_min = 5.0 - context_duration_max = 5.0 - - # Read manifest and cache for later use - dataset_name = list(dataset_meta.keys())[0] - dataset_info = dataset_meta[dataset_name] - manifest_path = dataset_info.get('manifest_path') - audio_dir = dataset_info.get('audio_dir', '') - logging.info(f"Dataset name: {dataset_name}, manifest_path: {manifest_path}, audio_dir: {audio_dir}") - - self._manifest_records = read_manifest(manifest_path) - self._audio_base_dir = audio_dir - if self.config.is_decoder_only_model: - logging.info("Creating standard inference dataset for decoder-only model") - dataset = MagpieTTSDataset( - dataset_meta=dataset_meta, - sample_rate=self.model.sample_rate, - min_duration=0.5, - max_duration=20, - codec_model_samples_per_frame=self.model.codec_model_samples_per_frame, - bos_id=getattr(self.model, "bos_id", None), - eos_id=self.model.eos_id, - num_audio_codebooks=self.model.num_audio_codebooks, - prior_scaling_factor=None, - load_cached_codes_if_available=False, - dataset_type='test', - tokenizer_config=None, - load_16khz_audio=False, - 
use_text_conditioning_tokenizer=True, - text_conditioning_tokenizer_name=self.model.text_conditioning_tokenizer_name, - pad_context_text_to_max_duration=False, - context_duration_min=context_duration_min, - context_duration_max=context_duration_max, - ) - dataset.text_tokenizer = self.model.tokenizer - else: - logging.info("Creating unified inference dataset") - dataset = self._create_chunked_inference_dataset(dataset_meta, context_duration_min, context_duration_max) - - if hasattr(self.model, 'phoneme_tokenizer'): - dataset.phoneme_tokenizer = self.model.phoneme_tokenizer - - return dataset + ... + @abc.abstractmethod def run_inference_on_dataset( self, - dataset: ChunkedTTSInferenceDataset, + dataset, output_dir: str, manifest_records: Optional[List[dict]] = None, audio_base_dir: Optional[str] = None, @@ -258,127 +190,66 @@ def run_inference_on_dataset( save_context_audio: bool = True, save_predicted_codes: bool = True, ) -> Tuple[List[dict], List[str], List[str]]: - """Run inference on a dataset. - - Args: - dataset: The inference dataset (created by create_dataset()). - output_dir: Directory to save generated audio and artifacts. - manifest_records: Original manifest records (uses cached if None). - audio_base_dir: Base directory for audio paths (uses cached if None). - save_cross_attention_maps: Whether to save attention map images (not used in unified path). - save_context_audio: Whether to copy context audio files. - save_predicted_codes: Whether to save predicted code files. - - Returns: - Tuple of: - - rtf_metrics: List of real-time factor metrics per batch. - - generated_audio_paths: List of paths to generated audio files. - - codec_file_paths: List of paths to predicted codes files. - """ - # Use cached values if not provided + ... 
+ + # -- shared helpers ------------------------------------------------------ + + def _configure_tokenizer(self) -> None: + """Configure the tokenizer for inference (phoneme prob = 1.0).""" + g2p = None + if isinstance(self.model.tokenizer, AggregatedTTSTokenizer): + if "english_phoneme" in self.model.tokenizer.tokenizers and hasattr( + self.model.tokenizer.tokenizers["english_phoneme"], "g2p" + ): + g2p = self.model.tokenizer.tokenizers["english_phoneme"].g2p + elif isinstance(self.model.tokenizer, IPATokenizer): + g2p = self.model.tokenizer.g2p + + if g2p is not None: + g2p.phoneme_probability = 1.0 + + def _resolve_manifest_and_audio_dir( + self, + manifest_records: Optional[List[dict]], + audio_base_dir: Optional[str], + ) -> Tuple[List[dict], str]: if manifest_records is None: if self._manifest_records is None: raise ValueError("manifest_records not provided and not cached from create_dataset()") manifest_records = self._manifest_records - if audio_base_dir is None: if self._audio_base_dir is None: raise ValueError("audio_base_dir not provided and not cached from create_dataset()") audio_base_dir = self._audio_base_dir + return manifest_records, audio_base_dir - if self.config.is_decoder_only_model: - logging.info("Using decoder-only inference path") - return self._run_decoder_only_inference( - dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes - ) - - logging.info("Using unified inference path") - return self._run_unified_inference( - dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes - ) + def _read_and_cache_manifest(self, dataset_meta: dict) -> Tuple[str, str]: + """Read manifest from dataset_meta, cache records, return (manifest_path, audio_dir).""" + dataset_name = list(dataset_meta.keys())[0] + dataset_info = dataset_meta[dataset_name] + manifest_path = dataset_info.get('manifest_path') + audio_dir = dataset_info.get('audio_dir', '') + logging.info(f"Dataset name: 
{dataset_name}, manifest_path: {manifest_path}, audio_dir: {audio_dir}") + self._manifest_records = read_manifest(manifest_path) + self._audio_base_dir = audio_dir + return manifest_path, audio_dir - def _run_decoder_only_inference( + def _get_context_durations( self, - dataset: MagpieTTSDataset, - output_dir: str, - manifest_records: List[dict], - audio_base_dir: str, - save_context_audio: bool = True, - save_predicted_codes: bool = True, - ) -> Tuple[List[dict], List[str], List[str]]: - """Run inference for decoder-only models via `infer_batch()`.""" - os.makedirs(output_dir, exist_ok=True) - self._delete_old_generated_files(output_dir) - - dataloader = torch.utils.data.DataLoader( - dataset, - batch_size=self.config.batch_size, - collate_fn=dataset.collate_fn, - num_workers=0, - shuffle=False, - ) - - all_rtf_metrics = [] - generated_audio_paths = [] - codec_file_paths = [] - item_idx = 0 - phoneme_sampling_method = ( - "argmax" if self.config.phoneme_sampling_method == "greedy" else self.config.phoneme_sampling_method - ) - - for batch_idx, batch in enumerate(dataloader): - logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") - batch = self._batch_to_cuda(batch) - output = self.model.infer_batch( - batch, - max_decoder_steps=self.config.model_inference_parameters.max_decoder_steps, - temperature=self.config.model_inference_parameters.temperature, - topk=self.config.model_inference_parameters.topk, - use_cfg=self.config.use_cfg, - cfg_scale=self.config.model_inference_parameters.cfg_scale, - use_local_transformer_for_inference=self.config.use_local_transformer, - phoneme_input_type=self.config.phoneme_input_type, - phoneme_sampling_method=phoneme_sampling_method, - force_dropout_text=self.config.dropout_text_input, - ) - predicted_audio = output.predicted_audio - predicted_audio_lens = output.predicted_audio_lens - predicted_codes = output.predicted_codes - predicted_codes_lens = output.predicted_codes_lens - rtf_metrics = output.rtf_metrics - - 
all_rtf_metrics.append(rtf_metrics) - logging.info(f"Output shape: {predicted_audio.size()}") - - for idx in range(predicted_audio.size(0)): - audio_len = predicted_audio_lens[idx].item() - audio_np = predicted_audio[idx].float().detach().cpu().numpy()[:audio_len] - audio_path = os.path.join(output_dir, f"predicted_audio_{item_idx}.wav") - sample_rate = getattr(self.model, "output_sample_rate", self.model.sample_rate) - sf.write(audio_path, audio_np, sample_rate) - generated_audio_paths.append(audio_path) - - if save_context_audio and item_idx < len(manifest_records): - self._copy_reference_audio( - manifest_records[item_idx], - audio_base_dir, - output_dir, - item_idx, - ) - - if save_predicted_codes: - code_len = predicted_codes_lens[idx].item() - codes_path = os.path.join(output_dir, f"predicted_codes_{item_idx}.pt") - torch.save(predicted_codes[idx, :, :code_len].detach().cpu(), codes_path) - codec_file_paths.append(codes_path) - - item_idx += 1 - - return all_rtf_metrics, generated_audio_paths, codec_file_paths + context_duration_min: Optional[float], + context_duration_max: Optional[float], + ) -> Tuple[float, float]: + if context_duration_min is None: + context_duration_min = self.model.cfg.get('context_duration_min', 5.0) + if context_duration_max is None: + context_duration_max = self.model.cfg.get('context_duration_max', 5.0) + if context_duration_min < 5.0 and context_duration_max > 5.0: + context_duration_min = 5.0 + context_duration_max = 5.0 + return context_duration_min, context_duration_max @staticmethod def _batch_to_cuda(batch: dict) -> dict: - """Move batch tensors to CUDA device.""" batch_cuda = {} for key, value in batch.items(): if isinstance(value, torch.Tensor): @@ -389,7 +260,6 @@ def _batch_to_cuda(batch: dict) -> dict: @staticmethod def _delete_old_generated_files(output_dir: str) -> None: - """Delete leftover generated files from previous runs.""" logging.info(f"Cleaning up old generated files in: {output_dir}") patterns = [ 
"predicted_codes*.pt", @@ -407,7 +277,6 @@ def _copy_reference_audio( output_dir: str, item_idx: int, ) -> None: - """Copy context and target audio files to output directory.""" context_path = record.get('context_audio_filepath') target_path = record.get('audio_filepath') @@ -425,48 +294,69 @@ def _copy_reference_audio( @staticmethod def compute_mean_rtf_metrics(rtf_metrics_list: List[dict]) -> Dict[str, float]: - """Compute mean RTF metrics across batches.""" if not rtf_metrics_list or not rtf_metrics_list[0]: return {} - mean_metrics = {} for key in rtf_metrics_list[0]: values = [m[key] for m in rtf_metrics_list if key in m] mean_metrics[key] = float(sum(values) / len(values)) if values else 0.0 - return mean_metrics - def _create_chunked_inference_dataset( + +# --------------------------------------------------------------------------- +# MagpieInferenceRunner (encoder-decoder MagpieTTSModel) +# --------------------------------------------------------------------------- + + +class MagpieInferenceRunner(BaseInferenceRunner): + """Runner for encoder-decoder MagpieTTSModel. + + Uses ChunkedTTSInferenceDataset and model.generate_speech() per chunk, + then model.codes_to_audio() to produce waveforms. + """ + + def __init__(self, model, config: MagpieInferenceConfig): + super().__init__(model, config) + + def create_dataset( self, dataset_meta: dict, context_duration_min: Optional[float] = None, context_duration_max: Optional[float] = None, ) -> ChunkedTTSInferenceDataset: - """Create a unified inference dataset. - - Creates ChunkedTTSInferenceDataset which uses language-aware chunking - to automatically handle both short and long texts. + context_duration_min, context_duration_max = self._get_context_durations( + context_duration_min, context_duration_max + ) + self._read_and_cache_manifest(dataset_meta) - Args: - dataset_meta: Dataset metadata dictionary (same format as MagpieTTSDataset). - context_duration_min: Minimum context duration (uses model default if None). 
- context_duration_max: Maximum context duration (uses model default if None). + logging.info("Creating unified inference dataset") + dataset = self._create_chunked_inference_dataset(dataset_meta, context_duration_min, context_duration_max) + return dataset - Returns: - Configured ChunkedTTSInferenceDataset instance. - """ - # Use model defaults if not specified - if context_duration_min is None: - context_duration_min = self.model.cfg.get('context_duration_min', 5.0) - if context_duration_max is None: - context_duration_max = self.model.cfg.get('context_duration_max', 5.0) + def run_inference_on_dataset( + self, + dataset: ChunkedTTSInferenceDataset, + output_dir: str, + manifest_records: Optional[List[dict]] = None, + audio_base_dir: Optional[str] = None, + save_cross_attention_maps: bool = True, + save_context_audio: bool = True, + save_predicted_codes: bool = True, + ) -> Tuple[List[dict], List[str], List[str]]: + manifest_records, audio_base_dir = self._resolve_manifest_and_audio_dir(manifest_records, audio_base_dir) + logging.info("Using unified inference path") + return self._run_unified_inference( + dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes + ) - # For multi-encoder models, use fixed 5s context for fair evaluation - if context_duration_min < 5.0 and context_duration_max > 5.0: - context_duration_min = 5.0 - context_duration_max = 5.0 + # -- private ------------------------------------------------------------- - # Create unified dataset - language and tokenizer are determined per-sample from manifest + def _create_chunked_inference_dataset( + self, + dataset_meta: dict, + context_duration_min: float, + context_duration_max: float, + ) -> ChunkedTTSInferenceDataset: dataset = ChunkedTTSInferenceDataset( dataset_meta=dataset_meta, sample_rate=self.model.output_sample_rate, @@ -480,10 +370,7 @@ def _create_chunked_inference_dataset( 
pad_context_text_to_max_duration=self.model.pad_context_text_to_max_duration, load_16khz_audio=self.model.model_type == 'single_encoder_sv_tts', ) - - # Attach model's tokenizer dataset.text_tokenizer = self.model.tokenizer - return dataset def _run_unified_inference( @@ -495,26 +382,6 @@ def _run_unified_inference( save_context_audio: bool = True, save_predicted_codes: bool = True, ) -> Tuple[List[dict], List[str], List[str]]: - """Run unified inference with automatic single/multi-chunk handling. - - Processes all samples through generate_speech, passing - beginning_of_text and end_of_text so the model can handle both - single-chunk (short text) and multi-chunk (long text) cases correctly. - - Args: - dataset: ChunkedTTSInferenceDataset created by create_dataset(). - output_dir: Directory to save generated audio and artifacts. - manifest_records: List of manifest record dictionaries. - audio_base_dir: Base directory for resolving audio paths. - save_context_audio: Whether to copy context audio files. - save_predicted_codes: Whether to save predicted code files. - - Returns: - Tuple of: - - rtf_metrics: List of real-time factor metrics per batch. - - generated_audio_paths: List of paths to generated audio files. - - codec_file_paths: List of paths to predicted codes files. 
- """ os.makedirs(output_dir, exist_ok=True) self._delete_old_generated_files(output_dir) @@ -522,7 +389,7 @@ def _run_unified_inference( dataset, batch_size=self.config.batch_size, collate_fn=dataset.collate_fn, - num_workers=0, # Avoid multiprocessing issues with CUDA + num_workers=0, shuffle=False, ) @@ -534,54 +401,42 @@ def _run_unified_inference( for batch_idx, batch in enumerate(dataloader): logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") - # Move batch tensors to CUDA batch = self._batch_to_cuda(batch) - batch['sample_rate'] = self.model.output_sample_rate batch['context_sample_rate'] = self.model.output_sample_rate batch_size = len(batch['chunked_tokens']) max_num_chunks = max(len(tokens) for tokens in batch['chunked_tokens']) - # Clear stale KV cache from prior inference calls (e.g., the previous batch or dataset - # may have left with populated tensors). logging.info(f"Resetting KV cache for decoder: {self.model.use_kv_cache_for_inference}") use_kv_cache_for_this_batch = self.model.use_kv_cache_for_inference if max_num_chunks == 1 else False self.model.decoder.reset_cache(use_cache=use_kv_cache_for_this_batch) - # Create chunk state for this batch chunk_state = self.model.create_chunk_state(batch_size=batch_size) - # Accumulators for predicted codes predicted_codes_per_sample = [[] for _ in range(batch_size)] predicted_codes_lens = [0 for _ in range(batch_size)] - # Overwrite the model's parameters since we want to use the arguments from the commandline self.model.inference_parameters = self.config.model_inference_parameters start_time = time.time() - # Iterate over text chunks (1 for short text, N for long text) for chunk_idx in range(max_num_chunks): - # Extract current chunk tokens for each sample current_tokens = [] current_tokens_lens = [] for b_idx in range(batch_size): current_tokens.append(batch['chunked_tokens'][b_idx][chunk_idx]) current_tokens_lens.append(batch['chunked_tokens_lens'][b_idx][chunk_idx]) - # Pad tokens to max 
length in this chunk max_len = max(current_tokens_lens) batch['text'] = stack_tensors(current_tokens, max_lens=[max_len]).cuda() batch['text_lens'] = torch.tensor(current_tokens_lens, dtype=torch.int32).cuda() - # Compute is_end_of_text flags (per-sample) is_end_of_text = self._compute_end_of_text_flags( batch, chunk_idx, max_num_chunks, current_tokens_lens, batch_size ) beginning_of_text = chunk_idx == 0 - # Call generate_speech (unified entry point) output = self.model.generate_speech( batch, chunk_state=chunk_state, @@ -595,16 +450,12 @@ def _run_unified_inference( maskgit_sampling_type=self.config.maskgit_sampling_type, ) - # Unpack output chunk_codes = output.predicted_codes chunk_codes_lens = output.predicted_codes_lens - # Accumulate codes for each sample for b_idx in range(batch_size): - # Skip if this sample's text has ended (padding chunks) if is_end_of_text[b_idx] and current_tokens_lens[b_idx] == 1: continue - code_len = chunk_codes_lens[b_idx] if code_len > 0: codes_slice = chunk_codes[b_idx][:, :code_len] @@ -614,17 +465,14 @@ def _run_unified_inference( elapsed = time.time() - start_time logging.info(f"Batch inference time: {elapsed:.2f}s") - # Concatenate codes and convert to audio predicted_codes_list = [] for b_idx in range(batch_size): if predicted_codes_per_sample[b_idx]: concatenated = torch.cat(predicted_codes_per_sample[b_idx], dim=1).cuda() else: - # Empty placeholder concatenated = torch.zeros((self.model.num_audio_codebooks, 1), dtype=torch.long, device='cuda') predicted_codes_list.append(concatenated) - # Stack and convert to audio max_code_len = max(predicted_codes_lens) if any(predicted_codes_lens) else 1 predicted_codes = stack_tensors(predicted_codes_list, max_lens=[max_code_len]).cuda() predicted_codes_lens_tensor = torch.tensor(predicted_codes_lens, dtype=torch.long, device='cuda') @@ -633,7 +481,6 @@ def _run_unified_inference( predicted_codes, predicted_codes_lens_tensor ) - # Compute RTF metrics total_audio_samples = 
sum(predicted_audio_lens.cpu().tolist()) total_audio_seconds = total_audio_samples / self.model.output_sample_rate rtf = elapsed / total_audio_seconds if total_audio_seconds > 0 else 0.0 @@ -644,7 +491,6 @@ def _run_unified_inference( } all_rtf_metrics.append(rtf_metrics) - # Save outputs predicted_audio_np = predicted_audio.float().detach().cpu().numpy() for b_idx in range(batch_size): @@ -656,7 +502,6 @@ def _run_unified_inference( sf.write(audio_path, audio_np, self.model.output_sample_rate) generated_audio_paths.append(audio_path) - # Copy reference audio if requested if save_context_audio and sample_idx < len(manifest_records): self._copy_reference_audio( manifest_records[sample_idx], @@ -667,7 +512,7 @@ def _run_unified_inference( if save_predicted_codes: codes_path = os.path.join(output_dir, f"predicted_codes_{sample_idx}.pt") - predicted_codes_current = predicted_codes[b_idx, :, : predicted_codes_lens[b_idx]] # C, T + predicted_codes_current = predicted_codes[b_idx, :, : predicted_codes_lens[b_idx]] torch.save(predicted_codes_current, codes_path) codec_file_paths.append(codes_path) @@ -675,38 +520,173 @@ def _run_unified_inference( return all_rtf_metrics, generated_audio_paths, codec_file_paths + @staticmethod def _compute_end_of_text_flags( - self, batch: Dict[str, Any], chunk_idx: int, max_num_chunks: int, current_tokens_lens: List[int], batch_size: int, ) -> List[bool]: - """Compute end-of-text flags for each sample in batch. - - Args: - batch: Current batch dictionary. - chunk_idx: Current chunk index. - max_num_chunks: Maximum number of chunks in this batch. - current_tokens_lens: Token lengths for current chunk per sample. - batch_size: Number of samples in batch. - - Returns: - List of booleans indicating if each sample has reached end of text. 
- """ is_end_of_text = [] for b_idx in range(batch_size): if chunk_idx == max_num_chunks - 1: - # Last chunk is_end_of_text.append(True) elif current_tokens_lens[b_idx] == 1: - # Current chunk is padding is_end_of_text.append(True) elif batch['chunked_tokens_lens'][b_idx][chunk_idx + 1] == 1: - # Next chunk is padding is_end_of_text.append(True) else: is_end_of_text.append(False) - return is_end_of_text + + +# --------------------------------------------------------------------------- +# EasyMagpieInferenceRunner (decoder-only EasyMagpieTTSModel) +# --------------------------------------------------------------------------- + + +class EasyMagpieInferenceRunner(BaseInferenceRunner): + """Runner for decoder-only EasyMagpieTTSModel. + + Uses MagpieTTSDataset and model.infer_batch() which returns audio directly. + """ + + def __init__(self, model, config: EasyMagpieInferenceConfig): + super().__init__(model, config) + self.model.legacy_context_stacking = config.legacy_context_stacking + + def create_dataset( + self, + dataset_meta: dict, + context_duration_min: Optional[float] = None, + context_duration_max: Optional[float] = None, + ) -> MagpieTTSDataset: + context_duration_min, context_duration_max = self._get_context_durations( + context_duration_min, context_duration_max + ) + self._read_and_cache_manifest(dataset_meta) + + logging.info("Creating inference dataset for decoder-only model") + dataset = MagpieTTSDataset( + dataset_meta=dataset_meta, + sample_rate=self.model.sample_rate, + min_duration=0.5, + max_duration=20, + codec_model_samples_per_frame=self.model.codec_model_samples_per_frame, + bos_id=getattr(self.model, "bos_id", None), + eos_id=self.model.eos_id, + num_audio_codebooks=self.model.num_audio_codebooks, + prior_scaling_factor=None, + load_cached_codes_if_available=False, + dataset_type='test', + tokenizer_config=None, + load_16khz_audio=False, + use_text_conditioning_tokenizer=True, + 
text_conditioning_tokenizer_name=self.model.text_conditioning_tokenizer_name, + pad_context_text_to_max_duration=False, + context_duration_min=context_duration_min, + context_duration_max=context_duration_max, + ) + dataset.text_tokenizer = self.model.tokenizer + + if hasattr(self.model, 'phoneme_tokenizer'): + dataset.phoneme_tokenizer = self.model.phoneme_tokenizer + + return dataset + + def run_inference_on_dataset( + self, + dataset: MagpieTTSDataset, + output_dir: str, + manifest_records: Optional[List[dict]] = None, + audio_base_dir: Optional[str] = None, + save_cross_attention_maps: bool = True, + save_context_audio: bool = True, + save_predicted_codes: bool = True, + ) -> Tuple[List[dict], List[str], List[str]]: + manifest_records, audio_base_dir = self._resolve_manifest_and_audio_dir(manifest_records, audio_base_dir) + logging.info("Using decoder-only inference path") + return self._run_decoder_only_inference( + dataset, output_dir, manifest_records, audio_base_dir, save_context_audio, save_predicted_codes + ) + + # -- private ------------------------------------------------------------- + + def _run_decoder_only_inference( + self, + dataset: MagpieTTSDataset, + output_dir: str, + manifest_records: List[dict], + audio_base_dir: str, + save_context_audio: bool = True, + save_predicted_codes: bool = True, + ) -> Tuple[List[dict], List[str], List[str]]: + os.makedirs(output_dir, exist_ok=True) + self._delete_old_generated_files(output_dir) + + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=self.config.batch_size, + collate_fn=dataset.collate_fn, + num_workers=0, + shuffle=False, + ) + + all_rtf_metrics = [] + generated_audio_paths = [] + codec_file_paths = [] + item_idx = 0 + phoneme_sampling_method = ( + "argmax" if self.config.phoneme_sampling_method == "greedy" else self.config.phoneme_sampling_method + ) + + for batch_idx, batch in enumerate(dataloader): + logging.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}") + batch = 
self._batch_to_cuda(batch) + output = self.model.infer_batch( + batch, + max_decoder_steps=self.config.model_inference_parameters.max_decoder_steps, + temperature=self.config.model_inference_parameters.temperature, + topk=self.config.model_inference_parameters.topk, + use_cfg=self.config.use_cfg, + cfg_scale=self.config.model_inference_parameters.cfg_scale, + use_local_transformer_for_inference=self.config.use_local_transformer, + phoneme_input_type=self.config.phoneme_input_type, + phoneme_sampling_method=phoneme_sampling_method, + force_dropout_text=self.config.dropout_text_input, + ) + predicted_audio = output.predicted_audio + predicted_audio_lens = output.predicted_audio_lens + predicted_codes = output.predicted_codes + predicted_codes_lens = output.predicted_codes_lens + rtf_metrics = output.rtf_metrics + + all_rtf_metrics.append(rtf_metrics) + logging.info(f"Output shape: {predicted_audio.size()}") + + for idx in range(predicted_audio.size(0)): + audio_len = predicted_audio_lens[idx].item() + audio_np = predicted_audio[idx].float().detach().cpu().numpy()[:audio_len] + audio_path = os.path.join(output_dir, f"predicted_audio_{item_idx}.wav") + sample_rate = getattr(self.model, "output_sample_rate", self.model.sample_rate) + sf.write(audio_path, audio_np, sample_rate) + generated_audio_paths.append(audio_path) + + if save_context_audio and item_idx < len(manifest_records): + self._copy_reference_audio( + manifest_records[item_idx], + audio_base_dir, + output_dir, + item_idx, + ) + + if save_predicted_codes: + code_len = predicted_codes_lens[idx].item() + codes_path = os.path.join(output_dir, f"predicted_codes_{item_idx}.pt") + torch.save(predicted_codes[idx, :, :code_len].detach().cpu(), codes_path) + codec_file_paths.append(codes_path) + + item_idx += 1 + + return all_rtf_metrics, generated_audio_paths, codec_file_paths diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index 
580a6e32ebc7..ca89356494fa 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -23,7 +23,7 @@ import os from dataclasses import dataclass -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple import torch from omegaconf import DictConfig, OmegaConf, open_dict @@ -253,9 +253,7 @@ def update_checkpoint_state_dict(state_dict: dict) -> dict: return new_state_dict -def load_magpie_model( - config: ModelLoadConfig, device: str = "cuda", is_decoder_only_model: bool = False -) -> Tuple[Union[MagpieTTSModel, EasyMagpieTTSModel], str]: +def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[MagpieTTSModel, str]: """Load a MagpieTTS model from checkpoint or NeMo archive. Supports two loading modes: @@ -273,7 +271,7 @@ def load_magpie_model( ValueError: If configuration is invalid or sample rates don't match. """ config.validate() - model_cls = EasyMagpieTTSModel if is_decoder_only_model else MagpieTTSModel + if config.hparams_file is not None and config.checkpoint_file is not None: # Mode 1: Load from hparams + checkpoint model_cfg = OmegaConf.load(config.hparams_file) @@ -292,7 +290,7 @@ def load_magpie_model( config.legacy_text_conditioning, ) - model = model_cls(cfg=model_cfg) + model = MagpieTTSModel(cfg=model_cfg) model.use_kv_cache_for_inference = True # Load weights @@ -304,15 +302,15 @@ def load_magpie_model( checkpoint_name = os.path.basename(config.checkpoint_file).replace(".ckpt", "") else: - if config.nemo_file.startswith("nvidia/"): - model = model_cls.from_pretrained(config.nemo_file) + if config.nemo_file.startswith("nvidia/"): # TODO @xueyang: why ignore `update_config_for_inference`? 
+ model = MagpieTTSModel.from_pretrained(config.nemo_file) model.use_kv_cache_for_inference = True checkpoint_name = config.nemo_file.split("/")[-1] cfg_sample_rate = None else: # Mode 2: Load from .nemo archive logging.info(f"Loading model from NeMo archive: {config.nemo_file}") - model_cfg = model_cls.restore_from(config.nemo_file, return_config=True) + model_cfg = MagpieTTSModel.restore_from(config.nemo_file, return_config=True) with open_dict(model_cfg): model_cfg, cfg_sample_rate = update_config_for_inference( @@ -322,7 +320,7 @@ def load_magpie_model( config.legacy_text_conditioning, ) - model = model_cls.restore_from(config.nemo_file, override_config_path=model_cfg) + model = MagpieTTSModel.restore_from(config.nemo_file, override_config_path=model_cfg) model.use_kv_cache_for_inference = True checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") @@ -338,6 +336,69 @@ def load_magpie_model( return model, checkpoint_name +def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[EasyMagpieTTSModel, str]: + """Load an EasyMagpieTTSModel (decoder-only) from checkpoint or NeMo archive. + + Supports two loading modes: + 1. Checkpoint mode: hparams.yaml + .ckpt file + 2. NeMo mode: .nemo archive file + + Args: + config: Model loading configuration. + device: Device to load the model onto ("cuda" or "cpu"). + + Returns: + Tuple of (loaded model, checkpoint name for output labeling). + + Raises: + ValueError: If configuration is invalid. 
+ """ + config.validate() + + if config.hparams_file is not None and config.checkpoint_file is not None: + model_cfg = OmegaConf.load(config.hparams_file) + + if "cfg" in model_cfg: + model_cfg = model_cfg.cfg + if config.hparams_from_wandb: + model_cfg = model_cfg.value + + with open_dict(model_cfg): + model_cfg.codecmodel_path = config.codecmodel_path + model_cfg.train_ds = None + model_cfg.validation_ds = None + + model = EasyMagpieTTSModel(cfg=model_cfg) + + logging.info(f"Loading weights from checkpoint: {config.checkpoint_file}") + ckpt = torch.load(config.checkpoint_file) + state_dict = ckpt['state_dict'] + model.load_state_dict(state_dict) + + checkpoint_name = os.path.basename(config.checkpoint_file).replace(".ckpt", "") + else: + if config.nemo_file.startswith("nvidia/"): + model = EasyMagpieTTSModel.from_pretrained(config.nemo_file) + checkpoint_name = config.nemo_file.split("/")[-1] + else: + logging.info(f"Loading model from NeMo archive: {config.nemo_file}") + model_cfg = EasyMagpieTTSModel.restore_from(config.nemo_file, return_config=True) + + with open_dict(model_cfg): + model_cfg.codecmodel_path = config.codecmodel_path + model_cfg.train_ds = None + model_cfg.validation_ds = None + + model = EasyMagpieTTSModel.restore_from(config.nemo_file, override_config_path=model_cfg) + checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") + + model.to(device) + model.eval() + logging.info("EasyMagpieTTS model loaded and ready for inference.") + + return model, checkpoint_name + + def _log_transformer_component(name: str, cfg: DictConfig, use_moe: bool = False) -> dict: """Log architecture info for a single transformer component and return its FLOPs metrics. 
@@ -414,23 +475,22 @@ def _log_transformer_component(name: str, cfg: DictConfig, use_moe: bool = False return flops_info -def log_model_architecture_summary(model: MagpieTTSModel) -> Tuple[str, Dict[str, dict]]: +def log_model_architecture_summary(model) -> Tuple[str, Dict[str, dict]]: """Log model architecture summary including MoE configuration. Detects and logs MoE configuration for each transformer component, - computing FLOPs metrics and parameter counts. + computing FLOPs metrics and parameter counts. Gracefully handles + decoder-only models (EasyMagpieTTSModel) that use HuggingFace/Nemotron + decoders without the d_model/d_ffn config structure. Args: - model: Loaded MagpieTTS model. + model: Loaded MagpieTTS or EasyMagpieTTS model. Returns: Tuple of: - moe_info: String for checkpoint naming (e.g., "MoE_8x2_d2048_softmax_"), empty for dense models - flops_per_component: Dict mapping component name (e.g., "decoder") to its FLOPs metrics dict """ - if isinstance(model, EasyMagpieTTSModel): - return "", {} - logging.info("=" * 60) logging.info("MODEL ARCHITECTURE SUMMARY") logging.info("=" * 60) @@ -438,23 +498,28 @@ def log_model_architecture_summary(model: MagpieTTSModel) -> Tuple[str, Dict[str flops_per_component: Dict[str, dict] = {} use_moe = getattr(model.cfg, 'use_moe', False) - # Log optional encoder if present - if hasattr(model.cfg, 'encoder'): + # Log optional encoder if present (encoder-decoder models) + if hasattr(model.cfg, 'encoder') and hasattr(model.cfg.encoder, 'd_model'): flops_per_component['encoder'] = _log_transformer_component('encoder', model.cfg.encoder) # Log optional context_encoder if present - if hasattr(model.cfg, 'context_encoder'): + if hasattr(model.cfg, 'context_encoder') and hasattr(model.cfg.context_encoder, 'd_model'): flops_per_component['context_encoder'] = _log_transformer_component( 'context_encoder', model.cfg.context_encoder ) - # Decoder is required - always present in MagpieTTS. MoE only applies to decoder. 
- flops_per_component['decoder'] = _log_transformer_component('decoder', model.cfg.decoder, use_moe=use_moe) + # Decoder -- only log detailed FLOPs for encoder-decoder models whose + # decoder config exposes d_model/d_ffn. Decoder-only models (EasyMagpieTTS) + # use HuggingFace or Nemotron decoders with a different config shape. + decoder_cfg = getattr(model.cfg, 'decoder', None) + if decoder_cfg is not None and hasattr(decoder_cfg, 'd_model'): + flops_per_component['decoder'] = _log_transformer_component('decoder', decoder_cfg, use_moe=use_moe) + else: + logging.info("DECODER: detailed FLOPs logging not available for this model type") # Build MoE info string for checkpoint naming moe_info = "" - if use_moe: - decoder_cfg = model.cfg.decoder + if use_moe and decoder_cfg is not None and hasattr(decoder_cfg, 'num_experts'): moe_info = ( f"decoder-MoE_{decoder_cfg.num_experts}x{decoder_cfg.top_k_experts}" f"_d{decoder_cfg.d_ffn}_{decoder_cfg.routing_strategy}_" @@ -488,4 +553,4 @@ def get_experiment_name_from_checkpoint_path(checkpoint_path: str) -> str: Returns: The experiment name (parent directory of checkpoints folder). 
""" - return os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path))) + return os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path))) \ No newline at end of file diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 6d91ad25f976..027ca47a4e82 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -1411,7 +1411,7 @@ def maybe_init_from_pretrained_checkpoint(self, cfg: OmegaConf, map_location: st if isinstance(cfg.init_from_ptl_ckpt, str): # Restore checkpoint ckpt_path = cfg.pop('init_from_ptl_ckpt') - ckpt = torch.load(ckpt_path, map_location=map_location, weights_only=False) + ckpt = torch.load(ckpt_path, map_location=map_location) # Restore checkpoint into current model self.load_state_dict(ckpt['state_dict'], strict=False) diff --git a/tests/collections/tts/test_infer_vs_process_batch.py b/tests/collections/tts/test_infer_vs_process_batch.py deleted file mode 100644 index 0ea66e2870ef..000000000000 --- a/tests/collections/tts/test_infer_vs_process_batch.py +++ /dev/null @@ -1,491 +0,0 @@ -""" -Test script to verify that infer_batch (teacher-forced) produces the same audio code -and phoneme predictions as process_batch (single forward pass). - -Usage: - python tests/collections/tts/test_infer_vs_process_batch.py --codecmodel_path /path/to/codec.nemo - -The script: -1. Builds a tiny NemotronH-backed EasyMagpieTTSModel with a real codec model. -2. Creates synthetic random inputs (with variable lengths per batch item). -3. Runs process_batch (full-sequence forward) and infer_batch (streaming, teacher-forced). -4. Compares the argmax audio code predictions and phoneme predictions from both paths. -5. Repeats for multiple configurations. 
-""" - -import argparse -import sys -import torch -from omegaconf import OmegaConf - -from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel - - -def build_minimal_config(codecmodel_path: str) -> OmegaConf: - """Build a minimal OmegaConf config for a tiny NemotronH model.""" - hidden_size = 256 - - cfg_dict = { - # Decoder backend - 'decoder_type': 'nemotron_h', - 'nemotron_h_config': { - 'hidden_size': hidden_size, - 'num_hidden_layers': 2, - 'vocab_size': 131072, - 'num_attention_heads': 4, - 'num_key_value_heads': 2, - 'attention_dropout': 0.0, - 'attention_bias': False, - 'max_position_embeddings': 4096, - 'mamba_num_heads': 16, - 'mamba_head_dim': 16, - 'ssm_state_size': 128, - 'conv_kernel': 4, - 'n_groups': 8, - 'chunk_size': 256, - 'mamba_hidden_act': 'silu', - 'use_conv_bias': True, - 'use_bias': False, - 'intermediate_size': 512, - 'mlp_hidden_act': 'silu', - 'mlp_bias': False, - 'hybrid_override_pattern': 'M*', # All Mamba layers - 'layer_norm_epsilon': 1e-5, - 'residual_in_fp32': True, - }, - 'embedding_dim': hidden_size, - 'hidden_dim': hidden_size, - 'audio_embedding_dim': hidden_size, - 'codecmodel_path': codecmodel_path, - # Text tokenizer - use a simple AutoTokenizer - 'text_tokenizers': { - 'test_tokenizer': { - '_target_': 'AutoTokenizer', - 'pretrained_model': 'gpt2', - }, - }, - # Phoneme tokenizer - 'phoneme_tokenizer': { - '_target_': 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer', - 'tokenizer_path': 'scripts/tts_dataset_files/bpe_ipa_tokenizer_2048_en_de_es_fr_hi_it_vi_zh.json', - }, - 'phoneme_stacking_factor': 1, - # Training modes (single streaming mode) - 'training_modes': [ - { - 'text_input_mode': 'streaming', - 'streaming_phonemes_delay': 4, - 'streaming_speech_delay': 8, - }, - ], - 'frame_stacking_factor': 2, - 'cfg_unconditional_prob': 0.0, - 'dropout_text_input_prob': 0.0, - 'local_transformer_type': 'none', - 'run_val_inference': False, - # Optim placeholder (required by 
ModelPT but not used) - 'optim': { - '_target_': 'torch.optim.AdamW', - 'lr': 1e-4, - }, - # No dataloaders - } - return OmegaConf.create(cfg_dict) - - -def create_synthetic_batch( - model, - batch_size=2, - text_lens_list=None, - audio_frames_list=None, - context_text_lens_list=None, - context_audio_frames_list=None, - phoneme_lens_list=None, - device='cpu', -): - """Create a synthetic batch with random valid token IDs and variable lengths per item. - - If *_list args are None, defaults to uniform lengths for all items. - """ - num_codebooks = model.num_audio_codebooks - codebook_size = model.codebook_size - text_vocab_size = model.bos_id # valid text tokens are [0, bos_id) - phoneme_vocab_size = model.phoneme_tokenizer.vocab_size - 2 # exclude BOS/EOS - - # Defaults - if text_lens_list is None: - text_lens_list = [20] * batch_size - if audio_frames_list is None: - audio_frames_list = [30] * batch_size - if context_text_lens_list is None: - context_text_lens_list = [10] * batch_size - if context_audio_frames_list is None: - context_audio_frames_list = [15] * batch_size - if phoneme_lens_list is None: - phoneme_lens_list = [25] * batch_size - - assert len(text_lens_list) == batch_size - assert len(audio_frames_list) == batch_size - assert len(context_text_lens_list) == batch_size - assert len(context_audio_frames_list) == batch_size - assert len(phoneme_lens_list) == batch_size - - # Max lengths for padding - max_text_len = max(text_lens_list) - max_audio_frames = max(audio_frames_list) - max_context_text_len = max(context_text_lens_list) - max_context_audio_frames = max(context_audio_frames_list) - max_phoneme_len = max(phoneme_lens_list) - - # Text tokens: random tokens + EOS at the end (matching dataset behavior) - text = torch.zeros(batch_size, max_text_len, dtype=torch.long, device=device) - for b in range(batch_size): - tl = text_lens_list[b] - text[b, : tl - 1] = torch.randint(0, text_vocab_size, (tl - 1,), device=device) - text[b, tl - 1] = model.eos_id # 
EOS as last valid token - text_lens = torch.tensor(text_lens_list, dtype=torch.long, device=device) - - # Context text tokens - context_text_tokens = torch.zeros(batch_size, max_context_text_len, dtype=torch.long, device=device) - for b in range(batch_size): - cl = context_text_lens_list[b] - context_text_tokens[b, :cl] = torch.randint(0, text_vocab_size, (cl,), device=device) - context_text_tokens_lens = torch.tensor(context_text_lens_list, dtype=torch.long, device=device) - - # Audio codes (raw, without BOS/EOS) - audio_codes = torch.zeros(batch_size, num_codebooks, max_audio_frames, dtype=torch.long, device=device) - for b in range(batch_size): - af = audio_frames_list[b] - audio_codes[b, :, :af] = torch.randint(0, codebook_size, (num_codebooks, af), device=device) - audio_codes_lens = torch.tensor(audio_frames_list, dtype=torch.long, device=device) - - # Context audio codes (raw, without BOS/EOS) - context_audio_codes = torch.zeros( - batch_size, num_codebooks, max_context_audio_frames, dtype=torch.long, device=device - ) - for b in range(batch_size): - caf = context_audio_frames_list[b] - context_audio_codes[b, :, :caf] = torch.randint(0, codebook_size, (num_codebooks, caf), device=device) - context_audio_codes_lens = torch.tensor(context_audio_frames_list, dtype=torch.long, device=device) - - # Phoneme tokens (raw IDs, BOS/EOS will be added by the model) - phoneme_tokens = torch.zeros(batch_size, max_phoneme_len, dtype=torch.long, device=device) - for b in range(batch_size): - pl = phoneme_lens_list[b] - phoneme_tokens[b, :pl] = torch.randint(0, phoneme_vocab_size, (pl,), device=device) - phoneme_tokens_lens = torch.tensor(phoneme_lens_list, dtype=torch.long, device=device) - - batch = { - 'text': text, - 'text_lens': text_lens, - 'context_text_tokens': context_text_tokens, - 'context_text_tokens_lens': context_text_tokens_lens, - 'audio_codes': audio_codes, - 'audio_codes_lens': audio_codes_lens, - 'context_audio_codes': context_audio_codes, - 
'context_audio_codes_lens': context_audio_codes_lens, - 'phoneme_tokens': phoneme_tokens, - 'phoneme_tokens_lens': phoneme_tokens_lens, - } - return batch - - -def compare_audio_codes(model, pb_output, ib_output, batch): - """Compare audio codes from process_batch and infer_batch. Returns True if all match.""" - C = model.num_audio_codebooks - S = model.frame_stacking_factor - C_stacked = C * S - V = model.num_all_tokens_per_codebook - pb_logits = pb_output.logits # (B, T_stacked, C_stacked * V) - T_stacked = pb_logits.size(1) - batch_size = batch['text'].size(0) - - # Extract per-codebook argmax at stacked resolution - pb_stacked_codes_list = [] - for cb_idx in range(C_stacked): - si = cb_idx * V - ei = si + V - cb_logits = pb_logits[:, :, si:ei] # (B, T_stacked, V) - cb_preds = cb_logits.argmax(dim=-1) # (B, T_stacked) - pb_stacked_codes_list.append(cb_preds) - pb_stacked_codes = torch.stack(pb_stacked_codes_list, dim=1) # (B, C_stacked, T_stacked) - - # Unstack: (B, C*S, T_stacked) -> (B, C, S, T_stacked) -> (B, C, T_stacked, S) -> (B, C, T_stacked*S) - pb_unstacked = pb_stacked_codes.view(batch_size, C, S, T_stacked) - pb_unstacked = pb_unstacked.permute(0, 1, 3, 2).contiguous() - pb_unstacked = pb_unstacked.reshape(batch_size, C, T_stacked * S) - pb_unstacked_lens = pb_output.audio_codes_lens_target * S - - ib_codes = ib_output.predicted_codes - ib_codes_lens = ib_output.predicted_codes_lens - - print(f" process_batch argmax codes (unstacked): {pb_unstacked.shape}, lens: {pb_unstacked_lens.tolist()}") - print(f" infer_batch predicted codes: {ib_codes.shape}, lens: {ib_codes_lens.tolist()}") - - all_match = True - for b in range(batch_size): - pb_len = pb_unstacked_lens[b].item() - ib_len = ib_codes_lens[b].item() - compare_len = min(pb_len, ib_len) - - if compare_len == 0: - print(f" Batch item {b}: No codes to compare (pb_len={pb_len}, ib_len={ib_len})") - continue - - pb_codes_b = pb_unstacked[b, :, :compare_len] - ib_codes_b = ib_codes[b, :, :compare_len] - 
- matches = (pb_codes_b == ib_codes_b).all() - num_matching = (pb_codes_b == ib_codes_b).sum().item() - total = pb_codes_b.numel() - match_pct = 100.0 * num_matching / total if total > 0 else 0.0 - - print(f" Batch item {b}: pb_len={pb_len}, ib_len={ib_len}, compare_len={compare_len}") - print(f" Audio match: {matches.item()}, {num_matching}/{total} ({match_pct:.1f}%)") - - if not matches: - all_match = False - mismatch_mask = pb_codes_b != ib_codes_b - mismatch_positions = mismatch_mask.nonzero(as_tuple=False) - num_show = min(10, mismatch_positions.size(0)) - for i in range(num_show): - cb, t = mismatch_positions[i].tolist() - print( - f" Mismatch at codebook={cb}, time={t}: " - f"pb={pb_codes_b[cb, t].item()}, ib={ib_codes_b[cb, t].item()}" - ) - - return all_match - - -def compare_phoneme_predictions(model, pb_output, ib_output, batch): - """Compare phoneme predictions from process_batch and infer_batch. Returns True if all match.""" - if pb_output.phoneme_logits is None: - print(" No phoneme logits from process_batch (no phoneme tokenizer?). Skipping.") - return True - if ib_output.predicted_phoneme_tokens is None: - print(" No phoneme predictions from infer_batch. 
Skipping.") - return True - - batch_size = batch['text'].size(0) - phoneme_stacking_factor = model.phoneme_stacking_factor - phoneme_vocab_size = model.phoneme_vocab_size - - # Extract argmax phoneme predictions from process_batch logits - # phoneme_logits: (B, T_phoneme, phoneme_stacking_factor * phoneme_vocab_size) - pb_phoneme_logits = pb_output.phoneme_logits - T_phoneme = pb_phoneme_logits.size(1) - - pb_phoneme_preds_list = [] - for sf_idx in range(phoneme_stacking_factor): - si = sf_idx * phoneme_vocab_size - ei = si + phoneme_vocab_size - sf_logits = pb_phoneme_logits[:, :, si:ei] # (B, T_phoneme, V_phoneme) - sf_preds = sf_logits.argmax(dim=-1) # (B, T_phoneme) - pb_phoneme_preds_list.append(sf_preds) - pb_phoneme_preds = torch.stack(pb_phoneme_preds_list, dim=1) # (B, phoneme_stacking_factor, T_phoneme) - pb_phoneme_lens = pb_output.phoneme_tokens_lens_target # (B,) number of phoneme prediction steps - - # infer_batch phoneme predictions: (B, phoneme_stacking_factor, T_all_steps) - ib_phoneme_preds = ib_output.predicted_phoneme_tokens - ib_phoneme_lens = ib_output.predicted_phoneme_tokens_lens - - print(f" process_batch phoneme preds: {pb_phoneme_preds.shape}, lens: {pb_phoneme_lens.tolist()}") - print(f" infer_batch phoneme preds: {ib_phoneme_preds.shape}, lens: {ib_phoneme_lens.tolist()}") - - # Get start indices for infer_batch phoneme predictions - ib_start_idx = ib_output.phoneme_prediction_start_idx # (B,) - - all_match = True - for b in range(batch_size): - pb_len = pb_phoneme_lens[b].item() - ib_len = ib_phoneme_lens[b].item() - compare_len = min(pb_len, ib_len) - - if compare_len == 0: - print(f" Batch item {b}: No phonemes to compare (pb_len={pb_len}, ib_len={ib_len})") - continue - - # process_batch phoneme preds start from 0 (already sliced to prediction region) - pb_ph_b = pb_phoneme_preds[b, :, :compare_len] - - # infer_batch phoneme preds: slice from start_idx for this batch item - start = max(0, ib_start_idx[b].item()) - ib_ph_b = 
ib_phoneme_preds[b, :, start : start + compare_len] - - matches = (pb_ph_b == ib_ph_b).all() - num_matching = (pb_ph_b == ib_ph_b).sum().item() - total = pb_ph_b.numel() - match_pct = 100.0 * num_matching / total if total > 0 else 0.0 - - print(f" Batch item {b}: pb_len={pb_len}, ib_len={ib_len}, compare_len={compare_len}") - print(f" Phoneme match: {matches.item()}, {num_matching}/{total} ({match_pct:.1f}%)") - - if not matches: - all_match = False - mismatch_mask = pb_ph_b != ib_ph_b - mismatch_positions = mismatch_mask.nonzero(as_tuple=False) - num_show = min(10, mismatch_positions.size(0)) - for i in range(num_show): - sf, t = mismatch_positions[i].tolist() - print( - f" Mismatch at stacking_factor={sf}, time={t}: " - f"pb={pb_ph_b[sf, t].item()}, ib={ib_ph_b[sf, t].item()}" - ) - - return all_match - - -def run_single_test(model, batch, test_name, device): - """Run a single test comparing process_batch and infer_batch outputs.""" - print(f"\n{'='*60}") - print(f"TEST: {test_name}") - print(f"{'='*60}") - - for k, v in batch.items(): - if isinstance(v, torch.Tensor): - print(f" {k}: shape={v.shape}, dtype={v.dtype}") - - # Run process_batch - print("\n Running process_batch...") - training_mode = model.training_modes[0] - with torch.inference_mode(): - pb_output = model.process_batch( - text=batch['text'], - text_lens=batch['text_lens'], - context_text_tokens=batch['context_text_tokens'], - context_text_tokens_lens=batch['context_text_tokens_lens'], - audio_codes=batch['audio_codes'], - audio_codes_lens=batch['audio_codes_lens'], - context_audio_codes=batch['context_audio_codes'], - context_audio_codes_lens=batch['context_audio_codes_lens'], - phoneme_tokens=batch['phoneme_tokens'], - phoneme_tokens_lens=batch['phoneme_tokens_lens'], - mode='val', - training_mode=training_mode, - ) - - # Run infer_batch (teacher-forced) - print(" Running infer_batch (teacher-forced)...") - ib_output = model.infer_batch( - batch=batch, - max_decoder_steps=1000, - 
temperature=0.0, - topk=80, - use_cfg=False, - use_local_transformer_for_inference=False, - phoneme_input_type='gt', - phoneme_sampling_method='argmax', - use_teacher_forced=True, - ) - - # Compare audio codes - print("\n --- Audio Codes Comparison ---") - audio_match = compare_audio_codes(model, pb_output, ib_output, batch) - - # Compare phoneme predictions - print("\n --- Phoneme Predictions Comparison ---") - phoneme_match = compare_phoneme_predictions(model, pb_output, ib_output, batch) - - success = audio_match and phoneme_match - if success: - print(f"\n ✓ {test_name}: PASSED (audio + phoneme match)") - else: - parts = [] - if not audio_match: - parts.append("audio") - if not phoneme_match: - parts.append("phoneme") - print(f"\n ✗ {test_name}: FAILED ({' and '.join(parts)} mismatch)") - - return success - - -def main(): - parser = argparse.ArgumentParser(description='Test infer_batch vs process_batch') - parser.add_argument('--codecmodel_path', type=str, required=True, help='Path to codec model .nemo file') - parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') - args = parser.parse_args() - - device = args.device - print(f"Using device: {device}") - - # 1. 
Build config and model - print("Building minimal config...") - cfg = build_minimal_config(args.codecmodel_path) - - print("Instantiating EasyMagpieTTSModel (tiny NemotronH + real codec)...") - model = EasyMagpieTTSModel(cfg=cfg, trainer=None) - model = model.to(device) - model.eval() - print(f" num_audio_codebooks={model.num_audio_codebooks}, codebook_size={model.codebook_size}") - print(f" frame_stacking_factor={model.frame_stacking_factor}") - print(f" phoneme_vocab_size={model.phoneme_tokenizer.vocab_size}") - - # Define test configurations: (test_name, kwargs_for_create_synthetic_batch) - test_configs = [ - ( - "Uniform lengths (B=2, text=20, audio=30, ctx_text=10, ctx_audio=15, phoneme=25)", - dict( - batch_size=2, - text_lens_list=[20, 20], - audio_frames_list=[30, 30], - context_text_lens_list=[10, 10], - context_audio_frames_list=[15, 15], - phoneme_lens_list=[25, 25], - ), - ), - ( - "Variable text & context lens (B=2, text=[15,25], ctx_text=[8,12], ctx_audio=[10,20])", - dict( - batch_size=2, - text_lens_list=[15, 25], - audio_frames_list=[30, 30], - context_text_lens_list=[8, 12], - context_audio_frames_list=[10, 20], - phoneme_lens_list=[20, 30], - ), - ), - ( - "Variable audio & phoneme lens (B=2, audio=[20,40], phoneme=[15,35])", - dict( - batch_size=2, - text_lens_list=[20, 20], - audio_frames_list=[20, 40], - context_text_lens_list=[10, 10], - context_audio_frames_list=[15, 15], - phoneme_lens_list=[15, 35], - ), - ), - ( - "All different (B=3)", - dict( - batch_size=3, - text_lens_list=[12, 20, 28], - audio_frames_list=[20, 30, 40], - context_text_lens_list=[6, 10, 14], - context_audio_frames_list=[8, 15, 22], - phoneme_lens_list=[15, 25, 35], - ), - ), - ] - - all_passed = True - for test_name, kwargs in test_configs: - batch = create_synthetic_batch(model, device=device, **kwargs) - passed = run_single_test(model, batch, test_name, device) - if not passed: - all_passed = False - - # Final summary - print(f"\n{'='*60}") - if all_passed: - print("✓ 
ALL TESTS PASSED") - else: - print("✗ SOME TESTS FAILED") - sys.exit(1) - print(f"{'='*60}") - - -if __name__ == '__main__': - main() diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh index 368b5c83bba5..b6d87e91a254 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh @@ -14,7 +14,7 @@ # Tests a 4x-stacked model with local transformer inference. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh index a591497f22e0..4e917733f59a 100755 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --nemo_files "/home/TestData/tts/2602_MoE/moe16_sinkhorn_top1_valLoss5.0469_step2625132_epoch524.nemo" \ --codecmodel_path "/home/TestData/tts/21fps_causal_codecmodel.nemo" \ --datasets_json_path "examples/tts/evalset_config.json" \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh index 5ed8d48f5aff..8eb30eb40c36 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh index 3a9415bbc2b3..eed95fc5a64e 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh index ec8b6b885212..c21454d39cb1 100755 --- a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --nemo_files "/home/TestData/tts/2602_MoE/moe16_sinkhorn_top1_valLoss5.0469_step2625132_epoch524.nemo" \ --codecmodel_path "/home/TestData/tts/21fps_causal_codecmodel.nemo" \ --datasets_json_path "examples/tts/evalset_config.json" \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh index a0694c16b9ba..96e20304197a 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci_longform_tiny \ From 5eaf1a4b93aae1e1326f3cbc4349f3b6b024c75c Mon Sep 17 00:00:00 2001 From: shehzeen Date: Wed, 11 Mar 2026 17:31:53 +0000 Subject: [PATCH 88/94] Apply isort and black reformatting Signed-off-by: shehzeen --- .../tts/modules/magpietts_inference/inference.py | 10 +++------- .../tts/modules/magpietts_inference/utils.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index d5d34537e088..c343a9d31f9a 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -125,9 +125,7 @@ def build_identifier(self) -> str: class EasyMagpieInferenceConfig(BaseInferenceConfig): """Configuration for decoder-only EasyMagpieTTSModel inference.""" - model_inference_parameters: EasyModelInferenceParameters = field( - default_factory=EasyModelInferenceParameters - ) + model_inference_parameters: EasyModelInferenceParameters = field(default_factory=EasyModelInferenceParameters) phoneme_input_type: str = "gt" phoneme_sampling_method: str = "argmax" dropout_text_input: bool = False @@ -176,8 +174,7 @@ def create_dataset( dataset_meta: dict, context_duration_min: Optional[float] = None, context_duration_max: Optional[float] = None, - ) -> Union[ChunkedTTSInferenceDataset, MagpieTTSDataset]: - ... + ) -> Union[ChunkedTTSInferenceDataset, MagpieTTSDataset]: ... 
@abc.abstractmethod def run_inference_on_dataset( @@ -189,8 +186,7 @@ def run_inference_on_dataset( save_cross_attention_maps: bool = True, save_context_audio: bool = True, save_predicted_codes: bool = True, - ) -> Tuple[List[dict], List[str], List[str]]: - ... + ) -> Tuple[List[dict], List[str], List[str]]: ... # -- shared helpers ------------------------------------------------------ diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index ca89356494fa..a14cd0789f7a 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -553,4 +553,4 @@ def get_experiment_name_from_checkpoint_path(checkpoint_path: str) -> str: Returns: The experiment name (parent directory of checkpoints folder). """ - return os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path))) \ No newline at end of file + return os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path))) From 6d96df464ab0eb445844668a7f6aa77b0449346e Mon Sep 17 00:00:00 2001 From: Shehzeen Hussain Date: Wed, 11 Mar 2026 21:24:08 -0700 Subject: [PATCH 89/94] Paarthneekhara/magpietts decoderonly 2601 (#70) * clean up code, rename back to magpietts_inference.py Signed-off-by: Shehzeen Hussain * bug fixes, inference runs now Signed-off-by: Shehzeen Hussain --------- Signed-off-by: Shehzeen Hussain --- docs/source/tts/magpietts-longform.rst | 6 +- docs/source/tts/magpietts.rst | 2 +- .../{tts_infer.py => magpietts_inference.py} | 60 ++++++++----------- nemo/collections/tts/models/easy_magpietts.py | 25 +------- .../tts/models/easy_magpietts_inference.py | 44 ++++++++++---- .../modules/magpietts_inference/inference.py | 14 ++--- .../tts/modules/magpietts_inference/utils.py | 34 ++++++++--- ...S_InferEvaluate_Magpietts_FrameStacking.sh | 2 +- ...TS_InferEvaluate_Magpietts_MoE_ZeroShot.sh | 2 +- ...TS_InferEvaluate_Magpietts_SeenSpeakers.sh | 2 +- 
...L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh | 2 +- ...Evaluatelongform_Magpietts_MoE_ZeroShot.sh | 2 +- ...nferEvaluatelongform_Magpietts_ZeroShot.sh | 2 +- 13 files changed, 102 insertions(+), 95 deletions(-) rename examples/tts/{tts_infer.py => magpietts_inference.py} (94%) diff --git a/docs/source/tts/magpietts-longform.rst b/docs/source/tts/magpietts-longform.rst index fb3eeb659d33..33aef42a5abe 100644 --- a/docs/source/tts/magpietts-longform.rst +++ b/docs/source/tts/magpietts-longform.rst @@ -169,7 +169,7 @@ The ``do_tts`` method automatically detects whether longform inference is needed sf.write("output.wav", long_audio[0].cpu().numpy(), 22050) -Method 2: Using CLI (``tts_infer.py``) +Method 2: Using CLI (``magpietts_inference.py``) ------------------------------------------------ For batch inference from manifests: @@ -177,7 +177,7 @@ For batch inference from manifests: .. code-block:: bash # Auto-detect longform based on text length (default) - python examples/tts/tts_infer.py \ + python examples/tts/magpietts_inference.py \ --nemo_files /path/to/magpietts.nemo \ --datasets_json_path /path/to/evalset_config.json \ --out_dir /path/to/output \ @@ -185,7 +185,7 @@ For batch inference from manifests: --longform_mode auto # Force longform inference for all inputs - python examples/tts/tts_infer.py \ + python examples/tts/magpietts_inference.py \ --nemo_files /path/to/magpietts.nemo \ --datasets_json_path /path/to/evalset_config.json \ --out_dir /path/to/output \ diff --git a/docs/source/tts/magpietts.rst b/docs/source/tts/magpietts.rst index 6d297a694596..b79c11ea88ff 100644 --- a/docs/source/tts/magpietts.rst +++ b/docs/source/tts/magpietts.rst @@ -130,7 +130,7 @@ Several parameters control the generation behavior. The temperature setting affe .. 
code-block:: bash - python examples/tts/tts_infer.py \ + python examples/tts/magpietts_inference.py \ --nemo_files /path/to/magpietts_model.nemo \ --codecmodel_path /path/to/audio_codec.nemo \ --datasets your_evaluation_set \ diff --git a/examples/tts/tts_infer.py b/examples/tts/magpietts_inference.py similarity index 94% rename from examples/tts/tts_infer.py rename to examples/tts/magpietts_inference.py index 2c3bec0aa7f7..fca92fccddc4 100644 --- a/examples/tts/tts_infer.py +++ b/examples/tts/magpietts_inference.py @@ -26,7 +26,7 @@ Example usage: # MagpieTTS inference (encoder-decoder, default) - python examples/tts/tts_infer.py \\ + python examples/tts/magpietts_inference.py \\ --model_type magpie \\ --nemo_files /path/to/model.nemo \\ --datasets_json_path /path/to/evalset_config.json \\ @@ -34,7 +34,7 @@ --codecmodel_path /path/to/codec.nemo # EasyMagpieTTS inference (decoder-only) - python examples/tts/tts_infer.py \\ + python examples/tts/magpietts_inference.py \\ --model_type easy_magpie \\ --nemo_files /path/to/model.nemo \\ --datasets_json_path /path/to/evalset_config.json \\ @@ -42,7 +42,7 @@ --codecmodel_path /path/to/codec.nemo # With evaluation - python examples/tts/tts_infer.py \\ + python examples/tts/magpietts_inference.py \\ --model_type magpie \\ --hparams_files /path/to/hparams.yaml \\ --checkpoint_files /path/to/model.ckpt \\ @@ -66,7 +66,7 @@ import numpy as np from nemo.collections.asr.parts.utils.manifest_utils import read_manifest -from nemo.collections.tts.models.easy_magpietts import EasyModelInferenceParameters +from nemo.collections.tts.models.easy_magpietts_inference import EasyModelInferenceParameters from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.modules.magpietts_inference.evaluate_generated_audio import load_evalset_config from nemo.collections.tts.modules.magpietts_inference.evaluation import ( @@ -161,11 +161,6 @@ def filter_datasets(dataset_meta_info: dict, datasets: 
Optional[List[str]]) -> L return datasets -# --------------------------------------------------------------------------- -# Core inference + evaluation orchestration (model-type agnostic) -# --------------------------------------------------------------------------- - - def run_inference_and_evaluation( runner: BaseInferenceRunner, checkpoint_name: str, @@ -355,15 +350,18 @@ def run_inference_and_evaluation( return None, None -# --------------------------------------------------------------------------- -# CLI argument parser -# --------------------------------------------------------------------------- +def _get_shared_inference_param_names() -> set: + """Return the field names shared by ModelInferenceParameters and EasyModelInferenceParameters.""" + magpie_fields = {f.name for f in fields(ModelInferenceParameters)} + easy_fields = {f.name for f in fields(EasyModelInferenceParameters)} + return magpie_fields & easy_fields def _add_inference_param_fields( group: argparse._ArgumentGroup, param_cls: type, skip_fields: Optional[set] = None, + only_fields: Optional[set] = None, ) -> None: """Auto-generate argparse arguments from fields of a dataclass. @@ -371,12 +369,15 @@ def _add_inference_param_fields( group: The argparse argument group to add arguments to. param_cls: The dataclass whose fields to add. skip_fields: Field names to skip (already added by another group). + only_fields: If provided, only add fields whose names are in this set. 
""" if skip_fields is None: skip_fields = set() for f in fields(param_cls): if f.name in skip_fields: continue + if only_fields is not None and f.name not in only_fields: + continue extra_args: dict = {"type": f.type} if f.type == bool: extra_args = {"action": "store_true"} @@ -399,7 +400,7 @@ def _add_common_args(parser: argparse.ArgumentParser) -> None: default='magpie', choices=['magpie', 'easy_magpie'], help='Model type: "magpie" for encoder-decoder MagpieTTSModel, ' - '"easy_magpie" for decoder-only EasyMagpieTTSModel', + '"easy_magpie" for decoder-only EasyMagpieTTSInferenceModel', ) # Model loading @@ -469,8 +470,9 @@ def _add_common_args(parser: argparse.ArgumentParser) -> None: infer_group.add_argument('--use_cfg', action='store_true', help='Enable classifier-free guidance') infer_group.add_argument('--use_local_transformer', action='store_true') - # Shared model inference parameters (max_decoder_steps, temperature, topk, cfg_scale) - _add_inference_param_fields(infer_group, EasyModelInferenceParameters) + # Model inference parameters shared by both MagpieTTS and EasyMagpieTTS + shared_param_names = _get_shared_inference_param_names() + _add_inference_param_fields(infer_group, ModelInferenceParameters, only_fields=shared_param_names) # Evaluation eval_group = parser.add_argument_group('Evaluation') @@ -499,9 +501,8 @@ def _add_magpie_args(parser: argparse.ArgumentParser) -> None: group = parser.add_argument_group('MagpieTTS-specific Parameters') # MagpieTTS-specific model inference parameters (attention prior, EOS, etc.) - # Skip fields already added by the common inference group. 
- shared_field_names = {f.name for f in fields(EasyModelInferenceParameters)} - _add_inference_param_fields(group, ModelInferenceParameters, skip_fields=shared_field_names) + shared_param_names = _get_shared_inference_param_names() + _add_inference_param_fields(group, ModelInferenceParameters, skip_fields=shared_param_names) group.add_argument('--maskgit_n_steps', type=int, default=3) group.add_argument('--maskgit_noise_scale', type=float, default=0.0) @@ -514,7 +515,7 @@ def _add_magpie_args(parser: argparse.ArgumentParser) -> None: def _add_easy_magpie_args(parser: argparse.ArgumentParser) -> None: - """Add arguments specific to decoder-only EasyMagpieTTSModel.""" + """Add arguments specific to decoder-only EasyMagpieTTSInferenceModel.""" group = parser.add_argument_group('EasyMagpieTTS-specific Parameters') group.add_argument( '--phoneme_input_type', @@ -532,9 +533,10 @@ def _add_easy_magpie_args(parser: argparse.ArgumentParser) -> None: ) group.add_argument('--dropout_text_input', action='store_true', help='Force dropout on text input') group.add_argument( - '--legacy_context_stacking', - action='store_true', - help='Use audio_bos_id/audio_eos_id for context stacking', + '--phoneme_tokenizer_path', + type=str, + default=None, + help='Override path to the phoneme tokenizer file (overrides the path stored in the checkpoint config)', ) @@ -551,11 +553,6 @@ def create_argument_parser() -> argparse.ArgumentParser: return parser -# --------------------------------------------------------------------------- -# Config builders (one per model type) -# --------------------------------------------------------------------------- - - def _build_inference_params_from_args(param_cls: type, args): """Extract inference parameters from parsed CLI args for the given dataclass.""" params = {} @@ -592,15 +589,8 @@ def _build_easy_magpie_config(args) -> EasyMagpieInferenceConfig: phoneme_input_type=args.phoneme_input_type, phoneme_sampling_method=args.phoneme_sampling_method, 
dropout_text_input=args.dropout_text_input, - legacy_context_stacking=args.legacy_context_stacking, ) - -# --------------------------------------------------------------------------- -# Entry point -# --------------------------------------------------------------------------- - - def main(argv=None): """Entry point for TTS inference and evaluation.""" parser = create_argument_parser() @@ -656,6 +646,7 @@ def main(argv=None): legacy_codebooks=args.legacy_codebooks, legacy_text_conditioning=args.legacy_text_conditioning, hparams_from_wandb=args.hparams_file_from_wandb, + phoneme_tokenizer_path=getattr(args, 'phoneme_tokenizer_path', None), ) model, checkpoint_name = load_fn(model_config) @@ -693,6 +684,7 @@ def main(argv=None): codecmodel_path=args.codecmodel_path, legacy_codebooks=args.legacy_codebooks, legacy_text_conditioning=args.legacy_text_conditioning, + phoneme_tokenizer_path=getattr(args, 'phoneme_tokenizer_path', None), ) model, checkpoint_name = load_fn(model_config) diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 19705eed1ad3..5a117432b986 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -14,7 +14,7 @@ import json import os import random -from dataclasses import dataclass, fields +from dataclasses import dataclass from typing import Dict, List, Optional, Tuple import numpy as np @@ -98,29 +98,6 @@ class ProcessBatchOutput: selected_training_mode: Optional[str] -@dataclass -class EasyModelInferenceParameters: - """Inference parameters for the decoder-only EasyMagpieTTS model. - - Attributes: - max_decoder_steps: Maximum number of decoder steps. - temperature: Sampling temperature. - topk: Number of top-probability tokens to consider in sampling. - cfg_scale: Scale factor for classifier-free guidance. 
- """ - - max_decoder_steps: int = 500 - temperature: float = 0.7 - topk: int = 80 - cfg_scale: float = 2.5 - - @classmethod - def from_dict(cls, data: dict) -> 'EasyModelInferenceParameters': - field_names = {field.name for field in fields(cls)} - filtered_data = {k: v for k, v in data.items() if k in field_names} - return cls(**filtered_data) - - class EasyMagpieTTSModel(EasyMagpieTTSInferenceModel): """ Magpie-TTS Model Decoder Only Model with training support. diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index 765c234e2683..59db7decda0e 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import time -from dataclasses import dataclass +from dataclasses import dataclass, fields from functools import partial from typing import Any, Dict, List, Optional, Sequence, Tuple @@ -184,6 +184,29 @@ class InferBatchOutput: phoneme_prediction_start_idx: Optional[torch.Tensor] = None # (B,) start index into predicted_phoneme_tokens +@dataclass +class EasyModelInferenceParameters: + """Inference parameters for the decoder-only EasyMagpieTTS model. + + Attributes: + max_decoder_steps: Maximum number of decoder steps. + temperature: Sampling temperature. + topk: Number of top-probability tokens to consider in sampling. + cfg_scale: Scale factor for classifier-free guidance. 
+ """ + + max_decoder_steps: int = 300 + temperature: float = 0.7 + topk: int = 80 + cfg_scale: float = 2.5 + + @classmethod + def from_dict(cls, data: dict) -> 'EasyModelInferenceParameters': + field_names = {field.name for field in fields(cls)} + filtered_data = {k: v for k, v in data.items() if k in field_names} + return cls(**filtered_data) + + class EasyMagpieTTSInferenceModel(BaseMagpieTTSModel): """ Inference-only base class for EasyMagpieTTS decoder-only model. @@ -319,6 +342,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self.pad_context_text_to_max_duration = False self.add_language_to_context_text = cfg.get('add_language_to_context_text', False) + self.ignore_phoneme_languages = cfg.get('ignore_phoneme_languages', []) super().__init__(cfg=cfg, trainer=trainer) @@ -465,6 +489,12 @@ def _get_state_dict_keys_to_exclude(self): '_codec_model', ] + def setup_training_data(self, train_data_config=None): + pass + + def setup_validation_data(self, val_data_config=None): + pass + def codes_to_audio(self, codes, codes_len): # codes: (B, C, T') self._codec_model.eval() @@ -734,19 +764,11 @@ def prepare_context_tensors( eos_id=self.context_audio_eos_id, ) - # Use legacy audio_bos_id/audio_eos_id if flag is set - stack_bos_id = ( - self.audio_bos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_bos_id - ) - stack_eos_id = ( - self.audio_eos_id if getattr(self, 'legacy_context_stacking', False) else self.context_audio_eos_id - ) - context_audio_codes, context_audio_codes_lens = self.stack_codes( context_audio_codes, context_audio_codes_lens, - stack_bos_id, - stack_eos_id, + self.context_audio_bos_id, + self.context_audio_eos_id, self.frame_stacking_factor, self.num_audio_codebooks, ) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index c343a9d31f9a..ab501075c98d 100644 --- 
a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -21,7 +21,7 @@ MagpieInferenceRunner handles the encoder-decoder MagpieTTSModel (chunked text, generate_speech + codes_to_audio). -EasyMagpieInferenceRunner handles the decoder-only EasyMagpieTTSModel +EasyMagpieInferenceRunner handles the decoder-only EasyMagpieTTSInferenceModel (infer_batch, returns audio directly). """ from __future__ import annotations @@ -40,7 +40,7 @@ from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPATokenizer from nemo.collections.tts.data.text_to_speech_dataset import ChunkedTTSInferenceDataset, MagpieTTSDataset -from nemo.collections.tts.models.easy_magpietts import EasyModelInferenceParameters +from nemo.collections.tts.models.easy_magpietts_inference import EasyModelInferenceParameters from nemo.collections.tts.models.magpietts import ModelInferenceParameters from nemo.collections.tts.parts.utils.tts_dataset_utils import stack_tensors from nemo.utils import logging @@ -123,13 +123,12 @@ def build_identifier(self) -> str: @dataclass class EasyMagpieInferenceConfig(BaseInferenceConfig): - """Configuration for decoder-only EasyMagpieTTSModel inference.""" + """Configuration for decoder-only EasyMagpieTTSInferenceModel inference.""" model_inference_parameters: EasyModelInferenceParameters = field(default_factory=EasyModelInferenceParameters) phoneme_input_type: str = "gt" phoneme_sampling_method: str = "argmax" dropout_text_input: bool = False - legacy_context_stacking: bool = False def build_identifier(self) -> str: parts = [ @@ -538,19 +537,18 @@ def _compute_end_of_text_flags( # --------------------------------------------------------------------------- -# EasyMagpieInferenceRunner (decoder-only EasyMagpieTTSModel) +# EasyMagpieInferenceRunner (decoder-only 
EasyMagpieTTSInferenceModel) # --------------------------------------------------------------------------- class EasyMagpieInferenceRunner(BaseInferenceRunner): - """Runner for decoder-only EasyMagpieTTSModel. + """Runner for decoder-only EasyMagpieTTSInferenceModel. Uses MagpieTTSDataset and model.infer_batch() which returns audio directly. """ def __init__(self, model, config: EasyMagpieInferenceConfig): super().__init__(model, config) - self.model.legacy_context_stacking = config.legacy_context_stacking def create_dataset( self, @@ -583,6 +581,8 @@ def create_dataset( pad_context_text_to_max_duration=False, context_duration_min=context_duration_min, context_duration_max=context_duration_max, + ignore_phoneme_languages=self.config.get('ignore_phoneme_languages', []), + add_language_to_context_text=self.model.add_language_to_context_text ) dataset.text_tokenizer = self.model.tokenizer diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index a14cd0789f7a..9c67125f4343 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -28,7 +28,7 @@ import torch from omegaconf import DictConfig, OmegaConf, open_dict -from nemo.collections.tts.models import EasyMagpieTTSModel, MagpieTTSModel +from nemo.collections.tts.models import EasyMagpieTTSInferenceModel, MagpieTTSModel from nemo.utils import logging @@ -119,6 +119,7 @@ class ModelLoadConfig: legacy_codebooks: Use legacy codebook indices for old checkpoints. legacy_text_conditioning: Use legacy text conditioning for old checkpoints. hparams_from_wandb: Whether hparams file is from wandb export. + phoneme_tokenizer_path: Override path to the phoneme tokenizer file (EasyMagpieTTS only). 
""" hparams_file: Optional[str] = None @@ -128,6 +129,7 @@ class ModelLoadConfig: legacy_codebooks: bool = False legacy_text_conditioning: bool = False hparams_from_wandb: bool = False + phoneme_tokenizer_path: Optional[str] = None def validate(self) -> None: """Validate that the configuration is complete and consistent.""" @@ -336,8 +338,13 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma return model, checkpoint_name -def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[EasyMagpieTTSModel, str]: - """Load an EasyMagpieTTSModel (decoder-only) from checkpoint or NeMo archive. +def load_easy_magpie_model( + config: ModelLoadConfig, device: str = "cuda" +) -> Tuple[EasyMagpieTTSInferenceModel, str]: + """Load an EasyMagpieTTSInferenceModel (decoder-only) from checkpoint or NeMo archive. + + Uses the inference-only base class rather than the full training model, + which avoids pulling in training-specific dependencies. Supports two loading modes: 1. 
Checkpoint mode: hparams.yaml + .ckpt file @@ -367,8 +374,10 @@ def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tup model_cfg.codecmodel_path = config.codecmodel_path model_cfg.train_ds = None model_cfg.validation_ds = None + if config.phoneme_tokenizer_path and hasattr(model_cfg, 'phoneme_tokenizer'): + model_cfg.phoneme_tokenizer.tokenizer_path = config.phoneme_tokenizer_path - model = EasyMagpieTTSModel(cfg=model_cfg) + model = EasyMagpieTTSInferenceModel(cfg=model_cfg) logging.info(f"Loading weights from checkpoint: {config.checkpoint_file}") ckpt = torch.load(config.checkpoint_file) @@ -378,22 +387,29 @@ def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tup checkpoint_name = os.path.basename(config.checkpoint_file).replace(".ckpt", "") else: if config.nemo_file.startswith("nvidia/"): - model = EasyMagpieTTSModel.from_pretrained(config.nemo_file) + model = EasyMagpieTTSInferenceModel.from_pretrained(config.nemo_file) checkpoint_name = config.nemo_file.split("/")[-1] else: logging.info(f"Loading model from NeMo archive: {config.nemo_file}") - model_cfg = EasyMagpieTTSModel.restore_from(config.nemo_file, return_config=True) + model_cfg = EasyMagpieTTSInferenceModel.restore_from(config.nemo_file, return_config=True) with open_dict(model_cfg): model_cfg.codecmodel_path = config.codecmodel_path model_cfg.train_ds = None model_cfg.validation_ds = None + if config.phoneme_tokenizer_path and hasattr(model_cfg, 'phoneme_tokenizer'): + model_cfg.phoneme_tokenizer.tokenizer_path = config.phoneme_tokenizer_path + # Override target so restore_from instantiates the inference class, + # not the training subclass stored in the .nemo config. 
+ model_cfg.target = ( + 'nemo.collections.tts.models.easy_magpietts_inference.EasyMagpieTTSInferenceModel' + ) - model = EasyMagpieTTSModel.restore_from(config.nemo_file, override_config_path=model_cfg) + model = EasyMagpieTTSInferenceModel.restore_from(config.nemo_file, override_config_path=model_cfg) checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") model.to(device) - model.eval() + model.eval().float() logging.info("EasyMagpieTTS model loaded and ready for inference.") return model, checkpoint_name @@ -480,7 +496,7 @@ def log_model_architecture_summary(model) -> Tuple[str, Dict[str, dict]]: Detects and logs MoE configuration for each transformer component, computing FLOPs metrics and parameter counts. Gracefully handles - decoder-only models (EasyMagpieTTSModel) that use HuggingFace/Nemotron + decoder-only models (EasyMagpieTTSInferenceModel) that use HuggingFace/Nemotron decoders without the d_model/d_ffn config structure. Args: diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh index b6d87e91a254..368b5c83bba5 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_FrameStacking.sh @@ -14,7 +14,7 @@ # Tests a 4x-stacked model with local transformer inference. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh index 4e917733f59a..a591497f22e0 100755 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_MoE_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --nemo_files "/home/TestData/tts/2602_MoE/moe16_sinkhorn_top1_valLoss5.0469_step2625132_epoch524.nemo" \ --codecmodel_path "/home/TestData/tts/21fps_causal_codecmodel.nemo" \ --datasets_json_path "examples/tts/evalset_config.json" \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh index 8eb30eb40c36..5ed8d48f5aff 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_SeenSpeakers.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh index eed95fc5a64e..3a9415bbc2b3 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluate_Magpietts_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh index c21454d39cb1..ec8b6b885212 100755 --- a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --nemo_files "/home/TestData/tts/2602_MoE/moe16_sinkhorn_top1_valLoss5.0469_step2625132_epoch524.nemo" \ --codecmodel_path "/home/TestData/tts/21fps_causal_codecmodel.nemo" \ --datasets_json_path "examples/tts/evalset_config.json" \ diff --git a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh index 96e20304197a..a0694c16b9ba 100644 --- a/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh +++ b/tests/functional_tests/L2_TTS_InferEvaluatelongform_Magpietts_ZeroShot.sh @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/tts_infer.py \ +TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts_inference.py \ --codecmodel_path /home/TestData/tts/21fps_causal_codecmodel.nemo \ --datasets_json_path examples/tts/evalset_config.json \ --datasets an4_val_ci_longform_tiny \ From cb6926733fb8b05ad008368bab3b533c9c48f28e Mon Sep 17 00:00:00 2001 From: shehzeen Date: Thu, 12 Mar 2026 04:24:49 +0000 Subject: [PATCH 90/94] Apply isort and black reformatting Signed-off-by: shehzeen --- examples/tts/magpietts_inference.py | 1 + .../tts/modules/magpietts_inference/inference.py | 2 +- nemo/collections/tts/modules/magpietts_inference/utils.py | 8 ++------ 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py index fca92fccddc4..b8a91d3ea307 100644 --- a/examples/tts/magpietts_inference.py +++ b/examples/tts/magpietts_inference.py @@ -591,6 +591,7 @@ def _build_easy_magpie_config(args) -> EasyMagpieInferenceConfig: dropout_text_input=args.dropout_text_input, ) + def main(argv=None): """Entry point for TTS inference and evaluation.""" parser = create_argument_parser() diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index ab501075c98d..e936f81439be 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -582,7 +582,7 @@ def create_dataset( context_duration_min=context_duration_min, context_duration_max=context_duration_max, ignore_phoneme_languages=self.config.get('ignore_phoneme_languages', []), - add_language_to_context_text=self.model.add_language_to_context_text + add_language_to_context_text=self.model.add_language_to_context_text, ) 
dataset.text_tokenizer = self.model.tokenizer diff --git a/nemo/collections/tts/modules/magpietts_inference/utils.py b/nemo/collections/tts/modules/magpietts_inference/utils.py index 9c67125f4343..47b2553e99eb 100644 --- a/nemo/collections/tts/modules/magpietts_inference/utils.py +++ b/nemo/collections/tts/modules/magpietts_inference/utils.py @@ -338,9 +338,7 @@ def load_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[Ma return model, checkpoint_name -def load_easy_magpie_model( - config: ModelLoadConfig, device: str = "cuda" -) -> Tuple[EasyMagpieTTSInferenceModel, str]: +def load_easy_magpie_model(config: ModelLoadConfig, device: str = "cuda") -> Tuple[EasyMagpieTTSInferenceModel, str]: """Load an EasyMagpieTTSInferenceModel (decoder-only) from checkpoint or NeMo archive. Uses the inference-only base class rather than the full training model, @@ -401,9 +399,7 @@ def load_easy_magpie_model( model_cfg.phoneme_tokenizer.tokenizer_path = config.phoneme_tokenizer_path # Override target so restore_from instantiates the inference class, # not the training subclass stored in the .nemo config. 
- model_cfg.target = ( - 'nemo.collections.tts.models.easy_magpietts_inference.EasyMagpieTTSInferenceModel' - ) + model_cfg.target = 'nemo.collections.tts.models.easy_magpietts_inference.EasyMagpieTTSInferenceModel' model = EasyMagpieTTSInferenceModel.restore_from(config.nemo_file, override_config_path=model_cfg) checkpoint_name = os.path.basename(config.nemo_file).replace(".nemo", "") From db9763dcdcf6aea2baa333015653d070c1f89513 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 12 Mar 2026 16:35:03 -0400 Subject: [PATCH 91/94] refactoring to remove magpie base class Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/__init__.py | 2 - nemo/collections/tts/models/base_magpietts.py | 527 ----------------- nemo/collections/tts/models/easy_magpietts.py | 55 +- .../tts/models/easy_magpietts_inference.py | 117 +++- .../easy_magpietts_preference_optimization.py | 7 +- nemo/collections/tts/models/magpietts.py | 128 +++-- .../magpietts_preference_optimization.py | 3 +- .../modules/magpietts_inference/inference.py | 6 +- .../tts/modules/magpietts_modules.py | 538 ++++++++++++++++++ 9 files changed, 774 insertions(+), 609 deletions(-) delete mode 100644 nemo/collections/tts/models/base_magpietts.py diff --git a/nemo/collections/tts/models/__init__.py b/nemo/collections/tts/models/__init__.py index 28d49bca1c81..576077bdcddf 100644 --- a/nemo/collections/tts/models/__init__.py +++ b/nemo/collections/tts/models/__init__.py @@ -14,7 +14,6 @@ from nemo.collections.tts.models.aligner import AlignerModel from nemo.collections.tts.models.audio_codec import AudioCodecModel -from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel from nemo.collections.tts.models.easy_magpietts import EasyMagpieTTSModel from nemo.collections.tts.models.easy_magpietts_inference import EasyMagpieTTSInferenceModel from nemo.collections.tts.models.easy_magpietts_preference_optimization import EasyMagpieTTSModelOnlinePO @@ -32,7 +31,6 @@ __all__ = [ "AlignerModel", 
"AudioCodecModel", - "BaseMagpieTTSModel", "FastPitchModel", "FastPitchModel_SSL", "SSLDisentangler", diff --git a/nemo/collections/tts/models/base_magpietts.py b/nemo/collections/tts/models/base_magpietts.py deleted file mode 100644 index f031ebf98fab..000000000000 --- a/nemo/collections/tts/models/base_magpietts.py +++ /dev/null @@ -1,527 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, List, Optional - -import numpy as np -import torch -from hydra.utils import instantiate -from torch.utils.data import get_worker_info - -from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers -from nemo.collections.tts.modules.magpietts_modules import SpecialAudioToken, cosine_schedule -from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths -from nemo.core.classes import ModelPT -from nemo.utils import logging - - -def worker_init_fn(worker_id): - """Per-worker init for DataLoader workers. - - Sets up tokenizers for the dataset (text and optionally phoneme) - when using multiprocessing. 
- """ - logging.info(f"Worker {worker_id} initializing...") - worker_info = get_worker_info() - dataset = worker_info.dataset - tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type) - dataset.text_tokenizer = tokenizer - if hasattr(dataset, 'phoneme_tokenizer_config'): - dataset.phoneme_tokenizer = instantiate(dataset.phoneme_tokenizer_config) - - -class BaseMagpieTTSModel(ModelPT): - """Base class for MagpieTTS models. - - Contains shared functionality for audio codec helpers, special token - manipulation, local transformer functions, and state dict handling. - Subclasses (EasyMagpieTTSModel, MagpieTTSModel) provide their own - ``__init__``, data loading, training/inference logic, etc. - """ - - def _get_state_dict_keys_to_exclude(self) -> List[str]: - """Return list of key substrings to exclude from checkpoint save/load. - - Subclasses should override to specify model-specific exclusions - (e.g. codec model, eval models). - """ - return ['_codec_model'] - - def state_dict(self, destination=None, prefix='', keep_vars=False): - if hasattr(self, '_no_state_dict') and self._no_state_dict: - return {} - state_dict = super().state_dict(destination, prefix, keep_vars) - keys_substrings_to_exclude = self._get_state_dict_keys_to_exclude() - for key in list(state_dict.keys()): - if any(substring in key for substring in keys_substrings_to_exclude): - del state_dict[key] - return state_dict - - def load_state_dict(self, state_dict, strict=True): - if not strict: - super().load_state_dict(state_dict, strict=False) - modules_to_skip = self._get_state_dict_keys_to_exclude() - for name, child in self.named_children(): - if name in modules_to_skip: - continue - if any(param.numel() > 0 for param in child.parameters()): - new_state_dict = {} - for key in state_dict.keys(): - name_with_dot = f"{name}." 
- if key.startswith(name_with_dot): - new_state_dict[key[len(name_with_dot) :]] = state_dict[key] - child.load_state_dict(new_state_dict) - - def setup_optimizer_param_groups(self): - """Exclude frozen eval/inference-only models from the optimizer.""" - modules_to_exclude = set(self._get_state_dict_keys_to_exclude()) - - excluded_param_ids = set() - for name, module in self.named_children(): - if name in modules_to_exclude: - for param in module.parameters(): - excluded_param_ids.add(id(param)) - - trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] - - logging.info( - f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " - f"{len(excluded_param_ids)} params excluded (eval models)" - ) - - self._optimizer_param_groups = [{"params": trainable_params}] - - def add_eos_token(self, codes, codes_len, eos_id, num_eos_tokens=1): - # codes: (B, C, T') - codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0) - codes_len = codes_len + num_eos_tokens - for idx in range(codes.size(0)): - codes[idx, :, codes_len[idx] - 1] = eos_id - return codes, codes_len - - def add_special_tokens(self, codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1): - # codes: (B, C, T') - codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id) - codes_len = codes_len + num_bos_tokens - codes, codes_len = self.add_eos_token( - codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens - ) - return codes, codes_len - - def remove_bos_token(self, codes, codes_len, num_tokens=1): - codes = codes[:, :, num_tokens:] - codes_len = codes_len - num_tokens - return codes, codes_len - - def remove_embedded_bos_token(self, embedded, embedded_len): - embedded = embedded[:, 1:, :] - embedded_len = embedded_len - 1 - return embedded, embedded_len - - def remove_eos_token(self, codes, codes_len): - codes_len = codes_len - 1 - codes = codes[:, :, :-1] - mask = 
get_mask_from_lengths(lengths=codes_len) - codes = codes * mask.unsqueeze(1) - return codes, codes_len - - def remove_embedded_eos_token(self, embedded, embedded_len): - # embedded: (B, T', D) - embedded_len = embedded_len - 1 - embedded = embedded[:, :-1, :] - mask = get_mask_from_lengths(lengths=embedded_len) - embedded = embedded * mask.unsqueeze(2) - return embedded, embedded_len - - def remove_special_tokens(self, codes, codes_len, num_bos_tokens=1): - codes, codes_len = self.remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens) - codes, codes_len = self.remove_eos_token(codes=codes, codes_len=codes_len) - return codes, codes_len - - def audio_to_codes(self, audio, audio_len, sample_rate=None): - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32): - codes, codes_len = self._codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate) - return codes, codes_len - - def codes_to_audio(self, codes, codes_len): - # codes: (B, C, T') - self._codec_model.eval() - with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - if self._codec_converter is not None: - codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) - audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) - return audio, audio_len, codes - - def pad_audio_codes(self, audio_codes: torch.Tensor): - """Pads the time dimension of the audio codes to a multiple of the frame stacking factor. 
- - Args: - audio_codes: (B, C, T) - Returns: - (B, C, T_padded) - """ - T = audio_codes.size(2) - T_padded = int(np.ceil(T / self.frame_stacking_factor) * self.frame_stacking_factor) - num_pad = T_padded - T - audio_codes = torch.nn.functional.pad(input=audio_codes, pad=(0, num_pad)) - return audio_codes - - def clear_forbidden_logits(self, logits: torch.Tensor, forbid_audio_eos: bool = False) -> torch.Tensor: - """Sets logits of forbidden tokens to ``-inf`` so they will never be sampled. - - Specifically, we forbid sampling of all special tokens except AUDIO_EOS - which is allowed by default. - - Args: - logits: (B, C, num_audio_tokens_per_codebook) - forbid_audio_eos: If True, also forbid AUDIO_EOS tokens from being sampled. - """ - logits[ - :, - :, - SpecialAudioToken.get_forbidden_tokens(self.codebook_size, forbid_audio_eos=forbid_audio_eos), - ] = float('-inf') - return logits - - def maskgit_create_random_mask(self, codes): - """Creates a mask where True indicates positions that should be replaced with MASK_TOKEN.""" - B, C, T = codes.shape - rand_values = torch.rand(B, T, device=codes.device) - frac_masked = cosine_schedule(rand_values) - n_masked = torch.ceil(frac_masked * C).long() - random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1) - mask_indices = torch.arange(C, device=codes.device).view(1, C, 1) - mask = mask_indices < n_masked.view(B, 1, T) - mask = torch.gather(mask, 1, random_permutations) - return mask - - def maskgit_apply_random_mask(self, codes): - """Randomly replaces some codes with MASK_TOKEN following the cosine schedule.""" - mask = self.maskgit_create_random_mask(codes) - codes_with_mask = torch.where(mask, self.mask_token_id, codes) - return codes_with_mask, mask - - def compute_local_transformer_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False): - """Predicts the logits for all codebooks using the local transformer. 
- - Used in both autoregressive (AR) and MaskGit (MG) modes during - training and validation (not inference/sampling). - - The sequence layout is slightly different between AR and MG modes, as shown below - (using an 8-codebook setup as an example):: - - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | AR target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | none | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | MG target | none | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | Input | Magpie | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - | | Latent | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - | Seq. Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | - +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ - - Args: - dec_out: (B, T', E) - audio_codes_target: (B, C, T') - targets_offset_by_one: if False, target for index 0 is codebook 0 (AR); - if True, target for index 1 is codebook 0 (MaskGit). 
- """ - C = self.num_audio_codebooks - dec_out_all = dec_out.reshape(-1, dec_out.size(-1)) # (B*T', E) - local_transformer_input = [dec_out_all] - audio_codes_target = self.pad_audio_codes(audio_codes_target).long() - for fs_index in range(self.frame_stacking_factor): - for codebook_num in range(C): - codes = audio_codes_target[:, codebook_num, fs_index :: self.frame_stacking_factor] - codes = codes.reshape(-1) - codebook_embedding = self.audio_embeddings[codebook_num + fs_index * C](codes) - codebook_embedding = self.audio_in_projection(codebook_embedding) - local_transformer_input.append(codebook_embedding) - - local_transformer_input = torch.stack(local_transformer_input, dim=1) - local_transformer_input = self.local_transformer_in_projection(local_transformer_input) - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] - if not targets_offset_by_one: - local_transformer_output = local_transformer_output[:, :-1, :] - else: - local_transformer_output = local_transformer_output[:, 1:, :] - - local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output) - - all_code_logits = [] - for fs_index in range(self.frame_stacking_factor): - for codebook_num in range(audio_codes_target.size(1)): - codebook_logits = self.local_transformer_out_projections[codebook_num + fs_index * C]( - local_transformer_output[:, codebook_num + fs_index * C, :] - ) - all_code_logits.append(codebook_logits) - all_code_logits = torch.cat(all_code_logits, dim=1) - - all_code_logits = all_code_logits.view( - audio_codes_target.size(0), audio_codes_target.size(2) // self.frame_stacking_factor, -1 - ) - - return all_code_logits - - def local_transformer_sample_autoregressive( - self, - dec_output: torch.Tensor, - temperature: float = 0.7, - topk: int = 80, - unfinished_items: Dict[int, bool] = 
{}, - finished_items: Dict[int, bool] = {}, - use_cfg: bool = False, - cfg_scale: float = 1.0, - use_kv_cache: bool = True, - forbid_audio_eos: bool = False, - sanitize_logits: bool = False, - ) -> torch.Tensor: - """Sample audio codes autoregressively across codebooks using the local transformer. - - Uses multinomial sampling with temperature, top-k, and - classifier-free guidance (CFG). - - Args: - dec_output: Decoder output tensor (B, E). - temperature: Sampling temperature. When <= 0, uses argmax. - topk: Number of top-probability tokens to consider. - unfinished_items: Batch indices that have not completed generation (EOS forbidden). - finished_items: Batch indices that are completed (EOS forced). - use_cfg: Whether to use classifier-free guidance (doubled batch). - cfg_scale: Scale factor for CFG. - use_kv_cache: Whether to use key-value caching in the local transformer. - forbid_audio_eos: Whether to globally forbid audio EOS. - sanitize_logits: Whether to clamp/clean logits before sampling. - - Returns: - Sampled audio codes (B, num_codebooks, frame_stacking_factor). 
- """ - self.local_transformer.reset_cache(use_cache=use_kv_cache) - dec_output = dec_output.unsqueeze(1) # (B, 1, E) - local_transformer_input = self.local_transformer_in_projection(dec_output) - all_preds = [] - for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor): - _mask = torch.ones( - local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device - ) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] - - lt_out_for_proj = self.local_transformer_audio_out_projection(local_transformer_output[:, -1, :]) - codebook_logits = self.local_transformer_out_projections[codebook_num](lt_out_for_proj) - - if use_cfg: - actual_batch_size = codebook_logits.size(0) // 2 - conditional_logits = codebook_logits[:actual_batch_size] - unconditional_logits = codebook_logits[actual_batch_size:] - cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits - codebook_logits[:actual_batch_size] = cfg_logits - - if sanitize_logits: - codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0) - codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0) - - for item_idx in unfinished_items: - codebook_logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - codebook_logits[item_idx, :] = float('-inf') - codebook_logits[item_idx, self.audio_eos_id] = 0.0 - - codebook_logits = self.clear_forbidden_logits( - codebook_logits.unsqueeze(1), forbid_audio_eos=forbid_audio_eos - ).squeeze(1) - - codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] - indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1) - codebook_logits_rescored = codebook_logits.clone() - codebook_logits_rescored[indices_to_remove] = float('-inf') - - if temperature <= 0.0: - codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True) - else: - codebook_probs = 
torch.softmax(codebook_logits_rescored / temperature, dim=-1) - codebook_preds = torch.multinomial(codebook_probs, 1) - - if use_cfg: - codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size] - all_preds.append(codebook_preds) - - next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1) - next_local_transformer_input = self.audio_in_projection(next_local_transformer_input) - next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) - local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) - - all_preds = torch.cat(all_preds, dim=1) # (B, num_codebooks * frame_stacking_factor) - all_preds = all_preds.reshape(-1, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1) - if use_cfg: - all_preds = all_preds[:actual_batch_size] - - return all_preds - - def local_transformer_sample_maskgit( - self, - dec_output: torch.Tensor, - temperature: float = 0.7, - topk: int = 80, - unfinished_items: Dict[int, bool] = {}, - finished_items: Dict[int, bool] = {}, - use_cfg: bool = False, - cfg_scale: float = 1.0, - n_steps: int = 3, - noise_scale: float = 0.0, - fixed_schedule: Optional[List[int]] = None, - dynamic_cfg_scale: bool = False, - sampling_type: Optional[str] = None, - forbid_audio_eos: bool = False, - ) -> torch.Tensor: - """Sample audio codes using MaskGit-like iterative prediction with the local transformer. - - If frame-stacking is enabled, the codes for all frames in the stack - are sampled, treated as one long sequence. - - Args: - dec_output: Decoder output tensor (B, E). - temperature: Sampling temperature. - topk: Number of top-probability tokens to consider. - unfinished_items: Batch indices that have not completed generation. - finished_items: Batch indices that are completed. - use_cfg: Whether to use classifier-free guidance. - cfg_scale: Scale factor for CFG. 
- n_steps: Number of iterative refinement steps. - noise_scale: Scale factor for noise added to confidence scores. - fixed_schedule: Fixed schedule for number of tokens to unmask per step. - dynamic_cfg_scale: Whether to dynamically adjust CFG scale. - sampling_type: Sampling strategy (``"default"``, ``"causal"``, - ``"purity_causal"``, ``"purity_default"``). - forbid_audio_eos: Whether to globally forbid audio EOS. - - Returns: - Sampled audio codes (B, num_codebooks, frame_stacking_factor). - """ - device = dec_output.device - self.local_transformer.reset_cache(use_cache=False) - dec_output = dec_output.unsqueeze(1) - local_transformer_input_init = self.local_transformer_in_projection(dec_output) - codebook_seq_len = self.num_audio_codebooks * self.frame_stacking_factor - B = dec_output.size(0) - - min_confidence = 0 - max_confidence = 5 - confidences = min_confidence * torch.ones(B, codebook_seq_len, device=device) - codes = self.mask_token_id * torch.ones((B, codebook_seq_len), device=device, dtype=torch.long) - sampled_codes = codes.clone() - if fixed_schedule is not None: - n_steps = len(fixed_schedule) - for step in range(n_steps): - progress = step / n_steps - frac_masked = cosine_schedule(torch.tensor(progress)) - if sampling_type == "causal" or sampling_type == "purity_causal": - frac_masked = torch.ones_like(frac_masked) * (1.0 - progress) - if fixed_schedule is None: - n_masked = torch.ceil(codebook_seq_len * frac_masked).long() - else: - n_masked = codebook_seq_len - fixed_schedule[step] - n_unmasked = codebook_seq_len - n_masked - - if sampling_type == "causal" or sampling_type == "purity_causal": - n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1)) - confidences[:, n_frames_to_allow * self.num_audio_codebooks :] = min_confidence - 1 - - _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1) - if use_cfg: - actual_batch_size = topk_indices.size(0) // 2 - assert ( - topk_indices[actual_batch_size:] == 
topk_indices[:actual_batch_size] - ).all(), "Topk indices are not the same for conditional and unconditional codes" - - unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices) - codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - - local_transformer_input = local_transformer_input_init - for codebook_num in range(codebook_seq_len): - next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1) - next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input) - local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1) - - _mask = torch.ones(B, codebook_seq_len + 1, device=device) - local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output'] - - logits = [] - for codebook_num in range(codebook_seq_len): - codebook_logits = self.local_transformer_out_projections[codebook_num]( - local_transformer_output[:, codebook_num + 1, :] - ) - logits.append(codebook_logits) - logits = torch.stack(logits, dim=1) - - if use_cfg: - actual_batch_size = logits.size(0) // 2 - conditional_logits = logits[:actual_batch_size] - unconditional_logits = logits[actual_batch_size:] - if not dynamic_cfg_scale: - current_cfg_scale = cfg_scale - else: - progress = step / (n_steps - 1) - interp = progress - current_cfg_scale = (cfg_scale - 1) * interp + 1.0 - cfg_logits = current_cfg_scale * conditional_logits + (1.0 - current_cfg_scale) * unconditional_logits - logits[:actual_batch_size] = cfg_logits - - logits = self.clear_forbidden_logits(logits, forbid_audio_eos=forbid_audio_eos) - - for item_idx in unfinished_items: - logits[item_idx, self.audio_eos_id] = float('-inf') - for item_idx in finished_items: - logits[item_idx, :, :] = float('-inf') - logits[item_idx, :, self.audio_eos_id] = 0.0 - - logits_topk = torch.topk(logits, topk, dim=-1)[0] - indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1) - 
logits_rescored = logits.clone() - logits_rescored[indices_to_remove] = float('-inf') - probs = torch.softmax(logits_rescored / temperature, dim=-1) - sampled_codes = torch.multinomial(probs.view(B * codebook_seq_len, -1), 1).view(B, codebook_seq_len) - if use_cfg: - sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size] - probs[actual_batch_size:] = probs[:actual_batch_size] - if sampling_type != "purity_causal" and sampling_type != "purity_default": - confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1) - else: - confidences = probs.max(dim=2)[0] - sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes) - if noise_scale > 0.0: - noise = (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps) - confidences += noise - confidences[actual_batch_size:] = confidences[:actual_batch_size] - confidence_eps = 0.1 - assert ( - confidences.max() + confidence_eps < max_confidence - ), f"Predicted confidence is approaching max_confidence: {confidences.max()}" - confidences.scatter_( - index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float) - ) - codes = sampled_codes - assert not ( - codes == self.mask_token_id - ).any(), "Codes contain mask tokens after completion of MaskGit sampling" - - codes = codes.reshape(B, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1) - - if use_cfg: - codes = codes[:actual_batch_size] - return codes diff --git a/nemo/collections/tts/models/easy_magpietts.py b/nemo/collections/tts/models/easy_magpietts.py index 5a117432b986..ccc8f315a3c2 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -33,7 +33,6 @@ from nemo.collections.asr.parts.mixins.transcription import TranscribeConfig from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.tts.data.text_to_speech_dataset_lhotse import 
MagpieTTSLhotseDataset, setup_tokenizers -from nemo.collections.tts.models.base_magpietts import worker_init_fn from nemo.collections.tts.models.easy_magpietts_inference import ( EasyMagpieTTSInferenceModel, InferBatchOutput, @@ -41,7 +40,13 @@ StreamingState, TrainingMode, ) -from nemo.collections.tts.modules.magpietts_modules import LocalTransformerType +from nemo.collections.tts.modules.magpietts_modules import ( + LocalTransformerType, + add_special_tokens, + remove_eos_token, + remove_special_tokens, + worker_init_fn, +) from nemo.collections.tts.parts.utils.helpers import ( compute_utmos_scores_from_filepaths, get_mask_from_lengths, @@ -229,25 +234,34 @@ def log_val_audio_example( wandb_audio_log = {} pred_audio_codes = self.logits_to_audio_codes(logits, audio_codes_lens_target) - pred_audio_codes, _ = self.remove_eos_token( + pred_audio_codes, _ = remove_eos_token( codes=pred_audio_codes, codes_len=audio_codes_lens_target, ) - pred_audio, pred_audio_lens, _ = self.codes_to_audio(pred_audio_codes, audio_codes_lens_target - 1) - target_audio_codes, _ = self.remove_eos_token( + pred_audio_codes, pred_audio_codes_lens = self._prepare_codes_for_decode(pred_audio_codes, audio_codes_lens_target - 1) + pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio( + pred_audio_codes, pred_audio_codes_lens, + ) + target_audio_codes, _ = remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens_target, ) - target_audio, target_audio_lens, _ = self.codes_to_audio(target_audio_codes, audio_codes_lens_target - 1) + target_audio_codes, target_audio_codes_lens = self._prepare_codes_for_decode(target_audio_codes, audio_codes_lens_target - 1) + target_audio, target_audio_lens, _ = self._codec_helper.codes_to_audio( + target_audio_codes, target_audio_codes_lens, + ) context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: # > 3 ensures, it is a valid context audio tensor (and not dummy tensor 
used in text context) - context_audio_codes, context_audio_codes_lens = self.remove_special_tokens( + context_audio_codes, context_audio_codes_lens = remove_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio, context_audio_lens, _ = self.codes_to_audio(context_audio_codes, context_audio_codes_lens) + context_audio_codes, context_audio_codes_lens = self._prepare_codes_for_decode(context_audio_codes, context_audio_codes_lens) + context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( + context_audio_codes, context_audio_codes_lens, + ) for logger in self.loggers: is_wandb = isinstance(logger, WandbLogger) @@ -545,7 +559,7 @@ def prepare_audio_channel_embeddings( ).long() # Add BOS and EOS tokens - audio_codes, audio_codes_lens = self.add_special_tokens( + audio_codes, audio_codes_lens = add_special_tokens( codes=audio_codes, codes_len=audio_codes_lens, bos_id=self.audio_bos_id, @@ -859,7 +873,7 @@ def process_batch( local_transformer_logits = None if self.local_transformer_type != LocalTransformerType.NO_LT: assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" - local_transformer_logits = self.compute_local_transformer_logits( + local_transformer_logits = self._lt_helper.compute_logits( pred_embeddings, audio_codes_target, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( @@ -918,7 +932,9 @@ def training_step(self, batch, batch_idx): else: context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) if 'audio_codes' in batch: audio_codes = batch['audio_codes'] @@ -926,7 +942,7 @@ def training_step(self, batch, batch_idx): else: audio = batch['audio'] audio_lens = 
batch['audio_lens'] - audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + audio_codes, audio_codes_lens = self._codec_helper.audio_to_codes(audio, audio_lens) batch_output = self.process_batch( text=batch['text'], @@ -1013,7 +1029,9 @@ def validation_step(self, batch, batch_idx): else: context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) if 'audio_codes' in batch: audio_codes = batch['audio_codes'] @@ -1021,7 +1039,7 @@ def validation_step(self, batch, batch_idx): else: audio = batch['audio'] audio_lens = batch['audio_lens'] - audio_codes, audio_codes_lens = self.audio_to_codes(audio, audio_lens) + audio_codes, audio_codes_lens = self._codec_helper.audio_to_codes(audio, audio_lens) batch_output = self.process_batch( text=batch['text'], @@ -1095,12 +1113,15 @@ def validation_step(self, batch, batch_idx): predicted_audio_paths = [] context_audio_paths = [] - context_audio_codes_cleaned, context_audio_codes_lens_cleaned = self.remove_special_tokens( + context_audio_codes_cleaned, context_audio_codes_lens_cleaned = remove_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio_cleaned, context_audio_lens_cleaned, _ = self.codes_to_audio( - context_audio_codes_cleaned, context_audio_codes_lens_cleaned + context_audio_codes_cleaned, context_audio_codes_lens_cleaned = self._prepare_codes_for_decode( + context_audio_codes_cleaned, context_audio_codes_lens_cleaned, + ) + context_audio_cleaned, context_audio_lens_cleaned, _ = self._codec_helper.codes_to_audio( + context_audio_codes_cleaned, context_audio_codes_lens_cleaned, ) for idx in range(infer_output.predicted_audio.size(0)): diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py 
b/nemo/collections/tts/models/easy_magpietts_inference.py index 59db7decda0e..555c30308e39 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -27,15 +27,19 @@ from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers from nemo.collections.tts.models import AudioCodecModel -from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel from nemo.collections.tts.modules import transformer_2501 from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter from nemo.collections.tts.modules.magpietts_modules import ( CharAwareSubwordEncoder, + CodecHelper, + LocalTransformerHelper, LocalTransformerType, SpecialAudioToken, + add_special_tokens, + remove_special_tokens, ) from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths +from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -207,7 +211,7 @@ def from_dict(cls, data: dict) -> 'EasyModelInferenceParameters': return cls(**filtered_data) -class EasyMagpieTTSInferenceModel(BaseMagpieTTSModel): +class EasyMagpieTTSInferenceModel(ModelPT): """ Inference-only base class for EasyMagpieTTS decoder-only model. 
@@ -350,6 +354,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._codec_model = codec_model self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() self._codec_converter = codec_converter + self._codec_helper = CodecHelper(self._codec_model, self._codec_converter) # Audio embedding dimension - can be smaller than hidden_dim to reduce parameters self.audio_embedding_dim = cfg.get('audio_embedding_dim', cfg.hidden_dim) @@ -484,33 +489,84 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) - def _get_state_dict_keys_to_exclude(self): + self._lt_helper = LocalTransformerHelper( + local_transformer=self.local_transformer, + audio_embeddings=self.audio_embeddings, + audio_in_projection=self.audio_in_projection, + local_transformer_in_projection=self.local_transformer_in_projection, + local_transformer_audio_out_projection=self.local_transformer_audio_out_projection, + local_transformer_out_projections=self.local_transformer_out_projections, + num_audio_codebooks=self.num_audio_codebooks, + frame_stacking_factor=self.frame_stacking_factor, + audio_eos_id=self.audio_eos_id, + mask_token_id=self.mask_token_id, + codebook_size=self.codebook_size, + ) + + def _get_state_dict_keys_to_exclude(self) -> List[str]: return [ '_codec_model', ] + def state_dict(self, destination=None, prefix='', keep_vars=False): + if hasattr(self, '_no_state_dict') and self._no_state_dict: + return {} + state_dict = super().state_dict(destination, prefix, keep_vars) + keys_substrings_to_exclude = self._get_state_dict_keys_to_exclude() + for key in list(state_dict.keys()): + if any(substring in key for substring in keys_substrings_to_exclude): + del state_dict[key] + return state_dict + + def load_state_dict(self, state_dict, strict=True): + if not strict: + super().load_state_dict(state_dict, strict=False) + modules_to_skip = 
self._get_state_dict_keys_to_exclude() + for name, child in self.named_children(): + if name in modules_to_skip: + continue + if any(param.numel() > 0 for param in child.parameters()): + new_state_dict = {} + for key in state_dict.keys(): + name_with_dot = f"{name}." + if key.startswith(name_with_dot): + new_state_dict[key[len(name_with_dot) :]] = state_dict[key] + child.load_state_dict(new_state_dict) + + def setup_optimizer_param_groups(self): + """Exclude frozen eval/inference-only models from the optimizer.""" + modules_to_exclude = set(self._get_state_dict_keys_to_exclude()) + + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude: + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + trainable_params = [p for p in self.parameters() if id(p) not in excluded_param_ids] + + logging.info( + f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " + f"{len(excluded_param_ids)} params excluded (eval models)" + ) + + self._optimizer_param_groups = [{"params": trainable_params}] + def setup_training_data(self, train_data_config=None): pass def setup_validation_data(self, val_data_config=None): pass - def codes_to_audio(self, codes, codes_len): - # codes: (B, C, T') - self._codec_model.eval() + def _prepare_codes_for_decode(self, codes, codes_len, min_len=4): + """Unstack frame-stacked codes and pad short sequences before decoding.""" if self.frame_stacking_factor > 1 and codes.size(1) == self.num_audio_codebooks * self.frame_stacking_factor: codes, codes_len = self.unstack_codes(codes, codes_len, self.frame_stacking_factor) - - with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32): - if self._codec_converter is not None: - codes = self._codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len) - if codes_len.min() < 4: - codes = torch.nn.functional.pad(input=codes, pad=(0, 4 - codes_len.min()), value=0) - codes_len 
= torch.where(codes_len < 4, torch.ones_like(codes_len) * 4, codes_len) - codes = codes[:, :, : codes_len.max()] - - audio, audio_len = self._codec_model.decode(tokens=codes, tokens_len=codes_len) - return audio, audio_len, codes + if min_len > 0 and codes_len.min() < min_len: + codes = torch.nn.functional.pad(input=codes, pad=(0, min_len - codes_len.min()), value=0) + codes_len = torch.where(codes_len < min_len, torch.ones_like(codes_len) * min_len, codes_len) + codes = codes[:, :, : codes_len.max()] + return codes, codes_len def embed_audio_tokens(self, audio_tokens): # audio_tokens: (B, C, T') @@ -750,14 +806,16 @@ def prepare_context_tensors( if context_audio_codes is None: if context_audio is None: raise ValueError("Either context_audio_codes or context_audio must be provided") - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) if self._codec_converter is not None: context_audio_codes = self._codec_converter.convert_original_to_new( audio_tokens=context_audio_codes, audio_lens=context_audio_codes_lens ).long() - context_audio_codes, context_audio_codes_lens = self.add_special_tokens( + context_audio_codes, context_audio_codes_lens = add_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens, bos_id=self.context_audio_bos_id, @@ -921,7 +979,7 @@ def _sample_audio_codes( """ if use_local_transformer_for_inference: if self.local_transformer_type == LocalTransformerType.AR: - audio_codes_next = self.local_transformer_sample_autoregressive( + audio_codes_next = self._lt_helper.sample_autoregressive( dec_output=last_hidden[:, -1, :], temperature=temperature, topk=topk, @@ -1663,7 +1721,10 @@ def streaming_finalize( # No need to remove EOS - end_indices already point to the frame before EOS # Decode to audio (codes are already unstacked: B, C, T) - audio, audio_len, 
decoded_codes = self.codes_to_audio(predicted_codes, predicted_codes_lens) + predicted_codes, predicted_codes_lens = self._prepare_codes_for_decode(predicted_codes, predicted_codes_lens) + audio, audio_len, decoded_codes = self._codec_helper.codes_to_audio( + predicted_codes, predicted_codes_lens, + ) return StreamingFinalizeOutput( audio=audio, @@ -1740,7 +1801,9 @@ def infer_batch( else: context_audio = batch['context_audio'] context_audio_lens = batch['context_audio_lens'] - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) # Optional GT phoneme tokens for teacher forcing gt_phoneme_tokens = batch.get('phoneme_tokens') @@ -1761,7 +1824,9 @@ def infer_batch( elif 'audio' in batch: gt_audio = batch['audio'] gt_audio_lens = batch['audio_lens'] - gt_audio_codes, gt_audio_codes_lens = self.audio_to_codes(gt_audio, gt_audio_lens) + gt_audio_codes, gt_audio_codes_lens = self._codec_helper.audio_to_codes( + gt_audio, gt_audio_lens + ) else: raise ValueError("Teacher forcing requires 'audio_codes' or 'audio' in batch") @@ -1771,7 +1836,7 @@ def infer_batch( audio_tokens=gt_audio_codes, audio_lens=gt_audio_codes_lens ).long() - gt_audio_codes_processed, gt_audio_codes_lens_processed = self.add_special_tokens( + gt_audio_codes_processed, gt_audio_codes_lens_processed = add_special_tokens( codes=gt_audio_codes, codes_len=gt_audio_codes_lens, bos_id=self.audio_bos_id, @@ -1977,7 +2042,9 @@ def do_tts( context_audio = context_audio.to(device) context_audio_lens = torch.tensor([context_audio.size(1)], dtype=torch.long, device=device) with torch.inference_mode(): - context_audio_codes, context_audio_codes_lens = self.audio_to_codes(context_audio, context_audio_lens) + context_audio_codes, context_audio_codes_lens = self._codec_helper.audio_to_codes( + context_audio, context_audio_lens + ) else: 
context_audio_codes = torch.zeros( 1, diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 020e7af77aa5..600ddda579bd 100644 --- a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -437,7 +437,10 @@ def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]: context_codes = self._codec_converter.convert_original_to_new( audio_tokens=context_codes, audio_lens=context_lens ).long() - context_audio, context_audio_lens, _ = self.codes_to_audio(context_codes, context_lens) + context_codes, context_lens = self._prepare_codes_for_decode(context_codes, context_lens) + context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( + context_codes, context_lens, + ) return self._save_waveforms_to_paths( waveforms=context_audio, waveform_lens=context_audio_lens, @@ -462,7 +465,7 @@ def _run_easy_process_batch( context_audio_codes = batch['context_audio_codes'] context_audio_codes_lens = batch['context_audio_codes_lens'] else: - context_audio_codes, context_audio_codes_lens = model.audio_to_codes( + context_audio_codes, context_audio_codes_lens = model._codec_helper.audio_to_codes( batch['context_audio'], batch['context_audio_lens'] ) diff --git a/nemo/collections/tts/models/magpietts.py b/nemo/collections/tts/models/magpietts.py index 28af39542f21..f710bb853986 100644 --- a/nemo/collections/tts/models/magpietts.py +++ b/nemo/collections/tts/models/magpietts.py @@ -38,15 +38,26 @@ from nemo.collections.tts.losses.aligner_loss import ForwardSumLoss from nemo.collections.tts.losses.moe_loss import MoEAuxiliaryLoss, compute_expert_usage from nemo.collections.tts.models import AudioCodecModel -from nemo.collections.tts.models.base_magpietts import BaseMagpieTTSModel, worker_init_fn from nemo.collections.tts.modules import transformer_2501 from 
nemo.collections.tts.modules.aligner import AlignmentEncoder from nemo.collections.tts.modules.audio_codec_modules import VectorQuantizerIndexConverter from nemo.collections.tts.modules.magpietts_modules import ( CharAwareSubwordEncoder, + CodecHelper, EOSDetectionMethod, + LocalTransformerHelper, LocalTransformerType, SpecialAudioToken, + add_eos_token, + add_special_tokens, + clear_forbidden_logits, + pad_audio_codes, + remove_bos_token, + remove_embedded_bos_token, + remove_embedded_eos_token, + remove_eos_token, + remove_special_tokens, + worker_init_fn, ) from nemo.collections.tts.parts.utils.helpers import ( binarize_attention_parallel, @@ -59,6 +70,7 @@ get_tokenizer_for_language, stack_tensors, ) +from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -299,7 +311,7 @@ def from_dict(cls, data: dict) -> 'ModelInferenceParameters': return cls(**filtered_data) -class MagpieTTSModel(BaseMagpieTTSModel): +class MagpieTTSModel(ModelPT): """ Magpie-TTS Model Base Class used for training a TTS model that can generate audio codes from transcript and a context audio/text @@ -470,13 +482,14 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): self._codec_model = codec_model self._codec_model.freeze() # Lightning does requires_grad = False and self.eval() self._codec_converter = codec_converter + self._codec_helper = CodecHelper(self._codec_model, self._codec_converter) audio_embeddings = [] for _ in range(self.num_audio_codebooks * self.frame_stacking_factor): audio_embeddings.append(nn.Embedding(self.num_all_tokens_per_codebook, cfg.embedding_dim)) self.audio_embeddings = nn.ModuleList(audio_embeddings) - # Identity projections required by BaseMagpieTTSModel local transformer methods. + # Identity projections required by LocalTransformerHelper methods. # MagpieTTSModel embeds directly in embedding_dim, so no projection is needed. 
self.audio_in_projection = nn.Identity() self.local_transformer_audio_out_projection = nn.Identity() @@ -537,6 +550,20 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + self._lt_helper = LocalTransformerHelper( + local_transformer=self.local_transformer, + audio_embeddings=self.audio_embeddings, + audio_in_projection=self.audio_in_projection, + local_transformer_in_projection=self.local_transformer_in_projection, + local_transformer_audio_out_projection=self.local_transformer_audio_out_projection, + local_transformer_out_projections=self.local_transformer_out_projections, + num_audio_codebooks=self.num_audio_codebooks, + frame_stacking_factor=self.frame_stacking_factor, + audio_eos_id=self.audio_eos_id, + mask_token_id=self.mask_token_id, + codebook_size=self.codebook_size, + ) + if cfg.get('use_alignment_encoder', False): self.alignment_encoder = AlignmentEncoder( n_mel_channels=cfg.embedding_dim, @@ -751,6 +778,35 @@ def _get_state_dict_keys_to_exclude(self): keys.append('context_encoder') return keys + def state_dict(self, destination=None, prefix='', keep_vars=False): + if hasattr(self, '_no_state_dict') and self._no_state_dict: + return {} + state_dict = super().state_dict(destination, prefix, keep_vars) + keys_substrings_to_exclude = self._get_state_dict_keys_to_exclude() + for key in list(state_dict.keys()): + if any(substring in key for substring in keys_substrings_to_exclude): + del state_dict[key] + return state_dict + + def setup_optimizer_param_groups(self): + """Exclude frozen eval/inference-only models from the optimizer.""" + modules_to_exclude = set(self._get_state_dict_keys_to_exclude()) + + excluded_param_ids = set() + for name, module in self.named_children(): + if name in modules_to_exclude: + for param in module.parameters(): + excluded_param_ids.add(id(param)) + + trainable_params = [p for p in self.parameters() if id(p) not in 
excluded_param_ids] + + logging.info( + f"setup_optimizer_param_groups: {len(trainable_params)} params in optimizer, " + f"{len(excluded_param_ids)} params excluded (eval models)" + ) + + self._optimizer_param_groups = [{"params": trainable_params}] + def check_frame_stacking_config_validity(self): """ Check if the configuration is compatible with frame stacking. @@ -997,7 +1053,7 @@ def load_state_dict(self, state_dict, strict=True): def embed_audio_tokens(self, audio_tokens, audio_tokens_lens): B, C, T = audio_tokens.shape - audio_tokens = self.pad_audio_codes(audio_tokens).long() + audio_tokens = pad_audio_codes(audio_tokens, self.frame_stacking_factor).long() audio_embedding = None for i in range(self.frame_stacking_factor): for c in range(C): @@ -1045,7 +1101,7 @@ def compute_loss(self, logits, audio_codes, audio_codes_lens, mask_tokens_mask=N # repeat loss mask for each codebook to simplify code below loss_mask = loss_mask.unsqueeze(1).repeat(1, audio_codes.size(1), 1) total_codebook_loss = None - audio_codes = self.pad_audio_codes(audio_codes).long() + audio_codes = pad_audio_codes(audio_codes, self.frame_stacking_factor).long() for fs_index in range(frame_stacking_factor): for codebook in range(audio_codes.size(1)): si = (codebook + self.num_audio_codebooks * fs_index) * self.num_all_tokens_per_codebook @@ -1210,8 +1266,8 @@ def sample_codes_from_logits( codebook_logits[item_idx, self.audio_eos_id] = 0.0 # Disallow generation of special tokens - codebook_logits = self.clear_forbidden_logits( - codebook_logits.unsqueeze(1), forbid_audio_eos=forbid_audio_eos + codebook_logits = clear_forbidden_logits( + codebook_logits.unsqueeze(1), self.codebook_size, forbid_audio_eos=forbid_audio_eos ).squeeze(1) codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0] # (B, topk) @@ -1301,25 +1357,29 @@ def _prepare_audio_examples( with torch.no_grad(): # Decode predictions: convert logits to codes, remove EOS token, then decode to audio pred_audio_codes = 
self.logits_to_audio_codes(logits, audio_codes_lens) - pred_audio_codes, pred_audio_codes_lens = self.remove_eos_token( + pred_audio_codes, pred_audio_codes_lens = remove_eos_token( codes=pred_audio_codes, codes_len=audio_codes_lens ) - pred_audio, pred_audio_lens, _ = self.codes_to_audio(pred_audio_codes, pred_audio_codes_lens) + pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio( + pred_audio_codes, pred_audio_codes_lens + ) # Decode targets: remove EOS token, then decode to audio - target_audio_codes, target_audio_codes_lens = self.remove_eos_token( + target_audio_codes, target_audio_codes_lens = remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens ) - target_audio, target_audio_lens, _ = self.codes_to_audio(target_audio_codes, target_audio_codes_lens) + target_audio, target_audio_lens, _ = self._codec_helper.codes_to_audio( + target_audio_codes, target_audio_codes_lens + ) # Decode context audio if available (shape check ensures it's not a dummy tensor used in text context) # This does not handle the case in which a batch has a mixture of text and audio context examples context_audio, context_audio_lens = None, None if context_audio_codes is not None and context_audio_codes.shape[2] > 3: - context_audio_codes, context_audio_codes_lens = self.remove_special_tokens( + context_audio_codes, context_audio_codes_lens = remove_special_tokens( codes=context_audio_codes, codes_len=context_audio_codes_lens ) - context_audio, context_audio_lens, _ = self.codes_to_audio( + context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( context_audio_codes, context_audio_codes_lens ) @@ -1539,14 +1599,15 @@ def _get_context_audio_codes(self, batch: Dict[str, torch.Tensor]) -> Tuple[torc codes = batch['context_audio_codes'] lens = batch['context_audio_codes_lens'] else: - codes, lens = self.audio_to_codes( - batch['context_audio'], batch['context_audio_lens'], batch.get('context_sample_rate') + codes, lens = 
self._codec_helper.audio_to_codes( + batch['context_audio'], batch['context_audio_lens'], + sample_rate=batch.get('context_sample_rate'), ) if self._codec_converter is not None: codes = self._codec_converter.convert_original_to_new(audio_tokens=codes, audio_lens=lens) - codes, lens = self.add_special_tokens( + codes, lens = add_special_tokens( codes=codes, codes_len=lens, bos_id=self.context_audio_bos_id, @@ -1950,8 +2011,9 @@ def process_batch(self, batch): disable_alignment_loss = False if 'audio_codes' not in batch: - audio_codes, audio_codes_lens = self.audio_to_codes( - batch['audio'], batch['audio_lens'], batch.get('sample_rate') + audio_codes, audio_codes_lens = self._codec_helper.audio_to_codes( + batch['audio'], batch['audio_lens'], + sample_rate=batch.get('sample_rate'), ) else: audio_codes = batch['audio_codes'] @@ -1962,7 +2024,7 @@ def process_batch(self, batch): audio_tokens=audio_codes, audio_lens=audio_codes_lens ) - audio_codes, audio_codes_lens = self.add_special_tokens( + audio_codes, audio_codes_lens = add_special_tokens( codes=audio_codes, codes_len=audio_codes_lens, bos_id=self.audio_bos_id, @@ -1976,7 +2038,7 @@ def process_batch(self, batch): # Note: if a tensor lacks the `_unstacked` suffix, it can be assumed to be in the frame-stacked domain # Remove EOS token for decoder inputs - audio_codes_embedded_input, audio_codes_lens_input = self.remove_embedded_eos_token( + audio_codes_embedded_input, audio_codes_lens_input = remove_embedded_eos_token( embedded=audio_codes_embedded_all, embedded_len=audio_codes_lens_all ) use_cfg = self.training and (self.cfg_unconditional_prob > 0.0) and (context_tensors.cond is not None) @@ -2009,7 +2071,7 @@ def process_batch(self, batch): random_embedded, random_embedded_lens = self.embed_audio_tokens( audio_tokens=random_audio_tokens, audio_tokens_lens=audio_codes_lens ) # (B T E) - random_embedded, random_embedded_lens = self.remove_embedded_eos_token( + random_embedded, random_embedded_lens = 
remove_embedded_eos_token( embedded=random_embedded, embedded_len=random_embedded_lens ) dec_dropout_mask = ( @@ -2028,7 +2090,7 @@ def process_batch(self, batch): audio_codes_mask = torch.cat([additional_decoder_mask, audio_codes_mask], dim=1) # Remove BOS token for aligner targets - audio_codes_embedded_target, audio_codes_lens_target = self.remove_embedded_bos_token( + audio_codes_embedded_target, audio_codes_lens_target = remove_embedded_bos_token( embedded=audio_codes_embedded_all, embedded_len=audio_codes_lens_all ) aligner_encoder_loss = None @@ -2083,7 +2145,7 @@ def process_batch(self, batch): logits = logits[:, dec_context_size:, :] # Remove the context audio embeddings from the logits # Remove BOS tokens from decoder targets - audio_codes_target_unstacked, audio_codes_lens_target_unstacked = self.remove_bos_token( + audio_codes_target_unstacked, audio_codes_lens_target_unstacked = remove_bos_token( codes=audio_codes, codes_len=audio_codes_lens, num_tokens=self.frame_stacking_factor ) # Codebook loss (parallel) @@ -2116,10 +2178,10 @@ def process_batch(self, batch): if self.local_transformer_type == LocalTransformerType.MASKGIT: # Maskgit # randomly replace some positions with MASK_TOKEN - audio_codes_masked, mask_tokens_mask = self.maskgit_apply_random_mask(audio_codes_target_unstacked) + audio_codes_masked, mask_tokens_mask = self._lt_helper.apply_random_mask(audio_codes_target_unstacked) # TODO @rfejgin: the very last position might be padding but the local transformer might look at it as part of # of a pair where the first position is valid. Is this an issue? 
- local_transformer_logits = self.compute_local_transformer_logits( + local_transformer_logits = self._lt_helper.compute_logits( dec_out[:, dec_context_size:, :], audio_codes_masked, targets_offset_by_one=True ) local_transformer_loss, _ = self.compute_loss( @@ -2132,7 +2194,7 @@ def process_batch(self, batch): else: # Autoregressive assert self.local_transformer_type == LocalTransformerType.AR, "Unexpected local transformer type" - local_transformer_logits = self.compute_local_transformer_logits( + local_transformer_logits = self._lt_helper.compute_logits( dec_out[:, dec_context_size:, :], audio_codes_target_unstacked, targets_offset_by_one=False ) local_transformer_loss, _ = self.compute_loss( @@ -2903,7 +2965,7 @@ def infer_batch( if use_local_transformer_for_inference: if self.local_transformer_type == LocalTransformerType.AR: # Autoregressive sampling with local transformer - audio_codes_next = self.local_transformer_sample_autoregressive( + audio_codes_next = self._lt_helper.sample_autoregressive( dec_output=dec_out[:, -1, :], temperature=self.inference_parameters.temperature, topk=self.inference_parameters.topk, @@ -2915,7 +2977,7 @@ def infer_batch( forbid_audio_eos=forbid_audio_eos, ) elif self.local_transformer_type == LocalTransformerType.MASKGIT: - audio_codes_next = self.local_transformer_sample_maskgit( + audio_codes_next = self._lt_helper.sample_maskgit( dec_output=dec_out[:, -1, :], temperature=self.inference_parameters.temperature, topk=self.inference_parameters.topk, @@ -2982,7 +3044,7 @@ def infer_batch( predicted_codes_lens = torch.tensor(predicted_lens, device=text.device).long() predicted_codes = predicted_codes[:, :, : predicted_codes_lens.max()] - predicted_audio, predicted_audio_lens, predicted_codes = self.codes_to_audio( + predicted_audio, predicted_audio_lens, predicted_codes = self._codec_helper.codes_to_audio( predicted_codes, predicted_codes_lens ) end_time = time.time() @@ -3682,7 +3744,9 @@ def do_tts( if len(all_codes) > 0: 
concatenated_codes = torch.cat(all_codes, dim=1).unsqueeze(0) codes_lens = torch.tensor([concatenated_codes.shape[2]], device=self.device, dtype=torch.long) - predicted_audio, predicted_audio_lens, _ = self.codes_to_audio(concatenated_codes, codes_lens) + predicted_audio, predicted_audio_lens, _ = self._codec_helper.codes_to_audio( + concatenated_codes, codes_lens + ) return predicted_audio, predicted_audio_lens else: return torch.zeros(1, 0, device=self.device), torch.zeros(1, device=self.device, dtype=torch.long) @@ -4489,7 +4553,7 @@ def generate_speech( if use_local_transformer_for_inference: if self.local_transformer_type == LocalTransformerType.AR: # Autoregressive sampling with local transformer - audio_codes_next = self.local_transformer_sample_autoregressive( + audio_codes_next = self._lt_helper.sample_autoregressive( dec_output=dec_out[:, -1, :], temperature=self.inference_parameters.temperature, topk=self.inference_parameters.topk, @@ -4501,7 +4565,7 @@ def generate_speech( forbid_audio_eos=forbid_audio_eos, ) elif self.local_transformer_type == LocalTransformerType.MASKGIT: - audio_codes_next = self.local_transformer_sample_maskgit( + audio_codes_next = self._lt_helper.sample_maskgit( dec_output=dec_out[:, -1, :], temperature=self.inference_parameters.temperature, topk=self.inference_parameters.topk, diff --git a/nemo/collections/tts/models/magpietts_preference_optimization.py b/nemo/collections/tts/models/magpietts_preference_optimization.py index d754f5718130..ce36df483ede 100644 --- a/nemo/collections/tts/models/magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/magpietts_preference_optimization.py @@ -51,6 +51,7 @@ PYNINI_AVAILABLE = False from nemo.collections.tts.models import MagpieTTSModel +from nemo.collections.tts.modules.magpietts_modules import add_eos_token class MagpieTTSModelOfflinePODataGen(MagpieTTSModel): @@ -904,7 +905,7 @@ def process_batch_online_po(self, batch, n_generations_per_item, mode='train'): with 
torch.no_grad(): reference_model_output = self._reference_model.process_batch(batch_repeated) - codebook_targets, _ = self.add_eos_token( + codebook_targets, _ = add_eos_token( codes=predicted_codes, codes_len=predicted_codes_lens, eos_id=self.audio_eos_id ) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index e936f81439be..2cebff638977 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -307,7 +307,7 @@ class MagpieInferenceRunner(BaseInferenceRunner): """Runner for encoder-decoder MagpieTTSModel. Uses ChunkedTTSInferenceDataset and model.generate_speech() per chunk, - then model.codes_to_audio() to produce waveforms. + then codes_to_audio() to produce waveforms. """ def __init__(self, model, config: MagpieInferenceConfig): @@ -472,8 +472,8 @@ def _run_unified_inference( predicted_codes = stack_tensors(predicted_codes_list, max_lens=[max_code_len]).cuda() predicted_codes_lens_tensor = torch.tensor(predicted_codes_lens, dtype=torch.long, device='cuda') - predicted_audio, predicted_audio_lens, _ = self.model.codes_to_audio( - predicted_codes, predicted_codes_lens_tensor + predicted_audio, predicted_audio_lens, _ = self.model._codec_helper.codes_to_audio( + predicted_codes, predicted_codes_lens_tensor, ) total_audio_samples = sum(predicted_audio_lens.cpu().tolist()) diff --git a/nemo/collections/tts/modules/magpietts_modules.py b/nemo/collections/tts/modules/magpietts_modules.py index 8569b691242f..3f11e8231488 100644 --- a/nemo/collections/tts/modules/magpietts_modules.py +++ b/nemo/collections/tts/modules/magpietts_modules.py @@ -15,13 +15,18 @@ from __future__ import annotations from enum import Enum +from typing import Dict, List, Optional +import numpy as np import torch +from hydra.utils import instantiate from torch import Tensor +from torch.utils.data import get_worker_info 
# ---------------------------------------------------------------------------
# Audio code utility functions
# ---------------------------------------------------------------------------
# NOTE: this section uses the module-level imports ``np``, ``torch``,
# ``logging``, ``instantiate`` (hydra), ``get_worker_info``,
# ``get_mask_from_lengths``, and the module-local ``cosine_schedule`` and
# ``SpecialAudioToken`` defined earlier in this file.


def worker_init_fn(worker_id):
    """Per-worker init for DataLoader workers.

    Sets up tokenizers for the dataset (text and optionally phoneme)
    when using multiprocessing.
    """
    # Imported lazily to avoid a circular import at module load time.
    from nemo.collections.tts.data.text_to_speech_dataset_lhotse import setup_tokenizers

    logging.info(f"Worker {worker_id} initializing...")
    worker_info = get_worker_info()
    dataset = worker_info.dataset
    tokenizer = setup_tokenizers(dataset.tokenizer_config, mode=dataset.dataset_type)
    dataset.text_tokenizer = tokenizer
    if hasattr(dataset, 'phoneme_tokenizer_config'):
        dataset.phoneme_tokenizer = instantiate(dataset.phoneme_tokenizer_config)


def add_eos_token(codes, codes_len, eos_id, num_eos_tokens=1):
    """Appends EOS tokens at the end of each sequence in the batch.

    Args:
        codes: (B, C, T')
        codes_len: (B,)
        eos_id: Token id to use as EOS.
        num_eos_tokens: Number of EOS tokens to append.

    Returns:
        Tuple of (codes with EOS appended, updated lengths).
    """
    codes = torch.nn.functional.pad(input=codes, pad=(0, num_eos_tokens), value=0)
    codes_len = codes_len + num_eos_tokens
    for idx in range(codes.size(0)):
        # Fix: fill ALL appended positions with EOS, not just the last one
        # (previously positions len-num_eos_tokens .. len-2 were left as 0
        # when num_eos_tokens > 1).
        codes[idx, :, codes_len[idx] - num_eos_tokens : codes_len[idx]] = eos_id
    return codes, codes_len


def add_special_tokens(codes, codes_len, bos_id, eos_id, num_bos_tokens=1, num_eos_tokens=1):
    """Prepends BOS and appends EOS tokens to each sequence.

    Args:
        codes: (B, C, T')
        codes_len: (B,)
        bos_id: Token id to use as BOS.
        eos_id: Token id to use as EOS.
        num_bos_tokens: Number of BOS tokens to prepend.
        num_eos_tokens: Number of EOS tokens to append.

    Returns:
        Tuple of (codes with BOS/EOS, updated lengths).
    """
    codes = torch.nn.functional.pad(input=codes, pad=(num_bos_tokens, 0), value=bos_id)
    codes_len = codes_len + num_bos_tokens
    codes, codes_len = add_eos_token(codes=codes, codes_len=codes_len, eos_id=eos_id, num_eos_tokens=num_eos_tokens)
    return codes, codes_len


def remove_bos_token(codes, codes_len, num_tokens=1):
    """Drops the first ``num_tokens`` (BOS) frames from codes (B, C, T')."""
    codes = codes[:, :, num_tokens:]
    codes_len = codes_len - num_tokens
    return codes, codes_len


def remove_embedded_bos_token(embedded, embedded_len):
    """Drops the first (BOS) timestep from embedded sequences (B, T', D)."""
    embedded = embedded[:, 1:, :]
    embedded_len = embedded_len - 1
    return embedded, embedded_len


def remove_eos_token(codes, codes_len):
    """Drops the last (EOS) frame from codes (B, C, T') and re-masks padding."""
    codes_len = codes_len - 1
    codes = codes[:, :, :-1]
    mask = get_mask_from_lengths(lengths=codes_len)
    codes = codes * mask.unsqueeze(1)
    return codes, codes_len


def remove_embedded_eos_token(embedded, embedded_len):
    """Remove the last token from embedded sequences.

    Args:
        embedded: (B, T', D)
        embedded_len: (B,)
    """
    embedded_len = embedded_len - 1
    embedded = embedded[:, :-1, :]
    mask = get_mask_from_lengths(lengths=embedded_len)
    embedded = embedded * mask.unsqueeze(2)
    return embedded, embedded_len


def remove_special_tokens(codes, codes_len, num_bos_tokens=1):
    """Drops BOS then EOS tokens from codes (B, C, T')."""
    codes, codes_len = remove_bos_token(codes=codes, codes_len=codes_len, num_tokens=num_bos_tokens)
    codes, codes_len = remove_eos_token(codes=codes, codes_len=codes_len)
    return codes, codes_len


def pad_audio_codes(audio_codes: torch.Tensor, frame_stacking_factor: int) -> torch.Tensor:
    """Pads the time dimension of audio codes to a multiple of *frame_stacking_factor*.

    Args:
        audio_codes: (B, C, T)
        frame_stacking_factor: Factor to pad to.

    Returns:
        (B, C, T_padded)
    """
    T = audio_codes.size(2)
    T_padded = int(np.ceil(T / frame_stacking_factor) * frame_stacking_factor)
    num_pad = T_padded - T
    audio_codes = torch.nn.functional.pad(input=audio_codes, pad=(0, num_pad))
    return audio_codes


def clear_forbidden_logits(logits: torch.Tensor, codebook_size: int, forbid_audio_eos: bool = False) -> torch.Tensor:
    """Sets logits of forbidden tokens to ``-inf`` so they will never be sampled.

    Specifically, we forbid sampling of all special tokens except AUDIO_EOS
    which is allowed by default.

    Args:
        logits: (B, C, num_audio_tokens_per_codebook) or compatible shape.
        codebook_size: Base codebook size (excluding special tokens).
        forbid_audio_eos: If True, also forbid AUDIO_EOS tokens from being sampled.
    """
    logits[
        :,
        :,
        SpecialAudioToken.get_forbidden_tokens(codebook_size, forbid_audio_eos=forbid_audio_eos),
    ] = float('-inf')
    return logits


class CodecHelper:
    """Thin wrapper around a codec model and optional token converter.

    Instantiate once per model and use ``audio_to_codes`` / ``codes_to_audio``
    without having to pass the codec objects every time.
    """

    def __init__(self, codec_model, codec_converter=None):
        self.codec_model = codec_model
        self.codec_converter = codec_converter

    def audio_to_codes(self, audio, audio_len, sample_rate=None):
        """Encode audio waveforms into codec codes.

        Returns:
            Tuple of (codes, codes_len).
        """
        self.codec_model.eval()
        # Codec runs in fp32 regardless of the surrounding autocast context.
        with torch.no_grad(), torch.autocast(device_type=audio.device.type, dtype=torch.float32):
            codes, codes_len = self.codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=sample_rate)
            return codes, codes_len

    def codes_to_audio(self, codes, codes_len):
        """Decode codec codes back into audio waveforms.

        ``codes`` must already be unstacked to the shape the codec expects.

        Returns:
            Tuple of (audio, audio_len, possibly-converted codes).
        """
        self.codec_model.eval()
        with torch.no_grad(), torch.autocast(device_type=codes.device.type, dtype=torch.float32):
            if self.codec_converter is not None:
                codes = self.codec_converter.convert_new_to_original(audio_tokens=codes, audio_lens=codes_len)
            audio, audio_len = self.codec_model.decode(tokens=codes, tokens_len=codes_len)
            return audio, audio_len, codes


# ---------------------------------------------------------------------------
# LocalTransformerHelper
# ---------------------------------------------------------------------------


class LocalTransformerHelper:
    """Orchestrates local-transformer forward passes and sampling.

    This is a plain Python class (not ``nn.Module``) that holds *references*
    to nn.Module sub-modules owned by the parent model. Keeping it non-Module
    preserves checkpoint key compatibility.

    Args:
        local_transformer: The local transformer module.
        audio_embeddings: List/ModuleList of per-codebook embedding layers.
        audio_in_projection: Linear projection applied after per-codebook embedding.
        local_transformer_in_projection: Projection into the local transformer input space.
        local_transformer_audio_out_projection: Projection applied to local transformer output
            before the per-codebook output heads.
        local_transformer_out_projections: List/ModuleList of per-codebook output heads.
        num_audio_codebooks: Number of audio codebooks (C).
        frame_stacking_factor: Frame stacking factor (S).
        audio_eos_id: Token id for audio EOS.
        mask_token_id: Token id used for MaskGit masking.
        codebook_size: Base codebook size (excluding special tokens).
    """

    def __init__(
        self,
        local_transformer,
        audio_embeddings,
        audio_in_projection,
        local_transformer_in_projection,
        local_transformer_audio_out_projection,
        local_transformer_out_projections,
        num_audio_codebooks: int,
        frame_stacking_factor: int,
        audio_eos_id: int,
        mask_token_id: int,
        codebook_size: int,
    ):
        self.local_transformer = local_transformer
        self.audio_embeddings = audio_embeddings
        self.audio_in_projection = audio_in_projection
        self.local_transformer_in_projection = local_transformer_in_projection
        self.local_transformer_audio_out_projection = local_transformer_audio_out_projection
        self.local_transformer_out_projections = local_transformer_out_projections
        self.num_audio_codebooks = num_audio_codebooks
        self.frame_stacking_factor = frame_stacking_factor
        self.audio_eos_id = audio_eos_id
        self.mask_token_id = mask_token_id
        self.codebook_size = codebook_size

    def create_random_mask(self, codes):
        """Creates a mask where True indicates positions that should be replaced with MASK_TOKEN.

        Per (batch, time) position a masking fraction is drawn from the cosine
        schedule and a random subset of codebooks of that size is masked.
        """
        B, C, T = codes.shape
        rand_values = torch.rand(B, T, device=codes.device)
        frac_masked = cosine_schedule(rand_values)
        n_masked = torch.ceil(frac_masked * C).long()
        # Random permutation of codebook indices so the masked subset is uniform.
        random_permutations = torch.argsort(torch.rand(B, C, T, device=codes.device), dim=1)
        mask_indices = torch.arange(C, device=codes.device).view(1, C, 1)
        mask = mask_indices < n_masked.view(B, 1, T)
        mask = torch.gather(mask, 1, random_permutations)
        return mask

    def apply_random_mask(self, codes):
        """Randomly replaces some codes with MASK_TOKEN following the cosine schedule."""
        mask = self.create_random_mask(codes)
        codes_with_mask = torch.where(mask, self.mask_token_id, codes)
        return codes_with_mask, mask

    def compute_logits(self, dec_out, audio_codes_target, targets_offset_by_one=False):
        """Predicts the logits for all codebooks using the local transformer.

        Used in both autoregressive (AR) and MaskGit (MG) modes during
        training and validation (not inference/sampling).

        The sequence layout is slightly different between AR and MG modes, as shown below
        (using an 8-codebook setup as an example)::

            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
            | AR target  | 0       | 1       | 2       | 3       | 4       | 5       | 6       | 7       | none    |
            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
            | MG target  | none    | 0       | 1       | 2       | 3       | 4       | 5       | 6       | 7       |
            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
            | Input      | Magpie  | 0       | 1       | 2       | 3       | 4       | 5       | 6       | 7       |
            |            | Latent  | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK | or MASK |
            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
            | Seq. Index | 0       | 1       | 2       | 3       | 4       | 5       | 6       | 7       | 8       |
            +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+

        Args:
            dec_out: (B, T', E)
            audio_codes_target: (B, C, T')
            targets_offset_by_one: if False, target for index 0 is codebook 0 (AR);
                if True, target for index 1 is codebook 0 (MaskGit).

        Returns:
            Logits of shape (B, T'_stacked, C * S * vocab).
        """
        C = self.num_audio_codebooks
        dec_out_all = dec_out.reshape(-1, dec_out.size(-1))  # (B*T', E)
        local_transformer_input = [dec_out_all]
        audio_codes_target = pad_audio_codes(audio_codes_target, self.frame_stacking_factor).long()
        for fs_index in range(self.frame_stacking_factor):
            for codebook_num in range(C):
                codes = audio_codes_target[:, codebook_num, fs_index :: self.frame_stacking_factor]
                codes = codes.reshape(-1)
                codebook_embedding = self.audio_embeddings[codebook_num + fs_index * C](codes)
                codebook_embedding = self.audio_in_projection(codebook_embedding)
                local_transformer_input.append(codebook_embedding)

        local_transformer_input = torch.stack(local_transformer_input, dim=1)
        local_transformer_input = self.local_transformer_in_projection(local_transformer_input)
        _mask = torch.ones(
            local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device
        )
        local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output']
        if not targets_offset_by_one:
            # AR layout: position i predicts codebook i, so drop the last output.
            local_transformer_output = local_transformer_output[:, :-1, :]
        else:
            # MG layout: position i+1 predicts codebook i, so drop the first output.
            local_transformer_output = local_transformer_output[:, 1:, :]

        local_transformer_output = self.local_transformer_audio_out_projection(local_transformer_output)

        all_code_logits = []
        for fs_index in range(self.frame_stacking_factor):
            for codebook_num in range(audio_codes_target.size(1)):
                codebook_logits = self.local_transformer_out_projections[codebook_num + fs_index * C](
                    local_transformer_output[:, codebook_num + fs_index * C, :]
                )
                all_code_logits.append(codebook_logits)
        all_code_logits = torch.cat(all_code_logits, dim=1)

        all_code_logits = all_code_logits.view(
            audio_codes_target.size(0), audio_codes_target.size(2) // self.frame_stacking_factor, -1
        )

        return all_code_logits

    def sample_autoregressive(
        self,
        dec_output: torch.Tensor,
        temperature: float = 0.7,
        topk: int = 80,
        unfinished_items: Optional[Dict[int, bool]] = None,
        finished_items: Optional[Dict[int, bool]] = None,
        use_cfg: bool = False,
        cfg_scale: float = 1.0,
        use_kv_cache: bool = True,
        forbid_audio_eos: bool = False,
        sanitize_logits: bool = False,
    ) -> torch.Tensor:
        """Sample audio codes autoregressively across codebooks using the local transformer.

        Args:
            dec_output: Decoder output tensor (B, E).
            temperature: Sampling temperature. When <= 0, uses argmax.
            topk: Number of top-probability tokens to consider.
            unfinished_items: Batch indices that have not completed generation (EOS forbidden).
            finished_items: Batch indices that are completed (EOS forced).
            use_cfg: Whether to use classifier-free guidance (doubled batch).
            cfg_scale: Scale factor for CFG.
            use_kv_cache: Whether to use key-value caching in the local transformer.
            forbid_audio_eos: Whether to globally forbid audio EOS.
            sanitize_logits: Whether to clamp/clean logits before sampling.

        Returns:
            Sampled audio codes (B, num_codebooks, frame_stacking_factor).
        """
        # Fix: do not use mutable default arguments ({}); normalize None here.
        unfinished_items = {} if unfinished_items is None else unfinished_items
        finished_items = {} if finished_items is None else finished_items

        self.local_transformer.reset_cache(use_cache=use_kv_cache)
        dec_output = dec_output.unsqueeze(1)  # (B, 1, E)
        local_transformer_input = self.local_transformer_in_projection(dec_output)
        all_preds = []
        for codebook_num in range(self.num_audio_codebooks * self.frame_stacking_factor):
            _mask = torch.ones(
                local_transformer_input.size(0), local_transformer_input.size(1), device=local_transformer_input.device
            )
            local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output']

            lt_out_for_proj = self.local_transformer_audio_out_projection(local_transformer_output[:, -1, :])
            codebook_logits = self.local_transformer_out_projections[codebook_num](lt_out_for_proj)

            if use_cfg:
                # First half of the batch is conditional, second half unconditional.
                actual_batch_size = codebook_logits.size(0) // 2
                conditional_logits = codebook_logits[:actual_batch_size]
                unconditional_logits = codebook_logits[actual_batch_size:]
                cfg_logits = cfg_scale * conditional_logits + (1.0 - cfg_scale) * unconditional_logits
                codebook_logits[:actual_batch_size] = cfg_logits

            if sanitize_logits:
                codebook_logits = torch.nan_to_num(codebook_logits, nan=0.0, posinf=100.0, neginf=-100.0)
                codebook_logits = codebook_logits.clamp(min=-100.0, max=100.0)

            for item_idx in unfinished_items:
                codebook_logits[item_idx, self.audio_eos_id] = float('-inf')
            for item_idx in finished_items:
                codebook_logits[item_idx, :] = float('-inf')
                codebook_logits[item_idx, self.audio_eos_id] = 0.0

            codebook_logits = clear_forbidden_logits(
                codebook_logits.unsqueeze(1), self.codebook_size, forbid_audio_eos=forbid_audio_eos
            ).squeeze(1)

            # Top-k filtering: anything below the k-th best logit is removed.
            codebook_logits_topk = torch.topk(codebook_logits, topk, dim=-1)[0]
            indices_to_remove = codebook_logits < codebook_logits_topk[:, -1].unsqueeze(-1)
            codebook_logits_rescored = codebook_logits.clone()
            codebook_logits_rescored[indices_to_remove] = float('-inf')

            if temperature <= 0.0:
                codebook_preds = codebook_logits_rescored.argmax(dim=-1, keepdim=True)
            else:
                codebook_probs = torch.softmax(codebook_logits_rescored / temperature, dim=-1)
                codebook_preds = torch.multinomial(codebook_probs, 1)

            if use_cfg:
                # Keep conditional/unconditional halves in sync.
                codebook_preds[actual_batch_size:] = codebook_preds[:actual_batch_size]
            all_preds.append(codebook_preds)

            next_local_transformer_input = self.audio_embeddings[codebook_num](codebook_preds.squeeze(-1)).unsqueeze(1)
            next_local_transformer_input = self.audio_in_projection(next_local_transformer_input)
            next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input)
            local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1)

        all_preds = torch.cat(all_preds, dim=1)  # (B, num_codebooks * frame_stacking_factor)
        all_preds = all_preds.reshape(-1, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1)
        if use_cfg:
            all_preds = all_preds[:actual_batch_size]

        return all_preds

    def sample_maskgit(
        self,
        dec_output: torch.Tensor,
        temperature: float = 0.7,
        topk: int = 80,
        unfinished_items: Optional[Dict[int, bool]] = None,
        finished_items: Optional[Dict[int, bool]] = None,
        use_cfg: bool = False,
        cfg_scale: float = 1.0,
        n_steps: int = 3,
        noise_scale: float = 0.0,
        fixed_schedule: Optional[List[int]] = None,
        dynamic_cfg_scale: bool = False,
        sampling_type: Optional[str] = None,
        forbid_audio_eos: bool = False,
    ) -> torch.Tensor:
        """Sample audio codes using MaskGit-like iterative prediction with the local transformer.

        Args:
            dec_output: Decoder output tensor (B, E).
            temperature: Sampling temperature.
            topk: Number of top-probability tokens to consider.
            unfinished_items: Batch indices that have not completed generation.
            finished_items: Batch indices that are completed.
            use_cfg: Whether to use classifier-free guidance.
            cfg_scale: Scale factor for CFG.
            n_steps: Number of iterative refinement steps.
            noise_scale: Scale factor for noise added to confidence scores.
            fixed_schedule: Fixed schedule for number of tokens to unmask per step.
            dynamic_cfg_scale: Whether to dynamically adjust CFG scale.
            sampling_type: Sampling strategy.
            forbid_audio_eos: Whether to globally forbid audio EOS.

        Returns:
            Sampled audio codes (B, num_codebooks, frame_stacking_factor).
        """
        # Fix: do not use mutable default arguments ({}); normalize None here.
        unfinished_items = {} if unfinished_items is None else unfinished_items
        finished_items = {} if finished_items is None else finished_items

        device = dec_output.device
        self.local_transformer.reset_cache(use_cache=False)
        dec_output = dec_output.unsqueeze(1)
        local_transformer_input_init = self.local_transformer_in_projection(dec_output)
        codebook_seq_len = self.num_audio_codebooks * self.frame_stacking_factor
        B = dec_output.size(0)

        min_confidence = 0
        max_confidence = 5
        confidences = min_confidence * torch.ones(B, codebook_seq_len, device=device)
        codes = self.mask_token_id * torch.ones((B, codebook_seq_len), device=device, dtype=torch.long)
        sampled_codes = codes.clone()
        if fixed_schedule is not None:
            n_steps = len(fixed_schedule)
        for step in range(n_steps):
            progress = step / n_steps
            frac_masked = cosine_schedule(torch.tensor(progress))
            if sampling_type == "causal" or sampling_type == "purity_causal":
                frac_masked = torch.ones_like(frac_masked) * (1.0 - progress)
            if fixed_schedule is None:
                n_masked = torch.ceil(codebook_seq_len * frac_masked).long()
            else:
                n_masked = codebook_seq_len - fixed_schedule[step]
            # Fix: coerce to a plain int so torch.topk(k=...) accepts it in all
            # torch versions (n_masked may be a 0-dim tensor here).
            n_unmasked = int(codebook_seq_len - n_masked)

            if sampling_type == "causal" or sampling_type == "purity_causal":
                # Only allow unmasking within the first few stacked frames.
                n_frames_to_allow = int(np.floor(progress * self.frame_stacking_factor + 1))
                confidences[:, n_frames_to_allow * self.num_audio_codebooks :] = min_confidence - 1

            _, topk_indices = torch.topk(confidences, k=n_unmasked, dim=1)
            if use_cfg:
                actual_batch_size = topk_indices.size(0) // 2
                assert (
                    topk_indices[actual_batch_size:] == topk_indices[:actual_batch_size]
                ).all(), "Topk indices are not the same for conditional and unconditional codes"

            # Commit the most-confident previously sampled codes; the rest stay MASK.
            unmasked_codes = torch.gather(sampled_codes, dim=1, index=topk_indices)
            codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes)

            local_transformer_input = local_transformer_input_init
            for codebook_num in range(codebook_seq_len):
                # NOTE(review): unlike compute_logits/sample_autoregressive, this path
                # does not apply self.audio_in_projection after the embedding —
                # confirm this is intentional (it only works if dims already match).
                next_local_transformer_input = self.audio_embeddings[codebook_num](codes[:, codebook_num]).unsqueeze(1)
                next_local_transformer_input = self.local_transformer_in_projection(next_local_transformer_input)
                local_transformer_input = torch.cat([local_transformer_input, next_local_transformer_input], dim=1)

            _mask = torch.ones(B, codebook_seq_len + 1, device=device)
            local_transformer_output = self.local_transformer(local_transformer_input, _mask)['output']

            logits = []
            for codebook_num in range(codebook_seq_len):
                # NOTE(review): self.local_transformer_audio_out_projection is not applied
                # here, unlike in compute_logits — confirm against training code.
                codebook_logits = self.local_transformer_out_projections[codebook_num](
                    local_transformer_output[:, codebook_num + 1, :]
                )
                logits.append(codebook_logits)
            logits = torch.stack(logits, dim=1)  # (B, C*S, V)

            if use_cfg:
                actual_batch_size = logits.size(0) // 2
                conditional_logits = logits[:actual_batch_size]
                unconditional_logits = logits[actual_batch_size:]
                if not dynamic_cfg_scale:
                    current_cfg_scale = cfg_scale
                else:
                    # Linearly ramp the CFG scale from 1.0 up to cfg_scale.
                    progress = step / (n_steps - 1)
                    interp = progress
                    current_cfg_scale = (cfg_scale - 1) * interp + 1.0
                cfg_logits = current_cfg_scale * conditional_logits + (1.0 - current_cfg_scale) * unconditional_logits
                logits[:actual_batch_size] = cfg_logits

            logits = clear_forbidden_logits(logits, self.codebook_size, forbid_audio_eos=forbid_audio_eos)

            for item_idx in unfinished_items:
                # Fix: logits is (B, C*S, V); forbid the EOS *vocabulary entry* across
                # all codebook positions. The previous 2-index form masked an entire
                # codebook-position row instead of the EOS token.
                logits[item_idx, :, self.audio_eos_id] = float('-inf')
            for item_idx in finished_items:
                logits[item_idx, :, :] = float('-inf')
                logits[item_idx, :, self.audio_eos_id] = 0.0

            logits_topk = torch.topk(logits, topk, dim=-1)[0]
            indices_to_remove = logits < logits_topk[:, :, -1].unsqueeze(-1)
            logits_rescored = logits.clone()
            logits_rescored[indices_to_remove] = float('-inf')
            probs = torch.softmax(logits_rescored / temperature, dim=-1)
            sampled_codes = torch.multinomial(probs.view(B * codebook_seq_len, -1), 1).view(B, codebook_seq_len)
            if use_cfg:
                sampled_codes[actual_batch_size:] = sampled_codes[:actual_batch_size]
                probs[actual_batch_size:] = probs[:actual_batch_size]
            if sampling_type != "purity_causal" and sampling_type != "purity_default":
                confidences = torch.gather(probs, dim=2, index=sampled_codes.unsqueeze(-1)).squeeze(-1)
            else:
                confidences = probs.max(dim=2)[0]
            # Re-commit the already-unmasked codes so they are never resampled.
            sampled_codes.scatter_(dim=1, index=topk_indices, src=unmasked_codes)
            if noise_scale > 0.0:
                noise = (torch.rand_like(confidences) - 0.5) * noise_scale * (1 - (step + 2) / n_steps)
                confidences += noise
            if use_cfg:
                # Fix: this sync must only run under CFG — actual_batch_size is
                # undefined otherwise (previously raised NameError when use_cfg=False).
                confidences[actual_batch_size:] = confidences[:actual_batch_size]
            confidence_eps = 0.1
            assert (
                confidences.max() + confidence_eps < max_confidence
            ), f"Predicted confidence is approaching max_confidence: {confidences.max()}"
            confidences.scatter_(
                index=topk_indices, dim=1, src=max_confidence * torch.ones_like(topk_indices, dtype=torch.float)
            )
            codes = sampled_codes
        assert not (
            codes == self.mask_token_id
        ).any(), "Codes contain mask tokens after completion of MaskGit sampling"

        codes = codes.reshape(B, self.frame_stacking_factor, self.num_audio_codebooks).permute(0, 2, 1)

        if use_cfg:
            codes = codes[:actual_batch_size]
        return codes
ccc8f315a3c2..31ad48f9dbfe 100644 --- a/nemo/collections/tts/models/easy_magpietts.py +++ b/nemo/collections/tts/models/easy_magpietts.py @@ -238,17 +238,23 @@ def log_val_audio_example( codes=pred_audio_codes, codes_len=audio_codes_lens_target, ) - pred_audio_codes, pred_audio_codes_lens = self._prepare_codes_for_decode(pred_audio_codes, audio_codes_lens_target - 1) + pred_audio_codes, pred_audio_codes_lens = self._prepare_codes_for_decode( + pred_audio_codes, audio_codes_lens_target - 1 + ) pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio( - pred_audio_codes, pred_audio_codes_lens, + pred_audio_codes, + pred_audio_codes_lens, ) target_audio_codes, _ = remove_eos_token( codes=target_audio_codes, codes_len=audio_codes_lens_target, ) - target_audio_codes, target_audio_codes_lens = self._prepare_codes_for_decode(target_audio_codes, audio_codes_lens_target - 1) + target_audio_codes, target_audio_codes_lens = self._prepare_codes_for_decode( + target_audio_codes, audio_codes_lens_target - 1 + ) target_audio, target_audio_lens, _ = self._codec_helper.codes_to_audio( - target_audio_codes, target_audio_codes_lens, + target_audio_codes, + target_audio_codes_lens, ) context_audio, context_audio_lens = None, None @@ -258,9 +264,12 @@ def log_val_audio_example( codes=context_audio_codes, codes_len=context_audio_codes_lens, ) - context_audio_codes, context_audio_codes_lens = self._prepare_codes_for_decode(context_audio_codes, context_audio_codes_lens) + context_audio_codes, context_audio_codes_lens = self._prepare_codes_for_decode( + context_audio_codes, context_audio_codes_lens + ) context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( - context_audio_codes, context_audio_codes_lens, + context_audio_codes, + context_audio_codes_lens, ) for logger in self.loggers: @@ -1118,10 +1127,12 @@ def validation_step(self, batch, batch_idx): codes_len=context_audio_codes_lens, ) context_audio_codes_cleaned, context_audio_codes_lens_cleaned = 
self._prepare_codes_for_decode( - context_audio_codes_cleaned, context_audio_codes_lens_cleaned, + context_audio_codes_cleaned, + context_audio_codes_lens_cleaned, ) context_audio_cleaned, context_audio_lens_cleaned, _ = self._codec_helper.codes_to_audio( - context_audio_codes_cleaned, context_audio_codes_lens_cleaned, + context_audio_codes_cleaned, + context_audio_codes_lens_cleaned, ) for idx in range(infer_output.predicted_audio.size(0)): diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index 555c30308e39..9167c14a92d5 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -1721,9 +1721,12 @@ def streaming_finalize( # No need to remove EOS - end_indices already point to the frame before EOS # Decode to audio (codes are already unstacked: B, C, T) - predicted_codes, predicted_codes_lens = self._prepare_codes_for_decode(predicted_codes, predicted_codes_lens) + predicted_codes, predicted_codes_lens = self._prepare_codes_for_decode( + predicted_codes, predicted_codes_lens + ) audio, audio_len, decoded_codes = self._codec_helper.codes_to_audio( - predicted_codes, predicted_codes_lens, + predicted_codes, + predicted_codes_lens, ) return StreamingFinalizeOutput( @@ -1824,9 +1827,7 @@ def infer_batch( elif 'audio' in batch: gt_audio = batch['audio'] gt_audio_lens = batch['audio_lens'] - gt_audio_codes, gt_audio_codes_lens = self._codec_helper.audio_to_codes( - gt_audio, gt_audio_lens - ) + gt_audio_codes, gt_audio_codes_lens = self._codec_helper.audio_to_codes(gt_audio, gt_audio_lens) else: raise ValueError("Teacher forcing requires 'audio_codes' or 'audio' in batch") diff --git a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py index 600ddda579bd..46287373909c 100644 --- 
a/nemo/collections/tts/models/easy_magpietts_preference_optimization.py +++ b/nemo/collections/tts/models/easy_magpietts_preference_optimization.py @@ -439,7 +439,8 @@ def _get_reference_audio_paths(self, batch_repeated: Dict) -> List[str]: ).long() context_codes, context_lens = self._prepare_codes_for_decode(context_codes, context_lens) context_audio, context_audio_lens, _ = self._codec_helper.codes_to_audio( - context_codes, context_lens, + context_codes, + context_lens, ) return self._save_waveforms_to_paths( waveforms=context_audio, diff --git a/nemo/collections/tts/models/magpietts.py b/nemo/collections/tts/models/magpietts.py index f710bb853986..97b0de063008 100644 --- a/nemo/collections/tts/models/magpietts.py +++ b/nemo/collections/tts/models/magpietts.py @@ -1360,9 +1360,7 @@ def _prepare_audio_examples( pred_audio_codes, pred_audio_codes_lens = remove_eos_token( codes=pred_audio_codes, codes_len=audio_codes_lens ) - pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio( - pred_audio_codes, pred_audio_codes_lens - ) + pred_audio, pred_audio_lens, _ = self._codec_helper.codes_to_audio(pred_audio_codes, pred_audio_codes_lens) # Decode targets: remove EOS token, then decode to audio target_audio_codes, target_audio_codes_lens = remove_eos_token( @@ -1600,7 +1598,8 @@ def _get_context_audio_codes(self, batch: Dict[str, torch.Tensor]) -> Tuple[torc lens = batch['context_audio_codes_lens'] else: codes, lens = self._codec_helper.audio_to_codes( - batch['context_audio'], batch['context_audio_lens'], + batch['context_audio'], + batch['context_audio_lens'], sample_rate=batch.get('context_sample_rate'), ) @@ -2012,7 +2011,8 @@ def process_batch(self, batch): if 'audio_codes' not in batch: audio_codes, audio_codes_lens = self._codec_helper.audio_to_codes( - batch['audio'], batch['audio_lens'], + batch['audio'], + batch['audio_lens'], sample_rate=batch.get('sample_rate'), ) else: diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py 
b/nemo/collections/tts/modules/magpietts_inference/inference.py index 2cebff638977..d7f6e48b7e9e 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -473,7 +473,8 @@ def _run_unified_inference( predicted_codes_lens_tensor = torch.tensor(predicted_codes_lens, dtype=torch.long, device='cuda') predicted_audio, predicted_audio_lens, _ = self.model._codec_helper.codes_to_audio( - predicted_codes, predicted_codes_lens_tensor, + predicted_codes, + predicted_codes_lens_tensor, ) total_audio_samples = sum(predicted_audio_lens.cpu().tolist()) From 9d95ed4a87b7c3c489183615ecf196fc8e84918c Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Thu, 12 Mar 2026 17:22:57 -0400 Subject: [PATCH 93/94] bug fixed Signed-off-by: Paarth Neekhara --- nemo/collections/tts/modules/magpietts_inference/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/modules/magpietts_inference/inference.py b/nemo/collections/tts/modules/magpietts_inference/inference.py index 2cebff638977..3e0a7f36274a 100644 --- a/nemo/collections/tts/modules/magpietts_inference/inference.py +++ b/nemo/collections/tts/modules/magpietts_inference/inference.py @@ -581,7 +581,7 @@ def create_dataset( pad_context_text_to_max_duration=False, context_duration_min=context_duration_min, context_duration_max=context_duration_max, - ignore_phoneme_languages=self.config.get('ignore_phoneme_languages', []), + ignore_phoneme_languages=self.model.cfg.get('ignore_phoneme_languages', []), add_language_to_context_text=self.model.add_language_to_context_text, ) dataset.text_tokenizer = self.model.tokenizer From 50dd98d7e571cf60f23c0b0d8759a1cb439efa71 Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Sat, 14 Mar 2026 13:09:38 -0700 Subject: [PATCH 94/94] bug fix in easy magpie LT training Signed-off-by: Paarth Neekhara --- nemo/collections/tts/models/easy_magpietts_inference.py | 8 ++++++-- 1 
file changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/easy_magpietts_inference.py b/nemo/collections/tts/models/easy_magpietts_inference.py index 9167c14a92d5..65abd37ce957 100644 --- a/nemo/collections/tts/models/easy_magpietts_inference.py +++ b/nemo/collections/tts/models/easy_magpietts_inference.py @@ -489,6 +489,10 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): ) self.local_transformer_out_projections = nn.ModuleList(local_transformer_out_projections) + # EasyMagpie stacks frames into the channel dimension (B, C*S, T_stacked) + # via stack_codes, unlike Magpie which keeps them interleaved in time (B, C, T_full). + # We pass num_audio_codebooks=C*S and frame_stacking_factor=1 so the helper + # treats each stacked channel as an independent codebook without time-domain striding. self._lt_helper = LocalTransformerHelper( local_transformer=self.local_transformer, audio_embeddings=self.audio_embeddings, @@ -496,8 +500,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): local_transformer_in_projection=self.local_transformer_in_projection, local_transformer_audio_out_projection=self.local_transformer_audio_out_projection, local_transformer_out_projections=self.local_transformer_out_projections, - num_audio_codebooks=self.num_audio_codebooks, - frame_stacking_factor=self.frame_stacking_factor, + num_audio_codebooks=self.num_audio_codebooks * self.frame_stacking_factor, + frame_stacking_factor=1, audio_eos_id=self.audio_eos_id, mask_token_id=self.mask_token_id, codebook_size=self.codebook_size,